2 e2b5610c 2022-04-11 op * Copyright (c) 2022 Omar Polo <op@openbsd.org>
4 e2b5610c 2022-04-11 op * Permission to use, copy, modify, and distribute this software for any
5 e2b5610c 2022-04-11 op * purpose with or without fee is hereby granted, provided that the above
6 e2b5610c 2022-04-11 op * copyright notice and this permission notice appear in all copies.
8 e2b5610c 2022-04-11 op * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 e2b5610c 2022-04-11 op * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 e2b5610c 2022-04-11 op * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 e2b5610c 2022-04-11 op * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 e2b5610c 2022-04-11 op * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 e2b5610c 2022-04-11 op * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 e2b5610c 2022-04-11 op * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 e2b5610c 2022-04-11 op #include <sys/mman.h>
19 e2b5610c 2022-04-11 op #include <stdio.h>
20 e2b5610c 2022-04-11 op #include <stdint.h>
21 e2b5610c 2022-04-11 op #include <stdlib.h>
22 e2b5610c 2022-04-11 op #include <string.h>
23 e2b5610c 2022-04-11 op #include <unistd.h>
25 e2b5610c 2022-04-11 op #include "db.h"
26 e2b5610c 2022-04-11 op #include "dictionary.h"
28 e2b5610c 2022-04-11 op #define IDX_ENTRY_SIZE (DB_WORDLEN + sizeof(int64_t))
31 e2b5610c 2022-04-11 op write_dictionary(FILE *fp, struct dictionary *dict)
36 e2b5610c 2022-04-11 op size_t i, len;
38 e2b5610c 2022-04-11 op if ((uint64_t)dict->len > UINT32_MAX)
41 e2b5610c 2022-04-11 op n = dict->len;
42 e2b5610c 2022-04-11 op if (fwrite(&n, sizeof(n), 1, fp) != 1)
45 e2b5610c 2022-04-11 op if ((start = ftello(fp)) == -1)
48 e2b5610c 2022-04-11 op len = DB_WORDLEN + sizeof(int64_t);
49 e2b5610c 2022-04-11 op pos = start + (n * len);
50 e2b5610c 2022-04-11 op for (i = 0; i < dict->len; ++i) {
51 e2b5610c 2022-04-11 op char word[DB_WORDLEN];
53 e2b5610c 2022-04-11 op memset(word, 0, sizeof(word));
54 e2b5610c 2022-04-11 op strlcpy(word, dict->entries[i].word, sizeof(word));
55 e2b5610c 2022-04-11 op if (fwrite(word, sizeof(word), 1, fp) != 1)
58 e2b5610c 2022-04-11 op if (fwrite(&pos, sizeof(pos), 1, fp) != 1)
61 e2b5610c 2022-04-11 op /* one for the len */
62 e2b5610c 2022-04-11 op pos += sizeof(uint32_t) * (dict->entries[i].len + 1);
65 e2b5610c 2022-04-11 op for (i = 0; i < dict->len; ++i) {
67 e2b5610c 2022-04-11 op uint32_t t, x;
69 e2b5610c 2022-04-11 op x = dict->entries[i].len;
70 e2b5610c 2022-04-11 op if (fwrite(&x, sizeof(x), 1, fp) != 1)
73 e2b5610c 2022-04-11 op for (j = 0; j < x; ++j) {
74 e2b5610c 2022-04-11 op t = dict->entries[i].ids[j];
75 e2b5610c 2022-04-11 op if (fwrite(&t, sizeof(t), 1, fp) != 1)
84 e2b5610c 2022-04-11 op db_create(FILE *fp, struct dictionary *dict, struct db_entry *entries,
87 e2b5610c 2022-04-11 op int64_t endidx;
89 e2b5610c 2022-04-11 op uint32_t version = DB_VERSION;
91 e2b5610c 2022-04-11 op if (n > INT32_MAX)
94 e2b5610c 2022-04-11 op if (fwrite(&version, sizeof(version), 1, fp) != 1)
97 e2b5610c 2022-04-11 op /* reserve space for the start pointer -- filled later */
98 e2b5610c 2022-04-11 op if (fseek(fp, sizeof(int64_t), SEEK_CUR) == -1)
101 e2b5610c 2022-04-11 op if (write_dictionary(fp, dict) == -1)
104 e2b5610c 2022-04-11 op if ((endidx = ftello(fp)) == -1)
107 e2b5610c 2022-04-11 op for (i = 0; i < n; ++i) {
108 e2b5610c 2022-04-11 op uint16_t namelen, descrlen = 0;
110 e2b5610c 2022-04-11 op namelen = strlen(entries[i].name);
111 e2b5610c 2022-04-11 op if (entries[i].descr != NULL)
112 e2b5610c 2022-04-11 op descrlen = strlen(entries[i].descr);
114 e2b5610c 2022-04-11 op if (fwrite(&namelen, sizeof(namelen), 1, fp) != 1)
116 e2b5610c 2022-04-11 op if (fwrite(entries[i].name, namelen+1, 1, fp) != 1)
119 e2b5610c 2022-04-11 op if (fwrite(&descrlen, sizeof(descrlen), 1, fp) != 1)
121 e2b5610c 2022-04-11 op if (descrlen > 0 &&
122 e2b5610c 2022-04-11 op fwrite(entries[i].descr, descrlen, 1, fp) != 1)
124 e2b5610c 2022-04-11 op if (fwrite("", 1, 1, fp) != 1)
128 e2b5610c 2022-04-11 op if (fseek(fp, sizeof(version), SEEK_SET) == -1)
131 e2b5610c 2022-04-11 op if (fwrite(&endidx, sizeof(endidx), 1, fp) != 1)
138 e2b5610c 2022-04-11 op initdb(struct db *db)
140 e2b5610c 2022-04-11 op off_t hdrlen = sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t);
141 e2b5610c 2022-04-11 op int64_t end_off;
142 e2b5610c 2022-04-11 op uint8_t *p = db->m;
144 e2b5610c 2022-04-11 op if (hdrlen > db->len)
147 e2b5610c 2022-04-11 op memcpy(&db->version, p, sizeof(db->version));
148 e2b5610c 2022-04-11 op p += sizeof(db->version);
150 e2b5610c 2022-04-11 op memcpy(&end_off, p, sizeof(end_off));
151 e2b5610c 2022-04-11 op p += sizeof(end_off);
153 e2b5610c 2022-04-11 op memcpy(&db->nwords, p, sizeof(db->nwords));
154 e2b5610c 2022-04-11 op p += sizeof(db->nwords);
156 e2b5610c 2022-04-11 op db->idx_start = p;
157 e2b5610c 2022-04-11 op db->idx_end = p + db->nwords * IDX_ENTRY_SIZE;
158 e2b5610c 2022-04-11 op db->list_start = db->idx_end;
159 e2b5610c 2022-04-11 op db->list_end = db->m + end_off;
160 e2b5610c 2022-04-11 op db->docs_start = db->list_end;
161 e2b5610c 2022-04-11 op db->docs_end = db->m + db->len;
163 e2b5610c 2022-04-11 op if (db->idx_end > db->docs_end)
165 e2b5610c 2022-04-11 op if (db->list_end > db->docs_end)
172 e2b5610c 2022-04-11 op db_open(struct db *db, int fd)
174 e2b5610c 2022-04-11 op memset(db, 0, sizeof(*db));
176 e2b5610c 2022-04-11 op if ((db->len = lseek(fd, 0, SEEK_END)) == -1)
179 e2b5610c 2022-04-11 op if (lseek(fd, 0, SEEK_SET) == -1)
182 e2b5610c 2022-04-11 op db->m = mmap(NULL, db->len, PROT_READ, MAP_PRIVATE, fd, 0);
183 e2b5610c 2022-04-11 op if (db->m == MAP_FAILED)
186 e2b5610c 2022-04-11 op if (initdb(db) == -1) {
187 e2b5610c 2022-04-11 op db_close(db);
195 e2b5610c 2022-04-11 op db_countdocs(struct db *db, struct db_entry *e, void *d)
197 e2b5610c 2022-04-11 op struct db_stats *stats = d;
199 e2b5610c 2022-04-11 op stats->ndocs++;
204 e2b5610c 2022-04-11 op db_idx_compar(const void *key, const void *elem)
206 e2b5610c 2022-04-11 op const char *word = key;
207 e2b5610c 2022-04-11 op const char *idx_entry = elem;
209 e2b5610c 2022-04-11 op if (idx_entry[DB_WORDLEN-1] != '\0')
211 e2b5610c 2022-04-11 op return strcmp(word, idx_entry);
214 e2b5610c 2022-04-11 op static inline uint32_t *
215 e2b5610c 2022-04-11 op db_getdocs(struct db *db, const uint8_t *entry, size_t *len)
220 e2b5610c 2022-04-11 op entry += DB_WORDLEN;
221 e2b5610c 2022-04-11 op memcpy(&pos, entry, sizeof(pos));
223 e2b5610c 2022-04-11 op entry = db->m + pos;
224 e2b5610c 2022-04-11 op if (entry < db->list_start || entry > db->list_end)
227 e2b5610c 2022-04-11 op memcpy(&l, entry, sizeof(l));
228 e2b5610c 2022-04-11 op entry += sizeof(l);
230 e2b5610c 2022-04-11 op return (uint32_t *)entry;
234 e2b5610c 2022-04-11 op db_word_docs(struct db *db, const char *word, size_t *len)
240 e2b5610c 2022-04-11 op e = bsearch(word, db->idx_start, db->nwords, IDX_ENTRY_SIZE,
241 e2b5610c 2022-04-11 op db_idx_compar);
242 e2b5610c 2022-04-11 op if (e == NULL)
244 e2b5610c 2022-04-11 op return db_getdocs(db, e, len);
248 e2b5610c 2022-04-11 op db_stats(struct db *db, struct db_stats *stats)
250 e2b5610c 2022-04-11 op const uint8_t *p;
251 e2b5610c 2022-04-11 op size_t l, maxl = 0, idlen;
253 e2b5610c 2022-04-11 op memset(stats, 0, sizeof(*stats));
255 e2b5610c 2022-04-11 op if (db_listall(db, db_countdocs, stats) == -1)
258 e2b5610c 2022-04-11 op stats->nwords = db->nwords;
260 e2b5610c 2022-04-11 op p = db->idx_start;
261 e2b5610c 2022-04-11 op while (p < db->idx_end) {
262 e2b5610c 2022-04-11 op if (p + DB_WORDLEN > db->idx_end)
265 e2b5610c 2022-04-11 op if (p[DB_WORDLEN-1] != '\0')
268 e2b5610c 2022-04-11 op l = strlen(p);
269 e2b5610c 2022-04-11 op if (l > maxl) {
271 e2b5610c 2022-04-11 op stats->longest_word = p;
274 e2b5610c 2022-04-11 op if (db_getdocs(db, p, &idlen) == NULL)
277 e2b5610c 2022-04-11 op if (idlen > stats->most_popular_ndocs) {
278 e2b5610c 2022-04-11 op stats->most_popular_ndocs = idlen;
279 e2b5610c 2022-04-11 op stats->most_popular = p;
282 e2b5610c 2022-04-11 op p += IDX_ENTRY_SIZE;
288 e2b5610c 2022-04-11 op static inline uint8_t *
289 e2b5610c 2022-04-11 op db_extract_doc(struct db *db, uint8_t *p, struct db_entry *e)
291 e2b5610c 2022-04-11 op uint16_t namelen, descrlen;
294 e2b5610c 2022-04-11 op * namelen[2] name[namelen]
295 e2b5610c 2022-04-11 op * descrlen[2] descr[descrlen]
298 e2b5610c 2022-04-11 op if (p + 2 > db->docs_end)
300 e2b5610c 2022-04-11 op memcpy(&namelen, p, sizeof(namelen));
301 e2b5610c 2022-04-11 op p += sizeof(namelen);
303 e2b5610c 2022-04-11 op if (p + namelen > db->docs_end || p[namelen] != '\0')
306 e2b5610c 2022-04-11 op p += namelen + 1;
308 e2b5610c 2022-04-11 op if (p + 2 > db->docs_end)
310 e2b5610c 2022-04-11 op memcpy(&descrlen, p, sizeof(descrlen));
311 e2b5610c 2022-04-11 op p += sizeof(descrlen);
313 e2b5610c 2022-04-11 op if (p + descrlen > db->docs_end || p[descrlen] != '\0')
315 e2b5610c 2022-04-11 op e->descr = p;
316 e2b5610c 2022-04-11 op p += descrlen + 1;
322 e2b5610c 2022-04-11 op db_listall(struct db *db, db_hit_cb cb, void *data)
324 e2b5610c 2022-04-11 op uint8_t *p = db->docs_start;
326 e2b5610c 2022-04-11 op while (p < db->docs_end) {
327 e2b5610c 2022-04-11 op struct db_entry e;
329 e2b5610c 2022-04-11 op if ((p = db_extract_doc(db, p, &e)) == NULL)
332 e2b5610c 2022-04-11 op if (cb(db, &e, data) == -1)
340 e2b5610c 2022-04-11 op db_doc_by_id(struct db *db, int docid, struct db_entry *e)
342 e2b5610c 2022-04-11 op uint8_t *p = db->docs_start;
345 e2b5610c 2022-04-11 op while (p < db->docs_end) {
346 e2b5610c 2022-04-11 op if ((p = db_extract_doc(db, p, e)) == NULL)
349 e2b5610c 2022-04-11 op if (n == docid)
359 e2b5610c 2022-04-11 op db_close(struct db *db)
361 e2b5610c 2022-04-11 op munmap(db->m, db->len);
362 e2b5610c 2022-04-11 op memset(db, 0, sizeof(*db));