2 * Copyright (c) 2022 Omar Polo <op@omarpolo.com>
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
26 #include "dictionary.h"
/*
 * Size in bytes of one index entry: a DB_WORDLEN-byte word field
 * followed by an int64_t-sized position.  write_dictionary() emits
 * entries of this size and initdb()/db_word_docs() use it as the
 * stride when walking or searching the index.
 */
28 #define IDX_ENTRY_SIZE (DB_WORDLEN + sizeof(int64_t))
/*
 * Serialise the dictionary to fp.  From the visible writes the layout
 * is: a word count, then one fixed-size index entry per word (the
 * word, zero padded to DB_WORDLEN, plus the file position of its id
 * list), then the id lists themselves, each one a count followed by
 * that many ids.
 *
 * The return-type line, the local declarations, the braces and the
 * bodies of the error branches are not part of this excerpt.
 */
31 write_dictionary(FILE *fp, struct dictionary *dict)
/* guard on dict->len exceeding UINT32_MAX (the branch is not shown) */
38 if ((uint64_t)dict->len > UINT32_MAX)
/* n is presumably dict->len narrowed to 32 bits -- its assignment is not visible here */
42 if (fwrite(&n, sizeof(n), 1, fp) != 1)
/* remember the offset at which the index table begins */
45 if ((start = ftello(fp)) == -1)
/* len is the size of one index entry (same expression as IDX_ENTRY_SIZE) */
48 len = DB_WORDLEN + sizeof(int64_t);
/* pos starts just past the n index entries, i.e. where the first id list will land */
49 pos = start + (n * len);
/* first pass: the index table */
50 for (i = 0; i < dict->len; ++i) {
51 char word[DB_WORDLEN];
/* zero the buffer so the bytes after the word are NUL padding */
53 memset(word, 0, sizeof(word));
/* strlcpy truncates to DB_WORDLEN-1 bytes; the return value is not checked */
54 strlcpy(word, dict->entries[i].word, sizeof(word));
55 if (fwrite(word, sizeof(word), 1, fp) != 1)
58 if (fwrite(&pos, sizeof(pos), 1, fp) != 1)
/*
 * Advance past this word's id list: one count plus entries[i].len
 * ids, each accounted as sizeof(uint32_t) bytes.
 */
62 pos += sizeof(uint32_t) * (dict->entries[i].len + 1);
/* second pass: the id lists, in the same order as the index entries */
65 for (i = 0; i < dict->len; ++i) {
/*
 * NOTE(review): the declarations of x and t are not visible; the
 * position arithmetic above assumes both are uint32_t sized -- confirm.
 */
69 x = dict->entries[i].len;
70 if (fwrite(&x, sizeof(x), 1, fp) != 1)
73 for (j = 0; j < x; ++j) {
74 t = dict->entries[i].ids[j];
75 if (fwrite(&t, sizeof(t), 1, fp) != 1)
/*
 * Write a complete database to fp: the version, a placeholder for the
 * offset at which the dictionary ends, the dictionary itself and then
 * one record per entry (namelen, name, NUL, descrlen, descr, NUL).
 * Finally seek back and fill the placeholder in.
 *
 * The rest of the parameter list, the local declarations and the
 * bodies of the error branches are not part of this excerpt.
 */
84 db_create(FILE *fp, struct dictionary *dict, struct db_entry *entries,
89 uint32_t version = DB_VERSION;
94 if (fwrite(&version, sizeof(version), 1, fp) != 1)
97 /* reserve space for the start pointer -- filled later */
98 if (fseek(fp, sizeof(int64_t), SEEK_CUR) == -1)
101 if (write_dictionary(fp, dict) == -1)
/* offset just past the dictionary, where the document records start */
104 if ((endidx = ftello(fp)) == -1)
107 for (i = 0; i < n; ++i) {
108 uint16_t namelen, descrlen = 0;
/*
 * NOTE(review): the strlen() results are stored in uint16_t, so a name
 * or description longer than 65535 bytes is silently truncated modulo
 * 2^16 in the visible code -- confirm the callers bound these lengths.
 */
110 namelen = strlen(entries[i].name);
111 if (entries[i].descr != NULL)
112 descrlen = strlen(entries[i].descr);
114 if (fwrite(&namelen, sizeof(namelen), 1, fp) != 1)
/* namelen+1 so that the terminating NUL of the name is written too */
116 if (fwrite(entries[i].name, namelen+1, 1, fp) != 1)
119 if (fwrite(&descrlen, sizeof(descrlen), 1, fp) != 1)
/*
 * NOTE(review): fwrite() returns 0 when the element size is 0, so if
 * this call is reached with an empty (non-NULL) descr the "!= 1" test
 * is true.  The first half of the condition is not visible here --
 * confirm it guards against descrlen == 0.
 */
122 fwrite(entries[i].descr, descrlen, 1, fp) != 1)
/* NUL terminator for the description (written separately from the text) */
124 if (fwrite("", 1, 1, fp) != 1)
/* go back to the slot reserved right after the version field */
128 if (fseek(fp, sizeof(version), SEEK_SET) == -1)
/*
 * NOTE(review): sizeof(int64_t) bytes were reserved above and initdb()
 * reads the field back with memcpy; endidx's declaration is not
 * visible -- confirm it is exactly 8 bytes wide.
 */
131 if (fwrite(&endidx, sizeof(endidx), 1, fp) != 1)
/*
 * Parse the header of the mapped database and derive the section
 * boundaries stored in db (idx_end, list_start/list_end,
 * docs_start/docs_end).
 *
 * The declarations of p and end_off, the assignment of db->idx_start
 * and the bodies of the failure branches are not part of this excerpt.
 */
138 initdb(struct db *db)
/* header: version (uint32_t) + end offset (int64_t) + number of words (uint32_t) */
140 off_t hdrlen = sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t);
/* the mapping must be at least as long as the header */
144 if (hdrlen > db->len)
/* fields are copied out with memcpy rather than read through a cast pointer */
147 memcpy(&db->version, p, sizeof(db->version));
148 p += sizeof(db->version);
150 memcpy(&end_off, p, sizeof(end_off));
151 p += sizeof(end_off);
153 memcpy(&db->nwords, p, sizeof(db->nwords));
154 p += sizeof(db->nwords);
/* the index is nwords fixed-size entries starting right after the header */
157 db->idx_end = p + db->nwords * IDX_ENTRY_SIZE;
/* the id lists run from the end of the index up to the stored end offset */
158 db->list_start = db->idx_end;
159 db->list_end = db->m + end_off;
/* document records fill the rest of the file */
160 db->docs_start = db->list_end;
161 db->docs_end = db->m + db->len;
/*
 * NOTE(review): the visible checks only compare against docs_end, and
 * only after the pointers were computed from file-supplied values.
 * Nothing shown here rejects list_end < idx_end or a negative
 * end_off -- confirm against the hidden lines.
 */
163 if (db->idx_end > db->docs_end)
165 if (db->list_end > db->docs_end)
/*
 * Open the database backed by fd: reset *db, measure the file with
 * lseek, map it read-only and privately, then parse the header with
 * initdb().  The return statements and the cleanup performed when
 * initdb() fails are not part of this excerpt.
 */
172 db_open(struct db *db, int fd)
174 memset(db, 0, sizeof(*db));
/* seeking to the end yields the file size, kept in db->len */
176 if ((db->len = lseek(fd, 0, SEEK_END)) == -1)
/* put the file offset back at the beginning */
179 if (lseek(fd, 0, SEEK_SET) == -1)
/* map the whole file from offset 0 */
182 db->m = mmap(NULL, db->len, PROT_READ, MAP_PRIVATE, fd, 0);
183 if (db->m == MAP_FAILED)
186 if (initdb(db) == -1) {
/*
 * Callback handed to db_listall() by db_stats(); d is the
 * struct db_stats being filled in.  What it updates is not visible in
 * this excerpt (presumably a document counter -- confirm).
 */
195 db_countdocs(struct db *db, struct db_entry *e, void *d)
197 struct db_stats *stats = d;
/*
 * Comparison function in bsearch(3) form: key is the word searched
 * for, elem points at an index entry, which begins with the word
 * field.
 */
204 db_idx_compar(const void *key, const void *elem)
206 const char *word = key;
207 const char *idx_entry = elem;
/*
 * The last byte of the DB_WORDLEN field must be NUL for the strcmp
 * below to stay inside the field; what is returned when it is not is
 * not visible here.
 */
209 if (idx_entry[DB_WORDLEN-1] != '\0')
211 return strcmp(word, idx_entry);
/*
 * Resolve the id list that an index entry refers to.  From the visible
 * lines: a position is copied out of the entry, the resulting pointer
 * is checked against the list section, a length l is read from it and
 * a pointer is returned as uint32_t *.
 *
 * The declarations, the pointer adjustments between these steps and
 * the store through *len are not part of this excerpt.
 */
214 static inline uint32_t *
215 db_getdocs(struct db *db, const uint8_t *entry, size_t *len)
/* presumably entry was advanced past the word field first -- hidden line, confirm */
221 memcpy(&pos, entry, sizeof(pos));
/*
 * NOTE(review): this accepts entry == db->list_end, and the memcpy
 * below then reads sizeof(l) bytes starting there.  Nothing visible
 * checks that the read, or the l ids after it, fit before list_end --
 * confirm against the hidden lines.
 */
224 if (entry < db->list_start || entry > db->list_end)
227 memcpy(&l, entry, sizeof(l));
/*
 * NOTE(review): the cast drops const and assumes the address is
 * suitably aligned for uint32_t; that depends on DB_WORDLEN, which is
 * defined elsewhere -- confirm.
 */
230 return (uint32_t *)entry;
/*
 * Look word up in the index and return its id list via db_getdocs(),
 * which also receives len.  The handling of a failed search is not
 * part of this excerpt.
 */
234 db_word_docs(struct db *db, const char *word, size_t *len)
/*
 * Binary search over nwords entries of IDX_ENTRY_SIZE bytes each,
 * starting at idx_start; the comparator argument is on a line not
 * shown here (presumably db_idx_compar).
 */
240 e = bsearch(word, db->idx_start, db->nwords, IDX_ENTRY_SIZE,
244 return db_getdocs(db, e, len);
/*
 * Fill *stats: walk every document through db_listall() with
 * db_countdocs as callback, record the number of words, then scan the
 * index for the longest word and for the word with the most
 * documents.
 *
 * The declaration and advancing of p, the word-length computation and
 * the failure returns are not part of this excerpt.
 */
248 db_stats(struct db *db, struct db_stats *stats)
251 size_t l, maxl = 0, idlen;
253 memset(stats, 0, sizeof(*stats));
255 if (db_listall(db, db_countdocs, stats) == -1)
258 stats->nwords = db->nwords;
261 while (p < db->idx_end) {
/* the word field must lie entirely inside the index */
262 if (p + DB_WORDLEN > db->idx_end)
/* and must be NUL terminated within DB_WORDLEN bytes */
265 if (p[DB_WORDLEN-1] != '\0')
/* stores a pointer into the mapping, not a copy */
271 stats->longest_word = p;
/* idlen receives the number of documents listed for this word */
274 if (db_getdocs(db, p, &idlen) == NULL)
277 if (idlen > stats->most_popular_ndocs) {
278 stats->most_popular_ndocs = idlen;
279 stats->most_popular = p;
/*
 * Decode one document record starting at p.  The field layout is given
 * in the comment inside the body.  Filling in *e and the returned
 * pointer are not part of this excerpt; db_listall() and
 * db_doc_by_id() treat a NULL return as failure.
 */
288 static inline uint8_t *
289 db_extract_doc(struct db *db, uint8_t *p, struct db_entry *e)
291 uint16_t namelen, descrlen;
294 * namelen[2] name[namelen]
295 * descrlen[2] descr[descrlen]
/* the 2-byte name length must fit before docs_end */
298 if (p + 2 > db->docs_end)
300 memcpy(&namelen, p, sizeof(namelen));
301 p += sizeof(namelen);
/*
 * NOTE(review): when p + namelen == db->docs_end the first test is
 * false and p[namelen] is then read at docs_end, which is db->m +
 * db->len per initdb(), i.e. one byte past the mapped file.  ">="
 * looks intended -- confirm.
 */
303 if (p + namelen > db->docs_end || p[namelen] != '\0')
/* same bound for the 2-byte description length */
308 if (p + 2 > db->docs_end)
310 memcpy(&descrlen, p, sizeof(descrlen));
311 p += sizeof(descrlen);
/* NOTE(review): same one-past-the-end read as for the name when p + descrlen == db->docs_end */
313 if (p + descrlen > db->docs_end || p[descrlen] != '\0')
/*
 * Call cb(db, &e, data) for each document record between docs_start
 * and docs_end, decoding them one after the other with
 * db_extract_doc().  The return values are not part of this excerpt.
 */
322 db_listall(struct db *db, db_hit_cb cb, void *data)
324 uint8_t *p = db->docs_start;
326 while (p < db->docs_end) {
/* db_extract_doc() yields the position to continue from, or NULL on a malformed record */
329 if ((p = db_extract_doc(db, p, &e)) == NULL)
/* the callback's -1 is tested explicitly */
332 if (cb(db, &e, data) == -1)
/*
 * Decode document records sequentially from docs_start into *e.  How
 * docid is matched against the position in the sequence is not
 * visible in this excerpt (presumably a running counter -- confirm).
 */
340 db_doc_by_id(struct db *db, int docid, struct db_entry *e)
342 uint8_t *p = db->docs_start;
345 while (p < db->docs_end) {
/* NULL from db_extract_doc() means the record could not be decoded */
346 if ((p = db_extract_doc(db, p, e)) == NULL)
/*
 * Unmap the database and clear *db, so every pointer it held into the
 * mapping is reset to zero.
 */
359 db_close(struct db *db)
/* the result of munmap() is not checked */
361 munmap(db->m, db->len);
362 memset(db, 0, sizeof(*db));