Blob


1 /*
2 * Copyright (c) 2022 Omar Polo <op@openbsd.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
17 #include <sys/mman.h>
19 #include <stdio.h>
20 #include <stdint.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <unistd.h>
25 #include "db.h"
26 #include "dictionary.h"
28 #define IDX_ENTRY_SIZE (DB_WORDLEN + sizeof(int64_t))
30 static int
31 write_dictionary(FILE *fp, struct dictionary *dict)
32 {
33 off_t start;
34 uint64_t pos;
35 uint32_t n;
36 size_t i, len;
38 if ((uint64_t)dict->len > UINT32_MAX)
39 return -1;
41 n = dict->len;
42 if (fwrite(&n, sizeof(n), 1, fp) != 1)
43 return -1;
45 if ((start = ftello(fp)) == -1)
46 return -1;
48 len = DB_WORDLEN + sizeof(int64_t);
49 pos = start + (n * len);
50 for (i = 0; i < dict->len; ++i) {
51 char word[DB_WORDLEN];
53 memset(word, 0, sizeof(word));
54 strlcpy(word, dict->entries[i].word, sizeof(word));
55 if (fwrite(word, sizeof(word), 1, fp) != 1)
56 return -1;
58 if (fwrite(&pos, sizeof(pos), 1, fp) != 1)
59 return -1;
61 /* one for the len */
62 pos += sizeof(uint32_t) * (dict->entries[i].len + 1);
63 }
65 for (i = 0; i < dict->len; ++i) {
66 size_t j;
67 uint32_t t, x;
69 x = dict->entries[i].len;
70 if (fwrite(&x, sizeof(x), 1, fp) != 1)
71 return -1;
73 for (j = 0; j < x; ++j) {
74 t = dict->entries[i].ids[j];
75 if (fwrite(&t, sizeof(t), 1, fp) != 1)
76 return -1;
77 }
78 }
80 return 0;
81 }
83 int
84 db_create(FILE *fp, struct dictionary *dict, struct db_entry *entries,
85 size_t n)
86 {
87 int64_t endidx;
88 size_t i;
89 uint32_t version = DB_VERSION;
91 if (n > INT32_MAX)
92 return -1;
94 if (fwrite(&version, sizeof(version), 1, fp) != 1)
95 return -1;
97 /* reserve space for the start pointer -- filled later */
98 if (fseek(fp, sizeof(int64_t), SEEK_CUR) == -1)
99 return -1;
101 if (write_dictionary(fp, dict) == -1)
102 return -1;
104 if ((endidx = ftello(fp)) == -1)
105 return -1;
107 for (i = 0; i < n; ++i) {
108 uint16_t namelen, descrlen = 0;
110 namelen = strlen(entries[i].name);
111 if (entries[i].descr != NULL)
112 descrlen = strlen(entries[i].descr);
114 if (fwrite(&namelen, sizeof(namelen), 1, fp) != 1)
115 return -1;
116 if (fwrite(entries[i].name, namelen+1, 1, fp) != 1)
117 return -1;
119 if (fwrite(&descrlen, sizeof(descrlen), 1, fp) != 1)
120 return -1;
121 if (descrlen > 0 &&
122 fwrite(entries[i].descr, descrlen, 1, fp) != 1)
123 return -1;
124 if (fwrite("", 1, 1, fp) != 1)
125 return -1;
128 if (fseek(fp, sizeof(version), SEEK_SET) == -1)
129 return -1;
131 if (fwrite(&endidx, sizeof(endidx), 1, fp) != 1)
132 return -1;
134 return 0;
137 static int
138 initdb(struct db *db)
140 off_t hdrlen = sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t);
141 int64_t end_off;
142 uint8_t *p = db->m;
144 if (hdrlen > db->len)
145 return -1;
147 memcpy(&db->version, p, sizeof(db->version));
148 p += sizeof(db->version);
150 memcpy(&end_off, p, sizeof(end_off));
151 p += sizeof(end_off);
153 memcpy(&db->nwords, p, sizeof(db->nwords));
154 p += sizeof(db->nwords);
156 db->idx_start = p;
157 db->idx_end = p + db->nwords * IDX_ENTRY_SIZE;
158 db->list_start = db->idx_end;
159 db->list_end = db->m + end_off;
160 db->docs_start = db->list_end;
161 db->docs_end = db->m + db->len;
163 if (db->idx_end > db->docs_end)
164 return -1;
165 if (db->list_end > db->docs_end)
166 return -1;
168 return 0;
171 int
172 db_open(struct db *db, int fd)
174 memset(db, 0, sizeof(*db));
176 if ((db->len = lseek(fd, 0, SEEK_END)) == -1)
177 return -1;
179 if (lseek(fd, 0, SEEK_SET) == -1)
180 return -1;
182 db->m = mmap(NULL, db->len, PROT_READ, MAP_PRIVATE, fd, 0);
183 if (db->m == MAP_FAILED)
184 return -1;
186 if (initdb(db) == -1) {
187 db_close(db);
188 return -1;
191 return 0;
194 static int
195 db_countdocs(struct db *db, struct db_entry *e, void *d)
197 struct db_stats *stats = d;
199 stats->ndocs++;
200 return 0;
203 static int
204 db_idx_compar(const void *key, const void *elem)
206 const char *word = key;
207 const char *idx_entry = elem;
209 if (idx_entry[DB_WORDLEN-1] != '\0')
210 return -1;
211 return strcmp(word, idx_entry);
214 static inline uint32_t *
215 db_getdocs(struct db *db, const uint8_t *entry, size_t *len)
217 int64_t pos;
218 uint32_t l;
220 entry += DB_WORDLEN;
221 memcpy(&pos, entry, sizeof(pos));
223 entry = db->m + pos;
224 if (entry < db->list_start || entry > db->list_end)
225 return NULL;
227 memcpy(&l, entry, sizeof(l));
228 entry += sizeof(l);
229 *len = l;
230 return (uint32_t *)entry;
233 uint32_t *
234 db_word_docs(struct db *db, const char *word, size_t *len)
236 uint8_t *e;
238 *len = 0;
240 e = bsearch(word, db->idx_start, db->nwords, IDX_ENTRY_SIZE,
241 db_idx_compar);
242 if (e == NULL)
243 return NULL;
244 return db_getdocs(db, e, len);
247 int
248 db_stats(struct db *db, struct db_stats *stats)
250 const uint8_t *p;
251 size_t l, maxl = 0, idlen;
253 memset(stats, 0, sizeof(*stats));
255 if (db_listall(db, db_countdocs, stats) == -1)
256 return -1;
258 stats->nwords = db->nwords;
260 p = db->idx_start;
261 while (p < db->idx_end) {
262 if (p + DB_WORDLEN > db->idx_end)
263 return -1;
265 if (p[DB_WORDLEN-1] != '\0')
266 return -1;
268 l = strlen(p);
269 if (l > maxl) {
270 maxl = l;
271 stats->longest_word = p;
274 if (db_getdocs(db, p, &idlen) == NULL)
275 return -1;
277 if (idlen > stats->most_popular_ndocs) {
278 stats->most_popular_ndocs = idlen;
279 stats->most_popular = p;
282 p += IDX_ENTRY_SIZE;
285 return 0;
288 static inline uint8_t *
289 db_extract_doc(struct db *db, uint8_t *p, struct db_entry *e)
291 uint16_t namelen, descrlen;
293 /*
294 * namelen[2] name[namelen]
295 * descrlen[2] descr[descrlen]
296 */
298 if (p + 2 > db->docs_end)
299 return NULL;
300 memcpy(&namelen, p, sizeof(namelen));
301 p += sizeof(namelen);
303 if (p + namelen > db->docs_end || p[namelen] != '\0')
304 return NULL;
305 e->name = p;
306 p += namelen + 1;
308 if (p + 2 > db->docs_end)
309 return NULL;
310 memcpy(&descrlen, p, sizeof(descrlen));
311 p += sizeof(descrlen);
313 if (p + descrlen > db->docs_end || p[descrlen] != '\0')
314 return NULL;
315 e->descr = p;
316 p += descrlen + 1;
318 return p;
321 int
322 db_listall(struct db *db, db_hit_cb cb, void *data)
324 uint8_t *p = db->docs_start;
326 while (p < db->docs_end) {
327 struct db_entry e;
329 if ((p = db_extract_doc(db, p, &e)) == NULL)
330 return -1;
332 if (cb(db, &e, data) == -1)
333 return -1;
336 return 0;
339 int
340 db_doc_by_id(struct db *db, int docid, struct db_entry *e)
342 uint8_t *p = db->docs_start;
343 int n = 0;
345 while (p < db->docs_end) {
346 if ((p = db_extract_doc(db, p, e)) == NULL)
347 return -1;
349 if (n == docid)
350 return 0;
352 n++;
355 return -1;
358 void
359 db_close(struct db *db)
361 munmap(db->m, db->len);
362 memset(db, 0, sizeof(*db));