commit e2b5610c2d39617b564fa64f4201e071a7499abd from: Omar Polo date: Mon Apr 11 17:08:18 2022 UTC initial import commit - /dev/null commit + e2b5610c2d39617b564fa64f4201e071a7499abd blob - /dev/null blob + 9b0e3f7dcc0943391bae90ec05fdc8ffe50e020b (mode 644) --- /dev/null +++ Makefile @@ -0,0 +1,3 @@ +SUBDIR = ftsearch mkftsidx + +.include blob - /dev/null blob + b3984eb9b4ac4e43aca78e00aff2fba2d08456cd (mode 644) --- /dev/null +++ ftsearch/Makefile @@ -0,0 +1,12 @@ +.PATH:${.CURDIR}/../lib + +PROG = ftsearch +SRCS = ftsearch.c db.c fts.c tokenize.c + +WARNINGS = yes + +DEBUG = -O0 -g + +CPPFLAGS += -I${.CURDIR}/../include + +.include blob - /dev/null blob + 6ce50b9966e4d78deb8fad9b01cc3565bf5e5177 (mode 644) --- /dev/null +++ ftsearch/ftsearch.1 @@ -0,0 +1,70 @@ +.\" Copyright (c) 2022 Omar Polo +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.Dd April 11, 2022 +.Dt FTSEARCH 1 +.Os +.Sh NAME +.Nm ftsearch +.Nd search strings quickly +.Sh SYNOPSIS +.Nm +.Bk -words +.Op Fl d Ar dbpath +.Op Fl l +.Op Fl s +.Op Ar query +.Ek +.Sh DESCRIPTION +The +.Nm +utility searches a database for all documents which match the given +.Ar query . +The database needs to be created beforehand with +.Xr mkftsidx 1 . +.Pp +The arguments are as follows +.Bl -tag -width 9m +.It Fl d Ar dbpath +Path to the database. +.Pa db +by default. +.It Fl l +List all known documents. +Conflicts with +.Fl s +and +.Ar query . +.It Fl s +Print database stats. +Conflicts with +.Fl l +and +.Ar query . +.It Ar query +The query to search for. +.El +.Sh EXAMPLES +Search document that match +.Dq file manager +.Bd -literal -offset indent +$ ftsearch 'file manager' +.Ed +.Sh SEE ALSO +.Xr mkftsidx 1 +.Sh AUTHORS +.An -nosplit +The +.Nm +program was written by +.An Omar Polo Aq Mt op@omarpolo.com . blob - /dev/null blob + a8f4be0dadf83a849771058b16a03a7e0ad949e7 (mode 644) --- /dev/null +++ ftsearch/ftsearch.c @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "db.h" +#include "fts.h" +#include "tokenize.h" + +const char *dbpath; + +static void __dead +usage(void) +{ + fprintf(stderr, "usage: %s [-d db] -l | -s | query", + getprogname()); + exit(1); +} + +static int +print_entry(struct db *db, struct db_entry *entry, void *data) +{ + printf("%-18s %s\n", entry->name, entry->descr); + return 0; +} + +int +main(int argc, char **argv) +{ + struct db db; + const char *errstr; + int fd, ch; + int list = 0, stats = 0, docid = -1; + + while ((ch = getopt(argc, argv, "d:lp:s")) != -1) { + switch (ch) { + case 'd': + dbpath = optarg; + break; + case 'l': + list = 1; + break; + case 'p': + docid = strtonum(optarg, 0, INT_MAX, &errstr); + if (errstr != NULL) + errx(1, "document id is %s: %s", errstr, + optarg); + break; + case 's': + stats = 1; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (dbpath == NULL) + dbpath = "db"; + + if (list && stats) + usage(); + + if ((fd = open(dbpath, O_RDONLY)) == -1) + err(1, "can't open %s", dbpath); + + if (pledge("stdio", NULL) == -1) + err(1, "pledge"); + + if (db_open(&db, fd) == -1) + err(1, "db_open"); + + if (list) { + if (db_listall(&db, print_entry, NULL) == -1) + err(1, "db_listall"); + } else if (stats) { + struct db_stats st; + + if (db_stats(&db, &st) == -1) + err(1, "db_stats"); + printf("unique words = %zu\n", st.nwords); + printf("documents = %zu\n", st.ndocs); + printf("longest word = %s\n", st.longest_word); + printf("most popular = %s (%zu)\n", st.most_popular, + st.most_popular_ndocs); + } else if (docid != -1) { + struct db_entry e; + + if (db_doc_by_id(&db, docid, &e) == -1) + errx(1, "failed to fetch document #%d", docid); + print_entry(&db, &e, NULL); + } else { + if (argc != 1) + usage(); + if (fts(&db, *argv, print_entry, NULL) == -1) + errx(1, "fts failed"); + } + + db_close(&db); + close(fd); +} blob - /dev/null blob + cdc1261027504a8f1627c76103148100ca364d9e (mode 644) --- /dev/null +++ include/db.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#define DB_VERSION 0 +#define DB_WORDLEN 32 + +struct db { + uint8_t *m; + off_t len; + uint32_t version; + uint32_t nwords; + + uint8_t *idx_start; + uint8_t *idx_end; + uint8_t *list_start; + uint8_t *list_end; + uint8_t *docs_start; + uint8_t *docs_end; +}; + +struct db_stats { + size_t nwords; + size_t ndocs; + const char *longest_word; + const char *most_popular; + size_t most_popular_ndocs; +}; + +struct db_entry { + char *name; + char *descr; +}; + +typedef int (*db_hit_cb)(struct db *, struct db_entry *, void *); + +struct dictionary; + +int db_create(FILE *, struct dictionary *, struct db_entry *, size_t); +int db_open(struct db *, int); +uint32_t *db_word_docs(struct db *, const char *, size_t *); +int db_stats(struct db *, struct db_stats *); +int db_listall(struct db *, db_hit_cb, void *); +int db_doc_by_id(struct db *, int, struct db_entry *); +void db_close(struct db *); blob - /dev/null blob + 81fa7f21a625ed303a114738d5fd74e1502c060c (mode 644) --- /dev/null +++ include/dictionary.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +struct dict_entry { + char *word; + int *ids; + size_t len; + size_t cap; +}; + +struct dictionary { + size_t len; + size_t cap; + struct dict_entry *entries; +}; + +int dictionary_init(struct dictionary *); +int dictionary_add(struct dictionary *, const char *, int); +int dictionary_add_words(struct dictionary *, char **, int); +void dictionary_free(struct dictionary *); blob - /dev/null blob + 6196c9ed15110d609b814c34d240a3d166f6d2a6 (mode 644) --- /dev/null +++ include/fts.h @@ -0,0 +1,17 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +int fts(struct db *, const char *, db_hit_cb, void *); blob - /dev/null blob + c5da1d7a8f37fcb0a9ef861f311a0a9e49471dbc (mode 644) --- /dev/null +++ include/tokenize.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +char **tokenize(const char *); +void freetoks(char **); blob - /dev/null blob + 1f878b9e23e536a58cc047f99c5432d61d23209c (mode 644) --- /dev/null +++ lib/db.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include + +#include +#include +#include +#include +#include + +#include "db.h" +#include "dictionary.h" + +#define IDX_ENTRY_SIZE (DB_WORDLEN + sizeof(int64_t)) + +static int +write_dictionary(FILE *fp, struct dictionary *dict) +{ + off_t start; + uint64_t pos; + uint32_t n; + size_t i, len; + + if ((uint64_t)dict->len > UINT32_MAX) + return -1; + + n = dict->len; + if (fwrite(&n, sizeof(n), 1, fp) != 1) + return -1; + + if ((start = ftello(fp)) == -1) + return -1; + + len = DB_WORDLEN + sizeof(int64_t); + pos = start + (n * len); + for (i = 0; i < dict->len; ++i) { + char word[DB_WORDLEN]; + + memset(word, 0, sizeof(word)); + strlcpy(word, dict->entries[i].word, sizeof(word)); + if (fwrite(word, sizeof(word), 1, fp) != 1) + return -1; + + if (fwrite(&pos, sizeof(pos), 1, fp) != 1) + return -1; + + /* one for the len */ + pos += sizeof(uint32_t) * (dict->entries[i].len + 1); + } + + for (i = 0; i < dict->len; ++i) { + size_t j; + uint32_t t, x; + + x = dict->entries[i].len; + if (fwrite(&x, sizeof(x), 1, fp) != 1) + return -1; + + for (j = 0; j < x; ++j) { + t = dict->entries[i].ids[j]; + if (fwrite(&t, sizeof(t), 1, fp) != 1) + return -1; + } + } + + return 0; +} + +int +db_create(FILE *fp, struct dictionary *dict, struct db_entry *entries, + size_t n) +{ + int64_t endidx; + size_t i; + uint32_t version = DB_VERSION; + + if (n > INT32_MAX) + return -1; + + if (fwrite(&version, sizeof(version), 1, fp) != 1) + return -1; + + /* reserve space for the start pointer -- filled later */ + if (fseek(fp, sizeof(int64_t), SEEK_CUR) == -1) + return -1; + + if (write_dictionary(fp, dict) == -1) + return -1; + + if ((endidx = ftello(fp)) == -1) + return -1; + + for (i = 0; i < n; ++i) { + uint16_t namelen, descrlen = 0; + + namelen = strlen(entries[i].name); + if (entries[i].descr != NULL) + descrlen = strlen(entries[i].descr); + + if (fwrite(&namelen, sizeof(namelen), 1, fp) != 1) + return -1; + if (fwrite(entries[i].name, namelen+1, 1, fp) != 1) + return -1; + + if (fwrite(&descrlen, sizeof(descrlen), 1, fp) != 1) + return -1; + if (descrlen > 0 && + fwrite(entries[i].descr, descrlen, 1, fp) != 1) + return -1; + if (fwrite("", 1, 1, fp) != 1) + return -1; + } + + if (fseek(fp, sizeof(version), SEEK_SET) == -1) + return -1; + + if (fwrite(&endidx, sizeof(endidx), 1, fp) != 1) + return -1; + + return 0; +} + +static int +initdb(struct db *db) +{ + off_t hdrlen = sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t); + int64_t end_off; + uint8_t *p = db->m; + + if (hdrlen > db->len) + return -1; + + memcpy(&db->version, p, sizeof(db->version)); + p += sizeof(db->version); + + memcpy(&end_off, p, sizeof(end_off)); + p += sizeof(end_off); + + memcpy(&db->nwords, p, sizeof(db->nwords)); + p += sizeof(db->nwords); + + db->idx_start = p; + db->idx_end = p + db->nwords * IDX_ENTRY_SIZE; + db->list_start = db->idx_end; + db->list_end = db->m + end_off; + db->docs_start = db->list_end; + db->docs_end = db->m + db->len; + + if (db->idx_end > db->docs_end) + return -1; + if (db->list_end > db->docs_end) + return -1; + + return 0; +} + +int +db_open(struct db *db, int fd) +{ + memset(db, 0, sizeof(*db)); + + if ((db->len = lseek(fd, 0, SEEK_END)) == -1) + return -1; + + if (lseek(fd, 0, SEEK_SET) == -1) + return -1; + + db->m = mmap(NULL, db->len, PROT_READ, MAP_PRIVATE, fd, 0); + if (db->m == MAP_FAILED) + return -1; + + if (initdb(db) == -1) { + db_close(db); + return -1; + } + + return 0; +} + +static int +db_countdocs(struct db *db, struct db_entry *e, void *d) +{ + struct db_stats *stats = d; + + stats->ndocs++; + return 0; +} + +static int +db_idx_compar(const void *key, const void *elem) +{ + const char *word = key; + const char *idx_entry = elem; + + if (idx_entry[DB_WORDLEN-1] != '\0') + return -1; + return strcmp(word, idx_entry); +} + +static inline uint32_t * +db_getdocs(struct db *db, const uint8_t *entry, size_t *len) +{ + int64_t pos; + uint32_t l; + + entry += DB_WORDLEN; + memcpy(&pos, entry, sizeof(pos)); + + entry = db->m + pos; + if (entry < db->list_start || entry > db->list_end) + return NULL; + + memcpy(&l, entry, sizeof(l)); + entry += sizeof(l); + *len = l; + return (uint32_t *)entry; +} + +uint32_t * +db_word_docs(struct db *db, const char *word, size_t *len) +{ + uint8_t *e; + + *len = 0; + + e = bsearch(word, db->idx_start, db->nwords, IDX_ENTRY_SIZE, + db_idx_compar); + if (e == NULL) + return NULL; + return db_getdocs(db, e, len); +} + +int +db_stats(struct db *db, struct db_stats *stats) +{ + const uint8_t *p; + size_t l, maxl = 0, idlen; + + memset(stats, 0, sizeof(*stats)); + + if (db_listall(db, db_countdocs, stats) == -1) + return -1; + + stats->nwords = db->nwords; + + p = db->idx_start; + while (p < db->idx_end) { + if (p + DB_WORDLEN > db->idx_end) + return -1; + + if (p[DB_WORDLEN-1] != '\0') + return -1; + + l = strlen(p); + if (l > maxl) { + maxl = l; + stats->longest_word = p; + } + + if (db_getdocs(db, p, &idlen) == NULL) + return -1; + + if (idlen > stats->most_popular_ndocs) { + stats->most_popular_ndocs = idlen; + stats->most_popular = p; + } + + p += IDX_ENTRY_SIZE; + } + + return 0; +} + +static inline uint8_t * +db_extract_doc(struct db *db, uint8_t *p, struct db_entry *e) +{ + uint16_t namelen, descrlen; + + /* + * namelen[2] name[namelen] + * descrlen[2] descr[descrlen] + */ + + if (p + 2 > db->docs_end) + return NULL; + memcpy(&namelen, p, sizeof(namelen)); + p += sizeof(namelen); + + if (p + namelen > db->docs_end || p[namelen] != '\0') + return NULL; + e->name = p; + p += namelen + 1; + + if (p + 2 > db->docs_end) + return NULL; + memcpy(&descrlen, p, sizeof(descrlen)); + p += sizeof(descrlen); + + if (p + descrlen > db->docs_end || p[descrlen] != '\0') + return NULL; + e->descr = p; + p += descrlen + 1; + + return p; +} + +int +db_listall(struct db *db, db_hit_cb cb, void *data) +{ + uint8_t *p = db->docs_start; + + while (p < db->docs_end) { + struct db_entry e; + + if ((p = db_extract_doc(db, p, &e)) == NULL) + return -1; + + if (cb(db, &e, data) == -1) + return -1; + } + + return 0; +} + +int +db_doc_by_id(struct db *db, int docid, struct db_entry *e) +{ + uint8_t *p = db->docs_start; + int n = 0; + + while (p < db->docs_end) { + if ((p = db_extract_doc(db, p, e)) == NULL) + return -1; + + if (n == docid) + return 0; + + n++; + } + + return -1; +} + +void +db_close(struct db *db) +{ + munmap(db->m, db->len); + memset(db, 0, sizeof(*db)); +} blob - /dev/null blob + 36390fb9a5b5e9e284b077a1a42c2cdff67c7059 (mode 644) --- /dev/null +++ lib/dictionary.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include + +#include "dictionary.h" + +int +dictionary_init(struct dictionary *dict) +{ + memset(dict, 0, sizeof(*dict)); + return 1; +} + +static inline int +add_docid(struct dict_entry *e, int docid) +{ + void *t; + size_t newcap; + + if (e->len > 0 && e->ids[e->len-1] == docid) + return 1; + + if (e->len == e->cap) { + newcap = e->cap * 1.5; + if (newcap == 0) + newcap = 8; + t = recallocarray(e->ids, e->cap, newcap, sizeof(*e->ids)); + if (t == NULL) + return 0; + e->ids = t; + e->cap = newcap; + } + + e->ids[e->len++] = docid; + return 1; +} + +int +dictionary_add(struct dictionary *dict, const char *word, int docid) +{ + struct dict_entry *e = NULL; + void *newentr; + size_t newcap, mid = 0, left = 0, right = dict->len; + int r = 0; + + while (left < right) { + mid = (left + right) / 2; + e = &dict->entries[mid]; + r = strcmp(word, e->word); + if (r < 0) + right = mid; + else if (r > 0) + left = mid + 1; + else + return add_docid(e, docid); + } + + if (r > 0) + mid++; + + if (dict->len == dict->cap) { + newcap = dict->cap * 1.5; + if (newcap == 0) + newcap = 8; + newentr = recallocarray(dict->entries, dict->cap, newcap, + sizeof(*dict->entries)); + if (newentr == NULL) + return 0; + dict->entries = newentr; + dict->cap = newcap; + } + + e = &dict->entries[mid]; + if (e != dict->entries + dict->len) { + size_t i = e - dict->entries; + memmove(e+1, e, sizeof(*e) * (dict->len - i)); + } + + dict->len++; + memset(e, 0, sizeof(*e)); + if ((e->word = strdup(word)) == NULL) + return 0; + return add_docid(e, docid); +} + +int +dictionary_add_words(struct dictionary *dict, char **words, int docid) +{ + for (; *words != NULL; ++words) { + if (!dictionary_add(dict, *words, docid)) + return 0; + } + + return 1; +} + +void +dictionary_free(struct dictionary *dict) +{ + size_t i; + + for (i = 0; i < dict->len; ++i) { + free(dict->entries[i].word); + free(dict->entries[i].ids); + } + + free(dict->entries); +} blob - /dev/null blob + 1e16be4b854b5e95f47bb7ad1060e66867c040cb (mode 644) --- /dev/null +++ lib/fts.c @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "db.h" +#include "fts.h" +#include "tokenize.h" + +struct doclist { + uint32_t *ids; + size_t len; +}; + +int +fts(struct db *db, const char *query, db_hit_cb cb, void *data) +{ + struct doclist *xs = NULL; + size_t i, len; + char **toks, **t; + int ret = 0; + + if ((toks = tokenize(query)) == NULL) + return -1; + + len = 0; + for (t = toks; *t != NULL; ++t) + len++; + + if (len == 0) + goto done; + + if ((xs = calloc(len, sizeof(*xs))) == NULL) { + freetoks(toks); + return -1; + } + + for (i = 0; i < len; ++i) { + xs[i].ids = db_word_docs(db, toks[i], &xs[i].len); + if (xs[i].ids == NULL || xs[i].len == 0) + goto done; + } + + for (;;) { + struct db_entry e; + uint32_t mdoc; + + mdoc = xs[0].ids[0]; + for (i = 1; i < len; ++i) { + if (xs[i].ids[0] > mdoc) + goto next; + while (xs[i].ids[0] < mdoc) { + if (--xs[i].len == 0) + goto done; + xs[i].ids++; + } + + if (xs[i].ids[0] != mdoc) + goto next; + } + + if (db_doc_by_id(db, mdoc, &e) == -1) { + ret = -1; + goto done; + } + + if (cb(db, &e, data) == -1) { + ret = -1; + goto done; + } + + next: + if (--xs[0].len == 0) + goto done; + xs[0].ids++; + } + +done: + free(xs); + freetoks(toks); + + return ret; +} blob - /dev/null blob + d07eb30cb1b222a478848edaccfeefadee59a8d9 (mode 644) --- /dev/null +++ lib/tokenize.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "tokenize.h" + +#ifndef WDELIMS +/* everything but a-zA-Z */ +#define WDELIMS " \t\n!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~" +#endif + +char ** +tokenize(const char *s) +{ + char *d, *dup, *t, **tok = NULL; + void *newtok; + size_t cap = 0, len = 0, newcap; + + if ((dup = strdup(s)) == NULL) + return NULL; + d = dup; + + for (t = d; *t; ++t) + *t = tolower(*t); + + while ((t = strsep(&d, WDELIMS)) != NULL) { + if (*t == '\0') + continue; + + /* keep the space for a NULL terminator */ + if (len+1 >= cap) { + newcap = cap * 1.5; + if (newcap == 0) + newcap = 8; + newtok = recallocarray(tok, cap, newcap, + sizeof(char *)); + if (newtok == NULL) + goto err; + tok = newtok; + cap = newcap; + } + + if ((tok[len++] = strdup(t)) == NULL) + goto err; + } + + free(dup); + return tok; + +err: + freetoks(tok); + free(dup); + return NULL; +} + +void +freetoks(char **tok) +{ + char **i; + + if (tok == NULL) + return; + + for (i = tok; *i != NULL; ++i) + free(*i); + free(tok); +} blob - /dev/null blob + 6ff8e82141f23f626139263f9fa9a0af829f73d0 (mode 644) --- /dev/null +++ mkftsidx/Makefile @@ -0,0 +1,23 @@ +.PATH:${.CURDIR}/../lib + +PROG = mkftsidx +SRCS = mkftsidx.c ports.c wiki.c db.c dictionary.c tokenize.c + +WARNINGS = yes + +CPPFLAGS += -I/usr/local/include -I${.CURDIR}/../include +LDADD = -lexpat -lsqlite3 -L/usr/local/lib + +.if defined(PROFILE) +CPPFLAGS += -DPROFILE +LDADD += -static -lm -lpthread +DEBUG = -pg +.endif + +DEBUG += -O0 -g + +show-prof: + gprof mkftsidx ../gmon.out | gprof2dot | dot -Tpng > profile.png + nsxiv profile.png & + +.include blob - /dev/null blob + 74558e22e0c5946bae114760b939a148fd8547c5 (mode 644) --- /dev/null +++ mkftsidx/mkftsidx.1 @@ -0,0 +1,74 @@ +.\" Copyright (c) 2022 Omar Polo +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.Dd April 11, 2022 +.Dt MKFTSIDX 1 +.Os +.Sh NAME +.Nm mkftsidx +.Nd construct fts database +.Sh SYNOPSIS +.Nm +.Bk -words +.Op Fl o Ar dbpath +.Op Fl m Ar p|w +.Op Ar path +.Ek +.Sh DESCRIPTION +.Nm +is a program to create a fts database for +.Xr ftsearch 1 . +The arguments are as follows: +.Bl -tag -width Ds +.It Fl o Ar dbpath +Path to the database file to create. +.Pa db +by default. +.It Fl m Ar p|w +Set the mode. +If +.Ar p +.Pq the default +then create a database with the +.Ox +ports tree data, +otherwise creates a database from a Wikipedia dump. +.It Ar path +Path to the sources. +When working in +.Ar p +mode, it's the optional path to the sqlports database. +Otherwise, it's the mandatory path to the Wikipedia file dump. +.El +.Sh EXAMPLES +To create a database with the +.Ox +ports tree content: +.Bd -literal -offset indent +$ mkftsidx +.Ed +.Pp +To create a database from a Wikipedia dump to the +.Pa db.wiki +file: +.Bd -literal -offset indent +$ mkftsidx -o db.wiki -mw enwiki-latest-abstract1.xml +.Ed +.Sh SEE ALSO +.Xr ftsearch 1 +.Sh AUTHORS +.An -nosplit +The +.Nm +program was written by +.An Omar Polo Aq Mt op@omarpolo.com . blob - /dev/null blob + a5c9ba3665c026d1c43cd6b584753e845cb708ab (mode 644) --- /dev/null +++ mkftsidx/mkftsidx.c @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include + +#include "db.h" +#include "dictionary.h" + +#include "mkftsidx.h" + +enum { + MODE_SQLPORTS, + MODE_WIKI, +}; + +char * +xstrdup(const char *s) +{ + char *t; + + if (s == NULL) + return NULL; + + if ((t = strdup(s)) == NULL) + err(1, "strdup"); + return t; +} + +__dead void +usage(void) +{ + fprintf(stderr, "usage: %s [-o dbpath] [-m p|w] [path]\n", + getprogname()); + exit(1); +} + +int +main(int argc, char **argv) +{ + struct dictionary dict; + struct db_entry *entries = NULL; + const char *dbpath = NULL; + FILE *fp; + size_t i, len = 0; + int ch, r = 0, mode = MODE_SQLPORTS; + +#ifndef PROFILE + /* sqlite needs flock */ + if (pledge("stdio rpath wpath cpath flock", NULL) == -1) + err(1, "pledge"); +#endif + + while ((ch = getopt(argc, argv, "m:o:")) != -1) { + switch (ch) { + case 'm': + switch (*optarg) { + case 'p': + mode = MODE_SQLPORTS; + break; + case 'w': + mode = MODE_WIKI; + break; + default: + usage(); + } + break; + case 'o': + dbpath = optarg; + break; + default: + usage(); + } + } + argc -= optind; + argv += optind; + + if (dbpath == NULL) + dbpath = "db"; + + if (!dictionary_init(&dict)) + err(1, "dictionary_init"); + + if (mode == MODE_SQLPORTS) + r = idx_ports(&dict, &entries, &len, argc, argv); + else + r = idx_wiki(&dict, &entries, &len, argc, argv); + + if (r == 0) { + if ((fp = fopen(dbpath, "w+")) == NULL) + err(1, "can't open %s", dbpath); + if (db_create(fp, &dict, entries, len) == -1) { + warn("db_create"); + unlink(dbpath); + r = 1; + } + fclose(fp); + } + + for (i = 0; i < len; ++i) { + free(entries[i].name); + free(entries[i].descr); + } + free(entries); + dictionary_free(&dict); + + return r; +} blob - /dev/null blob + e64b8c34a543829be5f9855d096e818fc30e2597 (mode 644) --- /dev/null +++ mkftsidx/mkftsidx.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* mkftsidx.c */ +__dead void usage(void); +char *xstrdup(const char *); + +/* ports.c */ +int idx_ports(struct dictionary *, struct db_entry **, size_t *, + int, char **); + +/* wiki.c */ +int idx_wiki(struct dictionary *, struct db_entry **, size_t *, + int, char **); blob - /dev/null blob + aaad95dea48ebc60c0071734e949b74faf9c0eed (mode 644) --- /dev/null +++ mkftsidx/ports.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include + +#include "db.h" +#include "dictionary.h" +#include "tokenize.h" + +#include "mkftsidx.h" + +#ifndef SQLPORTS +#define SQLPORTS "/usr/local/share/sqlports" +#endif + +#define QNUM "select count(*) from portsq;" +#define QALL "select pkgstem, comment, descr_contents from portsq;" + +static int +countports(sqlite3 *db) +{ + sqlite3_stmt *stmt; + int r, n = -1; + + r = sqlite3_prepare_v2(db, QNUM, -1, &stmt, NULL); + if (r != SQLITE_OK) { + warnx("failed to prepare statement: %s", + sqlite3_errstr(r)); + return -1; + } + + r = sqlite3_step(stmt); + if (r == SQLITE_ROW) + n = sqlite3_column_int(stmt, 0); + + sqlite3_finalize(stmt); + return n; +} + +int +idx_ports(struct dictionary *dict, struct db_entry **entries, size_t *len, + int argc, char **argv) +{ + const char *dbpath; + sqlite3 *db; + sqlite3_stmt *stmt; + size_t i; + int r; + + if (argc > 1) + usage(); + else if (argc == 1) + dbpath = *argv; + else + dbpath = SQLPORTS; + + if ((r = sqlite3_open(dbpath, &db)) != SQLITE_OK) + errx(1, "can't open %s: %s", dbpath, sqlite3_errstr(r)); + + if ((r = countports(db)) == -1 || r == 0) { + warnx("error querying the db or empty portsq table!"); + goto done; + } + *len = r; + + if ((*entries = calloc(*len, sizeof(**entries))) == NULL) + err(1, "calloc"); + + r = sqlite3_prepare_v2(db, QALL, -1, &stmt, NULL); + if (r != SQLITE_OK) + errx(1, "failed to prepare statement: %s", sqlite3_errstr(r)); + + for (i = 0; i < *len; ++i) { + const char *pkgstem, *comment, *descr; + char *doc, **toks; + + r = sqlite3_step(stmt); + if (r == SQLITE_DONE) + break; + if (r != SQLITE_ROW) + errx(1, "sqlite3_step: %s", sqlite3_errstr(r)); + + pkgstem = sqlite3_column_text(stmt, 0); + comment = sqlite3_column_text(stmt, 1); + descr = sqlite3_column_text(stmt, 2); + + (*entries)[i].name = xstrdup(pkgstem); + (*entries)[i].descr = xstrdup(comment); + + r = asprintf(&doc, "%s %s %s", pkgstem, + comment != NULL ? comment : "", + descr != NULL ? descr : ""); + if (r == -1) + err(1, "asprintf"); + + if ((toks = tokenize(doc)) == NULL) + err(1, "tokenize"); + if (!dictionary_add_words(dict, toks, i)) + err(1, "dictionary_add_words"); + freetoks(toks); + free(doc); + } + +done: + sqlite3_close(db); + return 0; +} blob - /dev/null blob + a2fa1f2751eaa0c92f8c65cfe384e8ed9f4f018d (mode 644) --- /dev/null +++ mkftsidx/wiki.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include + +#include "db.h" +#include "dictionary.h" +#include "tokenize.h" + +#include "mkftsidx.h" + +enum { + N_UNK, + N_TIT, + N_URL, + N_ABS, +}; + +struct mydata { + struct dictionary *dict; + struct db_entry *entries; + size_t len; + size_t cap; + + int next; + char *title; + char *url; + char *abstract; +}; + +static void +el_start(void *data, const char *element, const char **attr) +{ + struct mydata *d = data; + + if (!strcmp(element, "title")) { + d->next = N_TIT; + } else if (!strcmp(element, "url")) { + d->next = N_URL; + } else if (!strcmp(element, "abstract")) { + d->next = N_ABS; + } +} + +static void +append_text(char **text, const char *s, int len) +{ + char *t, *out, *orig; + + if ((t = calloc(1, len + 1)) == NULL) + err(1, "calloc"); + memcpy(t, s, len); + + if ((orig = *text) == NULL) + orig = ""; + if (asprintf(&out, "%s%s", orig, t) == -1) + err(1, "asprintf"); + free(*text); + *text = out; + free(t); +} + +static void +on_text(void *data, const char *s, int len) +{ + struct mydata *d = data; + + switch (d->next) { + case N_TIT: + append_text(&d->title, s, len); + break; + case N_URL: + append_text(&d->url, s, len); + break; + case N_ABS: + append_text(&d->abstract, s, len); + break; + default: + break; + } +} + +static void +el_end(void *data, const char *element) +{ + struct mydata *d = data; + struct db_entry *e; + size_t newcap; + const char *title; + char *doc, **toks; + void *t; + int r, next; + + next = d->next; + d->next = N_UNK; + if ((next == N_TIT && !strcmp(element, "title")) || + (next == N_URL && !strcmp(element, "url")) || + (next == N_ABS && !strcmp(element, "abstract")) || + strcmp(element, "doc")) + return; + + if (d->len == d->cap) { + newcap = d->cap * 1.5; + if (newcap == 0) + newcap = 8; + t = recallocarray(d->entries, d->cap, newcap, + sizeof(*d->entries)); + if (t == NULL) + err(1, "recallocarray"); + d->entries = t; + d->cap = newcap; + } + + title = d->title; + if (!strncmp(title, "Wikipedia: ", 11)) + title += 11; + + e = &d->entries[d->len++]; + e->name = xstrdup(d->url); + e->descr = xstrdup(title); + + if (d->len % 1000 == 0) + printf("=> %zu\n", d->len); + + r = asprintf(&doc, "%s %s", title, d->abstract); + if (r == -1) + err(1, "asprintf"); + + if ((toks = tokenize(doc)) != NULL) { + if (!dictionary_add_words(d->dict, toks, d->len-1)) + err(1, "dictionary_add_words"); + freetoks(toks); + } + free(doc); + + free(d->title); + free(d->url); + free(d->abstract); + + d->title = NULL; + d->url = NULL; + d->abstract = NULL; +} + +int +idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len, + int argc, char **argv) +{ + struct mydata d; + XML_Parser parser; + const char *xmlpath; + char buf[BUFSIZ]; + int done = 0; + FILE *fp; + size_t r; + + if (argc != 1) { + warnx("missing path to xml file"); + usage(); + } + xmlpath = *argv; + + memset(&d, 0, sizeof(d)); + d.dict = dict; + + if ((parser = XML_ParserCreate(NULL)) == NULL) + err(1, "XML_ParserCreate"); + XML_SetUserData(parser, &d); + XML_SetElementHandler(parser, el_start, el_end); + XML_SetCharacterDataHandler(parser, on_text); + + if ((fp = fopen(xmlpath, "r")) == NULL) + err(1, "can't open %s", xmlpath); + + do { + r = fread(buf, 1, sizeof(buf), fp); + done = r != sizeof(buf); + if (!XML_Parse(parser, buf, r, done)) + errx(1, "can't parse: %s at %s:%lu", + XML_ErrorString(XML_GetErrorCode(parser)), + xmlpath, + XML_GetCurrentLineNumber(parser)); + } while (!done); + + fclose(fp); + XML_ParserFree(parser); + + *len = d.len; + *entries = d.entries; + + return 0; +}