commit - /dev/null
commit + e2b5610c2d39617b564fa64f4201e071a7499abd
blob - /dev/null
blob + 9b0e3f7dcc0943391bae90ec05fdc8ffe50e020b (mode 644)
--- /dev/null
+++ Makefile
+# Top-level Makefile: descend into the directories of the two programs.
+SUBDIR = ftsearch mkftsidx
+
+.include <bsd.subdir.mk>
blob - /dev/null
blob + b3984eb9b4ac4e43aca78e00aff2fba2d08456cd (mode 644)
--- /dev/null
+++ ftsearch/Makefile
+# The shared sources (db.c, fts.c, tokenize.c) are looked up in ../lib.
+.PATH:${.CURDIR}/../lib
+
+PROG = ftsearch
+SRCS = ftsearch.c db.c fts.c tokenize.c
+
+WARNINGS = yes
+
+# No optimization, keep debug symbols.
+DEBUG = -O0 -g
+
+# The project headers are in ../include.
+CPPFLAGS += -I${.CURDIR}/../include
+
+.include <bsd.prog.mk>
blob - /dev/null
blob + 6ce50b9966e4d78deb8fad9b01cc3565bf5e5177 (mode 644)
--- /dev/null
+++ ftsearch/ftsearch.1
+.\" Copyright (c) 2022 Omar Polo <op@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.Dd April 11, 2022
+.Dt FTSEARCH 1
+.Os
+.Sh NAME
+.Nm ftsearch
+.Nd search strings quickly
+.Sh SYNOPSIS
+.Nm
+.Bk -words
+.Op Fl d Ar dbpath
+.Op Fl l
+.Op Fl s
+.Op Ar query
+.Ek
+.Sh DESCRIPTION
+The
+.Nm
+utility searches a database for all documents which match the given
+.Ar query .
+The database needs to be created beforehand with
+.Xr mkftsidx 1 .
+.Pp
+The arguments are as follows:
+.Bl -tag -width 9m
+.It Fl d Ar dbpath
+Path to the database.
+.Pa db
+by default.
+.It Fl l
+List all known documents.
+Conflicts with
+.Fl s
+and
+.Ar query .
+.It Fl s
+Print database stats.
+Conflicts with
+.Fl l
+and
+.Ar query .
+.It Ar query
+The query to search for.
+.El
+.Sh EXAMPLES
+Search for documents that match
+.Dq file manager :
+.Bd -literal -offset indent
+$ ftsearch 'file manager'
+.Ed
+.Sh SEE ALSO
+.Xr mkftsidx 1
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+program was written by
+.An Omar Polo Aq Mt op@omarpolo.com .
blob - /dev/null
blob + a8f4be0dadf83a849771058b16a03a7e0ad949e7 (mode 644)
--- /dev/null
+++ ftsearch/ftsearch.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "db.h"
+#include "fts.h"
+#include "tokenize.h"
+
+/* Path of the database to open; set by -d, main() falls back to "db". */
+const char *dbpath;
+
+/*
+ * Print the usage message on standard error and exit with status 1.
+ * The message is terminated by a newline so that the shell prompt does
+ * not end up glued to it.
+ */
+static void __dead
+usage(void)
+{
+	fprintf(stderr, "usage: %s [-d db] -l | -s | query\n",
+	    getprogname());
+	exit(1);
+}
+
+/*
+ * db_hit_cb callback: print one document on standard output as a single
+ * line, the name left-justified in a field of 18 columns followed by the
+ * description.  Neither the database handle nor the user data pointer
+ * are looked at.  Always returns 0.
+ */
+static int
+print_entry(struct db *db, struct db_entry *entry, void *data)
+{
+	const char *name = entry->name;
+	const char *descr = entry->descr;
+
+	(void)db;
+	(void)data;
+
+	printf("%-18s %s\n", name, descr);
+	return 0;
+}
+
+/*
+ * ftsearch entry point.
+ *
+ * Exactly one operation is performed per run: list every document (-l),
+ * print the database statistics (-s), print one document given its
+ * numeric id (-p) or search for the documents matching the query given
+ * as the only positional argument.  The operations are mutually
+ * exclusive and the first three take no positional argument; any other
+ * combination is a usage error, reported before the database is touched.
+ *
+ * The database is opened before pledge(2) so that only "stdio" is needed
+ * afterwards.  Exits with status 1 on error, returns 0 otherwise.
+ */
+int
+main(int argc, char **argv)
+{
+	struct db db;
+	const char *errstr;
+	int fd, ch;
+	int list = 0, stats = 0, docid = -1;
+	int modes;
+
+	while ((ch = getopt(argc, argv, "d:lp:s")) != -1) {
+		switch (ch) {
+		case 'd':
+			dbpath = optarg;
+			break;
+		case 'l':
+			list = 1;
+			break;
+		case 'p':
+			docid = strtonum(optarg, 0, INT_MAX, &errstr);
+			if (errstr != NULL)
+				errx(1, "document id is %s: %s", errstr,
+				    optarg);
+			break;
+		case 's':
+			stats = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (dbpath == NULL)
+		dbpath = "db";
+
+	/* how many of -l, -s and -p were requested */
+	modes = list + stats + (docid != -1);
+	if (modes > 1)
+		usage();
+	/* -l, -s and -p conflict with a query; a search needs exactly one */
+	if (modes == 1 && argc != 0)
+		usage();
+	if (modes == 0 && argc != 1)
+		usage();
+
+	if ((fd = open(dbpath, O_RDONLY)) == -1)
+		err(1, "can't open %s", dbpath);
+
+	if (pledge("stdio", NULL) == -1)
+		err(1, "pledge");
+
+	if (db_open(&db, fd) == -1)
+		err(1, "db_open");
+
+	if (list) {
+		if (db_listall(&db, print_entry, NULL) == -1)
+			err(1, "db_listall");
+	} else if (stats) {
+		struct db_stats st;
+
+		if (db_stats(&db, &st) == -1)
+			err(1, "db_stats");
+		printf("unique words = %zu\n", st.nwords);
+		printf("documents    = %zu\n", st.ndocs);
+		printf("longest word = %s\n", st.longest_word);
+		printf("most popular = %s (%zu)\n", st.most_popular,
+		    st.most_popular_ndocs);
+	} else if (docid != -1) {
+		struct db_entry e;
+
+		if (db_doc_by_id(&db, docid, &e) == -1)
+			errx(1, "failed to fetch document #%d", docid);
+		print_entry(&db, &e, NULL);
+	} else {
+		if (fts(&db, *argv, print_entry, NULL) == -1)
+			errx(1, "fts failed");
+	}
+
+	db_close(&db);
+	close(fd);
+	return 0;
+}
blob - /dev/null
blob + cdc1261027504a8f1627c76103148100ca364d9e (mode 644)
--- /dev/null
+++ include/db.h
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DB_VERSION 0
+#define DB_WORDLEN 32
+
+/*
+ * Handle of an opened database.  db_open() maps the whole file and the
+ * section pointers below all point inside that mapping; db_close()
+ * unmaps it and zeroes the structure.
+ */
+struct db {
+ /* start of the mapping and its length (the size of the file) */
+ uint8_t *m;
+ off_t len;
+ /* values read from the file header */
+ uint32_t version;
+ uint32_t nwords;
+
+ /* word index: nwords fixed-size records */
+ uint8_t *idx_start;
+ uint8_t *idx_end;
+ /* document id lists, one per word */
+ uint8_t *list_start;
+ uint8_t *list_end;
+ /* document records (name and description), up to the end of the file */
+ uint8_t *docs_start;
+ uint8_t *docs_end;
+};
+
+/*
+ * Summary filled in by db_stats().  The two word pointers refer to
+ * strings stored inside the database mapping, they are not copies.
+ */
+struct db_stats {
+ size_t nwords;
+ size_t ndocs;
+ const char *longest_word;
+ const char *most_popular;
+ /* number of documents listed for most_popular */
+ size_t most_popular_ndocs;
+};
+
+/*
+ * One document: its name and its description.  db_create() accepts a
+ * NULL descr and stores it as an empty string.
+ */
+struct db_entry {
+ char *name;
+ char *descr;
+};
+
+/*
+ * Callback invoked once per document with the database, the document and
+ * the opaque pointer given by the caller.  Returning -1 stops the
+ * iteration and makes the calling function fail.
+ */
+typedef int (*db_hit_cb)(struct db *, struct db_entry *, void *);
+
+struct dictionary;
+
+/* Write a database to the stream; 0 on success, -1 on failure. */
+int db_create(FILE *, struct dictionary *, struct db_entry *, size_t);
+/* Map the database behind the file descriptor; 0 on success, -1 on failure. */
+int db_open(struct db *, int);
+/*
+ * Look up a word: returns its list of document ids and stores the number
+ * of ids via the last argument, or returns NULL if the word is unknown.
+ */
+uint32_t *db_word_docs(struct db *, const char *, size_t *);
+/* Fill the statistics; 0 on success, -1 on failure. */
+int db_stats(struct db *, struct db_stats *);
+/* Call the callback on every document; 0 on success, -1 on failure. */
+int db_listall(struct db *, db_hit_cb, void *);
+/* Fetch the document with the given zero-based id; 0 if found, -1 if not. */
+int db_doc_by_id(struct db *, int, struct db_entry *);
+/* Unmap the database and clear the handle. */
+void db_close(struct db *);
blob - /dev/null
blob + 81fa7f21a625ed303a114738d5fd74e1502c060c (mode 644)
--- /dev/null
+++ include/dictionary.h
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * One word of the dictionary together with the ids of the documents it
+ * was added for.
+ */
+struct dict_entry {
+ /* private copy of the word */
+ char *word;
+ /* document ids, len used out of cap allocated slots */
+ int *ids;
+ size_t len;
+ size_t cap;
+};
+
+/*
+ * Growable array of words: len entries used out of cap allocated.
+ * dictionary_add() keeps the entries sorted by strcmp() order.
+ */
+struct dictionary {
+ size_t len;
+ size_t cap;
+ struct dict_entry *entries;
+};
+
+/*
+ * Unlike the db_* functions, the functions returning int here return 1
+ * on success and 0 on failure.
+ */
+/* Reset a dictionary to the empty state. */
+int dictionary_init(struct dictionary *);
+/* Record that the word appears in the document with the given id. */
+int dictionary_add(struct dictionary *, const char *, int);
+/* Same as dictionary_add() for each word of a NULL-terminated array. */
+int dictionary_add_words(struct dictionary *, char **, int);
+/* Release the memory owned by the dictionary (not the struct itself). */
+void dictionary_free(struct dictionary *);
blob - /dev/null
blob + 6196c9ed15110d609b814c34d240a3d166f6d2a6 (mode 644)
--- /dev/null
+++ include/fts.h
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Run the callback on every document which contains all the words of the
+ * query string.  Returns 0 on success (even when nothing matched) and -1
+ * on failure or when the callback returns -1.
+ */
+int fts(struct db *, const char *, db_hit_cb, void *);
blob - /dev/null
blob + c5da1d7a8f37fcb0a9ef861f311a0a9e49471dbc (mode 644)
--- /dev/null
+++ include/tokenize.h
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Split a string into lowercased words.  The result is a NULL-terminated
+ * array of newly allocated strings to be released with freetoks(); NULL
+ * is returned when memory can't be allocated.
+ */
+char **tokenize(const char *);
+/* Free an array returned by tokenize(); a NULL argument is accepted. */
+void freetoks(char **);
blob - /dev/null
blob + 1f878b9e23e536a58cc047f99c5432d61d23209c (mode 644)
--- /dev/null
+++ lib/db.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "db.h"
+#include "dictionary.h"
+
+/*
+ * Size of one record of the word index: the word in a fixed DB_WORDLEN
+ * bytes field followed by the 64 bit offset of its list of document ids.
+ */
+#define IDX_ENTRY_SIZE (DB_WORDLEN + sizeof(int64_t))
+
+/*
+ * Serialize the dictionary at the current position of the stream.
+ *
+ * Layout: the number of words (32 bit), then one index record per word
+ * (the word truncated and NUL-padded to DB_WORDLEN bytes plus the 64 bit
+ * file offset of its list), then for every word its list: the number of
+ * ids (32 bit) followed by the ids (32 bit each).
+ *
+ * Returns 0 on success, -1 if there are too many words or on I/O error.
+ */
+static int
+write_dictionary(FILE *fp, struct dictionary *dict)
+{
+	const size_t entrysz = DB_WORDLEN + sizeof(int64_t);
+	struct dict_entry *e;
+	off_t idxoff;
+	uint64_t listoff;
+	uint32_t nwords, nids, id;
+	size_t w, k;
+
+	if ((uint64_t)dict->len > UINT32_MAX)
+		return -1;
+	nwords = dict->len;
+
+	if (fwrite(&nwords, sizeof(nwords), 1, fp) != 1)
+		return -1;
+	if ((idxoff = ftello(fp)) == -1)
+		return -1;
+
+	/* first pass: the index; the lists begin right after its end */
+	listoff = idxoff + (nwords * entrysz);
+	for (w = 0; w < dict->len; ++w) {
+		char word[DB_WORDLEN] = { 0 };
+
+		e = &dict->entries[w];
+		strlcpy(word, e->word, sizeof(word));
+		if (fwrite(word, sizeof(word), 1, fp) != 1)
+			return -1;
+		if (fwrite(&listoff, sizeof(listoff), 1, fp) != 1)
+			return -1;
+
+		/* the ids plus the counter in front of them */
+		listoff += sizeof(uint32_t) * (e->len + 1);
+	}
+
+	/* second pass: the lists, in the same order as the index */
+	for (w = 0; w < dict->len; ++w) {
+		e = &dict->entries[w];
+
+		nids = e->len;
+		if (fwrite(&nids, sizeof(nids), 1, fp) != 1)
+			return -1;
+
+		for (k = 0; k < nids; ++k) {
+			id = e->ids[k];
+			if (fwrite(&id, sizeof(id), 1, fp) != 1)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Write one string of a document record: its length on 16 bits, the
+ * bytes, then a NUL.  A NULL string is stored as the empty string and a
+ * string longer than UINT16_MAX bytes is cut to fit the length field, so
+ * the stored length always matches the stored bytes.
+ */
+static int
+write_text(FILE *fp, const char *s)
+{
+	size_t l = s != NULL ? strlen(s) : 0;
+	uint16_t len = l > UINT16_MAX ? UINT16_MAX : l;
+
+	if (fwrite(&len, sizeof(len), 1, fp) != 1)
+		return -1;
+	if (len > 0 && fwrite(s, len, 1, fp) != 1)
+		return -1;
+	if (fwrite("", 1, 1, fp) != 1)
+		return -1;
+	return 0;
+}
+
+/*
+ * Write a whole database to fp, which must be seekable.
+ *
+ * Layout: version (32 bit), offset of the end of the word lists (64 bit,
+ * written last once known), the dictionary as written by
+ * write_dictionary(), then one record per document made of the name and
+ * the description, each as written by write_text().
+ *
+ * Returns 0 on success, -1 if there are too many documents or on I/O
+ * error.
+ */
+int
+db_create(FILE *fp, struct dictionary *dict, struct db_entry *entries,
+    size_t n)
+{
+	int64_t endidx;
+	size_t i;
+	uint32_t version = DB_VERSION;
+
+	if (n > INT32_MAX)
+		return -1;
+
+	if (fwrite(&version, sizeof(version), 1, fp) != 1)
+		return -1;
+
+	/* reserve space for the start pointer -- filled later */
+	if (fseek(fp, sizeof(int64_t), SEEK_CUR) == -1)
+		return -1;
+
+	if (write_dictionary(fp, dict) == -1)
+		return -1;
+
+	if ((endidx = ftello(fp)) == -1)
+		return -1;
+
+	for (i = 0; i < n; ++i) {
+		if (write_text(fp, entries[i].name) == -1)
+			return -1;
+		if (write_text(fp, entries[i].descr) == -1)
+			return -1;
+	}
+
+	/* go back and fill the pointer to the end of the lists */
+	if (fseek(fp, sizeof(version), SEEK_SET) == -1)
+		return -1;
+
+	if (fwrite(&endidx, sizeof(endidx), 1, fp) != 1)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Parse the header of a freshly mapped database and set up the pointers
+ * to its three sections (word index, document id lists, documents).
+ *
+ * The file content is not trusted: the version must be DB_VERSION and
+ * the sizes and offsets read from the header are validated as integers
+ * against the length of the mapping before any pointer is derived from
+ * them.
+ *
+ * Returns 0 on success, -1 if the file is not a valid database.
+ */
+static int
+initdb(struct db *db)
+{
+	const off_t hdrlen = sizeof(uint32_t) + sizeof(int64_t) +
+	    sizeof(uint32_t);
+	int64_t end_off, idxlen;
+	uint8_t *p = db->m;
+
+	if (hdrlen > db->len)
+		return -1;
+
+	memcpy(&db->version, p, sizeof(db->version));
+	p += sizeof(db->version);
+	if (db->version != DB_VERSION)
+		return -1;
+
+	memcpy(&end_off, p, sizeof(end_off));
+	p += sizeof(end_off);
+
+	memcpy(&db->nwords, p, sizeof(db->nwords));
+	p += sizeof(db->nwords);
+
+	/* nwords is 32 bit wide, the product can't overflow 64 bits */
+	idxlen = (int64_t)((uint64_t)db->nwords * IDX_ENTRY_SIZE);
+	if (idxlen > db->len - hdrlen)
+		return -1;
+
+	/* the lists end after the index and within the file */
+	if (end_off < hdrlen + idxlen || end_off > db->len)
+		return -1;
+
+	db->idx_start = p;
+	db->idx_end = p + idxlen;
+	db->list_start = db->idx_end;
+	db->list_end = db->m + end_off;
+	db->docs_start = db->list_end;
+	db->docs_end = db->m + db->len;
+
+	return 0;
+}
+
+/*
+ * Open the database behind fd: find out the size of the file, map it
+ * read-only and parse the header with initdb().  The descriptor is left
+ * open and positioned at the start of the file.
+ *
+ * Returns 0 on success and -1 on failure; if the header is rejected the
+ * mapping is released again.
+ */
+int
+db_open(struct db *db, int fd)
+{
+	off_t size;
+	void *map;
+
+	memset(db, 0, sizeof(*db));
+
+	size = lseek(fd, 0, SEEK_END);
+	db->len = size;
+	if (size == -1)
+		return -1;
+
+	if (lseek(fd, 0, SEEK_SET) == -1)
+		return -1;
+
+	map = mmap(NULL, db->len, PROT_READ, MAP_PRIVATE, fd, 0);
+	db->m = map;
+	if (map == MAP_FAILED)
+		return -1;
+
+	if (initdb(db) != -1)
+		return 0;
+
+	db_close(db);
+	return -1;
+}
+
+/*
+ * db_hit_cb callback used by db_stats(): count one more document in the
+ * struct db_stats passed as user data.  Always returns 0.
+ */
+static int
+db_countdocs(struct db *db, struct db_entry *e, void *d)
+{
+	struct db_stats *st = d;
+
+	(void)db;
+	(void)e;
+
+	st->ndocs += 1;
+	return 0;
+}
+
+/*
+ * bsearch(3) comparison function: key is the word looked for, elem an
+ * index record, which begins with the word.  A record whose word field
+ * is not NUL-terminated is never compared as a string: -1 is returned.
+ */
+static int
+db_idx_compar(const void *key, const void *elem)
+{
+	const char *needle = key;
+	const char *record = elem;
+
+	if (record[DB_WORDLEN-1] == '\0')
+		return strcmp(needle, record);
+	return -1;
+}
+
+/*
+ * Given an index record, return the list of document ids of its word
+ * and store the number of ids in *len.
+ *
+ * The offset and the length come from the file and are not trusted:
+ * NULL is returned (and *len left alone) unless the counter and all the
+ * ids it announces lie inside the list section.  The offset must also be
+ * a multiple of four, since the ids are accessed as uint32_t by the
+ * callers (assuming the mapping itself is suitably aligned, which holds
+ * for the page-aligned address returned by mmap).
+ */
+static inline uint32_t *
+db_getdocs(struct db *db, const uint8_t *entry, size_t *len)
+{
+	int64_t pos, lo, hi;
+	uint32_t l;
+
+	/* the offset follows the word */
+	memcpy(&pos, entry + DB_WORDLEN, sizeof(pos));
+
+	lo = db->list_start - db->m;
+	hi = db->list_end - db->m;
+	if (pos < lo || pos > hi - (int64_t)sizeof(l))
+		return NULL;
+	if ((uint64_t)pos % sizeof(l) != 0)
+		return NULL;
+
+	entry = db->m + pos;
+	memcpy(&l, entry, sizeof(l));
+	entry += sizeof(l);
+
+	/* all the ids must fit in what's left of the list section */
+	if (l > (size_t)(db->list_end - entry) / sizeof(uint32_t))
+		return NULL;
+
+	*len = l;
+	return (uint32_t *)entry;
+}
+
+/*
+ * Binary search the word in the index.  Returns the list of the ids of
+ * the documents containing it and stores their number in *len; returns
+ * NULL with *len set to zero when the word is not in the index.
+ */
+uint32_t *
+db_word_docs(struct db *db, const char *word, size_t *len)
+{
+	const uint8_t *hit;
+
+	*len = 0;
+
+	hit = bsearch(word, db->idx_start, db->nwords, IDX_ENTRY_SIZE,
+	    db_idx_compar);
+	return hit != NULL ? db_getdocs(db, hit, len) : NULL;
+}
+
+/*
+ * Compute the statistics of the database: the number of documents
+ * (counted by walking all of them), the number of words, the longest
+ * word and the word which appears in most documents.  Ties go to the
+ * word that comes first in the index.  The words returned point inside
+ * the database.
+ *
+ * Returns 0 on success, -1 if the database is found to be malformed.
+ */
+int
+db_stats(struct db *db, struct db_stats *stats)
+{
+	const uint8_t *cur;
+	size_t wlen, longest = 0, ndocs;
+
+	memset(stats, 0, sizeof(*stats));
+
+	if (db_listall(db, db_countdocs, stats) == -1)
+		return -1;
+
+	stats->nwords = db->nwords;
+
+	for (cur = db->idx_start; cur < db->idx_end; cur += IDX_ENTRY_SIZE) {
+		/* a whole, NUL-terminated word is needed */
+		if (cur + DB_WORDLEN > db->idx_end ||
+		    cur[DB_WORDLEN-1] != '\0')
+			return -1;
+
+		wlen = strlen((const char *)cur);
+		if (wlen > longest) {
+			longest = wlen;
+			stats->longest_word = (const char *)cur;
+		}
+
+		if (db_getdocs(db, cur, &ndocs) == NULL)
+			return -1;
+
+		if (ndocs > stats->most_popular_ndocs) {
+			stats->most_popular_ndocs = ndocs;
+			stats->most_popular = (const char *)cur;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Decode the document record at p and make e point to its strings
+ * (inside the mapping, nothing is copied).
+ *
+ * Record layout:
+ *	namelen[2]  name[namelen]   NUL
+ *	descrlen[2] descr[descrlen] NUL
+ *
+ * Every access is checked against the number of bytes left before
+ * docs_end, terminating NUL included, so that a truncated or corrupted
+ * record can't cause a read past the end of the mapping.
+ *
+ * Returns the address of the next record, or NULL if the record is not
+ * valid.
+ */
+static inline uint8_t *
+db_extract_doc(struct db *db, uint8_t *p, struct db_entry *e)
+{
+	uint16_t namelen, descrlen;
+	size_t avail;
+
+	if (p > db->docs_end)
+		return NULL;
+	avail = db->docs_end - p;
+
+	if (avail < sizeof(namelen))
+		return NULL;
+	memcpy(&namelen, p, sizeof(namelen));
+	p += sizeof(namelen);
+	avail -= sizeof(namelen);
+
+	/* namelen bytes and the NUL: p[namelen] must be inside too */
+	if (avail <= namelen || p[namelen] != '\0')
+		return NULL;
+	e->name = (char *)p;
+	p += (size_t)namelen + 1;
+	avail -= (size_t)namelen + 1;
+
+	if (avail < sizeof(descrlen))
+		return NULL;
+	memcpy(&descrlen, p, sizeof(descrlen));
+	p += sizeof(descrlen);
+	avail -= sizeof(descrlen);
+
+	if (avail <= descrlen || p[descrlen] != '\0')
+		return NULL;
+	e->descr = (char *)p;
+	p += (size_t)descrlen + 1;
+
+	return p;
+}
+
+/*
+ * Walk all the documents, in the order they are stored, and call cb on
+ * each one.  Returns 0 once the end is reached, -1 as soon as a record
+ * can't be decoded or cb returns -1.
+ */
+int
+db_listall(struct db *db, db_hit_cb cb, void *data)
+{
+	struct db_entry doc;
+	uint8_t *cur;
+
+	for (cur = db->docs_start; cur < db->docs_end; ) {
+		cur = db_extract_doc(db, cur, &doc);
+		if (cur == NULL)
+			return -1;
+
+		if (cb(db, &doc, data) == -1)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Fetch the document with the given id, i.e. its zero-based position in
+ * the documents section.  The records have a variable size, hence they
+ * are decoded one after the other from the first up to the wanted one.
+ *
+ * Returns 0 with e filled in, or -1 if there's no such document or a
+ * record can't be decoded.
+ */
+int
+db_doc_by_id(struct db *db, int docid, struct db_entry *e)
+{
+	uint8_t *cur = db->docs_start;
+	int idx;
+
+	for (idx = 0; cur < db->docs_end; idx++) {
+		cur = db_extract_doc(db, cur, e);
+		if (cur == NULL)
+			return -1;
+
+		if (idx == docid)
+			return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Release the mapping and zero the handle.  The file descriptor handed
+ * to db_open() is not closed.
+ */
+void
+db_close(struct db *db)
+{
+	void *map = db->m;
+	size_t maplen = db->len;
+
+	munmap(map, maplen);
+	memset(db, 0, sizeof(*db));
+}
blob - /dev/null
blob + 36390fb9a5b5e9e284b077a1a42c2cdff67c7059 (mode 644)
--- /dev/null
+++ lib/dictionary.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "dictionary.h"
+
+/*
+ * Put the dictionary in the empty state: no entries and nothing
+ * allocated.  Can't fail, always returns 1.
+ */
+int
+dictionary_init(struct dictionary *dict)
+{
+	dict->len = 0;
+	dict->cap = 0;
+	dict->entries = NULL;
+	return 1;
+}
+
+/*
+ * Append a document id to the list of a word.  Nothing is done if the id
+ * is the same as the last one of the list, so adding a word several
+ * times in a row for the same document stores the id only once.  The
+ * list starts with room for 8 ids and grows by half when full.
+ *
+ * Returns 1 on success, 0 if memory can't be allocated.
+ */
+static inline int
+add_docid(struct dict_entry *e, int docid)
+{
+	int *grown;
+	size_t want;
+
+	if (e->len > 0 && e->ids[e->len-1] == docid)
+		return 1;
+
+	if (e->len == e->cap) {
+		want = e->cap == 0 ? 8 : e->cap * 1.5;
+		grown = recallocarray(e->ids, e->cap, want, sizeof(*e->ids));
+		if (grown == NULL)
+			return 0;
+		e->ids = grown;
+		e->cap = want;
+	}
+
+	e->ids[e->len] = docid;
+	e->len++;
+	return 1;
+}
+
+/*
+ * Record that word appears in the document docid.
+ *
+ * The entries are kept sorted: the word is binary searched and, if not
+ * found, a new entry is inserted at the position where the search ended.
+ * The copy of the word is allocated before the array is touched, so a
+ * failed allocation leaves the dictionary exactly as it was instead of
+ * leaving behind an entry without a word.
+ *
+ * Returns 1 on success, 0 if memory can't be allocated.
+ */
+int
+dictionary_add(struct dictionary *dict, const char *word, int docid)
+{
+	struct dict_entry *e;
+	void *newentr;
+	char *copy;
+	size_t newcap, mid, left = 0, right = dict->len;
+	int r;
+
+	while (left < right) {
+		mid = left + (right - left) / 2;
+		e = &dict->entries[mid];
+		r = strcmp(word, e->word);
+		if (r < 0)
+			right = mid;
+		else if (r > 0)
+			left = mid + 1;
+		else
+			return add_docid(e, docid);
+	}
+
+	/* not found: left is where the new word belongs */
+
+	if ((copy = strdup(word)) == NULL)
+		return 0;
+
+	if (dict->len == dict->cap) {
+		newcap = dict->cap * 1.5;
+		if (newcap == 0)
+			newcap = 8;
+		newentr = recallocarray(dict->entries, dict->cap, newcap,
+		    sizeof(*dict->entries));
+		if (newentr == NULL) {
+			free(copy);
+			return 0;
+		}
+		dict->entries = newentr;
+		dict->cap = newcap;
+	}
+
+	/* make room, unless the word goes at the end */
+	e = &dict->entries[left];
+	if (left < dict->len)
+		memmove(e + 1, e, sizeof(*e) * (dict->len - left));
+
+	dict->len++;
+	memset(e, 0, sizeof(*e));
+	e->word = copy;
+	return add_docid(e, docid);
+}
+
+/*
+ * Add every word of a NULL-terminated array for the document docid.
+ * Stops at the first word which can't be added and returns 0; returns 1
+ * if all the words were added.
+ */
+int
+dictionary_add_words(struct dictionary *dict, char **words, int docid)
+{
+	char **w;
+
+	for (w = words; *w != NULL; w++)
+		if (dictionary_add(dict, *w, docid) == 0)
+			return 0;
+
+	return 1;
+}
+
+/*
+ * Free the words, the lists of ids and the array of entries.  The
+ * struct dictionary itself belongs to the caller and is neither freed
+ * nor reset.
+ */
+void
+dictionary_free(struct dictionary *dict)
+{
+	size_t n = dict->len;
+
+	while (n-- > 0) {
+		free(dict->entries[n].ids);
+		free(dict->entries[n].word);
+	}
+
+	free(dict->entries);
+}
blob - /dev/null
blob + 1e16be4b854b5e95f47bb7ad1060e66867c040cb (mode 644)
--- /dev/null
+++ lib/fts.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "db.h"
+#include "fts.h"
+#include "tokenize.h"
+
+/*
+ * Cursor over the list of document ids of one word of the query: ids
+ * points to the first id not yet consumed, len is the number of ids
+ * left.  The ids belong to the database, not to the cursor.
+ */
+struct doclist {
+ uint32_t *ids;
+ size_t len;
+};
+
+/*
+ * Full text search: call cb on every document which contains all the
+ * words of the query.
+ *
+ * The query is tokenized and the list of document ids of each word is
+ * fetched; the lists are then intersected by advancing one cursor per
+ * word.  NOTE(review): the intersection only works if every list is
+ * sorted in ascending order -- presumably guaranteed by the way the
+ * index is built, confirm against the indexers.
+ *
+ * Returns 0 on success, also when nothing matches, and -1 if the query
+ * can't be tokenized, on allocation failure, if a matching document
+ * can't be fetched or if cb returns -1.
+ */
+int
+fts(struct db *db, const char *query, db_hit_cb cb, void *data)
+{
+ struct doclist *xs = NULL;
+ size_t i, len;
+ char **toks, **t;
+ int ret = 0;
+
+ if ((toks = tokenize(query)) == NULL)
+ return -1;
+
+ /* count the words of the query */
+ len = 0;
+ for (t = toks; *t != NULL; ++t)
+ len++;
+
+ if (len == 0)
+ goto done;
+
+ if ((xs = calloc(len, sizeof(*xs))) == NULL) {
+ freetoks(toks);
+ return -1;
+ }
+
+ /* a word which is in no document means that nothing can match */
+ for (i = 0; i < len; ++i) {
+ xs[i].ids = db_word_docs(db, toks[i], &xs[i].len);
+ if (xs[i].ids == NULL || xs[i].len == 0)
+ goto done;
+ }
+
+ for (;;) {
+ struct db_entry e;
+ uint32_t mdoc;
+
+ /* candidate: the current document of the first word */
+ mdoc = xs[0].ids[0];
+ for (i = 1; i < len; ++i) {
+ /* this list is already past the candidate: not a match */
+ if (xs[i].ids[0] > mdoc)
+ goto next;
+ /* skip the ids before the candidate; out of ids, done */
+ while (xs[i].ids[0] < mdoc) {
+ if (--xs[i].len == 0)
+ goto done;
+ xs[i].ids++;
+ }
+
+ if (xs[i].ids[0] != mdoc)
+ goto next;
+ }
+
+ /* the candidate is in every list */
+ if (db_doc_by_id(db, mdoc, &e) == -1) {
+ ret = -1;
+ goto done;
+ }
+
+ if (cb(db, &e, data) == -1) {
+ ret = -1;
+ goto done;
+ }
+
+ next:
+ /* move on to the next document of the first word, if any */
+ if (--xs[0].len == 0)
+ goto done;
+ xs[0].ids++;
+ }
+
+done:
+ free(xs);
+ freetoks(toks);
+
+ return ret;
+}
blob - /dev/null
blob + d07eb30cb1b222a478848edaccfeefadee59a8d9 (mode 644)
--- /dev/null
+++ lib/tokenize.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "tokenize.h"
+
+#ifndef WDELIMS
+/* everything but a-zA-Z */
+#define WDELIMS " \t\n!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"
+#endif
+
+/*
+ * Split s into words: the string is lowercased and cut at every
+ * character listed in WDELIMS, empty pieces are dropped.
+ *
+ * Returns a NULL-terminated array of newly allocated strings, to be
+ * released with freetoks().  A string without any word yields an array
+ * holding only the terminator, so that NULL always and only means that
+ * memory couldn't be allocated.
+ */
+char **
+tokenize(const char *s)
+{
+	char *d, *dup, *t, **tok;
+	void *newtok;
+	size_t cap = 8, len = 0, newcap;
+
+	if ((dup = strdup(s)) == NULL)
+		return NULL;
+	d = dup;
+
+	/* zeroed, hence NULL-terminated from the start */
+	if ((tok = calloc(cap, sizeof(char *))) == NULL) {
+		free(dup);
+		return NULL;
+	}
+
+	/* tolower() is undefined on negative values other than EOF */
+	for (t = d; *t; ++t)
+		*t = tolower((unsigned char)*t);
+
+	while ((t = strsep(&d, WDELIMS)) != NULL) {
+		if (*t == '\0')
+			continue;
+
+		/* keep the space for a NULL terminator */
+		if (len+1 >= cap) {
+			newcap = cap * 1.5;
+			newtok = recallocarray(tok, cap, newcap,
+			    sizeof(char *));
+			if (newtok == NULL)
+				goto err;
+			tok = newtok;
+			cap = newcap;
+		}
+
+		if ((tok[len++] = strdup(t)) == NULL)
+			goto err;
+	}
+
+	free(dup);
+	return tok;
+
+err:
+	freetoks(tok);
+	free(dup);
+	return NULL;
+}
+
+/*
+ * Free a NULL-terminated array of strings, as returned by tokenize(),
+ * along with the strings.  Does nothing if tok is NULL.
+ */
+void
+freetoks(char **tok)
+{
+	size_t k;
+
+	if (tok != NULL) {
+		for (k = 0; tok[k] != NULL; k++)
+			free(tok[k]);
+		free(tok);
+	}
+}
blob - /dev/null
blob + 6ff8e82141f23f626139263f9fa9a0af829f73d0 (mode 644)
--- /dev/null
+++ mkftsidx/Makefile
+# The shared sources (db.c, dictionary.c, tokenize.c) are looked up in ../lib.
+.PATH:${.CURDIR}/../lib
+
+PROG = mkftsidx
+SRCS = mkftsidx.c ports.c wiki.c db.c dictionary.c tokenize.c
+
+WARNINGS = yes
+
+# expat and sqlite3 are searched for under /usr/local.
+CPPFLAGS += -I/usr/local/include -I${.CURDIR}/../include
+LDADD = -lexpat -lsqlite3 -L/usr/local/lib
+
+# Defining PROFILE gives a static binary instrumented for gprof; the code
+# doesn't call pledge when built with -DPROFILE.
+.if defined(PROFILE)
+CPPFLAGS += -DPROFILE
+LDADD += -static -lm -lpthread
+DEBUG = -pg
+.endif
+
+DEBUG += -O0 -g
+
+# Render the profile collected in ../gmon.out as an image and display it.
+show-prof:
+ gprof mkftsidx ../gmon.out | gprof2dot | dot -Tpng > profile.png
+ nsxiv profile.png &
+
+.include <bsd.prog.mk>
blob - /dev/null
blob + 74558e22e0c5946bae114760b939a148fd8547c5 (mode 644)
--- /dev/null
+++ mkftsidx/mkftsidx.1
+.\" Copyright (c) 2022 Omar Polo <op@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.Dd April 11, 2022
+.Dt MKFTSIDX 1
+.Os
+.Sh NAME
+.Nm mkftsidx
+.Nd construct fts database
+.Sh SYNOPSIS
+.Nm
+.Bk -words
+.Op Fl o Ar dbpath
+.Op Fl m Ar p|w
+.Op Ar path
+.Ek
+.Sh DESCRIPTION
+.Nm
+is a program to create a full-text search database for
+.Xr ftsearch 1 .
+The arguments are as follows:
+.Bl -tag -width Ds
+.It Fl o Ar dbpath
+Path to the database file to create.
+.Pa db
+by default.
+.It Fl m Ar p|w
+Set the mode.
+If
+.Ar p
+.Pq the default
+then create a database with the
+.Ox
+ports tree data,
+otherwise create a database from a Wikipedia dump.
+.It Ar path
+Path to the sources.
+When working in
+.Ar p
+mode, it's the optional path to the sqlports database.
+Otherwise, it's the mandatory path to the Wikipedia file dump.
+.El
+.Sh EXAMPLES
+To create a database with the
+.Ox
+ports tree content:
+.Bd -literal -offset indent
+$ mkftsidx
+.Ed
+.Pp
+To create a database from a Wikipedia dump to the
+.Pa db.wiki
+file:
+.Bd -literal -offset indent
+$ mkftsidx -o db.wiki -mw enwiki-latest-abstract1.xml
+.Ed
+.Sh SEE ALSO
+.Xr ftsearch 1
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+program was written by
+.An Omar Polo Aq Mt op@omarpolo.com .
blob - /dev/null
blob + a5c9ba3665c026d1c43cd6b584753e845cb708ab (mode 644)
--- /dev/null
+++ mkftsidx/mkftsidx.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "db.h"
+#include "dictionary.h"
+
+#include "mkftsidx.h"
+
+/* Source of the documents to index, chosen with -m (p or w). */
+enum {
+ MODE_SQLPORTS,
+ MODE_WIKI,
+};
+
+/*
+ * strdup(3) which never fails: the program is terminated with an error
+ * message if memory can't be allocated.  A NULL argument is allowed and
+ * gives back NULL.
+ */
+char *
+xstrdup(const char *s)
+{
+	char *copy = NULL;
+
+	if (s != NULL && (copy = strdup(s)) == NULL)
+		err(1, "strdup");
+	return copy;
+}
+
+/*
+ * Print the usage message on standard error and exit with status 1.
+ */
+__dead void
+usage(void)
+{
+	const char *prog = getprogname();
+
+	fprintf(stderr, "usage: %s [-o dbpath] [-m p|w] [path]\n", prog);
+	exit(1);
+}
+
+/*
+ * mkftsidx entry point: collect the documents from the selected source
+ * (the sqlports database by default, a Wikipedia dump with -m w), index
+ * their words in a dictionary and write everything to the database file
+ * (-o, "db" by default).
+ *
+ * The database is written only if the indexing succeeded.  If writing
+ * fails, including when the buffered data can't be flushed at close
+ * time, the incomplete file is removed and the exit status is non-zero.
+ */
+int
+main(int argc, char **argv)
+{
+	struct dictionary dict;
+	struct db_entry *entries = NULL;
+	const char *dbpath = NULL;
+	FILE *fp;
+	size_t i, len = 0;
+	int ch, r = 0, mode = MODE_SQLPORTS;
+
+#ifndef PROFILE
+	/* sqlite needs flock */
+	if (pledge("stdio rpath wpath cpath flock", NULL) == -1)
+		err(1, "pledge");
+#endif
+
+	while ((ch = getopt(argc, argv, "m:o:")) != -1) {
+		switch (ch) {
+		case 'm':
+			switch (*optarg) {
+			case 'p':
+				mode = MODE_SQLPORTS;
+				break;
+			case 'w':
+				mode = MODE_WIKI;
+				break;
+			default:
+				usage();
+			}
+			break;
+		case 'o':
+			dbpath = optarg;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (dbpath == NULL)
+		dbpath = "db";
+
+	if (!dictionary_init(&dict))
+		err(1, "dictionary_init");
+
+	if (mode == MODE_SQLPORTS)
+		r = idx_ports(&dict, &entries, &len, argc, argv);
+	else
+		r = idx_wiki(&dict, &entries, &len, argc, argv);
+
+	if (r == 0) {
+		if ((fp = fopen(dbpath, "w+")) == NULL)
+			err(1, "can't open %s", dbpath);
+		if (db_create(fp, &dict, entries, len) == -1) {
+			warn("db_create");
+			r = 1;
+		}
+		/* the data still buffered is written out by fclose */
+		if (fclose(fp) == EOF && r == 0) {
+			warn("can't write %s", dbpath);
+			r = 1;
+		}
+		if (r != 0)
+			unlink(dbpath);
+	}
+
+	for (i = 0; i < len; ++i) {
+		free(entries[i].name);
+		free(entries[i].descr);
+	}
+	free(entries);
+	dictionary_free(&dict);
+
+	return r;
+}
blob - /dev/null
blob + e64b8c34a543829be5f9855d096e818fc30e2597 (mode 644)
--- /dev/null
+++ mkftsidx/mkftsidx.h
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* mkftsidx.c */
+/* Print the usage message and exit. */
+__dead void usage(void);
+/* strdup which exits on failure; NULL gives NULL. */
+char *xstrdup(const char *);
+
+/*
+ * The indexers take the dictionary to add the words to, where to store
+ * the array of documents and its length, and the positional arguments of
+ * the command line.  main() writes the database only if 0 is returned.
+ */
+
+/* ports.c */
+int idx_ports(struct dictionary *, struct db_entry **, size_t *,
+ int, char **);
+
+/* wiki.c */
+int idx_wiki(struct dictionary *, struct db_entry **, size_t *,
+ int, char **);
blob - /dev/null
blob + aaad95dea48ebc60c0071734e949b74faf9c0eed (mode 644)
--- /dev/null
+++ mkftsidx/ports.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sqlite3.h>
+
+#include "db.h"
+#include "dictionary.h"
+#include "tokenize.h"
+
+#include "mkftsidx.h"
+
+/* Database opened by idx_ports() when no path is given on the command line. */
+#if !defined(SQLPORTS)
+#define SQLPORTS	"/usr/local/share/sqlports"
+#endif
+
+/* Number of rows in the portsq table. */
+#define QNUM	"select count(*) from portsq;"
+/* The three columns of every port that get indexed. */
+#define QALL	"select pkgstem, comment, descr_contents from portsq;"
+
+/*
+ * Run QNUM against db and return the count it reports.
+ *
+ * Returns -1 when the statement can't be prepared (a warning is
+ * printed in that case) or when stepping it doesn't produce a row.
+ */
+static int
+countports(sqlite3 *db)
+{
+	sqlite3_stmt	*query;
+	int		 rc, count;
+
+	rc = sqlite3_prepare_v2(db, QNUM, -1, &query, NULL);
+	if (rc != SQLITE_OK) {
+		warnx("failed to prepare statement: %s",
+		    sqlite3_errstr(rc));
+		return -1;
+	}
+
+	/* Anything other than a row is reported as -1. */
+	count = -1;
+	rc = sqlite3_step(query);
+	if (rc == SQLITE_ROW)
+		count = sqlite3_column_int(query, 0);
+	sqlite3_finalize(query);
+
+	return count;
+}
+
+/*
+ * Index the ports listed in a sqlports database.
+ *
+ * argv may hold at most one argument, the path to the database;
+ * SQLPORTS is used when it is omitted.  One db_entry per row of the
+ * portsq table is stored in a calloc'd array returned through
+ * *entries (name is the pkgstem, descr the comment), *len is set to
+ * the number of entries actually filled in, and the tokens of
+ * "pkgstem comment description" are added to dict under the index of
+ * the entry.
+ *
+ * Most failures are fatal.  When the ports can't be counted, or the
+ * table is empty, a warning is printed and *entries and *len are left
+ * untouched.
+ *
+ * Always returns 0.
+ * NOTE(review): the "can't count" path also returns 0, so the caller
+ * can't tell it from success by the return value alone -- confirm
+ * against the caller before changing it.
+ */
+int
+idx_ports(struct dictionary *dict, struct db_entry **entries, size_t *len,
+    int argc, char **argv)
+{
+	const char	*dbpath;
+	sqlite3		*db;
+	sqlite3_stmt	*stmt = NULL;
+	size_t		 i;
+	int		 r;
+
+	if (argc > 1)
+		usage();
+	else if (argc == 1)
+		dbpath = *argv;
+	else
+		dbpath = SQLPORTS;
+
+	if ((r = sqlite3_open(dbpath, &db)) != SQLITE_OK)
+		errx(1, "can't open %s: %s", dbpath, sqlite3_errstr(r));
+
+	if ((r = countports(db)) == -1 || r == 0) {
+		warnx("error querying the db or empty portsq table!");
+		goto done;
+	}
+	*len = r;
+
+	if ((*entries = calloc(*len, sizeof(**entries))) == NULL)
+		err(1, "calloc");
+
+	r = sqlite3_prepare_v2(db, QALL, -1, &stmt, NULL);
+	if (r != SQLITE_OK)
+		errx(1, "failed to prepare statement: %s", sqlite3_errstr(r));
+
+	for (i = 0; i < *len; ++i) {
+		const char	*pkgstem, *comment, *descr;
+		char		*doc, **toks;
+
+		r = sqlite3_step(stmt);
+		if (r == SQLITE_DONE)
+			break;
+		if (r != SQLITE_ROW)
+			errx(1, "sqlite3_step: %s", sqlite3_errstr(r));
+
+		/* sqlite3_column_text() returns const unsigned char *. */
+		pkgstem = (const char *)sqlite3_column_text(stmt, 0);
+		comment = (const char *)sqlite3_column_text(stmt, 1);
+		descr = (const char *)sqlite3_column_text(stmt, 2);
+
+		/*
+		 * NOTE(review): a NULL column is handed to xstrdup() as
+		 * is; what it does with NULL is defined in mkftsidx.c.
+		 */
+		(*entries)[i].name = xstrdup(pkgstem);
+		(*entries)[i].descr = xstrdup(comment);
+
+		/* Never hand NULL to %s: every column falls back to "". */
+		r = asprintf(&doc, "%s %s %s",
+		    pkgstem != NULL ? pkgstem : "",
+		    comment != NULL ? comment : "",
+		    descr != NULL ? descr : "");
+		if (r == -1)
+			err(1, "asprintf");
+
+		if ((toks = tokenize(doc)) == NULL)
+			err(1, "tokenize");
+		if (!dictionary_add_words(dict, toks, i))
+			err(1, "dictionary_add_words");
+		freetoks(toks);
+		free(doc);
+	}
+
+	/*
+	 * The query may yield fewer rows than were counted; don't
+	 * advertise the zeroed, never filled tail of the array.
+	 */
+	*len = i;
+
+done:
+	/*
+	 * Finalizing a NULL statement is a no-op.  The statement has to
+	 * be finalized for sqlite3_close() to be able to close the db.
+	 */
+	sqlite3_finalize(stmt);
+	sqlite3_close(db);
+	return 0;
+}
blob - /dev/null
blob + a2fa1f2751eaa0c92f8c65cfe384e8ed9f4f018d (mode 644)
--- /dev/null
+++ mkftsidx/wiki.c
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <expat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "db.h"
+#include "dictionary.h"
+#include "tokenize.h"
+
+#include "mkftsidx.h"
+
+/*
+ * Which field the next run of character data belongs to; stored in
+ * the `next' member of struct mydata.
+ */
+enum {
+	N_UNK = 0,	/* nothing of interest: text is ignored */
+	N_TIT,		/* inside <title> */
+	N_URL,		/* inside <url> */
+	N_ABS,		/* inside <abstract> */
+};
+
+/*
+ * Parser state shared by the expat callbacks through the user data
+ * pointer.
+ */
+struct mydata {
+	struct dictionary	*dict;		/* where the tokens are added */
+	struct db_entry		*entries;	/* growable array of documents */
+	size_t			 len;		/* entries in use */
+	size_t			 cap;		/* entries allocated */
+
+	int			 next;		/* one of the N_* constants */
+	char			*title;		/* text collected for the */
+	char			*url;		/* current document, or NULL */
+	char			*abstract;	/* if none was seen yet */
+};
+
+/*
+ * Start-of-element callback: remember which field the upcoming
+ * character data belongs to.  Elements other than title, url and
+ * abstract leave the state as it is.  The attributes are not used.
+ */
+static void
+el_start(void *data, const char *element, const char **attr)
+{
+	struct mydata	*state = data;
+	int		 field;
+
+	if (strcmp(element, "title") == 0)
+		field = N_TIT;
+	else if (strcmp(element, "url") == 0)
+		field = N_URL;
+	else if (strcmp(element, "abstract") == 0)
+		field = N_ABS;
+	else
+		return;
+
+	state->next = field;
+}
+
+/*
+ * Append the first len bytes of s, which need not be NUL-terminated,
+ * to the heap-allocated string *text.  *text may be NULL, in which
+ * case it is treated as the empty string.  The old string is freed
+ * and *text is replaced with the newly allocated result.
+ *
+ * len must not be negative.  Exits on allocation failure.
+ */
+static void
+append_text(char **text, const char *s, int len)
+{
+	char	*out;
+
+	/*
+	 * The precision bounds how much of s is read, so there's no
+	 * need to make a NUL-terminated copy of the chunk first.
+	 */
+	if (asprintf(&out, "%s%.*s", *text != NULL ? *text : "",
+	    len, s) == -1)
+		err(1, "asprintf");
+	free(*text);
+	*text = out;
+}
+
+/*
+ * Character data callback: append the chunk to the field selected by
+ * the last start tag.  Text outside title, url and abstract is
+ * dropped.
+ */
+static void
+on_text(void *data, const char *s, int len)
+{
+	struct mydata	 *state = data;
+	char		**field;
+
+	if (state->next == N_TIT)
+		field = &state->title;
+	else if (state->next == N_URL)
+		field = &state->url;
+	else if (state->next == N_ABS)
+		field = &state->abstract;
+	else
+		return;
+
+	append_text(field, s, len);
+}
+
+/*
+ * End-of-element callback.
+ *
+ * Every closing tag stops the collection of character data.  When the
+ * closing tag is </doc>, the collected title, url and abstract become
+ * a new entry: name is the url, descr is the title without a leading
+ * "Wikipedia: ", and the tokens of "title abstract" are added to the
+ * dictionary under the entry's index.  The collected text is then
+ * released for the next document.
+ *
+ * A field that was never seen is treated as the empty string.
+ * Exits on allocation failure.
+ */
+static void
+el_end(void *data, const char *element)
+{
+	static const char	 prefix[] = "Wikipedia: ";
+	struct mydata		*d = data;
+	struct db_entry		*e;
+	size_t			 newcap;
+	const char		*title, *url, *abstract;
+	char			*doc, **toks;
+	void			*t;
+
+	d->next = N_UNK;
+
+	/* Only the end of a document triggers the indexing. */
+	if (strcmp(element, "doc") != 0)
+		return;
+
+	if (d->len == d->cap) {
+		/* Grow by half, starting from 8 entries. */
+		newcap = d->cap + d->cap / 2;
+		if (newcap == 0)
+			newcap = 8;
+		t = recallocarray(d->entries, d->cap, newcap,
+		    sizeof(*d->entries));
+		if (t == NULL)
+			err(1, "recallocarray");
+		d->entries = t;
+		d->cap = newcap;
+	}
+
+	/* A <doc> may lack any of the fields: don't dereference NULL. */
+	title = d->title != NULL ? d->title : "";
+	url = d->url != NULL ? d->url : "";
+	abstract = d->abstract != NULL ? d->abstract : "";
+
+	if (strncmp(title, prefix, sizeof(prefix) - 1) == 0)
+		title += sizeof(prefix) - 1;
+
+	e = &d->entries[d->len++];
+	e->name = xstrdup(url);
+	e->descr = xstrdup(title);
+
+	/* Progress report. */
+	if (d->len % 1000 == 0)
+		printf("=> %zu\n", d->len);
+
+	if (asprintf(&doc, "%s %s", title, abstract) == -1)
+		err(1, "asprintf");
+
+	/* A NULL token list is not treated as an error here. */
+	if ((toks = tokenize(doc)) != NULL) {
+		if (!dictionary_add_words(d->dict, toks, d->len - 1))
+			err(1, "dictionary_add_words");
+		freetoks(toks);
+	}
+	free(doc);
+
+	/* title may point into d->title: free only after the last use. */
+	free(d->title);
+	free(d->url);
+	free(d->abstract);
+
+	d->title = NULL;
+	d->url = NULL;
+	d->abstract = NULL;
+}
+
+/*
+ * Index the documents of an XML file.
+ *
+ * argv must hold exactly one argument, the path to the XML file.
+ * The file is fed to expat in BUFSIZ chunks; the callbacks build one
+ * entry per <doc> element and add its words to dict.  On return
+ * *entries points to the array of entries and *len holds their
+ * number.
+ *
+ * Read and parse errors are fatal.  Always returns 0.
+ */
+int
+idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len,
+    int argc, char **argv)
+{
+	struct mydata	 d;
+	XML_Parser	 parser;
+	const char	*xmlpath;
+	char		 buf[BUFSIZ];
+	int		 done = 0;
+	FILE		*fp;
+	size_t		 r;
+
+	if (argc != 1) {
+		warnx("missing path to xml file");
+		usage();
+	}
+	xmlpath = *argv;
+
+	memset(&d, 0, sizeof(d));
+	d.dict = dict;
+
+	if ((parser = XML_ParserCreate(NULL)) == NULL)
+		err(1, "XML_ParserCreate");
+	XML_SetUserData(parser, &d);
+	XML_SetElementHandler(parser, el_start, el_end);
+	XML_SetCharacterDataHandler(parser, on_text);
+
+	if ((fp = fopen(xmlpath, "r")) == NULL)
+		err(1, "can't open %s", xmlpath);
+
+	do {
+		r = fread(buf, 1, sizeof(buf), fp);
+		/* A short read is either EOF or an error: tell them apart. */
+		if (r != sizeof(buf) && ferror(fp))
+			err(1, "can't read %s", xmlpath);
+		done = r != sizeof(buf);
+		/* r is at most BUFSIZ, so it fits the int XML_Parse takes. */
+		if (!XML_Parse(parser, buf, (int)r, done))
+			errx(1, "can't parse: %s at %s:%lu",
+			    XML_ErrorString(XML_GetErrorCode(parser)),
+			    xmlpath,
+			    (unsigned long)XML_GetCurrentLineNumber(parser));
+	} while (!done);
+
+	fclose(fp);
+	XML_ParserFree(parser);
+
+	/* Text collected after the last </doc> belongs to no entry. */
+	free(d.title);
+	free(d.url);
+	free(d.abstract);
+
+	*len = d.len;
+	*entries = d.entries;
+
+	return 0;
+}