Commit Diff


commit - /dev/null
commit + e2b5610c2d39617b564fa64f4201e071a7499abd
blob - /dev/null
blob + 9b0e3f7dcc0943391bae90ec05fdc8ffe50e020b (mode 644)
--- /dev/null
+++ Makefile
@@ -0,0 +1,3 @@
+SUBDIR =	ftsearch mkftsidx
+
+.include <bsd.subdir.mk>
blob - /dev/null
blob + b3984eb9b4ac4e43aca78e00aff2fba2d08456cd (mode 644)
--- /dev/null
+++ ftsearch/Makefile
@@ -0,0 +1,12 @@
+.PATH:${.CURDIR}/../lib
+
+PROG =	ftsearch
+SRCS =	ftsearch.c db.c fts.c tokenize.c
+
+WARNINGS = yes
+
+DEBUG = -O0 -g
+
+CPPFLAGS += -I${.CURDIR}/../include
+
+.include <bsd.prog.mk>
blob - /dev/null
blob + 6ce50b9966e4d78deb8fad9b01cc3565bf5e5177 (mode 644)
--- /dev/null
+++ ftsearch/ftsearch.1
@@ -0,0 +1,70 @@
+.\" Copyright (c) 2022 Omar Polo <op@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.Dd April 11, 2022
+.Dt FTSEARCH 1
+.Os
+.Sh NAME
+.Nm ftsearch
+.Nd search strings quickly
+.Sh SYNOPSIS
+.Nm
+.Bk -words
+.Op Fl d Ar dbpath
+.Op Fl l
+.Op Fl s
+.Op Ar query
+.Ek
+.Sh DESCRIPTION
+The
+.Nm
+utility searches a database for all documents which match the given
+.Ar query .
+The database needs to be created beforehand with
+.Xr mkftsidx 1 .
+.Pp
+The arguments are as follows
+.Bl -tag -width 9m
+.It Fl d Ar dbpath
+Path to the database.
+.Pa db
+by default.
+.It Fl l
+List all known documents.
+Conflicts with
+.Fl s
+and
+.Ar query .
+.It Fl s
+Print database stats.
+Conflicts with
+.Fl l
+and
+.Ar query .
+.It Ar query
+The query to search for.
+.El
+.Sh EXAMPLES
+Search document that match
+.Dq file manager
+.Bd -literal -offset indent
+$ ftsearch 'file manager'
+.Ed
+.Sh SEE ALSO
+.Xr mkftsidx 1
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+program was written by
+.An Omar Polo Aq Mt op@omarpolo.com .
blob - /dev/null
blob + a8f4be0dadf83a849771058b16a03a7e0ad949e7 (mode 644)
--- /dev/null
+++ ftsearch/ftsearch.c
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "db.h"
+#include "fts.h"
+#include "tokenize.h"
+
+const char *dbpath;
+
+static void __dead
+usage(void)
+{
+	fprintf(stderr, "usage: %s [-d db] -l | -s | query",
+	    getprogname());
+	exit(1);
+}
+
+static int
+print_entry(struct db *db, struct db_entry *entry, void *data)
+{
+	printf("%-18s %s\n", entry->name, entry->descr);
+	return 0;
+}
+
+int
+main(int argc, char **argv)
+{
+	struct db db;
+	const char *errstr;
+	int fd, ch;
+	int list = 0, stats = 0, docid = -1;
+
+	while ((ch = getopt(argc, argv, "d:lp:s")) != -1) {
+		switch (ch) {
+		case 'd':
+			dbpath = optarg;
+			break;
+		case 'l':
+			list = 1;
+			break;
+		case 'p':
+			docid = strtonum(optarg, 0, INT_MAX, &errstr);
+			if (errstr != NULL)
+				errx(1, "document id is %s: %s", errstr,
+				    optarg);
+			break;
+		case 's':
+			stats = 1;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (dbpath == NULL)
+		dbpath = "db";
+
+	if (list && stats)
+		usage();
+
+	if ((fd = open(dbpath, O_RDONLY)) == -1)
+		err(1, "can't open %s", dbpath);
+
+	if (pledge("stdio", NULL) == -1)
+		err(1, "pledge");
+
+	if (db_open(&db, fd) == -1)
+		err(1, "db_open");
+
+	if (list) {
+		if (db_listall(&db, print_entry, NULL) == -1)
+			err(1, "db_listall");
+	} else if (stats) {
+		struct db_stats st;
+
+		if (db_stats(&db, &st) == -1)
+			err(1, "db_stats");
+		printf("unique words = %zu\n", st.nwords);
+		printf("documents    = %zu\n", st.ndocs);
+		printf("longest word = %s\n", st.longest_word);
+		printf("most popular = %s (%zu)\n", st.most_popular,
+		    st.most_popular_ndocs);
+	} else if (docid != -1) {
+		struct db_entry e;
+
+		if (db_doc_by_id(&db, docid, &e) == -1)
+			errx(1, "failed to fetch document #%d", docid);
+		print_entry(&db, &e, NULL);
+	} else {
+		if (argc != 1)
+			usage();
+		if (fts(&db, *argv, print_entry, NULL) == -1)
+			errx(1, "fts failed");
+	}
+
+	db_close(&db);
+	close(fd);
+}
blob - /dev/null
blob + cdc1261027504a8f1627c76103148100ca364d9e (mode 644)
--- /dev/null
+++ include/db.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DB_VERSION	 0
+#define DB_WORDLEN	32
+
+struct db {
+	uint8_t	*m;
+	off_t	 len;
+	uint32_t version;
+	uint32_t nwords;
+
+	uint8_t	*idx_start;
+	uint8_t	*idx_end;
+	uint8_t	*list_start;
+	uint8_t	*list_end;
+	uint8_t	*docs_start;
+	uint8_t	*docs_end;
+};
+
+struct db_stats {
+	size_t		 nwords;
+	size_t		 ndocs;
+	const char	*longest_word;
+	const char	*most_popular;
+	size_t		 most_popular_ndocs;
+};
+
+struct db_entry {
+	char	*name;
+	char	*descr;
+};
+
+typedef int (*db_hit_cb)(struct db *, struct db_entry *, void *);
+
+struct dictionary;
+
+int		 db_create(FILE *, struct dictionary *, struct db_entry *, size_t);
+int		 db_open(struct db *, int);
+uint32_t	*db_word_docs(struct db *, const char *, size_t *);
+int		 db_stats(struct db *, struct db_stats *);
+int		 db_listall(struct db *, db_hit_cb, void *);
+int		 db_doc_by_id(struct db *, int, struct db_entry *);
+void		 db_close(struct db *);
blob - /dev/null
blob + 81fa7f21a625ed303a114738d5fd74e1502c060c (mode 644)
--- /dev/null
+++ include/dictionary.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+struct dict_entry {
+	char	*word;
+	int	*ids;
+	size_t	 len;
+	size_t	 cap;
+};
+
+struct dictionary {
+	size_t	len;
+	size_t	cap;
+	struct dict_entry *entries;
+};
+
+int	dictionary_init(struct dictionary *);
+int	dictionary_add(struct dictionary *, const char *, int);
+int	dictionary_add_words(struct dictionary *, char **, int);
+void	dictionary_free(struct dictionary *);
blob - /dev/null
blob + 6196c9ed15110d609b814c34d240a3d166f6d2a6 (mode 644)
--- /dev/null
+++ include/fts.h
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+int	fts(struct db *, const char *, db_hit_cb, void *);
blob - /dev/null
blob + c5da1d7a8f37fcb0a9ef861f311a0a9e49471dbc (mode 644)
--- /dev/null
+++ include/tokenize.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+char	**tokenize(const char *);
+void	  freetoks(char **);
blob - /dev/null
blob + 1f878b9e23e536a58cc047f99c5432d61d23209c (mode 644)
--- /dev/null
+++ lib/db.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "db.h"
+#include "dictionary.h"
+
+#define IDX_ENTRY_SIZE (DB_WORDLEN + sizeof(int64_t))
+
+static int
+write_dictionary(FILE *fp, struct dictionary *dict)
+{
+	off_t start;
+	uint64_t pos;
+	uint32_t n;
+	size_t i, len;
+
+	if ((uint64_t)dict->len > UINT32_MAX)
+		return -1;
+
+	n = dict->len;
+	if (fwrite(&n, sizeof(n), 1, fp) != 1)
+		return -1;
+
+	if ((start = ftello(fp)) == -1)
+		return -1;
+
+	len = DB_WORDLEN + sizeof(int64_t);
+	pos = start + (n * len);
+	for (i = 0; i < dict->len; ++i) {
+		char word[DB_WORDLEN];
+
+		memset(word, 0, sizeof(word));
+		strlcpy(word, dict->entries[i].word, sizeof(word));
+		if (fwrite(word, sizeof(word), 1, fp) != 1)
+			return -1;
+
+		if (fwrite(&pos, sizeof(pos), 1, fp) != 1)
+			return -1;
+
+		/* one for the len */
+		pos += sizeof(uint32_t) * (dict->entries[i].len + 1);
+	}
+
+	for (i = 0; i < dict->len; ++i) {
+		size_t j;
+		uint32_t t, x;
+
+		x = dict->entries[i].len;
+		if (fwrite(&x, sizeof(x), 1, fp) != 1)
+			return -1;
+
+		for (j = 0; j < x; ++j) {
+			t = dict->entries[i].ids[j];
+			if (fwrite(&t, sizeof(t), 1, fp) != 1)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+int
+db_create(FILE *fp, struct dictionary *dict, struct db_entry *entries,
+    size_t n)
+{
+	int64_t endidx;
+	size_t i;
+	uint32_t version = DB_VERSION;
+
+	if (n > INT32_MAX)
+		return -1;
+
+	if (fwrite(&version, sizeof(version), 1, fp) != 1)
+		return -1;
+
+	/* reserve space for the start pointer -- filled later */
+	if (fseek(fp, sizeof(int64_t), SEEK_CUR) == -1)
+		return -1;
+
+	if (write_dictionary(fp, dict) == -1)
+		return -1;
+
+	if ((endidx = ftello(fp)) == -1)
+		return -1;
+
+	for (i = 0; i < n; ++i) {
+		uint16_t namelen, descrlen = 0;
+
+		namelen = strlen(entries[i].name);
+		if (entries[i].descr != NULL)
+			descrlen = strlen(entries[i].descr);
+
+		if (fwrite(&namelen, sizeof(namelen), 1, fp) != 1)
+			return -1;
+		if (fwrite(entries[i].name, namelen+1, 1, fp) != 1)
+			return -1;
+
+		if (fwrite(&descrlen, sizeof(descrlen), 1, fp) != 1)
+			return -1;
+		if (descrlen > 0 &&
+		    fwrite(entries[i].descr, descrlen, 1, fp) != 1)
+			return -1;
+		if (fwrite("", 1, 1, fp) != 1)
+			return -1;
+	}
+
+	if (fseek(fp, sizeof(version), SEEK_SET) == -1)
+		return -1;
+
+	if (fwrite(&endidx, sizeof(endidx), 1, fp) != 1)
+		return -1;
+
+	return 0;
+}
+
+static int
+initdb(struct db *db)
+{
+	off_t hdrlen = sizeof(uint32_t) + sizeof(int64_t) + sizeof(uint32_t);
+	int64_t end_off;
+	uint8_t *p = db->m;
+
+	if (hdrlen > db->len)
+		return -1;
+
+	memcpy(&db->version, p, sizeof(db->version));
+	p += sizeof(db->version);
+
+	memcpy(&end_off, p, sizeof(end_off));
+	p += sizeof(end_off);
+
+	memcpy(&db->nwords, p, sizeof(db->nwords));
+	p += sizeof(db->nwords);
+
+	db->idx_start = p;
+	db->idx_end = p + db->nwords * IDX_ENTRY_SIZE;
+	db->list_start = db->idx_end;
+	db->list_end = db->m + end_off;
+	db->docs_start = db->list_end;
+	db->docs_end = db->m + db->len;
+
+	if (db->idx_end > db->docs_end)
+		return -1;
+	if (db->list_end > db->docs_end)
+		return -1;
+
+	return 0;
+}
+
+int
+db_open(struct db *db, int fd)
+{
+	memset(db, 0, sizeof(*db));
+
+	if ((db->len = lseek(fd, 0, SEEK_END)) == -1)
+		return -1;
+
+	if (lseek(fd, 0, SEEK_SET) == -1)
+		return -1;
+
+	db->m = mmap(NULL, db->len, PROT_READ, MAP_PRIVATE, fd, 0);
+	if (db->m == MAP_FAILED)
+		return -1;
+
+	if (initdb(db) == -1) {
+		db_close(db);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int
+db_countdocs(struct db *db, struct db_entry *e, void *d)
+{
+	struct db_stats *stats = d;
+
+	stats->ndocs++;
+	return 0;
+}
+
+static int
+db_idx_compar(const void *key, const void *elem)
+{
+	const char *word = key;
+	const char *idx_entry = elem;
+
+	if (idx_entry[DB_WORDLEN-1] != '\0')
+		return -1;
+	return strcmp(word, idx_entry);
+}
+
+static inline uint32_t *
+db_getdocs(struct db *db, const uint8_t *entry, size_t *len)
+{
+	int64_t pos;
+	uint32_t l;
+
+	entry += DB_WORDLEN;
+	memcpy(&pos, entry, sizeof(pos));
+
+	entry = db->m + pos;
+	if (entry < db->list_start || entry > db->list_end)
+		return NULL;
+
+	memcpy(&l, entry, sizeof(l));
+	entry += sizeof(l);
+	*len = l;
+	return (uint32_t *)entry;
+}
+
+uint32_t *
+db_word_docs(struct db *db, const char *word, size_t *len)
+{
+	uint8_t *e;
+
+	*len = 0;
+
+	e = bsearch(word, db->idx_start, db->nwords, IDX_ENTRY_SIZE,
+	    db_idx_compar);
+	if (e == NULL)
+		return NULL;
+	return db_getdocs(db, e, len);
+}
+
+int
+db_stats(struct db *db, struct db_stats *stats)
+{
+	const uint8_t *p;
+	size_t l, maxl = 0, idlen;
+
+	memset(stats, 0, sizeof(*stats));
+
+	if (db_listall(db, db_countdocs, stats) == -1)
+		return -1;
+
+	stats->nwords = db->nwords;
+
+	p = db->idx_start;
+	while (p < db->idx_end) {
+		if (p + DB_WORDLEN > db->idx_end)
+			return -1;
+
+		if (p[DB_WORDLEN-1] != '\0')
+			return -1;
+
+		l = strlen(p);
+		if (l > maxl) {
+			maxl = l;
+			stats->longest_word = p;
+		}
+
+		if (db_getdocs(db, p, &idlen) == NULL)
+			return -1;
+
+		if (idlen > stats->most_popular_ndocs) {
+			stats->most_popular_ndocs = idlen;
+			stats->most_popular = p;
+		}
+
+		p += IDX_ENTRY_SIZE;
+	}
+
+	return 0;
+}
+
+static inline uint8_t *
+db_extract_doc(struct db *db, uint8_t *p, struct db_entry *e)
+{
+	uint16_t namelen, descrlen;
+
+	/*
+	 * namelen[2] name[namelen]
+	 * descrlen[2] descr[descrlen]
+	 */
+
+	if (p + 2 > db->docs_end)
+		return NULL;
+	memcpy(&namelen, p, sizeof(namelen));
+	p += sizeof(namelen);
+
+	if (p + namelen > db->docs_end || p[namelen] != '\0')
+		return NULL;
+	e->name = p;
+	p += namelen + 1;
+
+	if (p + 2 > db->docs_end)
+		return NULL;
+	memcpy(&descrlen, p, sizeof(descrlen));
+	p += sizeof(descrlen);
+
+	if (p + descrlen > db->docs_end || p[descrlen] != '\0')
+		return NULL;
+	e->descr = p;
+	p += descrlen + 1;
+
+	return p;
+}
+
+int
+db_listall(struct db *db, db_hit_cb cb, void *data)
+{
+	uint8_t *p = db->docs_start;
+
+	while (p < db->docs_end) {
+		struct db_entry e;
+
+		if ((p = db_extract_doc(db, p, &e)) == NULL)
+			return -1;
+
+		if (cb(db, &e, data) == -1)
+			return -1;
+	}
+
+	return 0;
+}
+
+int
+db_doc_by_id(struct db *db, int docid, struct db_entry *e)
+{
+	uint8_t *p = db->docs_start;
+	int n = 0;
+
+	while (p < db->docs_end) {
+		if ((p = db_extract_doc(db, p, e)) == NULL)
+			return -1;
+
+		if (n == docid)
+			return 0;
+
+		n++;
+	}
+
+	return -1;
+}
+
+void
+db_close(struct db *db)
+{
+	munmap(db->m, db->len);
+	memset(db, 0, sizeof(*db));
+}
blob - /dev/null
blob + 36390fb9a5b5e9e284b077a1a42c2cdff67c7059 (mode 644)
--- /dev/null
+++ lib/dictionary.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+
+#include "dictionary.h"
+
+int
+dictionary_init(struct dictionary *dict)
+{
+	memset(dict, 0, sizeof(*dict));
+	return 1;
+}
+
+static inline int
+add_docid(struct dict_entry *e, int docid)
+{
+	void *t;
+	size_t newcap;
+
+	if (e->len > 0 && e->ids[e->len-1] == docid)
+		return 1;
+
+	if (e->len == e->cap) {
+		newcap = e->cap * 1.5;
+		if (newcap == 0)
+			newcap = 8;
+		t = recallocarray(e->ids, e->cap, newcap, sizeof(*e->ids));
+		if (t == NULL)
+			return 0;
+		e->ids = t;
+		e->cap = newcap;
+	}
+
+	e->ids[e->len++] = docid;
+	return 1;
+}
+
+int
+dictionary_add(struct dictionary *dict, const char *word, int docid)
+{
+	struct dict_entry *e = NULL;
+	void *newentr;
+	size_t newcap, mid = 0, left = 0, right = dict->len;
+	int r = 0;
+
+	while (left < right) {
+		mid = (left + right) / 2;
+		e = &dict->entries[mid];
+		r = strcmp(word, e->word);
+		if (r < 0)
+			right = mid;
+		else if (r > 0)
+			left = mid + 1;
+		else
+			return add_docid(e, docid);
+	}
+
+	if (r > 0)
+		mid++;
+
+	if (dict->len == dict->cap) {
+		newcap = dict->cap * 1.5;
+		if (newcap == 0)
+			newcap = 8;
+		newentr = recallocarray(dict->entries, dict->cap, newcap,
+		    sizeof(*dict->entries));
+		if (newentr == NULL)
+			return 0;
+		dict->entries = newentr;
+		dict->cap = newcap;
+	}
+
+	e = &dict->entries[mid];
+	if (e != dict->entries + dict->len) {
+		size_t i = e - dict->entries;
+		memmove(e+1, e, sizeof(*e) * (dict->len - i));
+	}
+
+	dict->len++;
+	memset(e, 0, sizeof(*e));
+	if ((e->word = strdup(word)) == NULL)
+		return 0;
+	return add_docid(e, docid);
+}
+
+int
+dictionary_add_words(struct dictionary *dict, char **words, int docid)
+{
+	for (; *words != NULL; ++words) {
+		if (!dictionary_add(dict, *words, docid))
+			return 0;
+	}
+
+	return 1;
+}
+
+void
+dictionary_free(struct dictionary *dict)
+{
+	size_t i;
+
+	for (i = 0; i < dict->len; ++i) {
+		free(dict->entries[i].word);
+		free(dict->entries[i].ids);
+	}
+
+	free(dict->entries);
+}
blob - /dev/null
blob + 1e16be4b854b5e95f47bb7ad1060e66867c040cb (mode 644)
--- /dev/null
+++ lib/fts.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "db.h"
+#include "fts.h"
+#include "tokenize.h"
+
+struct doclist {
+	uint32_t	*ids;
+	size_t		 len;
+};
+
+int
+fts(struct db *db, const char *query, db_hit_cb cb, void *data)
+{
+	struct doclist *xs = NULL;
+	size_t i, len;
+	char **toks, **t;
+	int ret = 0;
+
+	if ((toks = tokenize(query)) == NULL)
+		return -1;
+
+	len = 0;
+	for (t = toks; *t != NULL; ++t)
+		len++;
+
+	if (len == 0)
+		goto done;
+
+	if ((xs = calloc(len, sizeof(*xs))) == NULL) {
+		freetoks(toks);
+		return -1;
+	}
+
+	for (i = 0; i < len; ++i) {
+		xs[i].ids = db_word_docs(db, toks[i], &xs[i].len);
+		if (xs[i].ids == NULL || xs[i].len == 0)
+			goto done;
+	}
+
+	for (;;) {
+		struct db_entry e;
+		uint32_t mdoc;
+
+		mdoc = xs[0].ids[0];
+		for (i = 1; i < len; ++i) {
+			if (xs[i].ids[0] > mdoc)
+				goto next;
+			while (xs[i].ids[0] < mdoc) {
+				if (--xs[i].len == 0)
+					goto done;
+				xs[i].ids++;
+			}
+
+			if (xs[i].ids[0] != mdoc)
+				goto next;
+		}
+
+		if (db_doc_by_id(db, mdoc, &e) == -1) {
+			ret = -1;
+			goto done;
+		}
+
+		if (cb(db, &e, data) == -1) {
+			ret = -1;
+			goto done;
+		}
+
+	next:
+		if (--xs[0].len == 0)
+			goto done;
+		xs[0].ids++;
+	}
+
+done:
+	free(xs);
+	freetoks(toks);
+
+	return ret;
+}
blob - /dev/null
blob + d07eb30cb1b222a478848edaccfeefadee59a8d9 (mode 644)
--- /dev/null
+++ lib/tokenize.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "tokenize.h"
+
+#ifndef WDELIMS
+/* everything but a-zA-Z */
+#define WDELIMS " \t\n!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~"
+#endif
+
+char **
+tokenize(const char *s)
+{
+	char *d, *dup, *t, **tok = NULL;
+	void *newtok;
+	size_t cap = 0, len = 0, newcap;
+
+	if ((dup = strdup(s)) == NULL)
+		return NULL;
+	d = dup;
+
+	for (t = d; *t; ++t)
+		*t = tolower(*t);
+
+	while ((t = strsep(&d, WDELIMS)) != NULL) {
+		if (*t == '\0')
+			continue;
+
+		/* keep the space for a NULL terminator */
+		if (len+1 >= cap) {
+			newcap = cap * 1.5;
+			if (newcap == 0)
+				newcap = 8;
+			newtok = recallocarray(tok, cap, newcap,
+			    sizeof(char *));
+			if (newtok == NULL)
+				goto err;
+			tok = newtok;
+			cap = newcap;
+		}
+
+		if ((tok[len++] = strdup(t)) == NULL)
+			goto err;
+	}
+
+	free(dup);
+	return tok;
+
+err:
+	freetoks(tok);
+	free(dup);
+	return NULL;
+}
+
+void
+freetoks(char **tok)
+{
+	char **i;
+
+	if (tok == NULL)
+		return;
+
+	for (i = tok; *i != NULL; ++i)
+		free(*i);
+	free(tok);
+}
blob - /dev/null
blob + 6ff8e82141f23f626139263f9fa9a0af829f73d0 (mode 644)
--- /dev/null
+++ mkftsidx/Makefile
@@ -0,0 +1,23 @@
+.PATH:${.CURDIR}/../lib
+
+PROG =	mkftsidx
+SRCS =	mkftsidx.c ports.c wiki.c db.c dictionary.c tokenize.c
+
+WARNINGS = yes
+
+CPPFLAGS += -I/usr/local/include -I${.CURDIR}/../include
+LDADD = -lexpat -lsqlite3 -L/usr/local/lib
+
+.if defined(PROFILE)
+CPPFLAGS += -DPROFILE
+LDADD += -static -lm -lpthread
+DEBUG = -pg
+.endif
+
+DEBUG += -O0 -g
+
+show-prof:
+	gprof mkftsidx ../gmon.out | gprof2dot | dot -Tpng > profile.png
+	nsxiv profile.png &
+
+.include <bsd.prog.mk>
blob - /dev/null
blob + 74558e22e0c5946bae114760b939a148fd8547c5 (mode 644)
--- /dev/null
+++ mkftsidx/mkftsidx.1
@@ -0,0 +1,74 @@
+.\" Copyright (c) 2022 Omar Polo <op@openbsd.org>
+.\"
+.\" Permission to use, copy, modify, and distribute this software for any
+.\" purpose with or without fee is hereby granted, provided that the above
+.\" copyright notice and this permission notice appear in all copies.
+.\"
+.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+.Dd April 11, 2022
+.Dt MKFTSIDX 1
+.Os
+.Sh NAME
+.Nm mkftsidx
+.Nd construct fts database
+.Sh SYNOPSIS
+.Nm
+.Bk -words
+.Op Fl o Ar dbpath
+.Op Fl m Ar p|w
+.Op Ar path
+.Ek
+.Sh DESCRIPTION
+.Nm
+is a program to create a fts database for
+.Xr ftsearch 1 .
+The arguments are as follows:
+.Bl -tag -width Ds
+.It Fl o Ar dbpath
+Path to the database file to create.
+.Pa db
+by default.
+.It Fl m Ar p|w
+Set the mode.
+If
+.Ar p
+.Pq the default
+then create a database with the
+.Ox
+ports tree data,
+otherwise creates a database from a Wikipedia dump.
+.It Ar path
+Path to the sources.
+When working in
+.Ar p
+mode, it's the optional path to the sqlports database.
+Otherwise, it's the mandatory path to the Wikipedia file dump.
+.El
+.Sh EXAMPLES
+To create a database with the
+.Ox
+ports tree content:
+.Bd -literal -offset indent
+$ mkftsidx
+.Ed
+.Pp
+To create a database from a Wikipedia dump to the
+.Pa db.wiki
+file:
+.Bd -literal -offset indent
+$ mkftsidx -o db.wiki -mw enwiki-latest-abstract1.xml
+.Ed
+.Sh SEE ALSO
+.Xr ftsearch 1
+.Sh AUTHORS
+.An -nosplit
+The
+.Nm
+program was written by
+.An Omar Polo Aq Mt op@omarpolo.com .
blob - /dev/null
blob + a5c9ba3665c026d1c43cd6b584753e845cb708ab (mode 644)
--- /dev/null
+++ mkftsidx/mkftsidx.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "db.h"
+#include "dictionary.h"
+
+#include "mkftsidx.h"
+
+enum {
+	MODE_SQLPORTS,
+	MODE_WIKI,
+};
+
+char *
+xstrdup(const char *s)
+{
+	char *t;
+
+	if (s == NULL)
+		return NULL;
+
+	if ((t = strdup(s)) == NULL)
+		err(1, "strdup");
+	return t;
+}
+
+__dead void
+usage(void)
+{
+	fprintf(stderr, "usage: %s [-o dbpath] [-m p|w] [path]\n",
+	    getprogname());
+	exit(1);
+}
+
+int
+main(int argc, char **argv)
+{
+	struct dictionary dict;
+	struct db_entry *entries = NULL;
+	const char *dbpath = NULL;
+	FILE *fp;
+	size_t i, len = 0;
+	int ch, r = 0, mode = MODE_SQLPORTS;
+
+#ifndef PROFILE
+	/* sqlite needs flock */
+	if (pledge("stdio rpath wpath cpath flock", NULL) == -1)
+		err(1, "pledge");
+#endif
+
+	while ((ch = getopt(argc, argv, "m:o:")) != -1) {
+		switch (ch) {
+		case 'm':
+			switch (*optarg) {
+			case 'p':
+				mode = MODE_SQLPORTS;
+				break;
+			case 'w':
+				mode = MODE_WIKI;
+				break;
+			default:
+				usage();
+			}
+			break;
+		case 'o':
+			dbpath = optarg;
+			break;
+		default:
+			usage();
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (dbpath == NULL)
+		dbpath = "db";
+
+	if (!dictionary_init(&dict))
+		err(1, "dictionary_init");
+
+	if (mode == MODE_SQLPORTS)
+		r = idx_ports(&dict, &entries, &len, argc, argv);
+	else
+		r = idx_wiki(&dict, &entries, &len, argc, argv);
+
+	if (r == 0) {
+		if ((fp = fopen(dbpath, "w+")) == NULL)
+			err(1, "can't open %s", dbpath);
+		if (db_create(fp, &dict, entries, len) == -1) {
+			warn("db_create");
+			unlink(dbpath);
+			r = 1;
+		}
+		fclose(fp);
+	}
+
+	for (i = 0; i < len; ++i) {
+		free(entries[i].name);
+		free(entries[i].descr);
+	}
+	free(entries);
+	dictionary_free(&dict);
+
+	return r;
+}
blob - /dev/null
blob + e64b8c34a543829be5f9855d096e818fc30e2597 (mode 644)
--- /dev/null
+++ mkftsidx/mkftsidx.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* mkftsidx.c */
+__dead void	 usage(void);
+char		*xstrdup(const char *);
+
+/* ports.c */
+int idx_ports(struct dictionary *, struct db_entry **, size_t *,
+    int, char **);
+
+/* wiki.c */
+int idx_wiki(struct dictionary *, struct db_entry **, size_t *,
+    int, char **);
blob - /dev/null
blob + aaad95dea48ebc60c0071734e949b74faf9c0eed (mode 644)
--- /dev/null
+++ mkftsidx/ports.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sqlite3.h>
+
+#include "db.h"
+#include "dictionary.h"
+#include "tokenize.h"
+
+#include "mkftsidx.h"
+
+#ifndef SQLPORTS
+#define SQLPORTS "/usr/local/share/sqlports"
+#endif
+
+#define QNUM "select count(*) from portsq;"
+#define QALL "select pkgstem, comment, descr_contents from portsq;"
+
+static int
+countports(sqlite3 *db)
+{
+	sqlite3_stmt *stmt;
+	int r, n = -1;
+
+	r = sqlite3_prepare_v2(db, QNUM, -1, &stmt, NULL);
+	if (r != SQLITE_OK) {
+		warnx("failed to prepare statement: %s",
+		    sqlite3_errstr(r));
+		return -1;
+	}
+
+	r = sqlite3_step(stmt);
+	if (r == SQLITE_ROW)
+		n = sqlite3_column_int(stmt, 0);
+
+	sqlite3_finalize(stmt);
+	return n;
+}
+
+int
+idx_ports(struct dictionary *dict, struct db_entry **entries, size_t *len,
+    int argc, char **argv)
+{
+	const char *dbpath;
+	sqlite3 *db;
+	sqlite3_stmt *stmt;
+	size_t i;
+	int r;
+
+	if (argc > 1)
+		usage();
+	else if (argc == 1)
+		dbpath = *argv;
+	else
+		dbpath = SQLPORTS;
+
+	if ((r = sqlite3_open(dbpath, &db)) != SQLITE_OK)
+		errx(1, "can't open %s: %s", dbpath, sqlite3_errstr(r));
+
+	if ((r = countports(db)) == -1 || r == 0) {
+		warnx("error querying the db or empty portsq table!");
+		goto done;
+	}
+	*len = r;
+
+	if ((*entries = calloc(*len, sizeof(**entries))) == NULL)
+		err(1, "calloc");
+
+	r = sqlite3_prepare_v2(db, QALL, -1, &stmt, NULL);
+	if (r != SQLITE_OK)
+		errx(1, "failed to prepare statement: %s", sqlite3_errstr(r));
+
+	for (i = 0; i < *len; ++i) {
+		const char *pkgstem, *comment, *descr;
+		char *doc, **toks;
+
+		r = sqlite3_step(stmt);
+		if (r == SQLITE_DONE)
+			break;
+		if (r != SQLITE_ROW)
+			errx(1, "sqlite3_step: %s", sqlite3_errstr(r));
+
+		pkgstem = sqlite3_column_text(stmt, 0);
+		comment = sqlite3_column_text(stmt, 1);
+		descr = sqlite3_column_text(stmt, 2);
+
+		(*entries)[i].name = xstrdup(pkgstem);
+		(*entries)[i].descr = xstrdup(comment);
+
+		r = asprintf(&doc, "%s %s %s", pkgstem,
+		    comment != NULL ? comment : "",
+		    descr != NULL ? descr : "");
+		if (r == -1)
+			err(1, "asprintf");
+
+		if ((toks = tokenize(doc)) == NULL)
+			err(1, "tokenize");
+		if (!dictionary_add_words(dict, toks, i))
+			err(1, "dictionary_add_words");
+		freetoks(toks);
+		free(doc);
+	}
+
+done:
+	sqlite3_close(db);
+	return 0;
+}
blob - /dev/null
blob + a2fa1f2751eaa0c92f8c65cfe384e8ed9f4f018d (mode 644)
--- /dev/null
+++ mkftsidx/wiki.c
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2022 Omar Polo <op@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <err.h>
+#include <expat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "db.h"
+#include "dictionary.h"
+#include "tokenize.h"
+
+#include "mkftsidx.h"
+
+enum {
+	N_UNK,
+	N_TIT,
+	N_URL,
+	N_ABS,
+};
+
+struct mydata {
+	struct dictionary	*dict;
+	struct db_entry		*entries;
+	size_t			 len;
+	size_t			 cap;
+
+	int next;
+	char *title;
+	char *url;
+	char *abstract;
+};
+
+static void
+el_start(void *data, const char *element, const char **attr)
+{
+	struct mydata *d = data;
+
+	if (!strcmp(element, "title")) {
+		d->next = N_TIT;
+	} else if (!strcmp(element, "url")) {
+		d->next = N_URL;
+	} else if (!strcmp(element, "abstract")) {
+		d->next = N_ABS;
+	}
+}
+
+static void
+append_text(char **text, const char *s, int len)
+{
+	char *t, *out, *orig;
+
+	if ((t = calloc(1, len + 1)) == NULL)
+		err(1, "calloc");
+	memcpy(t, s, len);
+
+	if ((orig = *text) == NULL)
+		orig = "";
+	if (asprintf(&out, "%s%s", orig, t) == -1)
+		err(1, "asprintf");
+	free(*text);
+	*text = out;
+	free(t);
+}
+
+static void
+on_text(void *data, const char *s, int len)
+{
+	struct mydata *d = data;
+
+	switch (d->next) {
+	case N_TIT:
+		append_text(&d->title, s, len);
+		break;
+	case N_URL:
+		append_text(&d->url, s, len);
+		break;
+	case N_ABS:
+		append_text(&d->abstract, s, len);
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+el_end(void *data, const char *element)
+{
+	struct mydata *d = data;
+	struct db_entry *e;
+	size_t newcap;
+	const char *title;
+	char *doc, **toks;
+	void *t;
+	int r, next;
+
+	next = d->next;
+	d->next = N_UNK;
+	if ((next == N_TIT && !strcmp(element, "title")) ||
+	    (next == N_URL && !strcmp(element, "url")) ||
+	    (next == N_ABS && !strcmp(element, "abstract")) ||
+	    strcmp(element, "doc"))
+		return;
+
+	if (d->len == d->cap) {
+		newcap = d->cap * 1.5;
+		if (newcap == 0)
+			newcap = 8;
+		t = recallocarray(d->entries, d->cap, newcap,
+		    sizeof(*d->entries));
+		if (t == NULL)
+			err(1, "recallocarray");
+		d->entries = t;
+		d->cap = newcap;
+	}
+
+	title = d->title;
+	if (!strncmp(title, "Wikipedia: ", 11))
+		title += 11;
+
+	e = &d->entries[d->len++];
+	e->name = xstrdup(d->url);
+	e->descr = xstrdup(title);
+
+	if (d->len % 1000 == 0)
+		printf("=> %zu\n", d->len);
+
+	r = asprintf(&doc, "%s %s", title, d->abstract);
+	if (r == -1)
+		err(1, "asprintf");
+
+	if ((toks = tokenize(doc)) != NULL) {
+		if (!dictionary_add_words(d->dict, toks, d->len-1))
+			err(1, "dictionary_add_words");
+		freetoks(toks);
+	}
+	free(doc);
+
+	free(d->title);
+	free(d->url);
+	free(d->abstract);
+
+	d->title = NULL;
+	d->url = NULL;
+	d->abstract = NULL;
+}
+
+int
+idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len,
+    int argc, char **argv)
+{
+	struct mydata d;
+	XML_Parser parser;
+	const char *xmlpath;
+	char buf[BUFSIZ];
+	int done = 0;
+	FILE *fp;
+	size_t r;
+
+	if (argc != 1) {
+		warnx("missing path to xml file");
+		usage();
+	}
+	xmlpath = *argv;
+
+	memset(&d, 0, sizeof(d));
+	d.dict = dict;
+
+	if ((parser = XML_ParserCreate(NULL)) == NULL)
+		err(1, "XML_ParserCreate");
+	XML_SetUserData(parser, &d);
+	XML_SetElementHandler(parser, el_start, el_end);
+	XML_SetCharacterDataHandler(parser, on_text);
+
+	if ((fp = fopen(xmlpath, "r")) == NULL)
+		err(1, "can't open %s", xmlpath);
+
+	do {
+		r = fread(buf, 1, sizeof(buf), fp);
+		done = r != sizeof(buf);
+		if (!XML_Parse(parser, buf, r, done))
+			errx(1, "can't parse: %s at %s:%lu",
+			    XML_ErrorString(XML_GetErrorCode(parser)),
+			    xmlpath,
+			    XML_GetCurrentLineNumber(parser));
+	} while (!done);
+
+	fclose(fp);
+	XML_ParserFree(parser);
+
+	*len = d.len;
+	*entries = d.entries;
+
+	return 0;
+}