2 e2b5610c 2022-04-11 op * Copyright (c) 2022 Omar Polo <op@openbsd.org>
4 e2b5610c 2022-04-11 op * Permission to use, copy, modify, and distribute this software for any
5 e2b5610c 2022-04-11 op * purpose with or without fee is hereby granted, provided that the above
6 e2b5610c 2022-04-11 op * copyright notice and this permission notice appear in all copies.
8 e2b5610c 2022-04-11 op * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 e2b5610c 2022-04-11 op * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 e2b5610c 2022-04-11 op * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 e2b5610c 2022-04-11 op * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 e2b5610c 2022-04-11 op * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 e2b5610c 2022-04-11 op * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 e2b5610c 2022-04-11 op * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 e2b5610c 2022-04-11 op #include <err.h>
18 e2b5610c 2022-04-11 op #include <expat.h>
19 e2b5610c 2022-04-11 op #include <stdio.h>
20 e2b5610c 2022-04-11 op #include <stdlib.h>
21 e2b5610c 2022-04-11 op #include <string.h>
23 e2b5610c 2022-04-11 op #include "db.h"
24 e2b5610c 2022-04-11 op #include "dictionary.h"
25 e2b5610c 2022-04-11 op #include "tokenize.h"
27 e2b5610c 2022-04-11 op #include "mkftsidx.h"
/*
 * Parser state handed to the expat callbacks as user data (see the
 * XML_SetUserData call in idx_wiki).
 * dict is the dictionary that receives the tokenized words, entries is
 * the growing array of db_entry records, and abstract accumulates the
 * character data of the current "abstract" element.
 * NOTE(review): this listing omits some lines; the handlers also use
 * members named len, cap, next, title and url whose declarations are
 * not visible here.
 */
36 e2b5610c 2022-04-11 op struct mydata {
37 e2b5610c 2022-04-11 op struct dictionary *dict;
38 e2b5610c 2022-04-11 op struct db_entry *entries;
45 e2b5610c 2022-04-11 op char *abstract;
/*
 * Start-element handler (registered with XML_SetElementHandler in
 * idx_wiki).  Records in d->next which of the interesting elements --
 * "title", "url" or "abstract" -- has just been opened, so that on_text
 * knows which buffer the following character data belongs to.
 * No else branch is visible, so other element names appear to leave
 * d->next as it was; attr is not used in the visible lines.
 */
49 e2b5610c 2022-04-11 op el_start(void *data, const char *element, const char **attr)
51 e2b5610c 2022-04-11 op struct mydata *d = data;
53 e2b5610c 2022-04-11 op if (!strcmp(element, "title")) {
54 e2b5610c 2022-04-11 op d->next = N_TIT;
55 e2b5610c 2022-04-11 op } else if (!strcmp(element, "url")) {
56 e2b5610c 2022-04-11 op d->next = N_URL;
57 e2b5610c 2022-04-11 op } else if (!strcmp(element, "abstract")) {
58 e2b5610c 2022-04-11 op d->next = N_ABS;
/*
 * Append the len bytes at s to the string pointed to by *text.
 * Allocation failures terminate the program through err(1, ...).
 * NOTE(review): the lines that handle *text == NULL and that store or
 * free the results are not visible in this listing; presumably orig
 * falls back to "" and *text is replaced by out -- confirm against the
 * full file.
 */
63 e2b5610c 2022-04-11 op append_text(char **text, const char *s, int len)
65 e2b5610c 2022-04-11 op char *t, *out, *orig;
/*
 * Make a zero-filled copy one byte longer than the chunk: after the
 * memcpy the trailing byte is still 0, so t is a NUL-terminated string
 * suitable for the "%s" conversion below.
 */
67 e2b5610c 2022-04-11 op if ((t = calloc(1, len + 1)) == NULL)
68 e2b5610c 2022-04-11 op err(1, "calloc");
69 e2b5610c 2022-04-11 op memcpy(t, s, len);
/* First chunk for this field: there is no previous text yet. */
71 e2b5610c 2022-04-11 op if ((orig = *text) == NULL)
/* Build the concatenation of the previous text and the new chunk. */
73 e2b5610c 2022-04-11 op if (asprintf(&out, "%s%s", orig, t) == -1)
74 e2b5610c 2022-04-11 op err(1, "asprintf");
/*
 * Character-data handler (registered with XML_SetCharacterDataHandler
 * in idx_wiki).  Depending on d->next, appends the len bytes at s to
 * the pending title, url or abstract via append_text.
 * NOTE(review): the case labels are not visible in this listing;
 * presumably N_TIT, N_URL and N_ABS respectively -- confirm.
 */
81 e2b5610c 2022-04-11 op on_text(void *data, const char *s, int len)
83 e2b5610c 2022-04-11 op struct mydata *d = data;
85 e2b5610c 2022-04-11 op switch (d->next) {
87 e2b5610c 2022-04-11 op append_text(&d->title, s, len);
90 e2b5610c 2022-04-11 op append_text(&d->url, s, len);
93 e2b5610c 2022-04-11 op append_text(&d->abstract, s, len);
/*
 * End-element handler (registered with XML_SetElementHandler in
 * idx_wiki).  Always resets d->next to N_UNK.  Judging from the
 * visible condition, nothing more happens unless the element being
 * closed is "doc"; for a finished "doc" it appends a db_entry
 * (name = url, descr = title), tokenizes "title abstract", adds the
 * tokens to the dictionary under the new entry's index, and clears the
 * pending title/url/abstract buffers.
 * NOTE(review): the statement controlled by the condition on element
 * is not visible in this listing; presumably an early return.  The
 * declarations of next, t and r are not visible either.
 */
101 e2b5610c 2022-04-11 op el_end(void *data, const char *element)
103 e2b5610c 2022-04-11 op struct mydata *d = data;
104 e2b5610c 2022-04-11 op struct db_entry *e;
105 e2b5610c 2022-04-11 op size_t newcap;
106 86693535 2022-04-11 op const char *title, *abstract;
107 e2b5610c 2022-04-11 op char *doc, **toks;
/* Remember which field was open, then mark that none is. */
111 e2b5610c 2022-04-11 op next = d->next;
112 e2b5610c 2022-04-11 op d->next = N_UNK;
/*
 * True when a title/url/abstract element is being closed, or when the
 * element is anything other than "doc" (strcmp is non-zero).
 */
113 e2b5610c 2022-04-11 op if ((next == N_TIT && !strcmp(element, "title")) ||
114 e2b5610c 2022-04-11 op (next == N_URL && !strcmp(element, "url")) ||
115 e2b5610c 2022-04-11 op (next == N_ABS && !strcmp(element, "abstract")) ||
116 e2b5610c 2022-04-11 op strcmp(element, "doc"))
/*
 * Grow the entries array by a factor of 1.5 when it is full.
 * NOTE(review): the value assigned when newcap computes to 0 (initial
 * capacity) is not visible in this listing.
 */
119 e2b5610c 2022-04-11 op if (d->len == d->cap) {
120 e2b5610c 2022-04-11 op newcap = d->cap * 1.5;
121 e2b5610c 2022-04-11 op if (newcap == 0)
123 e2b5610c 2022-04-11 op t = recallocarray(d->entries, d->cap, newcap,
124 e2b5610c 2022-04-11 op sizeof(*d->entries));
125 e2b5610c 2022-04-11 op if (t == NULL)
126 e2b5610c 2022-04-11 op err(1, "recallocarray");
127 e2b5610c 2022-04-11 op d->entries = t;
128 e2b5610c 2022-04-11 op d->cap = newcap;
/*
 * Test for the 11-byte "Wikipedia: " prefix on the title.
 * NOTE(review): the statement under this test is not visible;
 * presumably it skips the prefix.  Also, title (and url below) are
 * used without a visible NULL check, unlike abstract -- a "doc" with no
 * title or url would pass NULL to strncmp/xstrdup unless a line
 * missing from this listing guards against it.  Confirm.
 */
131 e2b5610c 2022-04-11 op title = d->title;
132 e2b5610c 2022-04-11 op if (!strncmp(title, "Wikipedia: ", 11))
/* A document without an abstract is indexed with an empty one. */
135 86693535 2022-04-11 op if ((abstract = d->abstract) == NULL)
136 86693535 2022-04-11 op abstract = "";
/* Claim the next slot; d->len now counts the new entry too. */
138 e2b5610c 2022-04-11 op e = &d->entries[d->len++];
139 e2b5610c 2022-04-11 op e->name = xstrdup(d->url);
140 e2b5610c 2022-04-11 op e->descr = xstrdup(title);
/* Progress report every 1000 entries. */
142 e2b5610c 2022-04-11 op if (d->len % 1000 == 0)
143 e2b5610c 2022-04-11 op printf("=> %zu\n", d->len);
/* The indexed text is the title followed by the abstract. */
145 86693535 2022-04-11 op r = asprintf(&doc, "%s %s", title, abstract);
147 e2b5610c 2022-04-11 op err(1, "asprintf");
149 16b32c38 2022-04-11 op if ((toks = tokenize(doc)) == NULL)
150 16b32c38 2022-04-11 op err(1, "tokenize");
/* d->len was already incremented, so d->len-1 is this entry's index. */
151 16b32c38 2022-04-11 op if (!dictionary_add_words(d->dict, toks, d->len-1))
152 16b32c38 2022-04-11 op err(1, "dictionary_add_words");
153 16b32c38 2022-04-11 op freetoks(toks);
/*
 * Release the per-document buffers and reset them to NULL so that
 * append_text starts from scratch for the next document.
 */
156 e2b5610c 2022-04-11 op free(d->title);
157 e2b5610c 2022-04-11 op free(d->url);
158 e2b5610c 2022-04-11 op free(d->abstract);
160 e2b5610c 2022-04-11 op d->title = NULL;
161 e2b5610c 2022-04-11 op d->url = NULL;
162 e2b5610c 2022-04-11 op d->abstract = NULL;
/*
 * Build the index from an XML file of "doc" records, each carrying
 * "title", "url" and "abstract" elements (presumably a Wikipedia
 * abstracts dump, given the "Wikipedia: " title prefix handled in
 * el_end).
 *
 * dict    - dictionary that receives the tokenized words.
 * entries - out: set to the array of collected db_entry records.
 * len     - out: set to the number of collected entries.
 * argc/argv - exactly one argument is expected: the path to the XML
 *           file.
 *
 * Failure to create the parser, open the file or parse its content
 * terminates the program through err/errx.
 * NOTE(review): the return type, the return statements, the
 * declarations of fp and r, the loop opener and the end of the
 * function are not visible in this listing.
 */
166 e2b5610c 2022-04-11 op idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len,
167 e2b5610c 2022-04-11 op int argc, char **argv)
169 e2b5610c 2022-04-11 op struct mydata d;
170 e2b5610c 2022-04-11 op XML_Parser parser;
171 e2b5610c 2022-04-11 op const char *xmlpath;
172 e2b5610c 2022-04-11 op char buf[BUFSIZ];
173 e2b5610c 2022-04-11 op int done = 0;
/*
 * NOTE(review): the test rejects any argument count other than one,
 * while the message only mentions a missing path.
 */
177 e2b5610c 2022-04-11 op if (argc != 1) {
178 e2b5610c 2022-04-11 op warnx("missing path to xml file");
181 e2b5610c 2022-04-11 op xmlpath = *argv;
/* Start with empty state: no entries, no pending text fields. */
183 e2b5610c 2022-04-11 op memset(&d, 0, sizeof(d));
184 e2b5610c 2022-04-11 op d.dict = dict;
/* Create the parser and hook up the three callbacks with d as state. */
186 e2b5610c 2022-04-11 op if ((parser = XML_ParserCreate(NULL)) == NULL)
187 e2b5610c 2022-04-11 op err(1, "XML_ParserCreate");
188 e2b5610c 2022-04-11 op XML_SetUserData(parser, &d);
189 e2b5610c 2022-04-11 op XML_SetElementHandler(parser, el_start, el_end);
190 e2b5610c 2022-04-11 op XML_SetCharacterDataHandler(parser, on_text);
192 e2b5610c 2022-04-11 op if ((fp = fopen(xmlpath, "r")) == NULL)
193 e2b5610c 2022-04-11 op err(1, "can't open %s", xmlpath);
/*
 * Feed the file to the parser one buffer at a time; a short read is
 * taken as the final chunk and passed to XML_Parse as such.
 * NOTE(review): no ferror check is visible, so a read error would be
 * treated like end of file -- confirm against the full file.
 */
196 e2b5610c 2022-04-11 op r = fread(buf, 1, sizeof(buf), fp);
197 e2b5610c 2022-04-11 op done = r != sizeof(buf);
198 e2b5610c 2022-04-11 op if (!XML_Parse(parser, buf, r, done))
199 e2b5610c 2022-04-11 op errx(1, "can't parse: %s at %s:%lu",
200 e2b5610c 2022-04-11 op XML_ErrorString(XML_GetErrorCode(parser)),
202 e2b5610c 2022-04-11 op XML_GetCurrentLineNumber(parser));
203 e2b5610c 2022-04-11 op } while (!done);
206 e2b5610c 2022-04-11 op XML_ParserFree(parser);
/* Hand the collected entries and their count back to the caller. */
208 e2b5610c 2022-04-11 op *len = d.len;
209 e2b5610c 2022-04-11 op *entries = d.entries;