2 * Copyright (c) 2022 Omar Polo <op@omarpolo.com>
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
24 #include "dictionary.h"
/*
 * NOTE(review): presumably members of struct mydata, the state object
 * the XML handlers receive as their `data' argument -- the struct
 * header is not visible in this excerpt.
 */
/* Dictionary passed to dictionary_add_words() for each indexed document. */
37 struct dictionary *dict;
/* Array of entries, grown with recallocarray() and indexed by len. */
38 struct db_entry *entries;
/*
 * Element-start callback; installed together with el_end through
 * XML_SetElementHandler().
 *
 * data:    the user data pointer, used here as a struct mydata.
 * element: name of the element being opened.
 * attr:    element attributes (not referenced in the visible lines).
 *
 * The element name is compared against "title", "url" and "abstract".
 * NOTE(review): the body of each branch is not visible in this excerpt;
 * presumably it records which field the following character data
 * belongs to -- confirm against the full file.
 */
49 el_start(void *data, const char *element, const char **attr)
51 struct mydata *d = data;
53 if (!strcmp(element, "title")) {
55 } else if (!strcmp(element, "url")) {
57 } else if (!strcmp(element, "abstract")) {
/*
 * Append a chunk of character data to the string pointed to by *text.
 *
 * text: address of the accumulated string; may point to NULL.
 * s:    chunk of text; its length is given by len.
 * len:  number of bytes of s to use.
 */
63 append_text(char **text, const char *s, int len)
/*
 * Zero-filled scratch buffer with room for len bytes plus a trailing NUL.
 * NOTE(review): the copy of s into t is not visible in this excerpt.
 */
67 if ((t = calloc(1, len + 1)) == NULL)
/* Nothing accumulated yet; the handling of this case is not visible here. */
71 if ((orig = *text) == NULL)
/* Build a new string made of the previous content followed by the chunk. */
73 if (asprintf(&out, "%s%s", orig, t) == -1)
/*
 * Character-data callback; installed through
 * XML_SetCharacterDataHandler().
 *
 * data: the user data pointer, used here as a struct mydata.
 * s:    text chunk; its length is given by len.
 * len:  number of bytes in s.
 *
 * The chunk is appended to the title, url or abstract accumulator.
 * NOTE(review): the conditions selecting which accumulator is used are
 * not visible in this excerpt.
 */
81 on_text(void *data, const char *s, int len)
83 struct mydata *d = data;
87 append_text(&d->title, s, len);
90 append_text(&d->url, s, len);
93 append_text(&d->abstract, s, len);
/*
 * Element-end callback; installed together with el_start through
 * XML_SetElementHandler().
 *
 * data:    the user data pointer, used here as a struct mydata.
 * element: name of the element being closed.
 *
 * In the visible lines it stores a new entry built from the accumulated
 * url and title, then tokenizes "title abstract" and adds the resulting
 * words to the dictionary under the index of that entry.
 */
101 el_end(void *data, const char *element)
103 struct mydata *d = data;
106 const char *title, *abstract;
/*
 * True when a title/url/abstract element closes while it is the pending
 * one, or when the element is anything other than "doc" (strcmp() is
 * non-zero on mismatch).
 * NOTE(review): the statement guarded by this test is not visible;
 * presumably an early return so only a closing "doc" is indexed.
 */
113 if ((next == N_TIT && !strcmp(element, "title")) ||
114 (next == N_URL && !strcmp(element, "url")) ||
115 (next == N_ABS && !strcmp(element, "abstract")) ||
116 strcmp(element, "doc"))
/* The entries array is full: grow it by a factor of 1.5. */
119 if (d->len == d->cap) {
/*
 * NOTE(review): yields 0 when cap is 0; the lines that follow in the
 * full file (not visible here) may set a minimum -- confirm.
 */
120 newcap = d->cap * 1.5;
123 t = recallocarray(d->entries, d->cap, newcap,
124 sizeof(*d->entries));
126 err(1, "recallocarray");
/* 11 is the length of the "Wikipedia: " prefix being tested. */
132 if (!strncmp(title, "Wikipedia: ", 11))
/* No abstract text was collected; the fallback is not visible here. */
135 if ((abstract = d->abstract) == NULL)
/* Take the next free slot and fill it with copies of the url and title. */
138 e = &d->entries[d->len++];
139 e->name = xstrdup(d->url);
140 e->descr = xstrdup(title);
/* Print the running entry count every 1000 entries. */
142 if (d->len % 1000 == 0)
143 printf("=> %zu\n", d->len);
/* Text to index: the title and the abstract separated by a space. */
145 r = asprintf(&doc, "%s %s", title, abstract);
149 if ((toks = tokenize(doc)) == NULL)
/* d->len was already incremented, so len-1 is the entry just stored. */
151 if (!dictionary_add_words(d->dict, toks, d->len-1))
152 err(1, "dictionary_add_words");
/*
 * Parse an XML file and hand the collected entries back to the caller.
 *
 * dict:    dictionary (see dictionary.h); NOTE(review): presumably
 *          stored in the parser state so el_end can add words to it --
 *          the assignment is not visible in this excerpt.
 * entries: output; set to the entries array built while parsing.
 * len:     output parameter; its assignment is not visible here.
 * argc, argv: arguments; NOTE(review): presumably the source of the
 *          xml path -- confirm against the full file.
 *
 * Failures to create the parser, open the file or parse the input
 * terminate the program through err()/errx().
 */
166 idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len,
167 int argc, char **argv)
178 warnx("missing path to xml file");
/* Start from an all-zero parser state. */
183 memset(&d, 0, sizeof(d));
/* Create the parser and register the state and the three callbacks. */
186 if ((parser = XML_ParserCreate(NULL)) == NULL)
187 err(1, "XML_ParserCreate");
188 XML_SetUserData(parser, &d);
189 XML_SetElementHandler(parser, el_start, el_end);
190 XML_SetCharacterDataHandler(parser, on_text);
192 if ((fp = fopen(xmlpath, "r")) == NULL)
193 err(1, "can't open %s", xmlpath);
/*
 * Feed the file to the parser one buffer at a time; a short read marks
 * the final chunk.
 * NOTE(review): a short read may also be a read error; no ferror()
 * check appears in the visible lines.
 */
196 r = fread(buf, 1, sizeof(buf), fp);
197 done = r != sizeof(buf);
198 if (!XML_Parse(parser, buf, r, done))
199 errx(1, "can't parse: %s at %s:%lu",
200 XML_ErrorString(XML_GetErrorCode(parser)),
202 XML_GetCurrentLineNumber(parser));
206 XML_ParserFree(parser);
/* Hand the array built by el_end over to the caller. */
209 *entries = d.entries;