Blob


1 /*
2 * Copyright (c) 2022 Omar Polo <op@openbsd.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
17 #include <err.h>
18 #include <expat.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
23 #include "db.h"
24 #include "dictionary.h"
25 #include "tokenize.h"
27 #include "mkftsidx.h"
/*
 * Identifies which text field, if any, the character-data handler is
 * currently collecting.  N_UNK is zero, so a zero-filled struct mydata
 * starts out collecting nothing.
 */
enum {
	N_UNK = 0,	/* no field of interest is open */
	N_TIT = 1,	/* set when a <title> element starts */
	N_URL = 2,	/* set when a <url> element starts */
	N_ABS = 3,	/* set when an <abstract> element starts */
};
/*
 * State shared by the expat callbacks; it is registered as the parser's
 * user data and zero-filled before parsing starts.
 */
struct mydata {
	struct dictionary *dict;	/* receives the tokens of each document */
	struct db_entry *entries;	/* growable array, one slot per <doc> */
	size_t len, cap;		/* used and allocated slots in entries */

	int next;			/* N_* value: field being collected */
	char *title, *url, *abstract;	/* text gathered so far, NULL if none */
};
48 static void
49 el_start(void *data, const char *element, const char **attr)
50 {
51 struct mydata *d = data;
53 if (!strcmp(element, "title")) {
54 d->next = N_TIT;
55 } else if (!strcmp(element, "url")) {
56 d->next = N_URL;
57 } else if (!strcmp(element, "abstract")) {
58 d->next = N_ABS;
59 }
60 }
/*
 * Append the first len bytes of s (which need not be NUL-terminated) to
 * the heap-allocated string *text, growing it in place.  *text may be
 * NULL, in which case a new string is started.  On return *text points
 * to the NUL-terminated result, which the caller must free().
 *
 * Exits the program via err() if memory cannot be allocated.
 */
static void
append_text(char **text, const char *s, int len)
{
	size_t	 oldlen, addlen;
	char	*out;

	oldlen = (*text != NULL) ? strlen(*text) : 0;
	addlen = (size_t)len;

	/*
	 * A single realloc is enough: there is no need for a temporary
	 * NUL-terminated copy of the chunk nor for re-copying the text
	 * accumulated so far.
	 */
	if ((out = realloc(*text, oldlen + addlen + 1)) == NULL)
		err(1, "realloc");

	memcpy(out + oldlen, s, addlen);
	out[oldlen + addlen] = '\0';
	*text = out;
}
80 static void
81 on_text(void *data, const char *s, int len)
82 {
83 struct mydata *d = data;
85 switch (d->next) {
86 case N_TIT:
87 append_text(&d->title, s, len);
88 break;
89 case N_URL:
90 append_text(&d->url, s, len);
91 break;
92 case N_ABS:
93 append_text(&d->abstract, s, len);
94 break;
95 default:
96 break;
97 }
98 }
100 static void
101 el_end(void *data, const char *element)
103 struct mydata *d = data;
104 struct db_entry *e;
105 size_t newcap;
106 const char *title, *abstract;
107 char *doc, **toks;
108 void *t;
109 int r, next;
111 next = d->next;
112 d->next = N_UNK;
113 if ((next == N_TIT && !strcmp(element, "title")) ||
114 (next == N_URL && !strcmp(element, "url")) ||
115 (next == N_ABS && !strcmp(element, "abstract")) ||
116 strcmp(element, "doc"))
117 return;
119 if (d->len == d->cap) {
120 newcap = d->cap * 1.5;
121 if (newcap == 0)
122 newcap = 8;
123 t = recallocarray(d->entries, d->cap, newcap,
124 sizeof(*d->entries));
125 if (t == NULL)
126 err(1, "recallocarray");
127 d->entries = t;
128 d->cap = newcap;
131 title = d->title;
132 if (!strncmp(title, "Wikipedia: ", 11))
133 title += 11;
135 if ((abstract = d->abstract) == NULL)
136 abstract = "";
138 e = &d->entries[d->len++];
139 e->name = xstrdup(d->url);
140 e->descr = xstrdup(title);
142 if (d->len % 1000 == 0)
143 printf("=> %zu\n", d->len);
145 r = asprintf(&doc, "%s %s", title, abstract);
146 if (r == -1)
147 err(1, "asprintf");
149 if ((toks = tokenize(doc)) == NULL)
150 err(1, "tokenize");
151 if (!dictionary_add_words(d->dict, toks, d->len-1))
152 err(1, "dictionary_add_words");
153 freetoks(toks);
154 free(doc);
156 free(d->title);
157 free(d->url);
158 free(d->abstract);
160 d->title = NULL;
161 d->url = NULL;
162 d->abstract = NULL;
165 int
166 idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len,
167 int argc, char **argv)
169 struct mydata d;
170 XML_Parser parser;
171 const char *xmlpath;
172 char buf[BUFSIZ];
173 int done = 0;
174 FILE *fp;
175 size_t r;
177 if (argc != 1) {
178 warnx("missing path to xml file");
179 usage();
181 xmlpath = *argv;
183 memset(&d, 0, sizeof(d));
184 d.dict = dict;
186 if ((parser = XML_ParserCreate(NULL)) == NULL)
187 err(1, "XML_ParserCreate");
188 XML_SetUserData(parser, &d);
189 XML_SetElementHandler(parser, el_start, el_end);
190 XML_SetCharacterDataHandler(parser, on_text);
192 if ((fp = fopen(xmlpath, "r")) == NULL)
193 err(1, "can't open %s", xmlpath);
195 do {
196 r = fread(buf, 1, sizeof(buf), fp);
197 done = r != sizeof(buf);
198 if (!XML_Parse(parser, buf, r, done))
199 errx(1, "can't parse: %s at %s:%lu",
200 XML_ErrorString(XML_GetErrorCode(parser)),
201 xmlpath,
202 XML_GetCurrentLineNumber(parser));
203 } while (!done);
205 fclose(fp);
206 XML_ParserFree(parser);
208 *len = d.len;
209 *entries = d.entries;
211 return 0;