Blob


1 /*
2 * Copyright (c) 2022 Omar Polo <op@openbsd.org>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
17 #include <err.h>
18 #include <expat.h>
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
23 #include "db.h"
24 #include "dictionary.h"
25 #include "tokenize.h"
27 #include "mkftsidx.h"
/*
 * Identifies the element whose character data is currently being
 * collected.  N_UNK means the text is not of interest and is dropped.
 */
enum {
	N_UNK = 0,	/* no tracked element is open */
	N_TIT = 1,	/* inside <title> */
	N_URL = 2,	/* inside <url> */
	N_ABS = 3,	/* inside <abstract> */
};
/*
 * Parser state shared by the Expat callbacks (installed as the user
 * data of the parser).
 */
struct mydata {
	/* Dictionary that receives the tokens of every finished document. */
	struct dictionary	*dict;

	/* Growable array of entries: len slots in use out of cap allocated. */
	struct db_entry		*entries;
	size_t			 len, cap;

	/* One of the N_* constants: where on_text() appends its data. */
	int			 next;

	/*
	 * Text accumulated for the document being parsed; each is either
	 * NULL or a heap-allocated string, released and reset by el_end()
	 * once the enclosing <doc> is closed.
	 */
	char			*title, *url, *abstract;
};
48 static void
49 el_start(void *data, const char *element, const char **attr)
50 {
51 struct mydata *d = data;
53 if (!strcmp(element, "title")) {
54 d->next = N_TIT;
55 } else if (!strcmp(element, "url")) {
56 d->next = N_URL;
57 } else if (!strcmp(element, "abstract")) {
58 d->next = N_ABS;
59 }
60 }
/*
 * Append the first len bytes of s (which need not be NUL-terminated)
 * to the heap-allocated string *text, growing it in place.
 *
 * *text may be NULL, in which case a new string is created; on return
 * it always points to a valid NUL-terminated string, even when len is
 * zero.  Exits the program on allocation failure or on a negative
 * length.
 */
static void
append_text(char **text, const char *s, int len)
{
	size_t	 have, add;
	char	*buf;

	/* A negative length would turn into a huge size_t below. */
	if (len < 0)
		errx(1, "append_text: negative length");
	add = (size_t)len;

	have = (*text != NULL) ? strlen(*text) : 0;

	/* realloc(NULL, n) behaves like malloc(n), so no special case. */
	if ((buf = realloc(*text, have + add + 1)) == NULL)
		err(1, "realloc");

	if (add > 0)
		memcpy(buf + have, s, add);
	buf[have + add] = '\0';

	*text = buf;
}
80 static void
81 on_text(void *data, const char *s, int len)
82 {
83 struct mydata *d = data;
85 switch (d->next) {
86 case N_TIT:
87 append_text(&d->title, s, len);
88 break;
89 case N_URL:
90 append_text(&d->url, s, len);
91 break;
92 case N_ABS:
93 append_text(&d->abstract, s, len);
94 break;
95 default:
96 break;
97 }
98 }
100 static void
101 el_end(void *data, const char *element)
103 struct mydata *d = data;
104 struct db_entry *e;
105 size_t newcap;
106 const char *title;
107 char *doc, **toks;
108 void *t;
109 int r, next;
111 next = d->next;
112 d->next = N_UNK;
113 if ((next == N_TIT && !strcmp(element, "title")) ||
114 (next == N_URL && !strcmp(element, "url")) ||
115 (next == N_ABS && !strcmp(element, "abstract")) ||
116 strcmp(element, "doc"))
117 return;
119 if (d->len == d->cap) {
120 newcap = d->cap * 1.5;
121 if (newcap == 0)
122 newcap = 8;
123 t = recallocarray(d->entries, d->cap, newcap,
124 sizeof(*d->entries));
125 if (t == NULL)
126 err(1, "recallocarray");
127 d->entries = t;
128 d->cap = newcap;
131 title = d->title;
132 if (!strncmp(title, "Wikipedia: ", 11))
133 title += 11;
135 e = &d->entries[d->len++];
136 e->name = xstrdup(d->url);
137 e->descr = xstrdup(title);
139 if (d->len % 1000 == 0)
140 printf("=> %zu\n", d->len);
142 r = asprintf(&doc, "%s %s", title, d->abstract);
143 if (r == -1)
144 err(1, "asprintf");
146 if ((toks = tokenize(doc)) == NULL)
147 err(1, "tokenize");
148 if (!dictionary_add_words(d->dict, toks, d->len-1))
149 err(1, "dictionary_add_words");
150 freetoks(toks);
151 free(doc);
153 free(d->title);
154 free(d->url);
155 free(d->abstract);
157 d->title = NULL;
158 d->url = NULL;
159 d->abstract = NULL;
162 int
163 idx_wiki(struct dictionary *dict, struct db_entry **entries, size_t *len,
164 int argc, char **argv)
166 struct mydata d;
167 XML_Parser parser;
168 const char *xmlpath;
169 char buf[BUFSIZ];
170 int done = 0;
171 FILE *fp;
172 size_t r;
174 if (argc != 1) {
175 warnx("missing path to xml file");
176 usage();
178 xmlpath = *argv;
180 memset(&d, 0, sizeof(d));
181 d.dict = dict;
183 if ((parser = XML_ParserCreate(NULL)) == NULL)
184 err(1, "XML_ParserCreate");
185 XML_SetUserData(parser, &d);
186 XML_SetElementHandler(parser, el_start, el_end);
187 XML_SetCharacterDataHandler(parser, on_text);
189 if ((fp = fopen(xmlpath, "r")) == NULL)
190 err(1, "can't open %s", xmlpath);
192 do {
193 r = fread(buf, 1, sizeof(buf), fp);
194 done = r != sizeof(buf);
195 if (!XML_Parse(parser, buf, r, done))
196 errx(1, "can't parse: %s at %s:%lu",
197 XML_ErrorString(XML_GetErrorCode(parser)),
198 xmlpath,
199 XML_GetCurrentLineNumber(parser));
200 } while (!done);
202 fclose(fp);
203 XML_ParserFree(parser);
205 *len = d.len;
206 *entries = d.entries;
208 return 0;