Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "hdr.h"
5 #include "conv.h"
7 typedef struct Hchar Hchar;
8 struct Hchar
9 {
10 char *s;
11 Rune r;
12 };
/*
 * &lt;, &gt;, &quot; and &amp; are recognized on input but are never
 * generated on output: html_out copies runes below Runeself unchanged.
 */
16 static Hchar byname[] =
17 {
18 {"AElig", 198},
19 {"Aacute", 193},
20 {"Acirc", 194},
21 {"Agrave", 192},
22 {"Alpha", 913},
23 {"Aring", 197},
24 {"Atilde", 195},
25 {"Auml", 196},
26 {"Beta", 914},
27 {"Ccedil", 199},
28 {"Chi", 935},
29 {"Dagger", 8225},
30 {"Delta", 916},
31 {"ETH", 208},
32 {"Eacute", 201},
33 {"Ecirc", 202},
34 {"Egrave", 200},
35 {"Epsilon", 917},
36 {"Eta", 919},
37 {"Euml", 203},
38 {"Gamma", 915},
39 {"Iacute", 205},
40 {"Icirc", 206},
41 {"Igrave", 204},
42 {"Iota", 921},
43 {"Iuml", 207},
44 {"Kappa", 922},
45 {"Lambda", 923},
46 {"Mu", 924},
47 {"Ntilde", 209},
48 {"Nu", 925},
49 {"OElig", 338},
50 {"Oacute", 211},
51 {"Ocirc", 212},
52 {"Ograve", 210},
53 {"Omega", 937},
54 {"Omicron", 927},
55 {"Oslash", 216},
56 {"Otilde", 213},
57 {"Ouml", 214},
58 {"Phi", 934},
59 {"Pi", 928},
60 {"Prime", 8243},
61 {"Psi", 936},
62 {"Rho", 929},
63 {"Scaron", 352},
64 {"Sigma", 931},
65 {"THORN", 222},
66 {"Tau", 932},
67 {"Theta", 920},
68 {"Uacute", 218},
69 {"Ucirc", 219},
70 {"Ugrave", 217},
71 {"Upsilon", 933},
72 {"Uuml", 220},
73 {"Xi", 926},
74 {"Yacute", 221},
75 {"Yuml", 376},
76 {"Zeta", 918},
77 {"aacute", 225},
78 {"acirc", 226},
79 {"acute", 180},
80 {"aelig", 230},
81 {"agrave", 224},
82 {"alefsym", 8501},
83 {"alpha", 945},
84 {"amp", 38},
85 {"and", 8743},
86 {"ang", 8736},
87 {"aring", 229},
88 {"asymp", 8776},
89 {"atilde", 227},
90 {"auml", 228},
91 {"bdquo", 8222},
92 {"beta", 946},
93 {"brvbar", 166},
94 {"bull", 8226},
95 {"cap", 8745},
96 {"ccedil", 231},
97 {"cdots", 8943},
98 {"cedil", 184},
99 {"cent", 162},
100 {"chi", 967},
101 {"circ", 710},
102 {"clubs", 9827},
103 {"cong", 8773},
104 {"copy", 169},
105 {"crarr", 8629},
106 {"cup", 8746},
107 {"curren", 164},
108 {"dArr", 8659},
109 {"dagger", 8224},
110 {"darr", 8595},
111 {"ddots", 8945},
112 {"deg", 176},
113 {"delta", 948},
114 {"diams", 9830},
115 {"divide", 247},
116 {"eacute", 233},
117 {"ecirc", 234},
118 {"egrave", 232},
119 {"emdash", 8212}, /* non-standard but commonly used */
120 {"empty", 8709},
121 {"emsp", 8195},
122 {"endash", 8211}, /* non-standard but commonly used */
123 {"ensp", 8194},
124 {"epsilon", 949},
125 {"equiv", 8801},
126 {"eta", 951},
127 {"eth", 240},
128 {"euml", 235},
129 {"euro", 8364},
130 {"exist", 8707},
131 {"fnof", 402},
132 {"forall", 8704},
133 {"frac12", 189},
134 {"frac14", 188},
135 {"frac34", 190},
136 {"frasl", 8260},
137 {"gamma", 947},
138 {"ge", 8805},
139 {"gt", 62},
140 {"hArr", 8660},
141 {"harr", 8596},
142 {"hearts", 9829},
143 {"hellip", 8230},
144 {"iacute", 237},
145 {"icirc", 238},
146 {"iexcl", 161},
147 {"igrave", 236},
148 {"image", 8465},
149 {"infin", 8734},
150 {"int", 8747},
151 {"iota", 953},
152 {"iquest", 191},
153 {"isin", 8712},
154 {"iuml", 239},
155 {"kappa", 954},
156 {"lArr", 8656},
157 {"lambda", 955},
158 {"lang", 9001},
159 {"laquo", 171},
160 {"larr", 8592},
161 {"lceil", 8968},
162 {"ldots", 8230},
163 {"ldquo", 8220},
164 {"le", 8804},
165 {"lfloor", 8970},
166 {"lowast", 8727},
167 {"loz", 9674},
168 {"lrm", 8206},
169 {"lsaquo", 8249},
170 {"lsquo", 8216},
171 {"lt", 60},
172 {"macr", 175},
173 {"mdash", 8212},
174 {"micro", 181},
175 {"middot", 183},
176 {"minus", 8722},
177 {"mu", 956},
178 {"nabla", 8711},
179 {"nbsp", 160},
180 {"ndash", 8211},
181 {"ne", 8800},
182 {"ni", 8715},
183 {"not", 172},
184 {"notin", 8713},
185 {"nsub", 8836},
186 {"ntilde", 241},
187 {"nu", 957},
188 {"oacute", 243},
189 {"ocirc", 244},
190 {"oelig", 339},
191 {"ograve", 242},
192 {"oline", 8254},
193 {"omega", 969},
194 {"omicron", 959},
195 {"oplus", 8853},
196 {"or", 8744},
197 {"ordf", 170},
198 {"ordm", 186},
199 {"oslash", 248},
200 {"otilde", 245},
201 {"otimes", 8855},
202 {"ouml", 246},
203 {"para", 182},
204 {"part", 8706},
205 {"permil", 8240},
206 {"perp", 8869},
207 {"phi", 966},
208 {"pi", 960},
209 {"piv", 982},
210 {"plusmn", 177},
211 {"pound", 163},
212 {"prime", 8242},
213 {"prod", 8719},
214 {"prop", 8733},
215 {"psi", 968},
216 {"quad", 8193},
217 {"quot", 34},
218 {"rArr", 8658},
219 {"radic", 8730},
220 {"rang", 9002},
221 {"raquo", 187},
222 {"rarr", 8594},
223 {"rceil", 8969},
224 {"rdquo", 8221},
225 {"real", 8476},
226 {"reg", 174},
227 {"rfloor", 8971},
228 {"rho", 961},
229 {"rlm", 8207},
230 {"rsaquo", 8250},
231 {"rsquo", 8217},
232 {"sbquo", 8218},
233 {"scaron", 353},
234 {"sdot", 8901},
235 {"sect", 167},
236 {"shy", 173},
237 {"sigma", 963},
238 {"sigmaf", 962},
239 {"sim", 8764},
240 {"sp", 8194},
241 {"spades", 9824},
242 {"sub", 8834},
243 {"sube", 8838},
244 {"sum", 8721},
245 {"sup", 8835},
246 {"sup1", 185},
247 {"sup2", 178},
248 {"sup3", 179},
249 {"supe", 8839},
250 {"szlig", 223},
251 {"tau", 964},
252 {"there4", 8756},
253 {"theta", 952},
254 {"thetasym", 977},
255 {"thinsp", 8201},
256 {"thorn", 254},
257 {"tilde", 732},
258 {"times", 215},
259 {"trade", 8482},
260 {"uArr", 8657},
261 {"uacute", 250},
262 {"uarr", 8593},
263 {"ucirc", 251},
264 {"ugrave", 249},
265 {"uml", 168},
266 {"upsih", 978},
267 {"upsilon", 965},
268 {"uuml", 252},
269 {"varepsilon", 8712},
270 {"varphi", 981},
271 {"varpi", 982},
272 {"varrho", 1009},
273 {"vdots", 8942},
274 {"vsigma", 962},
275 {"vtheta", 977},
276 {"weierp", 8472},
277 {"xi", 958},
278 {"yacute", 253},
279 {"yen", 165},
280 {"yuml", 255},
281 {"zeta", 950},
282 {"zwj", 8205},
283 {"zwnj", 8204}
284 };
286 static Hchar byrune[nelem(byname)];
288 static int
289 hnamecmp(const void *va, const void *vb)
291 Hchar *a, *b;
293 a = (Hchar*)va;
294 b = (Hchar*)vb;
295 return strcmp(a->s, b->s);
298 static int
299 hrunecmp(const void *va, const void *vb)
301 Hchar *a, *b;
303 a = (Hchar*)va;
304 b = (Hchar*)vb;
305 return a->r - b->r;
308 static void
309 html_init(void)
311 static int init;
313 if(init)
314 return;
315 init = 1;
316 memmove(byrune, byname, sizeof byrune);
317 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
318 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
321 static Rune
322 findbyname(char *s)
324 Hchar *h;
325 int n, m, x;
327 h = byname;
328 n = nelem(byname);
329 while(n > 0){
330 m = n/2;
331 x = strcmp(h[m].s, s);
332 if(x == 0)
333 return h[m].r;
334 if(x < 0){
335 h += m+1;
336 n -= m+1;
337 }else
338 n = m;
340 return Runeerror;
343 static char*
344 findbyrune(Rune r)
346 Hchar *h;
347 int n, m;
349 h = byrune;
350 n = nelem(byrune);
351 while(n > 0){
352 m = n/2;
353 if(h[m].r == r)
354 return h[m].s;
355 if(h[m].r < r){
356 h += m+1;
357 n -= m+1;
358 }else
359 n = m;
361 return nil;
364 void
365 html_in(int fd, long *x, struct convert *out)
367 char buf[100], *p;
368 Biobuf b;
369 Rune rbuf[N];
370 Rune *r, *er;
371 int c, i;
373 USED(x);
375 html_init();
376 r = rbuf;
377 er = rbuf+N;
378 Binit(&b, fd, OREAD);
379 while((c = Bgetrune(&b)) != Beof){
380 if(r >= er){
381 OUT(out, rbuf, r-rbuf);
382 r = rbuf;
384 if(c == '&'){
385 buf[0] = c;
386 for(i=1; i<nelem(buf)-1;){
387 c = Bgetc(&b);
388 if(c == Beof)
389 break;
390 buf[i++] = c;
391 if(strchr("; \t\r\n", c))
392 break;
394 buf[i] = 0;
395 if(buf[i-1] == ';'){
396 buf[i-1] = 0;
397 if((c = findbyname(buf+1)) != Runeerror){
398 *r++ = c;
399 continue;
401 buf[i-1] = ';';
402 if(buf[1] == '#'){
403 if(buf[2] == 'x')
404 c = strtol(buf+3, &p, 16);
405 else
406 c = strtol(buf+2, &p, 10);
407 if(*p != ';' || c >= NRUNE || c < 0)
408 goto bad;
409 *r++ = c;
410 continue;
413 bad:
414 for(p=buf; p<buf+i; ){
415 p += chartorune(r++, p);
416 if(r >= er){
417 OUT(out, rbuf, r-rbuf);
418 r = rbuf;
421 continue;
423 *r++ = c;
425 if(r > rbuf)
426 OUT(out, rbuf, r-rbuf);
427 OUT(out, rbuf, 0);
430 /*
431 * use biobuf because can use more than UTFmax bytes per rune
432 */
433 void
434 html_out(Rune *r, int n, long *x)
436 char *s;
437 Biobuf b;
438 Rune *er;
440 USED(x);
441 html_init();
442 Binit(&b, 1, OWRITE);
443 er = r+n;
444 for(; r<er; r++){
445 if(*r < Runeself)
446 Bputrune(&b, *r);
447 else if((s = findbyrune(*r)) != nil)
448 Bprint(&b, "&%s;", s);
449 else
450 Bprint(&b, "&#%d;", *r);
452 Bflush(&b);