Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "hdr.h"
5 #include "conv.h"
7 typedef struct Hchar Hchar;
8 struct Hchar
9 {
10 char *s;
11 Rune r;
12 };
/*
 * &lt;, &gt;, &quot; and &amp; are recognized on input but never
 * generated on output: html_out writes runes below Runeself directly.
 */
16 /*
17 * Names beginning with _ are names we recognize
18 * (without the underscore) but will not generate,
19 * because they are nonstandard.
20 */
21 static Hchar byname[] =
22 {
23 {"AElig", 198},
24 {"Aacute", 193},
25 {"Acirc", 194},
26 {"Agrave", 192},
27 {"Alpha", 913},
28 {"Aring", 197},
29 {"Atilde", 195},
30 {"Auml", 196},
31 {"Beta", 914},
32 {"Ccedil", 199},
33 {"Chi", 935},
34 {"Dagger", 8225},
35 {"Delta", 916},
36 {"ETH", 208},
37 {"Eacute", 201},
38 {"Ecirc", 202},
39 {"Egrave", 200},
40 {"Epsilon", 917},
41 {"Eta", 919},
42 {"Euml", 203},
43 {"Gamma", 915},
44 {"Iacute", 205},
45 {"Icirc", 206},
46 {"Igrave", 204},
47 {"Iota", 921},
48 {"Iuml", 207},
49 {"Kappa", 922},
50 {"Lambda", 923},
51 {"Mu", 924},
52 {"Ntilde", 209},
53 {"Nu", 925},
54 {"OElig", 338},
55 {"Oacute", 211},
56 {"Ocirc", 212},
57 {"Ograve", 210},
58 {"Omega", 937},
59 {"Omicron", 927},
60 {"Oslash", 216},
61 {"Otilde", 213},
62 {"Ouml", 214},
63 {"Phi", 934},
64 {"Pi", 928},
65 {"Prime", 8243},
66 {"Psi", 936},
67 {"Rho", 929},
68 {"Scaron", 352},
69 {"Sigma", 931},
70 {"THORN", 222},
71 {"Tau", 932},
72 {"Theta", 920},
73 {"Uacute", 218},
74 {"Ucirc", 219},
75 {"Ugrave", 217},
76 {"Upsilon", 933},
77 {"Uuml", 220},
78 {"Xi", 926},
79 {"Yacute", 221},
80 {"Yuml", 376},
81 {"Zeta", 918},
82 {"aacute", 225},
83 {"acirc", 226},
84 {"acute", 180},
85 {"aelig", 230},
86 {"agrave", 224},
87 {"alefsym", 8501},
88 {"alpha", 945},
89 {"amp", 38},
90 {"and", 8743},
91 {"ang", 8736},
92 {"aring", 229},
93 {"asymp", 8776},
94 {"atilde", 227},
95 {"auml", 228},
96 {"bdquo", 8222},
97 {"beta", 946},
98 {"brvbar", 166},
99 {"bull", 8226},
100 {"cap", 8745},
101 {"ccedil", 231},
102 {"cdots", 8943},
103 {"cedil", 184},
104 {"cent", 162},
105 {"chi", 967},
106 {"circ", 710},
107 {"clubs", 9827},
108 {"cong", 8773},
109 {"copy", 169},
110 {"crarr", 8629},
111 {"cup", 8746},
112 {"curren", 164},
113 {"dArr", 8659},
114 {"dagger", 8224},
115 {"darr", 8595},
116 {"ddots", 8945},
117 {"deg", 176},
118 {"delta", 948},
119 {"diams", 9830},
120 {"divide", 247},
121 {"eacute", 233},
122 {"ecirc", 234},
123 {"egrave", 232},
124 {"_emdash", 8212}, /* non-standard but commonly used */
125 {"empty", 8709},
126 {"emsp", 8195},
127 {"_endash", 8211}, /* non-standard but commonly used */
128 {"ensp", 8194},
129 {"epsilon", 949},
130 {"equiv", 8801},
131 {"eta", 951},
132 {"eth", 240},
133 {"euml", 235},
134 {"euro", 8364},
135 {"exist", 8707},
136 {"fnof", 402},
137 {"forall", 8704},
138 {"frac12", 189},
139 {"frac14", 188},
140 {"frac34", 190},
141 {"frasl", 8260},
142 {"gamma", 947},
143 {"ge", 8805},
144 {"gt", 62},
145 {"hArr", 8660},
146 {"harr", 8596},
147 {"hearts", 9829},
148 {"hellip", 8230},
149 {"iacute", 237},
150 {"icirc", 238},
151 {"iexcl", 161},
152 {"igrave", 236},
153 {"image", 8465},
154 {"infin", 8734},
155 {"int", 8747},
156 {"iota", 953},
157 {"iquest", 191},
158 {"isin", 8712},
159 {"iuml", 239},
160 {"kappa", 954},
161 {"lArr", 8656},
162 {"lambda", 955},
163 {"lang", 9001},
164 {"laquo", 171},
165 {"larr", 8592},
166 {"lceil", 8968},
167 {"_ldots", 8230},
168 {"ldquo", 8220},
169 {"le", 8804},
170 {"lfloor", 8970},
171 {"lowast", 8727},
172 {"loz", 9674},
173 {"lrm", 8206},
174 {"lsaquo", 8249},
175 {"lsquo", 8216},
176 {"lt", 60},
177 {"macr", 175},
178 {"mdash", 8212},
179 {"micro", 181},
180 {"middot", 183},
181 {"minus", 8722},
182 {"mu", 956},
183 {"nabla", 8711},
184 {"nbsp", 160},
185 {"ndash", 8211},
186 {"ne", 8800},
187 {"ni", 8715},
188 {"not", 172},
189 {"notin", 8713},
190 {"nsub", 8836},
191 {"ntilde", 241},
192 {"nu", 957},
193 {"oacute", 243},
194 {"ocirc", 244},
195 {"oelig", 339},
196 {"ograve", 242},
197 {"oline", 8254},
198 {"omega", 969},
199 {"omicron", 959},
200 {"oplus", 8853},
201 {"or", 8744},
202 {"ordf", 170},
203 {"ordm", 186},
204 {"oslash", 248},
205 {"otilde", 245},
206 {"otimes", 8855},
207 {"ouml", 246},
208 {"para", 182},
209 {"part", 8706},
210 {"permil", 8240},
211 {"perp", 8869},
212 {"phi", 966},
213 {"pi", 960},
214 {"piv", 982},
215 {"plusmn", 177},
216 {"pound", 163},
217 {"prime", 8242},
218 {"prod", 8719},
219 {"prop", 8733},
220 {"psi", 968},
221 {"quad", 8193},
222 {"quot", 34},
223 {"rArr", 8658},
224 {"radic", 8730},
225 {"rang", 9002},
226 {"raquo", 187},
227 {"rarr", 8594},
228 {"rceil", 8969},
229 {"rdquo", 8221},
230 {"real", 8476},
231 {"reg", 174},
232 {"rfloor", 8971},
233 {"rho", 961},
234 {"rlm", 8207},
235 {"rsaquo", 8250},
236 {"rsquo", 8217},
237 {"sbquo", 8218},
238 {"scaron", 353},
239 {"sdot", 8901},
240 {"sect", 167},
241 {"shy", 173},
242 {"sigma", 963},
243 {"sigmaf", 962},
244 {"sim", 8764},
245 {"_sp", 8194},
246 {"spades", 9824},
247 {"sub", 8834},
248 {"sube", 8838},
249 {"sum", 8721},
250 {"sup", 8835},
251 {"sup1", 185},
252 {"sup2", 178},
253 {"sup3", 179},
254 {"supe", 8839},
255 {"szlig", 223},
256 {"tau", 964},
257 {"there4", 8756},
258 {"theta", 952},
259 {"thetasym", 977},
260 {"thinsp", 8201},
261 {"thorn", 254},
262 {"tilde", 732},
263 {"times", 215},
264 {"trade", 8482},
265 {"uArr", 8657},
266 {"uacute", 250},
267 {"uarr", 8593},
268 {"ucirc", 251},
269 {"ugrave", 249},
270 {"uml", 168},
271 {"upsih", 978},
272 {"upsilon", 965},
273 {"uuml", 252},
274 {"_varepsilon", 8712},
275 {"varphi", 981},
276 {"_varpi", 982},
277 {"varrho", 1009},
278 {"vdots", 8942},
279 {"_vsigma", 962},
280 {"_vtheta", 977},
281 {"weierp", 8472},
282 {"xi", 958},
283 {"yacute", 253},
284 {"yen", 165},
285 {"yuml", 255},
286 {"zeta", 950},
287 {"zwj", 8205},
288 {"zwnj", 8204}
289 };
291 static Hchar byrune[nelem(byname)];
293 static int
294 hnamecmp(const void *va, const void *vb)
296 Hchar *a, *b;
298 a = (Hchar*)va;
299 b = (Hchar*)vb;
300 return strcmp(a->s, b->s);
303 static int
304 hrunecmp(const void *va, const void *vb)
306 Hchar *a, *b;
308 a = (Hchar*)va;
309 b = (Hchar*)vb;
310 return a->r - b->r;
313 static void
314 html_init(void)
316 static int init;
317 int i;
319 if(init)
320 return;
321 init = 1;
322 memmove(byrune, byname, sizeof byrune);
324 /* Eliminate names we aren't allowed to generate. */
325 for(i=0; i<nelem(byrune); i++){
326 if(byrune[i].s[0] == '_'){
327 byrune[i].r = Runeerror;
328 byname[i].s++;
332 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
333 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
336 static Rune
337 findbyname(char *s)
339 Hchar *h;
340 int n, m, x;
342 h = byname;
343 n = nelem(byname);
344 while(n > 0){
345 m = n/2;
346 x = strcmp(h[m].s, s);
347 if(x == 0)
348 return h[m].r;
349 if(x < 0){
350 h += m+1;
351 n -= m+1;
352 }else
353 n = m;
355 return Runeerror;
358 static char*
359 findbyrune(Rune r)
361 Hchar *h;
362 int n, m;
364 if(r == Runeerror)
365 return nil;
366 h = byrune;
367 n = nelem(byrune);
368 while(n > 0){
369 m = n/2;
370 if(h[m].r == r)
371 return h[m].s;
372 if(h[m].r < r){
373 h += m+1;
374 n -= m+1;
375 }else
376 n = m;
378 return nil;
381 void
382 html_in(int fd, long *x, struct convert *out)
384 char buf[100], *p;
385 Biobuf b;
386 Rune rbuf[N];
387 Rune *r, *er;
388 int c, i;
390 USED(x);
392 html_init();
393 r = rbuf;
394 er = rbuf+N;
395 Binit(&b, fd, OREAD);
396 while((c = Bgetrune(&b)) != Beof){
397 if(r >= er){
398 OUT(out, rbuf, r-rbuf);
399 r = rbuf;
401 if(c == '&'){
402 buf[0] = c;
403 for(i=1; i<nelem(buf)-1;){
404 c = Bgetc(&b);
405 if(c == Beof)
406 break;
407 buf[i++] = c;
408 if(strchr("; \t\r\n", c))
409 break;
411 buf[i] = 0;
412 if(buf[i-1] == ';'){
413 buf[i-1] = 0;
414 if((c = findbyname(buf+1)) != Runeerror){
415 *r++ = c;
416 continue;
418 buf[i-1] = ';';
419 if(buf[1] == '#'){
420 if(buf[2] == 'x')
421 c = strtol(buf+3, &p, 16);
422 else
423 c = strtol(buf+2, &p, 10);
424 if(*p != ';' || c >= NRUNE || c < 0)
425 goto bad;
426 *r++ = c;
427 continue;
430 bad:
431 for(p=buf; p<buf+i; ){
432 p += chartorune(r++, p);
433 if(r >= er){
434 OUT(out, rbuf, r-rbuf);
435 r = rbuf;
438 continue;
440 *r++ = c;
442 if(r > rbuf)
443 OUT(out, rbuf, r-rbuf);
444 OUT(out, rbuf, 0);
447 /*
448 * use biobuf because can use more than UTFmax bytes per rune
449 */
450 void
451 html_out(Rune *r, int n, long *x)
453 char *s;
454 Biobuf b;
455 Rune *er;
457 USED(x);
458 html_init();
459 Binit(&b, 1, OWRITE);
460 er = r+n;
461 for(; r<er; r++){
462 if(*r < Runeself)
463 Bputrune(&b, *r);
464 else if((s = findbyrune(*r)) != nil)
465 Bprint(&b, "&%s;", s);
466 else
467 Bprint(&b, "&#%d;", *r);
469 Bflush(&b);