Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "hdr.h"
5 #include "conv.h"
7 typedef struct Hchar Hchar;
8 struct Hchar
9 {
10 char *s;
11 Rune r;
12 };
14 /* &lt;, &gt;, &quot;, &amp; intentionally omitted */
16 static Hchar byname[] =
17 {
18 {"AElig", 198},
19 {"Aacute", 193},
20 {"Acirc", 194},
21 {"Agrave", 192},
22 {"Aring", 197},
23 {"Atilde", 195},
24 {"Auml", 196},
25 {"Ccedil", 199},
26 {"ETH", 208},
27 {"Eacute", 201},
28 {"Ecirc", 202},
29 {"Egrave", 200},
30 {"Euml", 203},
31 {"Iacute", 205},
32 {"Icirc", 206},
33 {"Igrave", 204},
34 {"Iuml", 207},
35 {"Ntilde", 209},
36 {"Oacute", 211},
37 {"Ocirc", 212},
38 {"Ograve", 210},
39 {"Oslash", 216},
40 {"Otilde", 213},
41 {"Ouml", 214},
42 {"THORN", 222},
43 {"Uacute", 218},
44 {"Ucirc", 219},
45 {"Ugrave", 217},
46 {"Uuml", 220},
47 {"Yacute", 221},
48 {"aacute", 225},
49 {"acirc", 226},
50 {"acute", 180},
51 {"aelig", 230},
52 {"agrave", 224},
53 {"alpha", 945},
54 {"aring", 229},
55 {"atilde", 227},
56 {"auml", 228},
57 {"beta", 946},
58 {"brvbar", 166},
59 {"ccedil", 231},
60 {"cdots", 8943},
61 {"cedil", 184},
62 {"cent", 162},
63 {"chi", 967},
64 {"copy", 169},
65 {"curren", 164},
66 {"ddots", 8945},
67 {"deg", 176},
68 {"delta", 948},
69 {"divide", 247},
70 {"eacute", 233},
71 {"ecirc", 234},
72 {"egrave", 232},
73 {"emdash", 8212}, /* non-standard but commonly used */
74 {"emsp", 8195},
75 {"endash", 8211}, /* non-standard but commonly used */
76 {"ensp", 8194},
77 {"epsilon", 949},
78 {"eta", 951},
79 {"eth", 240},
80 {"euml", 235},
81 {"frac12", 189},
82 {"frac14", 188},
83 {"frac34", 190},
84 {"gamma", 947},
85 {"iacute", 237},
86 {"icirc", 238},
87 {"iexcl", 161},
88 {"igrave", 236},
89 {"iota", 953},
90 {"iquest", 191},
91 {"iuml", 239},
92 {"kappa", 954},
93 {"lambda", 955},
94 {"laquo", 171},
95 {"ldquo", 8220},
96 {"ldots", 8230},
97 {"lsquo", 8216},
98 {"macr", 175},
99 {"mdash", 8212},
100 {"micro", 181},
101 {"middot", 183},
102 {"mu", 956},
103 {"nbsp", 160},
104 {"ndash", 8211},
105 {"not", 172},
106 {"ntilde", 241},
107 {"nu", 957},
108 {"oacute", 243},
109 {"ocirc", 244},
110 {"ograve", 242},
111 {"omega", 969},
112 {"omicron", 959},
113 {"ordf", 170},
114 {"ordm", 186},
115 {"oslash", 248},
116 {"otilde", 245},
117 {"ouml", 246},
118 {"para", 182},
119 {"phi", 966},
120 {"pi", 960},
121 {"plusmn", 177},
122 {"pound", 163},
123 {"psi", 968},
124 {"quad", 8193},
125 {"raquo", 187},
126 {"rdquo", 8221},
127 {"reg", 174},
128 {"rho", 961},
129 {"rsquo", 8217},
130 {"sect", 167},
131 {"shy", 173},
132 {"sigma", 963},
133 {"sp", 8194},
134 {"sup1", 185},
135 {"sup2", 178},
136 {"sup3", 179},
137 {"szlig", 223},
138 {"tau", 964},
139 {"theta", 952},
140 {"thinsp", 8201},
141 {"thorn", 254},
142 {"times", 215},
143 {"trade", 8482},
144 {"uacute", 250},
145 {"ucirc", 251},
146 {"ugrave", 249},
147 {"uml", 168},
148 {"upsilon", 965},
149 {"uuml", 252},
150 {"varepsilon", 8712},
151 {"varphi", 981},
152 {"varpi", 982},
153 {"varrho", 1009},
154 {"vdots", 8942},
155 {"vsigma", 962},
156 {"vtheta", 977},
157 {"xi", 958},
158 {"yacute", 253},
159 {"yen", 165},
160 {"yuml", 255},
161 {"zeta", 950}
162 };
164 static Hchar byrune[nelem(byname)];
166 static int
167 hnamecmp(const void *va, const void *vb)
169 Hchar *a, *b;
171 a = (Hchar*)va;
172 b = (Hchar*)vb;
173 return strcmp(a->s, b->s);
176 static int
177 hrunecmp(const void *va, const void *vb)
179 Hchar *a, *b;
181 a = (Hchar*)va;
182 b = (Hchar*)vb;
183 return a->r - b->r;
186 static void
187 html_init(void)
189 static int init;
191 if(init)
192 return;
193 init = 1;
194 memmove(byrune, byname, sizeof byrune);
195 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
196 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
199 static Rune
200 findbyname(char *s)
202 Hchar *h;
203 int n, m, x;
205 h = byname;
206 n = nelem(byname);
207 while(n > 0){
208 m = n/2;
209 x = strcmp(h[m].s, s);
210 if(x == 0)
211 return h[m].r;
212 if(x < 0){
213 h += m+1;
214 n -= m+1;
215 }else
216 n = m;
218 return Runeerror;
221 static char*
222 findbyrune(Rune r)
224 Hchar *h;
225 int n, m;
227 h = byrune;
228 n = nelem(byrune);
229 while(n > 0){
230 m = n/2;
231 if(h[m].r == r)
232 return h[m].s;
233 if(h[m].r < r){
234 h += m+1;
235 n -= m+1;
236 }else
237 n = m;
239 return nil;
242 void
243 html_in(int fd, long *x, struct convert *out)
245 char buf[100], *p;
246 Biobuf b;
247 Rune rbuf[N];
248 Rune *r, *er;
249 int c, i;
251 USED(x);
253 html_init();
254 r = rbuf;
255 er = rbuf+N;
256 Binit(&b, fd, OREAD);
257 while((c = Bgetrune(&b)) != Beof){
258 if(r >= er){
259 OUT(out, rbuf, r-rbuf);
260 r = rbuf;
262 if(c == '&'){
263 buf[0] = c;
264 for(i=1; i<nelem(buf)-1;){
265 c = Bgetc(&b);
266 if(c == Beof)
267 break;
268 buf[i++] = c;
269 if(strchr("; \t\r\n", c))
270 break;
272 buf[i] = 0;
273 if(buf[i-1] == ';'){
274 buf[i-1] = 0;
275 if((c = findbyname(buf+1)) != Runeerror){
276 *r++ = c;
277 continue;
279 buf[i-1] = ';';
280 if(buf[1] == '#'){
281 if(buf[2] == 'x')
282 c = strtol(buf+3, &p, 16);
283 else
284 c = strtol(buf+2, &p, 10);
285 if(*p != ';' || c >= NRUNE || c < 0)
286 goto bad;
287 *r++ = c;
288 continue;
291 bad:
292 for(p=buf; p<buf+i; ){
293 p += chartorune(r++, p);
294 if(r >= er){
295 OUT(out, rbuf, r-rbuf);
296 r = rbuf;
299 continue;
301 *r++ = c;
303 if(r > rbuf)
304 OUT(out, rbuf, r-rbuf);
307 /*
308 * use biobuf because can use more than UTFmax bytes per rune
309 */
310 void
311 html_out(Rune *r, int n, long *x)
313 char *s;
314 Biobuf b;
315 Rune *er;
317 html_init();
318 Binit(&b, 1, OWRITE);
319 er = r+n;
320 for(; r<er; r++){
321 if(*r < Runeself)
322 Bputrune(&b, *r);
323 else if((s = findbyrune(*r)) != nil)
324 Bprint(&b, "&%s;", s);
325 else
326 Bprint(&b, "&#x%04x;", *r);
328 Bflush(&b);