Blob
1 #include <u.h>2 #include <libc.h>3 #include <bio.h>4 #include "hdr.h"5 #include "conv.h"7 typedef struct Hchar Hchar;8 struct Hchar9 {10 char *s;11 Rune r;12 };14 /* <, >, ", & intentionally omitted */16 static Hchar byname[] =17 {18 {"AElig", 198},19 {"Aacute", 193},20 {"Acirc", 194},21 {"Agrave", 192},22 {"Alpha", 913},23 {"Aring", 197},24 {"Atilde", 195},25 {"Auml", 196},26 {"Beta", 914},27 {"Ccedil", 199},28 {"Chi", 935},29 {"Dagger", 8225},30 {"Delta", 916},31 {"ETH", 208},32 {"Eacute", 201},33 {"Ecirc", 202},34 {"Egrave", 200},35 {"Epsilon", 917},36 {"Eta", 919},37 {"Euml", 203},38 {"Gamma", 915},39 {"Iacute", 205},40 {"Icirc", 206},41 {"Igrave", 204},42 {"Iota", 921},43 {"Iuml", 207},44 {"Kappa", 922},45 {"Lambda", 923},46 {"Mu", 924},47 {"Ntilde", 209},48 {"Nu", 925},49 {"OElig", 338},50 {"Oacute", 211},51 {"Ocirc", 212},52 {"Ograve", 210},53 {"Omega", 937},54 {"Omicron", 927},55 {"Oslash", 216},56 {"Otilde", 213},57 {"Ouml", 214},58 {"Phi", 934},59 {"Pi", 928},60 {"Prime", 8243},61 {"Psi", 936},62 {"Rho", 929},63 {"Scaron", 352},64 {"Sigma", 931},65 {"THORN", 222},66 {"Tau", 932},67 {"Theta", 920},68 {"Uacute", 218},69 {"Ucirc", 219},70 {"Ugrave", 217},71 {"Upsilon", 933},72 {"Uuml", 220},73 {"Xi", 926},74 {"Yacute", 221},75 {"Yuml", 376},76 {"Zeta", 918},77 {"aacute", 225},78 {"acirc", 226},79 {"acute", 180},80 {"aelig", 230},81 {"agrave", 224},82 {"alefsym", 8501},83 {"alpha", 945},84 {"amp", 38},85 {"and", 8743},86 {"ang", 8736},87 {"aring", 229},88 {"asymp", 8776},89 {"atilde", 227},90 {"auml", 228},91 {"bdquo", 8222},92 {"beta", 946},93 {"brvbar", 166},94 {"bull", 8226},95 {"cap", 8745},96 {"ccedil", 231},97 {"cdots", 8943},98 {"cedil", 184},99 {"cent", 162},100 {"chi", 967},101 {"circ", 710},102 {"clubs", 9827},103 {"cong", 8773},104 {"copy", 169},105 {"crarr", 8629},106 {"cup", 8746},107 {"curren", 164},108 {"dArr", 8659},109 {"dagger", 8224},110 {"darr", 8595},111 {"ddots", 8945},112 {"deg", 176},113 {"delta", 948},114 {"diams", 9830},115 {"divide", 247},116 {"eacute", 233},117 {"ecirc", 234},118 {"egrave", 232},119 {"emdash", 8212}, /* non-standard but commonly used */120 {"empty", 8709},121 {"emsp", 8195},122 {"endash", 8211}, /* non-standard but commonly used */123 {"ensp", 8194},124 {"epsilon", 949},125 {"equiv", 8801},126 {"eta", 951},127 {"eth", 240},128 {"euml", 235},129 {"euro", 8364},130 {"exist", 8707},131 {"fnof", 402},132 {"forall", 8704},133 {"frac12", 189},134 {"frac14", 188},135 {"frac34", 190},136 {"frasl", 8260},137 {"gamma", 947},138 {"ge", 8805},139 {"gt", 62},140 {"hArr", 8660},141 {"harr", 8596},142 {"hearts", 9829},143 {"hellip", 8230},144 {"iacute", 237},145 {"icirc", 238},146 {"iexcl", 161},147 {"igrave", 236},148 {"image", 8465},149 {"infin", 8734},150 {"int", 8747},151 {"iota", 953},152 {"iquest", 191},153 {"isin", 8712},154 {"iuml", 239},155 {"kappa", 954},156 {"lArr", 8656},157 {"lambda", 955},158 {"lang", 9001},159 {"laquo", 171},160 {"larr", 8592},161 {"lceil", 8968},162 {"ldots", 8230},163 {"ldquo", 8220},164 {"le", 8804},165 {"lfloor", 8970},166 {"lowast", 8727},167 {"loz", 9674},168 {"lrm", 8206},169 {"lsaquo", 8249},170 {"lsquo", 8216},171 {"lt", 60},172 {"macr", 175},173 {"mdash", 8212},174 {"micro", 181},175 {"middot", 183},176 {"minus", 8722},177 {"mu", 956},178 {"nabla", 8711},179 {"nbsp", 160},180 {"ndash", 8211},181 {"ne", 8800},182 {"ni", 8715},183 {"not", 172},184 {"notin", 8713},185 {"nsub", 8836},186 {"ntilde", 241},187 {"nu", 957},188 {"oacute", 243},189 {"ocirc", 244},190 {"oelig", 339},191 {"ograve", 242},192 {"oline", 8254},193 {"omega", 969},194 {"omicron", 959},195 {"oplus", 8853},196 {"or", 8744},197 {"ordf", 170},198 {"ordm", 186},199 {"oslash", 248},200 {"otilde", 245},201 {"otimes", 8855},202 {"ouml", 246},203 {"para", 182},204 {"part", 8706},205 {"permil", 8240},206 {"perp", 8869},207 {"phi", 966},208 {"pi", 960},209 {"piv", 982},210 {"plusmn", 177},211 {"pound", 163},212 {"prime", 8242},213 {"prod", 8719},214 {"prop", 8733},215 {"psi", 968},216 {"quad", 8193},217 {"quot", 34},218 {"rArr", 8658},219 {"radic", 8730},220 {"rang", 9002},221 {"raquo", 187},222 {"rarr", 8594},223 {"rceil", 8969},224 {"rdquo", 8221},225 {"real", 8476},226 {"reg", 174},227 {"rfloor", 8971},228 {"rho", 961},229 {"rlm", 8207},230 {"rsaquo", 8250},231 {"rsquo", 8217},232 {"sbquo", 8218},233 {"scaron", 353},234 {"sdot", 8901},235 {"sect", 167},236 {"shy", 173},237 {"sigma", 963},238 {"sigmaf", 962},239 {"sim", 8764},240 {"sp", 8194},241 {"spades", 9824},242 {"sub", 8834},243 {"sube", 8838},244 {"sum", 8721},245 {"sup", 8835},246 {"sup1", 185},247 {"sup2", 178},248 {"sup3", 179},249 {"supe", 8839},250 {"szlig", 223},251 {"tau", 964},252 {"there4", 8756},253 {"theta", 952},254 {"thetasym", 977},255 {"thinsp", 8201},256 {"thorn", 254},257 {"tilde", 732},258 {"times", 215},259 {"trade", 8482},260 {"uArr", 8657},261 {"uacute", 250},262 {"uarr", 8593},263 {"ucirc", 251},264 {"ugrave", 249},265 {"uml", 168},266 {"upsih", 978},267 {"upsilon", 965},268 {"uuml", 252},269 {"varepsilon", 8712},270 {"varphi", 981},271 {"varpi", 982},272 {"varrho", 1009},273 {"vdots", 8942},274 {"vsigma", 962},275 {"vtheta", 977},276 {"weierp", 8472},277 {"xi", 958},278 {"yacute", 253},279 {"yen", 165},280 {"yuml", 255},281 {"zeta", 950},282 {"zwj", 8205},283 {"zwnj", 8204}284 };286 static Hchar byrune[nelem(byname)];288 static int289 hnamecmp(const void *va, const void *vb)290 {291 Hchar *a, *b;293 a = (Hchar*)va;294 b = (Hchar*)vb;295 return strcmp(a->s, b->s);296 }298 static int299 hrunecmp(const void *va, const void *vb)300 {301 Hchar *a, *b;303 a = (Hchar*)va;304 b = (Hchar*)vb;305 return a->r - b->r;306 }308 static void309 html_init(void)310 {311 static int init;313 if(init)314 return;315 init = 1;316 memmove(byrune, byname, sizeof byrune);317 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);318 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);319 }321 static Rune322 findbyname(char *s)323 {324 Hchar *h;325 int n, m, x;327 h = byname;328 n = nelem(byname);329 while(n > 0){330 m = n/2;331 x = strcmp(h[m].s, s);332 if(x == 0)333 return h[m].r;334 if(x < 0){335 h += m+1;336 n -= m+1;337 }else338 n = m;339 }340 return Runeerror;341 }343 static char*344 findbyrune(Rune r)345 {346 Hchar *h;347 int n, m;349 h = byrune;350 n = nelem(byrune);351 while(n > 0){352 m = n/2;353 if(h[m].r == r)354 return h[m].s;355 if(h[m].r < r){356 h += m+1;357 n -= m+1;358 }else359 n = m;360 }361 return nil;362 }364 void365 html_in(int fd, long *x, struct convert *out)366 {367 char buf[100], *p;368 Biobuf b;369 Rune rbuf[N];370 Rune *r, *er;371 int c, i;373 USED(x);375 html_init();376 r = rbuf;377 er = rbuf+N;378 Binit(&b, fd, OREAD);379 while((c = Bgetrune(&b)) != Beof){380 if(r >= er){381 OUT(out, rbuf, r-rbuf);382 r = rbuf;383 }384 if(c == '&'){385 buf[0] = c;386 for(i=1; i<nelem(buf)-1;){387 c = Bgetc(&b);388 if(c == Beof)389 break;390 buf[i++] = c;391 if(strchr("; \t\r\n", c))392 break;393 }394 buf[i] = 0;395 if(buf[i-1] == ';'){396 buf[i-1] = 0;397 if((c = findbyname(buf+1)) != Runeerror){398 *r++ = c;399 continue;400 }401 buf[i-1] = ';';402 if(buf[1] == '#'){403 if(buf[2] == 'x')404 c = strtol(buf+3, &p, 16);405 else406 c = strtol(buf+2, &p, 10);407 if(*p != ';' || c >= NRUNE || c < 0)408 goto bad;409 *r++ = c;410 continue;411 }412 }413 bad:414 for(p=buf; p<buf+i; ){415 p += chartorune(r++, p);416 if(r >= er){417 OUT(out, rbuf, r-rbuf);418 r = rbuf;419 }420 }421 continue;422 }423 *r++ = c;424 }425 if(r > rbuf)426 OUT(out, rbuf, r-rbuf);427 OUT(out, rbuf, 0);428 }430 /*431 * use biobuf because can use more than UTFmax bytes per rune432 */433 void434 html_out(Rune *r, int n, long *x)435 {436 char *s;437 Biobuf b;438 Rune *er;440 USED(x);441 html_init();442 Binit(&b, 1, OWRITE);443 er = r+n;444 for(; r<er; r++){445 if(*r < Runeself)446 Bputrune(&b, *r);447 else if((s = findbyrune(*r)) != nil)448 Bprint(&b, "&%s;", s);449 else450 Bprint(&b, "&#%d;", *r);451 }452 Bflush(&b);453 }