Blob
1 #include <u.h>2 #include <libc.h>3 #include <bio.h>4 #include "hdr.h"5 #include "conv.h"7 typedef struct Hchar Hchar;8 struct Hchar9 {10 char *s;11 Rune r;12 };14 /* <, >, ", & intentionally omitted */16 static Hchar byname[] =17 {18 {"AElig", 198},19 {"Aacute", 193},20 {"Acirc", 194},21 {"Agrave", 192},22 {"Aring", 197},23 {"Atilde", 195},24 {"Auml", 196},25 {"Ccedil", 199},26 {"ETH", 208},27 {"Eacute", 201},28 {"Ecirc", 202},29 {"Egrave", 200},30 {"Euml", 203},31 {"Iacute", 205},32 {"Icirc", 206},33 {"Igrave", 204},34 {"Iuml", 207},35 {"Ntilde", 209},36 {"Oacute", 211},37 {"Ocirc", 212},38 {"Ograve", 210},39 {"Oslash", 216},40 {"Otilde", 213},41 {"Ouml", 214},42 {"THORN", 222},43 {"Uacute", 218},44 {"Ucirc", 219},45 {"Ugrave", 217},46 {"Uuml", 220},47 {"Yacute", 221},48 {"aacute", 225},49 {"acirc", 226},50 {"acute", 180},51 {"aelig", 230},52 {"agrave", 224},53 {"alpha", 945},54 {"aring", 229},55 {"atilde", 227},56 {"auml", 228},57 {"beta", 946},58 {"brvbar", 166},59 {"ccedil", 231},60 {"cdots", 8943},61 {"cedil", 184},62 {"cent", 162},63 {"chi", 967},64 {"copy", 169},65 {"curren", 164},66 {"ddots", 8945},67 {"deg", 176},68 {"delta", 948},69 {"divide", 247},70 {"eacute", 233},71 {"ecirc", 234},72 {"egrave", 232},73 {"emdash", 8212}, /* non-standard but commonly used */74 {"emsp", 8195},75 {"endash", 8211}, /* non-standard but commonly used */76 {"ensp", 8194},77 {"epsilon", 949},78 {"eta", 951},79 {"eth", 240},80 {"euml", 235},81 {"frac12", 189},82 {"frac14", 188},83 {"frac34", 190},84 {"gamma", 947},85 {"iacute", 237},86 {"icirc", 238},87 {"iexcl", 161},88 {"igrave", 236},89 {"iota", 953},90 {"iquest", 191},91 {"iuml", 239},92 {"kappa", 954},93 {"lambda", 955},94 {"laquo", 171},95 {"ldquo", 8220},96 {"ldots", 8230},97 {"lsquo", 8216},98 {"macr", 175},99 {"mdash", 8212},100 {"micro", 181},101 {"middot", 183},102 {"mu", 956},103 {"nbsp", 160},104 {"ndash", 8211},105 {"not", 172},106 {"ntilde", 241},107 {"nu", 957},108 {"oacute", 243},109 {"ocirc", 244},110 {"ograve", 242},111 {"omega", 969},112 {"omicron", 959},113 {"ordf", 170},114 {"ordm", 186},115 {"oslash", 248},116 {"otilde", 245},117 {"ouml", 246},118 {"para", 182},119 {"phi", 966},120 {"pi", 960},121 {"plusmn", 177},122 {"pound", 163},123 {"psi", 968},124 {"quad", 8193},125 {"raquo", 187},126 {"rdquo", 8221},127 {"reg", 174},128 {"rho", 961},129 {"rsquo", 8217},130 {"sect", 167},131 {"shy", 173},132 {"sigma", 963},133 {"sp", 8194},134 {"sup1", 185},135 {"sup2", 178},136 {"sup3", 179},137 {"szlig", 223},138 {"tau", 964},139 {"theta", 952},140 {"thinsp", 8201},141 {"thorn", 254},142 {"times", 215},143 {"trade", 8482},144 {"uacute", 250},145 {"ucirc", 251},146 {"ugrave", 249},147 {"uml", 168},148 {"upsilon", 965},149 {"uuml", 252},150 {"varepsilon", 8712},151 {"varphi", 981},152 {"varpi", 982},153 {"varrho", 1009},154 {"vdots", 8942},155 {"vsigma", 962},156 {"vtheta", 977},157 {"xi", 958},158 {"yacute", 253},159 {"yen", 165},160 {"yuml", 255},161 {"zeta", 950}162 };164 static Hchar byrune[nelem(byname)];166 static int167 hnamecmp(const void *va, const void *vb)168 {169 Hchar *a, *b;171 a = (Hchar*)va;172 b = (Hchar*)vb;173 return strcmp(a->s, b->s);174 }176 static int177 hrunecmp(const void *va, const void *vb)178 {179 Hchar *a, *b;181 a = (Hchar*)va;182 b = (Hchar*)vb;183 return a->r - b->r;184 }186 static void187 html_init(void)188 {189 static int init;191 if(init)192 return;193 init = 1;194 memmove(byrune, byname, sizeof byrune);195 qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);196 qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);197 }199 static Rune200 findbyname(char *s)201 {202 Hchar *h;203 int n, m, x;205 h = byname;206 n = nelem(byname);207 while(n > 0){208 m = n/2;209 x = strcmp(h[m].s, s);210 if(x == 0)211 return h[m].r;212 if(x < 0){213 h += m+1;214 n -= m+1;215 }else216 n = m;217 }218 return Runeerror;219 }221 static char*222 findbyrune(Rune r)223 {224 Hchar *h;225 int n, m;227 h = byrune;228 n = nelem(byrune);229 while(n > 0){230 m = n/2;231 if(h[m].r == r)232 return h[m].s;233 if(h[m].r < r){234 h += m+1;235 n -= m+1;236 }else237 n = m;238 }239 return nil;240 }242 void243 html_in(int fd, long *x, struct convert *out)244 {245 char buf[100], *p;246 Biobuf b;247 Rune rbuf[N];248 Rune *r, *er;249 int c, i;251 USED(x);253 html_init();254 r = rbuf;255 er = rbuf+N;256 Binit(&b, fd, OREAD);257 while((c = Bgetrune(&b)) != Beof){258 if(r >= er){259 OUT(out, rbuf, r-rbuf);260 r = rbuf;261 }262 if(c == '&'){263 buf[0] = c;264 for(i=1; i<nelem(buf)-1;){265 c = Bgetc(&b);266 if(c == Beof)267 break;268 buf[i++] = c;269 if(strchr("; \t\r\n", c))270 break;271 }272 buf[i] = 0;273 if(buf[i-1] == ';'){274 buf[i-1] = 0;275 if((c = findbyname(buf+1)) != Runeerror){276 *r++ = c;277 continue;278 }279 buf[i-1] = ';';280 if(buf[1] == '#'){281 if(buf[2] == 'x')282 c = strtol(buf+3, &p, 16);283 else284 c = strtol(buf+2, &p, 10);285 if(*p != ';' || c >= NRUNE || c < 0)286 goto bad;287 *r++ = c;288 continue;289 }290 }291 bad:292 for(p=buf; p<buf+i; ){293 p += chartorune(r++, p);294 if(r >= er){295 OUT(out, rbuf, r-rbuf);296 r = rbuf;297 }298 }299 continue;300 }301 *r++ = c;302 }303 if(r > rbuf)304 OUT(out, rbuf, r-rbuf);305 }307 /*308 * use biobuf because can use more than UTFmax bytes per rune309 */310 void311 html_out(Rune *r, int n, long *x)312 {313 char *s;314 Biobuf b;315 Rune *er;317 html_init();318 Binit(&b, 1, OWRITE);319 er = r+n;320 for(; r<er; r++){321 if(*r < Runeself)322 Bputrune(&b, *r);323 else if((s = findbyrune(*r)) != nil)324 Bprint(&b, "&%s;", s);325 else326 Bprint(&b, "&#%d;", *r);327 }328 Bflush(&b);329 }