op public repos

Blob

Date:: Fri Jan 29 11:32:41 2021 UTC
Message:: libhtml: fix array bounds in lex
Actions:: History | Blame | Raw File
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7 
/* Read cursor over the raw input bytes that the lexer turns into tokens. */
/* getchar() advances i; ungetchar() and backup() move it back. */
typedef struct TokenSource TokenSource;
struct TokenSource
{
	int			i;		/* index of next byte to use */
	uchar*		data;		/* all the data */
	int			edata;	/* data[0:edata] is valid */
	int			chset;	/* one of US_Ascii, etc. */
	int			mtype;	/* TextHtml or TextPlain */
};
17 
/* Negative sentinel codes (no real character is negative). */
/* NOTE(review): the code in this file compares against literal -1 and */
/* "< 0" rather than these names -- confirm they are still needed. */
enum {
	EOF = -2,
	EOB = -1
};

/* True if c may appear inside a tag, attribute or entity name: */
/* a letter, a digit, '-' or '.'.  Codes >= 256 never qualify. */
/* c is evaluated several times, so pass a side-effect-free expression; */
/* callers check for c < 0 before using it. */
#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

/* Element counts of the on-stack scratch buffers: */
/* SMALLBUFSIZE for entity names, BIGBUFSIZE for everything else. */
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
27 
/* HTML 4.0 tag names. */
/* Keep sorted, and in correspondence with enum in iparse.h. */
/* lexinit() converts _tagnames into the Rune-string array tagnames */
/* and builds the tag lookup table from it; Tconv indexes tagnames by */
/* tag code when printing a token. */
/* NOTE(review): the leading " " and "!" entries are presumably */
/* placeholders for the Notfound and Comment codes -- confirm against */
/* the enum. */
Rune **tagnames;
char *_tagnames[] = {
	" ",
	"!",
	"a",
	"abbr",
	"acronym",
	"address",
	"applet",
	"area",
	"b",
	"base",
	"basefont",
	"bdo",
	"big",
	"blink",
	"blockquote",
	"body",
	"bq",
	"br",
	"button",
	"caption",
	"center",
	"cite",
	"code",
	"col",
	"colgroup",
	"dd",
	"del",
	"dfn",
	"dir",
	"div",
	"dl",
	"dt",
	"em",
	"fieldset",
	"font",
	"form",
	"frame",
	"frameset",
	"h1",
	"h2",
	"h3",
	"h4",
	"h5",
	"h6",
	"head",
	"hr",
	"html",
	"i",
	"iframe",
	"img",
	"input",
	"ins",
	"isindex",
	"kbd",
	"label",
	"legend",
	"li",
	"link",
	"map",
	"menu",
	"meta",
	"nobr",
	"noframes",
	"noscript",
	"object",
	"ol",
	"optgroup",
	"option",
	"p",
	"param",
	"pre",
	"q",
	"s",
	"samp",
	"script",
	"select",
	"small",
	"span",
	"strike",
	"strong",
	"style",
	"sub",
	"sup",
	"table",
	"tbody",
	"td",
	"textarea",
	"tfoot",
	"th",
	"thead",
	"title",
	"tr",
	"tt",
	"u",
	"ul",
	"var"
};
129 
/* HTML 4.0 attribute names. */
/* Keep sorted, and in correspondence with enum in i.h. */
/* lexinit() converts _attrnames into the Rune-string array attrnames */
/* and builds the attribute lookup table from it; Tconv indexes */
/* attrnames by attribute id when printing a token. */
Rune **attrnames;
char* _attrnames[] = {
	"abbr",
	"accept-charset",
	"access-key",
	"action",
	"align",
	"alink",
	"alt",
	"archive",
	"axis",
	"background",
	"bgcolor",
	"border",
	"cellpadding",
	"cellspacing",
	"char",
	"charoff",
	"charset",
	"checked",
	"cite",
	"class",
	"classid",
	"clear",
	"code",
	"codebase",
	"codetype",
	"color",
	"cols",
	"colspan",
	"compact",
	"content",
	"coords",
	"data",
	"datetime",
	"declare",
	"defer",
	"dir",
	"disabled",
	"enctype",
	"face",
	"for",
	"frame",
	"frameborder",
	"headers",
	"height",
	"href",
	"hreflang",
	"hspace",
	"http-equiv",
	"id",
	"ismap",
	"label",
	"lang",
	"link",
	"longdesc",
	"marginheight",
	"marginwidth",
	"maxlength",
	"media",
	"method",
	"multiple",
	"name",
	"nohref",
	"noresize",
	"noshade",
	"nowrap",
	"object",
	"onblur",
	"onchange",
	"onclick",
	"ondblclick",
	"onfocus",
	"onkeypress",
	"onkeyup",
	"onload",
	"onmousedown",
	"onmousemove",
	"onmouseout",
	"onmouseover",
	"onmouseup",
	"onreset",
	"onselect",
	"onsubmit",
	"onunload",
	"profile",
	"prompt",
	"readonly",
	"rel",
	"rev",
	"rows",
	"rowspan",
	"rules",
	"scheme",
	"scope",
	"scrolling",
	"selected",
	"shape",
	"size",
	"span",
	"src",
	"standby",
	"start",
	"style",
	"summary",
	"tabindex",
	"target",
	"text",
	"title",
	"type",
	"usemap",
	"valign",
	"value",
	"valuetype",
	"version",
	"vlink",
	"vspace",
	"width"
};
251 
252 
253 /* Character entity to unicode character number map. */
254 /* Keep sorted by name. */
255 StringInt *chartab;
256 AsciiInt _chartab[] = {
257 	{"AElig", 198},
258 	{"Aacute", 193},
259 	{"Acirc", 194},
260 	{"Agrave", 192},
261 	{"Aring", 197},
262 	{"Atilde", 195},
263 	{"Auml", 196},
264 	{"Ccedil", 199},
265 	{"ETH", 208},
266 	{"Eacute", 201},
267 	{"Ecirc", 202},
268 	{"Egrave", 200},
269 	{"Euml", 203},
270 	{"Iacute", 205},
271 	{"Icirc", 206},
272 	{"Igrave", 204},
273 	{"Iuml", 207},
274 	{"Ntilde", 209},
275 	{"Oacute", 211},
276 	{"Ocirc", 212},
277 	{"Ograve", 210},
278 	{"Oslash", 216},
279 	{"Otilde", 213},
280 	{"Ouml", 214},
281 	{"THORN", 222},
282 	{"Uacute", 218},
283 	{"Ucirc", 219},
284 	{"Ugrave", 217},
285 	{"Uuml", 220},
286 	{"Yacute", 221},
287 	{"aacute", 225},
288 	{"acirc", 226},
289 	{"acute", 180},
290 	{"aelig", 230},
291 	{"agrave", 224},
292 	{"alpha", 945},
293 	{"amp", 38},
294 	{"aring", 229},
295 	{"atilde", 227},
296 	{"auml", 228},
297 	{"beta", 946},
298 	{"brvbar", 166},
299 	{"ccedil", 231},
300 	{"cdots", 8943},
301 	{"cedil", 184},
302 	{"cent", 162},
303 	{"chi", 967},
304 	{"copy", 169},
305 	{"curren", 164},
306 	{"ddots", 8945},
307 	{"deg", 176},
308 	{"delta", 948},
309 	{"divide", 247},
310 	{"eacute", 233},
311 	{"ecirc", 234},
312 	{"egrave", 232},
313 	{"emdash", 8212},	/* non-standard but commonly used */
314 	{"emsp", 8195},
315 	{"endash", 8211},	/* non-standard but commonly used */
316 	{"ensp", 8194},
317 	{"epsilon", 949},
318 	{"eta", 951},
319 	{"eth", 240},
320 	{"euml", 235},
321 	{"frac12", 189},
322 	{"frac14", 188},
323 	{"frac34", 190},
324 	{"gamma", 947},
325 	{"gt", 62},
326 	{"iacute", 237},
327 	{"icirc", 238},
328 	{"iexcl", 161},
329 	{"igrave", 236},
330 	{"iota", 953},
331 	{"iquest", 191},
332 	{"iuml", 239},
333 	{"kappa", 954},
334 	{"lambda", 955},
335 	{"laquo", 171},
336 	{"ldquo", 8220},
337 	{"ldots", 8230},
338 	{"lsquo", 8216},
339 	{"lt", 60},
340 	{"macr", 175},
341 	{"mdash", 8212},
342 	{"micro", 181},
343 	{"middot", 183},
344 	{"mu", 956},
345 	{"nbsp", 160},
346 	{"ndash", 8211},
347 	{"not", 172},
348 	{"ntilde", 241},
349 	{"nu", 957},
350 	{"oacute", 243},
351 	{"ocirc", 244},
352 	{"ograve", 242},
353 	{"omega", 969},
354 	{"omicron", 959},
355 	{"ordf", 170},
356 	{"ordm", 186},
357 	{"oslash", 248},
358 	{"otilde", 245},
359 	{"ouml", 246},
360 	{"para", 182},
361 	{"phi", 966},
362 	{"pi", 960},
363 	{"plusmn", 177},
364 	{"pound", 163},
365 	{"psi", 968},
366 	{"quad", 8193},
367 	{"quot", 34},
368 	{"raquo", 187},
369 	{"rdquo", 8221},
370 	{"reg", 174},
371 	{"rho", 961},
372 	{"rsquo", 8217},
373 	{"sect", 167},
374 	{"shy", 173},
375 	{"sigma", 963},
376 	{"sp", 8194},
377 	{"sup1", 185},
378 	{"sup2", 178},
379 	{"sup3", 179},
380 	{"szlig", 223},
381 	{"tau", 964},
382 	{"theta", 952},
383 	{"thinsp", 8201},
384 	{"thorn", 254},
385 	{"times", 215},
386 	{"trade", 8482},
387 	{"uacute", 250},
388 	{"ucirc", 251},
389 	{"ugrave", 249},
390 	{"uml", 168},
391 	{"upsilon", 965},
392 	{"uuml", 252},
393 	{"varepsilon", 8712},
394 	{"varphi", 981},
395 	{"varpi", 982},
396 	{"varrho", 1009},
397 	{"vdots", 8942},
398 	{"vsigma", 962},
399 	{"vtheta", 977},
400 	{"xi", 958},
401 	{"yacute", 253},
402 	{"yen", 165},
403 	{"yuml", 255},
404 	{"zeta", 950}
405 };
406 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
407 
/* Characters Winstart..Winend are those that Windows */
/* uses interpolated into the Latin1 set. */
/* They aren't supposed to appear in HTML, but they do.... */
enum {
	Winstart = 127,
	Winend = 159
};

/* Replacement code points: winchars[c - Winstart] is substituted for a */
/* character c in Winstart..Winend (one entry per code, 33 in all). */
/* Used by getchar() for ISO_8859_1 input and by ampersand() for */
/* numeric character references. */
static int	winchars[]= { 8226,	/* 8226 is a bullet */
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	732, 8482, 353, 8250, 339, 8226, 8226, 376};

/* Name-to-code lookup tables; built once by lexinit(). */
static StringInt*	tagtable;		/* initialized from tagnames */
static StringInt*	attrtable;		/* initialized from attrnames */
424 
/* Forward declarations for the file-local helpers defined below. */
static void		lexinit(void);
static int		getplaindata(TokenSource* ts, Token* a, int* pai);
static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
static Rune*		buftostr(Rune* s, Rune* buf, int j);
static int		comment(TokenSource* ts);
static int		findstr(TokenSource* ts, Rune* s);
static int		ampersand(TokenSource* ts);
/*static int		lowerc(int c); */
static int		getchar(TokenSource* ts);
static void		ungetchar(TokenSource* ts, int c);
static void		backup(TokenSource* ts, int savei);
/*static void		freeinsidetoken(Token* t); */
static void		freeattrs(Attr* ahead);
static Attr*		newattr(int attid, Rune* value, Attr* link);
static int		Tconv(Fmt* f);

/* Debug level: non-zero traces lexer progress on fd 2; values above 1 */
/* also print each token as it is produced. */
int	dbglex = 0;
/* Set to 1 by lexinit() so that _gettoks() builds the tables only once. */
static int lexinited = 0;
445 
446 static void
447 lexinit(void)
448 {
449 	chartab = _cvtstringinttab(_chartab, nelem(_chartab));
450 	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
451 	tagtable = _makestrinttab(tagnames, Numtags);
452 	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
453 	attrtable = _makestrinttab(attrnames, Numattrs);
454 	fmtinstall('T', Tconv);
455 	lexinited = 1;
456 }
457 
458 static TokenSource*
459 newtokensource(uchar* data, int edata, int chset, int mtype)
460 {
461 	TokenSource*	ans;
462 
463 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
464 			chset == UTF_8 || chset == Unicode);
465 	ans = (TokenSource*)emalloc(sizeof(TokenSource));
466 	ans->i = 0;
467 	ans->data = data;
468 	ans->edata = edata;
469 	ans->chset = chset;
470 	ans->mtype = mtype;
471 	return ans;
472 }
473 
474 enum {
475 	ToksChunk = 500
476 };
477 
478 /* Call this to get the tokens. */
479 /*  The number of returned tokens is returned in *plen. */
480 Token*
481 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
482 {
483 	TokenSource*	ts;
484 	Token*		a;
485 	int	alen;
486 	int	ai;
487 	int	starti;
488 	int	c;
489 	int	tag;
490 
491 	if(!lexinited)
492 		lexinit();
493 	ts = newtokensource(data, datalen, chset, mtype);
494 	alen = ToksChunk;
495 	a = (Token*)emalloc(alen * sizeof(Token));
496 	ai = 0;
497 	if(dbglex)
498 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
499 	if(ts->mtype == TextHtml){
500 		for(;;){
501 			if(ai == alen){
502 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
503 				alen += ToksChunk;
504 			}
505 			starti = ts->i;
506 			c = getchar(ts);
507 			if(c < 0)
508 				break;
509 			if(c == '<'){
510 				tag = gettag(ts, starti, a, &ai);
511 				if(tag == Tscript){
512 					/* special rules for getting Data after.... */
513 					starti = ts->i;
514 					c = getchar(ts);
515 					tag = getscriptdata(ts, c, starti, a, &ai);
516 				}
517 			}
518 			else
519 				tag = getdata(ts, c, starti, a, &ai);
520 			if(tag == -1)
521 				break;
522 			else if(dbglex > 1 && tag != Comment)
523 				fprint(2, "lex: got token %T\n", &a[ai-1]);
524 		}
525 	}
526 	else {
527 		/* plain text (non-html) tokens */
528 		for(;;){
529 			if(ai == alen){
530 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
531 				alen += ToksChunk;
532 			}
533 			tag = getplaindata(ts, a, &ai);
534 			if(tag == -1)
535 				break;
536 			if(dbglex > 1)
537 				fprint(2, "lex: got token %T\n", &a[ai]);
538 		}
539 	}
540 	if(dbglex)
541 		fprint(2, "lex: returning %d tokens\n", ai);
542 	*plen = ai;
543 	free(ts);
544 	if(ai == 0) {
545 		free(a);
546 		return nil;
547 	}
548 	return a;
549 }
550 
551 /* For case where source isn't HTML. */
552 /* Just make data tokens, one per line (or partial line, */
553 /* at end of buffer), ignoring non-whitespace control */
554 /* characters and dumping \r's. */
555 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
556 /* Otherwise return -1; */
557 static int
558 getplaindata(TokenSource* ts, Token* a, int* pai)
559 {
560 	Rune*	s;
561 	int	j;
562 	int	starti;
563 	int	c;
564 	Token*	tok;
565 	Rune	buf[BIGBUFSIZE];
566 
567 	s = nil;
568 	j = 0;
569 	starti = ts->i;
570 	for(c = getchar(ts); c >= 0; c = getchar(ts)){
571 		if(c < ' '){
572 			if(isspace(c)){
573 				if(c == '\r'){
574 					/* ignore it unless no following '\n', */
575 					/* in which case treat it like '\n' */
576 					c = getchar(ts);
577 					if(c != '\n'){
578 						if(c >= 0)
579 							ungetchar(ts, c);
580 						c = '\n';
581 					}
582 				}
583 			}
584 			else
585 				c = 0;
586 		}
587 		if(c != 0){
588 			buf[j++] = c;
589 			if(j == BIGBUFSIZE-1){
590 				s = buftostr(s, buf, j);
591 				j = 0;
592 			}
593 		}
594 		if(c == '\n')
595 			break;
596 	}
597 	s = buftostr(s, buf, j);
598 	if(s == nil)
599 		return -1;
600 	tok = &a[(*pai)++];
601 	tok->tag = Data;
602 	tok->text = s;
603 	tok->attr = nil;
604 	tok->starti = starti;
605 	return Data;
606 }
607 
608 /* Return concatenation of s and buf[0:j] */
609 /* Frees s. */
610 static Rune*
611 buftostr(Rune* s, Rune* buf, int j)
612 {
613 	Rune *tmp;
614 	buf[j] = 0;
615 	if(s == nil)
616 		tmp = _Strndup(buf, j);
617 	else
618 		tmp = _Strdup2(s, buf);
619 	free(s);
620 	return tmp;
621 }
622 
623 /* Gather data up to next start-of-tag or end-of-buffer. */
624 /* Translate entity references (&amp;). */
625 /* Ignore non-whitespace control characters and get rid of \r's. */
626 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
627 /* Otherwise return -1; */
628 static int
629 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
630 {
631 	Rune*	s;
632 	int	j;
633 	int	c;
634 	Token*	tok;
635 	Rune	buf[BIGBUFSIZE];
636 
637 	s = nil;
638 	j = 0;
639 	c = firstc;
640 	while(c >= 0){
641 		if(c == '&'){
642 			c = ampersand(ts);
643 			if(c < 0)
644 				break;
645 		}
646 		else if(c < ' '){
647 			if(isspace(c)){
648 				if(c == '\r'){
649 					/* ignore it unless no following '\n', */
650 					/* in which case treat it like '\n' */
651 					c = getchar(ts);
652 					if(c != '\n'){
653 						if(c >= 0)
654 							ungetchar(ts, c);
655 						c = '\n';
656 					}
657 				}
658 			}
659 			else {
660 				if(warn)
661 					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
662 				c = 0;
663 			}
664 		}
665 		else if(c == '<'){
666 			ungetchar(ts, c);
667 			break;
668 		}
669 		if(c != 0){
670 			buf[j++] = c;
671 			if(j == BIGBUFSIZE-1){
672 				s = buftostr(s, buf, j);
673 				j = 0;
674 			}
675 		}
676 		c = getchar(ts);
677 	}
678 	s = buftostr(s, buf, j);
679 	if(s == nil)
680 		return -1;
681 	tok = &a[(*pai)++];
682 	tok->tag = Data;
683 	tok->text = s;
684 	tok->attr = nil;
685 	tok->starti = starti;
686 	return Data;
687 }
688 
689 /* The rules for lexing scripts are different (ugh). */
690 /* Gather up everything until see a </SCRIPT>. */
691 static int
692 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
693 {
694 	Rune*	s;
695 	int	j;
696 	int	tstarti;
697 	int	savei;
698 	int	c;
699 	int	tag;
700 	int	done;
701 	Token*	tok;
702 	Rune	buf[BIGBUFSIZE];
703 
704 	s = nil;
705 	j = 0;
706 	tstarti = starti;
707 	c = firstc;
708 	done = 0;
709 	while(c >= 0){
710 		if(c == '<'){
711 			/* other browsers ignore stuff to end of line after <! */
712 			savei = ts->i;
713 			c = getchar(ts);
714 			if(c == '!'){
715 				while(c >= 0 && c != '\n' && c != '\r')
716 					c = getchar(ts);
717 				if(c == '\r')
718 					c = getchar(ts);
719 				if(c == '\n')
720 					c = getchar(ts);
721 			}
722 			else if(c >= 0){
723 				backup(ts, savei);
724 				tag = gettag(ts, tstarti, a, pai);
725 				if(tag == -1)
726 					break;
727 				if(tag != Comment)
728 					(*pai)--;
729 				backup(ts, tstarti);
730 				if(tag == Tscript + RBRA){
731 					done = 1;
732 					break;
733 				}
734 				/* here tag was not </SCRIPT>, so take as regular data */
735 				c = getchar(ts);
736 			}
737 		}
738 		if(c < 0)
739 			break;
740 		if(c != 0){
741 			buf[j++] = c;
742 			if(j == BIGBUFSIZE-1){
743 				s = buftostr(s, buf, j);
744 				j = 0;
745 			}
746 		}
747 		tstarti = ts->i;
748 		c = getchar(ts);
749 	}
750 	if(done || ts->i == ts->edata){
751 		s = buftostr(s, buf, j);
752 		tok = &a[(*pai)++];
753 		tok->tag = Data;
754 		tok->text = s;
755 		tok->attr = nil;
756 		tok->starti = starti;
757 		return Data;
758 	}
759 	backup(ts, starti);
760 	return -1;
761 }
762 
763 /* We've just seen a '<'.  Gather up stuff to closing '>' (if buffer */
764 /* ends before then, return -1). */
765 /* If it's a tag, look up the name, gather the attributes, and return */
766 /* the appropriate token. */
767 /* Else it's either just plain data or some kind of ignorable stuff: */
768 /* return Data or Comment as appropriate. */
769 /* If it's not a Comment, put it in a[*pai] and bump *pai. */
770 static int
771 gettag(TokenSource* ts, int starti, Token* a, int* pai)
772 {
773 	int	rbra;
774 	int	ans;
775 	Attr*	al;
776 	int	nexti;
777 	int	c;
778 	int	ti;
779 	int	afnd;
780 	int	attid;
781 	int	quote;
782 	Rune*	val;
783 	int	nv;
784 	int	i;
785 	int	tag;
786 	Token*	tok;
787 	Rune	buf[BIGBUFSIZE];
788 
789 	rbra = 0;
790 	nexti = ts->i;
791 	tok = &a[*pai];
792 	tok->tag = Notfound;
793 	tok->text = nil;
794 	tok->attr = nil;
795 	tok->starti = starti;
796 	c = getchar(ts);
797 	if(c == '/'){
798 		rbra = RBRA;
799 		c = getchar(ts);
800 	}
801 	if(c < 0)
802 		goto eob_done;
803 	if(c >= 256 || !isalpha(c)){
804 		/* not a tag */
805 		if(c == '!'){
806 			ans = comment(ts);
807 			if(ans != -1)
808 				return ans;
809 			goto eob_done;
810 		}
811 		else {
812 			backup(ts, nexti);
813 			tok->tag = Data;
814 			tok->text = _Strdup(L(Llt));
815 			(*pai)++;
816 			return Data;
817 		}
818 	}
819 	/* c starts a tagname */
820 	buf[0] = c;
821 	i = 1;
822 	for(;;){
823 		c = getchar(ts);
824 		if(c < 0)
825 			goto eob_done;
826 		if(!ISNAMCHAR(c))
827 			break;
828 		/* if name is bigger than buf it won't be found anyway... */
829 		if(i < BIGBUFSIZE)
830 			buf[i++] = c;
831 	}
832 	if(_lookup(tagtable, Numtags, buf, i, &tag))
833 		tok->tag = tag + rbra;
834 	else
835 		tok->text = _Strndup(buf, i);	/* for warning print, in build */
836 
837 	/* attribute gathering loop */
838 	al = nil;
839 	for(;;){
840 		/* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
841 		/* skip whitespace */
842 attrloop_continue:
843 		while(c < 256 && isspace(c)){
844 			c = getchar(ts);
845 			if(c < 0)
846 				goto eob_done;
847 		}
848 		if(c == '>')
849 			goto attrloop_done;
850 		if(c == '<'){
851 			if(warn)
852 				fprint(2, "warning: unclosed tag\n");
853 			ungetchar(ts, c);
854 			goto attrloop_done;
855 		}
856 		if(c >= 256 || !isalpha(c)){
857 			if(warn)
858 				fprint(2, "warning: expected attribute name\n");
859 			/* skipt to next attribute name */
860 			for(;;){
861 				c = getchar(ts);
862 				if(c < 0)
863 					goto eob_done;
864 				if(c < 256 && isalpha(c))
865 					goto attrloop_continue;
866 				if(c == '<'){
867 					if(warn)
868 						fprint(2, "warning: unclosed tag\n");
869 					ungetchar(ts, 60);
870 					goto attrloop_done;
871 				}
872 				if(c == '>')
873 					goto attrloop_done;
874 			}
875 		}
876 		/* gather attribute name */
877 		buf[0] = c;
878 		i = 1;
879 		for(;;){
880 			c = getchar(ts);
881 			if(c < 0)
882 				goto eob_done;
883 			if(!ISNAMCHAR(c))
884 				break;
885 			if(i < BIGBUFSIZE-1)
886 				buf[i++] = c;
887 		}
888 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
889 		if(warn && !afnd){
890 			buf[i] = 0;
891 			fprint(2, "warning: unknown attribute name %S\n", buf);
892 		}
893 		/* skip whitespace */
894 		while(c < 256 && isspace(c)){
895 			c = getchar(ts);
896 			if(c < 0)
897 				goto eob_done;
898 		}
899 		if(c != '='){
900 			if(afnd)
901 				al = newattr(attid, nil, al);
902 			goto attrloop_continue;
903 		}
904 		/*# c is '=' here;  skip whitespace */
905 		for(;;){
906 			c = getchar(ts);
907 			if(c < 0)
908 				goto eob_done;
909 			if(c >= 256 || !isspace(c))
910 				break;
911 		}
912 		quote = 0;
913 		if(c == '\'' || c == '"'){
914 			quote = c;
915 			c = getchar(ts);
916 			if(c < 0)
917 				goto eob_done;
918 		}
919 		val = nil;
920 		nv = 0;
921 		for(;;){
922 valloop_continue:
923 			if(c < 0)
924 				goto eob_done;
925 			if(c == '>'){
926 				if(quote){
927 					/* c might be part of string (though not good style) */
928 					/* but if line ends before close quote, assume */
929 					/* there was an unmatched quote */
930 					ti = ts->i;
931 					for(;;){
932 						c = getchar(ts);
933 						if(c < 0)
934 							goto eob_done;
935 						if(c == quote){
936 							backup(ts, ti);
937 							buf[nv++] = '>';
938 							if(nv == BIGBUFSIZE-1){
939 								val = buftostr(val, buf, nv);
940 								nv = 0;
941 							}
942 							c = getchar(ts);
943 							goto valloop_continue;
944 						}
945 						if(c == '\n'){
946 							if(warn)
947 								fprint(2, "warning: apparent unmatched quote\n");
948 							backup(ts, ti);
949 							c = '>';
950 							goto valloop_done;
951 						}
952 					}
953 				}
954 				else
955 					goto valloop_done;
956 			}
957 			if(quote){
958 				if(c == quote){
959 					c = getchar(ts);
960 					if(c < 0)
961 						goto eob_done;
962 					goto valloop_done;
963 				}
964 				if(c == '\r'){
965 					c = getchar(ts);
966 					goto valloop_continue;
967 				}
968 				if(c == '\t' || c == '\n')
969 					c = ' ';
970 			}
971 			else {
972 				if(c < 256 && isspace(c))
973 					goto valloop_done;
974 			}
975 			if(c == '&'){
976 				c = ampersand(ts);
977 				if(c == -1)
978 					goto eob_done;
979 			}
980 			buf[nv++] = c;
981 			if(nv == BIGBUFSIZE-1){
982 				val = buftostr(val, buf, nv);
983 				nv = 0;
984 			}
985 			c = getchar(ts);
986 		}
987 valloop_done:
988 		if(afnd){
989 			val = buftostr(val, buf, nv);
990 			al = newattr(attid, val, al);
991 		}
992 	}
993 
994 attrloop_done:
995 	tok->attr = al;
996 	(*pai)++;
997 	return tok->tag;
998 
999 eob_done:
1000 	if(warn)
1001 		fprint(2, "warning: incomplete tag at end of page\n");
1002 	backup(ts, nexti);
1003 	tok->tag = Data;
1004 	tok->text = _Strdup(L(Llt));
1005 	return Data;
1006 }
1007 
1008 /* We've just read a '<!' at position starti, */
1009 /* so this may be a comment or other ignored section, or it may */
1010 /* be just a literal string if there is no close before end of file */
1011 /* (other browsers do that). */
1012 /* The accepted practice seems to be (note: contrary to SGML spec!): */
1013 /* If see <!--, look for --> to close, or if none, > to close. */
1014 /* If see <!(not --), look for > to close. */
1015 /* If no close before end of file, leave original characters in as literal data. */
1016 /* */
1017 /* If we see ignorable stuff, return Comment. */
1018 /* Else return nil (caller should back up and try again when more data arrives, */
1019 /* unless at end of file, in which case caller should just make '<' a data token). */
1020 static int
1021 comment(TokenSource* ts)
1022 {
1023 	int	nexti;
1024 	int	havecomment;
1025 	int	c;
1026 
1027 	nexti = ts->i;
1028 	havecomment = 0;
1029 	c = getchar(ts);
1030 	if(c == '-'){
1031 		c = getchar(ts);
1032 		if(c == '-'){
1033 			if(findstr(ts, L(Larrow)))
1034 				havecomment = 1;
1035 			else
1036 				backup(ts, nexti);
1037 		}
1038 	}
1039 	if(!havecomment){
1040 		if(c == '>')
1041 			havecomment = 1;
1042 		else if(c >= 0){
1043 			if(findstr(ts, L(Lgt)))
1044 				havecomment = 1;
1045 		}
1046 	}
1047 	if(havecomment)
1048 		return Comment;
1049 	return -1;
1050 }
1051 
1052 /* Look for string s in token source. */
1053 /* If found, return 1, with buffer at next char after s, */
1054 /* else return 0 (caller should back up). */
1055 static int
1056 findstr(TokenSource* ts, Rune* s)
1057 {
1058 	int	c0;
1059 	int	n;
1060 	int	nexti;
1061 	int	i;
1062 	int	c;
1063 
1064 	c0 = s[0];
1065 	n = runestrlen(s);
1066 	for(;;){
1067 		c = getchar(ts);
1068 		if(c < 0)
1069 			break;
1070 		if(c == c0){
1071 			if(n == 1)
1072 				return 1;
1073 			nexti = ts->i;
1074 			for(i = 1; i < n; i++){
1075 				c = getchar(ts);
1076 				if(c < 0)
1077 					goto mainloop_done;
1078 				if(c != s[i])
1079 					break;
1080 			}
1081 			if(i == n)
1082 				return 1;
1083 			backup(ts, nexti);
1084 		}
1085 	}
1086 mainloop_done:
1087 	return 0;
1088 }
1089 
/* Value (0-15) of the hexadecimal digit c, upper or lower case, */
/* or -1 if c is not a hexadecimal digit. */
static int
xdigit(int c)
{
	if(c >= '0' && c <= '9')
		return c - '0';
	if(c >= 'A' && c <= 'F')
		return 10 + (c - 'A');
	if(c >= 'a' && c <= 'f')
		return 10 + (c - 'a');
	return -1;
}
1101 
1102 /* We've just read an '&'; look for an entity reference */
1103 /* name, and if found, return translated char. */
1104 /* if there is a complete entity name but it isn't known, */
1105 /* try prefixes (gets around some buggy HTML out there), */
1106 /* and if that fails, back up to just past the '&' and return '&'. */
1107 /* If the entity can't be completed in the current buffer, back up */
1108 /* to the '&' and return -1. */
1109 static int
1110 ampersand(TokenSource* ts)
1111 {
1112 	int	savei;
1113 	int	c;
1114 	int	fnd;
1115 	int	ans;
1116 	int	v;
1117 	int	i;
1118 	int	k;
1119 	Rune	buf[SMALLBUFSIZE];
1120 
1121 	savei = ts->i;
1122 	c = getchar(ts);
1123 	fnd = 0;
1124 	ans = -1;
1125 	if(c == '#'){
1126 		c = getchar(ts);
1127 		v = 0;
1128 		if(c == 'x'){
1129 			c = getchar(ts);
1130 			while((i=xdigit(c)) != -1){
1131 				v = v*16 + i;
1132 				c = getchar(ts);
1133 			}
1134 		}else{
1135 			while('0' <= c && c <= '9'){
1136 				v = v*10 + c - '0';
1137 				c = getchar(ts);
1138 			}
1139 		}
1140 		if(c >= 0){
1141 			if(!(c == ';' || c == '\n' || c == '\r'))
1142 				ungetchar(ts, c);
1143 			c = v;
1144 			if(c == 160)
1145 				c = 160;
1146 			if(c >= Winstart && c <= Winend){
1147 				c = winchars[c - Winstart];
1148 			}
1149 			ans = c;
1150 			fnd = 1;
1151 		}
1152 	}
1153 	else if(c < 256 && isalpha(c)){
1154 		buf[0] = c;
1155 		k = 1;
1156 		for(;;){
1157 			c = getchar(ts);
1158 			if(c < 0)
1159 				break;
1160 			if(ISNAMCHAR(c)){
1161 				if(k < SMALLBUFSIZE-1)
1162 					buf[k++] = c;
1163 			}
1164 			else {
1165 				if(!(c == ';' || c == '\n' || c == '\r'))
1166 					ungetchar(ts, c);
1167 				break;
1168 			}
1169 		}
1170 		if(c >= 0){
1171 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1172 			if(!fnd){
1173 				/* Try prefixes of s */
1174 				if(c == ';' || c == '\n' || c == '\r')
1175 					ungetchar(ts, c);
1176 				i = k;
1177 				while(--k > 0){
1178 					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1179 					if(fnd){
1180 						while(i > k){
1181 							i--;
1182 							ungetchar(ts, buf[i]);
1183 						}
1184 						break;
1185 					}
1186 				}
1187 			}
1188 		}
1189 	}
1190 	if(!fnd){
1191 		backup(ts, savei);
1192 		ans = '&';
1193 	}
1194 	return ans;
1195 }
1196 
1197 /* Get next char, obeying ts.chset. */
1198 /* Returns -1 if no complete character left before current end of data. */
1199 static int
1200 getchar(TokenSource* ts)
1201 {
1202 	uchar*	buf;
1203 	int	c;
1204 	int	n;
1205 	int	ok;
1206 	Rune	r;
1207 
1208 	if(ts->i >= ts->edata)
1209 		return -1;
1210 	buf = ts->data;
1211 	c = buf[ts->i];
1212 	switch(ts->chset){
1213 	case ISO_8859_1:
1214 		if(c >= Winstart && c <= Winend)
1215 			c = winchars[c - Winstart];
1216 		ts->i++;
1217 		break;
1218 	case US_Ascii:
1219 		if(c > 127){
1220 			if(warn)
1221 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1222 		}
1223 		ts->i++;
1224 		break;
1225 	case UTF_8:
1226 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1227 		n = chartorune(&r, (char*)(buf+ts->i));
1228 		if(ok){
1229 			if(warn && c == 0x80)
1230 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1231 			ts->i += n;
1232 			c = r;
1233 		}
1234 		else {
1235 			/* not enough bytes in buf to complete utf-8 char */
1236 			ts->i = ts->edata;	/* mark "all used" */
1237 			c = -1;
1238 		}
1239 		break;
1240 	case Unicode:
1241 		if(ts->i < ts->edata - 1){
1242 			/*standards say most-significant byte first */
1243 			c = (c << 8)|(buf[ts->i + 1]);
1244 			ts->i += 2;
1245 		}
1246 		else {
1247 			ts->i = ts->edata;	/* mark "all used" */
1248 			c = -1;
1249 		}
1250 		break;
1251 	}
1252 	return c;
1253 }
1254 
1255 /* Assuming c was the last character returned by getchar, set */
1256 /* things up so that next getchar will get that same character */
1257 /* followed by the current 'next character', etc. */
1258 static void
1259 ungetchar(TokenSource* ts, int c)
1260 {
1261 	int	n;
1262 	Rune	r;
1263 	char	a[UTFmax];
1264 
1265 	n = 1;
1266 	switch(ts->chset){
1267 	case UTF_8:
1268 		if(c >= 128){
1269 			r = c;
1270 			n = runetochar(a, &r);
1271 		}
1272 		break;
1273 	case Unicode:
1274 		n = 2;
1275 		break;
1276 	}
1277 	ts->i -= n;
1278 }
1279 
1280 /* Restore ts so that it is at the state where the index was savei. */
1281 static void
1282 backup(TokenSource* ts, int savei)
1283 {
1284 	if(dbglex)
1285 		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1286 	ts->i = savei;
1287 }
1288 
1289 
1290 /* Look for value associated with attribute attid in token t. */
1291 /* If there is one, return 1 and put the value in *pans, */
1292 /* else return 0. */
1293 /* If xfer is true, transfer ownership of the string to the caller */
1294 /* (nil it out here); otherwise, caller must duplicate the answer */
1295 /* if it needs to save it. */
1296 /* OK to have pans==0, in which case this is just looking */
1297 /* to see if token is present. */
1298 int
1299 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1300 {
1301 	Attr*	attr;
1302 
1303 	attr = t->attr;
1304 	while(attr != nil){
1305 		if(attr->attid == attid){
1306 			if(pans != nil)
1307 				*pans = attr->value;
1308 			if(xfer)
1309 				attr->value = nil;
1310 			return 1;
1311 		}
1312 		attr = attr->next;
1313 	}
1314 	if(pans != nil)
1315 		*pans = nil;
1316 	return 0;
1317 }
1318 
1319 static int
1320 Tconv(Fmt *f)
1321 {
1322 	Token*	t;
1323 	int	i;
1324 	int	tag;
1325 	char*	srbra;
1326 	Rune*	aname;
1327 	Rune*	tname;
1328 	Attr*	a;
1329 	char	buf[BIGBUFSIZE];
1330 
1331 	t = va_arg(f->args, Token*);
1332 	if(t == nil)
1333 		sprint(buf, "<null>");
1334 	else {
1335 		i = 0;
1336 		if(dbglex > 1)
1337 			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1338 		tag = t->tag;
1339 		if(tag == Data){
1340 			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1341 		}
1342 		else {
1343 			srbra = "";
1344 			if(tag >= RBRA){
1345 				tag -= RBRA;
1346 				srbra = "/";
1347 			}
1348 			tname = tagnames[tag];
1349 			if(tag == Notfound)
1350 				tname = L(Lquestion);
1351 			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1352 			for(a = t->attr; a != nil; a = a->next){
1353 				aname = attrnames[a->attid];
1354 				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1355 				if(a->value != nil)
1356 					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1357 			}
1358 			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1359 		}
1360 		buf[i] = 0;
1361 	}
1362 	return fmtstrcpy(f, buf);
1363 }
1364 
1365 /* Attrs own their constituent strings, but build may eventually */
1366 /* transfer some values to its items and nil them out in the Attr. */
1367 static Attr*
1368 newattr(int attid, Rune* value, Attr* link)
1369 {
1370 	Attr* ans;
1371 
1372 	ans = (Attr*)emalloc(sizeof(Attr));
1373 	ans->attid = attid;
1374 	ans->value = value;
1375 	ans->next = link;
1376 	return ans;
1377 }
1378 
1379 /* Free list of Attrs linked through next field */
1380 static void
1381 freeattrs(Attr* ahead)
1382 {
1383 	Attr* a;
1384 	Attr* nexta;
1385 
1386 	a = ahead;
1387 	while(a != nil){
1388 		nexta = a->next;
1389 		free(a->value);
1390 		free(a);
1391 		a = nexta;
1392 	}
1393 }
1394 
1395 /* Free array of Tokens. */
1396 /* Allocated space might have room for more than n tokens, */
1397 /* but only n of them are initialized. */
1398 /* If caller has transferred ownership of constitutent strings */
1399 /* or attributes, it must have nil'd out the pointers in the Tokens. */
1400 void
1401 _freetokens(Token* tarray, int n)
1402 {
1403 	int i;
1404 	Token* t;
1405 
1406 	if(tarray == nil)
1407 		return;
1408 	for(i = 0; i < n; i++){
1409 		t = &tarray[i];
1410 		free(t->text);
1411 		freeattrs(t->attr);
1412 	}
1413 	free(tarray);
1414 }