op public repos

Blob

Date:: Sat Apr 1 19:24:03 2006 UTC
Message:: Use gcc -ansi -pedantic in 9c. Fix many non-C89-isms.
Actions:: History | Blame | Raw File
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7 
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 	int			i;		/* index of next byte to use */
12 	uchar*		data;		/* all the data */
13 	int			edata;	/* data[0:edata] is valid */
14 	int			chset;	/* one of US_Ascii, etc. */
15 	int			mtype;	/* TextHtml or TextPlain */
16 };
17 
/* Negative sentinel values; no use of them is visible in this file. */
enum {
	EOB = -1,
	EOF = -2
};
22 
/* True if c may continue a tag, attribute or entity name: */
/* a letter, a digit, '-' or '.', and below 256.  Evaluates c more than once. */
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

/* Element counts of the scratch buffers used while lexing. */
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 
28 /* HTML 4.0 tag names. */
29 /* Keep sorted, and in correspondence with enum in iparse.h. */
30 Rune **tagnames;
/* ASCII source for tagnames: sorted, one group per leading character. */
char *_tagnames[] = {
	/* non-alphabetic */
	" ", "!",
	/* a */
	"a", "abbr", "acronym", "address", "applet", "area",
	/* b */
	"b", "base", "basefont", "bdo", "big", "blink", "blockquote", "body",
	"bq", "br", "button",
	/* c */
	"caption", "center", "cite", "code", "col", "colgroup",
	/* d */
	"dd", "del", "dfn", "dir", "div", "dl", "dt",
	/* e */
	"em",
	/* f */
	"fieldset", "font", "form", "frame", "frameset",
	/* h */
	"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
	/* i */
	"i", "iframe", "img", "input", "ins", "isindex",
	/* k */
	"kbd",
	/* l */
	"label", "legend", "li", "link",
	/* m */
	"map", "menu", "meta",
	/* n */
	"nobr", "noframes", "noscript",
	/* o */
	"object", "ol", "optgroup", "option",
	/* p */
	"p", "param", "pre",
	/* q */
	"q",
	/* s */
	"s", "samp", "script", "select", "small", "span", "strike", "strong",
	"style", "sub", "sup",
	/* t */
	"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title",
	"tr", "tt",
	/* u */
	"u", "ul",
	/* v */
	"var"
};
129 
130 /* HTML 4.0 attribute names. */
131 /* Keep sorted, and in correspondence with enum in i.h. */
132 Rune **attrnames;
/* ASCII source for attrnames: sorted, one group per leading character. */
char* _attrnames[] = {
	/* a */
	"abbr", "accept-charset", "access-key", "action", "align", "alink",
	"alt", "archive", "axis",
	/* b */
	"background", "bgcolor", "border",
	/* c */
	"cellpadding", "cellspacing", "char", "charoff", "charset", "checked",
	"cite", "class", "classid", "clear", "code", "codebase", "codetype",
	"color", "cols", "colspan", "compact", "content", "coords",
	/* d */
	"data", "datetime", "declare", "defer", "dir", "disabled",
	/* e */
	"enctype",
	/* f */
	"face", "for", "frame", "frameborder",
	/* h */
	"headers", "height", "href", "hreflang", "hspace", "http-equiv",
	/* i */
	"id", "ismap",
	/* l */
	"label", "lang", "link", "longdesc",
	/* m */
	"marginheight", "marginwidth", "maxlength", "media", "method", "multiple",
	/* n */
	"name", "nohref", "noresize", "noshade", "nowrap",
	/* o */
	"object", "onblur", "onchange", "onclick", "ondblclick", "onfocus",
	"onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove",
	"onmouseout", "onmouseover", "onmouseup", "onreset", "onselect",
	"onsubmit", "onunload",
	/* p */
	"profile", "prompt",
	/* r */
	"readonly", "rel", "rev", "rows", "rowspan", "rules",
	/* s */
	"scheme", "scope", "scrolling", "selected", "shape", "size", "span",
	"src", "standby", "start", "style", "summary",
	/* t */
	"tabindex", "target", "text", "title", "type",
	/* u */
	"usemap",
	/* v */
	"valign", "value", "valuetype", "version", "vlink", "vspace",
	/* w */
	"width"
};
251 
252 
253 /* Character entity to unicode character number map. */
254 /* Keep sorted by name. */
255 StringInt *chartab;
256 AsciiInt _chartab[] = {
257 	{"AElig", 198},
258 	{"Aacute", 193},
259 	{"Acirc", 194},
260 	{"Agrave", 192},
261 	{"Aring", 197},
262 	{"Atilde", 195},
263 	{"Auml", 196},
264 	{"Ccedil", 199},
265 	{"ETH", 208},
266 	{"Eacute", 201},
267 	{"Ecirc", 202},
268 	{"Egrave", 200},
269 	{"Euml", 203},
270 	{"Iacute", 205},
271 	{"Icirc", 206},
272 	{"Igrave", 204},
273 	{"Iuml", 207},
274 	{"Ntilde", 209},
275 	{"Oacute", 211},
276 	{"Ocirc", 212},
277 	{"Ograve", 210},
278 	{"Oslash", 216},
279 	{"Otilde", 213},
280 	{"Ouml", 214},
281 	{"THORN", 222},
282 	{"Uacute", 218},
283 	{"Ucirc", 219},
284 	{"Ugrave", 217},
285 	{"Uuml", 220},
286 	{"Yacute", 221},
287 	{"aacute", 225},
288 	{"acirc", 226},
289 	{"acute", 180},
290 	{"aelig", 230},
291 	{"agrave", 224},
292 	{"alpha", 945},
293 	{"amp", 38},
294 	{"aring", 229},
295 	{"atilde", 227},
296 	{"auml", 228},
297 	{"beta", 946},
298 	{"brvbar", 166},
299 	{"ccedil", 231},
300 	{"cdots", 8943},
301 	{"cedil", 184},
302 	{"cent", 162},
303 	{"chi", 967},
304 	{"copy", 169},
305 	{"curren", 164},
306 	{"ddots", 8945},
307 	{"deg", 176},
308 	{"delta", 948},
309 	{"divide", 247},
310 	{"eacute", 233},
311 	{"ecirc", 234},
312 	{"egrave", 232},
313 	{"emdash", 8212},	/* non-standard but commonly used */
314 	{"emsp", 8195},
315 	{"endash", 8211},	/* non-standard but commonly used */
316 	{"ensp", 8194},
317 	{"epsilon", 949},
318 	{"eta", 951},
319 	{"eth", 240},
320 	{"euml", 235},
321 	{"frac12", 189},
322 	{"frac14", 188},
323 	{"frac34", 190},
324 	{"gamma", 947},
325 	{"gt", 62},
326 	{"iacute", 237},
327 	{"icirc", 238},
328 	{"iexcl", 161},
329 	{"igrave", 236},
330 	{"iota", 953},
331 	{"iquest", 191},
332 	{"iuml", 239},
333 	{"kappa", 954},
334 	{"lambda", 955},
335 	{"laquo", 171},
336 	{"ldquo", 8220},
337 	{"ldots", 8230},
338 	{"lsquo", 8216},
339 	{"lt", 60},
340 	{"macr", 175},
341 	{"mdash", 8212},
342 	{"micro", 181},
343 	{"middot", 183},
344 	{"mu", 956},
345 	{"nbsp", 160},
346 	{"ndash", 8211},
347 	{"not", 172},
348 	{"ntilde", 241},
349 	{"nu", 957},
350 	{"oacute", 243},
351 	{"ocirc", 244},
352 	{"ograve", 242},
353 	{"omega", 969},
354 	{"omicron", 959},
355 	{"ordf", 170},
356 	{"ordm", 186},
357 	{"oslash", 248},
358 	{"otilde", 245},
359 	{"ouml", 246},
360 	{"para", 182},
361 	{"phi", 966},
362 	{"pi", 960},
363 	{"plusmn", 177},
364 	{"pound", 163},
365 	{"psi", 968},
366 	{"quad", 8193},
367 	{"quot", 34},
368 	{"raquo", 187},
369 	{"rdquo", 8221},
370 	{"reg", 174},
371 	{"rho", 961},
372 	{"rsquo", 8217},
373 	{"sect", 167},
374 	{"shy", 173},
375 	{"sigma", 963},
376 	{"sp", 8194},
377 	{"sup1", 185},
378 	{"sup2", 178},
379 	{"sup3", 179},
380 	{"szlig", 223},
381 	{"tau", 964},
382 	{"theta", 952},
383 	{"thinsp", 8201},
384 	{"thorn", 254},
385 	{"times", 215},
386 	{"trade", 8482},
387 	{"uacute", 250},
388 	{"ucirc", 251},
389 	{"ugrave", 249},
390 	{"uml", 168},
391 	{"upsilon", 965},
392 	{"uuml", 252},
393 	{"varepsilon", 8712},
394 	{"varphi", 981},
395 	{"varpi", 982},
396 	{"varrho", 1009},
397 	{"vdots", 8942},
398 	{"vsigma", 962},
399 	{"vtheta", 977},
400 	{"xi", 958},
401 	{"yacute", 253},
402 	{"yen", 165},
403 	{"yuml", 255},
404 	{"zeta", 950}
405 };
406 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
407 
/* Characters Winstart..Winend are those that Windows */
/* uses interpolated into the Latin1 set. */
/* They aren't supposed to appear in HTML, but they do.... */
enum {
	Winstart = 127,
	Winend = 159
};

/* Unicode replacement for each byte value Winstart..Winend, indexed by */
/* (byte - Winstart), following the Windows-1252 code page. */
/* Positions with no assigned character map to 8226 (a bullet). */
static int	winchars[Winend - Winstart + 1] = {
	8226,						/* 127 */
	8364, 8226, 8218, 402, 8222, 8230, 8224, 8225,	/* 128-135 */
	710, 8240, 352, 8249, 338, 8226, 381, 8226,	/* 136-143 */
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	/* 144-151 */
	732, 8482, 353, 8250, 339, 8226, 382, 376	/* 152-159 */
};
421 
422 static StringInt*	tagtable;		/* initialized from tagnames */
423 static StringInt*	attrtable;		/* initialized from attrnames */
424 
425 static void		lexinit(void);
426 static int		getplaindata(TokenSource* ts, Token* a, int* pai);
427 static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
428 static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
429 static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
430 static Rune*		buftostr(Rune* s, Rune* buf, int j);
431 static int		comment(TokenSource* ts);
432 static int		findstr(TokenSource* ts, Rune* s);
433 static int		ampersand(TokenSource* ts);
434 /*static int		lowerc(int c); */
435 static int		getchar(TokenSource* ts);
436 static void		ungetchar(TokenSource* ts, int c);
437 static void		backup(TokenSource* ts, int savei);
438 /*static void		freeinsidetoken(Token* t); */
439 static void		freeattrs(Attr* ahead);
440 static Attr*		newattr(int attid, Rune* value, Attr* link);
441 static int		Tconv(Fmt* f);
442 
443 int	dbglex = 0;
444 static int lexinited = 0;
445 
446 static void
447 lexinit(void)
448 {
449 	chartab = _cvtstringinttab(_chartab, nelem(_chartab));
450 	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
451 	tagtable = _makestrinttab(tagnames, Numtags);
452 	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
453 	attrtable = _makestrinttab(attrnames, Numattrs);
454 	fmtinstall('T', Tconv);
455 	lexinited = 1;
456 }
457 
458 static TokenSource*
459 newtokensource(uchar* data, int edata, int chset, int mtype)
460 {
461 	TokenSource*	ans;
462 
463 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
464 			chset == UTF_8 || chset == Unicode);
465 	ans = (TokenSource*)emalloc(sizeof(TokenSource));
466 	ans->i = 0;
467 	ans->data = data;
468 	ans->edata = edata;
469 	ans->chset = chset;
470 	ans->mtype = mtype;
471 	return ans;
472 }
473 
474 enum {
475 	ToksChunk = 500
476 };
477 
478 /* Call this to get the tokens. */
479 /*  The number of returned tokens is returned in *plen. */
480 Token*
481 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
482 {
483 	TokenSource*	ts;
484 	Token*		a;
485 	int	alen;
486 	int	ai;
487 	int	starti;
488 	int	c;
489 	int	tag;
490 
491 	if(!lexinited)
492 		lexinit();
493 	ts = newtokensource(data, datalen, chset, mtype);
494 	alen = ToksChunk;
495 	a = (Token*)emalloc(alen * sizeof(Token));
496 	ai = 0;
497 	if(dbglex)
498 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
499 	if(ts->mtype == TextHtml){
500 		for(;;){
501 			if(ai == alen){
502 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
503 				alen += ToksChunk;
504 			}
505 			starti = ts->i;
506 			c = getchar(ts);
507 			if(c < 0)
508 				break;
509 			if(c == '<'){
510 				tag = gettag(ts, starti, a, &ai);
511 				if(tag == Tscript){
512 					/* special rules for getting Data after.... */
513 					starti = ts->i;
514 					c = getchar(ts);
515 					tag = getscriptdata(ts, c, starti, a, &ai);
516 				}
517 			}
518 			else
519 				tag = getdata(ts, c, starti, a, &ai);
520 			if(tag == -1)
521 				break;
522 			else if(dbglex > 1 && tag != Comment)
523 				fprint(2, "lex: got token %T\n", &a[ai-1]);
524 		}
525 	}
526 	else {
527 		/* plain text (non-html) tokens */
528 		for(;;){
529 			if(ai == alen){
530 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
531 				alen += ToksChunk;
532 			}
533 			tag = getplaindata(ts, a, &ai);
534 			if(tag == -1)
535 				break;
536 			if(dbglex > 1)
537 				fprint(2, "lex: got token %T\n", &a[ai]);
538 		}
539 	}
540 	if(dbglex)
541 		fprint(2, "lex: returning %d tokens\n", ai);
542 	*plen = ai;
543 	if(ai == 0) 
544 		return nil;
545 	return a;
546 }
547 
548 /* For case where source isn't HTML. */
549 /* Just make data tokens, one per line (or partial line, */
550 /* at end of buffer), ignoring non-whitespace control */
551 /* characters and dumping \r's. */
552 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
553 /* Otherwise return -1; */
554 static int
555 getplaindata(TokenSource* ts, Token* a, int* pai)
556 {
557 	Rune*	s;
558 	int	j;
559 	int	starti;
560 	int	c;
561 	Token*	tok;
562 	Rune	buf[BIGBUFSIZE];
563 
564 	s = nil;
565 	j = 0;
566 	starti = ts->i;
567 	for(c = getchar(ts); c >= 0; c = getchar(ts)){
568 		if(c < ' '){
569 			if(isspace(c)){
570 				if(c == '\r'){
571 					/* ignore it unless no following '\n', */
572 					/* in which case treat it like '\n' */
573 					c = getchar(ts);
574 					if(c != '\n'){
575 						if(c >= 0)
576 							ungetchar(ts, c);
577 						c = '\n';
578 					}
579 				}
580 			}
581 			else
582 				c = 0;
583 		}
584 		if(c != 0){
585 			buf[j++] = c;
586 			if(j == sizeof(buf)-1){
587 				s = buftostr(s, buf, j);
588 				j = 0;
589 			}
590 		}
591 		if(c == '\n')
592 			break;
593 	}
594 	s = buftostr(s, buf, j);
595 	if(s == nil)
596 		return -1;
597 	tok = &a[(*pai)++];
598 	tok->tag = Data;
599 	tok->text = s;
600 	tok->attr = nil;
601 	tok->starti = starti;
602 	return Data;
603 }
604 
605 /* Return concatenation of s and buf[0:j] */
606 static Rune*
607 buftostr(Rune* s, Rune* buf, int j)
608 {
609 	buf[j] = 0;
610 	if(s == nil)
611 		s = _Strndup(buf, j);
612 	else 
613 		s = _Strdup2(s, buf);
614 	return s;
615 }
616 
617 /* Gather data up to next start-of-tag or end-of-buffer. */
618 /* Translate entity references (&amp;). */
619 /* Ignore non-whitespace control characters and get rid of \r's. */
620 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
621 /* Otherwise return -1; */
622 static int
623 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
624 {
625 	Rune*	s;
626 	int	j;
627 	int	c;
628 	Token*	tok;
629 	Rune	buf[BIGBUFSIZE];
630 
631 	s = nil;
632 	j = 0;
633 	c = firstc;
634 	while(c >= 0){
635 		if(c == '&'){
636 			c = ampersand(ts);
637 			if(c < 0)
638 				break;
639 		}
640 		else if(c < ' '){
641 			if(isspace(c)){
642 				if(c == '\r'){
643 					/* ignore it unless no following '\n', */
644 					/* in which case treat it like '\n' */
645 					c = getchar(ts);
646 					if(c != '\n'){
647 						if(c >= 0)
648 							ungetchar(ts, c);
649 						c = '\n';
650 					}
651 				}
652 			}
653 			else {
654 				if(warn)
655 					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
656 				c = 0;
657 			}
658 		}
659 		else if(c == '<'){
660 			ungetchar(ts, c);
661 			break;
662 		}
663 		if(c != 0){
664 			buf[j++] = c;
665 			if(j == BIGBUFSIZE-1){
666 				s = buftostr(s, buf, j);
667 				j = 0;
668 			}
669 		}
670 		c = getchar(ts);
671 	}
672 	s = buftostr(s, buf, j);
673 	if(s == nil)
674 		return -1;
675 	tok = &a[(*pai)++];
676 	tok->tag = Data;
677 	tok->text = s;
678 	tok->attr = nil;
679 	tok->starti = starti;
680 	return Data;
681 }
682 
683 /* The rules for lexing scripts are different (ugh). */
684 /* Gather up everything until see a </SCRIPT>. */
685 static int
686 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
687 {
688 	Rune*	s;
689 	int	j;
690 	int	tstarti;
691 	int	savei;
692 	int	c;
693 	int	tag;
694 	int	done;
695 	Token*	tok;
696 	Rune	buf[BIGBUFSIZE];
697 
698 	s = nil;
699 	j = 0;
700 	tstarti = starti;
701 	c = firstc;
702 	done = 0;
703 	while(c >= 0){
704 		if(c == '<'){
705 			/* other browsers ignore stuff to end of line after <! */
706 			savei = ts->i;
707 			c = getchar(ts);
708 			if(c == '!'){
709 				while(c >= 0 && c != '\n' && c != '\r')
710 					c = getchar(ts);
711 				if(c == '\r')
712 					c = getchar(ts);
713 				if(c == '\n')
714 					c = getchar(ts);
715 			}
716 			else if(c >= 0){
717 				backup(ts, savei);
718 				tag = gettag(ts, tstarti, a, pai);
719 				if(tag == -1)
720 					break;
721 				if(tag != Comment)
722 					(*pai)--;
723 				backup(ts, tstarti);
724 				if(tag == Tscript + RBRA){
725 					done = 1;
726 					break;
727 				}
728 				/* here tag was not </SCRIPT>, so take as regular data */
729 				c = getchar(ts);
730 			}
731 		}
732 		if(c < 0)
733 			break;
734 		if(c != 0){
735 			buf[j++] = c;
736 			if(j == BIGBUFSIZE-1){
737 				s = buftostr(s, buf, j);
738 				j = 0;
739 			}
740 		}
741 		tstarti = ts->i;
742 		c = getchar(ts);
743 	}
744 	if(done || ts->i == ts->edata){
745 		s = buftostr(s, buf, j);
746 		tok = &a[(*pai)++];
747 		tok->tag = Data;
748 		tok->text = s;
749 		tok->attr = nil;
750 		tok->starti = starti;
751 		return Data;
752 	}
753 	backup(ts, starti);
754 	return -1;
755 }
756 
757 /* We've just seen a '<'.  Gather up stuff to closing '>' (if buffer */
758 /* ends before then, return -1). */
759 /* If it's a tag, look up the name, gather the attributes, and return */
760 /* the appropriate token. */
761 /* Else it's either just plain data or some kind of ignorable stuff: */
762 /* return Data or Comment as appropriate. */
763 /* If it's not a Comment, put it in a[*pai] and bump *pai. */
764 static int
765 gettag(TokenSource* ts, int starti, Token* a, int* pai)
766 {
767 	int	rbra;
768 	int	ans;
769 	Attr*	al;
770 	int	nexti;
771 	int	c;
772 	int	ti;
773 	int	afnd;
774 	int	attid;
775 	int	quote;
776 	Rune*	val;
777 	int	nv;
778 	int	i;
779 	int	tag;
780 	Token*	tok;
781 	Rune	buf[BIGBUFSIZE];
782 
783 	rbra = 0;
784 	nexti = ts->i;
785 	tok = &a[*pai];
786 	tok->tag = Notfound;
787 	tok->text = nil;
788 	tok->attr = nil;
789 	tok->starti = starti;
790 	c = getchar(ts);
791 	if(c == '/'){
792 		rbra = RBRA;
793 		c = getchar(ts);
794 	}
795 	if(c < 0)
796 		goto eob_done;
797 	if(c >= 256 || !isalpha(c)){
798 		/* not a tag */
799 		if(c == '!'){
800 			ans = comment(ts);
801 			if(ans != -1)
802 				return ans;
803 			goto eob_done;
804 		}
805 		else {
806 			backup(ts, nexti);
807 			tok->tag = Data;
808 			tok->text = _Strdup(L(Llt));
809 			(*pai)++;
810 			return Data;
811 		}
812 	}
813 	/* c starts a tagname */
814 	buf[0] = c;
815 	i = 1;
816 	for(;;){
817 		c = getchar(ts);
818 		if(c < 0)
819 			goto eob_done;
820 		if(!ISNAMCHAR(c))
821 			break;
822 		/* if name is bigger than buf it won't be found anyway... */
823 		if(i < BIGBUFSIZE)
824 			buf[i++] = c;
825 	}
826 	if(_lookup(tagtable, Numtags, buf, i, &tag))
827 		tok->tag = tag + rbra;
828 	else
829 		tok->text = _Strndup(buf, i);	/* for warning print, in build */
830 
831 	/* attribute gathering loop */
832 	al = nil;
833 	for(;;){
834 		/* look for "ws name" or "ws name ws = ws val"  (ws=whitespace) */
835 		/* skip whitespace */
836 attrloop_continue:
837 		while(c < 256 && isspace(c)){
838 			c = getchar(ts);
839 			if(c < 0)
840 				goto eob_done;
841 		}
842 		if(c == '>')
843 			goto attrloop_done;
844 		if(c == '<'){
845 			if(warn)
846 				fprint(2, "warning: unclosed tag\n");
847 			ungetchar(ts, c);
848 			goto attrloop_done;
849 		}
850 		if(c >= 256 || !isalpha(c)){
851 			if(warn)
852 				fprint(2, "warning: expected attribute name\n");
853 			/* skipt to next attribute name */
854 			for(;;){
855 				c = getchar(ts);
856 				if(c < 0)
857 					goto eob_done;
858 				if(c < 256 && isalpha(c))
859 					goto attrloop_continue;
860 				if(c == '<'){
861 					if(warn)
862 						fprint(2, "warning: unclosed tag\n");
863 					ungetchar(ts, 60);
864 					goto attrloop_done;
865 				}
866 				if(c == '>')
867 					goto attrloop_done;
868 			}
869 		}
870 		/* gather attribute name */
871 		buf[0] = c;
872 		i = 1;
873 		for(;;){
874 			c = getchar(ts);
875 			if(c < 0)
876 				goto eob_done;
877 			if(!ISNAMCHAR(c))
878 				break;
879 			if(i < BIGBUFSIZE-1)
880 				buf[i++] = c;
881 		}
882 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
883 		if(warn && !afnd){
884 			buf[i] = 0;
885 			fprint(2, "warning: unknown attribute name %S\n", buf);
886 		}
887 		/* skip whitespace */
888 		while(c < 256 && isspace(c)){
889 			c = getchar(ts);
890 			if(c < 0)
891 				goto eob_done;
892 		}
893 		if(c != '='){
894 			if(afnd)
895 				al = newattr(attid, nil, al);
896 			goto attrloop_continue;
897 		}
898 		/*# c is '=' here;  skip whitespace */
899 		for(;;){
900 			c = getchar(ts);
901 			if(c < 0)
902 				goto eob_done;
903 			if(c >= 256 || !isspace(c))
904 				break;
905 		}
906 		quote = 0;
907 		if(c == '\'' || c == '"'){
908 			quote = c;
909 			c = getchar(ts);
910 			if(c < 0)
911 				goto eob_done;
912 		}
913 		val = nil;
914 		nv = 0;
915 		for(;;){
916 valloop_continue:
917 			if(c < 0)
918 				goto eob_done;
919 			if(c == '>'){
920 				if(quote){
921 					/* c might be part of string (though not good style) */
922 					/* but if line ends before close quote, assume */
923 					/* there was an unmatched quote */
924 					ti = ts->i;
925 					for(;;){
926 						c = getchar(ts);
927 						if(c < 0)
928 							goto eob_done;
929 						if(c == quote){
930 							backup(ts, ti);
931 							buf[nv++] = '>';
932 							if(nv == BIGBUFSIZE-1){
933 								val = buftostr(val, buf, nv);
934 								nv = 0;
935 							}
936 							c = getchar(ts);
937 							goto valloop_continue;
938 						}
939 						if(c == '\n'){
940 							if(warn)
941 								fprint(2, "warning: apparent unmatched quote\n");
942 							backup(ts, ti);
943 							c = '>';
944 							goto valloop_done;
945 						}
946 					}
947 				}
948 				else
949 					goto valloop_done;
950 			}
951 			if(quote){
952 				if(c == quote){
953 					c = getchar(ts);
954 					if(c < 0)
955 						goto eob_done;
956 					goto valloop_done;
957 				}
958 				if(c == '\r'){
959 					c = getchar(ts);
960 					goto valloop_continue;
961 				}
962 				if(c == '\t' || c == '\n')
963 					c = ' ';
964 			}
965 			else {
966 				if(c < 256 && isspace(c))
967 					goto valloop_done;
968 			}
969 			if(c == '&'){
970 				c = ampersand(ts);
971 				if(c == -1)
972 					goto eob_done;
973 			}
974 			buf[nv++] = c;
975 			if(nv == BIGBUFSIZE-1){
976 				val = buftostr(val, buf, nv);
977 				nv = 0;
978 			}
979 			c = getchar(ts);
980 		}
981 valloop_done:
982 		if(afnd){
983 			val = buftostr(val, buf, nv);
984 			al = newattr(attid, val, al);
985 		}
986 	}
987 
988 attrloop_done:
989 	tok->attr = al;
990 	(*pai)++;
991 	return tok->tag;
992 
993 eob_done:
994 	if(warn)
995 		fprint(2, "warning: incomplete tag at end of page\n");
996 	backup(ts, nexti);
997 	tok->tag = Data;
998 	tok->text = _Strdup(L(Llt));
999 	return Data;
1000 }
1001 
1002 /* We've just read a '<!' at position starti, */
1003 /* so this may be a comment or other ignored section, or it may */
1004 /* be just a literal string if there is no close before end of file */
1005 /* (other browsers do that). */
1006 /* The accepted practice seems to be (note: contrary to SGML spec!): */
1007 /* If see <!--, look for --> to close, or if none, > to close. */
1008 /* If see <!(not --), look for > to close. */
1009 /* If no close before end of file, leave original characters in as literal data. */
1010 /* */
1011 /* If we see ignorable stuff, return Comment. */
1012 /* Else return nil (caller should back up and try again when more data arrives, */
1013 /* unless at end of file, in which case caller should just make '<' a data token). */
1014 static int
1015 comment(TokenSource* ts)
1016 {
1017 	int	nexti;
1018 	int	havecomment;
1019 	int	c;
1020 
1021 	nexti = ts->i;
1022 	havecomment = 0;
1023 	c = getchar(ts);
1024 	if(c == '-'){
1025 		c = getchar(ts);
1026 		if(c == '-'){
1027 			if(findstr(ts, L(Larrow)))
1028 				havecomment = 1;
1029 			else
1030 				backup(ts, nexti);
1031 		}
1032 	}
1033 	if(!havecomment){
1034 		if(c == '>')
1035 			havecomment = 1;
1036 		else if(c >= 0){
1037 			if(findstr(ts, L(Lgt)))
1038 				havecomment = 1;
1039 		}
1040 	}
1041 	if(havecomment)
1042 		return Comment;
1043 	return -1;
1044 }
1045 
1046 /* Look for string s in token source. */
1047 /* If found, return 1, with buffer at next char after s, */
1048 /* else return 0 (caller should back up). */
1049 static int
1050 findstr(TokenSource* ts, Rune* s)
1051 {
1052 	int	c0;
1053 	int	n;
1054 	int	nexti;
1055 	int	i;
1056 	int	c;
1057 
1058 	c0 = s[0];
1059 	n = runestrlen(s);
1060 	for(;;){
1061 		c = getchar(ts);
1062 		if(c < 0)
1063 			break;
1064 		if(c == c0){
1065 			if(n == 1)
1066 				return 1;
1067 			nexti = ts->i;
1068 			for(i = 1; i < n; i++){
1069 				c = getchar(ts);
1070 				if(c < 0)
1071 					goto mainloop_done;
1072 				if(c != s[i])
1073 					break;
1074 			}
1075 			if(i == n)
1076 				return 1;
1077 			backup(ts, nexti);
1078 		}
1079 	}
1080 mainloop_done:
1081 	return 0;
1082 }
1083 
/* Value (0-15) of the hexadecimal digit c in either case, */
/* or -1 if c is not a hexadecimal digit. */
static int
xdigit(int c)
{
	int	val;

	val = -1;
	if(c >= '0' && c <= '9')
		val = c - '0';
	else if(c >= 'a' && c <= 'f')
		val = 10 + (c - 'a');
	else if(c >= 'A' && c <= 'F')
		val = 10 + (c - 'A');
	return val;
}
1095 
1096 /* We've just read an '&'; look for an entity reference */
1097 /* name, and if found, return translated char. */
1098 /* if there is a complete entity name but it isn't known, */
1099 /* try prefixes (gets around some buggy HTML out there), */
1100 /* and if that fails, back up to just past the '&' and return '&'. */
1101 /* If the entity can't be completed in the current buffer, back up */
1102 /* to the '&' and return -1. */
1103 static int
1104 ampersand(TokenSource* ts)
1105 {
1106 	int	savei;
1107 	int	c;
1108 	int	fnd;
1109 	int	ans;
1110 	int	v;
1111 	int	i;
1112 	int	k;
1113 	Rune	buf[SMALLBUFSIZE];
1114 
1115 	savei = ts->i;
1116 	c = getchar(ts);
1117 	fnd = 0;
1118 	ans = -1;
1119 	if(c == '#'){
1120 		c = getchar(ts);
1121 		v = 0;
1122 		if(c == 'x'){
1123 			c = getchar(ts);
1124 			while((i=xdigit(c)) != -1){
1125 				v = v*16 + i;
1126 				c = getchar(ts);
1127 			}
1128 		}else{
1129 			while('0' <= c && c <= '9'){
1130 				v = v*10 + c - '0';
1131 				c = getchar(ts);
1132 			}
1133 		}
1134 		if(c >= 0){
1135 			if(!(c == ';' || c == '\n' || c == '\r'))
1136 				ungetchar(ts, c);
1137 			c = v;
1138 			if(c == 160)
1139 				c = 160;
1140 			if(c >= Winstart && c <= Winend){
1141 				c = winchars[c - Winstart];
1142 			}
1143 			ans = c;
1144 			fnd = 1;
1145 		}
1146 	}
1147 	else if(c < 256 && isalpha(c)){
1148 		buf[0] = c;
1149 		k = 1;
1150 		for(;;){
1151 			c = getchar(ts);
1152 			if(c < 0)
1153 				break;
1154 			if(ISNAMCHAR(c)){
1155 				if(k < SMALLBUFSIZE-1)
1156 					buf[k++] = c;
1157 			}
1158 			else {
1159 				if(!(c == ';' || c == '\n' || c == '\r'))
1160 					ungetchar(ts, c);
1161 				break;
1162 			}
1163 		}
1164 		if(c >= 0){
1165 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1166 			if(!fnd){
1167 				/* Try prefixes of s */
1168 				if(c == ';' || c == '\n' || c == '\r')
1169 					ungetchar(ts, c);
1170 				i = k;
1171 				while(--k > 0){
1172 					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1173 					if(fnd){
1174 						while(i > k){
1175 							i--;
1176 							ungetchar(ts, buf[i]);
1177 						}
1178 						break;
1179 					}
1180 				}
1181 			}
1182 		}
1183 	}
1184 	if(!fnd){
1185 		backup(ts, savei);
1186 		ans = '&';
1187 	}
1188 	return ans;
1189 }
1190 
1191 /* Get next char, obeying ts.chset. */
1192 /* Returns -1 if no complete character left before current end of data. */
1193 static int
1194 getchar(TokenSource* ts)
1195 {
1196 	uchar*	buf;
1197 	int	c;
1198 	int	n;
1199 	int	ok;
1200 	Rune	r;
1201 
1202 	if(ts->i >= ts->edata)
1203 		return -1;
1204 	buf = ts->data;
1205 	c = buf[ts->i];
1206 	switch(ts->chset){
1207 	case ISO_8859_1:
1208 		if(c >= Winstart && c <= Winend)
1209 			c = winchars[c - Winstart];
1210 		ts->i++;
1211 		break;
1212 	case US_Ascii:
1213 		if(c > 127){
1214 			if(warn)
1215 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1216 		}
1217 		ts->i++;
1218 		break;
1219 	case UTF_8:
1220 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1221 		n = chartorune(&r, (char*)(buf+ts->i));
1222 		if(ok){
1223 			if(warn && c == 0x80)
1224 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1225 			ts->i += n;
1226 			c = r;
1227 		}
1228 		else {
1229 			/* not enough bytes in buf to complete utf-8 char */
1230 			ts->i = ts->edata;	/* mark "all used" */
1231 			c = -1;
1232 		}
1233 		break;
1234 	case Unicode:
1235 		if(ts->i < ts->edata - 1){
1236 			/*standards say most-significant byte first */
1237 			c = (c << 8)|(buf[ts->i + 1]);
1238 			ts->i += 2;
1239 		}
1240 		else {
1241 			ts->i = ts->edata;	/* mark "all used" */
1242 			c = -1;
1243 		}
1244 		break;
1245 	}
1246 	return c;
1247 }
1248 
1249 /* Assuming c was the last character returned by getchar, set */
1250 /* things up so that next getchar will get that same character */
1251 /* followed by the current 'next character', etc. */
1252 static void
1253 ungetchar(TokenSource* ts, int c)
1254 {
1255 	int	n;
1256 	Rune	r;
1257 	char	a[UTFmax];
1258 
1259 	n = 1;
1260 	switch(ts->chset){
1261 	case UTF_8:
1262 		if(c >= 128){
1263 			r = c;
1264 			n = runetochar(a, &r);
1265 		}
1266 		break;
1267 	case Unicode:
1268 		n = 2;
1269 		break;
1270 	}
1271 	ts->i -= n;
1272 }
1273 
1274 /* Restore ts so that it is at the state where the index was savei. */
1275 static void
1276 backup(TokenSource* ts, int savei)
1277 {
1278 	if(dbglex)
1279 		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1280 	ts->i = savei;
1281 }
1282 
1283 
1284 /* Look for value associated with attribute attid in token t. */
1285 /* If there is one, return 1 and put the value in *pans, */
1286 /* else return 0. */
1287 /* If xfer is true, transfer ownership of the string to the caller */
1288 /* (nil it out here); otherwise, caller must duplicate the answer */
1289 /* if it needs to save it. */
1290 /* OK to have pans==0, in which case this is just looking */
1291 /* to see if token is present. */
1292 int
1293 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1294 {
1295 	Attr*	attr;
1296 
1297 	attr = t->attr;
1298 	while(attr != nil){
1299 		if(attr->attid == attid){
1300 			if(pans != nil)
1301 				*pans = attr->value;
1302 			if(xfer)
1303 				attr->value = nil;
1304 			return 1;
1305 		}
1306 		attr = attr->next;
1307 	}
1308 	if(pans != nil)
1309 		*pans = nil;
1310 	return 0;
1311 }
1312 
1313 static int
1314 Tconv(Fmt *f)
1315 {
1316 	Token*	t;
1317 	int	i;
1318 	int	tag;
1319 	char*	srbra;
1320 	Rune*	aname;
1321 	Rune*	tname;
1322 	Attr*	a;
1323 	char	buf[BIGBUFSIZE];
1324 
1325 	t = va_arg(f->args, Token*);
1326 	if(t == nil)
1327 		sprint(buf, "<null>");
1328 	else {
1329 		i = 0;
1330 		if(dbglex > 1)
1331 			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1332 		tag = t->tag;
1333 		if(tag == Data){
1334 			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1335 		}
1336 		else {
1337 			srbra = "";
1338 			if(tag >= RBRA){
1339 				tag -= RBRA;
1340 				srbra = "/";
1341 			}
1342 			tname = tagnames[tag];
1343 			if(tag == Notfound)
1344 				tname = L(Lquestion);
1345 			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1346 			for(a = t->attr; a != nil; a = a->next){
1347 				aname = attrnames[a->attid];
1348 				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1349 				if(a->value != nil)
1350 					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1351 			}
1352 			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1353 		}
1354 		buf[i] = 0;
1355 	}
1356 	return fmtstrcpy(f, buf);
1357 }
1358 
1359 /* Attrs own their constituent strings, but build may eventually */
1360 /* transfer some values to its items and nil them out in the Attr. */
1361 static Attr*
1362 newattr(int attid, Rune* value, Attr* link)
1363 {
1364 	Attr* ans;
1365 
1366 	ans = (Attr*)emalloc(sizeof(Attr));
1367 	ans->attid = attid;
1368 	ans->value = value;
1369 	ans->next = link;
1370 	return ans;
1371 }
1372 
1373 /* Free list of Attrs linked through next field */
1374 static void
1375 freeattrs(Attr* ahead)
1376 {
1377 	Attr* a;
1378 	Attr* nexta;
1379 
1380 	a = ahead;
1381 	while(a != nil){
1382 		nexta = a->next;
1383 		free(a->value);
1384 		free(a);
1385 		a = nexta;
1386 	}
1387 }
1388 
1389 /* Free array of Tokens. */
1390 /* Allocated space might have room for more than n tokens, */
1391 /* but only n of them are initialized. */
1392 /* If caller has transferred ownership of constitutent strings */
1393 /* or attributes, it must have nil'd out the pointers in the Tokens. */
1394 void
1395 _freetokens(Token* tarray, int n)
1396 {
1397 	int i;
1398 	Token* t;
1399 
1400 	if(tarray == nil)
1401 		return;
1402 	for(i = 0; i < n; i++){
1403 		t = &tarray[i];
1404 		free(t->text);
1405 		freeattrs(t->attr);
1406 	}
1407 	free(tarray);
1408 }