op public repos

Blob

Date:: Mon Sep 19 16:46:08 2005 UTC
Message:: The people who use emdash and endash are probably the same ones who think data is plural.
Actions:: History | Blame | Raw File
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7 
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 	int			i;		// index of next byte to use
12 	uchar*		data;		// all the data
13 	int			edata;	// data[0:edata] is valid
14 	int			chset;	// one of US_Ascii, etc.
15 	int			mtype;	// TextHtml or TextPlain
16 };
17 
// Sentinel codes; both are negative.
enum { EOB = -1, EOF = -2 };
22 
// True when c is below 256 and is a letter, a digit, '-' or '.'.
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))

// Capacities of the scratch buffers used by the lexer.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune **tagnames;
31 char *_tagnames[] = {
32 	" ",
33 	"!",
34 	"a", 
35 	"abbr",
36 	"acronym",
37 	"address",
38 	"applet", 
39 	"area",
40 	"b",
41 	"base",
42 	"basefont",
43 	"bdo",
44 	"big",
45 	"blink",
46 	"blockquote",
47 	"body",
48 	"bq",
49 	"br",
50 	"button",
51 	"caption",
52 	"center",
53 	"cite",
54 	"code",
55 	"col",
56 	"colgroup",
57 	"dd",
58 	"del",
59 	"dfn",
60 	"dir",
61 	"div",
62 	"dl",
63 	"dt",
64 	"em",
65 	"fieldset",
66 	"font",
67 	"form",
68 	"frame",
69 	"frameset",
70 	"h1",
71 	"h2",
72 	"h3",
73 	"h4",
74 	"h5",
75 	"h6",
76 	"head",
77 	"hr",
78 	"html",
79 	"i",
80 	"iframe",
81 	"img",
82 	"input",
83 	"ins",
84 	"isindex",
85 	"kbd",
86 	"label",
87 	"legend",
88 	"li",
89 	"link",
90 	"map",
91 	"menu",
92 	"meta",
93 	"nobr",
94 	"noframes",
95 	"noscript",
96 	"object",
97 	"ol",
98 	"optgroup",
99 	"option",
100 	"p",
101 	"param",
102 	"pre",
103 	"q",
104 	"s",
105 	"samp",
106 	"script",
107 	"select",
108 	"small",
109 	"span",
110 	"strike",
111 	"strong",
112 	"style",
113 	"sub",
114 	"sup",
115 	"table",
116 	"tbody",
117 	"td",
118 	"textarea",
119 	"tfoot",
120 	"th",
121 	"thead",
122 	"title",
123 	"tr",
124 	"tt",
125 	"u",
126 	"ul",
127 	"var"
128 };
129 
130 // HTML 4.0 attribute names.
131 // Keep sorted, and in correspondence with enum in i.h.
132 Rune **attrnames;
133 char* _attrnames[] = {
134 	"abbr",
135 	"accept-charset",
136 	"access-key",
137 	"action",
138 	"align",
139 	"alink",
140 	"alt",
141 	"archive",
142 	"axis",
143 	"background",
144 	"bgcolor",
145 	"border",
146 	"cellpadding",
147 	"cellspacing",
148 	"char",
149 	"charoff",
150 	"charset",
151 	"checked",
152 	"cite",
153 	"class",
154 	"classid",
155 	"clear",
156 	"code",
157 	"codebase",
158 	"codetype",
159 	"color",
160 	"cols",
161 	"colspan",
162 	"compact",
163 	"content",
164 	"coords",
165 	"data",
166 	"datetime",
167 	"declare",
168 	"defer",
169 	"dir",
170 	"disabled",
171 	"enctype",
172 	"face",
173 	"for",
174 	"frame",
175 	"frameborder",
176 	"headers",
177 	"height",
178 	"href",
179 	"hreflang",
180 	"hspace",
181 	"http-equiv",
182 	"id",
183 	"ismap",
184 	"label",
185 	"lang",
186 	"link",
187 	"longdesc",
188 	"marginheight",
189 	"marginwidth",
190 	"maxlength",
191 	"media",
192 	"method",
193 	"multiple",
194 	"name",
195 	"nohref",
196 	"noresize",
197 	"noshade",
198 	"nowrap",
199 	"object",
200 	"onblur",
201 	"onchange",
202 	"onclick",
203 	"ondblclick",
204 	"onfocus",
205 	"onkeypress",
206 	"onkeyup",
207 	"onload",
208 	"onmousedown",
209 	"onmousemove",
210 	"onmouseout",
211 	"onmouseover",
212 	"onmouseup",
213 	"onreset",
214 	"onselect",
215 	"onsubmit",
216 	"onunload",
217 	"profile",
218 	"prompt",
219 	"readonly",
220 	"rel",
221 	"rev",
222 	"rows",
223 	"rowspan",
224 	"rules",
225 	"scheme",
226 	"scope",
227 	"scrolling",
228 	"selected",
229 	"shape",
230 	"size",
231 	"span",
232 	"src",
233 	"standby",
234 	"start",
235 	"style",
236 	"summary",
237 	"tabindex",
238 	"target",
239 	"text",
240 	"title",
241 	"type",
242 	"usemap",
243 	"valign",
244 	"value",
245 	"valuetype",
246 	"version",
247 	"vlink",
248 	"vspace",
249 	"width"
250 };
251 
252 
253 // Character entity to unicode character number map.
254 // Keep sorted by name.
255 StringInt *chartab;
256 AsciiInt _chartab[] = {
257 	{"AElig", 198},
258 	{"Aacute", 193},
259 	{"Acirc", 194},
260 	{"Agrave", 192},
261 	{"Aring", 197},
262 	{"Atilde", 195},
263 	{"Auml", 196},
264 	{"Ccedil", 199},
265 	{"ETH", 208},
266 	{"Eacute", 201},
267 	{"Ecirc", 202},
268 	{"Egrave", 200},
269 	{"Euml", 203},
270 	{"Iacute", 205},
271 	{"Icirc", 206},
272 	{"Igrave", 204},
273 	{"Iuml", 207},
274 	{"Ntilde", 209},
275 	{"Oacute", 211},
276 	{"Ocirc", 212},
277 	{"Ograve", 210},
278 	{"Oslash", 216},
279 	{"Otilde", 213},
280 	{"Ouml", 214},
281 	{"THORN", 222},
282 	{"Uacute", 218},
283 	{"Ucirc", 219},
284 	{"Ugrave", 217},
285 	{"Uuml", 220},
286 	{"Yacute", 221},
287 	{"aacute", 225},
288 	{"acirc", 226},
289 	{"acute", 180},
290 	{"aelig", 230},
291 	{"agrave", 224},
292 	{"alpha", 945},
293 	{"amp", 38},
294 	{"aring", 229},
295 	{"atilde", 227},
296 	{"auml", 228},
297 	{"beta", 946},
298 	{"brvbar", 166},
299 	{"ccedil", 231},
300 	{"cdots", 8943},
301 	{"cedil", 184},
302 	{"cent", 162},
303 	{"chi", 967},
304 	{"copy", 169},
305 	{"curren", 164},
306 	{"ddots", 8945},
307 	{"deg", 176},
308 	{"delta", 948},
309 	{"divide", 247},
310 	{"eacute", 233},
311 	{"ecirc", 234},
312 	{"egrave", 232},
313 	{"emdash", 8212},	/* non-standard but commonly used */
314 	{"emsp", 8195},
315 	{"endash", 8211},	/* non-standard but commonly used */
316 	{"ensp", 8194},
317 	{"epsilon", 949},
318 	{"eta", 951},
319 	{"eth", 240},
320 	{"euml", 235},
321 	{"frac12", 189},
322 	{"frac14", 188},
323 	{"frac34", 190},
324 	{"gamma", 947},
325 	{"gt", 62},
326 	{"iacute", 237},
327 	{"icirc", 238},
328 	{"iexcl", 161},
329 	{"igrave", 236},
330 	{"iota", 953},
331 	{"iquest", 191},
332 	{"iuml", 239},
333 	{"kappa", 954},
334 	{"lambda", 955},
335 	{"laquo", 171},
336 	{"ldots", 8230},
337 	{"lt", 60},
338 	{"macr", 175},
339 	{"mdash", 8212},
340 	{"micro", 181},
341 	{"middot", 183},
342 	{"mu", 956},
343 	{"nbsp", 160},
344 	{"ndash", 8211},
345 	{"not", 172},
346 	{"ntilde", 241},
347 	{"nu", 957},
348 	{"oacute", 243},
349 	{"ocirc", 244},
350 	{"ograve", 242},
351 	{"omega", 969},
352 	{"omicron", 959},
353 	{"ordf", 170},
354 	{"ordm", 186},
355 	{"oslash", 248},
356 	{"otilde", 245},
357 	{"ouml", 246},
358 	{"para", 182},
359 	{"phi", 966},
360 	{"pi", 960},
361 	{"plusmn", 177},
362 	{"pound", 163},
363 	{"psi", 968},
364 	{"quad", 8193},
365 	{"quot", 34},
366 	{"raquo", 187},
367 	{"reg", 174},
368 	{"rho", 961},
369 	{"sect", 167},
370 	{"shy", 173},
371 	{"sigma", 963},
372 	{"sp", 8194},
373 	{"sup1", 185},
374 	{"sup2", 178},
375 	{"sup3", 179},
376 	{"szlig", 223},
377 	{"tau", 964},
378 	{"theta", 952},
379 	{"thinsp", 8201},
380 	{"thorn", 254},
381 	{"times", 215},
382 	{"trade", 8482},
383 	{"uacute", 250},
384 	{"ucirc", 251},
385 	{"ugrave", 249},
386 	{"uml", 168},
387 	{"upsilon", 965},
388 	{"uuml", 252},
389 	{"varepsilon", 8712},
390 	{"varphi", 981},
391 	{"varpi", 982},
392 	{"varrho", 1009},
393 	{"vdots", 8942},
394 	{"vsigma", 962},
395 	{"vtheta", 977},
396 	{"xi", 958},
397 	{"yacute", 253},
398 	{"yen", 165},
399 	{"yuml", 255},
400 	{"zeta", 950}
401 };
402 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
403 
// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// winchars[c - Winstart] is the Unicode value substituted for code c.
// Entries follow the Windows-1252 code page; 127 and the codes that
// Windows-1252 leaves unassigned (129, 141, 143, 144, 157) become a
// bullet (8226).
static int	winchars[] = {
	8226,						// 127
	8364, 8226, 8218, 402, 8222, 8230, 8224, 8225,	// 128-135 (128 is the Euro sign)
	710, 8240, 352, 8249, 338, 8226, 381, 8226,	// 136-143 (142 is Z caron)
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	732, 8482, 353, 8250, 339, 8226, 382, 376	// 152-159 (158 is z caron)
};
417 
418 static StringInt*	tagtable;		// initialized from tagnames
419 static StringInt*	attrtable;		// initialized from attrnames
420 
421 static void		lexinit(void);
422 static int		getplaindata(TokenSource* ts, Token* a, int* pai);
423 static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
424 static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
425 static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
426 static Rune*		buftostr(Rune* s, Rune* buf, int j);
427 static int		comment(TokenSource* ts);
428 static int		findstr(TokenSource* ts, Rune* s);
429 static int		ampersand(TokenSource* ts);
430 //static int		lowerc(int c);
431 static int		getchar(TokenSource* ts);
432 static void		ungetchar(TokenSource* ts, int c);
433 static void		backup(TokenSource* ts, int savei);
434 //static void		freeinsidetoken(Token* t);
435 static void		freeattrs(Attr* ahead);
436 static Attr*		newattr(int attid, Rune* value, Attr* link);
437 static int		Tconv(Fmt* f);
438 
439 int	dbglex = 0;
440 static int lexinited = 0;
441 
442 static void
443 lexinit(void)
444 {
445 	chartab = _cvtstringinttab(_chartab, nelem(_chartab));
446 	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
447 	tagtable = _makestrinttab(tagnames, Numtags);
448 	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
449 	attrtable = _makestrinttab(attrnames, Numattrs);
450 	fmtinstall('T', Tconv);
451 	lexinited = 1;
452 }
453 
454 static TokenSource*
455 newtokensource(uchar* data, int edata, int chset, int mtype)
456 {
457 	TokenSource*	ans;
458 
459 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
460 			chset == UTF_8 || chset == Unicode);
461 	ans = (TokenSource*)emalloc(sizeof(TokenSource));
462 	ans->i = 0;
463 	ans->data = data;
464 	ans->edata = edata;
465 	ans->chset = chset;
466 	ans->mtype = mtype;
467 	return ans;
468 }
469 
470 enum {
471 	ToksChunk = 500
472 };
473 
474 // Call this to get the tokens.
475 //  The number of returned tokens is returned in *plen.
476 Token*
477 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
478 {
479 	TokenSource*	ts;
480 	Token*		a;
481 	int	alen;
482 	int	ai;
483 	int	starti;
484 	int	c;
485 	int	tag;
486 
487 	if(!lexinited)
488 		lexinit();
489 	ts = newtokensource(data, datalen, chset, mtype);
490 	alen = ToksChunk;
491 	a = (Token*)emalloc(alen * sizeof(Token));
492 	ai = 0;
493 	if(dbglex)
494 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
495 	if(ts->mtype == TextHtml) {
496 		for(;;) {
497 			if(ai == alen) {
498 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
499 				alen += ToksChunk;
500 			}
501 			starti = ts->i;
502 			c = getchar(ts);
503 			if(c < 0)
504 				break;
505 			if(c == '<') {
506 				tag = gettag(ts, starti, a, &ai);
507 				if(tag == Tscript) {
508 					// special rules for getting Data after....
509 					starti = ts->i;
510 					c = getchar(ts);
511 					tag = getscriptdata(ts, c, starti, a, &ai);
512 				}
513 			}
514 			else
515 				tag = getdata(ts, c, starti, a, &ai);
516 			if(tag == -1)
517 				break;
518 			else if(dbglex > 1 && tag != Comment)
519 				fprint(2, "lex: got token %T\n", &a[ai-1]);
520 		}
521 	}
522 	else {
523 		// plain text (non-html) tokens
524 		for(;;) {
525 			if(ai == alen) {
526 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
527 				alen += ToksChunk;
528 			}
529 			tag = getplaindata(ts, a, &ai);
530 			if(tag == -1)
531 				break;
532 			if(dbglex > 1)
533 				fprint(2, "lex: got token %T\n", &a[ai]);
534 		}
535 	}
536 	if(dbglex)
537 		fprint(2, "lex: returning %d tokens\n", ai);
538 	*plen = ai;
539 	if(ai == 0) 
540 		return nil;
541 	return a;
542 }
543 
544 // For case where source isn't HTML.
545 // Just make data tokens, one per line (or partial line,
546 // at end of buffer), ignoring non-whitespace control
547 // characters and dumping \r's.
548 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
549 // Otherwise return -1;
550 static int
551 getplaindata(TokenSource* ts, Token* a, int* pai)
552 {
553 	Rune*	s;
554 	int	j;
555 	int	starti;
556 	int	c;
557 	Token*	tok;
558 	Rune	buf[BIGBUFSIZE];
559 
560 	s = nil;
561 	j = 0;
562 	starti = ts->i;
563 	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
564 		if(c < ' ') {
565 			if(isspace(c)) {
566 				if(c == '\r') {
567 					// ignore it unless no following '\n',
568 					// in which case treat it like '\n'
569 					c = getchar(ts);
570 					if(c != '\n') {
571 						if(c >= 0)
572 							ungetchar(ts, c);
573 						c = '\n';
574 					}
575 				}
576 			}
577 			else
578 				c = 0;
579 		}
580 		if(c != 0) {
581 			buf[j++] = c;
582 			if(j == sizeof(buf)-1) {
583 				s = buftostr(s, buf, j);
584 				j = 0;
585 			}
586 		}
587 		if(c == '\n')
588 			break;
589 	}
590 	s = buftostr(s, buf, j);
591 	if(s == nil)
592 		return -1;
593 	tok = &a[(*pai)++];
594 	tok->tag = Data;
595 	tok->text = s;
596 	tok->attr = nil;
597 	tok->starti = starti;
598 	return Data;
599 }
600 
601 // Return concatenation of s and buf[0:j]
602 static Rune*
603 buftostr(Rune* s, Rune* buf, int j)
604 {
605 	buf[j] = 0;
606 	if(s == nil)
607 		s = _Strndup(buf, j);
608 	else 
609 		s = _Strdup2(s, buf);
610 	return s;
611 }
612 
613 // Gather data up to next start-of-tag or end-of-buffer.
614 // Translate entity references (&amp;).
615 // Ignore non-whitespace control characters and get rid of \r's.
616 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
617 // Otherwise return -1;
618 static int
619 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
620 {
621 	Rune*	s;
622 	int	j;
623 	int	c;
624 	Token*	tok;
625 	Rune	buf[BIGBUFSIZE];
626 
627 	s = nil;
628 	j = 0;
629 	c = firstc;
630 	while(c >= 0) {
631 		if(c == '&') {
632 			c = ampersand(ts);
633 			if(c < 0)
634 				break;
635 		}
636 		else if(c < ' ') {
637 			if(isspace(c)) {
638 				if(c == '\r') {
639 					// ignore it unless no following '\n',
640 					// in which case treat it like '\n'
641 					c = getchar(ts);
642 					if(c != '\n') {
643 						if(c >= 0)
644 							ungetchar(ts, c);
645 						c = '\n';
646 					}
647 				}
648 			}
649 			else {
650 				if(warn)
651 					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
652 				c = 0;
653 			}
654 		}
655 		else if(c == '<') {
656 			ungetchar(ts, c);
657 			break;
658 		}
659 		if(c != 0) {
660 			buf[j++] = c;
661 			if(j == BIGBUFSIZE-1) {
662 				s = buftostr(s, buf, j);
663 				j = 0;
664 			}
665 		}
666 		c = getchar(ts);
667 	}
668 	s = buftostr(s, buf, j);
669 	if(s == nil)
670 		return -1;
671 	tok = &a[(*pai)++];
672 	tok->tag = Data;
673 	tok->text = s;
674 	tok->attr = nil;
675 	tok->starti = starti;
676 	return Data;
677 }
678 
679 // The rules for lexing scripts are different (ugh).
680 // Gather up everything until see a </SCRIPT>.
681 static int
682 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
683 {
684 	Rune*	s;
685 	int	j;
686 	int	tstarti;
687 	int	savei;
688 	int	c;
689 	int	tag;
690 	int	done;
691 	Token*	tok;
692 	Rune	buf[BIGBUFSIZE];
693 
694 	s = nil;
695 	j = 0;
696 	tstarti = starti;
697 	c = firstc;
698 	done = 0;
699 	while(c >= 0) {
700 		if(c == '<') {
701 			// other browsers ignore stuff to end of line after <!
702 			savei = ts->i;
703 			c = getchar(ts);
704 			if(c == '!') {
705 				while(c >= 0 && c != '\n' && c != '\r')
706 					c = getchar(ts);
707 				if(c == '\r')
708 					c = getchar(ts);
709 				if(c == '\n')
710 					c = getchar(ts);
711 			}
712 			else if(c >= 0) {
713 				backup(ts, savei);
714 				tag = gettag(ts, tstarti, a, pai);
715 				if(tag == -1)
716 					break;
717 				if(tag != Comment)
718 					(*pai)--;
719 				backup(ts, tstarti);
720 				if(tag == Tscript + RBRA) {
721 					done = 1;
722 					break;
723 				}
724 				// here tag was not </SCRIPT>, so take as regular data
725 				c = getchar(ts);
726 			}
727 		}
728 		if(c < 0)
729 			break;
730 		if(c != 0) {
731 			buf[j++] = c;
732 			if(j == BIGBUFSIZE-1) {
733 				s = buftostr(s, buf, j);
734 				j = 0;
735 			}
736 		}
737 		tstarti = ts->i;
738 		c = getchar(ts);
739 	}
740 	if(done || ts->i == ts->edata) {
741 		s = buftostr(s, buf, j);
742 		tok = &a[(*pai)++];
743 		tok->tag = Data;
744 		tok->text = s;
745 		tok->attr = nil;
746 		tok->starti = starti;
747 		return Data;
748 	}
749 	backup(ts, starti);
750 	return -1;
751 }
752 
753 // We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
754 // ends before then, return -1).
755 // If it's a tag, look up the name, gather the attributes, and return
756 // the appropriate token.
757 // Else it's either just plain data or some kind of ignorable stuff:
758 // return Data or Comment as appropriate.
759 // If it's not a Comment, put it in a[*pai] and bump *pai.
760 static int
761 gettag(TokenSource* ts, int starti, Token* a, int* pai)
762 {
763 	int	rbra;
764 	int	ans;
765 	Attr*	al;
766 	int	nexti;
767 	int	c;
768 	int	ti;
769 	int	afnd;
770 	int	attid;
771 	int	quote;
772 	Rune*	val;
773 	int	nv;
774 	int	i;
775 	int	tag;
776 	Token*	tok;
777 	Rune	buf[BIGBUFSIZE];
778 
779 	rbra = 0;
780 	nexti = ts->i;
781 	tok = &a[*pai];
782 	tok->tag = Notfound;
783 	tok->text = nil;
784 	tok->attr = nil;
785 	tok->starti = starti;
786 	c = getchar(ts);
787 	if(c == '/') {
788 		rbra = RBRA;
789 		c = getchar(ts);
790 	}
791 	if(c < 0)
792 		goto eob_done;
793 	if(c >= 256 || !isalpha(c)) {
794 		// not a tag
795 		if(c == '!') {
796 			ans = comment(ts);
797 			if(ans != -1)
798 				return ans;
799 			goto eob_done;
800 		}
801 		else {
802 			backup(ts, nexti);
803 			tok->tag = Data;
804 			tok->text = _Strdup(L(Llt));
805 			(*pai)++;
806 			return Data;
807 		}
808 	}
809 	// c starts a tagname
810 	buf[0] = c;
811 	i = 1;
812 	while(1) {
813 		c = getchar(ts);
814 		if(c < 0)
815 			goto eob_done;
816 		if(!ISNAMCHAR(c))
817 			break;
818 		// if name is bigger than buf it won't be found anyway...
819 		if(i < BIGBUFSIZE)
820 			buf[i++] = c;
821 	}
822 	if(_lookup(tagtable, Numtags, buf, i, &tag))
823 		tok->tag = tag + rbra;
824 	else
825 		tok->text = _Strndup(buf, i);	// for warning print, in build
826 
827 	// attribute gathering loop
828 	al = nil;
829 	while(1) {
830 		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
831 		// skip whitespace
832 attrloop_continue:
833 		while(c < 256 && isspace(c)) {
834 			c = getchar(ts);
835 			if(c < 0)
836 				goto eob_done;
837 		}
838 		if(c == '>')
839 			goto attrloop_done;
840 		if(c == '<') {
841 			if(warn)
842 				fprint(2, "warning: unclosed tag\n");
843 			ungetchar(ts, c);
844 			goto attrloop_done;
845 		}
846 		if(c >= 256 || !isalpha(c)) {
847 			if(warn)
848 				fprint(2, "warning: expected attribute name\n");
849 			// skipt to next attribute name
850 			while(1) {
851 				c = getchar(ts);
852 				if(c < 0)
853 					goto eob_done;
854 				if(c < 256 && isalpha(c))
855 					goto attrloop_continue;
856 				if(c == '<') {
857 					if(warn)
858 						fprint(2, "warning: unclosed tag\n");
859 					ungetchar(ts, 60);
860 					goto attrloop_done;
861 				}
862 				if(c == '>')
863 					goto attrloop_done;
864 			}
865 		}
866 		// gather attribute name
867 		buf[0] = c;
868 		i = 1;
869 		while(1) {
870 			c = getchar(ts);
871 			if(c < 0)
872 				goto eob_done;
873 			if(!ISNAMCHAR(c))
874 				break;
875 			if(i < BIGBUFSIZE-1)
876 				buf[i++] = c;
877 		}
878 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
879 		if(warn && !afnd) {
880 			buf[i] = 0;
881 			fprint(2, "warning: unknown attribute name %S\n", buf);
882 		}
883 		// skip whitespace
884 		while(c < 256 && isspace(c)) {
885 			c = getchar(ts);
886 			if(c < 0)
887 				goto eob_done;
888 		}
889 		if(c != '=') {
890 			if(afnd)
891 				al = newattr(attid, nil, al);
892 			goto attrloop_continue;
893 		}
894 		//# c is '=' here;  skip whitespace
895 		while(1) {
896 			c = getchar(ts);
897 			if(c < 0)
898 				goto eob_done;
899 			if(c >= 256 || !isspace(c))
900 				break;
901 		}
902 		quote = 0;
903 		if(c == '\'' || c == '"') {
904 			quote = c;
905 			c = getchar(ts);
906 			if(c < 0)
907 				goto eob_done;
908 		}
909 		val = nil;
910 		nv = 0;
911 		while(1) {
912 valloop_continue:
913 			if(c < 0)
914 				goto eob_done;
915 			if(c == '>') {
916 				if(quote) {
917 					// c might be part of string (though not good style)
918 					// but if line ends before close quote, assume
919 					// there was an unmatched quote
920 					ti = ts->i;
921 					while(1) {
922 						c = getchar(ts);
923 						if(c < 0)
924 							goto eob_done;
925 						if(c == quote) {
926 							backup(ts, ti);
927 							buf[nv++] = '>';
928 							if(nv == BIGBUFSIZE-1) {
929 								val = buftostr(val, buf, nv);
930 								nv = 0;
931 							}
932 							c = getchar(ts);
933 							goto valloop_continue;
934 						}
935 						if(c == '\n') {
936 							if(warn)
937 								fprint(2, "warning: apparent unmatched quote\n");
938 							backup(ts, ti);
939 							c = '>';
940 							goto valloop_done;
941 						}
942 					}
943 				}
944 				else
945 					goto valloop_done;
946 			}
947 			if(quote) {
948 				if(c == quote) {
949 					c = getchar(ts);
950 					if(c < 0)
951 						goto eob_done;
952 					goto valloop_done;
953 				}
954 				if(c == '\r') {
955 					c = getchar(ts);
956 					goto valloop_continue;
957 				}
958 				if(c == '\t' || c == '\n')
959 					c = ' ';
960 			}
961 			else {
962 				if(c < 256 && isspace(c))
963 					goto valloop_done;
964 			}
965 			if(c == '&') {
966 				c = ampersand(ts);
967 				if(c == -1)
968 					goto eob_done;
969 			}
970 			buf[nv++] = c;
971 			if(nv == BIGBUFSIZE-1) {
972 				val = buftostr(val, buf, nv);
973 				nv = 0;
974 			}
975 			c = getchar(ts);
976 		}
977 valloop_done:
978 		if(afnd) {
979 			val = buftostr(val, buf, nv);
980 			al = newattr(attid, val, al);
981 		}
982 	}
983 
984 attrloop_done:
985 	tok->attr = al;
986 	(*pai)++;
987 	return tok->tag;
988 
989 eob_done:
990 	if(warn)
991 		fprint(2, "warning: incomplete tag at end of page\n");
992 	backup(ts, nexti);
993 	tok->tag = Data;
994 	tok->text = _Strdup(L(Llt));
995 	return Data;
996 }
997 
998 // We've just read a '<!' at position starti,
999 // so this may be a comment or other ignored section, or it may
1000 // be just a literal string if there is no close before end of file
1001 // (other browsers do that).
1002 // The accepted practice seems to be (note: contrary to SGML spec!):
1003 // If see <!--, look for --> to close, or if none, > to close.
1004 // If see <!(not --), look for > to close.
1005 // If no close before end of file, leave original characters in as literal data.
1006 //
1007 // If we see ignorable stuff, return Comment.
1008 // Else return nil (caller should back up and try again when more data arrives,
1009 // unless at end of file, in which case caller should just make '<' a data token).
1010 static int
1011 comment(TokenSource* ts)
1012 {
1013 	int	nexti;
1014 	int	havecomment;
1015 	int	c;
1016 
1017 	nexti = ts->i;
1018 	havecomment = 0;
1019 	c = getchar(ts);
1020 	if(c == '-') {
1021 		c = getchar(ts);
1022 		if(c == '-') {
1023 			if(findstr(ts, L(Larrow)))
1024 				havecomment = 1;
1025 			else
1026 				backup(ts, nexti);
1027 		}
1028 	}
1029 	if(!havecomment) {
1030 		if(c == '>')
1031 			havecomment = 1;
1032 		else if(c >= 0) {
1033 			if(findstr(ts, L(Lgt)))
1034 				havecomment = 1;
1035 		}
1036 	}
1037 	if(havecomment)
1038 		return Comment;
1039 	return -1;
1040 }
1041 
1042 // Look for string s in token source.
1043 // If found, return 1, with buffer at next char after s,
1044 // else return 0 (caller should back up).
1045 static int
1046 findstr(TokenSource* ts, Rune* s)
1047 {
1048 	int	c0;
1049 	int	n;
1050 	int	nexti;
1051 	int	i;
1052 	int	c;
1053 
1054 	c0 = s[0];
1055 	n = runestrlen(s);
1056 	while(1) {
1057 		c = getchar(ts);
1058 		if(c < 0)
1059 			break;
1060 		if(c == c0) {
1061 			if(n == 1)
1062 				return 1;
1063 			nexti = ts->i;
1064 			for(i = 1; i < n; i++) {
1065 				c = getchar(ts);
1066 				if(c < 0)
1067 					goto mainloop_done;
1068 				if(c != s[i])
1069 					break;
1070 			}
1071 			if(i == n)
1072 				return 1;
1073 			backup(ts, nexti);
1074 		}
1075 	}
1076 mainloop_done:
1077 	return 0;
1078 }
1079 
1080 // We've just read an '&'; look for an entity reference
1081 // name, and if found, return translated char.
1082 // if there is a complete entity name but it isn't known,
1083 // try prefixes (gets around some buggy HTML out there),
1084 // and if that fails, back up to just past the '&' and return '&'.
1085 // If the entity can't be completed in the current buffer, back up
1086 // to the '&' and return -1.
1087 static int
1088 ampersand(TokenSource* ts)
1089 {
1090 	int	savei;
1091 	int	c;
1092 	int	fnd;
1093 	int	ans;
1094 	int	v;
1095 	int	i;
1096 	int	k;
1097 	Rune	buf[SMALLBUFSIZE];
1098 
1099 	savei = ts->i;
1100 	c = getchar(ts);
1101 	fnd = 0;
1102 	ans = -1;
1103 	if(c == '#') {
1104 		c = getchar(ts);
1105 		v = 0;
1106 		while(c >= 0) {
1107 			if(!(c < 256 && isdigit(c)))
1108 				break;
1109 			v = v*10 + c - 48;
1110 			c = getchar(ts);
1111 		}
1112 		if(c >= 0) {
1113 			if(!(c == ';' || c == '\n' || c == '\r'))
1114 				ungetchar(ts, c);
1115 			c = v;
1116 			if(c == 160)
1117 				c = 160;
1118 			if(c >= Winstart && c <= Winend) {
1119 				c = winchars[c - Winstart];
1120 			}
1121 			ans = c;
1122 			fnd = 1;
1123 		}
1124 	}
1125 	else if(c < 256 && isalpha(c)) {
1126 		buf[0] = c;
1127 		k = 1;
1128 		while(1) {
1129 			c = getchar(ts);
1130 			if(c < 0)
1131 				break;
1132 			if(ISNAMCHAR(c)) {
1133 				if(k < SMALLBUFSIZE-1)
1134 					buf[k++] = c;
1135 			}
1136 			else {
1137 				if(!(c == ';' || c == '\n' || c == '\r'))
1138 					ungetchar(ts, c);
1139 				break;
1140 			}
1141 		}
1142 		if(c >= 0) {
1143 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1144 			if(!fnd) {
1145 				// Try prefixes of s
1146 				if(c == ';' || c == '\n' || c == '\r')
1147 					ungetchar(ts, c);
1148 				i = k;
1149 				while(--k > 0) {
1150 					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1151 					if(fnd) {
1152 						while(i > k) {
1153 							i--;
1154 							ungetchar(ts, buf[i]);
1155 						}
1156 						break;
1157 					}
1158 				}
1159 			}
1160 		}
1161 	}
1162 	if(!fnd) {
1163 		backup(ts, savei);
1164 		ans = '&';
1165 	}
1166 	return ans;
1167 }
1168 
1169 // Get next char, obeying ts.chset.
1170 // Returns -1 if no complete character left before current end of data.
1171 static int
1172 getchar(TokenSource* ts)
1173 {
1174 	uchar*	buf;
1175 	int	c;
1176 	int	n;
1177 	int	ok;
1178 	Rune	r;
1179 
1180 	if(ts->i >= ts->edata)
1181 		return -1;
1182 	buf = ts->data;
1183 	c = buf[ts->i];
1184 	switch(ts->chset) {
1185 	case ISO_8859_1:
1186 		if(c >= Winstart && c <= Winend)
1187 			c = winchars[c - Winstart];
1188 		ts->i++;
1189 		break;
1190 	case US_Ascii:
1191 		if(c > 127) {
1192 			if(warn)
1193 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1194 		}
1195 		ts->i++;
1196 		break;
1197 	case UTF_8:
1198 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1199 		n = chartorune(&r, (char*)(buf+ts->i));
1200 		if(ok) {
1201 			if(warn && c == 0x80)
1202 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1203 			ts->i += n;
1204 			c = r;
1205 		}
1206 		else {
1207 			// not enough bytes in buf to complete utf-8 char
1208 			ts->i = ts->edata;	// mark "all used"
1209 			c = -1;
1210 		}
1211 		break;
1212 	case Unicode:
1213 		if(ts->i < ts->edata - 1) {
1214 			//standards say most-significant byte first
1215 			c = (c << 8)|(buf[ts->i + 1]);
1216 			ts->i += 2;
1217 		}
1218 		else {
1219 			ts->i = ts->edata;	// mark "all used"
1220 			c = -1;
1221 		}
1222 		break;
1223 	}
1224 	return c;
1225 }
1226 
1227 // Assuming c was the last character returned by getchar, set
1228 // things up so that next getchar will get that same character
1229 // followed by the current 'next character', etc.
1230 static void
1231 ungetchar(TokenSource* ts, int c)
1232 {
1233 	int	n;
1234 	Rune	r;
1235 	char	a[UTFmax];
1236 
1237 	n = 1;
1238 	switch(ts->chset) {
1239 	case UTF_8:
1240 		if(c >= 128) {
1241 			r = c;
1242 			n = runetochar(a, &r);
1243 		}
1244 		break;
1245 	case Unicode:
1246 		n = 2;
1247 		break;
1248 	}
1249 	ts->i -= n;
1250 }
1251 
1252 // Restore ts so that it is at the state where the index was savei.
1253 static void
1254 backup(TokenSource* ts, int savei)
1255 {
1256 	if(dbglex)
1257 		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1258 	ts->i = savei;
1259 }
1260 
1261 
1262 // Look for value associated with attribute attid in token t.
1263 // If there is one, return 1 and put the value in *pans,
1264 // else return 0.
1265 // If xfer is true, transfer ownership of the string to the caller
1266 // (nil it out here); otherwise, caller must duplicate the answer
1267 // if it needs to save it.
1268 // OK to have pans==0, in which case this is just looking
1269 // to see if token is present.
1270 int
1271 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1272 {
1273 	Attr*	attr;
1274 
1275 	attr = t->attr;
1276 	while(attr != nil) {
1277 		if(attr->attid == attid) {
1278 			if(pans != nil)
1279 				*pans = attr->value;
1280 			if(xfer)
1281 				attr->value = nil;
1282 			return 1;
1283 		}
1284 		attr = attr->next;
1285 	}
1286 	if(pans != nil)
1287 		*pans = nil;
1288 	return 0;
1289 }
1290 
1291 static int
1292 Tconv(Fmt *f)
1293 {
1294 	Token*	t;
1295 	int	i;
1296 	int	tag;
1297 	char*	srbra;
1298 	Rune*	aname;
1299 	Rune*	tname;
1300 	Attr*	a;
1301 	char	buf[BIGBUFSIZE];
1302 
1303 	t = va_arg(f->args, Token*);
1304 	if(t == nil)
1305 		sprint(buf, "<null>");
1306 	else {
1307 		i = 0;
1308 		if(dbglex > 1)
1309 			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1310 		tag = t->tag;
1311 		if(tag == Data) {
1312 			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1313 		}
1314 		else {
1315 			srbra = "";
1316 			if(tag >= RBRA) {
1317 				tag -= RBRA;
1318 				srbra = "/";
1319 			}
1320 			tname = tagnames[tag];
1321 			if(tag == Notfound)
1322 				tname = L(Lquestion);
1323 			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1324 			for(a = t->attr; a != nil; a = a->next) {
1325 				aname = attrnames[a->attid];
1326 				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1327 				if(a->value != nil)
1328 					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1329 			}
1330 			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1331 		}
1332 		buf[i] = 0;
1333 	}
1334 	return fmtstrcpy(f, buf);
1335 }
1336 
1337 // Attrs own their constituent strings, but build may eventually
1338 // transfer some values to its items and nil them out in the Attr.
1339 static Attr*
1340 newattr(int attid, Rune* value, Attr* link)
1341 {
1342 	Attr* ans;
1343 
1344 	ans = (Attr*)emalloc(sizeof(Attr));
1345 	ans->attid = attid;
1346 	ans->value = value;
1347 	ans->next = link;
1348 	return ans;
1349 }
1350 
1351 // Free list of Attrs linked through next field
1352 static void
1353 freeattrs(Attr* ahead)
1354 {
1355 	Attr* a;
1356 	Attr* nexta;
1357 
1358 	a = ahead;
1359 	while(a != nil) {
1360 		nexta = a->next;
1361 		free(a->value);
1362 		free(a);
1363 		a = nexta;
1364 	}
1365 }
1366 
1367 // Free array of Tokens.
1368 // Allocated space might have room for more than n tokens,
1369 // but only n of them are initialized.
1370 // If caller has transferred ownership of constitutent strings
1371 // or attributes, it must have nil'd out the pointers in the Tokens.
1372 void
1373 _freetokens(Token* tarray, int n)
1374 {
1375 	int i;
1376 	Token* t;
1377 
1378 	if(tarray == nil)
1379 		return;
1380 	for(i = 0; i < n; i++) {
1381 		t = &tarray[i];
1382 		free(t->text);
1383 		freeattrs(t->attr);
1384 	}
1385 	free(tarray);
1386 }