op public repos

Blob

Date:: Tue Jan 4 22:20:21 2005 UTC
Message:: hide some routines
Actions:: History | Blame | Raw File
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
7 
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 	int			i;		// index of next byte to use
12 	uchar*		data;		// all the data
13 	int			edata;	// data[0:edata] is valid
14 	int			chset;	// one of US_Ascii, etc.
15 	int			mtype;	// TextHtml or TextPlain
16 };
17 
// Negative codes, outside the range of any character value.
// NOTE(review): presumably "end of buffer" / "end of file" markers;
// the routines visible in this file use a literal -1 instead.
enum {
	EOB = -1,
	EOF = -2
};
22 
// True when c (below 256) is a letter, a digit, '.' or '-': the
// characters accepted after the first one in tag, attribute and
// entity names.
#define ISNAMCHAR(c)	((c) < 256 && (isalnum(c) || (c) == '.' || (c) == '-'))

// Sizes, in elements, of the on-stack scratch buffers.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune **tagnames;
31 char *_tagnames[] = {
32 	" ",
33 	"!",
34 	"a", 
35 	"abbr",
36 	"acronym",
37 	"address",
38 	"applet", 
39 	"area",
40 	"b",
41 	"base",
42 	"basefont",
43 	"bdo",
44 	"big",
45 	"blink",
46 	"blockquote",
47 	"body",
48 	"bq",
49 	"br",
50 	"button",
51 	"caption",
52 	"center",
53 	"cite",
54 	"code",
55 	"col",
56 	"colgroup",
57 	"dd",
58 	"del",
59 	"dfn",
60 	"dir",
61 	"div",
62 	"dl",
63 	"dt",
64 	"em",
65 	"fieldset",
66 	"font",
67 	"form",
68 	"frame",
69 	"frameset",
70 	"h1",
71 	"h2",
72 	"h3",
73 	"h4",
74 	"h5",
75 	"h6",
76 	"head",
77 	"hr",
78 	"html",
79 	"i",
80 	"iframe",
81 	"img",
82 	"input",
83 	"ins",
84 	"isindex",
85 	"kbd",
86 	"label",
87 	"legend",
88 	"li",
89 	"link",
90 	"map",
91 	"menu",
92 	"meta",
93 	"nobr",
94 	"noframes",
95 	"noscript",
96 	"object",
97 	"ol",
98 	"optgroup",
99 	"option",
100 	"p",
101 	"param",
102 	"pre",
103 	"q",
104 	"s",
105 	"samp",
106 	"script",
107 	"select",
108 	"small",
109 	"span",
110 	"strike",
111 	"strong",
112 	"style",
113 	"sub",
114 	"sup",
115 	"table",
116 	"tbody",
117 	"td",
118 	"textarea",
119 	"tfoot",
120 	"th",
121 	"thead",
122 	"title",
123 	"tr",
124 	"tt",
125 	"u",
126 	"ul",
127 	"var"
128 };
129 
130 // HTML 4.0 attribute names.
131 // Keep sorted, and in correspondence with enum in i.h.
132 Rune **attrnames;
133 char* _attrnames[] = {
134 	"abbr",
135 	"accept-charset",
136 	"access-key",
137 	"action",
138 	"align",
139 	"alink",
140 	"alt",
141 	"archive",
142 	"axis",
143 	"background",
144 	"bgcolor",
145 	"border",
146 	"cellpadding",
147 	"cellspacing",
148 	"char",
149 	"charoff",
150 	"charset",
151 	"checked",
152 	"cite",
153 	"class",
154 	"classid",
155 	"clear",
156 	"code",
157 	"codebase",
158 	"codetype",
159 	"color",
160 	"cols",
161 	"colspan",
162 	"compact",
163 	"content",
164 	"coords",
165 	"data",
166 	"datetime",
167 	"declare",
168 	"defer",
169 	"dir",
170 	"disabled",
171 	"enctype",
172 	"face",
173 	"for",
174 	"frame",
175 	"frameborder",
176 	"headers",
177 	"height",
178 	"href",
179 	"hreflang",
180 	"hspace",
181 	"http-equiv",
182 	"id",
183 	"ismap",
184 	"label",
185 	"lang",
186 	"link",
187 	"longdesc",
188 	"marginheight",
189 	"marginwidth",
190 	"maxlength",
191 	"media",
192 	"method",
193 	"multiple",
194 	"name",
195 	"nohref",
196 	"noresize",
197 	"noshade",
198 	"nowrap",
199 	"object",
200 	"onblur",
201 	"onchange",
202 	"onclick",
203 	"ondblclick",
204 	"onfocus",
205 	"onkeypress",
206 	"onkeyup",
207 	"onload",
208 	"onmousedown",
209 	"onmousemove",
210 	"onmouseout",
211 	"onmouseover",
212 	"onmouseup",
213 	"onreset",
214 	"onselect",
215 	"onsubmit",
216 	"onunload",
217 	"profile",
218 	"prompt",
219 	"readonly",
220 	"rel",
221 	"rev",
222 	"rows",
223 	"rowspan",
224 	"rules",
225 	"scheme",
226 	"scope",
227 	"scrolling",
228 	"selected",
229 	"shape",
230 	"size",
231 	"span",
232 	"src",
233 	"standby",
234 	"start",
235 	"style",
236 	"summary",
237 	"tabindex",
238 	"target",
239 	"text",
240 	"title",
241 	"type",
242 	"usemap",
243 	"valign",
244 	"value",
245 	"valuetype",
246 	"version",
247 	"vlink",
248 	"vspace",
249 	"width"
250 };
251 
252 
253 // Character entity to unicode character number map.
254 // Keep sorted by name.
255 StringInt *chartab;
256 AsciiInt _chartab[142] = {
257 	{"AElig", 198},
258 	{"Aacute", 193},
259 	{"Acirc", 194},
260 	{"Agrave", 192},
261 	{"Aring", 197},
262 	{"Atilde", 195},
263 	{"Auml", 196},
264 	{"Ccedil", 199},
265 	{"ETH", 208},
266 	{"Eacute", 201},
267 	{"Ecirc", 202},
268 	{"Egrave", 200},
269 	{"Euml", 203},
270 	{"Iacute", 205},
271 	{"Icirc", 206},
272 	{"Igrave", 204},
273 	{"Iuml", 207},
274 	{"Ntilde", 209},
275 	{"Oacute", 211},
276 	{"Ocirc", 212},
277 	{"Ograve", 210},
278 	{"Oslash", 216},
279 	{"Otilde", 213},
280 	{"Ouml", 214},
281 	{"THORN", 222},
282 	{"Uacute", 218},
283 	{"Ucirc", 219},
284 	{"Ugrave", 217},
285 	{"Uuml", 220},
286 	{"Yacute", 221},
287 	{"aacute", 225},
288 	{"acirc", 226},
289 	{"acute", 180},
290 	{"aelig", 230},
291 	{"agrave", 224},
292 	{"alpha", 945},
293 	{"amp", 38},
294 	{"aring", 229},
295 	{"atilde", 227},
296 	{"auml", 228},
297 	{"beta", 946},
298 	{"brvbar", 166},
299 	{"ccedil", 231},
300 	{"cdots", 8943},
301 	{"cedil", 184},
302 	{"cent", 162},
303 	{"chi", 967},
304 	{"copy", 169},
305 	{"curren", 164},
306 	{"ddots", 8945},
307 	{"deg", 176},
308 	{"delta", 948},
309 	{"divide", 247},
310 	{"eacute", 233},
311 	{"ecirc", 234},
312 	{"egrave", 232},
313 	{"emdash", 8212},
314 	{"emsp", 8195},
315 	{"endash", 8211},
316 	{"ensp", 8194},
317 	{"epsilon", 949},
318 	{"eta", 951},
319 	{"eth", 240},
320 	{"euml", 235},
321 	{"frac12", 189},
322 	{"frac14", 188},
323 	{"frac34", 190},
324 	{"gamma", 947},
325 	{"gt", 62},
326 	{"iacute", 237},
327 	{"icirc", 238},
328 	{"iexcl", 161},
329 	{"igrave", 236},
330 	{"iota", 953},
331 	{"iquest", 191},
332 	{"iuml", 239},
333 	{"kappa", 954},
334 	{"lambda", 955},
335 	{"laquo", 171},
336 	{"ldots", 8230},
337 	{"lt", 60},
338 	{"macr", 175},
339 	{"micro", 181},
340 	{"middot", 183},
341 	{"mu", 956},
342 	{"nbsp", 160},
343 	{"not", 172},
344 	{"ntilde", 241},
345 	{"nu", 957},
346 	{"oacute", 243},
347 	{"ocirc", 244},
348 	{"ograve", 242},
349 	{"omega", 969},
350 	{"omicron", 959},
351 	{"ordf", 170},
352 	{"ordm", 186},
353 	{"oslash", 248},
354 	{"otilde", 245},
355 	{"ouml", 246},
356 	{"para", 182},
357 	{"phi", 966},
358 	{"pi", 960},
359 	{"plusmn", 177},
360 	{"pound", 163},
361 	{"psi", 968},
362 	{"quad", 8193},
363 	{"quot", 34},
364 	{"raquo", 187},
365 	{"reg", 174},
366 	{"rho", 961},
367 	{"sect", 167},
368 	{"shy", 173},
369 	{"sigma", 963},
370 	{"sp", 8194},
371 	{"sup1", 185},
372 	{"sup2", 178},
373 	{"sup3", 179},
374 	{"szlig", 223},
375 	{"tau", 964},
376 	{"theta", 952},
377 	{"thinsp", 8201},
378 	{"thorn", 254},
379 	{"times", 215},
380 	{"trade", 8482},
381 	{"uacute", 250},
382 	{"ucirc", 251},
383 	{"ugrave", 249},
384 	{"uml", 168},
385 	{"upsilon", 965},
386 	{"uuml", 252},
387 	{"varepsilon", 8712},
388 	{"varphi", 981},
389 	{"varpi", 982},
390 	{"varrho", 1009},
391 	{"vdots", 8942},
392 	{"vsigma", 962},
393 	{"vtheta", 977},
394 	{"xi", 958},
395 	{"yacute", 253},
396 	{"yen", 165},
397 	{"yuml", 255},
398 	{"zeta", 950}
399 };
// Number of entries in the entity table.  This has to be measured on
// the _chartab array: chartab is only a pointer, so sizeof(chartab)
// is the size of a pointer, not of the table.
#define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
401 
// Codes Winstart..Winend are the ones Windows interpolates into the
// Latin1 set.  They are not supposed to appear in HTML, but they do,
// so each is mapped to a Unicode value through winchars, indexed by
// (code - Winstart).  8226 is a bullet.
enum {
	Winstart = 127,
	Winend = 159
};

static int	winchars[] = {
	8226,							// 127
	8226, 8226, 8218,  402, 8222, 8230, 8224, 8225,	// 128-135
	 710, 8240,  352, 8249,  338, 8226, 8226, 8226,	// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,	// 144-151
	 732, 8482,  353, 8250,  339, 8226, 8226,  376	// 152-159
};
415 
416 static StringInt*	tagtable;		// initialized from tagnames
417 static StringInt*	attrtable;		// initialized from attrnames
418 
419 static void		lexinit();
420 static int		getplaindata(TokenSource* ts, Token* a, int* pai);
421 static int		getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
422 static int		getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
423 static int		gettag(TokenSource* ts, int starti, Token* a, int* pai);
424 static Rune*		buftostr(Rune* s, Rune* buf, int j);
425 static int		comment(TokenSource* ts);
426 static int		findstr(TokenSource* ts, Rune* s);
427 static int		ampersand(TokenSource* ts);
428 //static int		lowerc(int c);
429 static int		getchar(TokenSource* ts);
430 static void		ungetchar(TokenSource* ts, int c);
431 static void		backup(TokenSource* ts, int savei);
432 //static void		freeinsidetoken(Token* t);
433 static void		freeattrs(Attr* ahead);
434 static Attr*		newattr(int attid, Rune* value, Attr* link);
435 static int		Tconv(Fmt* f);
436 
437 int	dbglex = 0;
438 static int lexinited = 0;
439 
440 static void
441 lexinit(void)
442 {
443 	chartab = _cvtstringinttab(_chartab, nelem(_chartab));
444 	tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
445 	tagtable = _makestrinttab(tagnames, Numtags);
446 	attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
447 	attrtable = _makestrinttab(attrnames, Numattrs);
448 	fmtinstall('T', Tconv);
449 	lexinited = 1;
450 }
451 
452 static TokenSource*
453 newtokensource(uchar* data, int edata, int chset, int mtype)
454 {
455 	TokenSource*	ans;
456 
457 	assert(chset == US_Ascii || chset == ISO_8859_1 ||
458 			chset == UTF_8 || chset == Unicode);
459 	ans = (TokenSource*)emalloc(sizeof(TokenSource));
460 	ans->i = 0;
461 	ans->data = data;
462 	ans->edata = edata;
463 	ans->chset = chset;
464 	ans->mtype = mtype;
465 	return ans;
466 }
467 
468 enum {
469 	ToksChunk = 500
470 };
471 
472 // Call this to get the tokens.
473 //  The number of returned tokens is returned in *plen.
474 Token*
475 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
476 {
477 	TokenSource*	ts;
478 	Token*		a;
479 	int	alen;
480 	int	ai;
481 	int	starti;
482 	int	c;
483 	int	tag;
484 
485 	if(!lexinited)
486 		lexinit();
487 	ts = newtokensource(data, datalen, chset, mtype);
488 	alen = ToksChunk;
489 	a = (Token*)emalloc(alen * sizeof(Token));
490 	ai = 0;
491 	if(dbglex)
492 		fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
493 	if(ts->mtype == TextHtml) {
494 		for(;;) {
495 			if(ai == alen) {
496 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
497 				alen += ToksChunk;
498 			}
499 			starti = ts->i;
500 			c = getchar(ts);
501 			if(c < 0)
502 				break;
503 			if(c == '<') {
504 				tag = gettag(ts, starti, a, &ai);
505 				if(tag == Tscript) {
506 					// special rules for getting Data after....
507 					starti = ts->i;
508 					c = getchar(ts);
509 					tag = getscriptdata(ts, c, starti, a, &ai);
510 				}
511 			}
512 			else
513 				tag = getdata(ts, c, starti, a, &ai);
514 			if(tag == -1)
515 				break;
516 			else if(dbglex > 1 && tag != Comment)
517 				fprint(2, "lex: got token %T\n", &a[ai-1]);
518 		}
519 	}
520 	else {
521 		// plain text (non-html) tokens
522 		for(;;) {
523 			if(ai == alen) {
524 				a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
525 				alen += ToksChunk;
526 			}
527 			tag = getplaindata(ts, a, &ai);
528 			if(tag == -1)
529 				break;
530 			if(dbglex > 1)
531 				fprint(2, "lex: got token %T\n", &a[ai]);
532 		}
533 	}
534 	if(dbglex)
535 		fprint(2, "lex: returning %d tokens\n", ai);
536 	*plen = ai;
537 	if(ai == 0) 
538 		return nil;
539 	return a;
540 }
541 
542 // For case where source isn't HTML.
543 // Just make data tokens, one per line (or partial line,
544 // at end of buffer), ignoring non-whitespace control
545 // characters and dumping \r's.
546 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
547 // Otherwise return -1;
548 static int
549 getplaindata(TokenSource* ts, Token* a, int* pai)
550 {
551 	Rune*	s;
552 	int	j;
553 	int	starti;
554 	int	c;
555 	Token*	tok;
556 	Rune	buf[BIGBUFSIZE];
557 
558 	s = nil;
559 	j = 0;
560 	starti = ts->i;
561 	for(c = getchar(ts); c >= 0; c = getchar(ts)) {
562 		if(c < ' ') {
563 			if(isspace(c)) {
564 				if(c == '\r') {
565 					// ignore it unless no following '\n',
566 					// in which case treat it like '\n'
567 					c = getchar(ts);
568 					if(c != '\n') {
569 						if(c >= 0)
570 							ungetchar(ts, c);
571 						c = '\n';
572 					}
573 				}
574 			}
575 			else
576 				c = 0;
577 		}
578 		if(c != 0) {
579 			buf[j++] = c;
580 			if(j == sizeof(buf)-1) {
581 				s = buftostr(s, buf, j);
582 				j = 0;
583 			}
584 		}
585 		if(c == '\n')
586 			break;
587 	}
588 	s = buftostr(s, buf, j);
589 	if(s == nil)
590 		return -1;
591 	tok = &a[(*pai)++];
592 	tok->tag = Data;
593 	tok->text = s;
594 	tok->attr = nil;
595 	tok->starti = starti;
596 	return Data;
597 }
598 
599 // Return concatenation of s and buf[0:j]
600 static Rune*
601 buftostr(Rune* s, Rune* buf, int j)
602 {
603 	buf[j] = 0;
604 	if(s == nil)
605 		s = _Strndup(buf, j);
606 	else 
607 		s = _Strdup2(s, buf);
608 	return s;
609 }
610 
611 // Gather data up to next start-of-tag or end-of-buffer.
612 // Translate entity references (&amp;).
613 // Ignore non-whitespace control characters and get rid of \r's.
614 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
615 // Otherwise return -1;
616 static int
617 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
618 {
619 	Rune*	s;
620 	int	j;
621 	int	c;
622 	Token*	tok;
623 	Rune	buf[BIGBUFSIZE];
624 
625 	s = nil;
626 	j = 0;
627 	c = firstc;
628 	while(c >= 0) {
629 		if(c == '&') {
630 			c = ampersand(ts);
631 			if(c < 0)
632 				break;
633 		}
634 		else if(c < ' ') {
635 			if(isspace(c)) {
636 				if(c == '\r') {
637 					// ignore it unless no following '\n',
638 					// in which case treat it like '\n'
639 					c = getchar(ts);
640 					if(c != '\n') {
641 						if(c >= 0)
642 							ungetchar(ts, c);
643 						c = '\n';
644 					}
645 				}
646 			}
647 			else {
648 				if(warn)
649 					fprint(2, "warning: non-whitespace control character %d ignored\n", c);
650 				c = 0;
651 			}
652 		}
653 		else if(c == '<') {
654 			ungetchar(ts, c);
655 			break;
656 		}
657 		if(c != 0) {
658 			buf[j++] = c;
659 			if(j == BIGBUFSIZE-1) {
660 				s = buftostr(s, buf, j);
661 				j = 0;
662 			}
663 		}
664 		c = getchar(ts);
665 	}
666 	s = buftostr(s, buf, j);
667 	if(s == nil)
668 		return -1;
669 	tok = &a[(*pai)++];
670 	tok->tag = Data;
671 	tok->text = s;
672 	tok->attr = nil;
673 	tok->starti = starti;
674 	return Data;
675 }
676 
677 // The rules for lexing scripts are different (ugh).
678 // Gather up everything until see a </SCRIPT>.
679 static int
680 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
681 {
682 	Rune*	s;
683 	int	j;
684 	int	tstarti;
685 	int	savei;
686 	int	c;
687 	int	tag;
688 	int	done;
689 	Token*	tok;
690 	Rune	buf[BIGBUFSIZE];
691 
692 	s = nil;
693 	j = 0;
694 	tstarti = starti;
695 	c = firstc;
696 	done = 0;
697 	while(c >= 0) {
698 		if(c == '<') {
699 			// other browsers ignore stuff to end of line after <!
700 			savei = ts->i;
701 			c = getchar(ts);
702 			if(c == '!') {
703 				while(c >= 0 && c != '\n' && c != '\r')
704 					c = getchar(ts);
705 				if(c == '\r')
706 					c = getchar(ts);
707 				if(c == '\n')
708 					c = getchar(ts);
709 			}
710 			else if(c >= 0) {
711 				backup(ts, savei);
712 				tag = gettag(ts, tstarti, a, pai);
713 				if(tag == -1)
714 					break;
715 				if(tag != Comment)
716 					(*pai)--;
717 				backup(ts, tstarti);
718 				if(tag == Tscript + RBRA) {
719 					done = 1;
720 					break;
721 				}
722 				// here tag was not </SCRIPT>, so take as regular data
723 				c = getchar(ts);
724 			}
725 		}
726 		if(c < 0)
727 			break;
728 		if(c != 0) {
729 			buf[j++] = c;
730 			if(j == BIGBUFSIZE-1) {
731 				s = buftostr(s, buf, j);
732 				j = 0;
733 			}
734 		}
735 		tstarti = ts->i;
736 		c = getchar(ts);
737 	}
738 	if(done || ts->i == ts->edata) {
739 		s = buftostr(s, buf, j);
740 		tok = &a[(*pai)++];
741 		tok->tag = Data;
742 		tok->text = s;
743 		tok->attr = nil;
744 		tok->starti = starti;
745 		return Data;
746 	}
747 	backup(ts, starti);
748 	return -1;
749 }
750 
751 // We've just seen a '<'.  Gather up stuff to closing '>' (if buffer
752 // ends before then, return -1).
753 // If it's a tag, look up the name, gather the attributes, and return
754 // the appropriate token.
755 // Else it's either just plain data or some kind of ignorable stuff:
756 // return Data or Comment as appropriate.
757 // If it's not a Comment, put it in a[*pai] and bump *pai.
758 static int
759 gettag(TokenSource* ts, int starti, Token* a, int* pai)
760 {
761 	int	rbra;
762 	int	ans;
763 	Attr*	al;
764 	int	nexti;
765 	int	c;
766 	int	ti;
767 	int	afnd;
768 	int	attid;
769 	int	quote;
770 	Rune*	val;
771 	int	nv;
772 	int	i;
773 	int	tag;
774 	Token*	tok;
775 	Rune	buf[BIGBUFSIZE];
776 
777 	rbra = 0;
778 	nexti = ts->i;
779 	tok = &a[*pai];
780 	tok->tag = Notfound;
781 	tok->text = nil;
782 	tok->attr = nil;
783 	tok->starti = starti;
784 	c = getchar(ts);
785 	if(c == '/') {
786 		rbra = RBRA;
787 		c = getchar(ts);
788 	}
789 	if(c < 0)
790 		goto eob_done;
791 	if(c >= 256 || !isalpha(c)) {
792 		// not a tag
793 		if(c == '!') {
794 			ans = comment(ts);
795 			if(ans != -1)
796 				return ans;
797 			goto eob_done;
798 		}
799 		else {
800 			backup(ts, nexti);
801 			tok->tag = Data;
802 			tok->text = _Strdup(L(Llt));
803 			(*pai)++;
804 			return Data;
805 		}
806 	}
807 	// c starts a tagname
808 	buf[0] = c;
809 	i = 1;
810 	while(1) {
811 		c = getchar(ts);
812 		if(c < 0)
813 			goto eob_done;
814 		if(!ISNAMCHAR(c))
815 			break;
816 		// if name is bigger than buf it won't be found anyway...
817 		if(i < BIGBUFSIZE)
818 			buf[i++] = c;
819 	}
820 	if(_lookup(tagtable, Numtags, buf, i, &tag))
821 		tok->tag = tag + rbra;
822 	else
823 		tok->text = _Strndup(buf, i);	// for warning print, in build
824 
825 	// attribute gathering loop
826 	al = nil;
827 	while(1) {
828 		// look for "ws name" or "ws name ws = ws val"  (ws=whitespace)
829 		// skip whitespace
830 attrloop_continue:
831 		while(c < 256 && isspace(c)) {
832 			c = getchar(ts);
833 			if(c < 0)
834 				goto eob_done;
835 		}
836 		if(c == '>')
837 			goto attrloop_done;
838 		if(c == '<') {
839 			if(warn)
840 				fprint(2, "warning: unclosed tag\n");
841 			ungetchar(ts, c);
842 			goto attrloop_done;
843 		}
844 		if(c >= 256 || !isalpha(c)) {
845 			if(warn)
846 				fprint(2, "warning: expected attribute name\n");
847 			// skipt to next attribute name
848 			while(1) {
849 				c = getchar(ts);
850 				if(c < 0)
851 					goto eob_done;
852 				if(c < 256 && isalpha(c))
853 					goto attrloop_continue;
854 				if(c == '<') {
855 					if(warn)
856 						fprint(2, "warning: unclosed tag\n");
857 					ungetchar(ts, 60);
858 					goto attrloop_done;
859 				}
860 				if(c == '>')
861 					goto attrloop_done;
862 			}
863 		}
864 		// gather attribute name
865 		buf[0] = c;
866 		i = 1;
867 		while(1) {
868 			c = getchar(ts);
869 			if(c < 0)
870 				goto eob_done;
871 			if(!ISNAMCHAR(c))
872 				break;
873 			if(i < BIGBUFSIZE-1)
874 				buf[i++] = c;
875 		}
876 		afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
877 		if(warn && !afnd) {
878 			buf[i] = 0;
879 			fprint(2, "warning: unknown attribute name %S\n", buf);
880 		}
881 		// skip whitespace
882 		while(c < 256 && isspace(c)) {
883 			c = getchar(ts);
884 			if(c < 0)
885 				goto eob_done;
886 		}
887 		if(c != '=') {
888 			if(afnd)
889 				al = newattr(attid, nil, al);
890 			goto attrloop_continue;
891 		}
892 		//# c is '=' here;  skip whitespace
893 		while(1) {
894 			c = getchar(ts);
895 			if(c < 0)
896 				goto eob_done;
897 			if(c >= 256 || !isspace(c))
898 				break;
899 		}
900 		quote = 0;
901 		if(c == '\'' || c == '"') {
902 			quote = c;
903 			c = getchar(ts);
904 			if(c < 0)
905 				goto eob_done;
906 		}
907 		val = nil;
908 		nv = 0;
909 		while(1) {
910 valloop_continue:
911 			if(c < 0)
912 				goto eob_done;
913 			if(c == '>') {
914 				if(quote) {
915 					// c might be part of string (though not good style)
916 					// but if line ends before close quote, assume
917 					// there was an unmatched quote
918 					ti = ts->i;
919 					while(1) {
920 						c = getchar(ts);
921 						if(c < 0)
922 							goto eob_done;
923 						if(c == quote) {
924 							backup(ts, ti);
925 							buf[nv++] = '>';
926 							if(nv == BIGBUFSIZE-1) {
927 								val = buftostr(val, buf, nv);
928 								nv = 0;
929 							}
930 							c = getchar(ts);
931 							goto valloop_continue;
932 						}
933 						if(c == '\n') {
934 							if(warn)
935 								fprint(2, "warning: apparent unmatched quote\n");
936 							backup(ts, ti);
937 							c = '>';
938 							goto valloop_done;
939 						}
940 					}
941 				}
942 				else
943 					goto valloop_done;
944 			}
945 			if(quote) {
946 				if(c == quote) {
947 					c = getchar(ts);
948 					if(c < 0)
949 						goto eob_done;
950 					goto valloop_done;
951 				}
952 				if(c == '\r') {
953 					c = getchar(ts);
954 					goto valloop_continue;
955 				}
956 				if(c == '\t' || c == '\n')
957 					c = ' ';
958 			}
959 			else {
960 				if(c < 256 && isspace(c))
961 					goto valloop_done;
962 			}
963 			if(c == '&') {
964 				c = ampersand(ts);
965 				if(c == -1)
966 					goto eob_done;
967 			}
968 			buf[nv++] = c;
969 			if(nv == BIGBUFSIZE-1) {
970 				val = buftostr(val, buf, nv);
971 				nv = 0;
972 			}
973 			c = getchar(ts);
974 		}
975 valloop_done:
976 		if(afnd) {
977 			val = buftostr(val, buf, nv);
978 			al = newattr(attid, val, al);
979 		}
980 	}
981 
982 attrloop_done:
983 	tok->attr = al;
984 	(*pai)++;
985 	return tok->tag;
986 
987 eob_done:
988 	if(warn)
989 		fprint(2, "warning: incomplete tag at end of page\n");
990 	backup(ts, nexti);
991 	tok->tag = Data;
992 	tok->text = _Strdup(L(Llt));
993 	return Data;
994 }
995 
996 // We've just read a '<!' at position starti,
997 // so this may be a comment or other ignored section, or it may
998 // be just a literal string if there is no close before end of file
999 // (other browsers do that).
1000 // The accepted practice seems to be (note: contrary to SGML spec!):
1001 // If see <!--, look for --> to close, or if none, > to close.
1002 // If see <!(not --), look for > to close.
1003 // If no close before end of file, leave original characters in as literal data.
1004 //
1005 // If we see ignorable stuff, return Comment.
1006 // Else return nil (caller should back up and try again when more data arrives,
1007 // unless at end of file, in which case caller should just make '<' a data token).
1008 static int
1009 comment(TokenSource* ts)
1010 {
1011 	int	nexti;
1012 	int	havecomment;
1013 	int	c;
1014 
1015 	nexti = ts->i;
1016 	havecomment = 0;
1017 	c = getchar(ts);
1018 	if(c == '-') {
1019 		c = getchar(ts);
1020 		if(c == '-') {
1021 			if(findstr(ts, L(Larrow)))
1022 				havecomment = 1;
1023 			else
1024 				backup(ts, nexti);
1025 		}
1026 	}
1027 	if(!havecomment) {
1028 		if(c == '>')
1029 			havecomment = 1;
1030 		else if(c >= 0) {
1031 			if(findstr(ts, L(Lgt)))
1032 				havecomment = 1;
1033 		}
1034 	}
1035 	if(havecomment)
1036 		return Comment;
1037 	return -1;
1038 }
1039 
1040 // Look for string s in token source.
1041 // If found, return 1, with buffer at next char after s,
1042 // else return 0 (caller should back up).
1043 static int
1044 findstr(TokenSource* ts, Rune* s)
1045 {
1046 	int	c0;
1047 	int	n;
1048 	int	nexti;
1049 	int	i;
1050 	int	c;
1051 
1052 	c0 = s[0];
1053 	n = runestrlen(s);
1054 	while(1) {
1055 		c = getchar(ts);
1056 		if(c < 0)
1057 			break;
1058 		if(c == c0) {
1059 			if(n == 1)
1060 				return 1;
1061 			nexti = ts->i;
1062 			for(i = 1; i < n; i++) {
1063 				c = getchar(ts);
1064 				if(c < 0)
1065 					goto mainloop_done;
1066 				if(c != s[i])
1067 					break;
1068 			}
1069 			if(i == n)
1070 				return 1;
1071 			backup(ts, nexti);
1072 		}
1073 	}
1074 mainloop_done:
1075 	return 0;
1076 }
1077 
1078 // We've just read an '&'; look for an entity reference
1079 // name, and if found, return translated char.
1080 // if there is a complete entity name but it isn't known,
1081 // try prefixes (gets around some buggy HTML out there),
1082 // and if that fails, back up to just past the '&' and return '&'.
1083 // If the entity can't be completed in the current buffer, back up
1084 // to the '&' and return -1.
1085 static int
1086 ampersand(TokenSource* ts)
1087 {
1088 	int	savei;
1089 	int	c;
1090 	int	fnd;
1091 	int	ans;
1092 	int	v;
1093 	int	i;
1094 	int	k;
1095 	Rune	buf[SMALLBUFSIZE];
1096 
1097 	savei = ts->i;
1098 	c = getchar(ts);
1099 	fnd = 0;
1100 	ans = -1;
1101 	if(c == '#') {
1102 		c = getchar(ts);
1103 		v = 0;
1104 		while(c >= 0) {
1105 			if(!(c < 256 && isdigit(c)))
1106 				break;
1107 			v = v*10 + c - 48;
1108 			c = getchar(ts);
1109 		}
1110 		if(c >= 0) {
1111 			if(!(c == ';' || c == '\n' || c == '\r'))
1112 				ungetchar(ts, c);
1113 			c = v;
1114 			if(c == 160)
1115 				c = 160;
1116 			if(c >= Winstart && c <= Winend) {
1117 				c = winchars[c - Winstart];
1118 			}
1119 			ans = c;
1120 			fnd = 1;
1121 		}
1122 	}
1123 	else if(c < 256 && isalpha(c)) {
1124 		buf[0] = c;
1125 		k = 1;
1126 		while(1) {
1127 			c = getchar(ts);
1128 			if(c < 0)
1129 				break;
1130 			if(ISNAMCHAR(c)) {
1131 				if(k < SMALLBUFSIZE-1)
1132 					buf[k++] = c;
1133 			}
1134 			else {
1135 				if(!(c == ';' || c == '\n' || c == '\r'))
1136 					ungetchar(ts, c);
1137 				break;
1138 			}
1139 		}
1140 		if(c >= 0) {
1141 			fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1142 			if(!fnd) {
1143 				// Try prefixes of s
1144 				if(c == ';' || c == '\n' || c == '\r')
1145 					ungetchar(ts, c);
1146 				i = k;
1147 				while(--k > 0) {
1148 					fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1149 					if(fnd) {
1150 						while(i > k) {
1151 							i--;
1152 							ungetchar(ts, buf[i]);
1153 						}
1154 						break;
1155 					}
1156 				}
1157 			}
1158 		}
1159 	}
1160 	if(!fnd) {
1161 		backup(ts, savei);
1162 		ans = '&';
1163 	}
1164 	return ans;
1165 }
1166 
1167 // Get next char, obeying ts.chset.
1168 // Returns -1 if no complete character left before current end of data.
1169 static int
1170 getchar(TokenSource* ts)
1171 {
1172 	uchar*	buf;
1173 	int	c;
1174 	int	n;
1175 	int	ok;
1176 	Rune	r;
1177 
1178 	if(ts->i >= ts->edata)
1179 		return -1;
1180 	buf = ts->data;
1181 	c = buf[ts->i];
1182 	switch(ts->chset) {
1183 	case ISO_8859_1:
1184 		if(c >= Winstart && c <= Winend)
1185 			c = winchars[c - Winstart];
1186 		ts->i++;
1187 		break;
1188 	case US_Ascii:
1189 		if(c > 127) {
1190 			if(warn)
1191 				fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1192 		}
1193 		ts->i++;
1194 		break;
1195 	case UTF_8:
1196 		ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1197 		n = chartorune(&r, (char*)(buf+ts->i));
1198 		if(ok) {
1199 			if(warn && c == 0x80)
1200 				fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1201 			ts->i += n;
1202 			c = r;
1203 		}
1204 		else {
1205 			// not enough bytes in buf to complete utf-8 char
1206 			ts->i = ts->edata;	// mark "all used"
1207 			c = -1;
1208 		}
1209 		break;
1210 	case Unicode:
1211 		if(ts->i < ts->edata - 1) {
1212 			//standards say most-significant byte first
1213 			c = (c << 8)|(buf[ts->i + 1]);
1214 			ts->i += 2;
1215 		}
1216 		else {
1217 			ts->i = ts->edata;	// mark "all used"
1218 			c = -1;
1219 		}
1220 		break;
1221 	}
1222 	return c;
1223 }
1224 
1225 // Assuming c was the last character returned by getchar, set
1226 // things up so that next getchar will get that same character
1227 // followed by the current 'next character', etc.
1228 static void
1229 ungetchar(TokenSource* ts, int c)
1230 {
1231 	int	n;
1232 	Rune	r;
1233 	char	a[UTFmax];
1234 
1235 	n = 1;
1236 	switch(ts->chset) {
1237 	case UTF_8:
1238 		if(c >= 128) {
1239 			r = c;
1240 			n = runetochar(a, &r);
1241 		}
1242 		break;
1243 	case Unicode:
1244 		n = 2;
1245 		break;
1246 	}
1247 	ts->i -= n;
1248 }
1249 
1250 // Restore ts so that it is at the state where the index was savei.
1251 static void
1252 backup(TokenSource* ts, int savei)
1253 {
1254 	if(dbglex)
1255 		fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1256 	ts->i = savei;
1257 }
1258 
1259 
1260 // Look for value associated with attribute attid in token t.
1261 // If there is one, return 1 and put the value in *pans,
1262 // else return 0.
1263 // If xfer is true, transfer ownership of the string to the caller
1264 // (nil it out here); otherwise, caller must duplicate the answer
1265 // if it needs to save it.
1266 // OK to have pans==0, in which case this is just looking
1267 // to see if token is present.
1268 int
1269 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1270 {
1271 	Attr*	attr;
1272 
1273 	attr = t->attr;
1274 	while(attr != nil) {
1275 		if(attr->attid == attid) {
1276 			if(pans != nil)
1277 				*pans = attr->value;
1278 			if(xfer)
1279 				attr->value = nil;
1280 			return 1;
1281 		}
1282 		attr = attr->next;
1283 	}
1284 	if(pans != nil)
1285 		*pans = nil;
1286 	return 0;
1287 }
1288 
1289 static int
1290 Tconv(Fmt *f)
1291 {
1292 	Token*	t;
1293 	int	i;
1294 	int	tag;
1295 	char*	srbra;
1296 	Rune*	aname;
1297 	Rune*	tname;
1298 	Attr*	a;
1299 	char	buf[BIGBUFSIZE];
1300 
1301 	t = va_arg(f->args, Token*);
1302 	if(t == nil)
1303 		sprint(buf, "<null>");
1304 	else {
1305 		i = 0;
1306 		if(dbglex > 1)
1307 			i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1308 		tag = t->tag;
1309 		if(tag == Data) {
1310 			i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1311 		}
1312 		else {
1313 			srbra = "";
1314 			if(tag >= RBRA) {
1315 				tag -= RBRA;
1316 				srbra = "/";
1317 			}
1318 			tname = tagnames[tag];
1319 			if(tag == Notfound)
1320 				tname = L(Lquestion);
1321 			i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1322 			for(a = t->attr; a != nil; a = a->next) {
1323 				aname = attrnames[a->attid];
1324 				i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1325 				if(a->value != nil)
1326 					i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1327 			}
1328 			i += snprint(buf+i, sizeof(buf)-i-1, ">");
1329 		}
1330 		buf[i] = 0;
1331 	}
1332 	return fmtstrcpy(f, buf);
1333 }
1334 
1335 // Attrs own their constituent strings, but build may eventually
1336 // transfer some values to its items and nil them out in the Attr.
1337 static Attr*
1338 newattr(int attid, Rune* value, Attr* link)
1339 {
1340 	Attr* ans;
1341 
1342 	ans = (Attr*)emalloc(sizeof(Attr));
1343 	ans->attid = attid;
1344 	ans->value = value;
1345 	ans->next = link;
1346 	return ans;
1347 }
1348 
1349 // Free list of Attrs linked through next field
1350 static void
1351 freeattrs(Attr* ahead)
1352 {
1353 	Attr* a;
1354 	Attr* nexta;
1355 
1356 	a = ahead;
1357 	while(a != nil) {
1358 		nexta = a->next;
1359 		free(a->value);
1360 		free(a);
1361 		a = nexta;
1362 	}
1363 }
1364 
1365 // Free array of Tokens.
1366 // Allocated space might have room for more than n tokens,
1367 // but only n of them are initialized.
1368 // If caller has transferred ownership of constitutent strings
1369 // or attributes, it must have nil'd out the pointers in the Tokens.
1370 void
1371 _freetokens(Token* tarray, int n)
1372 {
1373 	int i;
1374 	Token* t;
1375 
1376 	if(tarray == nil)
1377 		return;
1378 	for(i = 0; i < n; i++) {
1379 		t = &tarray[i];
1380 		free(t->text);
1381 		freeattrs(t->attr);
1382 	}
1383 	free(tarray);
1384 }