Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 int i; // index of next byte to use
12 uchar* data; // all the data
13 int edata; // data[0:edata] is valid
14 int chset; // one of US_Ascii, etc.
15 int mtype; // TextHtml or TextPlain
16 };
// Sentinel values.  Nothing in the visible part of this file refers
// to them by name; getchar reports exhausted input with a literal -1.
enum {
	EOF = -2,
	EOB = -1
};
// True if c may appear inside a tag, attribute or entity name:
// a letter, a digit, '-' or '.'.  Values of 256 and above are
// rejected before the ctype tests are applied.
#define ISNAMCHAR(c)	((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

// Sizes, in Runes, of the stack buffers used while lexing.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune **tagnames;
31 char *_tagnames[] = {
32 " ",
33 "!",
34 "a",
35 "abbr",
36 "acronym",
37 "address",
38 "applet",
39 "area",
40 "b",
41 "base",
42 "basefont",
43 "bdo",
44 "big",
45 "blink",
46 "blockquote",
47 "body",
48 "bq",
49 "br",
50 "button",
51 "caption",
52 "center",
53 "cite",
54 "code",
55 "col",
56 "colgroup",
57 "dd",
58 "del",
59 "dfn",
60 "dir",
61 "div",
62 "dl",
63 "dt",
64 "em",
65 "fieldset",
66 "font",
67 "form",
68 "frame",
69 "frameset",
70 "h1",
71 "h2",
72 "h3",
73 "h4",
74 "h5",
75 "h6",
76 "head",
77 "hr",
78 "html",
79 "i",
80 "iframe",
81 "img",
82 "input",
83 "ins",
84 "isindex",
85 "kbd",
86 "label",
87 "legend",
88 "li",
89 "link",
90 "map",
91 "menu",
92 "meta",
93 "nobr",
94 "noframes",
95 "noscript",
96 "object",
97 "ol",
98 "optgroup",
99 "option",
100 "p",
101 "param",
102 "pre",
103 "q",
104 "s",
105 "samp",
106 "script",
107 "select",
108 "small",
109 "span",
110 "strike",
111 "strong",
112 "style",
113 "sub",
114 "sup",
115 "table",
116 "tbody",
117 "td",
118 "textarea",
119 "tfoot",
120 "th",
121 "thead",
122 "title",
123 "tr",
124 "tt",
125 "u",
126 "ul",
127 "var"
128 };
130 // HTML 4.0 attribute names.
131 // Keep sorted, and in correspondence with enum in i.h.
132 Rune **attrnames;
133 char* _attrnames[] = {
134 "abbr",
135 "accept-charset",
136 "access-key",
137 "action",
138 "align",
139 "alink",
140 "alt",
141 "archive",
142 "axis",
143 "background",
144 "bgcolor",
145 "border",
146 "cellpadding",
147 "cellspacing",
148 "char",
149 "charoff",
150 "charset",
151 "checked",
152 "cite",
153 "class",
154 "classid",
155 "clear",
156 "code",
157 "codebase",
158 "codetype",
159 "color",
160 "cols",
161 "colspan",
162 "compact",
163 "content",
164 "coords",
165 "data",
166 "datetime",
167 "declare",
168 "defer",
169 "dir",
170 "disabled",
171 "enctype",
172 "face",
173 "for",
174 "frame",
175 "frameborder",
176 "headers",
177 "height",
178 "href",
179 "hreflang",
180 "hspace",
181 "http-equiv",
182 "id",
183 "ismap",
184 "label",
185 "lang",
186 "link",
187 "longdesc",
188 "marginheight",
189 "marginwidth",
190 "maxlength",
191 "media",
192 "method",
193 "multiple",
194 "name",
195 "nohref",
196 "noresize",
197 "noshade",
198 "nowrap",
199 "object",
200 "onblur",
201 "onchange",
202 "onclick",
203 "ondblclick",
204 "onfocus",
205 "onkeypress",
206 "onkeyup",
207 "onload",
208 "onmousedown",
209 "onmousemove",
210 "onmouseout",
211 "onmouseover",
212 "onmouseup",
213 "onreset",
214 "onselect",
215 "onsubmit",
216 "onunload",
217 "profile",
218 "prompt",
219 "readonly",
220 "rel",
221 "rev",
222 "rows",
223 "rowspan",
224 "rules",
225 "scheme",
226 "scope",
227 "scrolling",
228 "selected",
229 "shape",
230 "size",
231 "span",
232 "src",
233 "standby",
234 "start",
235 "style",
236 "summary",
237 "tabindex",
238 "target",
239 "text",
240 "title",
241 "type",
242 "usemap",
243 "valign",
244 "value",
245 "valuetype",
246 "version",
247 "vlink",
248 "vspace",
249 "width"
250 };
253 // Character entity to unicode character number map.
254 // Keep sorted by name.
255 StringInt *chartab;
256 AsciiInt _chartab[] = {
257 {"AElig", 198},
258 {"Aacute", 193},
259 {"Acirc", 194},
260 {"Agrave", 192},
261 {"Aring", 197},
262 {"Atilde", 195},
263 {"Auml", 196},
264 {"Ccedil", 199},
265 {"ETH", 208},
266 {"Eacute", 201},
267 {"Ecirc", 202},
268 {"Egrave", 200},
269 {"Euml", 203},
270 {"Iacute", 205},
271 {"Icirc", 206},
272 {"Igrave", 204},
273 {"Iuml", 207},
274 {"Ntilde", 209},
275 {"Oacute", 211},
276 {"Ocirc", 212},
277 {"Ograve", 210},
278 {"Oslash", 216},
279 {"Otilde", 213},
280 {"Ouml", 214},
281 {"THORN", 222},
282 {"Uacute", 218},
283 {"Ucirc", 219},
284 {"Ugrave", 217},
285 {"Uuml", 220},
286 {"Yacute", 221},
287 {"aacute", 225},
288 {"acirc", 226},
289 {"acute", 180},
290 {"aelig", 230},
291 {"agrave", 224},
292 {"alpha", 945},
293 {"amp", 38},
294 {"aring", 229},
295 {"atilde", 227},
296 {"auml", 228},
297 {"beta", 946},
298 {"brvbar", 166},
299 {"ccedil", 231},
300 {"cdots", 8943},
301 {"cedil", 184},
302 {"cent", 162},
303 {"chi", 967},
304 {"copy", 169},
305 {"curren", 164},
306 {"ddots", 8945},
307 {"deg", 176},
308 {"delta", 948},
309 {"divide", 247},
310 {"eacute", 233},
311 {"ecirc", 234},
312 {"egrave", 232},
313 {"emdash", 8212}, /* non-standard but commonly used */
314 {"emsp", 8195},
315 {"endash", 8211}, /* non-standard but commonly used */
316 {"ensp", 8194},
317 {"epsilon", 949},
318 {"eta", 951},
319 {"eth", 240},
320 {"euml", 235},
321 {"frac12", 189},
322 {"frac14", 188},
323 {"frac34", 190},
324 {"gamma", 947},
325 {"gt", 62},
326 {"iacute", 237},
327 {"icirc", 238},
328 {"iexcl", 161},
329 {"igrave", 236},
330 {"iota", 953},
331 {"iquest", 191},
332 {"iuml", 239},
333 {"kappa", 954},
334 {"lambda", 955},
335 {"laquo", 171},
336 {"ldots", 8230},
337 {"lt", 60},
338 {"macr", 175},
339 {"mdash", 8212},
340 {"micro", 181},
341 {"middot", 183},
342 {"mu", 956},
343 {"nbsp", 160},
344 {"ndash", 8211},
345 {"not", 172},
346 {"ntilde", 241},
347 {"nu", 957},
348 {"oacute", 243},
349 {"ocirc", 244},
350 {"ograve", 242},
351 {"omega", 969},
352 {"omicron", 959},
353 {"ordf", 170},
354 {"ordm", 186},
355 {"oslash", 248},
356 {"otilde", 245},
357 {"ouml", 246},
358 {"para", 182},
359 {"phi", 966},
360 {"pi", 960},
361 {"plusmn", 177},
362 {"pound", 163},
363 {"psi", 968},
364 {"quad", 8193},
365 {"quot", 34},
366 {"raquo", 187},
367 {"reg", 174},
368 {"rho", 961},
369 {"sect", 167},
370 {"shy", 173},
371 {"sigma", 963},
372 {"sp", 8194},
373 {"sup1", 185},
374 {"sup2", 178},
375 {"sup3", 179},
376 {"szlig", 223},
377 {"tau", 964},
378 {"theta", 952},
379 {"thinsp", 8201},
380 {"thorn", 254},
381 {"times", 215},
382 {"trade", 8482},
383 {"uacute", 250},
384 {"ucirc", 251},
385 {"ugrave", 249},
386 {"uml", 168},
387 {"upsilon", 965},
388 {"uuml", 252},
389 {"varepsilon", 8712},
390 {"varphi", 981},
391 {"varpi", 982},
392 {"varrho", 1009},
393 {"vdots", 8942},
394 {"vsigma", 962},
395 {"vtheta", 977},
396 {"xi", 958},
397 {"yacute", 253},
398 {"yen", 165},
399 {"yuml", 255},
400 {"zeta", 950}
401 };
402 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
// Characters Winstart..Winend are those that Windows
// uses interpolated into the Latin1 set.
// They aren't supposed to appear in HTML, but they do....
enum {
	Winstart = 127,
	Winend = 159
};

// Replacement code point for each value in Winstart..Winend,
// indexed by (c - Winstart).  8226 (a bullet) fills the slots
// that have no better mapping.
static int winchars[] = {
	8226,
	8226, 8226, 8218,  402, 8222, 8230, 8224, 8225,
	 710, 8240,  352, 8249,  338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	 732, 8482,  353, 8250,  339, 8226, 8226,  376
};
418 static StringInt* tagtable; // initialized from tagnames
419 static StringInt* attrtable; // initialized from attrnames
421 static void lexinit(void);
422 static int getplaindata(TokenSource* ts, Token* a, int* pai);
423 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
424 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
425 static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
426 static Rune* buftostr(Rune* s, Rune* buf, int j);
427 static int comment(TokenSource* ts);
428 static int findstr(TokenSource* ts, Rune* s);
429 static int ampersand(TokenSource* ts);
430 //static int lowerc(int c);
431 static int getchar(TokenSource* ts);
432 static void ungetchar(TokenSource* ts, int c);
433 static void backup(TokenSource* ts, int savei);
434 //static void freeinsidetoken(Token* t);
435 static void freeattrs(Attr* ahead);
436 static Attr* newattr(int attid, Rune* value, Attr* link);
437 static int Tconv(Fmt* f);
439 int dbglex = 0;
440 static int lexinited = 0;
442 static void
443 lexinit(void)
445 chartab = _cvtstringinttab(_chartab, nelem(_chartab));
446 tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
447 tagtable = _makestrinttab(tagnames, Numtags);
448 attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
449 attrtable = _makestrinttab(attrnames, Numattrs);
450 fmtinstall('T', Tconv);
451 lexinited = 1;
454 static TokenSource*
455 newtokensource(uchar* data, int edata, int chset, int mtype)
457 TokenSource* ans;
459 assert(chset == US_Ascii || chset == ISO_8859_1 ||
460 chset == UTF_8 || chset == Unicode);
461 ans = (TokenSource*)emalloc(sizeof(TokenSource));
462 ans->i = 0;
463 ans->data = data;
464 ans->edata = edata;
465 ans->chset = chset;
466 ans->mtype = mtype;
467 return ans;
// Number of Token slots by which _gettoks sizes and grows its array.
enum {
	ToksChunk = 500
};
474 // Call this to get the tokens.
475 // The number of returned tokens is returned in *plen.
476 Token*
477 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
479 TokenSource* ts;
480 Token* a;
481 int alen;
482 int ai;
483 int starti;
484 int c;
485 int tag;
487 if(!lexinited)
488 lexinit();
489 ts = newtokensource(data, datalen, chset, mtype);
490 alen = ToksChunk;
491 a = (Token*)emalloc(alen * sizeof(Token));
492 ai = 0;
493 if(dbglex)
494 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
495 if(ts->mtype == TextHtml) {
496 for(;;) {
497 if(ai == alen) {
498 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
499 alen += ToksChunk;
501 starti = ts->i;
502 c = getchar(ts);
503 if(c < 0)
504 break;
505 if(c == '<') {
506 tag = gettag(ts, starti, a, &ai);
507 if(tag == Tscript) {
508 // special rules for getting Data after....
509 starti = ts->i;
510 c = getchar(ts);
511 tag = getscriptdata(ts, c, starti, a, &ai);
514 else
515 tag = getdata(ts, c, starti, a, &ai);
516 if(tag == -1)
517 break;
518 else if(dbglex > 1 && tag != Comment)
519 fprint(2, "lex: got token %T\n", &a[ai-1]);
522 else {
523 // plain text (non-html) tokens
524 for(;;) {
525 if(ai == alen) {
526 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
527 alen += ToksChunk;
529 tag = getplaindata(ts, a, &ai);
530 if(tag == -1)
531 break;
532 if(dbglex > 1)
533 fprint(2, "lex: got token %T\n", &a[ai]);
536 if(dbglex)
537 fprint(2, "lex: returning %d tokens\n", ai);
538 *plen = ai;
539 if(ai == 0)
540 return nil;
541 return a;
544 // For case where source isn't HTML.
545 // Just make data tokens, one per line (or partial line,
546 // at end of buffer), ignoring non-whitespace control
547 // characters and dumping \r's.
548 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
549 // Otherwise return -1;
550 static int
551 getplaindata(TokenSource* ts, Token* a, int* pai)
553 Rune* s;
554 int j;
555 int starti;
556 int c;
557 Token* tok;
558 Rune buf[BIGBUFSIZE];
560 s = nil;
561 j = 0;
562 starti = ts->i;
563 for(c = getchar(ts); c >= 0; c = getchar(ts)) {
564 if(c < ' ') {
565 if(isspace(c)) {
566 if(c == '\r') {
567 // ignore it unless no following '\n',
568 // in which case treat it like '\n'
569 c = getchar(ts);
570 if(c != '\n') {
571 if(c >= 0)
572 ungetchar(ts, c);
573 c = '\n';
577 else
578 c = 0;
580 if(c != 0) {
581 buf[j++] = c;
582 if(j == sizeof(buf)-1) {
583 s = buftostr(s, buf, j);
584 j = 0;
587 if(c == '\n')
588 break;
590 s = buftostr(s, buf, j);
591 if(s == nil)
592 return -1;
593 tok = &a[(*pai)++];
594 tok->tag = Data;
595 tok->text = s;
596 tok->attr = nil;
597 tok->starti = starti;
598 return Data;
601 // Return concatenation of s and buf[0:j]
602 static Rune*
603 buftostr(Rune* s, Rune* buf, int j)
605 buf[j] = 0;
606 if(s == nil)
607 s = _Strndup(buf, j);
608 else
609 s = _Strdup2(s, buf);
610 return s;
613 // Gather data up to next start-of-tag or end-of-buffer.
614 // Translate entity references (&amp;).
615 // Ignore non-whitespace control characters and get rid of \r's.
616 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
617 // Otherwise return -1;
618 static int
619 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
621 Rune* s;
622 int j;
623 int c;
624 Token* tok;
625 Rune buf[BIGBUFSIZE];
627 s = nil;
628 j = 0;
629 c = firstc;
630 while(c >= 0) {
631 if(c == '&') {
632 c = ampersand(ts);
633 if(c < 0)
634 break;
636 else if(c < ' ') {
637 if(isspace(c)) {
638 if(c == '\r') {
639 // ignore it unless no following '\n',
640 // in which case treat it like '\n'
641 c = getchar(ts);
642 if(c != '\n') {
643 if(c >= 0)
644 ungetchar(ts, c);
645 c = '\n';
649 else {
650 if(warn)
651 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
652 c = 0;
655 else if(c == '<') {
656 ungetchar(ts, c);
657 break;
659 if(c != 0) {
660 buf[j++] = c;
661 if(j == BIGBUFSIZE-1) {
662 s = buftostr(s, buf, j);
663 j = 0;
666 c = getchar(ts);
668 s = buftostr(s, buf, j);
669 if(s == nil)
670 return -1;
671 tok = &a[(*pai)++];
672 tok->tag = Data;
673 tok->text = s;
674 tok->attr = nil;
675 tok->starti = starti;
676 return Data;
679 // The rules for lexing scripts are different (ugh).
680 // Gather up everything until see a </SCRIPT>.
681 static int
682 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
684 Rune* s;
685 int j;
686 int tstarti;
687 int savei;
688 int c;
689 int tag;
690 int done;
691 Token* tok;
692 Rune buf[BIGBUFSIZE];
694 s = nil;
695 j = 0;
696 tstarti = starti;
697 c = firstc;
698 done = 0;
699 while(c >= 0) {
700 if(c == '<') {
701 // other browsers ignore stuff to end of line after <!
702 savei = ts->i;
703 c = getchar(ts);
704 if(c == '!') {
705 while(c >= 0 && c != '\n' && c != '\r')
706 c = getchar(ts);
707 if(c == '\r')
708 c = getchar(ts);
709 if(c == '\n')
710 c = getchar(ts);
712 else if(c >= 0) {
713 backup(ts, savei);
714 tag = gettag(ts, tstarti, a, pai);
715 if(tag == -1)
716 break;
717 if(tag != Comment)
718 (*pai)--;
719 backup(ts, tstarti);
720 if(tag == Tscript + RBRA) {
721 done = 1;
722 break;
724 // here tag was not </SCRIPT>, so take as regular data
725 c = getchar(ts);
728 if(c < 0)
729 break;
730 if(c != 0) {
731 buf[j++] = c;
732 if(j == BIGBUFSIZE-1) {
733 s = buftostr(s, buf, j);
734 j = 0;
737 tstarti = ts->i;
738 c = getchar(ts);
740 if(done || ts->i == ts->edata) {
741 s = buftostr(s, buf, j);
742 tok = &a[(*pai)++];
743 tok->tag = Data;
744 tok->text = s;
745 tok->attr = nil;
746 tok->starti = starti;
747 return Data;
749 backup(ts, starti);
750 return -1;
753 // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
754 // ends before then, return -1).
755 // If it's a tag, look up the name, gather the attributes, and return
756 // the appropriate token.
757 // Else it's either just plain data or some kind of ignorable stuff:
758 // return Data or Comment as appropriate.
759 // If it's not a Comment, put it in a[*pai] and bump *pai.
760 static int
761 gettag(TokenSource* ts, int starti, Token* a, int* pai)
763 int rbra;
764 int ans;
765 Attr* al;
766 int nexti;
767 int c;
768 int ti;
769 int afnd;
770 int attid;
771 int quote;
772 Rune* val;
773 int nv;
774 int i;
775 int tag;
776 Token* tok;
777 Rune buf[BIGBUFSIZE];
779 rbra = 0;
780 nexti = ts->i;
781 tok = &a[*pai];
782 tok->tag = Notfound;
783 tok->text = nil;
784 tok->attr = nil;
785 tok->starti = starti;
786 c = getchar(ts);
787 if(c == '/') {
788 rbra = RBRA;
789 c = getchar(ts);
791 if(c < 0)
792 goto eob_done;
793 if(c >= 256 || !isalpha(c)) {
794 // not a tag
795 if(c == '!') {
796 ans = comment(ts);
797 if(ans != -1)
798 return ans;
799 goto eob_done;
801 else {
802 backup(ts, nexti);
803 tok->tag = Data;
804 tok->text = _Strdup(L(Llt));
805 (*pai)++;
806 return Data;
809 // c starts a tagname
810 buf[0] = c;
811 i = 1;
812 while(1) {
813 c = getchar(ts);
814 if(c < 0)
815 goto eob_done;
816 if(!ISNAMCHAR(c))
817 break;
818 // if name is bigger than buf it won't be found anyway...
819 if(i < BIGBUFSIZE)
820 buf[i++] = c;
822 if(_lookup(tagtable, Numtags, buf, i, &tag))
823 tok->tag = tag + rbra;
824 else
825 tok->text = _Strndup(buf, i); // for warning print, in build
827 // attribute gathering loop
828 al = nil;
829 while(1) {
830 // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
831 // skip whitespace
832 attrloop_continue:
833 while(c < 256 && isspace(c)) {
834 c = getchar(ts);
835 if(c < 0)
836 goto eob_done;
838 if(c == '>')
839 goto attrloop_done;
840 if(c == '<') {
841 if(warn)
842 fprint(2, "warning: unclosed tag\n");
843 ungetchar(ts, c);
844 goto attrloop_done;
846 if(c >= 256 || !isalpha(c)) {
847 if(warn)
848 fprint(2, "warning: expected attribute name\n");
849 // skipt to next attribute name
850 while(1) {
851 c = getchar(ts);
852 if(c < 0)
853 goto eob_done;
854 if(c < 256 && isalpha(c))
855 goto attrloop_continue;
856 if(c == '<') {
857 if(warn)
858 fprint(2, "warning: unclosed tag\n");
859 ungetchar(ts, 60);
860 goto attrloop_done;
862 if(c == '>')
863 goto attrloop_done;
866 // gather attribute name
867 buf[0] = c;
868 i = 1;
869 while(1) {
870 c = getchar(ts);
871 if(c < 0)
872 goto eob_done;
873 if(!ISNAMCHAR(c))
874 break;
875 if(i < BIGBUFSIZE-1)
876 buf[i++] = c;
878 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
879 if(warn && !afnd) {
880 buf[i] = 0;
881 fprint(2, "warning: unknown attribute name %S\n", buf);
883 // skip whitespace
884 while(c < 256 && isspace(c)) {
885 c = getchar(ts);
886 if(c < 0)
887 goto eob_done;
889 if(c != '=') {
890 if(afnd)
891 al = newattr(attid, nil, al);
892 goto attrloop_continue;
894 //# c is '=' here; skip whitespace
895 while(1) {
896 c = getchar(ts);
897 if(c < 0)
898 goto eob_done;
899 if(c >= 256 || !isspace(c))
900 break;
902 quote = 0;
903 if(c == '\'' || c == '"') {
904 quote = c;
905 c = getchar(ts);
906 if(c < 0)
907 goto eob_done;
909 val = nil;
910 nv = 0;
911 while(1) {
912 valloop_continue:
913 if(c < 0)
914 goto eob_done;
915 if(c == '>') {
916 if(quote) {
917 // c might be part of string (though not good style)
918 // but if line ends before close quote, assume
919 // there was an unmatched quote
920 ti = ts->i;
921 while(1) {
922 c = getchar(ts);
923 if(c < 0)
924 goto eob_done;
925 if(c == quote) {
926 backup(ts, ti);
927 buf[nv++] = '>';
928 if(nv == BIGBUFSIZE-1) {
929 val = buftostr(val, buf, nv);
930 nv = 0;
932 c = getchar(ts);
933 goto valloop_continue;
935 if(c == '\n') {
936 if(warn)
937 fprint(2, "warning: apparent unmatched quote\n");
938 backup(ts, ti);
939 c = '>';
940 goto valloop_done;
944 else
945 goto valloop_done;
947 if(quote) {
948 if(c == quote) {
949 c = getchar(ts);
950 if(c < 0)
951 goto eob_done;
952 goto valloop_done;
954 if(c == '\r') {
955 c = getchar(ts);
956 goto valloop_continue;
958 if(c == '\t' || c == '\n')
959 c = ' ';
961 else {
962 if(c < 256 && isspace(c))
963 goto valloop_done;
965 if(c == '&') {
966 c = ampersand(ts);
967 if(c == -1)
968 goto eob_done;
970 buf[nv++] = c;
971 if(nv == BIGBUFSIZE-1) {
972 val = buftostr(val, buf, nv);
973 nv = 0;
975 c = getchar(ts);
977 valloop_done:
978 if(afnd) {
979 val = buftostr(val, buf, nv);
980 al = newattr(attid, val, al);
984 attrloop_done:
985 tok->attr = al;
986 (*pai)++;
987 return tok->tag;
989 eob_done:
990 if(warn)
991 fprint(2, "warning: incomplete tag at end of page\n");
992 backup(ts, nexti);
993 tok->tag = Data;
994 tok->text = _Strdup(L(Llt));
995 return Data;
998 // We've just read a '<!' at position starti,
999 // so this may be a comment or other ignored section, or it may
1000 // be just a literal string if there is no close before end of file
1001 // (other browsers do that).
1002 // The accepted practice seems to be (note: contrary to SGML spec!):
1003 // If see <!--, look for --> to close, or if none, > to close.
1004 // If see <!(not --), look for > to close.
1005 // If no close before end of file, leave original characters in as literal data.
1007 // If we see ignorable stuff, return Comment.
1008 // Else return nil (caller should back up and try again when more data arrives,
1009 // unless at end of file, in which case caller should just make '<' a data token).
1010 static int
1011 comment(TokenSource* ts)
1013 int nexti;
1014 int havecomment;
1015 int c;
1017 nexti = ts->i;
1018 havecomment = 0;
1019 c = getchar(ts);
1020 if(c == '-') {
1021 c = getchar(ts);
1022 if(c == '-') {
1023 if(findstr(ts, L(Larrow)))
1024 havecomment = 1;
1025 else
1026 backup(ts, nexti);
1029 if(!havecomment) {
1030 if(c == '>')
1031 havecomment = 1;
1032 else if(c >= 0) {
1033 if(findstr(ts, L(Lgt)))
1034 havecomment = 1;
1037 if(havecomment)
1038 return Comment;
1039 return -1;
1042 // Look for string s in token source.
1043 // If found, return 1, with buffer at next char after s,
1044 // else return 0 (caller should back up).
1045 static int
1046 findstr(TokenSource* ts, Rune* s)
1048 int c0;
1049 int n;
1050 int nexti;
1051 int i;
1052 int c;
1054 c0 = s[0];
1055 n = runestrlen(s);
1056 while(1) {
1057 c = getchar(ts);
1058 if(c < 0)
1059 break;
1060 if(c == c0) {
1061 if(n == 1)
1062 return 1;
1063 nexti = ts->i;
1064 for(i = 1; i < n; i++) {
1065 c = getchar(ts);
1066 if(c < 0)
1067 goto mainloop_done;
1068 if(c != s[i])
1069 break;
1071 if(i == n)
1072 return 1;
1073 backup(ts, nexti);
1076 mainloop_done:
1077 return 0;
1080 // We've just read an '&'; look for an entity reference
1081 // name, and if found, return translated char.
1082 // if there is a complete entity name but it isn't known,
1083 // try prefixes (gets around some buggy HTML out there),
1084 // and if that fails, back up to just past the '&' and return '&'.
1085 // If the entity can't be completed in the current buffer, back up
1086 // to the '&' and return -1.
1087 static int
1088 ampersand(TokenSource* ts)
1090 int savei;
1091 int c;
1092 int fnd;
1093 int ans;
1094 int v;
1095 int i;
1096 int k;
1097 Rune buf[SMALLBUFSIZE];
1099 savei = ts->i;
1100 c = getchar(ts);
1101 fnd = 0;
1102 ans = -1;
1103 if(c == '#') {
1104 c = getchar(ts);
1105 v = 0;
1106 while(c >= 0) {
1107 if(!(c < 256 && isdigit(c)))
1108 break;
1109 v = v*10 + c - 48;
1110 c = getchar(ts);
1112 if(c >= 0) {
1113 if(!(c == ';' || c == '\n' || c == '\r'))
1114 ungetchar(ts, c);
1115 c = v;
1116 if(c == 160)
1117 c = 160;
1118 if(c >= Winstart && c <= Winend) {
1119 c = winchars[c - Winstart];
1121 ans = c;
1122 fnd = 1;
1125 else if(c < 256 && isalpha(c)) {
1126 buf[0] = c;
1127 k = 1;
1128 while(1) {
1129 c = getchar(ts);
1130 if(c < 0)
1131 break;
1132 if(ISNAMCHAR(c)) {
1133 if(k < SMALLBUFSIZE-1)
1134 buf[k++] = c;
1136 else {
1137 if(!(c == ';' || c == '\n' || c == '\r'))
1138 ungetchar(ts, c);
1139 break;
1142 if(c >= 0) {
1143 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1144 if(!fnd) {
1145 // Try prefixes of s
1146 if(c == ';' || c == '\n' || c == '\r')
1147 ungetchar(ts, c);
1148 i = k;
1149 while(--k > 0) {
1150 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1151 if(fnd) {
1152 while(i > k) {
1153 i--;
1154 ungetchar(ts, buf[i]);
1156 break;
1162 if(!fnd) {
1163 backup(ts, savei);
1164 ans = '&';
1166 return ans;
1169 // Get next char, obeying ts.chset.
1170 // Returns -1 if no complete character left before current end of data.
1171 static int
1172 getchar(TokenSource* ts)
1174 uchar* buf;
1175 int c;
1176 int n;
1177 int ok;
1178 Rune r;
1180 if(ts->i >= ts->edata)
1181 return -1;
1182 buf = ts->data;
1183 c = buf[ts->i];
1184 switch(ts->chset) {
1185 case ISO_8859_1:
1186 if(c >= Winstart && c <= Winend)
1187 c = winchars[c - Winstart];
1188 ts->i++;
1189 break;
1190 case US_Ascii:
1191 if(c > 127) {
1192 if(warn)
1193 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1195 ts->i++;
1196 break;
1197 case UTF_8:
1198 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1199 n = chartorune(&r, (char*)(buf+ts->i));
1200 if(ok) {
1201 if(warn && c == 0x80)
1202 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1203 ts->i += n;
1204 c = r;
1206 else {
1207 // not enough bytes in buf to complete utf-8 char
1208 ts->i = ts->edata; // mark "all used"
1209 c = -1;
1211 break;
1212 case Unicode:
1213 if(ts->i < ts->edata - 1) {
1214 //standards say most-significant byte first
1215 c = (c << 8)|(buf[ts->i + 1]);
1216 ts->i += 2;
1218 else {
1219 ts->i = ts->edata; // mark "all used"
1220 c = -1;
1222 break;
1224 return c;
1227 // Assuming c was the last character returned by getchar, set
1228 // things up so that next getchar will get that same character
1229 // followed by the current 'next character', etc.
1230 static void
1231 ungetchar(TokenSource* ts, int c)
1233 int n;
1234 Rune r;
1235 char a[UTFmax];
1237 n = 1;
1238 switch(ts->chset) {
1239 case UTF_8:
1240 if(c >= 128) {
1241 r = c;
1242 n = runetochar(a, &r);
1244 break;
1245 case Unicode:
1246 n = 2;
1247 break;
1249 ts->i -= n;
1252 // Restore ts so that it is at the state where the index was savei.
1253 static void
1254 backup(TokenSource* ts, int savei)
1256 if(dbglex)
1257 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1258 ts->i = savei;
1262 // Look for value associated with attribute attid in token t.
1263 // If there is one, return 1 and put the value in *pans,
1264 // else return 0.
1265 // If xfer is true, transfer ownership of the string to the caller
1266 // (nil it out here); otherwise, caller must duplicate the answer
1267 // if it needs to save it.
1268 // OK to have pans==0, in which case this is just looking
1269 // to see if token is present.
1270 int
1271 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1273 Attr* attr;
1275 attr = t->attr;
1276 while(attr != nil) {
1277 if(attr->attid == attid) {
1278 if(pans != nil)
1279 *pans = attr->value;
1280 if(xfer)
1281 attr->value = nil;
1282 return 1;
1284 attr = attr->next;
1286 if(pans != nil)
1287 *pans = nil;
1288 return 0;
1291 static int
1292 Tconv(Fmt *f)
1294 Token* t;
1295 int i;
1296 int tag;
1297 char* srbra;
1298 Rune* aname;
1299 Rune* tname;
1300 Attr* a;
1301 char buf[BIGBUFSIZE];
1303 t = va_arg(f->args, Token*);
1304 if(t == nil)
1305 sprint(buf, "<null>");
1306 else {
1307 i = 0;
1308 if(dbglex > 1)
1309 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1310 tag = t->tag;
1311 if(tag == Data) {
1312 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1314 else {
1315 srbra = "";
1316 if(tag >= RBRA) {
1317 tag -= RBRA;
1318 srbra = "/";
1320 tname = tagnames[tag];
1321 if(tag == Notfound)
1322 tname = L(Lquestion);
1323 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1324 for(a = t->attr; a != nil; a = a->next) {
1325 aname = attrnames[a->attid];
1326 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1327 if(a->value != nil)
1328 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1330 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1332 buf[i] = 0;
1334 return fmtstrcpy(f, buf);
1337 // Attrs own their constituent strings, but build may eventually
1338 // transfer some values to its items and nil them out in the Attr.
1339 static Attr*
1340 newattr(int attid, Rune* value, Attr* link)
1342 Attr* ans;
1344 ans = (Attr*)emalloc(sizeof(Attr));
1345 ans->attid = attid;
1346 ans->value = value;
1347 ans->next = link;
1348 return ans;
1351 // Free list of Attrs linked through next field
1352 static void
1353 freeattrs(Attr* ahead)
1355 Attr* a;
1356 Attr* nexta;
1358 a = ahead;
1359 while(a != nil) {
1360 nexta = a->next;
1361 free(a->value);
1362 free(a);
1363 a = nexta;
1367 // Free array of Tokens.
1368 // Allocated space might have room for more than n tokens,
1369 // but only n of them are initialized.
1370 // If caller has transferred ownership of constitutent strings
1371 // or attributes, it must have nil'd out the pointers in the Tokens.
1372 void
1373 _freetokens(Token* tarray, int n)
1375 int i;
1376 Token* t;
1378 if(tarray == nil)
1379 return;
1380 for(i = 0; i < n; i++) {
1381 t = &tarray[i];
1382 free(t->text);
1383 freeattrs(t->attr);
1385 free(tarray);