Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 int i; // index of next byte to use
12 uchar* data; // all the data
13 int edata; // data[0:edata] is valid
14 int chset; // one of US_Ascii, etc.
15 int mtype; // TextHtml or TextPlain
16 };
// Negative sentinel values, so they can never be mistaken
// for a character code.
enum {
	EOB = -1,
	EOF = -2
};
// True when c may appear inside a tag, attribute or entity name:
// a code below 256 that is '-', '.', a letter or a digit.
// Note that c is evaluated more than once.
#define ISNAMCHAR(c) ((c) < 256 && ((c) == '-' || (c) == '.' || isalpha(c) || isdigit(c)))

// Capacity (in Runes) of the entity-name buffer in ampersand.
#define SMALLBUFSIZE 240
// Capacity of the text, tag-name and attribute buffers.
#define BIGBUFSIZE 2000
28 // HTML 4.0 tag names.
29 // Keep sorted, and in correspondence with enum in iparse.h.
30 Rune **tagnames;
31 char *_tagnames[] = {
32 " ",
33 "!",
34 "a",
35 "abbr",
36 "acronym",
37 "address",
38 "applet",
39 "area",
40 "b",
41 "base",
42 "basefont",
43 "bdo",
44 "big",
45 "blink",
46 "blockquote",
47 "body",
48 "bq",
49 "br",
50 "button",
51 "caption",
52 "center",
53 "cite",
54 "code",
55 "col",
56 "colgroup",
57 "dd",
58 "del",
59 "dfn",
60 "dir",
61 "div",
62 "dl",
63 "dt",
64 "em",
65 "fieldset",
66 "font",
67 "form",
68 "frame",
69 "frameset",
70 "h1",
71 "h2",
72 "h3",
73 "h4",
74 "h5",
75 "h6",
76 "head",
77 "hr",
78 "html",
79 "i",
80 "iframe",
81 "img",
82 "input",
83 "ins",
84 "isindex",
85 "kbd",
86 "label",
87 "legend",
88 "li",
89 "link",
90 "map",
91 "menu",
92 "meta",
93 "nobr",
94 "noframes",
95 "noscript",
96 "object",
97 "ol",
98 "optgroup",
99 "option",
100 "p",
101 "param",
102 "pre",
103 "q",
104 "s",
105 "samp",
106 "script",
107 "select",
108 "small",
109 "span",
110 "strike",
111 "strong",
112 "style",
113 "sub",
114 "sup",
115 "table",
116 "tbody",
117 "td",
118 "textarea",
119 "tfoot",
120 "th",
121 "thead",
122 "title",
123 "tr",
124 "tt",
125 "u",
126 "ul",
127 "var"
128 };
130 // HTML 4.0 attribute names.
131 // Keep sorted, and in correspondence with enum in i.h.
132 Rune **attrnames;
133 char* _attrnames[] = {
134 "abbr",
135 "accept-charset",
136 "access-key",
137 "action",
138 "align",
139 "alink",
140 "alt",
141 "archive",
142 "axis",
143 "background",
144 "bgcolor",
145 "border",
146 "cellpadding",
147 "cellspacing",
148 "char",
149 "charoff",
150 "charset",
151 "checked",
152 "cite",
153 "class",
154 "classid",
155 "clear",
156 "code",
157 "codebase",
158 "codetype",
159 "color",
160 "cols",
161 "colspan",
162 "compact",
163 "content",
164 "coords",
165 "data",
166 "datetime",
167 "declare",
168 "defer",
169 "dir",
170 "disabled",
171 "enctype",
172 "face",
173 "for",
174 "frame",
175 "frameborder",
176 "headers",
177 "height",
178 "href",
179 "hreflang",
180 "hspace",
181 "http-equiv",
182 "id",
183 "ismap",
184 "label",
185 "lang",
186 "link",
187 "longdesc",
188 "marginheight",
189 "marginwidth",
190 "maxlength",
191 "media",
192 "method",
193 "multiple",
194 "name",
195 "nohref",
196 "noresize",
197 "noshade",
198 "nowrap",
199 "object",
200 "onblur",
201 "onchange",
202 "onclick",
203 "ondblclick",
204 "onfocus",
205 "onkeypress",
206 "onkeyup",
207 "onload",
208 "onmousedown",
209 "onmousemove",
210 "onmouseout",
211 "onmouseover",
212 "onmouseup",
213 "onreset",
214 "onselect",
215 "onsubmit",
216 "onunload",
217 "profile",
218 "prompt",
219 "readonly",
220 "rel",
221 "rev",
222 "rows",
223 "rowspan",
224 "rules",
225 "scheme",
226 "scope",
227 "scrolling",
228 "selected",
229 "shape",
230 "size",
231 "span",
232 "src",
233 "standby",
234 "start",
235 "style",
236 "summary",
237 "tabindex",
238 "target",
239 "text",
240 "title",
241 "type",
242 "usemap",
243 "valign",
244 "value",
245 "valuetype",
246 "version",
247 "vlink",
248 "vspace",
249 "width"
250 };
253 // Character entity to unicode character number map.
254 // Keep sorted by name.
255 StringInt *chartab;
256 AsciiInt _chartab[142] = {
257 {"AElig", 198},
258 {"Aacute", 193},
259 {"Acirc", 194},
260 {"Agrave", 192},
261 {"Aring", 197},
262 {"Atilde", 195},
263 {"Auml", 196},
264 {"Ccedil", 199},
265 {"ETH", 208},
266 {"Eacute", 201},
267 {"Ecirc", 202},
268 {"Egrave", 200},
269 {"Euml", 203},
270 {"Iacute", 205},
271 {"Icirc", 206},
272 {"Igrave", 204},
273 {"Iuml", 207},
274 {"Ntilde", 209},
275 {"Oacute", 211},
276 {"Ocirc", 212},
277 {"Ograve", 210},
278 {"Oslash", 216},
279 {"Otilde", 213},
280 {"Ouml", 214},
281 {"THORN", 222},
282 {"Uacute", 218},
283 {"Ucirc", 219},
284 {"Ugrave", 217},
285 {"Uuml", 220},
286 {"Yacute", 221},
287 {"aacute", 225},
288 {"acirc", 226},
289 {"acute", 180},
290 {"aelig", 230},
291 {"agrave", 224},
292 {"alpha", 945},
293 {"amp", 38},
294 {"aring", 229},
295 {"atilde", 227},
296 {"auml", 228},
297 {"beta", 946},
298 {"brvbar", 166},
299 {"ccedil", 231},
300 {"cdots", 8943},
301 {"cedil", 184},
302 {"cent", 162},
303 {"chi", 967},
304 {"copy", 169},
305 {"curren", 164},
306 {"ddots", 8945},
307 {"deg", 176},
308 {"delta", 948},
309 {"divide", 247},
310 {"eacute", 233},
311 {"ecirc", 234},
312 {"egrave", 232},
313 {"emdash", 8212},
314 {"emsp", 8195},
315 {"endash", 8211},
316 {"ensp", 8194},
317 {"epsilon", 949},
318 {"eta", 951},
319 {"eth", 240},
320 {"euml", 235},
321 {"frac12", 189},
322 {"frac14", 188},
323 {"frac34", 190},
324 {"gamma", 947},
325 {"gt", 62},
326 {"iacute", 237},
327 {"icirc", 238},
328 {"iexcl", 161},
329 {"igrave", 236},
330 {"iota", 953},
331 {"iquest", 191},
332 {"iuml", 239},
333 {"kappa", 954},
334 {"lambda", 955},
335 {"laquo", 171},
336 {"ldots", 8230},
337 {"lt", 60},
338 {"macr", 175},
339 {"micro", 181},
340 {"middot", 183},
341 {"mu", 956},
342 {"nbsp", 160},
343 {"not", 172},
344 {"ntilde", 241},
345 {"nu", 957},
346 {"oacute", 243},
347 {"ocirc", 244},
348 {"ograve", 242},
349 {"omega", 969},
350 {"omicron", 959},
351 {"ordf", 170},
352 {"ordm", 186},
353 {"oslash", 248},
354 {"otilde", 245},
355 {"ouml", 246},
356 {"para", 182},
357 {"phi", 966},
358 {"pi", 960},
359 {"plusmn", 177},
360 {"pound", 163},
361 {"psi", 968},
362 {"quad", 8193},
363 {"quot", 34},
364 {"raquo", 187},
365 {"reg", 174},
366 {"rho", 961},
367 {"sect", 167},
368 {"shy", 173},
369 {"sigma", 963},
370 {"sp", 8194},
371 {"sup1", 185},
372 {"sup2", 178},
373 {"sup3", 179},
374 {"szlig", 223},
375 {"tau", 964},
376 {"theta", 952},
377 {"thinsp", 8201},
378 {"thorn", 254},
379 {"times", 215},
380 {"trade", 8482},
381 {"uacute", 250},
382 {"ucirc", 251},
383 {"ugrave", 249},
384 {"uml", 168},
385 {"upsilon", 965},
386 {"uuml", 252},
387 {"varepsilon", 8712},
388 {"varphi", 981},
389 {"varpi", 982},
390 {"varrho", 1009},
391 {"vdots", 8942},
392 {"vsigma", 962},
393 {"vtheta", 977},
394 {"xi", 958},
395 {"yacute", 253},
396 {"yen", 165},
397 {"yuml", 255},
398 {"zeta", 950}
399 };
// Number of entries in the entity table.
// This has to be computed from the _chartab array: chartab is only a
// pointer, so sizeof(chartab)/sizeof(chartab[0]) divides the size of a
// pointer by the size of an entry and does not give the table length.
#define NCHARTAB ((int)(sizeof(_chartab)/sizeof(_chartab[0])))
// Windows interpolates extra characters into the Latin1 set at
// codes Winstart..Winend.  HTML is not supposed to contain them,
// but in practice it does, so they are remapped through winchars.
enum {
	Winstart = 127,
	Winend = 159
};
// Replacement code points for codes Winstart..Winend, indexed by
// (code - Winstart).  8226 (a bullet) fills the unassigned slots.
static int winchars[] = {
	8226,							// 127
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,		// 128-135
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,		// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,		// 144-151
	732, 8482, 353, 8250, 339, 8226, 8226, 376		// 152-159
};
416 static StringInt* tagtable; // initialized from tagnames
417 static StringInt* attrtable; // initialized from attrnames
419 static void lexinit();
420 static int getplaindata(TokenSource* ts, Token* a, int* pai);
421 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
422 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
423 static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
424 static Rune* buftostr(Rune* s, Rune* buf, int j);
425 static int comment(TokenSource* ts);
426 static int findstr(TokenSource* ts, Rune* s);
427 static int ampersand(TokenSource* ts);
428 //static int lowerc(int c);
429 static int getchar(TokenSource* ts);
430 static void ungetchar(TokenSource* ts, int c);
431 static void backup(TokenSource* ts, int savei);
432 //static void freeinsidetoken(Token* t);
433 static void freeattrs(Attr* ahead);
434 static Attr* newattr(int attid, Rune* value, Attr* link);
435 static int Tconv(Fmt* f);
437 int dbglex = 0;
438 static int lexinited = 0;
440 static void
441 lexinit(void)
443 chartab = _cvtstringinttab(_chartab, nelem(_chartab));
444 tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
445 tagtable = _makestrinttab(tagnames, Numtags);
446 attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
447 attrtable = _makestrinttab(attrnames, Numattrs);
448 fmtinstall('T', Tconv);
449 lexinited = 1;
452 static TokenSource*
453 newtokensource(uchar* data, int edata, int chset, int mtype)
455 TokenSource* ans;
457 assert(chset == US_Ascii || chset == ISO_8859_1 ||
458 chset == UTF_8 || chset == Unicode);
459 ans = (TokenSource*)emalloc(sizeof(TokenSource));
460 ans->i = 0;
461 ans->data = data;
462 ans->edata = edata;
463 ans->chset = chset;
464 ans->mtype = mtype;
465 return ans;
// The token array is allocated, and later grown, in steps of
// this many entries.
enum { ToksChunk = 500 };
472 // Call this to get the tokens.
473 // The number of returned tokens is returned in *plen.
474 Token*
475 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
477 TokenSource* ts;
478 Token* a;
479 int alen;
480 int ai;
481 int starti;
482 int c;
483 int tag;
485 if(!lexinited)
486 lexinit();
487 ts = newtokensource(data, datalen, chset, mtype);
488 alen = ToksChunk;
489 a = (Token*)emalloc(alen * sizeof(Token));
490 ai = 0;
491 if(dbglex)
492 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
493 if(ts->mtype == TextHtml) {
494 for(;;) {
495 if(ai == alen) {
496 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
497 alen += ToksChunk;
499 starti = ts->i;
500 c = getchar(ts);
501 if(c < 0)
502 break;
503 if(c == '<') {
504 tag = gettag(ts, starti, a, &ai);
505 if(tag == Tscript) {
506 // special rules for getting Data after....
507 starti = ts->i;
508 c = getchar(ts);
509 tag = getscriptdata(ts, c, starti, a, &ai);
512 else
513 tag = getdata(ts, c, starti, a, &ai);
514 if(tag == -1)
515 break;
516 else if(dbglex > 1 && tag != Comment)
517 fprint(2, "lex: got token %T\n", &a[ai-1]);
520 else {
521 // plain text (non-html) tokens
522 for(;;) {
523 if(ai == alen) {
524 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
525 alen += ToksChunk;
527 tag = getplaindata(ts, a, &ai);
528 if(tag == -1)
529 break;
530 if(dbglex > 1)
531 fprint(2, "lex: got token %T\n", &a[ai]);
534 if(dbglex)
535 fprint(2, "lex: returning %d tokens\n", ai);
536 *plen = ai;
537 if(ai == 0)
538 return nil;
539 return a;
542 // For case where source isn't HTML.
543 // Just make data tokens, one per line (or partial line,
544 // at end of buffer), ignoring non-whitespace control
545 // characters and dumping \r's.
546 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
547 // Otherwise return -1;
548 static int
549 getplaindata(TokenSource* ts, Token* a, int* pai)
551 Rune* s;
552 int j;
553 int starti;
554 int c;
555 Token* tok;
556 Rune buf[BIGBUFSIZE];
558 s = nil;
559 j = 0;
560 starti = ts->i;
561 for(c = getchar(ts); c >= 0; c = getchar(ts)) {
562 if(c < ' ') {
563 if(isspace(c)) {
564 if(c == '\r') {
565 // ignore it unless no following '\n',
566 // in which case treat it like '\n'
567 c = getchar(ts);
568 if(c != '\n') {
569 if(c >= 0)
570 ungetchar(ts, c);
571 c = '\n';
575 else
576 c = 0;
578 if(c != 0) {
579 buf[j++] = c;
580 if(j == sizeof(buf)-1) {
581 s = buftostr(s, buf, j);
582 j = 0;
585 if(c == '\n')
586 break;
588 s = buftostr(s, buf, j);
589 if(s == nil)
590 return -1;
591 tok = &a[(*pai)++];
592 tok->tag = Data;
593 tok->text = s;
594 tok->attr = nil;
595 tok->starti = starti;
596 return Data;
599 // Return concatenation of s and buf[0:j]
600 static Rune*
601 buftostr(Rune* s, Rune* buf, int j)
603 buf[j] = 0;
604 if(s == nil)
605 s = _Strndup(buf, j);
606 else
607 s = _Strdup2(s, buf);
608 return s;
611 // Gather data up to next start-of-tag or end-of-buffer.
612 // Translate entity references (&amp;).
613 // Ignore non-whitespace control characters and get rid of \r's.
614 // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
615 // Otherwise return -1;
616 static int
617 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
619 Rune* s;
620 int j;
621 int c;
622 Token* tok;
623 Rune buf[BIGBUFSIZE];
625 s = nil;
626 j = 0;
627 c = firstc;
628 while(c >= 0) {
629 if(c == '&') {
630 c = ampersand(ts);
631 if(c < 0)
632 break;
634 else if(c < ' ') {
635 if(isspace(c)) {
636 if(c == '\r') {
637 // ignore it unless no following '\n',
638 // in which case treat it like '\n'
639 c = getchar(ts);
640 if(c != '\n') {
641 if(c >= 0)
642 ungetchar(ts, c);
643 c = '\n';
647 else {
648 if(warn)
649 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
650 c = 0;
653 else if(c == '<') {
654 ungetchar(ts, c);
655 break;
657 if(c != 0) {
658 buf[j++] = c;
659 if(j == BIGBUFSIZE-1) {
660 s = buftostr(s, buf, j);
661 j = 0;
664 c = getchar(ts);
666 s = buftostr(s, buf, j);
667 if(s == nil)
668 return -1;
669 tok = &a[(*pai)++];
670 tok->tag = Data;
671 tok->text = s;
672 tok->attr = nil;
673 tok->starti = starti;
674 return Data;
677 // The rules for lexing scripts are different (ugh).
678 // Gather up everything until see a </SCRIPT>.
679 static int
680 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
682 Rune* s;
683 int j;
684 int tstarti;
685 int savei;
686 int c;
687 int tag;
688 int done;
689 Token* tok;
690 Rune buf[BIGBUFSIZE];
692 s = nil;
693 j = 0;
694 tstarti = starti;
695 c = firstc;
696 done = 0;
697 while(c >= 0) {
698 if(c == '<') {
699 // other browsers ignore stuff to end of line after <!
700 savei = ts->i;
701 c = getchar(ts);
702 if(c == '!') {
703 while(c >= 0 && c != '\n' && c != '\r')
704 c = getchar(ts);
705 if(c == '\r')
706 c = getchar(ts);
707 if(c == '\n')
708 c = getchar(ts);
710 else if(c >= 0) {
711 backup(ts, savei);
712 tag = gettag(ts, tstarti, a, pai);
713 if(tag == -1)
714 break;
715 if(tag != Comment)
716 (*pai)--;
717 backup(ts, tstarti);
718 if(tag == Tscript + RBRA) {
719 done = 1;
720 break;
722 // here tag was not </SCRIPT>, so take as regular data
723 c = getchar(ts);
726 if(c < 0)
727 break;
728 if(c != 0) {
729 buf[j++] = c;
730 if(j == BIGBUFSIZE-1) {
731 s = buftostr(s, buf, j);
732 j = 0;
735 tstarti = ts->i;
736 c = getchar(ts);
738 if(done || ts->i == ts->edata) {
739 s = buftostr(s, buf, j);
740 tok = &a[(*pai)++];
741 tok->tag = Data;
742 tok->text = s;
743 tok->attr = nil;
744 tok->starti = starti;
745 return Data;
747 backup(ts, starti);
748 return -1;
751 // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
752 // ends before then, return -1).
753 // If it's a tag, look up the name, gather the attributes, and return
754 // the appropriate token.
755 // Else it's either just plain data or some kind of ignorable stuff:
756 // return Data or Comment as appropriate.
757 // If it's not a Comment, put it in a[*pai] and bump *pai.
758 static int
759 gettag(TokenSource* ts, int starti, Token* a, int* pai)
761 int rbra;
762 int ans;
763 Attr* al;
764 int nexti;
765 int c;
766 int ti;
767 int afnd;
768 int attid;
769 int quote;
770 Rune* val;
771 int nv;
772 int i;
773 int tag;
774 Token* tok;
775 Rune buf[BIGBUFSIZE];
777 rbra = 0;
778 nexti = ts->i;
779 tok = &a[*pai];
780 tok->tag = Notfound;
781 tok->text = nil;
782 tok->attr = nil;
783 tok->starti = starti;
784 c = getchar(ts);
785 if(c == '/') {
786 rbra = RBRA;
787 c = getchar(ts);
789 if(c < 0)
790 goto eob_done;
791 if(c >= 256 || !isalpha(c)) {
792 // not a tag
793 if(c == '!') {
794 ans = comment(ts);
795 if(ans != -1)
796 return ans;
797 goto eob_done;
799 else {
800 backup(ts, nexti);
801 tok->tag = Data;
802 tok->text = _Strdup(L(Llt));
803 (*pai)++;
804 return Data;
807 // c starts a tagname
808 buf[0] = c;
809 i = 1;
810 while(1) {
811 c = getchar(ts);
812 if(c < 0)
813 goto eob_done;
814 if(!ISNAMCHAR(c))
815 break;
816 // if name is bigger than buf it won't be found anyway...
817 if(i < BIGBUFSIZE)
818 buf[i++] = c;
820 if(_lookup(tagtable, Numtags, buf, i, &tag))
821 tok->tag = tag + rbra;
822 else
823 tok->text = _Strndup(buf, i); // for warning print, in build
825 // attribute gathering loop
826 al = nil;
827 while(1) {
828 // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
829 // skip whitespace
830 attrloop_continue:
831 while(c < 256 && isspace(c)) {
832 c = getchar(ts);
833 if(c < 0)
834 goto eob_done;
836 if(c == '>')
837 goto attrloop_done;
838 if(c == '<') {
839 if(warn)
840 fprint(2, "warning: unclosed tag\n");
841 ungetchar(ts, c);
842 goto attrloop_done;
844 if(c >= 256 || !isalpha(c)) {
845 if(warn)
846 fprint(2, "warning: expected attribute name\n");
847 // skipt to next attribute name
848 while(1) {
849 c = getchar(ts);
850 if(c < 0)
851 goto eob_done;
852 if(c < 256 && isalpha(c))
853 goto attrloop_continue;
854 if(c == '<') {
855 if(warn)
856 fprint(2, "warning: unclosed tag\n");
857 ungetchar(ts, 60);
858 goto attrloop_done;
860 if(c == '>')
861 goto attrloop_done;
864 // gather attribute name
865 buf[0] = c;
866 i = 1;
867 while(1) {
868 c = getchar(ts);
869 if(c < 0)
870 goto eob_done;
871 if(!ISNAMCHAR(c))
872 break;
873 if(i < BIGBUFSIZE-1)
874 buf[i++] = c;
876 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
877 if(warn && !afnd) {
878 buf[i] = 0;
879 fprint(2, "warning: unknown attribute name %S\n", buf);
881 // skip whitespace
882 while(c < 256 && isspace(c)) {
883 c = getchar(ts);
884 if(c < 0)
885 goto eob_done;
887 if(c != '=') {
888 if(afnd)
889 al = newattr(attid, nil, al);
890 goto attrloop_continue;
892 //# c is '=' here; skip whitespace
893 while(1) {
894 c = getchar(ts);
895 if(c < 0)
896 goto eob_done;
897 if(c >= 256 || !isspace(c))
898 break;
900 quote = 0;
901 if(c == '\'' || c == '"') {
902 quote = c;
903 c = getchar(ts);
904 if(c < 0)
905 goto eob_done;
907 val = nil;
908 nv = 0;
909 while(1) {
910 valloop_continue:
911 if(c < 0)
912 goto eob_done;
913 if(c == '>') {
914 if(quote) {
915 // c might be part of string (though not good style)
916 // but if line ends before close quote, assume
917 // there was an unmatched quote
918 ti = ts->i;
919 while(1) {
920 c = getchar(ts);
921 if(c < 0)
922 goto eob_done;
923 if(c == quote) {
924 backup(ts, ti);
925 buf[nv++] = '>';
926 if(nv == BIGBUFSIZE-1) {
927 val = buftostr(val, buf, nv);
928 nv = 0;
930 c = getchar(ts);
931 goto valloop_continue;
933 if(c == '\n') {
934 if(warn)
935 fprint(2, "warning: apparent unmatched quote\n");
936 backup(ts, ti);
937 c = '>';
938 goto valloop_done;
942 else
943 goto valloop_done;
945 if(quote) {
946 if(c == quote) {
947 c = getchar(ts);
948 if(c < 0)
949 goto eob_done;
950 goto valloop_done;
952 if(c == '\r') {
953 c = getchar(ts);
954 goto valloop_continue;
956 if(c == '\t' || c == '\n')
957 c = ' ';
959 else {
960 if(c < 256 && isspace(c))
961 goto valloop_done;
963 if(c == '&') {
964 c = ampersand(ts);
965 if(c == -1)
966 goto eob_done;
968 buf[nv++] = c;
969 if(nv == BIGBUFSIZE-1) {
970 val = buftostr(val, buf, nv);
971 nv = 0;
973 c = getchar(ts);
975 valloop_done:
976 if(afnd) {
977 val = buftostr(val, buf, nv);
978 al = newattr(attid, val, al);
982 attrloop_done:
983 tok->attr = al;
984 (*pai)++;
985 return tok->tag;
987 eob_done:
988 if(warn)
989 fprint(2, "warning: incomplete tag at end of page\n");
990 backup(ts, nexti);
991 tok->tag = Data;
992 tok->text = _Strdup(L(Llt));
993 return Data;
996 // We've just read a '<!' at position starti,
997 // so this may be a comment or other ignored section, or it may
998 // be just a literal string if there is no close before end of file
999 // (other browsers do that).
1000 // The accepted practice seems to be (note: contrary to SGML spec!):
1001 // If see <!--, look for --> to close, or if none, > to close.
1002 // If see <!(not --), look for > to close.
1003 // If no close before end of file, leave original characters in as literal data.
1005 // If we see ignorable stuff, return Comment.
1006 // Else return nil (caller should back up and try again when more data arrives,
1007 // unless at end of file, in which case caller should just make '<' a data token).
1008 static int
1009 comment(TokenSource* ts)
1011 int nexti;
1012 int havecomment;
1013 int c;
1015 nexti = ts->i;
1016 havecomment = 0;
1017 c = getchar(ts);
1018 if(c == '-') {
1019 c = getchar(ts);
1020 if(c == '-') {
1021 if(findstr(ts, L(Larrow)))
1022 havecomment = 1;
1023 else
1024 backup(ts, nexti);
1027 if(!havecomment) {
1028 if(c == '>')
1029 havecomment = 1;
1030 else if(c >= 0) {
1031 if(findstr(ts, L(Lgt)))
1032 havecomment = 1;
1035 if(havecomment)
1036 return Comment;
1037 return -1;
1040 // Look for string s in token source.
1041 // If found, return 1, with buffer at next char after s,
1042 // else return 0 (caller should back up).
1043 static int
1044 findstr(TokenSource* ts, Rune* s)
1046 int c0;
1047 int n;
1048 int nexti;
1049 int i;
1050 int c;
1052 c0 = s[0];
1053 n = runestrlen(s);
1054 while(1) {
1055 c = getchar(ts);
1056 if(c < 0)
1057 break;
1058 if(c == c0) {
1059 if(n == 1)
1060 return 1;
1061 nexti = ts->i;
1062 for(i = 1; i < n; i++) {
1063 c = getchar(ts);
1064 if(c < 0)
1065 goto mainloop_done;
1066 if(c != s[i])
1067 break;
1069 if(i == n)
1070 return 1;
1071 backup(ts, nexti);
1074 mainloop_done:
1075 return 0;
1078 // We've just read an '&'; look for an entity reference
1079 // name, and if found, return translated char.
1080 // if there is a complete entity name but it isn't known,
1081 // try prefixes (gets around some buggy HTML out there),
1082 // and if that fails, back up to just past the '&' and return '&'.
1083 // If the entity can't be completed in the current buffer, back up
1084 // to the '&' and return -1.
1085 static int
1086 ampersand(TokenSource* ts)
1088 int savei;
1089 int c;
1090 int fnd;
1091 int ans;
1092 int v;
1093 int i;
1094 int k;
1095 Rune buf[SMALLBUFSIZE];
1097 savei = ts->i;
1098 c = getchar(ts);
1099 fnd = 0;
1100 ans = -1;
1101 if(c == '#') {
1102 c = getchar(ts);
1103 v = 0;
1104 while(c >= 0) {
1105 if(!(c < 256 && isdigit(c)))
1106 break;
1107 v = v*10 + c - 48;
1108 c = getchar(ts);
1110 if(c >= 0) {
1111 if(!(c == ';' || c == '\n' || c == '\r'))
1112 ungetchar(ts, c);
1113 c = v;
1114 if(c == 160)
1115 c = 160;
1116 if(c >= Winstart && c <= Winend) {
1117 c = winchars[c - Winstart];
1119 ans = c;
1120 fnd = 1;
1123 else if(c < 256 && isalpha(c)) {
1124 buf[0] = c;
1125 k = 1;
1126 while(1) {
1127 c = getchar(ts);
1128 if(c < 0)
1129 break;
1130 if(ISNAMCHAR(c)) {
1131 if(k < SMALLBUFSIZE-1)
1132 buf[k++] = c;
1134 else {
1135 if(!(c == ';' || c == '\n' || c == '\r'))
1136 ungetchar(ts, c);
1137 break;
1140 if(c >= 0) {
1141 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1142 if(!fnd) {
1143 // Try prefixes of s
1144 if(c == ';' || c == '\n' || c == '\r')
1145 ungetchar(ts, c);
1146 i = k;
1147 while(--k > 0) {
1148 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1149 if(fnd) {
1150 while(i > k) {
1151 i--;
1152 ungetchar(ts, buf[i]);
1154 break;
1160 if(!fnd) {
1161 backup(ts, savei);
1162 ans = '&';
1164 return ans;
1167 // Get next char, obeying ts.chset.
1168 // Returns -1 if no complete character left before current end of data.
1169 static int
1170 getchar(TokenSource* ts)
1172 uchar* buf;
1173 int c;
1174 int n;
1175 int ok;
1176 Rune r;
1178 if(ts->i >= ts->edata)
1179 return -1;
1180 buf = ts->data;
1181 c = buf[ts->i];
1182 switch(ts->chset) {
1183 case ISO_8859_1:
1184 if(c >= Winstart && c <= Winend)
1185 c = winchars[c - Winstart];
1186 ts->i++;
1187 break;
1188 case US_Ascii:
1189 if(c > 127) {
1190 if(warn)
1191 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1193 ts->i++;
1194 break;
1195 case UTF_8:
1196 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1197 n = chartorune(&r, (char*)(buf+ts->i));
1198 if(ok) {
1199 if(warn && c == 0x80)
1200 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1201 ts->i += n;
1202 c = r;
1204 else {
1205 // not enough bytes in buf to complete utf-8 char
1206 ts->i = ts->edata; // mark "all used"
1207 c = -1;
1209 break;
1210 case Unicode:
1211 if(ts->i < ts->edata - 1) {
1212 //standards say most-significant byte first
1213 c = (c << 8)|(buf[ts->i + 1]);
1214 ts->i += 2;
1216 else {
1217 ts->i = ts->edata; // mark "all used"
1218 c = -1;
1220 break;
1222 return c;
1225 // Assuming c was the last character returned by getchar, set
1226 // things up so that next getchar will get that same character
1227 // followed by the current 'next character', etc.
1228 static void
1229 ungetchar(TokenSource* ts, int c)
1231 int n;
1232 Rune r;
1233 char a[UTFmax];
1235 n = 1;
1236 switch(ts->chset) {
1237 case UTF_8:
1238 if(c >= 128) {
1239 r = c;
1240 n = runetochar(a, &r);
1242 break;
1243 case Unicode:
1244 n = 2;
1245 break;
1247 ts->i -= n;
1250 // Restore ts so that it is at the state where the index was savei.
1251 static void
1252 backup(TokenSource* ts, int savei)
1254 if(dbglex)
1255 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1256 ts->i = savei;
1260 // Look for value associated with attribute attid in token t.
1261 // If there is one, return 1 and put the value in *pans,
1262 // else return 0.
1263 // If xfer is true, transfer ownership of the string to the caller
1264 // (nil it out here); otherwise, caller must duplicate the answer
1265 // if it needs to save it.
1266 // OK to have pans==0, in which case this is just looking
1267 // to see if token is present.
1268 int
1269 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1271 Attr* attr;
1273 attr = t->attr;
1274 while(attr != nil) {
1275 if(attr->attid == attid) {
1276 if(pans != nil)
1277 *pans = attr->value;
1278 if(xfer)
1279 attr->value = nil;
1280 return 1;
1282 attr = attr->next;
1284 if(pans != nil)
1285 *pans = nil;
1286 return 0;
1289 static int
1290 Tconv(Fmt *f)
1292 Token* t;
1293 int i;
1294 int tag;
1295 char* srbra;
1296 Rune* aname;
1297 Rune* tname;
1298 Attr* a;
1299 char buf[BIGBUFSIZE];
1301 t = va_arg(f->args, Token*);
1302 if(t == nil)
1303 sprint(buf, "<null>");
1304 else {
1305 i = 0;
1306 if(dbglex > 1)
1307 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1308 tag = t->tag;
1309 if(tag == Data) {
1310 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1312 else {
1313 srbra = "";
1314 if(tag >= RBRA) {
1315 tag -= RBRA;
1316 srbra = "/";
1318 tname = tagnames[tag];
1319 if(tag == Notfound)
1320 tname = L(Lquestion);
1321 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1322 for(a = t->attr; a != nil; a = a->next) {
1323 aname = attrnames[a->attid];
1324 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1325 if(a->value != nil)
1326 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1328 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1330 buf[i] = 0;
1332 return fmtstrcpy(f, buf);
1335 // Attrs own their constituent strings, but build may eventually
1336 // transfer some values to its items and nil them out in the Attr.
1337 static Attr*
1338 newattr(int attid, Rune* value, Attr* link)
1340 Attr* ans;
1342 ans = (Attr*)emalloc(sizeof(Attr));
1343 ans->attid = attid;
1344 ans->value = value;
1345 ans->next = link;
1346 return ans;
1349 // Free list of Attrs linked through next field
1350 static void
1351 freeattrs(Attr* ahead)
1353 Attr* a;
1354 Attr* nexta;
1356 a = ahead;
1357 while(a != nil) {
1358 nexta = a->next;
1359 free(a->value);
1360 free(a);
1361 a = nexta;
1365 // Free array of Tokens.
1366 // Allocated space might have room for more than n tokens,
1367 // but only n of them are initialized.
1368 // If caller has transferred ownership of constitutent strings
1369 // or attributes, it must have nil'd out the pointers in the Tokens.
1370 void
1371 _freetokens(Token* tarray, int n)
1373 int i;
1374 Token* t;
1376 if(tarray == nil)
1377 return;
1378 for(i = 0; i < n; i++) {
1379 t = &tarray[i];
1380 free(t->text);
1381 freeattrs(t->attr);
1383 free(tarray);