Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <ctype.h>
5 #include <html.h>
6 #include "impl.h"
8 typedef struct TokenSource TokenSource;
9 struct TokenSource
10 {
11 int i; /* index of next byte to use */
12 uchar* data; /* all the data */
13 int edata; /* data[0:edata] is valid */
14 int chset; /* one of US_Ascii, etc. */
15 int mtype; /* TextHtml or TextPlain */
16 };
enum {
	EOF = -2,
	EOB = -1
};

/* True if c may appear inside a tag, attribute or entity name. */
/* Callers check c >= 0 before applying it. */
#define ISNAMCHAR(c) ((c)<256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

/* Sizes, in Runes, of the on-stack scratch buffers: */
/* SMALLBUFSIZE for entity names, BIGBUFSIZE for everything else. */
#define SMALLBUFSIZE 240
#define BIGBUFSIZE 2000
28 /* HTML 4.0 tag names. */
29 /* Keep sorted, and in correspondence with enum in iparse.h. */
30 Rune **tagnames;
31 char *_tagnames[] = {
32 " ",
33 "!",
34 "a",
35 "abbr",
36 "acronym",
37 "address",
38 "applet",
39 "area",
40 "b",
41 "base",
42 "basefont",
43 "bdo",
44 "big",
45 "blink",
46 "blockquote",
47 "body",
48 "bq",
49 "br",
50 "button",
51 "caption",
52 "center",
53 "cite",
54 "code",
55 "col",
56 "colgroup",
57 "dd",
58 "del",
59 "dfn",
60 "dir",
61 "div",
62 "dl",
63 "dt",
64 "em",
65 "fieldset",
66 "font",
67 "form",
68 "frame",
69 "frameset",
70 "h1",
71 "h2",
72 "h3",
73 "h4",
74 "h5",
75 "h6",
76 "head",
77 "hr",
78 "html",
79 "i",
80 "iframe",
81 "img",
82 "input",
83 "ins",
84 "isindex",
85 "kbd",
86 "label",
87 "legend",
88 "li",
89 "link",
90 "map",
91 "menu",
92 "meta",
93 "nobr",
94 "noframes",
95 "noscript",
96 "object",
97 "ol",
98 "optgroup",
99 "option",
100 "p",
101 "param",
102 "pre",
103 "q",
104 "s",
105 "samp",
106 "script",
107 "select",
108 "small",
109 "span",
110 "strike",
111 "strong",
112 "style",
113 "sub",
114 "sup",
115 "table",
116 "tbody",
117 "td",
118 "textarea",
119 "tfoot",
120 "th",
121 "thead",
122 "title",
123 "tr",
124 "tt",
125 "u",
126 "ul",
127 "var"
128 };
130 /* HTML 4.0 attribute names. */
131 /* Keep sorted, and in correspondence with enum in i.h. */
132 Rune **attrnames;
133 char* _attrnames[] = {
134 "abbr",
135 "accept-charset",
136 "access-key",
137 "action",
138 "align",
139 "alink",
140 "alt",
141 "archive",
142 "axis",
143 "background",
144 "bgcolor",
145 "border",
146 "cellpadding",
147 "cellspacing",
148 "char",
149 "charoff",
150 "charset",
151 "checked",
152 "cite",
153 "class",
154 "classid",
155 "clear",
156 "code",
157 "codebase",
158 "codetype",
159 "color",
160 "cols",
161 "colspan",
162 "compact",
163 "content",
164 "coords",
165 "data",
166 "datetime",
167 "declare",
168 "defer",
169 "dir",
170 "disabled",
171 "enctype",
172 "face",
173 "for",
174 "frame",
175 "frameborder",
176 "headers",
177 "height",
178 "href",
179 "hreflang",
180 "hspace",
181 "http-equiv",
182 "id",
183 "ismap",
184 "label",
185 "lang",
186 "link",
187 "longdesc",
188 "marginheight",
189 "marginwidth",
190 "maxlength",
191 "media",
192 "method",
193 "multiple",
194 "name",
195 "nohref",
196 "noresize",
197 "noshade",
198 "nowrap",
199 "object",
200 "onblur",
201 "onchange",
202 "onclick",
203 "ondblclick",
204 "onfocus",
205 "onkeypress",
206 "onkeyup",
207 "onload",
208 "onmousedown",
209 "onmousemove",
210 "onmouseout",
211 "onmouseover",
212 "onmouseup",
213 "onreset",
214 "onselect",
215 "onsubmit",
216 "onunload",
217 "profile",
218 "prompt",
219 "readonly",
220 "rel",
221 "rev",
222 "rows",
223 "rowspan",
224 "rules",
225 "scheme",
226 "scope",
227 "scrolling",
228 "selected",
229 "shape",
230 "size",
231 "span",
232 "src",
233 "standby",
234 "start",
235 "style",
236 "summary",
237 "tabindex",
238 "target",
239 "text",
240 "title",
241 "type",
242 "usemap",
243 "valign",
244 "value",
245 "valuetype",
246 "version",
247 "vlink",
248 "vspace",
249 "width"
250 };
253 /* Character entity to unicode character number map. */
254 /* Keep sorted by name. */
255 StringInt *chartab;
256 AsciiInt _chartab[] = {
257 {"AElig", 198},
258 {"Aacute", 193},
259 {"Acirc", 194},
260 {"Agrave", 192},
261 {"Aring", 197},
262 {"Atilde", 195},
263 {"Auml", 196},
264 {"Ccedil", 199},
265 {"ETH", 208},
266 {"Eacute", 201},
267 {"Ecirc", 202},
268 {"Egrave", 200},
269 {"Euml", 203},
270 {"Iacute", 205},
271 {"Icirc", 206},
272 {"Igrave", 204},
273 {"Iuml", 207},
274 {"Ntilde", 209},
275 {"Oacute", 211},
276 {"Ocirc", 212},
277 {"Ograve", 210},
278 {"Oslash", 216},
279 {"Otilde", 213},
280 {"Ouml", 214},
281 {"THORN", 222},
282 {"Uacute", 218},
283 {"Ucirc", 219},
284 {"Ugrave", 217},
285 {"Uuml", 220},
286 {"Yacute", 221},
287 {"aacute", 225},
288 {"acirc", 226},
289 {"acute", 180},
290 {"aelig", 230},
291 {"agrave", 224},
292 {"alpha", 945},
293 {"amp", 38},
294 {"aring", 229},
295 {"atilde", 227},
296 {"auml", 228},
297 {"beta", 946},
298 {"brvbar", 166},
299 {"ccedil", 231},
300 {"cdots", 8943},
301 {"cedil", 184},
302 {"cent", 162},
303 {"chi", 967},
304 {"copy", 169},
305 {"curren", 164},
306 {"ddots", 8945},
307 {"deg", 176},
308 {"delta", 948},
309 {"divide", 247},
310 {"eacute", 233},
311 {"ecirc", 234},
312 {"egrave", 232},
313 {"emdash", 8212}, /* non-standard but commonly used */
314 {"emsp", 8195},
315 {"endash", 8211}, /* non-standard but commonly used */
316 {"ensp", 8194},
317 {"epsilon", 949},
318 {"eta", 951},
319 {"eth", 240},
320 {"euml", 235},
321 {"frac12", 189},
322 {"frac14", 188},
323 {"frac34", 190},
324 {"gamma", 947},
325 {"gt", 62},
326 {"iacute", 237},
327 {"icirc", 238},
328 {"iexcl", 161},
329 {"igrave", 236},
330 {"iota", 953},
331 {"iquest", 191},
332 {"iuml", 239},
333 {"kappa", 954},
334 {"lambda", 955},
335 {"laquo", 171},
336 {"ldquo", 8220},
337 {"ldots", 8230},
338 {"lsquo", 8216},
339 {"lt", 60},
340 {"macr", 175},
341 {"mdash", 8212},
342 {"micro", 181},
343 {"middot", 183},
344 {"mu", 956},
345 {"nbsp", 160},
346 {"ndash", 8211},
347 {"not", 172},
348 {"ntilde", 241},
349 {"nu", 957},
350 {"oacute", 243},
351 {"ocirc", 244},
352 {"ograve", 242},
353 {"omega", 969},
354 {"omicron", 959},
355 {"ordf", 170},
356 {"ordm", 186},
357 {"oslash", 248},
358 {"otilde", 245},
359 {"ouml", 246},
360 {"para", 182},
361 {"phi", 966},
362 {"pi", 960},
363 {"plusmn", 177},
364 {"pound", 163},
365 {"psi", 968},
366 {"quad", 8193},
367 {"quot", 34},
368 {"raquo", 187},
369 {"rdquo", 8221},
370 {"reg", 174},
371 {"rho", 961},
372 {"rsquo", 8217},
373 {"sect", 167},
374 {"shy", 173},
375 {"sigma", 963},
376 {"sp", 8194},
377 {"sup1", 185},
378 {"sup2", 178},
379 {"sup3", 179},
380 {"szlig", 223},
381 {"tau", 964},
382 {"theta", 952},
383 {"thinsp", 8201},
384 {"thorn", 254},
385 {"times", 215},
386 {"trade", 8482},
387 {"uacute", 250},
388 {"ucirc", 251},
389 {"ugrave", 249},
390 {"uml", 168},
391 {"upsilon", 965},
392 {"uuml", 252},
393 {"varepsilon", 8712},
394 {"varphi", 981},
395 {"varpi", 982},
396 {"varrho", 1009},
397 {"vdots", 8942},
398 {"vsigma", 962},
399 {"vtheta", 977},
400 {"xi", 958},
401 {"yacute", 253},
402 {"yen", 165},
403 {"yuml", 255},
404 {"zeta", 950}
405 };
406 #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
/* Characters Winstart..Winend are those that Windows */
/* uses interpolated into the Latin1 set. */
/* They aren't supposed to appear in HTML, but they do.... */
enum {
	Winstart = 127,
	Winend = 159
};

/* Replacement code points, indexed by (c - Winstart); */
/* one entry for each value in Winstart..Winend. */
static int winchars[]= { 8226,	/* 8226 is a bullet */
	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	732, 8482, 353, 8250, 339, 8226, 8226, 376};
422 static StringInt* tagtable; /* initialized from tagnames */
423 static StringInt* attrtable; /* initialized from attrnames */
425 static void lexinit(void);
426 static int getplaindata(TokenSource* ts, Token* a, int* pai);
427 static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
428 static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
429 static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
430 static Rune* buftostr(Rune* s, Rune* buf, int j);
431 static int comment(TokenSource* ts);
432 static int findstr(TokenSource* ts, Rune* s);
433 static int ampersand(TokenSource* ts);
434 /*static int lowerc(int c); */
435 static int getchar(TokenSource* ts);
436 static void ungetchar(TokenSource* ts, int c);
437 static void backup(TokenSource* ts, int savei);
438 /*static void freeinsidetoken(Token* t); */
439 static void freeattrs(Attr* ahead);
440 static Attr* newattr(int attid, Rune* value, Attr* link);
441 static int Tconv(Fmt* f);
443 int dbglex = 0;
444 static int lexinited = 0;
446 static void
447 lexinit(void)
449 chartab = _cvtstringinttab(_chartab, nelem(_chartab));
450 tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
451 tagtable = _makestrinttab(tagnames, Numtags);
452 attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
453 attrtable = _makestrinttab(attrnames, Numattrs);
454 fmtinstall('T', Tconv);
455 lexinited = 1;
458 static TokenSource*
459 newtokensource(uchar* data, int edata, int chset, int mtype)
461 TokenSource* ans;
463 assert(chset == US_Ascii || chset == ISO_8859_1 ||
464 chset == UTF_8 || chset == Unicode);
465 ans = (TokenSource*)emalloc(sizeof(TokenSource));
466 ans->i = 0;
467 ans->data = data;
468 ans->edata = edata;
469 ans->chset = chset;
470 ans->mtype = mtype;
471 return ans;
474 enum {
475 ToksChunk = 500
476 };
478 /* Call this to get the tokens. */
479 /* The number of returned tokens is returned in *plen. */
480 Token*
481 _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
483 TokenSource* ts;
484 Token* a;
485 int alen;
486 int ai;
487 int starti;
488 int c;
489 int tag;
491 if(!lexinited)
492 lexinit();
493 ts = newtokensource(data, datalen, chset, mtype);
494 alen = ToksChunk;
495 a = (Token*)emalloc(alen * sizeof(Token));
496 ai = 0;
497 if(dbglex)
498 fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
499 if(ts->mtype == TextHtml){
500 for(;;){
501 if(ai == alen){
502 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
503 alen += ToksChunk;
505 starti = ts->i;
506 c = getchar(ts);
507 if(c < 0)
508 break;
509 if(c == '<'){
510 tag = gettag(ts, starti, a, &ai);
511 if(tag == Tscript){
512 /* special rules for getting Data after.... */
513 starti = ts->i;
514 c = getchar(ts);
515 tag = getscriptdata(ts, c, starti, a, &ai);
518 else
519 tag = getdata(ts, c, starti, a, &ai);
520 if(tag == -1)
521 break;
522 else if(dbglex > 1 && tag != Comment)
523 fprint(2, "lex: got token %T\n", &a[ai-1]);
526 else {
527 /* plain text (non-html) tokens */
528 for(;;){
529 if(ai == alen){
530 a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
531 alen += ToksChunk;
533 tag = getplaindata(ts, a, &ai);
534 if(tag == -1)
535 break;
536 if(dbglex > 1)
537 fprint(2, "lex: got token %T\n", &a[ai]);
540 if(dbglex)
541 fprint(2, "lex: returning %d tokens\n", ai);
542 *plen = ai;
543 if(ai == 0)
544 return nil;
545 return a;
548 /* For case where source isn't HTML. */
549 /* Just make data tokens, one per line (or partial line, */
550 /* at end of buffer), ignoring non-whitespace control */
551 /* characters and dumping \r's. */
552 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
553 /* Otherwise return -1; */
554 static int
555 getplaindata(TokenSource* ts, Token* a, int* pai)
557 Rune* s;
558 int j;
559 int starti;
560 int c;
561 Token* tok;
562 Rune buf[BIGBUFSIZE];
564 s = nil;
565 j = 0;
566 starti = ts->i;
567 for(c = getchar(ts); c >= 0; c = getchar(ts)){
568 if(c < ' '){
569 if(isspace(c)){
570 if(c == '\r'){
571 /* ignore it unless no following '\n', */
572 /* in which case treat it like '\n' */
573 c = getchar(ts);
574 if(c != '\n'){
575 if(c >= 0)
576 ungetchar(ts, c);
577 c = '\n';
581 else
582 c = 0;
584 if(c != 0){
585 buf[j++] = c;
586 if(j == sizeof(buf)-1){
587 s = buftostr(s, buf, j);
588 j = 0;
591 if(c == '\n')
592 break;
594 s = buftostr(s, buf, j);
595 if(s == nil)
596 return -1;
597 tok = &a[(*pai)++];
598 tok->tag = Data;
599 tok->text = s;
600 tok->attr = nil;
601 tok->starti = starti;
602 return Data;
605 /* Return concatenation of s and buf[0:j] */
606 static Rune*
607 buftostr(Rune* s, Rune* buf, int j)
609 buf[j] = 0;
610 if(s == nil)
611 s = _Strndup(buf, j);
612 else
613 s = _Strdup2(s, buf);
614 return s;
617 /* Gather data up to next start-of-tag or end-of-buffer. */
618 /* Translate entity references (&amp;). */
619 /* Ignore non-whitespace control characters and get rid of \r's. */
620 /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
621 /* Otherwise return -1; */
622 static int
623 getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
625 Rune* s;
626 int j;
627 int c;
628 Token* tok;
629 Rune buf[BIGBUFSIZE];
631 s = nil;
632 j = 0;
633 c = firstc;
634 while(c >= 0){
635 if(c == '&'){
636 c = ampersand(ts);
637 if(c < 0)
638 break;
640 else if(c < ' '){
641 if(isspace(c)){
642 if(c == '\r'){
643 /* ignore it unless no following '\n', */
644 /* in which case treat it like '\n' */
645 c = getchar(ts);
646 if(c != '\n'){
647 if(c >= 0)
648 ungetchar(ts, c);
649 c = '\n';
653 else {
654 if(warn)
655 fprint(2, "warning: non-whitespace control character %d ignored\n", c);
656 c = 0;
659 else if(c == '<'){
660 ungetchar(ts, c);
661 break;
663 if(c != 0){
664 buf[j++] = c;
665 if(j == BIGBUFSIZE-1){
666 s = buftostr(s, buf, j);
667 j = 0;
670 c = getchar(ts);
672 s = buftostr(s, buf, j);
673 if(s == nil)
674 return -1;
675 tok = &a[(*pai)++];
676 tok->tag = Data;
677 tok->text = s;
678 tok->attr = nil;
679 tok->starti = starti;
680 return Data;
683 /* The rules for lexing scripts are different (ugh). */
684 /* Gather up everything until see a </SCRIPT>. */
685 static int
686 getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
688 Rune* s;
689 int j;
690 int tstarti;
691 int savei;
692 int c;
693 int tag;
694 int done;
695 Token* tok;
696 Rune buf[BIGBUFSIZE];
698 s = nil;
699 j = 0;
700 tstarti = starti;
701 c = firstc;
702 done = 0;
703 while(c >= 0){
704 if(c == '<'){
705 /* other browsers ignore stuff to end of line after <! */
706 savei = ts->i;
707 c = getchar(ts);
708 if(c == '!'){
709 while(c >= 0 && c != '\n' && c != '\r')
710 c = getchar(ts);
711 if(c == '\r')
712 c = getchar(ts);
713 if(c == '\n')
714 c = getchar(ts);
716 else if(c >= 0){
717 backup(ts, savei);
718 tag = gettag(ts, tstarti, a, pai);
719 if(tag == -1)
720 break;
721 if(tag != Comment)
722 (*pai)--;
723 backup(ts, tstarti);
724 if(tag == Tscript + RBRA){
725 done = 1;
726 break;
728 /* here tag was not </SCRIPT>, so take as regular data */
729 c = getchar(ts);
732 if(c < 0)
733 break;
734 if(c != 0){
735 buf[j++] = c;
736 if(j == BIGBUFSIZE-1){
737 s = buftostr(s, buf, j);
738 j = 0;
741 tstarti = ts->i;
742 c = getchar(ts);
744 if(done || ts->i == ts->edata){
745 s = buftostr(s, buf, j);
746 tok = &a[(*pai)++];
747 tok->tag = Data;
748 tok->text = s;
749 tok->attr = nil;
750 tok->starti = starti;
751 return Data;
753 backup(ts, starti);
754 return -1;
757 /* We've just seen a '<'. Gather up stuff to closing '>' (if buffer */
758 /* ends before then, return -1). */
759 /* If it's a tag, look up the name, gather the attributes, and return */
760 /* the appropriate token. */
761 /* Else it's either just plain data or some kind of ignorable stuff: */
762 /* return Data or Comment as appropriate. */
763 /* If it's not a Comment, put it in a[*pai] and bump *pai. */
764 static int
765 gettag(TokenSource* ts, int starti, Token* a, int* pai)
767 int rbra;
768 int ans;
769 Attr* al;
770 int nexti;
771 int c;
772 int ti;
773 int afnd;
774 int attid;
775 int quote;
776 Rune* val;
777 int nv;
778 int i;
779 int tag;
780 Token* tok;
781 Rune buf[BIGBUFSIZE];
783 rbra = 0;
784 nexti = ts->i;
785 tok = &a[*pai];
786 tok->tag = Notfound;
787 tok->text = nil;
788 tok->attr = nil;
789 tok->starti = starti;
790 c = getchar(ts);
791 if(c == '/'){
792 rbra = RBRA;
793 c = getchar(ts);
795 if(c < 0)
796 goto eob_done;
797 if(c >= 256 || !isalpha(c)){
798 /* not a tag */
799 if(c == '!'){
800 ans = comment(ts);
801 if(ans != -1)
802 return ans;
803 goto eob_done;
805 else {
806 backup(ts, nexti);
807 tok->tag = Data;
808 tok->text = _Strdup(L(Llt));
809 (*pai)++;
810 return Data;
813 /* c starts a tagname */
814 buf[0] = c;
815 i = 1;
816 for(;;){
817 c = getchar(ts);
818 if(c < 0)
819 goto eob_done;
820 if(!ISNAMCHAR(c))
821 break;
822 /* if name is bigger than buf it won't be found anyway... */
823 if(i < BIGBUFSIZE)
824 buf[i++] = c;
826 if(_lookup(tagtable, Numtags, buf, i, &tag))
827 tok->tag = tag + rbra;
828 else
829 tok->text = _Strndup(buf, i); /* for warning print, in build */
831 /* attribute gathering loop */
832 al = nil;
833 for(;;){
834 /* look for "ws name" or "ws name ws = ws val" (ws=whitespace) */
835 /* skip whitespace */
836 attrloop_continue:
837 while(c < 256 && isspace(c)){
838 c = getchar(ts);
839 if(c < 0)
840 goto eob_done;
842 if(c == '>')
843 goto attrloop_done;
844 if(c == '<'){
845 if(warn)
846 fprint(2, "warning: unclosed tag\n");
847 ungetchar(ts, c);
848 goto attrloop_done;
850 if(c >= 256 || !isalpha(c)){
851 if(warn)
852 fprint(2, "warning: expected attribute name\n");
853 /* skipt to next attribute name */
854 for(;;){
855 c = getchar(ts);
856 if(c < 0)
857 goto eob_done;
858 if(c < 256 && isalpha(c))
859 goto attrloop_continue;
860 if(c == '<'){
861 if(warn)
862 fprint(2, "warning: unclosed tag\n");
863 ungetchar(ts, 60);
864 goto attrloop_done;
866 if(c == '>')
867 goto attrloop_done;
870 /* gather attribute name */
871 buf[0] = c;
872 i = 1;
873 for(;;){
874 c = getchar(ts);
875 if(c < 0)
876 goto eob_done;
877 if(!ISNAMCHAR(c))
878 break;
879 if(i < BIGBUFSIZE-1)
880 buf[i++] = c;
882 afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
883 if(warn && !afnd){
884 buf[i] = 0;
885 fprint(2, "warning: unknown attribute name %S\n", buf);
887 /* skip whitespace */
888 while(c < 256 && isspace(c)){
889 c = getchar(ts);
890 if(c < 0)
891 goto eob_done;
893 if(c != '='){
894 if(afnd)
895 al = newattr(attid, nil, al);
896 goto attrloop_continue;
898 /*# c is '=' here; skip whitespace */
899 for(;;){
900 c = getchar(ts);
901 if(c < 0)
902 goto eob_done;
903 if(c >= 256 || !isspace(c))
904 break;
906 quote = 0;
907 if(c == '\'' || c == '"'){
908 quote = c;
909 c = getchar(ts);
910 if(c < 0)
911 goto eob_done;
913 val = nil;
914 nv = 0;
915 for(;;){
916 valloop_continue:
917 if(c < 0)
918 goto eob_done;
919 if(c == '>'){
920 if(quote){
921 /* c might be part of string (though not good style) */
922 /* but if line ends before close quote, assume */
923 /* there was an unmatched quote */
924 ti = ts->i;
925 for(;;){
926 c = getchar(ts);
927 if(c < 0)
928 goto eob_done;
929 if(c == quote){
930 backup(ts, ti);
931 buf[nv++] = '>';
932 if(nv == BIGBUFSIZE-1){
933 val = buftostr(val, buf, nv);
934 nv = 0;
936 c = getchar(ts);
937 goto valloop_continue;
939 if(c == '\n'){
940 if(warn)
941 fprint(2, "warning: apparent unmatched quote\n");
942 backup(ts, ti);
943 c = '>';
944 goto valloop_done;
948 else
949 goto valloop_done;
951 if(quote){
952 if(c == quote){
953 c = getchar(ts);
954 if(c < 0)
955 goto eob_done;
956 goto valloop_done;
958 if(c == '\r'){
959 c = getchar(ts);
960 goto valloop_continue;
962 if(c == '\t' || c == '\n')
963 c = ' ';
965 else {
966 if(c < 256 && isspace(c))
967 goto valloop_done;
969 if(c == '&'){
970 c = ampersand(ts);
971 if(c == -1)
972 goto eob_done;
974 buf[nv++] = c;
975 if(nv == BIGBUFSIZE-1){
976 val = buftostr(val, buf, nv);
977 nv = 0;
979 c = getchar(ts);
981 valloop_done:
982 if(afnd){
983 val = buftostr(val, buf, nv);
984 al = newattr(attid, val, al);
988 attrloop_done:
989 tok->attr = al;
990 (*pai)++;
991 return tok->tag;
993 eob_done:
994 if(warn)
995 fprint(2, "warning: incomplete tag at end of page\n");
996 backup(ts, nexti);
997 tok->tag = Data;
998 tok->text = _Strdup(L(Llt));
999 return Data;
1002 /* We've just read a '<!' at position starti, */
1003 /* so this may be a comment or other ignored section, or it may */
1004 /* be just a literal string if there is no close before end of file */
1005 /* (other browsers do that). */
1006 /* The accepted practice seems to be (note: contrary to SGML spec!): */
1007 /* If see <!--, look for --> to close, or if none, > to close. */
1008 /* If see <!(not --), look for > to close. */
1009 /* If no close before end of file, leave original characters in as literal data. */
1010 /* */
1011 /* If we see ignorable stuff, return Comment. */
1012 /* Else return nil (caller should back up and try again when more data arrives, */
1013 /* unless at end of file, in which case caller should just make '<' a data token). */
1014 static int
1015 comment(TokenSource* ts)
1017 int nexti;
1018 int havecomment;
1019 int c;
1021 nexti = ts->i;
1022 havecomment = 0;
1023 c = getchar(ts);
1024 if(c == '-'){
1025 c = getchar(ts);
1026 if(c == '-'){
1027 if(findstr(ts, L(Larrow)))
1028 havecomment = 1;
1029 else
1030 backup(ts, nexti);
1033 if(!havecomment){
1034 if(c == '>')
1035 havecomment = 1;
1036 else if(c >= 0){
1037 if(findstr(ts, L(Lgt)))
1038 havecomment = 1;
1041 if(havecomment)
1042 return Comment;
1043 return -1;
1046 /* Look for string s in token source. */
1047 /* If found, return 1, with buffer at next char after s, */
1048 /* else return 0 (caller should back up). */
1049 static int
1050 findstr(TokenSource* ts, Rune* s)
1052 int c0;
1053 int n;
1054 int nexti;
1055 int i;
1056 int c;
1058 c0 = s[0];
1059 n = runestrlen(s);
1060 for(;;){
1061 c = getchar(ts);
1062 if(c < 0)
1063 break;
1064 if(c == c0){
1065 if(n == 1)
1066 return 1;
1067 nexti = ts->i;
1068 for(i = 1; i < n; i++){
1069 c = getchar(ts);
1070 if(c < 0)
1071 goto mainloop_done;
1072 if(c != s[i])
1073 break;
1075 if(i == n)
1076 return 1;
1077 backup(ts, nexti);
1080 mainloop_done:
1081 return 0;
/* Return the value (0..15) of hexadecimal digit c, */
/* or -1 if c is not a hexadecimal digit. */
static int
xdigit(int c)
{
	if('0' <= c && c <= '9')
		return c-'0';
	if('a' <= c && c <= 'f')
		return c-'a'+10;
	if('A' <= c && c <= 'F')
		return c-'A'+10;
	return -1;
}
1096 /* We've just read an '&'; look for an entity reference */
1097 /* name, and if found, return translated char. */
1098 /* if there is a complete entity name but it isn't known, */
1099 /* try prefixes (gets around some buggy HTML out there), */
1100 /* and if that fails, back up to just past the '&' and return '&'. */
1101 /* If the entity can't be completed in the current buffer, back up */
1102 /* to the '&' and return -1. */
1103 static int
1104 ampersand(TokenSource* ts)
1106 int savei;
1107 int c;
1108 int fnd;
1109 int ans;
1110 int v;
1111 int i;
1112 int k;
1113 Rune buf[SMALLBUFSIZE];
1115 savei = ts->i;
1116 c = getchar(ts);
1117 fnd = 0;
1118 ans = -1;
1119 if(c == '#'){
1120 c = getchar(ts);
1121 v = 0;
1122 if(c == 'x'){
1123 c = getchar(ts);
1124 while((i=xdigit(c)) != -1){
1125 v = v*16 + i;
1126 c = getchar(ts);
1128 }else{
1129 while('0' <= c && c <= '9'){
1130 v = v*10 + c - '0';
1131 c = getchar(ts);
1134 if(c >= 0){
1135 if(!(c == ';' || c == '\n' || c == '\r'))
1136 ungetchar(ts, c);
1137 c = v;
1138 if(c == 160)
1139 c = 160;
1140 if(c >= Winstart && c <= Winend){
1141 c = winchars[c - Winstart];
1143 ans = c;
1144 fnd = 1;
1147 else if(c < 256 && isalpha(c)){
1148 buf[0] = c;
1149 k = 1;
1150 for(;;){
1151 c = getchar(ts);
1152 if(c < 0)
1153 break;
1154 if(ISNAMCHAR(c)){
1155 if(k < SMALLBUFSIZE-1)
1156 buf[k++] = c;
1158 else {
1159 if(!(c == ';' || c == '\n' || c == '\r'))
1160 ungetchar(ts, c);
1161 break;
1164 if(c >= 0){
1165 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1166 if(!fnd){
1167 /* Try prefixes of s */
1168 if(c == ';' || c == '\n' || c == '\r')
1169 ungetchar(ts, c);
1170 i = k;
1171 while(--k > 0){
1172 fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1173 if(fnd){
1174 while(i > k){
1175 i--;
1176 ungetchar(ts, buf[i]);
1178 break;
1184 if(!fnd){
1185 backup(ts, savei);
1186 ans = '&';
1188 return ans;
1191 /* Get next char, obeying ts.chset. */
1192 /* Returns -1 if no complete character left before current end of data. */
1193 static int
1194 getchar(TokenSource* ts)
1196 uchar* buf;
1197 int c;
1198 int n;
1199 int ok;
1200 Rune r;
1202 if(ts->i >= ts->edata)
1203 return -1;
1204 buf = ts->data;
1205 c = buf[ts->i];
1206 switch(ts->chset){
1207 case ISO_8859_1:
1208 if(c >= Winstart && c <= Winend)
1209 c = winchars[c - Winstart];
1210 ts->i++;
1211 break;
1212 case US_Ascii:
1213 if(c > 127){
1214 if(warn)
1215 fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1217 ts->i++;
1218 break;
1219 case UTF_8:
1220 ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1221 n = chartorune(&r, (char*)(buf+ts->i));
1222 if(ok){
1223 if(warn && c == 0x80)
1224 fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1225 ts->i += n;
1226 c = r;
1228 else {
1229 /* not enough bytes in buf to complete utf-8 char */
1230 ts->i = ts->edata; /* mark "all used" */
1231 c = -1;
1233 break;
1234 case Unicode:
1235 if(ts->i < ts->edata - 1){
1236 /*standards say most-significant byte first */
1237 c = (c << 8)|(buf[ts->i + 1]);
1238 ts->i += 2;
1240 else {
1241 ts->i = ts->edata; /* mark "all used" */
1242 c = -1;
1244 break;
1246 return c;
1249 /* Assuming c was the last character returned by getchar, set */
1250 /* things up so that next getchar will get that same character */
1251 /* followed by the current 'next character', etc. */
1252 static void
1253 ungetchar(TokenSource* ts, int c)
1255 int n;
1256 Rune r;
1257 char a[UTFmax];
1259 n = 1;
1260 switch(ts->chset){
1261 case UTF_8:
1262 if(c >= 128){
1263 r = c;
1264 n = runetochar(a, &r);
1266 break;
1267 case Unicode:
1268 n = 2;
1269 break;
1271 ts->i -= n;
1274 /* Restore ts so that it is at the state where the index was savei. */
1275 static void
1276 backup(TokenSource* ts, int savei)
1278 if(dbglex)
1279 fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1280 ts->i = savei;
1284 /* Look for value associated with attribute attid in token t. */
1285 /* If there is one, return 1 and put the value in *pans, */
1286 /* else return 0. */
1287 /* If xfer is true, transfer ownership of the string to the caller */
1288 /* (nil it out here); otherwise, caller must duplicate the answer */
1289 /* if it needs to save it. */
1290 /* OK to have pans==0, in which case this is just looking */
1291 /* to see if token is present. */
1292 int
1293 _tokaval(Token* t, int attid, Rune** pans, int xfer)
1295 Attr* attr;
1297 attr = t->attr;
1298 while(attr != nil){
1299 if(attr->attid == attid){
1300 if(pans != nil)
1301 *pans = attr->value;
1302 if(xfer)
1303 attr->value = nil;
1304 return 1;
1306 attr = attr->next;
1308 if(pans != nil)
1309 *pans = nil;
1310 return 0;
1313 static int
1314 Tconv(Fmt *f)
1316 Token* t;
1317 int i;
1318 int tag;
1319 char* srbra;
1320 Rune* aname;
1321 Rune* tname;
1322 Attr* a;
1323 char buf[BIGBUFSIZE];
1325 t = va_arg(f->args, Token*);
1326 if(t == nil)
1327 sprint(buf, "<null>");
1328 else {
1329 i = 0;
1330 if(dbglex > 1)
1331 i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1332 tag = t->tag;
1333 if(tag == Data){
1334 i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1336 else {
1337 srbra = "";
1338 if(tag >= RBRA){
1339 tag -= RBRA;
1340 srbra = "/";
1342 tname = tagnames[tag];
1343 if(tag == Notfound)
1344 tname = L(Lquestion);
1345 i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1346 for(a = t->attr; a != nil; a = a->next){
1347 aname = attrnames[a->attid];
1348 i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1349 if(a->value != nil)
1350 i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1352 i += snprint(buf+i, sizeof(buf)-i-1, ">");
1354 buf[i] = 0;
1356 return fmtstrcpy(f, buf);
1359 /* Attrs own their constituent strings, but build may eventually */
1360 /* transfer some values to its items and nil them out in the Attr. */
1361 static Attr*
1362 newattr(int attid, Rune* value, Attr* link)
1364 Attr* ans;
1366 ans = (Attr*)emalloc(sizeof(Attr));
1367 ans->attid = attid;
1368 ans->value = value;
1369 ans->next = link;
1370 return ans;
1373 /* Free list of Attrs linked through next field */
1374 static void
1375 freeattrs(Attr* ahead)
1377 Attr* a;
1378 Attr* nexta;
1380 a = ahead;
1381 while(a != nil){
1382 nexta = a->next;
1383 free(a->value);
1384 free(a);
1385 a = nexta;
1389 /* Free array of Tokens. */
1390 /* Allocated space might have room for more than n tokens, */
1391 /* but only n of them are initialized. */
1392 /* If caller has transferred ownership of constitutent strings */
1393 /* or attributes, it must have nil'd out the pointers in the Tokens. */
1394 void
1395 _freetokens(Token* tarray, int n)
1397 int i;
1398 Token* t;
1400 if(tarray == nil)
1401 return;
1402 for(i = 0; i < n; i++){
1403 t = &tarray[i];
1404 free(t->text);
1405 freeattrs(t->attr);
1407 free(tarray);