Blame


1 7cf289ca 2004-04-06 devnull #include <u.h>
2 7cf289ca 2004-04-06 devnull #include <libc.h>
3 7cf289ca 2004-04-06 devnull #include <draw.h>
4 7cf289ca 2004-04-06 devnull #include <ctype.h>
5 7cf289ca 2004-04-06 devnull #include <html.h>
6 7cf289ca 2004-04-06 devnull #include "impl.h"
7 7cf289ca 2004-04-06 devnull
8 7cf289ca 2004-04-06 devnull typedef struct TokenSource TokenSource;
9 7cf289ca 2004-04-06 devnull struct TokenSource
10 7cf289ca 2004-04-06 devnull {
11 cbeb0b26 2006-04-01 devnull int i; /* index of next byte to use */
12 cbeb0b26 2006-04-01 devnull uchar* data; /* all the data */
13 cbeb0b26 2006-04-01 devnull int edata; /* data[0:edata] is valid */
14 cbeb0b26 2006-04-01 devnull int chset; /* one of US_Ascii, etc. */
15 cbeb0b26 2006-04-01 devnull int mtype; /* TextHtml or TextPlain */
16 7cf289ca 2004-04-06 devnull };
17 7cf289ca 2004-04-06 devnull
18 7cf289ca 2004-04-06 devnull enum {
19 7cf289ca 2004-04-06 devnull EOF = -2,
20 7cf289ca 2004-04-06 devnull EOB = -1
21 7cf289ca 2004-04-06 devnull };
22 7cf289ca 2004-04-06 devnull
/*
 * ISNAMCHAR(c) is nonzero when c is below 256 and is a letter, a digit,
 * '-' or '.'.  The argument is expanded several times, so it must be
 * an expression without side effects.
 */
#define ISNAMCHAR(c)	((c) < 256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))

/*
 * Scratch buffer lengths.  BIGBUFSIZE is the length of the Rune buffers
 * used while gathering data tokens.
 */
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 7cf289ca 2004-04-06 devnull
28 cbeb0b26 2006-04-01 devnull /* HTML 4.0 tag names. */
29 cbeb0b26 2006-04-01 devnull /* Keep sorted, and in correspondence with enum in iparse.h. */
30 7cf289ca 2004-04-06 devnull Rune **tagnames;
/*
 * ASCII spellings of the tag names, in sorted order and grouped here
 * by first letter.  lexinit() converts this table into tagnames.
 * The position of each name matters: keep the order unchanged.
 */
char *_tagnames[] = {
	" ", "!",
	"a", "abbr", "acronym", "address", "applet", "area",
	"b", "base", "basefont", "bdo", "big", "blink", "blockquote", "body",
	"bq", "br", "button",
	"caption", "center", "cite", "code", "col", "colgroup",
	"dd", "del", "dfn", "dir", "div", "dl", "dt",
	"em",
	"fieldset", "font", "form", "frame", "frameset",
	"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html",
	"i", "iframe", "img", "input", "ins", "isindex",
	"kbd",
	"label", "legend", "li", "link",
	"map", "menu", "meta",
	"nobr", "noframes", "noscript",
	"object", "ol", "optgroup", "option",
	"p", "param", "pre",
	"q",
	"s", "samp", "script", "select", "small", "span", "strike", "strong",
	"style", "sub", "sup",
	"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title",
	"tr", "tt",
	"u", "ul",
	"var"
};
129 7cf289ca 2004-04-06 devnull
130 cbeb0b26 2006-04-01 devnull /* HTML 4.0 attribute names. */
131 cbeb0b26 2006-04-01 devnull /* Keep sorted, and in correspondence with enum in i.h. */
132 7cf289ca 2004-04-06 devnull Rune **attrnames;
/*
 * ASCII spellings of the attribute names, in sorted order and grouped
 * here by first letter.  lexinit() converts this table into attrnames.
 * The position of each name matters: keep the order unchanged.
 */
char* _attrnames[] = {
	"abbr", "accept-charset", "access-key", "action", "align", "alink",
	"alt", "archive", "axis",
	"background", "bgcolor", "border",
	"cellpadding", "cellspacing", "char", "charoff", "charset", "checked",
	"cite", "class", "classid", "clear", "code", "codebase", "codetype",
	"color", "cols", "colspan", "compact", "content", "coords",
	"data", "datetime", "declare", "defer", "dir", "disabled",
	"enctype",
	"face", "for", "frame", "frameborder",
	"headers", "height", "href", "hreflang", "hspace", "http-equiv",
	"id", "ismap",
	"label", "lang", "link", "longdesc",
	"marginheight", "marginwidth", "maxlength", "media", "method", "multiple",
	"name", "nohref", "noresize", "noshade", "nowrap",
	"object", "onblur", "onchange", "onclick", "ondblclick", "onfocus",
	"onkeypress", "onkeyup", "onload", "onmousedown", "onmousemove",
	"onmouseout", "onmouseover", "onmouseup", "onreset", "onselect",
	"onsubmit", "onunload",
	"profile", "prompt",
	"readonly", "rel", "rev", "rows", "rowspan", "rules",
	"scheme", "scope", "scrolling", "selected", "shape", "size", "span",
	"src", "standby", "start", "style", "summary",
	"tabindex", "target", "text", "title", "type",
	"usemap",
	"valign", "value", "valuetype", "version", "vlink", "vspace",
	"width"
};
251 7cf289ca 2004-04-06 devnull
252 7cf289ca 2004-04-06 devnull
253 cbeb0b26 2006-04-01 devnull /* Character entity to unicode character number map. */
254 cbeb0b26 2006-04-01 devnull /* Keep sorted by name. */
255 7cf289ca 2004-04-06 devnull StringInt *chartab;
256 5b76ae26 2005-09-19 devnull AsciiInt _chartab[] = {
257 7cf289ca 2004-04-06 devnull {"AElig", 198},
258 7cf289ca 2004-04-06 devnull {"Aacute", 193},
259 7cf289ca 2004-04-06 devnull {"Acirc", 194},
260 7cf289ca 2004-04-06 devnull {"Agrave", 192},
261 7cf289ca 2004-04-06 devnull {"Aring", 197},
262 7cf289ca 2004-04-06 devnull {"Atilde", 195},
263 7cf289ca 2004-04-06 devnull {"Auml", 196},
264 7cf289ca 2004-04-06 devnull {"Ccedil", 199},
265 7cf289ca 2004-04-06 devnull {"ETH", 208},
266 7cf289ca 2004-04-06 devnull {"Eacute", 201},
267 7cf289ca 2004-04-06 devnull {"Ecirc", 202},
268 7cf289ca 2004-04-06 devnull {"Egrave", 200},
269 7cf289ca 2004-04-06 devnull {"Euml", 203},
270 7cf289ca 2004-04-06 devnull {"Iacute", 205},
271 7cf289ca 2004-04-06 devnull {"Icirc", 206},
272 7cf289ca 2004-04-06 devnull {"Igrave", 204},
273 7cf289ca 2004-04-06 devnull {"Iuml", 207},
274 7cf289ca 2004-04-06 devnull {"Ntilde", 209},
275 7cf289ca 2004-04-06 devnull {"Oacute", 211},
276 7cf289ca 2004-04-06 devnull {"Ocirc", 212},
277 7cf289ca 2004-04-06 devnull {"Ograve", 210},
278 7cf289ca 2004-04-06 devnull {"Oslash", 216},
279 7cf289ca 2004-04-06 devnull {"Otilde", 213},
280 7cf289ca 2004-04-06 devnull {"Ouml", 214},
281 7cf289ca 2004-04-06 devnull {"THORN", 222},
282 7cf289ca 2004-04-06 devnull {"Uacute", 218},
283 7cf289ca 2004-04-06 devnull {"Ucirc", 219},
284 7cf289ca 2004-04-06 devnull {"Ugrave", 217},
285 7cf289ca 2004-04-06 devnull {"Uuml", 220},
286 7cf289ca 2004-04-06 devnull {"Yacute", 221},
287 7cf289ca 2004-04-06 devnull {"aacute", 225},
288 7cf289ca 2004-04-06 devnull {"acirc", 226},
289 7cf289ca 2004-04-06 devnull {"acute", 180},
290 7cf289ca 2004-04-06 devnull {"aelig", 230},
291 7cf289ca 2004-04-06 devnull {"agrave", 224},
292 7cf289ca 2004-04-06 devnull {"alpha", 945},
293 7cf289ca 2004-04-06 devnull {"amp", 38},
294 7cf289ca 2004-04-06 devnull {"aring", 229},
295 7cf289ca 2004-04-06 devnull {"atilde", 227},
296 7cf289ca 2004-04-06 devnull {"auml", 228},
297 7cf289ca 2004-04-06 devnull {"beta", 946},
298 7cf289ca 2004-04-06 devnull {"brvbar", 166},
299 7cf289ca 2004-04-06 devnull {"ccedil", 231},
300 7cf289ca 2004-04-06 devnull {"cdots", 8943},
301 7cf289ca 2004-04-06 devnull {"cedil", 184},
302 7cf289ca 2004-04-06 devnull {"cent", 162},
303 7cf289ca 2004-04-06 devnull {"chi", 967},
304 7cf289ca 2004-04-06 devnull {"copy", 169},
305 7cf289ca 2004-04-06 devnull {"curren", 164},
306 7cf289ca 2004-04-06 devnull {"ddots", 8945},
307 7cf289ca 2004-04-06 devnull {"deg", 176},
308 7cf289ca 2004-04-06 devnull {"delta", 948},
309 7cf289ca 2004-04-06 devnull {"divide", 247},
310 7cf289ca 2004-04-06 devnull {"eacute", 233},
311 7cf289ca 2004-04-06 devnull {"ecirc", 234},
312 7cf289ca 2004-04-06 devnull {"egrave", 232},
313 5b76ae26 2005-09-19 devnull {"emdash", 8212}, /* non-standard but commonly used */
314 7cf289ca 2004-04-06 devnull {"emsp", 8195},
315 5b76ae26 2005-09-19 devnull {"endash", 8211}, /* non-standard but commonly used */
316 7cf289ca 2004-04-06 devnull {"ensp", 8194},
317 7cf289ca 2004-04-06 devnull {"epsilon", 949},
318 7cf289ca 2004-04-06 devnull {"eta", 951},
319 7cf289ca 2004-04-06 devnull {"eth", 240},
320 7cf289ca 2004-04-06 devnull {"euml", 235},
321 7cf289ca 2004-04-06 devnull {"frac12", 189},
322 7cf289ca 2004-04-06 devnull {"frac14", 188},
323 7cf289ca 2004-04-06 devnull {"frac34", 190},
324 7cf289ca 2004-04-06 devnull {"gamma", 947},
325 7cf289ca 2004-04-06 devnull {"gt", 62},
326 7cf289ca 2004-04-06 devnull {"iacute", 237},
327 7cf289ca 2004-04-06 devnull {"icirc", 238},
328 7cf289ca 2004-04-06 devnull {"iexcl", 161},
329 7cf289ca 2004-04-06 devnull {"igrave", 236},
330 7cf289ca 2004-04-06 devnull {"iota", 953},
331 7cf289ca 2004-04-06 devnull {"iquest", 191},
332 7cf289ca 2004-04-06 devnull {"iuml", 239},
333 7cf289ca 2004-04-06 devnull {"kappa", 954},
334 7cf289ca 2004-04-06 devnull {"lambda", 955},
335 7cf289ca 2004-04-06 devnull {"laquo", 171},
336 431e32de 2005-09-30 devnull {"ldquo", 8220},
337 7cf289ca 2004-04-06 devnull {"ldots", 8230},
338 431e32de 2005-09-30 devnull {"lsquo", 8216},
339 7cf289ca 2004-04-06 devnull {"lt", 60},
340 7cf289ca 2004-04-06 devnull {"macr", 175},
341 5b76ae26 2005-09-19 devnull {"mdash", 8212},
342 7cf289ca 2004-04-06 devnull {"micro", 181},
343 7cf289ca 2004-04-06 devnull {"middot", 183},
344 7cf289ca 2004-04-06 devnull {"mu", 956},
345 7cf289ca 2004-04-06 devnull {"nbsp", 160},
346 5b76ae26 2005-09-19 devnull {"ndash", 8211},
347 7cf289ca 2004-04-06 devnull {"not", 172},
348 7cf289ca 2004-04-06 devnull {"ntilde", 241},
349 7cf289ca 2004-04-06 devnull {"nu", 957},
350 7cf289ca 2004-04-06 devnull {"oacute", 243},
351 7cf289ca 2004-04-06 devnull {"ocirc", 244},
352 7cf289ca 2004-04-06 devnull {"ograve", 242},
353 7cf289ca 2004-04-06 devnull {"omega", 969},
354 7cf289ca 2004-04-06 devnull {"omicron", 959},
355 7cf289ca 2004-04-06 devnull {"ordf", 170},
356 7cf289ca 2004-04-06 devnull {"ordm", 186},
357 7cf289ca 2004-04-06 devnull {"oslash", 248},
358 7cf289ca 2004-04-06 devnull {"otilde", 245},
359 7cf289ca 2004-04-06 devnull {"ouml", 246},
360 7cf289ca 2004-04-06 devnull {"para", 182},
361 7cf289ca 2004-04-06 devnull {"phi", 966},
362 7cf289ca 2004-04-06 devnull {"pi", 960},
363 7cf289ca 2004-04-06 devnull {"plusmn", 177},
364 7cf289ca 2004-04-06 devnull {"pound", 163},
365 7cf289ca 2004-04-06 devnull {"psi", 968},
366 7cf289ca 2004-04-06 devnull {"quad", 8193},
367 7cf289ca 2004-04-06 devnull {"quot", 34},
368 7cf289ca 2004-04-06 devnull {"raquo", 187},
369 431e32de 2005-09-30 devnull {"rdquo", 8221},
370 7cf289ca 2004-04-06 devnull {"reg", 174},
371 7cf289ca 2004-04-06 devnull {"rho", 961},
372 431e32de 2005-09-30 devnull {"rsquo", 8217},
373 7cf289ca 2004-04-06 devnull {"sect", 167},
374 7cf289ca 2004-04-06 devnull {"shy", 173},
375 7cf289ca 2004-04-06 devnull {"sigma", 963},
376 7cf289ca 2004-04-06 devnull {"sp", 8194},
377 7cf289ca 2004-04-06 devnull {"sup1", 185},
378 7cf289ca 2004-04-06 devnull {"sup2", 178},
379 7cf289ca 2004-04-06 devnull {"sup3", 179},
380 7cf289ca 2004-04-06 devnull {"szlig", 223},
381 7cf289ca 2004-04-06 devnull {"tau", 964},
382 7cf289ca 2004-04-06 devnull {"theta", 952},
383 7cf289ca 2004-04-06 devnull {"thinsp", 8201},
384 7cf289ca 2004-04-06 devnull {"thorn", 254},
385 7cf289ca 2004-04-06 devnull {"times", 215},
386 7cf289ca 2004-04-06 devnull {"trade", 8482},
387 7cf289ca 2004-04-06 devnull {"uacute", 250},
388 7cf289ca 2004-04-06 devnull {"ucirc", 251},
389 7cf289ca 2004-04-06 devnull {"ugrave", 249},
390 7cf289ca 2004-04-06 devnull {"uml", 168},
391 7cf289ca 2004-04-06 devnull {"upsilon", 965},
392 7cf289ca 2004-04-06 devnull {"uuml", 252},
393 7cf289ca 2004-04-06 devnull {"varepsilon", 8712},
394 7cf289ca 2004-04-06 devnull {"varphi", 981},
395 7cf289ca 2004-04-06 devnull {"varpi", 982},
396 7cf289ca 2004-04-06 devnull {"varrho", 1009},
397 7cf289ca 2004-04-06 devnull {"vdots", 8942},
398 7cf289ca 2004-04-06 devnull {"vsigma", 962},
399 7cf289ca 2004-04-06 devnull {"vtheta", 977},
400 7cf289ca 2004-04-06 devnull {"xi", 958},
401 7cf289ca 2004-04-06 devnull {"yacute", 253},
402 7cf289ca 2004-04-06 devnull {"yen", 165},
403 7cf289ca 2004-04-06 devnull {"yuml", 255},
404 7cf289ca 2004-04-06 devnull {"zeta", 950}
405 7cf289ca 2004-04-06 devnull };
406 5b76ae26 2005-09-19 devnull #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
407 7cf289ca 2004-04-06 devnull
/*
 * Characters Winstart..Winend are those that Windows
 * uses interpolated into the Latin1 set.
 * They aren't supposed to appear in HTML, but they do....
 */
enum {
	Winstart = 127,
	Winend = 159
};

/*
 * One replacement code point per character in Winstart..Winend;
 * the leading comment on each row is the input character of its
 * first entry.  8226 is a bullet.
 * NOTE(review): presumably indexed by c - Winstart -- confirm in getchar().
 */
static int winchars[Winend - Winstart + 1] = {
	/* 127 */	8226,
	/* 128 */	8226, 8226, 8218, 402, 8222, 8230, 8224, 8225,
	/* 136 */	710, 8240, 352, 8249, 338, 8226, 8226, 8226,
	/* 144 */	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,
	/* 152 */	732, 8482, 353, 8250, 339, 8226, 8226, 376
};
421 7cf289ca 2004-04-06 devnull
422 cbeb0b26 2006-04-01 devnull static StringInt* tagtable; /* initialized from tagnames */
423 cbeb0b26 2006-04-01 devnull static StringInt* attrtable; /* initialized from attrnames */
424 7cf289ca 2004-04-06 devnull
425 2b604081 2005-05-07 devnull static void lexinit(void);
426 7cf289ca 2004-04-06 devnull static int getplaindata(TokenSource* ts, Token* a, int* pai);
427 7cf289ca 2004-04-06 devnull static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
428 7cf289ca 2004-04-06 devnull static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
429 7cf289ca 2004-04-06 devnull static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
430 7cf289ca 2004-04-06 devnull static Rune* buftostr(Rune* s, Rune* buf, int j);
431 7cf289ca 2004-04-06 devnull static int comment(TokenSource* ts);
432 7cf289ca 2004-04-06 devnull static int findstr(TokenSource* ts, Rune* s);
433 7cf289ca 2004-04-06 devnull static int ampersand(TokenSource* ts);
434 cbeb0b26 2006-04-01 devnull /*static int lowerc(int c); */
435 7cf289ca 2004-04-06 devnull static int getchar(TokenSource* ts);
436 7cf289ca 2004-04-06 devnull static void ungetchar(TokenSource* ts, int c);
437 7cf289ca 2004-04-06 devnull static void backup(TokenSource* ts, int savei);
438 cbeb0b26 2006-04-01 devnull /*static void freeinsidetoken(Token* t); */
439 7cf289ca 2004-04-06 devnull static void freeattrs(Attr* ahead);
440 7cf289ca 2004-04-06 devnull static Attr* newattr(int attid, Rune* value, Attr* link);
441 7cf289ca 2004-04-06 devnull static int Tconv(Fmt* f);
442 7cf289ca 2004-04-06 devnull
443 7cf289ca 2004-04-06 devnull int dbglex = 0;
444 7cf289ca 2004-04-06 devnull static int lexinited = 0;
445 7cf289ca 2004-04-06 devnull
446 7cf289ca 2004-04-06 devnull static void
447 7cf289ca 2004-04-06 devnull lexinit(void)
448 7cf289ca 2004-04-06 devnull {
449 7e19561a 2005-01-04 devnull chartab = _cvtstringinttab(_chartab, nelem(_chartab));
450 7e19561a 2005-01-04 devnull tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
451 7cf289ca 2004-04-06 devnull tagtable = _makestrinttab(tagnames, Numtags);
452 7e19561a 2005-01-04 devnull attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
453 7cf289ca 2004-04-06 devnull attrtable = _makestrinttab(attrnames, Numattrs);
454 7cf289ca 2004-04-06 devnull fmtinstall('T', Tconv);
455 7cf289ca 2004-04-06 devnull lexinited = 1;
456 7cf289ca 2004-04-06 devnull }
457 7cf289ca 2004-04-06 devnull
458 7cf289ca 2004-04-06 devnull static TokenSource*
459 7cf289ca 2004-04-06 devnull newtokensource(uchar* data, int edata, int chset, int mtype)
460 7cf289ca 2004-04-06 devnull {
461 7cf289ca 2004-04-06 devnull TokenSource* ans;
462 7cf289ca 2004-04-06 devnull
463 7cf289ca 2004-04-06 devnull assert(chset == US_Ascii || chset == ISO_8859_1 ||
464 7cf289ca 2004-04-06 devnull chset == UTF_8 || chset == Unicode);
465 7cf289ca 2004-04-06 devnull ans = (TokenSource*)emalloc(sizeof(TokenSource));
466 7cf289ca 2004-04-06 devnull ans->i = 0;
467 7cf289ca 2004-04-06 devnull ans->data = data;
468 7cf289ca 2004-04-06 devnull ans->edata = edata;
469 7cf289ca 2004-04-06 devnull ans->chset = chset;
470 7cf289ca 2004-04-06 devnull ans->mtype = mtype;
471 7cf289ca 2004-04-06 devnull return ans;
472 7cf289ca 2004-04-06 devnull }
473 7cf289ca 2004-04-06 devnull
474 7cf289ca 2004-04-06 devnull enum {
475 7cf289ca 2004-04-06 devnull ToksChunk = 500
476 7cf289ca 2004-04-06 devnull };
477 7cf289ca 2004-04-06 devnull
478 cbeb0b26 2006-04-01 devnull /* Call this to get the tokens. */
479 cbeb0b26 2006-04-01 devnull /* The number of returned tokens is returned in *plen. */
480 7cf289ca 2004-04-06 devnull Token*
481 7cf289ca 2004-04-06 devnull _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
482 7cf289ca 2004-04-06 devnull {
483 7cf289ca 2004-04-06 devnull TokenSource* ts;
484 7cf289ca 2004-04-06 devnull Token* a;
485 7cf289ca 2004-04-06 devnull int alen;
486 7cf289ca 2004-04-06 devnull int ai;
487 7cf289ca 2004-04-06 devnull int starti;
488 7cf289ca 2004-04-06 devnull int c;
489 7cf289ca 2004-04-06 devnull int tag;
490 7cf289ca 2004-04-06 devnull
491 7cf289ca 2004-04-06 devnull if(!lexinited)
492 7cf289ca 2004-04-06 devnull lexinit();
493 7cf289ca 2004-04-06 devnull ts = newtokensource(data, datalen, chset, mtype);
494 7cf289ca 2004-04-06 devnull alen = ToksChunk;
495 7cf289ca 2004-04-06 devnull a = (Token*)emalloc(alen * sizeof(Token));
496 7cf289ca 2004-04-06 devnull ai = 0;
497 7cf289ca 2004-04-06 devnull if(dbglex)
498 7cf289ca 2004-04-06 devnull fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
499 431e32de 2005-09-30 devnull if(ts->mtype == TextHtml){
500 431e32de 2005-09-30 devnull for(;;){
501 431e32de 2005-09-30 devnull if(ai == alen){
502 7cf289ca 2004-04-06 devnull a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
503 7cf289ca 2004-04-06 devnull alen += ToksChunk;
504 7cf289ca 2004-04-06 devnull }
505 7cf289ca 2004-04-06 devnull starti = ts->i;
506 7cf289ca 2004-04-06 devnull c = getchar(ts);
507 7cf289ca 2004-04-06 devnull if(c < 0)
508 7cf289ca 2004-04-06 devnull break;
509 431e32de 2005-09-30 devnull if(c == '<'){
510 7cf289ca 2004-04-06 devnull tag = gettag(ts, starti, a, &ai);
511 431e32de 2005-09-30 devnull if(tag == Tscript){
512 cbeb0b26 2006-04-01 devnull /* special rules for getting Data after.... */
513 7cf289ca 2004-04-06 devnull starti = ts->i;
514 7cf289ca 2004-04-06 devnull c = getchar(ts);
515 7cf289ca 2004-04-06 devnull tag = getscriptdata(ts, c, starti, a, &ai);
516 7cf289ca 2004-04-06 devnull }
517 7cf289ca 2004-04-06 devnull }
518 7cf289ca 2004-04-06 devnull else
519 7cf289ca 2004-04-06 devnull tag = getdata(ts, c, starti, a, &ai);
520 7cf289ca 2004-04-06 devnull if(tag == -1)
521 7cf289ca 2004-04-06 devnull break;
522 7cf289ca 2004-04-06 devnull else if(dbglex > 1 && tag != Comment)
523 7cf289ca 2004-04-06 devnull fprint(2, "lex: got token %T\n", &a[ai-1]);
524 7cf289ca 2004-04-06 devnull }
525 7cf289ca 2004-04-06 devnull }
526 7cf289ca 2004-04-06 devnull else {
527 cbeb0b26 2006-04-01 devnull /* plain text (non-html) tokens */
528 431e32de 2005-09-30 devnull for(;;){
529 431e32de 2005-09-30 devnull if(ai == alen){
530 7cf289ca 2004-04-06 devnull a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
531 7cf289ca 2004-04-06 devnull alen += ToksChunk;
532 7cf289ca 2004-04-06 devnull }
533 7cf289ca 2004-04-06 devnull tag = getplaindata(ts, a, &ai);
534 7cf289ca 2004-04-06 devnull if(tag == -1)
535 7cf289ca 2004-04-06 devnull break;
536 7cf289ca 2004-04-06 devnull if(dbglex > 1)
537 7cf289ca 2004-04-06 devnull fprint(2, "lex: got token %T\n", &a[ai]);
538 7cf289ca 2004-04-06 devnull }
539 7cf289ca 2004-04-06 devnull }
540 7cf289ca 2004-04-06 devnull if(dbglex)
541 7cf289ca 2004-04-06 devnull fprint(2, "lex: returning %d tokens\n", ai);
542 7cf289ca 2004-04-06 devnull *plen = ai;
543 8241eca9 2020-01-08 crossd free(ts);
544 8241eca9 2020-01-08 crossd if(ai == 0) {
545 8241eca9 2020-01-08 crossd free(a);
546 7cf289ca 2004-04-06 devnull return nil;
547 8241eca9 2020-01-08 crossd }
548 7cf289ca 2004-04-06 devnull return a;
549 7cf289ca 2004-04-06 devnull }
550 7cf289ca 2004-04-06 devnull
551 cbeb0b26 2006-04-01 devnull /* For case where source isn't HTML. */
552 cbeb0b26 2006-04-01 devnull /* Just make data tokens, one per line (or partial line, */
553 cbeb0b26 2006-04-01 devnull /* at end of buffer), ignoring non-whitespace control */
554 cbeb0b26 2006-04-01 devnull /* characters and dumping \r's. */
555 cbeb0b26 2006-04-01 devnull /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
556 cbeb0b26 2006-04-01 devnull /* Otherwise return -1; */
557 7cf289ca 2004-04-06 devnull static int
558 7cf289ca 2004-04-06 devnull getplaindata(TokenSource* ts, Token* a, int* pai)
559 7cf289ca 2004-04-06 devnull {
560 7cf289ca 2004-04-06 devnull Rune* s;
561 7cf289ca 2004-04-06 devnull int j;
562 7cf289ca 2004-04-06 devnull int starti;
563 7cf289ca 2004-04-06 devnull int c;
564 7cf289ca 2004-04-06 devnull Token* tok;
565 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
566 7cf289ca 2004-04-06 devnull
567 7cf289ca 2004-04-06 devnull s = nil;
568 7cf289ca 2004-04-06 devnull j = 0;
569 7cf289ca 2004-04-06 devnull starti = ts->i;
570 431e32de 2005-09-30 devnull for(c = getchar(ts); c >= 0; c = getchar(ts)){
571 431e32de 2005-09-30 devnull if(c < ' '){
572 431e32de 2005-09-30 devnull if(isspace(c)){
573 431e32de 2005-09-30 devnull if(c == '\r'){
574 cbeb0b26 2006-04-01 devnull /* ignore it unless no following '\n', */
575 cbeb0b26 2006-04-01 devnull /* in which case treat it like '\n' */
576 7cf289ca 2004-04-06 devnull c = getchar(ts);
577 431e32de 2005-09-30 devnull if(c != '\n'){
578 7cf289ca 2004-04-06 devnull if(c >= 0)
579 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
580 7cf289ca 2004-04-06 devnull c = '\n';
581 7cf289ca 2004-04-06 devnull }
582 7cf289ca 2004-04-06 devnull }
583 7cf289ca 2004-04-06 devnull }
584 7cf289ca 2004-04-06 devnull else
585 7cf289ca 2004-04-06 devnull c = 0;
586 7cf289ca 2004-04-06 devnull }
587 431e32de 2005-09-30 devnull if(c != 0){
588 7cf289ca 2004-04-06 devnull buf[j++] = c;
589 4056d6be 2021-01-29 crossd if(j == BIGBUFSIZE-1){
590 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
591 7cf289ca 2004-04-06 devnull j = 0;
592 7cf289ca 2004-04-06 devnull }
593 7cf289ca 2004-04-06 devnull }
594 7cf289ca 2004-04-06 devnull if(c == '\n')
595 7cf289ca 2004-04-06 devnull break;
596 7cf289ca 2004-04-06 devnull }
597 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
598 7cf289ca 2004-04-06 devnull if(s == nil)
599 7cf289ca 2004-04-06 devnull return -1;
600 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
601 7cf289ca 2004-04-06 devnull tok->tag = Data;
602 7cf289ca 2004-04-06 devnull tok->text = s;
603 7cf289ca 2004-04-06 devnull tok->attr = nil;
604 7cf289ca 2004-04-06 devnull tok->starti = starti;
605 7cf289ca 2004-04-06 devnull return Data;
606 7cf289ca 2004-04-06 devnull }
607 7cf289ca 2004-04-06 devnull
608 cbeb0b26 2006-04-01 devnull /* Return concatenation of s and buf[0:j] */
609 8241eca9 2020-01-08 crossd /* Frees s. */
610 7cf289ca 2004-04-06 devnull static Rune*
611 7cf289ca 2004-04-06 devnull buftostr(Rune* s, Rune* buf, int j)
612 7cf289ca 2004-04-06 devnull {
613 8241eca9 2020-01-08 crossd Rune *tmp;
614 7cf289ca 2004-04-06 devnull buf[j] = 0;
615 7cf289ca 2004-04-06 devnull if(s == nil)
616 8241eca9 2020-01-08 crossd tmp = _Strndup(buf, j);
617 fa325e9b 2020-01-10 cross else
618 8241eca9 2020-01-08 crossd tmp = _Strdup2(s, buf);
619 8241eca9 2020-01-08 crossd free(s);
620 8241eca9 2020-01-08 crossd return tmp;
621 7cf289ca 2004-04-06 devnull }
622 7cf289ca 2004-04-06 devnull
623 cbeb0b26 2006-04-01 devnull /* Gather data up to next start-of-tag or end-of-buffer. */
624 cbeb0b26 2006-04-01 devnull /* Translate entity references (&amp;). */
625 cbeb0b26 2006-04-01 devnull /* Ignore non-whitespace control characters and get rid of \r's. */
626 cbeb0b26 2006-04-01 devnull /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
627 cbeb0b26 2006-04-01 devnull /* Otherwise return -1; */
628 7cf289ca 2004-04-06 devnull static int
629 7cf289ca 2004-04-06 devnull getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
630 7cf289ca 2004-04-06 devnull {
631 7cf289ca 2004-04-06 devnull Rune* s;
632 7cf289ca 2004-04-06 devnull int j;
633 7cf289ca 2004-04-06 devnull int c;
634 7cf289ca 2004-04-06 devnull Token* tok;
635 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
636 7cf289ca 2004-04-06 devnull
637 7cf289ca 2004-04-06 devnull s = nil;
638 7cf289ca 2004-04-06 devnull j = 0;
639 7cf289ca 2004-04-06 devnull c = firstc;
640 431e32de 2005-09-30 devnull while(c >= 0){
641 431e32de 2005-09-30 devnull if(c == '&'){
642 7cf289ca 2004-04-06 devnull c = ampersand(ts);
643 7cf289ca 2004-04-06 devnull if(c < 0)
644 7cf289ca 2004-04-06 devnull break;
645 7cf289ca 2004-04-06 devnull }
646 431e32de 2005-09-30 devnull else if(c < ' '){
647 431e32de 2005-09-30 devnull if(isspace(c)){
648 431e32de 2005-09-30 devnull if(c == '\r'){
649 cbeb0b26 2006-04-01 devnull /* ignore it unless no following '\n', */
650 cbeb0b26 2006-04-01 devnull /* in which case treat it like '\n' */
651 7cf289ca 2004-04-06 devnull c = getchar(ts);
652 431e32de 2005-09-30 devnull if(c != '\n'){
653 7cf289ca 2004-04-06 devnull if(c >= 0)
654 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
655 7cf289ca 2004-04-06 devnull c = '\n';
656 7cf289ca 2004-04-06 devnull }
657 7cf289ca 2004-04-06 devnull }
658 7cf289ca 2004-04-06 devnull }
659 7cf289ca 2004-04-06 devnull else {
660 7cf289ca 2004-04-06 devnull if(warn)
661 7cf289ca 2004-04-06 devnull fprint(2, "warning: non-whitespace control character %d ignored\n", c);
662 7cf289ca 2004-04-06 devnull c = 0;
663 7cf289ca 2004-04-06 devnull }
664 7cf289ca 2004-04-06 devnull }
665 431e32de 2005-09-30 devnull else if(c == '<'){
666 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
667 7cf289ca 2004-04-06 devnull break;
668 7cf289ca 2004-04-06 devnull }
669 431e32de 2005-09-30 devnull if(c != 0){
670 7cf289ca 2004-04-06 devnull buf[j++] = c;
671 431e32de 2005-09-30 devnull if(j == BIGBUFSIZE-1){
672 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
673 7cf289ca 2004-04-06 devnull j = 0;
674 7cf289ca 2004-04-06 devnull }
675 7cf289ca 2004-04-06 devnull }
676 7cf289ca 2004-04-06 devnull c = getchar(ts);
677 7cf289ca 2004-04-06 devnull }
678 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
679 7cf289ca 2004-04-06 devnull if(s == nil)
680 7cf289ca 2004-04-06 devnull return -1;
681 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
682 7cf289ca 2004-04-06 devnull tok->tag = Data;
683 7cf289ca 2004-04-06 devnull tok->text = s;
684 7cf289ca 2004-04-06 devnull tok->attr = nil;
685 7cf289ca 2004-04-06 devnull tok->starti = starti;
686 7cf289ca 2004-04-06 devnull return Data;
687 7cf289ca 2004-04-06 devnull }
688 7cf289ca 2004-04-06 devnull
689 cbeb0b26 2006-04-01 devnull /* The rules for lexing scripts are different (ugh). */
690 cbeb0b26 2006-04-01 devnull /* Gather up everything until see a </SCRIPT>. */
691 7cf289ca 2004-04-06 devnull static int
692 7cf289ca 2004-04-06 devnull getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
693 7cf289ca 2004-04-06 devnull {
694 7cf289ca 2004-04-06 devnull Rune* s;
695 7cf289ca 2004-04-06 devnull int j;
696 7cf289ca 2004-04-06 devnull int tstarti;
697 7cf289ca 2004-04-06 devnull int savei;
698 7cf289ca 2004-04-06 devnull int c;
699 7cf289ca 2004-04-06 devnull int tag;
700 7cf289ca 2004-04-06 devnull int done;
701 7cf289ca 2004-04-06 devnull Token* tok;
702 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
703 7cf289ca 2004-04-06 devnull
704 7cf289ca 2004-04-06 devnull s = nil;
705 7cf289ca 2004-04-06 devnull j = 0;
706 7cf289ca 2004-04-06 devnull tstarti = starti;
707 7cf289ca 2004-04-06 devnull c = firstc;
708 7cf289ca 2004-04-06 devnull done = 0;
709 431e32de 2005-09-30 devnull while(c >= 0){
710 431e32de 2005-09-30 devnull if(c == '<'){
711 cbeb0b26 2006-04-01 devnull /* other browsers ignore stuff to end of line after <! */
712 7cf289ca 2004-04-06 devnull savei = ts->i;
713 7cf289ca 2004-04-06 devnull c = getchar(ts);
714 431e32de 2005-09-30 devnull if(c == '!'){
715 7cf289ca 2004-04-06 devnull while(c >= 0 && c != '\n' && c != '\r')
716 7cf289ca 2004-04-06 devnull c = getchar(ts);
717 7cf289ca 2004-04-06 devnull if(c == '\r')
718 7cf289ca 2004-04-06 devnull c = getchar(ts);
719 7cf289ca 2004-04-06 devnull if(c == '\n')
720 7cf289ca 2004-04-06 devnull c = getchar(ts);
721 7cf289ca 2004-04-06 devnull }
722 431e32de 2005-09-30 devnull else if(c >= 0){
723 7cf289ca 2004-04-06 devnull backup(ts, savei);
724 7cf289ca 2004-04-06 devnull tag = gettag(ts, tstarti, a, pai);
725 7cf289ca 2004-04-06 devnull if(tag == -1)
726 7cf289ca 2004-04-06 devnull break;
727 7cf289ca 2004-04-06 devnull if(tag != Comment)
728 7cf289ca 2004-04-06 devnull (*pai)--;
729 7cf289ca 2004-04-06 devnull backup(ts, tstarti);
730 431e32de 2005-09-30 devnull if(tag == Tscript + RBRA){
731 7cf289ca 2004-04-06 devnull done = 1;
732 7cf289ca 2004-04-06 devnull break;
733 7cf289ca 2004-04-06 devnull }
734 cbeb0b26 2006-04-01 devnull /* here tag was not </SCRIPT>, so take as regular data */
735 7cf289ca 2004-04-06 devnull c = getchar(ts);
736 7cf289ca 2004-04-06 devnull }
737 7cf289ca 2004-04-06 devnull }
738 7cf289ca 2004-04-06 devnull if(c < 0)
739 7cf289ca 2004-04-06 devnull break;
740 431e32de 2005-09-30 devnull if(c != 0){
741 7cf289ca 2004-04-06 devnull buf[j++] = c;
742 431e32de 2005-09-30 devnull if(j == BIGBUFSIZE-1){
743 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
744 7cf289ca 2004-04-06 devnull j = 0;
745 7cf289ca 2004-04-06 devnull }
746 7cf289ca 2004-04-06 devnull }
747 7cf289ca 2004-04-06 devnull tstarti = ts->i;
748 7cf289ca 2004-04-06 devnull c = getchar(ts);
749 7cf289ca 2004-04-06 devnull }
750 431e32de 2005-09-30 devnull if(done || ts->i == ts->edata){
751 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
752 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
753 7cf289ca 2004-04-06 devnull tok->tag = Data;
754 7cf289ca 2004-04-06 devnull tok->text = s;
755 7cf289ca 2004-04-06 devnull tok->attr = nil;
756 7cf289ca 2004-04-06 devnull tok->starti = starti;
757 7cf289ca 2004-04-06 devnull return Data;
758 7cf289ca 2004-04-06 devnull }
759 7cf289ca 2004-04-06 devnull backup(ts, starti);
760 7cf289ca 2004-04-06 devnull return -1;
761 7cf289ca 2004-04-06 devnull }
762 7cf289ca 2004-04-06 devnull
763 cbeb0b26 2006-04-01 devnull /* We've just seen a '<'. Gather up stuff to closing '>' (if buffer */
764 cbeb0b26 2006-04-01 devnull /* ends before then, return -1). */
765 cbeb0b26 2006-04-01 devnull /* If it's a tag, look up the name, gather the attributes, and return */
766 cbeb0b26 2006-04-01 devnull /* the appropriate token. */
767 cbeb0b26 2006-04-01 devnull /* Else it's either just plain data or some kind of ignorable stuff: */
768 cbeb0b26 2006-04-01 devnull /* return Data or Comment as appropriate. */
769 cbeb0b26 2006-04-01 devnull /* If it's not a Comment, put it in a[*pai] and bump *pai. */
770 7cf289ca 2004-04-06 devnull static int
771 7cf289ca 2004-04-06 devnull gettag(TokenSource* ts, int starti, Token* a, int* pai)
772 7cf289ca 2004-04-06 devnull {
773 7cf289ca 2004-04-06 devnull int rbra;
774 7cf289ca 2004-04-06 devnull int ans;
775 7cf289ca 2004-04-06 devnull Attr* al;
776 7cf289ca 2004-04-06 devnull int nexti;
777 7cf289ca 2004-04-06 devnull int c;
778 7cf289ca 2004-04-06 devnull int ti;
779 7cf289ca 2004-04-06 devnull int afnd;
780 7cf289ca 2004-04-06 devnull int attid;
781 7cf289ca 2004-04-06 devnull int quote;
782 7cf289ca 2004-04-06 devnull Rune* val;
783 7cf289ca 2004-04-06 devnull int nv;
784 7cf289ca 2004-04-06 devnull int i;
785 7cf289ca 2004-04-06 devnull int tag;
786 7cf289ca 2004-04-06 devnull Token* tok;
787 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
788 7cf289ca 2004-04-06 devnull
789 7cf289ca 2004-04-06 devnull rbra = 0;
790 7cf289ca 2004-04-06 devnull nexti = ts->i;
791 7cf289ca 2004-04-06 devnull tok = &a[*pai];
792 7cf289ca 2004-04-06 devnull tok->tag = Notfound;
793 7cf289ca 2004-04-06 devnull tok->text = nil;
794 7cf289ca 2004-04-06 devnull tok->attr = nil;
795 7cf289ca 2004-04-06 devnull tok->starti = starti;
796 7cf289ca 2004-04-06 devnull c = getchar(ts);
797 431e32de 2005-09-30 devnull if(c == '/'){
798 7cf289ca 2004-04-06 devnull rbra = RBRA;
799 7cf289ca 2004-04-06 devnull c = getchar(ts);
800 7cf289ca 2004-04-06 devnull }
801 7cf289ca 2004-04-06 devnull if(c < 0)
802 7cf289ca 2004-04-06 devnull goto eob_done;
803 431e32de 2005-09-30 devnull if(c >= 256 || !isalpha(c)){
804 cbeb0b26 2006-04-01 devnull /* not a tag */
805 431e32de 2005-09-30 devnull if(c == '!'){
806 7cf289ca 2004-04-06 devnull ans = comment(ts);
807 7cf289ca 2004-04-06 devnull if(ans != -1)
808 7cf289ca 2004-04-06 devnull return ans;
809 7cf289ca 2004-04-06 devnull goto eob_done;
810 7cf289ca 2004-04-06 devnull }
811 7cf289ca 2004-04-06 devnull else {
812 7cf289ca 2004-04-06 devnull backup(ts, nexti);
813 7cf289ca 2004-04-06 devnull tok->tag = Data;
814 7cf289ca 2004-04-06 devnull tok->text = _Strdup(L(Llt));
815 7cf289ca 2004-04-06 devnull (*pai)++;
816 7cf289ca 2004-04-06 devnull return Data;
817 7cf289ca 2004-04-06 devnull }
818 7cf289ca 2004-04-06 devnull }
819 cbeb0b26 2006-04-01 devnull /* c starts a tagname */
820 7cf289ca 2004-04-06 devnull buf[0] = c;
821 7cf289ca 2004-04-06 devnull i = 1;
822 431e32de 2005-09-30 devnull for(;;){
823 7cf289ca 2004-04-06 devnull c = getchar(ts);
824 7cf289ca 2004-04-06 devnull if(c < 0)
825 7cf289ca 2004-04-06 devnull goto eob_done;
826 7cf289ca 2004-04-06 devnull if(!ISNAMCHAR(c))
827 7cf289ca 2004-04-06 devnull break;
828 cbeb0b26 2006-04-01 devnull /* if name is bigger than buf it won't be found anyway... */
829 7cf289ca 2004-04-06 devnull if(i < BIGBUFSIZE)
830 7cf289ca 2004-04-06 devnull buf[i++] = c;
831 7cf289ca 2004-04-06 devnull }
832 7cf289ca 2004-04-06 devnull if(_lookup(tagtable, Numtags, buf, i, &tag))
833 7cf289ca 2004-04-06 devnull tok->tag = tag + rbra;
834 7cf289ca 2004-04-06 devnull else
835 cbeb0b26 2006-04-01 devnull tok->text = _Strndup(buf, i); /* for warning print, in build */
836 7cf289ca 2004-04-06 devnull
837 cbeb0b26 2006-04-01 devnull /* attribute gathering loop */
838 7cf289ca 2004-04-06 devnull al = nil;
839 431e32de 2005-09-30 devnull for(;;){
840 cbeb0b26 2006-04-01 devnull /* look for "ws name" or "ws name ws = ws val" (ws=whitespace) */
841 cbeb0b26 2006-04-01 devnull /* skip whitespace */
842 7cf289ca 2004-04-06 devnull attrloop_continue:
843 431e32de 2005-09-30 devnull while(c < 256 && isspace(c)){
844 7cf289ca 2004-04-06 devnull c = getchar(ts);
845 7cf289ca 2004-04-06 devnull if(c < 0)
846 7cf289ca 2004-04-06 devnull goto eob_done;
847 7cf289ca 2004-04-06 devnull }
848 7cf289ca 2004-04-06 devnull if(c == '>')
849 7cf289ca 2004-04-06 devnull goto attrloop_done;
850 431e32de 2005-09-30 devnull if(c == '<'){
851 7cf289ca 2004-04-06 devnull if(warn)
852 7cf289ca 2004-04-06 devnull fprint(2, "warning: unclosed tag\n");
853 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
854 7cf289ca 2004-04-06 devnull goto attrloop_done;
855 7cf289ca 2004-04-06 devnull }
856 431e32de 2005-09-30 devnull if(c >= 256 || !isalpha(c)){
857 7cf289ca 2004-04-06 devnull if(warn)
858 7cf289ca 2004-04-06 devnull fprint(2, "warning: expected attribute name\n");
859 cbeb0b26 2006-04-01 devnull /* skipt to next attribute name */
860 431e32de 2005-09-30 devnull for(;;){
861 7cf289ca 2004-04-06 devnull c = getchar(ts);
862 7cf289ca 2004-04-06 devnull if(c < 0)
863 7cf289ca 2004-04-06 devnull goto eob_done;
864 7cf289ca 2004-04-06 devnull if(c < 256 && isalpha(c))
865 7cf289ca 2004-04-06 devnull goto attrloop_continue;
866 431e32de 2005-09-30 devnull if(c == '<'){
867 7cf289ca 2004-04-06 devnull if(warn)
868 7cf289ca 2004-04-06 devnull fprint(2, "warning: unclosed tag\n");
869 7cf289ca 2004-04-06 devnull ungetchar(ts, 60);
870 7cf289ca 2004-04-06 devnull goto attrloop_done;
871 7cf289ca 2004-04-06 devnull }
872 7cf289ca 2004-04-06 devnull if(c == '>')
873 7cf289ca 2004-04-06 devnull goto attrloop_done;
874 7cf289ca 2004-04-06 devnull }
875 7cf289ca 2004-04-06 devnull }
876 cbeb0b26 2006-04-01 devnull /* gather attribute name */
877 7cf289ca 2004-04-06 devnull buf[0] = c;
878 7cf289ca 2004-04-06 devnull i = 1;
879 431e32de 2005-09-30 devnull for(;;){
880 7cf289ca 2004-04-06 devnull c = getchar(ts);
881 7cf289ca 2004-04-06 devnull if(c < 0)
882 7cf289ca 2004-04-06 devnull goto eob_done;
883 7cf289ca 2004-04-06 devnull if(!ISNAMCHAR(c))
884 7cf289ca 2004-04-06 devnull break;
885 7cf289ca 2004-04-06 devnull if(i < BIGBUFSIZE-1)
886 7cf289ca 2004-04-06 devnull buf[i++] = c;
887 7cf289ca 2004-04-06 devnull }
888 7cf289ca 2004-04-06 devnull afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
889 431e32de 2005-09-30 devnull if(warn && !afnd){
890 7cf289ca 2004-04-06 devnull buf[i] = 0;
891 7cf289ca 2004-04-06 devnull fprint(2, "warning: unknown attribute name %S\n", buf);
892 7cf289ca 2004-04-06 devnull }
893 cbeb0b26 2006-04-01 devnull /* skip whitespace */
894 431e32de 2005-09-30 devnull while(c < 256 && isspace(c)){
895 7cf289ca 2004-04-06 devnull c = getchar(ts);
896 7cf289ca 2004-04-06 devnull if(c < 0)
897 7cf289ca 2004-04-06 devnull goto eob_done;
898 7cf289ca 2004-04-06 devnull }
899 431e32de 2005-09-30 devnull if(c != '='){
900 7cf289ca 2004-04-06 devnull if(afnd)
901 7cf289ca 2004-04-06 devnull al = newattr(attid, nil, al);
902 7cf289ca 2004-04-06 devnull goto attrloop_continue;
903 7cf289ca 2004-04-06 devnull }
904 cbeb0b26 2006-04-01 devnull /*# c is '=' here; skip whitespace */
905 431e32de 2005-09-30 devnull for(;;){
906 7cf289ca 2004-04-06 devnull c = getchar(ts);
907 7cf289ca 2004-04-06 devnull if(c < 0)
908 7cf289ca 2004-04-06 devnull goto eob_done;
909 7cf289ca 2004-04-06 devnull if(c >= 256 || !isspace(c))
910 7cf289ca 2004-04-06 devnull break;
911 7cf289ca 2004-04-06 devnull }
912 7cf289ca 2004-04-06 devnull quote = 0;
913 431e32de 2005-09-30 devnull if(c == '\'' || c == '"'){
914 7cf289ca 2004-04-06 devnull quote = c;
915 7cf289ca 2004-04-06 devnull c = getchar(ts);
916 7cf289ca 2004-04-06 devnull if(c < 0)
917 7cf289ca 2004-04-06 devnull goto eob_done;
918 7cf289ca 2004-04-06 devnull }
919 7cf289ca 2004-04-06 devnull val = nil;
920 7cf289ca 2004-04-06 devnull nv = 0;
921 431e32de 2005-09-30 devnull for(;;){
922 7cf289ca 2004-04-06 devnull valloop_continue:
923 7cf289ca 2004-04-06 devnull if(c < 0)
924 7cf289ca 2004-04-06 devnull goto eob_done;
925 431e32de 2005-09-30 devnull if(c == '>'){
926 431e32de 2005-09-30 devnull if(quote){
927 cbeb0b26 2006-04-01 devnull /* c might be part of string (though not good style) */
928 cbeb0b26 2006-04-01 devnull /* but if line ends before close quote, assume */
929 cbeb0b26 2006-04-01 devnull /* there was an unmatched quote */
930 7cf289ca 2004-04-06 devnull ti = ts->i;
931 431e32de 2005-09-30 devnull for(;;){
932 7cf289ca 2004-04-06 devnull c = getchar(ts);
933 7cf289ca 2004-04-06 devnull if(c < 0)
934 7cf289ca 2004-04-06 devnull goto eob_done;
935 431e32de 2005-09-30 devnull if(c == quote){
936 7cf289ca 2004-04-06 devnull backup(ts, ti);
937 7cf289ca 2004-04-06 devnull buf[nv++] = '>';
938 431e32de 2005-09-30 devnull if(nv == BIGBUFSIZE-1){
939 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
940 7cf289ca 2004-04-06 devnull nv = 0;
941 7cf289ca 2004-04-06 devnull }
942 7cf289ca 2004-04-06 devnull c = getchar(ts);
943 7cf289ca 2004-04-06 devnull goto valloop_continue;
944 7cf289ca 2004-04-06 devnull }
945 431e32de 2005-09-30 devnull if(c == '\n'){
946 7cf289ca 2004-04-06 devnull if(warn)
947 7cf289ca 2004-04-06 devnull fprint(2, "warning: apparent unmatched quote\n");
948 7cf289ca 2004-04-06 devnull backup(ts, ti);
949 7cf289ca 2004-04-06 devnull c = '>';
950 7cf289ca 2004-04-06 devnull goto valloop_done;
951 7cf289ca 2004-04-06 devnull }
952 7cf289ca 2004-04-06 devnull }
953 7cf289ca 2004-04-06 devnull }
954 7cf289ca 2004-04-06 devnull else
955 7cf289ca 2004-04-06 devnull goto valloop_done;
956 7cf289ca 2004-04-06 devnull }
957 431e32de 2005-09-30 devnull if(quote){
958 431e32de 2005-09-30 devnull if(c == quote){
959 7cf289ca 2004-04-06 devnull c = getchar(ts);
960 7cf289ca 2004-04-06 devnull if(c < 0)
961 7cf289ca 2004-04-06 devnull goto eob_done;
962 7cf289ca 2004-04-06 devnull goto valloop_done;
963 7cf289ca 2004-04-06 devnull }
964 431e32de 2005-09-30 devnull if(c == '\r'){
965 7cf289ca 2004-04-06 devnull c = getchar(ts);
966 7cf289ca 2004-04-06 devnull goto valloop_continue;
967 7cf289ca 2004-04-06 devnull }
968 7cf289ca 2004-04-06 devnull if(c == '\t' || c == '\n')
969 7cf289ca 2004-04-06 devnull c = ' ';
970 7cf289ca 2004-04-06 devnull }
971 7cf289ca 2004-04-06 devnull else {
972 7cf289ca 2004-04-06 devnull if(c < 256 && isspace(c))
973 7cf289ca 2004-04-06 devnull goto valloop_done;
974 7cf289ca 2004-04-06 devnull }
975 431e32de 2005-09-30 devnull if(c == '&'){
976 7cf289ca 2004-04-06 devnull c = ampersand(ts);
977 7cf289ca 2004-04-06 devnull if(c == -1)
978 7cf289ca 2004-04-06 devnull goto eob_done;
979 7cf289ca 2004-04-06 devnull }
980 7cf289ca 2004-04-06 devnull buf[nv++] = c;
981 431e32de 2005-09-30 devnull if(nv == BIGBUFSIZE-1){
982 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
983 7cf289ca 2004-04-06 devnull nv = 0;
984 7cf289ca 2004-04-06 devnull }
985 7cf289ca 2004-04-06 devnull c = getchar(ts);
986 7cf289ca 2004-04-06 devnull }
987 7cf289ca 2004-04-06 devnull valloop_done:
988 431e32de 2005-09-30 devnull if(afnd){
989 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
990 7cf289ca 2004-04-06 devnull al = newattr(attid, val, al);
991 7cf289ca 2004-04-06 devnull }
992 7cf289ca 2004-04-06 devnull }
993 7cf289ca 2004-04-06 devnull
994 7cf289ca 2004-04-06 devnull attrloop_done:
995 7cf289ca 2004-04-06 devnull tok->attr = al;
996 7cf289ca 2004-04-06 devnull (*pai)++;
997 7cf289ca 2004-04-06 devnull return tok->tag;
998 7cf289ca 2004-04-06 devnull
999 7cf289ca 2004-04-06 devnull eob_done:
1000 7cf289ca 2004-04-06 devnull if(warn)
1001 7cf289ca 2004-04-06 devnull fprint(2, "warning: incomplete tag at end of page\n");
1002 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1003 7cf289ca 2004-04-06 devnull tok->tag = Data;
1004 7cf289ca 2004-04-06 devnull tok->text = _Strdup(L(Llt));
1005 7cf289ca 2004-04-06 devnull return Data;
1006 7cf289ca 2004-04-06 devnull }
1007 7cf289ca 2004-04-06 devnull
1008 cbeb0b26 2006-04-01 devnull /* We've just read a '<!' at position starti, */
1009 cbeb0b26 2006-04-01 devnull /* so this may be a comment or other ignored section, or it may */
1010 cbeb0b26 2006-04-01 devnull /* be just a literal string if there is no close before end of file */
1011 cbeb0b26 2006-04-01 devnull /* (other browsers do that). */
1012 cbeb0b26 2006-04-01 devnull /* The accepted practice seems to be (note: contrary to SGML spec!): */
1013 cbeb0b26 2006-04-01 devnull /* If see <!--, look for --> to close, or if none, > to close. */
1014 cbeb0b26 2006-04-01 devnull /* If see <!(not --), look for > to close. */
1015 cbeb0b26 2006-04-01 devnull /* If no close before end of file, leave original characters in as literal data. */
1016 cbeb0b26 2006-04-01 devnull /* */
1017 cbeb0b26 2006-04-01 devnull /* If we see ignorable stuff, return Comment. */
1018 cbeb0b26 2006-04-01 devnull /* Else return nil (caller should back up and try again when more data arrives, */
1019 cbeb0b26 2006-04-01 devnull /* unless at end of file, in which case caller should just make '<' a data token). */
1020 7cf289ca 2004-04-06 devnull static int
1021 7cf289ca 2004-04-06 devnull comment(TokenSource* ts)
1022 7cf289ca 2004-04-06 devnull {
1023 7cf289ca 2004-04-06 devnull int nexti;
1024 7cf289ca 2004-04-06 devnull int havecomment;
1025 7cf289ca 2004-04-06 devnull int c;
1026 7cf289ca 2004-04-06 devnull
1027 7cf289ca 2004-04-06 devnull nexti = ts->i;
1028 7cf289ca 2004-04-06 devnull havecomment = 0;
1029 7cf289ca 2004-04-06 devnull c = getchar(ts);
1030 431e32de 2005-09-30 devnull if(c == '-'){
1031 7cf289ca 2004-04-06 devnull c = getchar(ts);
1032 431e32de 2005-09-30 devnull if(c == '-'){
1033 7cf289ca 2004-04-06 devnull if(findstr(ts, L(Larrow)))
1034 7cf289ca 2004-04-06 devnull havecomment = 1;
1035 7cf289ca 2004-04-06 devnull else
1036 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1037 7cf289ca 2004-04-06 devnull }
1038 7cf289ca 2004-04-06 devnull }
1039 431e32de 2005-09-30 devnull if(!havecomment){
1040 7cf289ca 2004-04-06 devnull if(c == '>')
1041 7cf289ca 2004-04-06 devnull havecomment = 1;
1042 431e32de 2005-09-30 devnull else if(c >= 0){
1043 7cf289ca 2004-04-06 devnull if(findstr(ts, L(Lgt)))
1044 7cf289ca 2004-04-06 devnull havecomment = 1;
1045 7cf289ca 2004-04-06 devnull }
1046 7cf289ca 2004-04-06 devnull }
1047 7cf289ca 2004-04-06 devnull if(havecomment)
1048 7cf289ca 2004-04-06 devnull return Comment;
1049 7cf289ca 2004-04-06 devnull return -1;
1050 7cf289ca 2004-04-06 devnull }
1051 7cf289ca 2004-04-06 devnull
1052 cbeb0b26 2006-04-01 devnull /* Look for string s in token source. */
1053 cbeb0b26 2006-04-01 devnull /* If found, return 1, with buffer at next char after s, */
1054 cbeb0b26 2006-04-01 devnull /* else return 0 (caller should back up). */
1055 7cf289ca 2004-04-06 devnull static int
1056 7cf289ca 2004-04-06 devnull findstr(TokenSource* ts, Rune* s)
1057 7cf289ca 2004-04-06 devnull {
1058 7cf289ca 2004-04-06 devnull int c0;
1059 7cf289ca 2004-04-06 devnull int n;
1060 7cf289ca 2004-04-06 devnull int nexti;
1061 7cf289ca 2004-04-06 devnull int i;
1062 7cf289ca 2004-04-06 devnull int c;
1063 7cf289ca 2004-04-06 devnull
1064 7cf289ca 2004-04-06 devnull c0 = s[0];
1065 7cf289ca 2004-04-06 devnull n = runestrlen(s);
1066 431e32de 2005-09-30 devnull for(;;){
1067 7cf289ca 2004-04-06 devnull c = getchar(ts);
1068 7cf289ca 2004-04-06 devnull if(c < 0)
1069 7cf289ca 2004-04-06 devnull break;
1070 431e32de 2005-09-30 devnull if(c == c0){
1071 7cf289ca 2004-04-06 devnull if(n == 1)
1072 7cf289ca 2004-04-06 devnull return 1;
1073 7cf289ca 2004-04-06 devnull nexti = ts->i;
1074 431e32de 2005-09-30 devnull for(i = 1; i < n; i++){
1075 7cf289ca 2004-04-06 devnull c = getchar(ts);
1076 7cf289ca 2004-04-06 devnull if(c < 0)
1077 7cf289ca 2004-04-06 devnull goto mainloop_done;
1078 7cf289ca 2004-04-06 devnull if(c != s[i])
1079 7cf289ca 2004-04-06 devnull break;
1080 7cf289ca 2004-04-06 devnull }
1081 7cf289ca 2004-04-06 devnull if(i == n)
1082 7cf289ca 2004-04-06 devnull return 1;
1083 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1084 7cf289ca 2004-04-06 devnull }
1085 7cf289ca 2004-04-06 devnull }
1086 7cf289ca 2004-04-06 devnull mainloop_done:
1087 7cf289ca 2004-04-06 devnull return 0;
1088 7cf289ca 2004-04-06 devnull }
1089 7cf289ca 2004-04-06 devnull
/* Return the value (0-15) of hexadecimal digit c, in either case, */
/* or -1 if c is not a hexadecimal digit. */
static int
xdigit(int c)
{
	int v;

	v = -1;
	if(c >= '0' && c <= '9')
		v = c - '0';
	else if(c >= 'a' && c <= 'f')
		v = 10 + (c - 'a');
	else if(c >= 'A' && c <= 'F')
		v = 10 + (c - 'A');
	return v;
}
1101 431e32de 2005-09-30 devnull
1102 cbeb0b26 2006-04-01 devnull /* We've just read an '&'; look for an entity reference */
1103 cbeb0b26 2006-04-01 devnull /* name, and if found, return translated char. */
1104 cbeb0b26 2006-04-01 devnull /* if there is a complete entity name but it isn't known, */
1105 cbeb0b26 2006-04-01 devnull /* try prefixes (gets around some buggy HTML out there), */
1106 cbeb0b26 2006-04-01 devnull /* and if that fails, back up to just past the '&' and return '&'. */
1107 cbeb0b26 2006-04-01 devnull /* If the entity can't be completed in the current buffer, back up */
1108 cbeb0b26 2006-04-01 devnull /* to the '&' and return -1. */
1109 7cf289ca 2004-04-06 devnull static int
1110 7cf289ca 2004-04-06 devnull ampersand(TokenSource* ts)
1111 7cf289ca 2004-04-06 devnull {
1112 7cf289ca 2004-04-06 devnull int savei;
1113 7cf289ca 2004-04-06 devnull int c;
1114 7cf289ca 2004-04-06 devnull int fnd;
1115 7cf289ca 2004-04-06 devnull int ans;
1116 7cf289ca 2004-04-06 devnull int v;
1117 7cf289ca 2004-04-06 devnull int i;
1118 7cf289ca 2004-04-06 devnull int k;
1119 7cf289ca 2004-04-06 devnull Rune buf[SMALLBUFSIZE];
1120 7cf289ca 2004-04-06 devnull
1121 7cf289ca 2004-04-06 devnull savei = ts->i;
1122 7cf289ca 2004-04-06 devnull c = getchar(ts);
1123 7cf289ca 2004-04-06 devnull fnd = 0;
1124 7cf289ca 2004-04-06 devnull ans = -1;
1125 431e32de 2005-09-30 devnull if(c == '#'){
1126 7cf289ca 2004-04-06 devnull c = getchar(ts);
1127 7cf289ca 2004-04-06 devnull v = 0;
1128 431e32de 2005-09-30 devnull if(c == 'x'){
1129 7cf289ca 2004-04-06 devnull c = getchar(ts);
1130 431e32de 2005-09-30 devnull while((i=xdigit(c)) != -1){
1131 431e32de 2005-09-30 devnull v = v*16 + i;
1132 431e32de 2005-09-30 devnull c = getchar(ts);
1133 431e32de 2005-09-30 devnull }
1134 431e32de 2005-09-30 devnull }else{
1135 431e32de 2005-09-30 devnull while('0' <= c && c <= '9'){
1136 431e32de 2005-09-30 devnull v = v*10 + c - '0';
1137 431e32de 2005-09-30 devnull c = getchar(ts);
1138 431e32de 2005-09-30 devnull }
1139 7cf289ca 2004-04-06 devnull }
1140 431e32de 2005-09-30 devnull if(c >= 0){
1141 7cf289ca 2004-04-06 devnull if(!(c == ';' || c == '\n' || c == '\r'))
1142 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1143 7cf289ca 2004-04-06 devnull c = v;
1144 7cf289ca 2004-04-06 devnull if(c == 160)
1145 7cf289ca 2004-04-06 devnull c = 160;
1146 431e32de 2005-09-30 devnull if(c >= Winstart && c <= Winend){
1147 7cf289ca 2004-04-06 devnull c = winchars[c - Winstart];
1148 7cf289ca 2004-04-06 devnull }
1149 7cf289ca 2004-04-06 devnull ans = c;
1150 7cf289ca 2004-04-06 devnull fnd = 1;
1151 7cf289ca 2004-04-06 devnull }
1152 7cf289ca 2004-04-06 devnull }
1153 431e32de 2005-09-30 devnull else if(c < 256 && isalpha(c)){
1154 7cf289ca 2004-04-06 devnull buf[0] = c;
1155 7cf289ca 2004-04-06 devnull k = 1;
1156 431e32de 2005-09-30 devnull for(;;){
1157 7cf289ca 2004-04-06 devnull c = getchar(ts);
1158 7cf289ca 2004-04-06 devnull if(c < 0)
1159 7cf289ca 2004-04-06 devnull break;
1160 431e32de 2005-09-30 devnull if(ISNAMCHAR(c)){
1161 7cf289ca 2004-04-06 devnull if(k < SMALLBUFSIZE-1)
1162 7cf289ca 2004-04-06 devnull buf[k++] = c;
1163 7cf289ca 2004-04-06 devnull }
1164 7cf289ca 2004-04-06 devnull else {
1165 7cf289ca 2004-04-06 devnull if(!(c == ';' || c == '\n' || c == '\r'))
1166 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1167 7cf289ca 2004-04-06 devnull break;
1168 7cf289ca 2004-04-06 devnull }
1169 7cf289ca 2004-04-06 devnull }
1170 431e32de 2005-09-30 devnull if(c >= 0){
1171 7cf289ca 2004-04-06 devnull fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1172 431e32de 2005-09-30 devnull if(!fnd){
1173 cbeb0b26 2006-04-01 devnull /* Try prefixes of s */
1174 7cf289ca 2004-04-06 devnull if(c == ';' || c == '\n' || c == '\r')
1175 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1176 7cf289ca 2004-04-06 devnull i = k;
1177 431e32de 2005-09-30 devnull while(--k > 0){
1178 7cf289ca 2004-04-06 devnull fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1179 431e32de 2005-09-30 devnull if(fnd){
1180 431e32de 2005-09-30 devnull while(i > k){
1181 7cf289ca 2004-04-06 devnull i--;
1182 7cf289ca 2004-04-06 devnull ungetchar(ts, buf[i]);
1183 7cf289ca 2004-04-06 devnull }
1184 7cf289ca 2004-04-06 devnull break;
1185 7cf289ca 2004-04-06 devnull }
1186 7cf289ca 2004-04-06 devnull }
1187 7cf289ca 2004-04-06 devnull }
1188 7cf289ca 2004-04-06 devnull }
1189 7cf289ca 2004-04-06 devnull }
1190 431e32de 2005-09-30 devnull if(!fnd){
1191 7cf289ca 2004-04-06 devnull backup(ts, savei);
1192 7cf289ca 2004-04-06 devnull ans = '&';
1193 7cf289ca 2004-04-06 devnull }
1194 7cf289ca 2004-04-06 devnull return ans;
1195 7cf289ca 2004-04-06 devnull }
1196 7cf289ca 2004-04-06 devnull
1197 cbeb0b26 2006-04-01 devnull /* Get next char, obeying ts.chset. */
1198 cbeb0b26 2006-04-01 devnull /* Returns -1 if no complete character left before current end of data. */
1199 7cf289ca 2004-04-06 devnull static int
1200 7cf289ca 2004-04-06 devnull getchar(TokenSource* ts)
1201 7cf289ca 2004-04-06 devnull {
1202 7cf289ca 2004-04-06 devnull uchar* buf;
1203 7cf289ca 2004-04-06 devnull int c;
1204 7cf289ca 2004-04-06 devnull int n;
1205 7cf289ca 2004-04-06 devnull int ok;
1206 7cf289ca 2004-04-06 devnull Rune r;
1207 7cf289ca 2004-04-06 devnull
1208 7cf289ca 2004-04-06 devnull if(ts->i >= ts->edata)
1209 7cf289ca 2004-04-06 devnull return -1;
1210 7cf289ca 2004-04-06 devnull buf = ts->data;
1211 7cf289ca 2004-04-06 devnull c = buf[ts->i];
1212 431e32de 2005-09-30 devnull switch(ts->chset){
1213 7cf289ca 2004-04-06 devnull case ISO_8859_1:
1214 7cf289ca 2004-04-06 devnull if(c >= Winstart && c <= Winend)
1215 7cf289ca 2004-04-06 devnull c = winchars[c - Winstart];
1216 7cf289ca 2004-04-06 devnull ts->i++;
1217 7cf289ca 2004-04-06 devnull break;
1218 7cf289ca 2004-04-06 devnull case US_Ascii:
1219 431e32de 2005-09-30 devnull if(c > 127){
1220 7cf289ca 2004-04-06 devnull if(warn)
1221 7cf289ca 2004-04-06 devnull fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1222 7cf289ca 2004-04-06 devnull }
1223 7cf289ca 2004-04-06 devnull ts->i++;
1224 7cf289ca 2004-04-06 devnull break;
1225 7cf289ca 2004-04-06 devnull case UTF_8:
1226 7cf289ca 2004-04-06 devnull ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1227 7cf289ca 2004-04-06 devnull n = chartorune(&r, (char*)(buf+ts->i));
1228 431e32de 2005-09-30 devnull if(ok){
1229 7cf289ca 2004-04-06 devnull if(warn && c == 0x80)
1230 7cf289ca 2004-04-06 devnull fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1231 7cf289ca 2004-04-06 devnull ts->i += n;
1232 7cf289ca 2004-04-06 devnull c = r;
1233 7cf289ca 2004-04-06 devnull }
1234 7cf289ca 2004-04-06 devnull else {
1235 cbeb0b26 2006-04-01 devnull /* not enough bytes in buf to complete utf-8 char */
1236 cbeb0b26 2006-04-01 devnull ts->i = ts->edata; /* mark "all used" */
1237 7cf289ca 2004-04-06 devnull c = -1;
1238 7cf289ca 2004-04-06 devnull }
1239 7cf289ca 2004-04-06 devnull break;
1240 7cf289ca 2004-04-06 devnull case Unicode:
1241 431e32de 2005-09-30 devnull if(ts->i < ts->edata - 1){
1242 cbeb0b26 2006-04-01 devnull /*standards say most-significant byte first */
1243 7cf289ca 2004-04-06 devnull c = (c << 8)|(buf[ts->i + 1]);
1244 7cf289ca 2004-04-06 devnull ts->i += 2;
1245 7cf289ca 2004-04-06 devnull }
1246 7cf289ca 2004-04-06 devnull else {
1247 cbeb0b26 2006-04-01 devnull ts->i = ts->edata; /* mark "all used" */
1248 7cf289ca 2004-04-06 devnull c = -1;
1249 7cf289ca 2004-04-06 devnull }
1250 7cf289ca 2004-04-06 devnull break;
1251 7cf289ca 2004-04-06 devnull }
1252 7cf289ca 2004-04-06 devnull return c;
1253 7cf289ca 2004-04-06 devnull }
1254 7cf289ca 2004-04-06 devnull
1255 cbeb0b26 2006-04-01 devnull /* Assuming c was the last character returned by getchar, set */
1256 cbeb0b26 2006-04-01 devnull /* things up so that next getchar will get that same character */
1257 cbeb0b26 2006-04-01 devnull /* followed by the current 'next character', etc. */
1258 7cf289ca 2004-04-06 devnull static void
1259 7cf289ca 2004-04-06 devnull ungetchar(TokenSource* ts, int c)
1260 7cf289ca 2004-04-06 devnull {
1261 7cf289ca 2004-04-06 devnull int n;
1262 7cf289ca 2004-04-06 devnull Rune r;
1263 7cf289ca 2004-04-06 devnull char a[UTFmax];
1264 7cf289ca 2004-04-06 devnull
1265 7cf289ca 2004-04-06 devnull n = 1;
1266 431e32de 2005-09-30 devnull switch(ts->chset){
1267 7cf289ca 2004-04-06 devnull case UTF_8:
1268 431e32de 2005-09-30 devnull if(c >= 128){
1269 7cf289ca 2004-04-06 devnull r = c;
1270 7cf289ca 2004-04-06 devnull n = runetochar(a, &r);
1271 7cf289ca 2004-04-06 devnull }
1272 7cf289ca 2004-04-06 devnull break;
1273 7cf289ca 2004-04-06 devnull case Unicode:
1274 7cf289ca 2004-04-06 devnull n = 2;
1275 7cf289ca 2004-04-06 devnull break;
1276 7cf289ca 2004-04-06 devnull }
1277 7cf289ca 2004-04-06 devnull ts->i -= n;
1278 7cf289ca 2004-04-06 devnull }
1279 7cf289ca 2004-04-06 devnull
1280 cbeb0b26 2006-04-01 devnull /* Restore ts so that it is at the state where the index was savei. */
1281 7cf289ca 2004-04-06 devnull static void
1282 7cf289ca 2004-04-06 devnull backup(TokenSource* ts, int savei)
1283 7cf289ca 2004-04-06 devnull {
1284 7cf289ca 2004-04-06 devnull if(dbglex)
1285 7cf289ca 2004-04-06 devnull fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1286 7cf289ca 2004-04-06 devnull ts->i = savei;
1287 7cf289ca 2004-04-06 devnull }
1288 7cf289ca 2004-04-06 devnull
1289 7cf289ca 2004-04-06 devnull
1290 cbeb0b26 2006-04-01 devnull /* Look for value associated with attribute attid in token t. */
1291 cbeb0b26 2006-04-01 devnull /* If there is one, return 1 and put the value in *pans, */
1292 cbeb0b26 2006-04-01 devnull /* else return 0. */
1293 cbeb0b26 2006-04-01 devnull /* If xfer is true, transfer ownership of the string to the caller */
1294 cbeb0b26 2006-04-01 devnull /* (nil it out here); otherwise, caller must duplicate the answer */
1295 cbeb0b26 2006-04-01 devnull /* if it needs to save it. */
1296 cbeb0b26 2006-04-01 devnull /* OK to have pans==0, in which case this is just looking */
1297 cbeb0b26 2006-04-01 devnull /* to see if token is present. */
1298 7cf289ca 2004-04-06 devnull int
1299 7cf289ca 2004-04-06 devnull _tokaval(Token* t, int attid, Rune** pans, int xfer)
1300 7cf289ca 2004-04-06 devnull {
1301 7cf289ca 2004-04-06 devnull Attr* attr;
1302 7cf289ca 2004-04-06 devnull
1303 7cf289ca 2004-04-06 devnull attr = t->attr;
1304 431e32de 2005-09-30 devnull while(attr != nil){
1305 431e32de 2005-09-30 devnull if(attr->attid == attid){
1306 7cf289ca 2004-04-06 devnull if(pans != nil)
1307 7cf289ca 2004-04-06 devnull *pans = attr->value;
1308 7cf289ca 2004-04-06 devnull if(xfer)
1309 7cf289ca 2004-04-06 devnull attr->value = nil;
1310 7cf289ca 2004-04-06 devnull return 1;
1311 7cf289ca 2004-04-06 devnull }
1312 7cf289ca 2004-04-06 devnull attr = attr->next;
1313 7cf289ca 2004-04-06 devnull }
1314 7cf289ca 2004-04-06 devnull if(pans != nil)
1315 7cf289ca 2004-04-06 devnull *pans = nil;
1316 7cf289ca 2004-04-06 devnull return 0;
1317 7cf289ca 2004-04-06 devnull }
1318 7cf289ca 2004-04-06 devnull
1319 7cf289ca 2004-04-06 devnull static int
1320 7cf289ca 2004-04-06 devnull Tconv(Fmt *f)
1321 7cf289ca 2004-04-06 devnull {
1322 7cf289ca 2004-04-06 devnull Token* t;
1323 7cf289ca 2004-04-06 devnull int i;
1324 7cf289ca 2004-04-06 devnull int tag;
1325 7cf289ca 2004-04-06 devnull char* srbra;
1326 7cf289ca 2004-04-06 devnull Rune* aname;
1327 7cf289ca 2004-04-06 devnull Rune* tname;
1328 7cf289ca 2004-04-06 devnull Attr* a;
1329 7cf289ca 2004-04-06 devnull char buf[BIGBUFSIZE];
1330 7cf289ca 2004-04-06 devnull
1331 7cf289ca 2004-04-06 devnull t = va_arg(f->args, Token*);
1332 7cf289ca 2004-04-06 devnull if(t == nil)
1333 7cf289ca 2004-04-06 devnull sprint(buf, "<null>");
1334 7cf289ca 2004-04-06 devnull else {
1335 7cf289ca 2004-04-06 devnull i = 0;
1336 7cf289ca 2004-04-06 devnull if(dbglex > 1)
1337 7cf289ca 2004-04-06 devnull i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1338 7cf289ca 2004-04-06 devnull tag = t->tag;
1339 431e32de 2005-09-30 devnull if(tag == Data){
1340 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1341 7cf289ca 2004-04-06 devnull }
1342 7cf289ca 2004-04-06 devnull else {
1343 7cf289ca 2004-04-06 devnull srbra = "";
1344 431e32de 2005-09-30 devnull if(tag >= RBRA){
1345 7cf289ca 2004-04-06 devnull tag -= RBRA;
1346 7cf289ca 2004-04-06 devnull srbra = "/";
1347 7cf289ca 2004-04-06 devnull }
1348 7cf289ca 2004-04-06 devnull tname = tagnames[tag];
1349 7cf289ca 2004-04-06 devnull if(tag == Notfound)
1350 7cf289ca 2004-04-06 devnull tname = L(Lquestion);
1351 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1352 431e32de 2005-09-30 devnull for(a = t->attr; a != nil; a = a->next){
1353 7cf289ca 2004-04-06 devnull aname = attrnames[a->attid];
1354 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1355 7cf289ca 2004-04-06 devnull if(a->value != nil)
1356 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1357 7cf289ca 2004-04-06 devnull }
1358 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, ">");
1359 7cf289ca 2004-04-06 devnull }
1360 7cf289ca 2004-04-06 devnull buf[i] = 0;
1361 7cf289ca 2004-04-06 devnull }
1362 7cf289ca 2004-04-06 devnull return fmtstrcpy(f, buf);
1363 7cf289ca 2004-04-06 devnull }
1364 7cf289ca 2004-04-06 devnull
1365 cbeb0b26 2006-04-01 devnull /* Attrs own their constituent strings, but build may eventually */
1366 cbeb0b26 2006-04-01 devnull /* transfer some values to its items and nil them out in the Attr. */
1367 7cf289ca 2004-04-06 devnull static Attr*
1368 7cf289ca 2004-04-06 devnull newattr(int attid, Rune* value, Attr* link)
1369 7cf289ca 2004-04-06 devnull {
1370 7cf289ca 2004-04-06 devnull Attr* ans;
1371 7cf289ca 2004-04-06 devnull
1372 7cf289ca 2004-04-06 devnull ans = (Attr*)emalloc(sizeof(Attr));
1373 7cf289ca 2004-04-06 devnull ans->attid = attid;
1374 7cf289ca 2004-04-06 devnull ans->value = value;
1375 7cf289ca 2004-04-06 devnull ans->next = link;
1376 7cf289ca 2004-04-06 devnull return ans;
1377 7cf289ca 2004-04-06 devnull }
1378 7cf289ca 2004-04-06 devnull
1379 cbeb0b26 2006-04-01 devnull /* Free list of Attrs linked through next field */
1380 7cf289ca 2004-04-06 devnull static void
1381 7cf289ca 2004-04-06 devnull freeattrs(Attr* ahead)
1382 7cf289ca 2004-04-06 devnull {
1383 7cf289ca 2004-04-06 devnull Attr* a;
1384 7cf289ca 2004-04-06 devnull Attr* nexta;
1385 7cf289ca 2004-04-06 devnull
1386 7cf289ca 2004-04-06 devnull a = ahead;
1387 431e32de 2005-09-30 devnull while(a != nil){
1388 7cf289ca 2004-04-06 devnull nexta = a->next;
1389 7cf289ca 2004-04-06 devnull free(a->value);
1390 7cf289ca 2004-04-06 devnull free(a);
1391 7cf289ca 2004-04-06 devnull a = nexta;
1392 7cf289ca 2004-04-06 devnull }
1393 7cf289ca 2004-04-06 devnull }
1394 7cf289ca 2004-04-06 devnull
1395 cbeb0b26 2006-04-01 devnull /* Free array of Tokens. */
1396 cbeb0b26 2006-04-01 devnull /* Allocated space might have room for more than n tokens, */
1397 cbeb0b26 2006-04-01 devnull /* but only n of them are initialized. */
1398 cbeb0b26 2006-04-01 devnull /* If caller has transferred ownership of constitutent strings */
1399 cbeb0b26 2006-04-01 devnull /* or attributes, it must have nil'd out the pointers in the Tokens. */
1400 7cf289ca 2004-04-06 devnull void
1401 7cf289ca 2004-04-06 devnull _freetokens(Token* tarray, int n)
1402 7cf289ca 2004-04-06 devnull {
1403 7cf289ca 2004-04-06 devnull int i;
1404 7cf289ca 2004-04-06 devnull Token* t;
1405 7cf289ca 2004-04-06 devnull
1406 7cf289ca 2004-04-06 devnull if(tarray == nil)
1407 7cf289ca 2004-04-06 devnull return;
1408 431e32de 2005-09-30 devnull for(i = 0; i < n; i++){
1409 7cf289ca 2004-04-06 devnull t = &tarray[i];
1410 7cf289ca 2004-04-06 devnull free(t->text);
1411 7cf289ca 2004-04-06 devnull freeattrs(t->attr);
1412 7cf289ca 2004-04-06 devnull }
1413 7cf289ca 2004-04-06 devnull free(tarray);
1414 7cf289ca 2004-04-06 devnull }