Blame


1 7cf289ca 2004-04-06 devnull #include <u.h>
2 7cf289ca 2004-04-06 devnull #include <libc.h>
3 7cf289ca 2004-04-06 devnull #include <draw.h>
4 7cf289ca 2004-04-06 devnull #include <ctype.h>
5 7cf289ca 2004-04-06 devnull #include <html.h>
6 7cf289ca 2004-04-06 devnull #include "impl.h"
7 7cf289ca 2004-04-06 devnull
8 7cf289ca 2004-04-06 devnull typedef struct TokenSource TokenSource;
9 7cf289ca 2004-04-06 devnull struct TokenSource
10 7cf289ca 2004-04-06 devnull {
11 7cf289ca 2004-04-06 devnull int i; // index of next byte to use
12 7cf289ca 2004-04-06 devnull uchar* data; // all the data
13 7cf289ca 2004-04-06 devnull int edata; // data[0:edata] is valid
14 7cf289ca 2004-04-06 devnull int chset; // one of US_Ascii, etc.
15 7cf289ca 2004-04-06 devnull int mtype; // TextHtml or TextPlain
16 7cf289ca 2004-04-06 devnull };
17 7cf289ca 2004-04-06 devnull
// Negative sentinel character values; ordinary characters are >= 0.
// (Presumably "end of buffer" and "end of file" -- the code that
// produces them is not in this part of the file.)
enum {
	EOB = -1,
	EOF = -2
};
22 7cf289ca 2004-04-06 devnull
// True if c may appear in a name: an ASCII letter, a digit, '-' or '.'.
// The range test comes first because the <ctype.h> classifiers are only
// defined for unsigned-char values; negative sentinels such as EOF/EOB
// and runes >= 256 must never reach them.
// Note: c is evaluated more than once, so pass an expression without side effects.
#define ISNAMCHAR(c)	((c) >= 0 && (c) < 256 && (isalnum(c) || (c) == '-' || (c) == '.'))
24 7cf289ca 2004-04-06 devnull
// Sizes, in elements, of on-stack scratch buffers.
// BIGBUFSIZE sizes the Rune buffers used while gathering text data.
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 7cf289ca 2004-04-06 devnull
28 7cf289ca 2004-04-06 devnull // HTML 4.0 tag names.
29 7cf289ca 2004-04-06 devnull // Keep sorted, and in correspondence with enum in iparse.h.
30 7cf289ca 2004-04-06 devnull Rune **tagnames;
31 7cf289ca 2004-04-06 devnull char *_tagnames[] = {
32 7cf289ca 2004-04-06 devnull " ",
33 7cf289ca 2004-04-06 devnull "!",
34 7cf289ca 2004-04-06 devnull "a",
35 7cf289ca 2004-04-06 devnull "abbr",
36 7cf289ca 2004-04-06 devnull "acronym",
37 7cf289ca 2004-04-06 devnull "address",
38 7cf289ca 2004-04-06 devnull "applet",
39 7cf289ca 2004-04-06 devnull "area",
40 7cf289ca 2004-04-06 devnull "b",
41 7cf289ca 2004-04-06 devnull "base",
42 7cf289ca 2004-04-06 devnull "basefont",
43 7cf289ca 2004-04-06 devnull "bdo",
44 7cf289ca 2004-04-06 devnull "big",
45 7cf289ca 2004-04-06 devnull "blink",
46 7cf289ca 2004-04-06 devnull "blockquote",
47 7cf289ca 2004-04-06 devnull "body",
48 7cf289ca 2004-04-06 devnull "bq",
49 7cf289ca 2004-04-06 devnull "br",
50 7cf289ca 2004-04-06 devnull "button",
51 7cf289ca 2004-04-06 devnull "caption",
52 7cf289ca 2004-04-06 devnull "center",
53 7cf289ca 2004-04-06 devnull "cite",
54 7cf289ca 2004-04-06 devnull "code",
55 7cf289ca 2004-04-06 devnull "col",
56 7cf289ca 2004-04-06 devnull "colgroup",
57 7cf289ca 2004-04-06 devnull "dd",
58 7cf289ca 2004-04-06 devnull "del",
59 7cf289ca 2004-04-06 devnull "dfn",
60 7cf289ca 2004-04-06 devnull "dir",
61 7cf289ca 2004-04-06 devnull "div",
62 7cf289ca 2004-04-06 devnull "dl",
63 7cf289ca 2004-04-06 devnull "dt",
64 7cf289ca 2004-04-06 devnull "em",
65 7cf289ca 2004-04-06 devnull "fieldset",
66 7cf289ca 2004-04-06 devnull "font",
67 7cf289ca 2004-04-06 devnull "form",
68 7cf289ca 2004-04-06 devnull "frame",
69 7cf289ca 2004-04-06 devnull "frameset",
70 7cf289ca 2004-04-06 devnull "h1",
71 7cf289ca 2004-04-06 devnull "h2",
72 7cf289ca 2004-04-06 devnull "h3",
73 7cf289ca 2004-04-06 devnull "h4",
74 7cf289ca 2004-04-06 devnull "h5",
75 7cf289ca 2004-04-06 devnull "h6",
76 7cf289ca 2004-04-06 devnull "head",
77 7cf289ca 2004-04-06 devnull "hr",
78 7cf289ca 2004-04-06 devnull "html",
79 7cf289ca 2004-04-06 devnull "i",
80 7cf289ca 2004-04-06 devnull "iframe",
81 7cf289ca 2004-04-06 devnull "img",
82 7cf289ca 2004-04-06 devnull "input",
83 7cf289ca 2004-04-06 devnull "ins",
84 7cf289ca 2004-04-06 devnull "isindex",
85 7cf289ca 2004-04-06 devnull "kbd",
86 7cf289ca 2004-04-06 devnull "label",
87 7cf289ca 2004-04-06 devnull "legend",
88 7cf289ca 2004-04-06 devnull "li",
89 7cf289ca 2004-04-06 devnull "link",
90 7cf289ca 2004-04-06 devnull "map",
91 7cf289ca 2004-04-06 devnull "menu",
92 7cf289ca 2004-04-06 devnull "meta",
93 7cf289ca 2004-04-06 devnull "nobr",
94 7cf289ca 2004-04-06 devnull "noframes",
95 7cf289ca 2004-04-06 devnull "noscript",
96 7cf289ca 2004-04-06 devnull "object",
97 7cf289ca 2004-04-06 devnull "ol",
98 7cf289ca 2004-04-06 devnull "optgroup",
99 7cf289ca 2004-04-06 devnull "option",
100 7cf289ca 2004-04-06 devnull "p",
101 7cf289ca 2004-04-06 devnull "param",
102 7cf289ca 2004-04-06 devnull "pre",
103 7cf289ca 2004-04-06 devnull "q",
104 7cf289ca 2004-04-06 devnull "s",
105 7cf289ca 2004-04-06 devnull "samp",
106 7cf289ca 2004-04-06 devnull "script",
107 7cf289ca 2004-04-06 devnull "select",
108 7cf289ca 2004-04-06 devnull "small",
109 7cf289ca 2004-04-06 devnull "span",
110 7cf289ca 2004-04-06 devnull "strike",
111 7cf289ca 2004-04-06 devnull "strong",
112 7cf289ca 2004-04-06 devnull "style",
113 7cf289ca 2004-04-06 devnull "sub",
114 7cf289ca 2004-04-06 devnull "sup",
115 7cf289ca 2004-04-06 devnull "table",
116 7cf289ca 2004-04-06 devnull "tbody",
117 7cf289ca 2004-04-06 devnull "td",
118 7cf289ca 2004-04-06 devnull "textarea",
119 7cf289ca 2004-04-06 devnull "tfoot",
120 7cf289ca 2004-04-06 devnull "th",
121 7cf289ca 2004-04-06 devnull "thead",
122 7cf289ca 2004-04-06 devnull "title",
123 7cf289ca 2004-04-06 devnull "tr",
124 7cf289ca 2004-04-06 devnull "tt",
125 7cf289ca 2004-04-06 devnull "u",
126 7cf289ca 2004-04-06 devnull "ul",
127 7cf289ca 2004-04-06 devnull "var"
128 7cf289ca 2004-04-06 devnull };
129 7cf289ca 2004-04-06 devnull
130 7cf289ca 2004-04-06 devnull // HTML 4.0 attribute names.
131 7cf289ca 2004-04-06 devnull // Keep sorted, and in correspondence with enum in i.h.
132 7cf289ca 2004-04-06 devnull Rune **attrnames;
133 7cf289ca 2004-04-06 devnull char* _attrnames[] = {
134 7cf289ca 2004-04-06 devnull "abbr",
135 7cf289ca 2004-04-06 devnull "accept-charset",
136 7cf289ca 2004-04-06 devnull "access-key",
137 7cf289ca 2004-04-06 devnull "action",
138 7cf289ca 2004-04-06 devnull "align",
139 7cf289ca 2004-04-06 devnull "alink",
140 7cf289ca 2004-04-06 devnull "alt",
141 7cf289ca 2004-04-06 devnull "archive",
142 7cf289ca 2004-04-06 devnull "axis",
143 7cf289ca 2004-04-06 devnull "background",
144 7cf289ca 2004-04-06 devnull "bgcolor",
145 7cf289ca 2004-04-06 devnull "border",
146 7cf289ca 2004-04-06 devnull "cellpadding",
147 7cf289ca 2004-04-06 devnull "cellspacing",
148 7cf289ca 2004-04-06 devnull "char",
149 7cf289ca 2004-04-06 devnull "charoff",
150 7cf289ca 2004-04-06 devnull "charset",
151 7cf289ca 2004-04-06 devnull "checked",
152 7cf289ca 2004-04-06 devnull "cite",
153 7cf289ca 2004-04-06 devnull "class",
154 7cf289ca 2004-04-06 devnull "classid",
155 7cf289ca 2004-04-06 devnull "clear",
156 7cf289ca 2004-04-06 devnull "code",
157 7cf289ca 2004-04-06 devnull "codebase",
158 7cf289ca 2004-04-06 devnull "codetype",
159 7cf289ca 2004-04-06 devnull "color",
160 7cf289ca 2004-04-06 devnull "cols",
161 7cf289ca 2004-04-06 devnull "colspan",
162 7cf289ca 2004-04-06 devnull "compact",
163 7cf289ca 2004-04-06 devnull "content",
164 7cf289ca 2004-04-06 devnull "coords",
165 7cf289ca 2004-04-06 devnull "data",
166 7cf289ca 2004-04-06 devnull "datetime",
167 7cf289ca 2004-04-06 devnull "declare",
168 7cf289ca 2004-04-06 devnull "defer",
169 7cf289ca 2004-04-06 devnull "dir",
170 7cf289ca 2004-04-06 devnull "disabled",
171 7cf289ca 2004-04-06 devnull "enctype",
172 7cf289ca 2004-04-06 devnull "face",
173 7cf289ca 2004-04-06 devnull "for",
174 7cf289ca 2004-04-06 devnull "frame",
175 7cf289ca 2004-04-06 devnull "frameborder",
176 7cf289ca 2004-04-06 devnull "headers",
177 7cf289ca 2004-04-06 devnull "height",
178 7cf289ca 2004-04-06 devnull "href",
179 7cf289ca 2004-04-06 devnull "hreflang",
180 7cf289ca 2004-04-06 devnull "hspace",
181 7cf289ca 2004-04-06 devnull "http-equiv",
182 7cf289ca 2004-04-06 devnull "id",
183 7cf289ca 2004-04-06 devnull "ismap",
184 7cf289ca 2004-04-06 devnull "label",
185 7cf289ca 2004-04-06 devnull "lang",
186 7cf289ca 2004-04-06 devnull "link",
187 7cf289ca 2004-04-06 devnull "longdesc",
188 7cf289ca 2004-04-06 devnull "marginheight",
189 7cf289ca 2004-04-06 devnull "marginwidth",
190 7cf289ca 2004-04-06 devnull "maxlength",
191 7cf289ca 2004-04-06 devnull "media",
192 7cf289ca 2004-04-06 devnull "method",
193 7cf289ca 2004-04-06 devnull "multiple",
194 7cf289ca 2004-04-06 devnull "name",
195 7cf289ca 2004-04-06 devnull "nohref",
196 7cf289ca 2004-04-06 devnull "noresize",
197 7cf289ca 2004-04-06 devnull "noshade",
198 7cf289ca 2004-04-06 devnull "nowrap",
199 7cf289ca 2004-04-06 devnull "object",
200 7cf289ca 2004-04-06 devnull "onblur",
201 7cf289ca 2004-04-06 devnull "onchange",
202 7cf289ca 2004-04-06 devnull "onclick",
203 7cf289ca 2004-04-06 devnull "ondblclick",
204 7cf289ca 2004-04-06 devnull "onfocus",
205 7cf289ca 2004-04-06 devnull "onkeypress",
206 7cf289ca 2004-04-06 devnull "onkeyup",
207 7cf289ca 2004-04-06 devnull "onload",
208 7cf289ca 2004-04-06 devnull "onmousedown",
209 7cf289ca 2004-04-06 devnull "onmousemove",
210 7cf289ca 2004-04-06 devnull "onmouseout",
211 7cf289ca 2004-04-06 devnull "onmouseover",
212 7cf289ca 2004-04-06 devnull "onmouseup",
213 7cf289ca 2004-04-06 devnull "onreset",
214 7cf289ca 2004-04-06 devnull "onselect",
215 7cf289ca 2004-04-06 devnull "onsubmit",
216 7cf289ca 2004-04-06 devnull "onunload",
217 7cf289ca 2004-04-06 devnull "profile",
218 7cf289ca 2004-04-06 devnull "prompt",
219 7cf289ca 2004-04-06 devnull "readonly",
220 7cf289ca 2004-04-06 devnull "rel",
221 7cf289ca 2004-04-06 devnull "rev",
222 7cf289ca 2004-04-06 devnull "rows",
223 7cf289ca 2004-04-06 devnull "rowspan",
224 7cf289ca 2004-04-06 devnull "rules",
225 7cf289ca 2004-04-06 devnull "scheme",
226 7cf289ca 2004-04-06 devnull "scope",
227 7cf289ca 2004-04-06 devnull "scrolling",
228 7cf289ca 2004-04-06 devnull "selected",
229 7cf289ca 2004-04-06 devnull "shape",
230 7cf289ca 2004-04-06 devnull "size",
231 7cf289ca 2004-04-06 devnull "span",
232 7cf289ca 2004-04-06 devnull "src",
233 7cf289ca 2004-04-06 devnull "standby",
234 7cf289ca 2004-04-06 devnull "start",
235 7cf289ca 2004-04-06 devnull "style",
236 7cf289ca 2004-04-06 devnull "summary",
237 7cf289ca 2004-04-06 devnull "tabindex",
238 7cf289ca 2004-04-06 devnull "target",
239 7cf289ca 2004-04-06 devnull "text",
240 7cf289ca 2004-04-06 devnull "title",
241 7cf289ca 2004-04-06 devnull "type",
242 7cf289ca 2004-04-06 devnull "usemap",
243 7cf289ca 2004-04-06 devnull "valign",
244 7cf289ca 2004-04-06 devnull "value",
245 7cf289ca 2004-04-06 devnull "valuetype",
246 7cf289ca 2004-04-06 devnull "version",
247 7cf289ca 2004-04-06 devnull "vlink",
248 7cf289ca 2004-04-06 devnull "vspace",
249 7cf289ca 2004-04-06 devnull "width"
250 7cf289ca 2004-04-06 devnull };
251 7cf289ca 2004-04-06 devnull
252 7cf289ca 2004-04-06 devnull
253 7cf289ca 2004-04-06 devnull // Character entity to unicode character number map.
254 7cf289ca 2004-04-06 devnull // Keep sorted by name.
255 7cf289ca 2004-04-06 devnull StringInt *chartab;
256 7cf289ca 2004-04-06 devnull AsciiInt _chartab[142] = {
257 7cf289ca 2004-04-06 devnull {"AElig", 198},
258 7cf289ca 2004-04-06 devnull {"Aacute", 193},
259 7cf289ca 2004-04-06 devnull {"Acirc", 194},
260 7cf289ca 2004-04-06 devnull {"Agrave", 192},
261 7cf289ca 2004-04-06 devnull {"Aring", 197},
262 7cf289ca 2004-04-06 devnull {"Atilde", 195},
263 7cf289ca 2004-04-06 devnull {"Auml", 196},
264 7cf289ca 2004-04-06 devnull {"Ccedil", 199},
265 7cf289ca 2004-04-06 devnull {"ETH", 208},
266 7cf289ca 2004-04-06 devnull {"Eacute", 201},
267 7cf289ca 2004-04-06 devnull {"Ecirc", 202},
268 7cf289ca 2004-04-06 devnull {"Egrave", 200},
269 7cf289ca 2004-04-06 devnull {"Euml", 203},
270 7cf289ca 2004-04-06 devnull {"Iacute", 205},
271 7cf289ca 2004-04-06 devnull {"Icirc", 206},
272 7cf289ca 2004-04-06 devnull {"Igrave", 204},
273 7cf289ca 2004-04-06 devnull {"Iuml", 207},
274 7cf289ca 2004-04-06 devnull {"Ntilde", 209},
275 7cf289ca 2004-04-06 devnull {"Oacute", 211},
276 7cf289ca 2004-04-06 devnull {"Ocirc", 212},
277 7cf289ca 2004-04-06 devnull {"Ograve", 210},
278 7cf289ca 2004-04-06 devnull {"Oslash", 216},
279 7cf289ca 2004-04-06 devnull {"Otilde", 213},
280 7cf289ca 2004-04-06 devnull {"Ouml", 214},
281 7cf289ca 2004-04-06 devnull {"THORN", 222},
282 7cf289ca 2004-04-06 devnull {"Uacute", 218},
283 7cf289ca 2004-04-06 devnull {"Ucirc", 219},
284 7cf289ca 2004-04-06 devnull {"Ugrave", 217},
285 7cf289ca 2004-04-06 devnull {"Uuml", 220},
286 7cf289ca 2004-04-06 devnull {"Yacute", 221},
287 7cf289ca 2004-04-06 devnull {"aacute", 225},
288 7cf289ca 2004-04-06 devnull {"acirc", 226},
289 7cf289ca 2004-04-06 devnull {"acute", 180},
290 7cf289ca 2004-04-06 devnull {"aelig", 230},
291 7cf289ca 2004-04-06 devnull {"agrave", 224},
292 7cf289ca 2004-04-06 devnull {"alpha", 945},
293 7cf289ca 2004-04-06 devnull {"amp", 38},
294 7cf289ca 2004-04-06 devnull {"aring", 229},
295 7cf289ca 2004-04-06 devnull {"atilde", 227},
296 7cf289ca 2004-04-06 devnull {"auml", 228},
297 7cf289ca 2004-04-06 devnull {"beta", 946},
298 7cf289ca 2004-04-06 devnull {"brvbar", 166},
299 7cf289ca 2004-04-06 devnull {"ccedil", 231},
300 7cf289ca 2004-04-06 devnull {"cdots", 8943},
301 7cf289ca 2004-04-06 devnull {"cedil", 184},
302 7cf289ca 2004-04-06 devnull {"cent", 162},
303 7cf289ca 2004-04-06 devnull {"chi", 967},
304 7cf289ca 2004-04-06 devnull {"copy", 169},
305 7cf289ca 2004-04-06 devnull {"curren", 164},
306 7cf289ca 2004-04-06 devnull {"ddots", 8945},
307 7cf289ca 2004-04-06 devnull {"deg", 176},
308 7cf289ca 2004-04-06 devnull {"delta", 948},
309 7cf289ca 2004-04-06 devnull {"divide", 247},
310 7cf289ca 2004-04-06 devnull {"eacute", 233},
311 7cf289ca 2004-04-06 devnull {"ecirc", 234},
312 7cf289ca 2004-04-06 devnull {"egrave", 232},
313 7cf289ca 2004-04-06 devnull {"emdash", 8212},
314 7cf289ca 2004-04-06 devnull {"emsp", 8195},
315 7cf289ca 2004-04-06 devnull {"endash", 8211},
316 7cf289ca 2004-04-06 devnull {"ensp", 8194},
317 7cf289ca 2004-04-06 devnull {"epsilon", 949},
318 7cf289ca 2004-04-06 devnull {"eta", 951},
319 7cf289ca 2004-04-06 devnull {"eth", 240},
320 7cf289ca 2004-04-06 devnull {"euml", 235},
321 7cf289ca 2004-04-06 devnull {"frac12", 189},
322 7cf289ca 2004-04-06 devnull {"frac14", 188},
323 7cf289ca 2004-04-06 devnull {"frac34", 190},
324 7cf289ca 2004-04-06 devnull {"gamma", 947},
325 7cf289ca 2004-04-06 devnull {"gt", 62},
326 7cf289ca 2004-04-06 devnull {"iacute", 237},
327 7cf289ca 2004-04-06 devnull {"icirc", 238},
328 7cf289ca 2004-04-06 devnull {"iexcl", 161},
329 7cf289ca 2004-04-06 devnull {"igrave", 236},
330 7cf289ca 2004-04-06 devnull {"iota", 953},
331 7cf289ca 2004-04-06 devnull {"iquest", 191},
332 7cf289ca 2004-04-06 devnull {"iuml", 239},
333 7cf289ca 2004-04-06 devnull {"kappa", 954},
334 7cf289ca 2004-04-06 devnull {"lambda", 955},
335 7cf289ca 2004-04-06 devnull {"laquo", 171},
336 7cf289ca 2004-04-06 devnull {"ldots", 8230},
337 7cf289ca 2004-04-06 devnull {"lt", 60},
338 7cf289ca 2004-04-06 devnull {"macr", 175},
339 7cf289ca 2004-04-06 devnull {"micro", 181},
340 7cf289ca 2004-04-06 devnull {"middot", 183},
341 7cf289ca 2004-04-06 devnull {"mu", 956},
342 7cf289ca 2004-04-06 devnull {"nbsp", 160},
343 7cf289ca 2004-04-06 devnull {"not", 172},
344 7cf289ca 2004-04-06 devnull {"ntilde", 241},
345 7cf289ca 2004-04-06 devnull {"nu", 957},
346 7cf289ca 2004-04-06 devnull {"oacute", 243},
347 7cf289ca 2004-04-06 devnull {"ocirc", 244},
348 7cf289ca 2004-04-06 devnull {"ograve", 242},
349 7cf289ca 2004-04-06 devnull {"omega", 969},
350 7cf289ca 2004-04-06 devnull {"omicron", 959},
351 7cf289ca 2004-04-06 devnull {"ordf", 170},
352 7cf289ca 2004-04-06 devnull {"ordm", 186},
353 7cf289ca 2004-04-06 devnull {"oslash", 248},
354 7cf289ca 2004-04-06 devnull {"otilde", 245},
355 7cf289ca 2004-04-06 devnull {"ouml", 246},
356 7cf289ca 2004-04-06 devnull {"para", 182},
357 7cf289ca 2004-04-06 devnull {"phi", 966},
358 7cf289ca 2004-04-06 devnull {"pi", 960},
359 7cf289ca 2004-04-06 devnull {"plusmn", 177},
360 7cf289ca 2004-04-06 devnull {"pound", 163},
361 7cf289ca 2004-04-06 devnull {"psi", 968},
362 7cf289ca 2004-04-06 devnull {"quad", 8193},
363 7cf289ca 2004-04-06 devnull {"quot", 34},
364 7cf289ca 2004-04-06 devnull {"raquo", 187},
365 7cf289ca 2004-04-06 devnull {"reg", 174},
366 7cf289ca 2004-04-06 devnull {"rho", 961},
367 7cf289ca 2004-04-06 devnull {"sect", 167},
368 7cf289ca 2004-04-06 devnull {"shy", 173},
369 7cf289ca 2004-04-06 devnull {"sigma", 963},
370 7cf289ca 2004-04-06 devnull {"sp", 8194},
371 7cf289ca 2004-04-06 devnull {"sup1", 185},
372 7cf289ca 2004-04-06 devnull {"sup2", 178},
373 7cf289ca 2004-04-06 devnull {"sup3", 179},
374 7cf289ca 2004-04-06 devnull {"szlig", 223},
375 7cf289ca 2004-04-06 devnull {"tau", 964},
376 7cf289ca 2004-04-06 devnull {"theta", 952},
377 7cf289ca 2004-04-06 devnull {"thinsp", 8201},
378 7cf289ca 2004-04-06 devnull {"thorn", 254},
379 7cf289ca 2004-04-06 devnull {"times", 215},
380 7cf289ca 2004-04-06 devnull {"trade", 8482},
381 7cf289ca 2004-04-06 devnull {"uacute", 250},
382 7cf289ca 2004-04-06 devnull {"ucirc", 251},
383 7cf289ca 2004-04-06 devnull {"ugrave", 249},
384 7cf289ca 2004-04-06 devnull {"uml", 168},
385 7cf289ca 2004-04-06 devnull {"upsilon", 965},
386 7cf289ca 2004-04-06 devnull {"uuml", 252},
387 7cf289ca 2004-04-06 devnull {"varepsilon", 8712},
388 7cf289ca 2004-04-06 devnull {"varphi", 981},
389 7cf289ca 2004-04-06 devnull {"varpi", 982},
390 7cf289ca 2004-04-06 devnull {"varrho", 1009},
391 7cf289ca 2004-04-06 devnull {"vdots", 8942},
392 7cf289ca 2004-04-06 devnull {"vsigma", 962},
393 7cf289ca 2004-04-06 devnull {"vtheta", 977},
394 7cf289ca 2004-04-06 devnull {"xi", 958},
395 7cf289ca 2004-04-06 devnull {"yacute", 253},
396 7cf289ca 2004-04-06 devnull {"yen", 165},
397 7cf289ca 2004-04-06 devnull {"yuml", 255},
398 7cf289ca 2004-04-06 devnull {"zeta", 950}
399 7cf289ca 2004-04-06 devnull };
// Number of entries in the character entity table.
// Measured on the array _chartab: chartab is only a pointer, so
// sizeof(chartab)/sizeof(chartab[0]) would not give the entry count.
#define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
401 7cf289ca 2004-04-06 devnull
// Windows interpolates extra characters into the Latin1 set at codes
// Winstart..Winend.  They are not supposed to appear in HTML, but they do.
enum {
	Winstart = 127,
	Winend = 159
};

// One entry per code in Winstart..Winend, holding the code point to use
// instead; 8226 (a bullet) fills the slots that have nothing better.
static int winchars[] = {
	8226,							// 127
	8226, 8226, 8218,  402, 8222, 8230, 8224, 8225,		// 128-135
	 710, 8240,  352, 8249,  338, 8226, 8226, 8226,		// 136-143
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,		// 144-151
	 732, 8482,  353, 8250,  339, 8226, 8226,  376		// 152-159
};
415 7cf289ca 2004-04-06 devnull
416 7cf289ca 2004-04-06 devnull static StringInt* tagtable; // initialized from tagnames
417 7cf289ca 2004-04-06 devnull static StringInt* attrtable; // initialized from attrnames
418 7cf289ca 2004-04-06 devnull
419 7cf289ca 2004-04-06 devnull static void lexinit();
420 7cf289ca 2004-04-06 devnull static int getplaindata(TokenSource* ts, Token* a, int* pai);
421 7cf289ca 2004-04-06 devnull static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
422 7cf289ca 2004-04-06 devnull static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
423 7cf289ca 2004-04-06 devnull static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
424 7cf289ca 2004-04-06 devnull static Rune* buftostr(Rune* s, Rune* buf, int j);
425 7cf289ca 2004-04-06 devnull static int comment(TokenSource* ts);
426 7cf289ca 2004-04-06 devnull static int findstr(TokenSource* ts, Rune* s);
427 7cf289ca 2004-04-06 devnull static int ampersand(TokenSource* ts);
428 7cf289ca 2004-04-06 devnull //static int lowerc(int c);
429 7cf289ca 2004-04-06 devnull static int getchar(TokenSource* ts);
430 7cf289ca 2004-04-06 devnull static void ungetchar(TokenSource* ts, int c);
431 7cf289ca 2004-04-06 devnull static void backup(TokenSource* ts, int savei);
432 7cf289ca 2004-04-06 devnull //static void freeinsidetoken(Token* t);
433 7cf289ca 2004-04-06 devnull static void freeattrs(Attr* ahead);
434 7cf289ca 2004-04-06 devnull static Attr* newattr(int attid, Rune* value, Attr* link);
435 7cf289ca 2004-04-06 devnull static int Tconv(Fmt* f);
436 7cf289ca 2004-04-06 devnull
437 7cf289ca 2004-04-06 devnull int dbglex = 0;
438 7cf289ca 2004-04-06 devnull static int lexinited = 0;
439 7cf289ca 2004-04-06 devnull
440 7cf289ca 2004-04-06 devnull static void
441 7cf289ca 2004-04-06 devnull lexinit(void)
442 7cf289ca 2004-04-06 devnull {
443 7cf289ca 2004-04-06 devnull chartab = cvtstringinttab(_chartab, nelem(_chartab));
444 7cf289ca 2004-04-06 devnull tagnames = cvtstringtab(_tagnames, nelem(_tagnames));
445 7cf289ca 2004-04-06 devnull tagtable = _makestrinttab(tagnames, Numtags);
446 7cf289ca 2004-04-06 devnull attrnames = cvtstringtab(_attrnames, nelem(_attrnames));
447 7cf289ca 2004-04-06 devnull attrtable = _makestrinttab(attrnames, Numattrs);
448 7cf289ca 2004-04-06 devnull fmtinstall('T', Tconv);
449 7cf289ca 2004-04-06 devnull lexinited = 1;
450 7cf289ca 2004-04-06 devnull }
451 7cf289ca 2004-04-06 devnull
452 7cf289ca 2004-04-06 devnull static TokenSource*
453 7cf289ca 2004-04-06 devnull newtokensource(uchar* data, int edata, int chset, int mtype)
454 7cf289ca 2004-04-06 devnull {
455 7cf289ca 2004-04-06 devnull TokenSource* ans;
456 7cf289ca 2004-04-06 devnull
457 7cf289ca 2004-04-06 devnull assert(chset == US_Ascii || chset == ISO_8859_1 ||
458 7cf289ca 2004-04-06 devnull chset == UTF_8 || chset == Unicode);
459 7cf289ca 2004-04-06 devnull ans = (TokenSource*)emalloc(sizeof(TokenSource));
460 7cf289ca 2004-04-06 devnull ans->i = 0;
461 7cf289ca 2004-04-06 devnull ans->data = data;
462 7cf289ca 2004-04-06 devnull ans->edata = edata;
463 7cf289ca 2004-04-06 devnull ans->chset = chset;
464 7cf289ca 2004-04-06 devnull ans->mtype = mtype;
465 7cf289ca 2004-04-06 devnull return ans;
466 7cf289ca 2004-04-06 devnull }
467 7cf289ca 2004-04-06 devnull
468 7cf289ca 2004-04-06 devnull enum {
469 7cf289ca 2004-04-06 devnull ToksChunk = 500
470 7cf289ca 2004-04-06 devnull };
471 7cf289ca 2004-04-06 devnull
472 7cf289ca 2004-04-06 devnull // Call this to get the tokens.
473 7cf289ca 2004-04-06 devnull // The number of returned tokens is returned in *plen.
474 7cf289ca 2004-04-06 devnull Token*
475 7cf289ca 2004-04-06 devnull _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
476 7cf289ca 2004-04-06 devnull {
477 7cf289ca 2004-04-06 devnull TokenSource* ts;
478 7cf289ca 2004-04-06 devnull Token* a;
479 7cf289ca 2004-04-06 devnull int alen;
480 7cf289ca 2004-04-06 devnull int ai;
481 7cf289ca 2004-04-06 devnull int starti;
482 7cf289ca 2004-04-06 devnull int c;
483 7cf289ca 2004-04-06 devnull int tag;
484 7cf289ca 2004-04-06 devnull
485 7cf289ca 2004-04-06 devnull if(!lexinited)
486 7cf289ca 2004-04-06 devnull lexinit();
487 7cf289ca 2004-04-06 devnull ts = newtokensource(data, datalen, chset, mtype);
488 7cf289ca 2004-04-06 devnull alen = ToksChunk;
489 7cf289ca 2004-04-06 devnull a = (Token*)emalloc(alen * sizeof(Token));
490 7cf289ca 2004-04-06 devnull ai = 0;
491 7cf289ca 2004-04-06 devnull if(dbglex)
492 7cf289ca 2004-04-06 devnull fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
493 7cf289ca 2004-04-06 devnull if(ts->mtype == TextHtml) {
494 7cf289ca 2004-04-06 devnull for(;;) {
495 7cf289ca 2004-04-06 devnull if(ai == alen) {
496 7cf289ca 2004-04-06 devnull a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
497 7cf289ca 2004-04-06 devnull alen += ToksChunk;
498 7cf289ca 2004-04-06 devnull }
499 7cf289ca 2004-04-06 devnull starti = ts->i;
500 7cf289ca 2004-04-06 devnull c = getchar(ts);
501 7cf289ca 2004-04-06 devnull if(c < 0)
502 7cf289ca 2004-04-06 devnull break;
503 7cf289ca 2004-04-06 devnull if(c == '<') {
504 7cf289ca 2004-04-06 devnull tag = gettag(ts, starti, a, &ai);
505 7cf289ca 2004-04-06 devnull if(tag == Tscript) {
506 7cf289ca 2004-04-06 devnull // special rules for getting Data after....
507 7cf289ca 2004-04-06 devnull starti = ts->i;
508 7cf289ca 2004-04-06 devnull c = getchar(ts);
509 7cf289ca 2004-04-06 devnull tag = getscriptdata(ts, c, starti, a, &ai);
510 7cf289ca 2004-04-06 devnull }
511 7cf289ca 2004-04-06 devnull }
512 7cf289ca 2004-04-06 devnull else
513 7cf289ca 2004-04-06 devnull tag = getdata(ts, c, starti, a, &ai);
514 7cf289ca 2004-04-06 devnull if(tag == -1)
515 7cf289ca 2004-04-06 devnull break;
516 7cf289ca 2004-04-06 devnull else if(dbglex > 1 && tag != Comment)
517 7cf289ca 2004-04-06 devnull fprint(2, "lex: got token %T\n", &a[ai-1]);
518 7cf289ca 2004-04-06 devnull }
519 7cf289ca 2004-04-06 devnull }
520 7cf289ca 2004-04-06 devnull else {
521 7cf289ca 2004-04-06 devnull // plain text (non-html) tokens
522 7cf289ca 2004-04-06 devnull for(;;) {
523 7cf289ca 2004-04-06 devnull if(ai == alen) {
524 7cf289ca 2004-04-06 devnull a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
525 7cf289ca 2004-04-06 devnull alen += ToksChunk;
526 7cf289ca 2004-04-06 devnull }
527 7cf289ca 2004-04-06 devnull tag = getplaindata(ts, a, &ai);
528 7cf289ca 2004-04-06 devnull if(tag == -1)
529 7cf289ca 2004-04-06 devnull break;
530 7cf289ca 2004-04-06 devnull if(dbglex > 1)
531 7cf289ca 2004-04-06 devnull fprint(2, "lex: got token %T\n", &a[ai]);
532 7cf289ca 2004-04-06 devnull }
533 7cf289ca 2004-04-06 devnull }
534 7cf289ca 2004-04-06 devnull if(dbglex)
535 7cf289ca 2004-04-06 devnull fprint(2, "lex: returning %d tokens\n", ai);
536 7cf289ca 2004-04-06 devnull *plen = ai;
537 7cf289ca 2004-04-06 devnull if(ai == 0)
538 7cf289ca 2004-04-06 devnull return nil;
539 7cf289ca 2004-04-06 devnull return a;
540 7cf289ca 2004-04-06 devnull }
541 7cf289ca 2004-04-06 devnull
542 7cf289ca 2004-04-06 devnull // For case where source isn't HTML.
543 7cf289ca 2004-04-06 devnull // Just make data tokens, one per line (or partial line,
544 7cf289ca 2004-04-06 devnull // at end of buffer), ignoring non-whitespace control
545 7cf289ca 2004-04-06 devnull // characters and dumping \r's.
546 7cf289ca 2004-04-06 devnull // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
547 7cf289ca 2004-04-06 devnull // Otherwise return -1;
548 7cf289ca 2004-04-06 devnull static int
549 7cf289ca 2004-04-06 devnull getplaindata(TokenSource* ts, Token* a, int* pai)
550 7cf289ca 2004-04-06 devnull {
551 7cf289ca 2004-04-06 devnull Rune* s;
552 7cf289ca 2004-04-06 devnull int j;
553 7cf289ca 2004-04-06 devnull int starti;
554 7cf289ca 2004-04-06 devnull int c;
555 7cf289ca 2004-04-06 devnull Token* tok;
556 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
557 7cf289ca 2004-04-06 devnull
558 7cf289ca 2004-04-06 devnull s = nil;
559 7cf289ca 2004-04-06 devnull j = 0;
560 7cf289ca 2004-04-06 devnull starti = ts->i;
561 7cf289ca 2004-04-06 devnull for(c = getchar(ts); c >= 0; c = getchar(ts)) {
562 7cf289ca 2004-04-06 devnull if(c < ' ') {
563 7cf289ca 2004-04-06 devnull if(isspace(c)) {
564 7cf289ca 2004-04-06 devnull if(c == '\r') {
565 7cf289ca 2004-04-06 devnull // ignore it unless no following '\n',
566 7cf289ca 2004-04-06 devnull // in which case treat it like '\n'
567 7cf289ca 2004-04-06 devnull c = getchar(ts);
568 7cf289ca 2004-04-06 devnull if(c != '\n') {
569 7cf289ca 2004-04-06 devnull if(c >= 0)
570 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
571 7cf289ca 2004-04-06 devnull c = '\n';
572 7cf289ca 2004-04-06 devnull }
573 7cf289ca 2004-04-06 devnull }
574 7cf289ca 2004-04-06 devnull }
575 7cf289ca 2004-04-06 devnull else
576 7cf289ca 2004-04-06 devnull c = 0;
577 7cf289ca 2004-04-06 devnull }
578 7cf289ca 2004-04-06 devnull if(c != 0) {
579 7cf289ca 2004-04-06 devnull buf[j++] = c;
580 7cf289ca 2004-04-06 devnull if(j == sizeof(buf)-1) {
581 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
582 7cf289ca 2004-04-06 devnull j = 0;
583 7cf289ca 2004-04-06 devnull }
584 7cf289ca 2004-04-06 devnull }
585 7cf289ca 2004-04-06 devnull if(c == '\n')
586 7cf289ca 2004-04-06 devnull break;
587 7cf289ca 2004-04-06 devnull }
588 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
589 7cf289ca 2004-04-06 devnull if(s == nil)
590 7cf289ca 2004-04-06 devnull return -1;
591 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
592 7cf289ca 2004-04-06 devnull tok->tag = Data;
593 7cf289ca 2004-04-06 devnull tok->text = s;
594 7cf289ca 2004-04-06 devnull tok->attr = nil;
595 7cf289ca 2004-04-06 devnull tok->starti = starti;
596 7cf289ca 2004-04-06 devnull return Data;
597 7cf289ca 2004-04-06 devnull }
598 7cf289ca 2004-04-06 devnull
599 7cf289ca 2004-04-06 devnull // Return concatenation of s and buf[0:j]
600 7cf289ca 2004-04-06 devnull static Rune*
601 7cf289ca 2004-04-06 devnull buftostr(Rune* s, Rune* buf, int j)
602 7cf289ca 2004-04-06 devnull {
603 7cf289ca 2004-04-06 devnull buf[j] = 0;
604 7cf289ca 2004-04-06 devnull if(s == nil)
605 7cf289ca 2004-04-06 devnull s = _Strndup(buf, j);
606 7cf289ca 2004-04-06 devnull else
607 7cf289ca 2004-04-06 devnull s = _Strdup2(s, buf);
608 7cf289ca 2004-04-06 devnull return s;
609 7cf289ca 2004-04-06 devnull }
610 7cf289ca 2004-04-06 devnull
611 7cf289ca 2004-04-06 devnull // Gather data up to next start-of-tag or end-of-buffer.
612 7cf289ca 2004-04-06 devnull // Translate entity references (&amp;).
613 7cf289ca 2004-04-06 devnull // Ignore non-whitespace control characters and get rid of \r's.
614 7cf289ca 2004-04-06 devnull // If find non-empty token, fill in a[*pai], bump *pai, and return Data.
615 7cf289ca 2004-04-06 devnull // Otherwise return -1;
616 7cf289ca 2004-04-06 devnull static int
617 7cf289ca 2004-04-06 devnull getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
618 7cf289ca 2004-04-06 devnull {
619 7cf289ca 2004-04-06 devnull Rune* s;
620 7cf289ca 2004-04-06 devnull int j;
621 7cf289ca 2004-04-06 devnull int c;
622 7cf289ca 2004-04-06 devnull Token* tok;
623 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
624 7cf289ca 2004-04-06 devnull
625 7cf289ca 2004-04-06 devnull s = nil;
626 7cf289ca 2004-04-06 devnull j = 0;
627 7cf289ca 2004-04-06 devnull c = firstc;
628 7cf289ca 2004-04-06 devnull while(c >= 0) {
629 7cf289ca 2004-04-06 devnull if(c == '&') {
630 7cf289ca 2004-04-06 devnull c = ampersand(ts);
631 7cf289ca 2004-04-06 devnull if(c < 0)
632 7cf289ca 2004-04-06 devnull break;
633 7cf289ca 2004-04-06 devnull }
634 7cf289ca 2004-04-06 devnull else if(c < ' ') {
635 7cf289ca 2004-04-06 devnull if(isspace(c)) {
636 7cf289ca 2004-04-06 devnull if(c == '\r') {
637 7cf289ca 2004-04-06 devnull // ignore it unless no following '\n',
638 7cf289ca 2004-04-06 devnull // in which case treat it like '\n'
639 7cf289ca 2004-04-06 devnull c = getchar(ts);
640 7cf289ca 2004-04-06 devnull if(c != '\n') {
641 7cf289ca 2004-04-06 devnull if(c >= 0)
642 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
643 7cf289ca 2004-04-06 devnull c = '\n';
644 7cf289ca 2004-04-06 devnull }
645 7cf289ca 2004-04-06 devnull }
646 7cf289ca 2004-04-06 devnull }
647 7cf289ca 2004-04-06 devnull else {
648 7cf289ca 2004-04-06 devnull if(warn)
649 7cf289ca 2004-04-06 devnull fprint(2, "warning: non-whitespace control character %d ignored\n", c);
650 7cf289ca 2004-04-06 devnull c = 0;
651 7cf289ca 2004-04-06 devnull }
652 7cf289ca 2004-04-06 devnull }
653 7cf289ca 2004-04-06 devnull else if(c == '<') {
654 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
655 7cf289ca 2004-04-06 devnull break;
656 7cf289ca 2004-04-06 devnull }
657 7cf289ca 2004-04-06 devnull if(c != 0) {
658 7cf289ca 2004-04-06 devnull buf[j++] = c;
659 7cf289ca 2004-04-06 devnull if(j == BIGBUFSIZE-1) {
660 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
661 7cf289ca 2004-04-06 devnull j = 0;
662 7cf289ca 2004-04-06 devnull }
663 7cf289ca 2004-04-06 devnull }
664 7cf289ca 2004-04-06 devnull c = getchar(ts);
665 7cf289ca 2004-04-06 devnull }
666 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
667 7cf289ca 2004-04-06 devnull if(s == nil)
668 7cf289ca 2004-04-06 devnull return -1;
669 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
670 7cf289ca 2004-04-06 devnull tok->tag = Data;
671 7cf289ca 2004-04-06 devnull tok->text = s;
672 7cf289ca 2004-04-06 devnull tok->attr = nil;
673 7cf289ca 2004-04-06 devnull tok->starti = starti;
674 7cf289ca 2004-04-06 devnull return Data;
675 7cf289ca 2004-04-06 devnull }
676 7cf289ca 2004-04-06 devnull
677 7cf289ca 2004-04-06 devnull // The rules for lexing scripts are different (ugh).
678 7cf289ca 2004-04-06 devnull // Gather up everything until see a </SCRIPT>.
679 7cf289ca 2004-04-06 devnull static int
680 7cf289ca 2004-04-06 devnull getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
681 7cf289ca 2004-04-06 devnull {
682 7cf289ca 2004-04-06 devnull Rune* s;
683 7cf289ca 2004-04-06 devnull int j;
684 7cf289ca 2004-04-06 devnull int tstarti;
685 7cf289ca 2004-04-06 devnull int savei;
686 7cf289ca 2004-04-06 devnull int c;
687 7cf289ca 2004-04-06 devnull int tag;
688 7cf289ca 2004-04-06 devnull int done;
689 7cf289ca 2004-04-06 devnull Token* tok;
690 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
691 7cf289ca 2004-04-06 devnull
692 7cf289ca 2004-04-06 devnull s = nil;
693 7cf289ca 2004-04-06 devnull j = 0;
694 7cf289ca 2004-04-06 devnull tstarti = starti;
695 7cf289ca 2004-04-06 devnull c = firstc;
696 7cf289ca 2004-04-06 devnull done = 0;
697 7cf289ca 2004-04-06 devnull while(c >= 0) {
698 7cf289ca 2004-04-06 devnull if(c == '<') {
699 7cf289ca 2004-04-06 devnull // other browsers ignore stuff to end of line after <!
700 7cf289ca 2004-04-06 devnull savei = ts->i;
701 7cf289ca 2004-04-06 devnull c = getchar(ts);
702 7cf289ca 2004-04-06 devnull if(c == '!') {
703 7cf289ca 2004-04-06 devnull while(c >= 0 && c != '\n' && c != '\r')
704 7cf289ca 2004-04-06 devnull c = getchar(ts);
705 7cf289ca 2004-04-06 devnull if(c == '\r')
706 7cf289ca 2004-04-06 devnull c = getchar(ts);
707 7cf289ca 2004-04-06 devnull if(c == '\n')
708 7cf289ca 2004-04-06 devnull c = getchar(ts);
709 7cf289ca 2004-04-06 devnull }
710 7cf289ca 2004-04-06 devnull else if(c >= 0) {
711 7cf289ca 2004-04-06 devnull backup(ts, savei);
712 7cf289ca 2004-04-06 devnull tag = gettag(ts, tstarti, a, pai);
713 7cf289ca 2004-04-06 devnull if(tag == -1)
714 7cf289ca 2004-04-06 devnull break;
715 7cf289ca 2004-04-06 devnull if(tag != Comment)
716 7cf289ca 2004-04-06 devnull (*pai)--;
717 7cf289ca 2004-04-06 devnull backup(ts, tstarti);
718 7cf289ca 2004-04-06 devnull if(tag == Tscript + RBRA) {
719 7cf289ca 2004-04-06 devnull done = 1;
720 7cf289ca 2004-04-06 devnull break;
721 7cf289ca 2004-04-06 devnull }
722 7cf289ca 2004-04-06 devnull // here tag was not </SCRIPT>, so take as regular data
723 7cf289ca 2004-04-06 devnull c = getchar(ts);
724 7cf289ca 2004-04-06 devnull }
725 7cf289ca 2004-04-06 devnull }
726 7cf289ca 2004-04-06 devnull if(c < 0)
727 7cf289ca 2004-04-06 devnull break;
728 7cf289ca 2004-04-06 devnull if(c != 0) {
729 7cf289ca 2004-04-06 devnull buf[j++] = c;
730 7cf289ca 2004-04-06 devnull if(j == BIGBUFSIZE-1) {
731 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
732 7cf289ca 2004-04-06 devnull j = 0;
733 7cf289ca 2004-04-06 devnull }
734 7cf289ca 2004-04-06 devnull }
735 7cf289ca 2004-04-06 devnull tstarti = ts->i;
736 7cf289ca 2004-04-06 devnull c = getchar(ts);
737 7cf289ca 2004-04-06 devnull }
738 7cf289ca 2004-04-06 devnull if(done || ts->i == ts->edata) {
739 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
740 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
741 7cf289ca 2004-04-06 devnull tok->tag = Data;
742 7cf289ca 2004-04-06 devnull tok->text = s;
743 7cf289ca 2004-04-06 devnull tok->attr = nil;
744 7cf289ca 2004-04-06 devnull tok->starti = starti;
745 7cf289ca 2004-04-06 devnull return Data;
746 7cf289ca 2004-04-06 devnull }
747 7cf289ca 2004-04-06 devnull backup(ts, starti);
748 7cf289ca 2004-04-06 devnull return -1;
749 7cf289ca 2004-04-06 devnull }
750 7cf289ca 2004-04-06 devnull
751 7cf289ca 2004-04-06 devnull // We've just seen a '<'. Gather up stuff to closing '>' (if buffer
752 7cf289ca 2004-04-06 devnull // ends before then, return -1).
753 7cf289ca 2004-04-06 devnull // If it's a tag, look up the name, gather the attributes, and return
754 7cf289ca 2004-04-06 devnull // the appropriate token.
755 7cf289ca 2004-04-06 devnull // Else it's either just plain data or some kind of ignorable stuff:
756 7cf289ca 2004-04-06 devnull // return Data or Comment as appropriate.
757 7cf289ca 2004-04-06 devnull // If it's not a Comment, put it in a[*pai] and bump *pai.
758 7cf289ca 2004-04-06 devnull static int
759 7cf289ca 2004-04-06 devnull gettag(TokenSource* ts, int starti, Token* a, int* pai)
760 7cf289ca 2004-04-06 devnull {
761 7cf289ca 2004-04-06 devnull int rbra;
762 7cf289ca 2004-04-06 devnull int ans;
763 7cf289ca 2004-04-06 devnull Attr* al;
764 7cf289ca 2004-04-06 devnull int nexti;
765 7cf289ca 2004-04-06 devnull int c;
766 7cf289ca 2004-04-06 devnull int ti;
767 7cf289ca 2004-04-06 devnull int afnd;
768 7cf289ca 2004-04-06 devnull int attid;
769 7cf289ca 2004-04-06 devnull int quote;
770 7cf289ca 2004-04-06 devnull Rune* val;
771 7cf289ca 2004-04-06 devnull int nv;
772 7cf289ca 2004-04-06 devnull int i;
773 7cf289ca 2004-04-06 devnull int tag;
774 7cf289ca 2004-04-06 devnull Token* tok;
775 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
776 7cf289ca 2004-04-06 devnull
777 7cf289ca 2004-04-06 devnull rbra = 0;
778 7cf289ca 2004-04-06 devnull nexti = ts->i;
779 7cf289ca 2004-04-06 devnull tok = &a[*pai];
780 7cf289ca 2004-04-06 devnull tok->tag = Notfound;
781 7cf289ca 2004-04-06 devnull tok->text = nil;
782 7cf289ca 2004-04-06 devnull tok->attr = nil;
783 7cf289ca 2004-04-06 devnull tok->starti = starti;
784 7cf289ca 2004-04-06 devnull c = getchar(ts);
785 7cf289ca 2004-04-06 devnull if(c == '/') {
786 7cf289ca 2004-04-06 devnull rbra = RBRA;
787 7cf289ca 2004-04-06 devnull c = getchar(ts);
788 7cf289ca 2004-04-06 devnull }
789 7cf289ca 2004-04-06 devnull if(c < 0)
790 7cf289ca 2004-04-06 devnull goto eob_done;
791 7cf289ca 2004-04-06 devnull if(c >= 256 || !isalpha(c)) {
792 7cf289ca 2004-04-06 devnull // not a tag
793 7cf289ca 2004-04-06 devnull if(c == '!') {
794 7cf289ca 2004-04-06 devnull ans = comment(ts);
795 7cf289ca 2004-04-06 devnull if(ans != -1)
796 7cf289ca 2004-04-06 devnull return ans;
797 7cf289ca 2004-04-06 devnull goto eob_done;
798 7cf289ca 2004-04-06 devnull }
799 7cf289ca 2004-04-06 devnull else {
800 7cf289ca 2004-04-06 devnull backup(ts, nexti);
801 7cf289ca 2004-04-06 devnull tok->tag = Data;
802 7cf289ca 2004-04-06 devnull tok->text = _Strdup(L(Llt));
803 7cf289ca 2004-04-06 devnull (*pai)++;
804 7cf289ca 2004-04-06 devnull return Data;
805 7cf289ca 2004-04-06 devnull }
806 7cf289ca 2004-04-06 devnull }
807 7cf289ca 2004-04-06 devnull // c starts a tagname
808 7cf289ca 2004-04-06 devnull buf[0] = c;
809 7cf289ca 2004-04-06 devnull i = 1;
810 7cf289ca 2004-04-06 devnull while(1) {
811 7cf289ca 2004-04-06 devnull c = getchar(ts);
812 7cf289ca 2004-04-06 devnull if(c < 0)
813 7cf289ca 2004-04-06 devnull goto eob_done;
814 7cf289ca 2004-04-06 devnull if(!ISNAMCHAR(c))
815 7cf289ca 2004-04-06 devnull break;
816 7cf289ca 2004-04-06 devnull // if name is bigger than buf it won't be found anyway...
817 7cf289ca 2004-04-06 devnull if(i < BIGBUFSIZE)
818 7cf289ca 2004-04-06 devnull buf[i++] = c;
819 7cf289ca 2004-04-06 devnull }
820 7cf289ca 2004-04-06 devnull if(_lookup(tagtable, Numtags, buf, i, &tag))
821 7cf289ca 2004-04-06 devnull tok->tag = tag + rbra;
822 7cf289ca 2004-04-06 devnull else
823 7cf289ca 2004-04-06 devnull tok->text = _Strndup(buf, i); // for warning print, in build
824 7cf289ca 2004-04-06 devnull
825 7cf289ca 2004-04-06 devnull // attribute gathering loop
826 7cf289ca 2004-04-06 devnull al = nil;
827 7cf289ca 2004-04-06 devnull while(1) {
828 7cf289ca 2004-04-06 devnull // look for "ws name" or "ws name ws = ws val" (ws=whitespace)
829 7cf289ca 2004-04-06 devnull // skip whitespace
830 7cf289ca 2004-04-06 devnull attrloop_continue:
831 7cf289ca 2004-04-06 devnull while(c < 256 && isspace(c)) {
832 7cf289ca 2004-04-06 devnull c = getchar(ts);
833 7cf289ca 2004-04-06 devnull if(c < 0)
834 7cf289ca 2004-04-06 devnull goto eob_done;
835 7cf289ca 2004-04-06 devnull }
836 7cf289ca 2004-04-06 devnull if(c == '>')
837 7cf289ca 2004-04-06 devnull goto attrloop_done;
838 7cf289ca 2004-04-06 devnull if(c == '<') {
839 7cf289ca 2004-04-06 devnull if(warn)
840 7cf289ca 2004-04-06 devnull fprint(2, "warning: unclosed tag\n");
841 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
842 7cf289ca 2004-04-06 devnull goto attrloop_done;
843 7cf289ca 2004-04-06 devnull }
844 7cf289ca 2004-04-06 devnull if(c >= 256 || !isalpha(c)) {
845 7cf289ca 2004-04-06 devnull if(warn)
846 7cf289ca 2004-04-06 devnull fprint(2, "warning: expected attribute name\n");
847 7cf289ca 2004-04-06 devnull // skipt to next attribute name
848 7cf289ca 2004-04-06 devnull while(1) {
849 7cf289ca 2004-04-06 devnull c = getchar(ts);
850 7cf289ca 2004-04-06 devnull if(c < 0)
851 7cf289ca 2004-04-06 devnull goto eob_done;
852 7cf289ca 2004-04-06 devnull if(c < 256 && isalpha(c))
853 7cf289ca 2004-04-06 devnull goto attrloop_continue;
854 7cf289ca 2004-04-06 devnull if(c == '<') {
855 7cf289ca 2004-04-06 devnull if(warn)
856 7cf289ca 2004-04-06 devnull fprint(2, "warning: unclosed tag\n");
857 7cf289ca 2004-04-06 devnull ungetchar(ts, 60);
858 7cf289ca 2004-04-06 devnull goto attrloop_done;
859 7cf289ca 2004-04-06 devnull }
860 7cf289ca 2004-04-06 devnull if(c == '>')
861 7cf289ca 2004-04-06 devnull goto attrloop_done;
862 7cf289ca 2004-04-06 devnull }
863 7cf289ca 2004-04-06 devnull }
864 7cf289ca 2004-04-06 devnull // gather attribute name
865 7cf289ca 2004-04-06 devnull buf[0] = c;
866 7cf289ca 2004-04-06 devnull i = 1;
867 7cf289ca 2004-04-06 devnull while(1) {
868 7cf289ca 2004-04-06 devnull c = getchar(ts);
869 7cf289ca 2004-04-06 devnull if(c < 0)
870 7cf289ca 2004-04-06 devnull goto eob_done;
871 7cf289ca 2004-04-06 devnull if(!ISNAMCHAR(c))
872 7cf289ca 2004-04-06 devnull break;
873 7cf289ca 2004-04-06 devnull if(i < BIGBUFSIZE-1)
874 7cf289ca 2004-04-06 devnull buf[i++] = c;
875 7cf289ca 2004-04-06 devnull }
876 7cf289ca 2004-04-06 devnull afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
877 7cf289ca 2004-04-06 devnull if(warn && !afnd) {
878 7cf289ca 2004-04-06 devnull buf[i] = 0;
879 7cf289ca 2004-04-06 devnull fprint(2, "warning: unknown attribute name %S\n", buf);
880 7cf289ca 2004-04-06 devnull }
881 7cf289ca 2004-04-06 devnull // skip whitespace
882 7cf289ca 2004-04-06 devnull while(c < 256 && isspace(c)) {
883 7cf289ca 2004-04-06 devnull c = getchar(ts);
884 7cf289ca 2004-04-06 devnull if(c < 0)
885 7cf289ca 2004-04-06 devnull goto eob_done;
886 7cf289ca 2004-04-06 devnull }
887 7cf289ca 2004-04-06 devnull if(c != '=') {
888 7cf289ca 2004-04-06 devnull if(afnd)
889 7cf289ca 2004-04-06 devnull al = newattr(attid, nil, al);
890 7cf289ca 2004-04-06 devnull goto attrloop_continue;
891 7cf289ca 2004-04-06 devnull }
892 7cf289ca 2004-04-06 devnull //# c is '=' here; skip whitespace
893 7cf289ca 2004-04-06 devnull while(1) {
894 7cf289ca 2004-04-06 devnull c = getchar(ts);
895 7cf289ca 2004-04-06 devnull if(c < 0)
896 7cf289ca 2004-04-06 devnull goto eob_done;
897 7cf289ca 2004-04-06 devnull if(c >= 256 || !isspace(c))
898 7cf289ca 2004-04-06 devnull break;
899 7cf289ca 2004-04-06 devnull }
900 7cf289ca 2004-04-06 devnull quote = 0;
901 7cf289ca 2004-04-06 devnull if(c == '\'' || c == '"') {
902 7cf289ca 2004-04-06 devnull quote = c;
903 7cf289ca 2004-04-06 devnull c = getchar(ts);
904 7cf289ca 2004-04-06 devnull if(c < 0)
905 7cf289ca 2004-04-06 devnull goto eob_done;
906 7cf289ca 2004-04-06 devnull }
907 7cf289ca 2004-04-06 devnull val = nil;
908 7cf289ca 2004-04-06 devnull nv = 0;
909 7cf289ca 2004-04-06 devnull while(1) {
910 7cf289ca 2004-04-06 devnull valloop_continue:
911 7cf289ca 2004-04-06 devnull if(c < 0)
912 7cf289ca 2004-04-06 devnull goto eob_done;
913 7cf289ca 2004-04-06 devnull if(c == '>') {
914 7cf289ca 2004-04-06 devnull if(quote) {
915 7cf289ca 2004-04-06 devnull // c might be part of string (though not good style)
916 7cf289ca 2004-04-06 devnull // but if line ends before close quote, assume
917 7cf289ca 2004-04-06 devnull // there was an unmatched quote
918 7cf289ca 2004-04-06 devnull ti = ts->i;
919 7cf289ca 2004-04-06 devnull while(1) {
920 7cf289ca 2004-04-06 devnull c = getchar(ts);
921 7cf289ca 2004-04-06 devnull if(c < 0)
922 7cf289ca 2004-04-06 devnull goto eob_done;
923 7cf289ca 2004-04-06 devnull if(c == quote) {
924 7cf289ca 2004-04-06 devnull backup(ts, ti);
925 7cf289ca 2004-04-06 devnull buf[nv++] = '>';
926 7cf289ca 2004-04-06 devnull if(nv == BIGBUFSIZE-1) {
927 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
928 7cf289ca 2004-04-06 devnull nv = 0;
929 7cf289ca 2004-04-06 devnull }
930 7cf289ca 2004-04-06 devnull c = getchar(ts);
931 7cf289ca 2004-04-06 devnull goto valloop_continue;
932 7cf289ca 2004-04-06 devnull }
933 7cf289ca 2004-04-06 devnull if(c == '\n') {
934 7cf289ca 2004-04-06 devnull if(warn)
935 7cf289ca 2004-04-06 devnull fprint(2, "warning: apparent unmatched quote\n");
936 7cf289ca 2004-04-06 devnull backup(ts, ti);
937 7cf289ca 2004-04-06 devnull c = '>';
938 7cf289ca 2004-04-06 devnull goto valloop_done;
939 7cf289ca 2004-04-06 devnull }
940 7cf289ca 2004-04-06 devnull }
941 7cf289ca 2004-04-06 devnull }
942 7cf289ca 2004-04-06 devnull else
943 7cf289ca 2004-04-06 devnull goto valloop_done;
944 7cf289ca 2004-04-06 devnull }
945 7cf289ca 2004-04-06 devnull if(quote) {
946 7cf289ca 2004-04-06 devnull if(c == quote) {
947 7cf289ca 2004-04-06 devnull c = getchar(ts);
948 7cf289ca 2004-04-06 devnull if(c < 0)
949 7cf289ca 2004-04-06 devnull goto eob_done;
950 7cf289ca 2004-04-06 devnull goto valloop_done;
951 7cf289ca 2004-04-06 devnull }
952 7cf289ca 2004-04-06 devnull if(c == '\r') {
953 7cf289ca 2004-04-06 devnull c = getchar(ts);
954 7cf289ca 2004-04-06 devnull goto valloop_continue;
955 7cf289ca 2004-04-06 devnull }
956 7cf289ca 2004-04-06 devnull if(c == '\t' || c == '\n')
957 7cf289ca 2004-04-06 devnull c = ' ';
958 7cf289ca 2004-04-06 devnull }
959 7cf289ca 2004-04-06 devnull else {
960 7cf289ca 2004-04-06 devnull if(c < 256 && isspace(c))
961 7cf289ca 2004-04-06 devnull goto valloop_done;
962 7cf289ca 2004-04-06 devnull }
963 7cf289ca 2004-04-06 devnull if(c == '&') {
964 7cf289ca 2004-04-06 devnull c = ampersand(ts);
965 7cf289ca 2004-04-06 devnull if(c == -1)
966 7cf289ca 2004-04-06 devnull goto eob_done;
967 7cf289ca 2004-04-06 devnull }
968 7cf289ca 2004-04-06 devnull buf[nv++] = c;
969 7cf289ca 2004-04-06 devnull if(nv == BIGBUFSIZE-1) {
970 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
971 7cf289ca 2004-04-06 devnull nv = 0;
972 7cf289ca 2004-04-06 devnull }
973 7cf289ca 2004-04-06 devnull c = getchar(ts);
974 7cf289ca 2004-04-06 devnull }
975 7cf289ca 2004-04-06 devnull valloop_done:
976 7cf289ca 2004-04-06 devnull if(afnd) {
977 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
978 7cf289ca 2004-04-06 devnull al = newattr(attid, val, al);
979 7cf289ca 2004-04-06 devnull }
980 7cf289ca 2004-04-06 devnull }
981 7cf289ca 2004-04-06 devnull
982 7cf289ca 2004-04-06 devnull attrloop_done:
983 7cf289ca 2004-04-06 devnull tok->attr = al;
984 7cf289ca 2004-04-06 devnull (*pai)++;
985 7cf289ca 2004-04-06 devnull return tok->tag;
986 7cf289ca 2004-04-06 devnull
987 7cf289ca 2004-04-06 devnull eob_done:
988 7cf289ca 2004-04-06 devnull if(warn)
989 7cf289ca 2004-04-06 devnull fprint(2, "warning: incomplete tag at end of page\n");
990 7cf289ca 2004-04-06 devnull backup(ts, nexti);
991 7cf289ca 2004-04-06 devnull tok->tag = Data;
992 7cf289ca 2004-04-06 devnull tok->text = _Strdup(L(Llt));
993 7cf289ca 2004-04-06 devnull return Data;
994 7cf289ca 2004-04-06 devnull }
995 7cf289ca 2004-04-06 devnull
996 7cf289ca 2004-04-06 devnull // We've just read a '<!' at position starti,
997 7cf289ca 2004-04-06 devnull // so this may be a comment or other ignored section, or it may
998 7cf289ca 2004-04-06 devnull // be just a literal string if there is no close before end of file
999 7cf289ca 2004-04-06 devnull // (other browsers do that).
1000 7cf289ca 2004-04-06 devnull // The accepted practice seems to be (note: contrary to SGML spec!):
1001 7cf289ca 2004-04-06 devnull // If see <!--, look for --> to close, or if none, > to close.
1002 7cf289ca 2004-04-06 devnull // If see <!(not --), look for > to close.
1003 7cf289ca 2004-04-06 devnull // If no close before end of file, leave original characters in as literal data.
1004 7cf289ca 2004-04-06 devnull //
1005 7cf289ca 2004-04-06 devnull // If we see ignorable stuff, return Comment.
1006 7cf289ca 2004-04-06 devnull // Else return nil (caller should back up and try again when more data arrives,
1007 7cf289ca 2004-04-06 devnull // unless at end of file, in which case caller should just make '<' a data token).
1008 7cf289ca 2004-04-06 devnull static int
1009 7cf289ca 2004-04-06 devnull comment(TokenSource* ts)
1010 7cf289ca 2004-04-06 devnull {
1011 7cf289ca 2004-04-06 devnull int nexti;
1012 7cf289ca 2004-04-06 devnull int havecomment;
1013 7cf289ca 2004-04-06 devnull int c;
1014 7cf289ca 2004-04-06 devnull
1015 7cf289ca 2004-04-06 devnull nexti = ts->i;
1016 7cf289ca 2004-04-06 devnull havecomment = 0;
1017 7cf289ca 2004-04-06 devnull c = getchar(ts);
1018 7cf289ca 2004-04-06 devnull if(c == '-') {
1019 7cf289ca 2004-04-06 devnull c = getchar(ts);
1020 7cf289ca 2004-04-06 devnull if(c == '-') {
1021 7cf289ca 2004-04-06 devnull if(findstr(ts, L(Larrow)))
1022 7cf289ca 2004-04-06 devnull havecomment = 1;
1023 7cf289ca 2004-04-06 devnull else
1024 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1025 7cf289ca 2004-04-06 devnull }
1026 7cf289ca 2004-04-06 devnull }
1027 7cf289ca 2004-04-06 devnull if(!havecomment) {
1028 7cf289ca 2004-04-06 devnull if(c == '>')
1029 7cf289ca 2004-04-06 devnull havecomment = 1;
1030 7cf289ca 2004-04-06 devnull else if(c >= 0) {
1031 7cf289ca 2004-04-06 devnull if(findstr(ts, L(Lgt)))
1032 7cf289ca 2004-04-06 devnull havecomment = 1;
1033 7cf289ca 2004-04-06 devnull }
1034 7cf289ca 2004-04-06 devnull }
1035 7cf289ca 2004-04-06 devnull if(havecomment)
1036 7cf289ca 2004-04-06 devnull return Comment;
1037 7cf289ca 2004-04-06 devnull return -1;
1038 7cf289ca 2004-04-06 devnull }
1039 7cf289ca 2004-04-06 devnull
1040 7cf289ca 2004-04-06 devnull // Look for string s in token source.
1041 7cf289ca 2004-04-06 devnull // If found, return 1, with buffer at next char after s,
1042 7cf289ca 2004-04-06 devnull // else return 0 (caller should back up).
1043 7cf289ca 2004-04-06 devnull static int
1044 7cf289ca 2004-04-06 devnull findstr(TokenSource* ts, Rune* s)
1045 7cf289ca 2004-04-06 devnull {
1046 7cf289ca 2004-04-06 devnull int c0;
1047 7cf289ca 2004-04-06 devnull int n;
1048 7cf289ca 2004-04-06 devnull int nexti;
1049 7cf289ca 2004-04-06 devnull int i;
1050 7cf289ca 2004-04-06 devnull int c;
1051 7cf289ca 2004-04-06 devnull
1052 7cf289ca 2004-04-06 devnull c0 = s[0];
1053 7cf289ca 2004-04-06 devnull n = runestrlen(s);
1054 7cf289ca 2004-04-06 devnull while(1) {
1055 7cf289ca 2004-04-06 devnull c = getchar(ts);
1056 7cf289ca 2004-04-06 devnull if(c < 0)
1057 7cf289ca 2004-04-06 devnull break;
1058 7cf289ca 2004-04-06 devnull if(c == c0) {
1059 7cf289ca 2004-04-06 devnull if(n == 1)
1060 7cf289ca 2004-04-06 devnull return 1;
1061 7cf289ca 2004-04-06 devnull nexti = ts->i;
1062 7cf289ca 2004-04-06 devnull for(i = 1; i < n; i++) {
1063 7cf289ca 2004-04-06 devnull c = getchar(ts);
1064 7cf289ca 2004-04-06 devnull if(c < 0)
1065 7cf289ca 2004-04-06 devnull goto mainloop_done;
1066 7cf289ca 2004-04-06 devnull if(c != s[i])
1067 7cf289ca 2004-04-06 devnull break;
1068 7cf289ca 2004-04-06 devnull }
1069 7cf289ca 2004-04-06 devnull if(i == n)
1070 7cf289ca 2004-04-06 devnull return 1;
1071 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1072 7cf289ca 2004-04-06 devnull }
1073 7cf289ca 2004-04-06 devnull }
1074 7cf289ca 2004-04-06 devnull mainloop_done:
1075 7cf289ca 2004-04-06 devnull return 0;
1076 7cf289ca 2004-04-06 devnull }
1077 7cf289ca 2004-04-06 devnull
1078 7cf289ca 2004-04-06 devnull // We've just read an '&'; look for an entity reference
1079 7cf289ca 2004-04-06 devnull // name, and if found, return translated char.
1080 7cf289ca 2004-04-06 devnull // if there is a complete entity name but it isn't known,
1081 7cf289ca 2004-04-06 devnull // try prefixes (gets around some buggy HTML out there),
1082 7cf289ca 2004-04-06 devnull // and if that fails, back up to just past the '&' and return '&'.
1083 7cf289ca 2004-04-06 devnull // If the entity can't be completed in the current buffer, back up
1084 7cf289ca 2004-04-06 devnull // to the '&' and return -1.
1085 7cf289ca 2004-04-06 devnull static int
1086 7cf289ca 2004-04-06 devnull ampersand(TokenSource* ts)
1087 7cf289ca 2004-04-06 devnull {
1088 7cf289ca 2004-04-06 devnull int savei;
1089 7cf289ca 2004-04-06 devnull int c;
1090 7cf289ca 2004-04-06 devnull int fnd;
1091 7cf289ca 2004-04-06 devnull int ans;
1092 7cf289ca 2004-04-06 devnull int v;
1093 7cf289ca 2004-04-06 devnull int i;
1094 7cf289ca 2004-04-06 devnull int k;
1095 7cf289ca 2004-04-06 devnull Rune buf[SMALLBUFSIZE];
1096 7cf289ca 2004-04-06 devnull
1097 7cf289ca 2004-04-06 devnull savei = ts->i;
1098 7cf289ca 2004-04-06 devnull c = getchar(ts);
1099 7cf289ca 2004-04-06 devnull fnd = 0;
1100 7cf289ca 2004-04-06 devnull ans = -1;
1101 7cf289ca 2004-04-06 devnull if(c == '#') {
1102 7cf289ca 2004-04-06 devnull c = getchar(ts);
1103 7cf289ca 2004-04-06 devnull v = 0;
1104 7cf289ca 2004-04-06 devnull while(c >= 0) {
1105 7cf289ca 2004-04-06 devnull if(!(c < 256 && isdigit(c)))
1106 7cf289ca 2004-04-06 devnull break;
1107 7cf289ca 2004-04-06 devnull v = v*10 + c - 48;
1108 7cf289ca 2004-04-06 devnull c = getchar(ts);
1109 7cf289ca 2004-04-06 devnull }
1110 7cf289ca 2004-04-06 devnull if(c >= 0) {
1111 7cf289ca 2004-04-06 devnull if(!(c == ';' || c == '\n' || c == '\r'))
1112 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1113 7cf289ca 2004-04-06 devnull c = v;
1114 7cf289ca 2004-04-06 devnull if(c == 160)
1115 7cf289ca 2004-04-06 devnull c = 160;
1116 7cf289ca 2004-04-06 devnull if(c >= Winstart && c <= Winend) {
1117 7cf289ca 2004-04-06 devnull c = winchars[c - Winstart];
1118 7cf289ca 2004-04-06 devnull }
1119 7cf289ca 2004-04-06 devnull ans = c;
1120 7cf289ca 2004-04-06 devnull fnd = 1;
1121 7cf289ca 2004-04-06 devnull }
1122 7cf289ca 2004-04-06 devnull }
1123 7cf289ca 2004-04-06 devnull else if(c < 256 && isalpha(c)) {
1124 7cf289ca 2004-04-06 devnull buf[0] = c;
1125 7cf289ca 2004-04-06 devnull k = 1;
1126 7cf289ca 2004-04-06 devnull while(1) {
1127 7cf289ca 2004-04-06 devnull c = getchar(ts);
1128 7cf289ca 2004-04-06 devnull if(c < 0)
1129 7cf289ca 2004-04-06 devnull break;
1130 7cf289ca 2004-04-06 devnull if(ISNAMCHAR(c)) {
1131 7cf289ca 2004-04-06 devnull if(k < SMALLBUFSIZE-1)
1132 7cf289ca 2004-04-06 devnull buf[k++] = c;
1133 7cf289ca 2004-04-06 devnull }
1134 7cf289ca 2004-04-06 devnull else {
1135 7cf289ca 2004-04-06 devnull if(!(c == ';' || c == '\n' || c == '\r'))
1136 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1137 7cf289ca 2004-04-06 devnull break;
1138 7cf289ca 2004-04-06 devnull }
1139 7cf289ca 2004-04-06 devnull }
1140 7cf289ca 2004-04-06 devnull if(c >= 0) {
1141 7cf289ca 2004-04-06 devnull fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1142 7cf289ca 2004-04-06 devnull if(!fnd) {
1143 7cf289ca 2004-04-06 devnull // Try prefixes of s
1144 7cf289ca 2004-04-06 devnull if(c == ';' || c == '\n' || c == '\r')
1145 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1146 7cf289ca 2004-04-06 devnull i = k;
1147 7cf289ca 2004-04-06 devnull while(--k > 0) {
1148 7cf289ca 2004-04-06 devnull fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1149 7cf289ca 2004-04-06 devnull if(fnd) {
1150 7cf289ca 2004-04-06 devnull while(i > k) {
1151 7cf289ca 2004-04-06 devnull i--;
1152 7cf289ca 2004-04-06 devnull ungetchar(ts, buf[i]);
1153 7cf289ca 2004-04-06 devnull }
1154 7cf289ca 2004-04-06 devnull break;
1155 7cf289ca 2004-04-06 devnull }
1156 7cf289ca 2004-04-06 devnull }
1157 7cf289ca 2004-04-06 devnull }
1158 7cf289ca 2004-04-06 devnull }
1159 7cf289ca 2004-04-06 devnull }
1160 7cf289ca 2004-04-06 devnull if(!fnd) {
1161 7cf289ca 2004-04-06 devnull backup(ts, savei);
1162 7cf289ca 2004-04-06 devnull ans = '&';
1163 7cf289ca 2004-04-06 devnull }
1164 7cf289ca 2004-04-06 devnull return ans;
1165 7cf289ca 2004-04-06 devnull }
1166 7cf289ca 2004-04-06 devnull
1167 7cf289ca 2004-04-06 devnull // Get next char, obeying ts.chset.
1168 7cf289ca 2004-04-06 devnull // Returns -1 if no complete character left before current end of data.
1169 7cf289ca 2004-04-06 devnull static int
1170 7cf289ca 2004-04-06 devnull getchar(TokenSource* ts)
1171 7cf289ca 2004-04-06 devnull {
1172 7cf289ca 2004-04-06 devnull uchar* buf;
1173 7cf289ca 2004-04-06 devnull int c;
1174 7cf289ca 2004-04-06 devnull int n;
1175 7cf289ca 2004-04-06 devnull int ok;
1176 7cf289ca 2004-04-06 devnull Rune r;
1177 7cf289ca 2004-04-06 devnull
1178 7cf289ca 2004-04-06 devnull if(ts->i >= ts->edata)
1179 7cf289ca 2004-04-06 devnull return -1;
1180 7cf289ca 2004-04-06 devnull buf = ts->data;
1181 7cf289ca 2004-04-06 devnull c = buf[ts->i];
1182 7cf289ca 2004-04-06 devnull switch(ts->chset) {
1183 7cf289ca 2004-04-06 devnull case ISO_8859_1:
1184 7cf289ca 2004-04-06 devnull if(c >= Winstart && c <= Winend)
1185 7cf289ca 2004-04-06 devnull c = winchars[c - Winstart];
1186 7cf289ca 2004-04-06 devnull ts->i++;
1187 7cf289ca 2004-04-06 devnull break;
1188 7cf289ca 2004-04-06 devnull case US_Ascii:
1189 7cf289ca 2004-04-06 devnull if(c > 127) {
1190 7cf289ca 2004-04-06 devnull if(warn)
1191 7cf289ca 2004-04-06 devnull fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1192 7cf289ca 2004-04-06 devnull }
1193 7cf289ca 2004-04-06 devnull ts->i++;
1194 7cf289ca 2004-04-06 devnull break;
1195 7cf289ca 2004-04-06 devnull case UTF_8:
1196 7cf289ca 2004-04-06 devnull ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1197 7cf289ca 2004-04-06 devnull n = chartorune(&r, (char*)(buf+ts->i));
1198 7cf289ca 2004-04-06 devnull if(ok) {
1199 7cf289ca 2004-04-06 devnull if(warn && c == 0x80)
1200 7cf289ca 2004-04-06 devnull fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1201 7cf289ca 2004-04-06 devnull ts->i += n;
1202 7cf289ca 2004-04-06 devnull c = r;
1203 7cf289ca 2004-04-06 devnull }
1204 7cf289ca 2004-04-06 devnull else {
1205 7cf289ca 2004-04-06 devnull // not enough bytes in buf to complete utf-8 char
1206 7cf289ca 2004-04-06 devnull ts->i = ts->edata; // mark "all used"
1207 7cf289ca 2004-04-06 devnull c = -1;
1208 7cf289ca 2004-04-06 devnull }
1209 7cf289ca 2004-04-06 devnull break;
1210 7cf289ca 2004-04-06 devnull case Unicode:
1211 7cf289ca 2004-04-06 devnull if(ts->i < ts->edata - 1) {
1212 7cf289ca 2004-04-06 devnull //standards say most-significant byte first
1213 7cf289ca 2004-04-06 devnull c = (c << 8)|(buf[ts->i + 1]);
1214 7cf289ca 2004-04-06 devnull ts->i += 2;
1215 7cf289ca 2004-04-06 devnull }
1216 7cf289ca 2004-04-06 devnull else {
1217 7cf289ca 2004-04-06 devnull ts->i = ts->edata; // mark "all used"
1218 7cf289ca 2004-04-06 devnull c = -1;
1219 7cf289ca 2004-04-06 devnull }
1220 7cf289ca 2004-04-06 devnull break;
1221 7cf289ca 2004-04-06 devnull }
1222 7cf289ca 2004-04-06 devnull return c;
1223 7cf289ca 2004-04-06 devnull }
1224 7cf289ca 2004-04-06 devnull
1225 7cf289ca 2004-04-06 devnull // Assuming c was the last character returned by getchar, set
1226 7cf289ca 2004-04-06 devnull // things up so that next getchar will get that same character
1227 7cf289ca 2004-04-06 devnull // followed by the current 'next character', etc.
1228 7cf289ca 2004-04-06 devnull static void
1229 7cf289ca 2004-04-06 devnull ungetchar(TokenSource* ts, int c)
1230 7cf289ca 2004-04-06 devnull {
1231 7cf289ca 2004-04-06 devnull int n;
1232 7cf289ca 2004-04-06 devnull Rune r;
1233 7cf289ca 2004-04-06 devnull char a[UTFmax];
1234 7cf289ca 2004-04-06 devnull
1235 7cf289ca 2004-04-06 devnull n = 1;
1236 7cf289ca 2004-04-06 devnull switch(ts->chset) {
1237 7cf289ca 2004-04-06 devnull case UTF_8:
1238 7cf289ca 2004-04-06 devnull if(c >= 128) {
1239 7cf289ca 2004-04-06 devnull r = c;
1240 7cf289ca 2004-04-06 devnull n = runetochar(a, &r);
1241 7cf289ca 2004-04-06 devnull }
1242 7cf289ca 2004-04-06 devnull break;
1243 7cf289ca 2004-04-06 devnull case Unicode:
1244 7cf289ca 2004-04-06 devnull n = 2;
1245 7cf289ca 2004-04-06 devnull break;
1246 7cf289ca 2004-04-06 devnull }
1247 7cf289ca 2004-04-06 devnull ts->i -= n;
1248 7cf289ca 2004-04-06 devnull }
1249 7cf289ca 2004-04-06 devnull
1250 7cf289ca 2004-04-06 devnull // Restore ts so that it is at the state where the index was savei.
1251 7cf289ca 2004-04-06 devnull static void
1252 7cf289ca 2004-04-06 devnull backup(TokenSource* ts, int savei)
1253 7cf289ca 2004-04-06 devnull {
1254 7cf289ca 2004-04-06 devnull if(dbglex)
1255 7cf289ca 2004-04-06 devnull fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1256 7cf289ca 2004-04-06 devnull ts->i = savei;
1257 7cf289ca 2004-04-06 devnull }
1258 7cf289ca 2004-04-06 devnull
1259 7cf289ca 2004-04-06 devnull
1260 7cf289ca 2004-04-06 devnull // Look for value associated with attribute attid in token t.
1261 7cf289ca 2004-04-06 devnull // If there is one, return 1 and put the value in *pans,
1262 7cf289ca 2004-04-06 devnull // else return 0.
1263 7cf289ca 2004-04-06 devnull // If xfer is true, transfer ownership of the string to the caller
1264 7cf289ca 2004-04-06 devnull // (nil it out here); otherwise, caller must duplicate the answer
1265 7cf289ca 2004-04-06 devnull // if it needs to save it.
1266 7cf289ca 2004-04-06 devnull // OK to have pans==0, in which case this is just looking
1267 7cf289ca 2004-04-06 devnull // to see if token is present.
1268 7cf289ca 2004-04-06 devnull int
1269 7cf289ca 2004-04-06 devnull _tokaval(Token* t, int attid, Rune** pans, int xfer)
1270 7cf289ca 2004-04-06 devnull {
1271 7cf289ca 2004-04-06 devnull Attr* attr;
1272 7cf289ca 2004-04-06 devnull
1273 7cf289ca 2004-04-06 devnull attr = t->attr;
1274 7cf289ca 2004-04-06 devnull while(attr != nil) {
1275 7cf289ca 2004-04-06 devnull if(attr->attid == attid) {
1276 7cf289ca 2004-04-06 devnull if(pans != nil)
1277 7cf289ca 2004-04-06 devnull *pans = attr->value;
1278 7cf289ca 2004-04-06 devnull if(xfer)
1279 7cf289ca 2004-04-06 devnull attr->value = nil;
1280 7cf289ca 2004-04-06 devnull return 1;
1281 7cf289ca 2004-04-06 devnull }
1282 7cf289ca 2004-04-06 devnull attr = attr->next;
1283 7cf289ca 2004-04-06 devnull }
1284 7cf289ca 2004-04-06 devnull if(pans != nil)
1285 7cf289ca 2004-04-06 devnull *pans = nil;
1286 7cf289ca 2004-04-06 devnull return 0;
1287 7cf289ca 2004-04-06 devnull }
1288 7cf289ca 2004-04-06 devnull
1289 7cf289ca 2004-04-06 devnull static int
1290 7cf289ca 2004-04-06 devnull Tconv(Fmt *f)
1291 7cf289ca 2004-04-06 devnull {
1292 7cf289ca 2004-04-06 devnull Token* t;
1293 7cf289ca 2004-04-06 devnull int i;
1294 7cf289ca 2004-04-06 devnull int tag;
1295 7cf289ca 2004-04-06 devnull char* srbra;
1296 7cf289ca 2004-04-06 devnull Rune* aname;
1297 7cf289ca 2004-04-06 devnull Rune* tname;
1298 7cf289ca 2004-04-06 devnull Attr* a;
1299 7cf289ca 2004-04-06 devnull char buf[BIGBUFSIZE];
1300 7cf289ca 2004-04-06 devnull
1301 7cf289ca 2004-04-06 devnull t = va_arg(f->args, Token*);
1302 7cf289ca 2004-04-06 devnull if(t == nil)
1303 7cf289ca 2004-04-06 devnull sprint(buf, "<null>");
1304 7cf289ca 2004-04-06 devnull else {
1305 7cf289ca 2004-04-06 devnull i = 0;
1306 7cf289ca 2004-04-06 devnull if(dbglex > 1)
1307 7cf289ca 2004-04-06 devnull i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1308 7cf289ca 2004-04-06 devnull tag = t->tag;
1309 7cf289ca 2004-04-06 devnull if(tag == Data) {
1310 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1311 7cf289ca 2004-04-06 devnull }
1312 7cf289ca 2004-04-06 devnull else {
1313 7cf289ca 2004-04-06 devnull srbra = "";
1314 7cf289ca 2004-04-06 devnull if(tag >= RBRA) {
1315 7cf289ca 2004-04-06 devnull tag -= RBRA;
1316 7cf289ca 2004-04-06 devnull srbra = "/";
1317 7cf289ca 2004-04-06 devnull }
1318 7cf289ca 2004-04-06 devnull tname = tagnames[tag];
1319 7cf289ca 2004-04-06 devnull if(tag == Notfound)
1320 7cf289ca 2004-04-06 devnull tname = L(Lquestion);
1321 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1322 7cf289ca 2004-04-06 devnull for(a = t->attr; a != nil; a = a->next) {
1323 7cf289ca 2004-04-06 devnull aname = attrnames[a->attid];
1324 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1325 7cf289ca 2004-04-06 devnull if(a->value != nil)
1326 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1327 7cf289ca 2004-04-06 devnull }
1328 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, ">");
1329 7cf289ca 2004-04-06 devnull }
1330 7cf289ca 2004-04-06 devnull buf[i] = 0;
1331 7cf289ca 2004-04-06 devnull }
1332 7cf289ca 2004-04-06 devnull return fmtstrcpy(f, buf);
1333 7cf289ca 2004-04-06 devnull }
1334 7cf289ca 2004-04-06 devnull
1335 7cf289ca 2004-04-06 devnull // Attrs own their constituent strings, but build may eventually
1336 7cf289ca 2004-04-06 devnull // transfer some values to its items and nil them out in the Attr.
1337 7cf289ca 2004-04-06 devnull static Attr*
1338 7cf289ca 2004-04-06 devnull newattr(int attid, Rune* value, Attr* link)
1339 7cf289ca 2004-04-06 devnull {
1340 7cf289ca 2004-04-06 devnull Attr* ans;
1341 7cf289ca 2004-04-06 devnull
1342 7cf289ca 2004-04-06 devnull ans = (Attr*)emalloc(sizeof(Attr));
1343 7cf289ca 2004-04-06 devnull ans->attid = attid;
1344 7cf289ca 2004-04-06 devnull ans->value = value;
1345 7cf289ca 2004-04-06 devnull ans->next = link;
1346 7cf289ca 2004-04-06 devnull return ans;
1347 7cf289ca 2004-04-06 devnull }
1348 7cf289ca 2004-04-06 devnull
1349 7cf289ca 2004-04-06 devnull // Free list of Attrs linked through next field
1350 7cf289ca 2004-04-06 devnull static void
1351 7cf289ca 2004-04-06 devnull freeattrs(Attr* ahead)
1352 7cf289ca 2004-04-06 devnull {
1353 7cf289ca 2004-04-06 devnull Attr* a;
1354 7cf289ca 2004-04-06 devnull Attr* nexta;
1355 7cf289ca 2004-04-06 devnull
1356 7cf289ca 2004-04-06 devnull a = ahead;
1357 7cf289ca 2004-04-06 devnull while(a != nil) {
1358 7cf289ca 2004-04-06 devnull nexta = a->next;
1359 7cf289ca 2004-04-06 devnull free(a->value);
1360 7cf289ca 2004-04-06 devnull free(a);
1361 7cf289ca 2004-04-06 devnull a = nexta;
1362 7cf289ca 2004-04-06 devnull }
1363 7cf289ca 2004-04-06 devnull }
1364 7cf289ca 2004-04-06 devnull
1365 7cf289ca 2004-04-06 devnull // Free array of Tokens.
1366 7cf289ca 2004-04-06 devnull // Allocated space might have room for more than n tokens,
1367 7cf289ca 2004-04-06 devnull // but only n of them are initialized.
1368 7cf289ca 2004-04-06 devnull // If caller has transferred ownership of constitutent strings
1369 7cf289ca 2004-04-06 devnull // or attributes, it must have nil'd out the pointers in the Tokens.
1370 7cf289ca 2004-04-06 devnull void
1371 7cf289ca 2004-04-06 devnull _freetokens(Token* tarray, int n)
1372 7cf289ca 2004-04-06 devnull {
1373 7cf289ca 2004-04-06 devnull int i;
1374 7cf289ca 2004-04-06 devnull Token* t;
1375 7cf289ca 2004-04-06 devnull
1376 7cf289ca 2004-04-06 devnull if(tarray == nil)
1377 7cf289ca 2004-04-06 devnull return;
1378 7cf289ca 2004-04-06 devnull for(i = 0; i < n; i++) {
1379 7cf289ca 2004-04-06 devnull t = &tarray[i];
1380 7cf289ca 2004-04-06 devnull free(t->text);
1381 7cf289ca 2004-04-06 devnull freeattrs(t->attr);
1382 7cf289ca 2004-04-06 devnull }
1383 7cf289ca 2004-04-06 devnull free(tarray);
1384 7cf289ca 2004-04-06 devnull }