Blame


1 7cf289ca 2004-04-06 devnull #include <u.h>
2 7cf289ca 2004-04-06 devnull #include <libc.h>
3 7cf289ca 2004-04-06 devnull #include <draw.h>
4 7cf289ca 2004-04-06 devnull #include <ctype.h>
5 7cf289ca 2004-04-06 devnull #include <html.h>
6 7cf289ca 2004-04-06 devnull #include "impl.h"
7 7cf289ca 2004-04-06 devnull
8 7cf289ca 2004-04-06 devnull typedef struct TokenSource TokenSource;
9 7cf289ca 2004-04-06 devnull struct TokenSource
10 7cf289ca 2004-04-06 devnull {
11 cbeb0b26 2006-04-01 devnull int i; /* index of next byte to use */
12 cbeb0b26 2006-04-01 devnull uchar* data; /* all the data */
13 cbeb0b26 2006-04-01 devnull int edata; /* data[0:edata] is valid */
14 cbeb0b26 2006-04-01 devnull int chset; /* one of US_Ascii, etc. */
15 cbeb0b26 2006-04-01 devnull int mtype; /* TextHtml or TextPlain */
16 7cf289ca 2004-04-06 devnull };
17 7cf289ca 2004-04-06 devnull
/*
 * Negative sentinel values; callers in this file treat any
 * negative character as "no more input".
 * NOTE(review): presumably end-of-file and end-of-buffer -- confirm.
 */
enum
{
	EOF = -2,
	EOB = -1
};
22 7cf289ca 2004-04-06 devnull
/*
 * True when c is a name character: a letter, a digit, '-' or '.'.
 * Only values 0..255 are handed to the <ctype.h> classifiers; negative
 * values (such as the EOF/EOB sentinels) and wide runes are rejected
 * up front.  Note that c is evaluated more than once.
 */
#define ISNAMCHAR(c) ((c) >= 0 && (c) < 256 && (isalpha(c) || isdigit(c) || (c) == '-' || (c) == '.'))
24 7cf289ca 2004-04-06 devnull
/* Sizes, counted in Runes, of the on-stack scratch buffers used while lexing. */
#define SMALLBUFSIZE	240
#define BIGBUFSIZE	2000
27 7cf289ca 2004-04-06 devnull
28 cbeb0b26 2006-04-01 devnull /* HTML 4.0 tag names. */
29 cbeb0b26 2006-04-01 devnull /* Keep sorted, and in correspondence with enum in iparse.h. */
30 7cf289ca 2004-04-06 devnull Rune **tagnames;
31 7cf289ca 2004-04-06 devnull char *_tagnames[] = {
32 7cf289ca 2004-04-06 devnull " ",
33 7cf289ca 2004-04-06 devnull "!",
34 7cf289ca 2004-04-06 devnull "a",
35 7cf289ca 2004-04-06 devnull "abbr",
36 7cf289ca 2004-04-06 devnull "acronym",
37 7cf289ca 2004-04-06 devnull "address",
38 7cf289ca 2004-04-06 devnull "applet",
39 7cf289ca 2004-04-06 devnull "area",
40 7cf289ca 2004-04-06 devnull "b",
41 7cf289ca 2004-04-06 devnull "base",
42 7cf289ca 2004-04-06 devnull "basefont",
43 7cf289ca 2004-04-06 devnull "bdo",
44 7cf289ca 2004-04-06 devnull "big",
45 7cf289ca 2004-04-06 devnull "blink",
46 7cf289ca 2004-04-06 devnull "blockquote",
47 7cf289ca 2004-04-06 devnull "body",
48 7cf289ca 2004-04-06 devnull "bq",
49 7cf289ca 2004-04-06 devnull "br",
50 7cf289ca 2004-04-06 devnull "button",
51 7cf289ca 2004-04-06 devnull "caption",
52 7cf289ca 2004-04-06 devnull "center",
53 7cf289ca 2004-04-06 devnull "cite",
54 7cf289ca 2004-04-06 devnull "code",
55 7cf289ca 2004-04-06 devnull "col",
56 7cf289ca 2004-04-06 devnull "colgroup",
57 7cf289ca 2004-04-06 devnull "dd",
58 7cf289ca 2004-04-06 devnull "del",
59 7cf289ca 2004-04-06 devnull "dfn",
60 7cf289ca 2004-04-06 devnull "dir",
61 7cf289ca 2004-04-06 devnull "div",
62 7cf289ca 2004-04-06 devnull "dl",
63 7cf289ca 2004-04-06 devnull "dt",
64 7cf289ca 2004-04-06 devnull "em",
65 7cf289ca 2004-04-06 devnull "fieldset",
66 7cf289ca 2004-04-06 devnull "font",
67 7cf289ca 2004-04-06 devnull "form",
68 7cf289ca 2004-04-06 devnull "frame",
69 7cf289ca 2004-04-06 devnull "frameset",
70 7cf289ca 2004-04-06 devnull "h1",
71 7cf289ca 2004-04-06 devnull "h2",
72 7cf289ca 2004-04-06 devnull "h3",
73 7cf289ca 2004-04-06 devnull "h4",
74 7cf289ca 2004-04-06 devnull "h5",
75 7cf289ca 2004-04-06 devnull "h6",
76 7cf289ca 2004-04-06 devnull "head",
77 7cf289ca 2004-04-06 devnull "hr",
78 7cf289ca 2004-04-06 devnull "html",
79 7cf289ca 2004-04-06 devnull "i",
80 7cf289ca 2004-04-06 devnull "iframe",
81 7cf289ca 2004-04-06 devnull "img",
82 7cf289ca 2004-04-06 devnull "input",
83 7cf289ca 2004-04-06 devnull "ins",
84 7cf289ca 2004-04-06 devnull "isindex",
85 7cf289ca 2004-04-06 devnull "kbd",
86 7cf289ca 2004-04-06 devnull "label",
87 7cf289ca 2004-04-06 devnull "legend",
88 7cf289ca 2004-04-06 devnull "li",
89 7cf289ca 2004-04-06 devnull "link",
90 7cf289ca 2004-04-06 devnull "map",
91 7cf289ca 2004-04-06 devnull "menu",
92 7cf289ca 2004-04-06 devnull "meta",
93 7cf289ca 2004-04-06 devnull "nobr",
94 7cf289ca 2004-04-06 devnull "noframes",
95 7cf289ca 2004-04-06 devnull "noscript",
96 7cf289ca 2004-04-06 devnull "object",
97 7cf289ca 2004-04-06 devnull "ol",
98 7cf289ca 2004-04-06 devnull "optgroup",
99 7cf289ca 2004-04-06 devnull "option",
100 7cf289ca 2004-04-06 devnull "p",
101 7cf289ca 2004-04-06 devnull "param",
102 7cf289ca 2004-04-06 devnull "pre",
103 7cf289ca 2004-04-06 devnull "q",
104 7cf289ca 2004-04-06 devnull "s",
105 7cf289ca 2004-04-06 devnull "samp",
106 7cf289ca 2004-04-06 devnull "script",
107 7cf289ca 2004-04-06 devnull "select",
108 7cf289ca 2004-04-06 devnull "small",
109 7cf289ca 2004-04-06 devnull "span",
110 7cf289ca 2004-04-06 devnull "strike",
111 7cf289ca 2004-04-06 devnull "strong",
112 7cf289ca 2004-04-06 devnull "style",
113 7cf289ca 2004-04-06 devnull "sub",
114 7cf289ca 2004-04-06 devnull "sup",
115 7cf289ca 2004-04-06 devnull "table",
116 7cf289ca 2004-04-06 devnull "tbody",
117 7cf289ca 2004-04-06 devnull "td",
118 7cf289ca 2004-04-06 devnull "textarea",
119 7cf289ca 2004-04-06 devnull "tfoot",
120 7cf289ca 2004-04-06 devnull "th",
121 7cf289ca 2004-04-06 devnull "thead",
122 7cf289ca 2004-04-06 devnull "title",
123 7cf289ca 2004-04-06 devnull "tr",
124 7cf289ca 2004-04-06 devnull "tt",
125 7cf289ca 2004-04-06 devnull "u",
126 7cf289ca 2004-04-06 devnull "ul",
127 7cf289ca 2004-04-06 devnull "var"
128 7cf289ca 2004-04-06 devnull };
129 7cf289ca 2004-04-06 devnull
130 cbeb0b26 2006-04-01 devnull /* HTML 4.0 attribute names. */
131 cbeb0b26 2006-04-01 devnull /* Keep sorted, and in correspondence with enum in i.h. */
132 7cf289ca 2004-04-06 devnull Rune **attrnames;
133 7cf289ca 2004-04-06 devnull char* _attrnames[] = {
134 7cf289ca 2004-04-06 devnull "abbr",
135 7cf289ca 2004-04-06 devnull "accept-charset",
136 7cf289ca 2004-04-06 devnull "access-key",
137 7cf289ca 2004-04-06 devnull "action",
138 7cf289ca 2004-04-06 devnull "align",
139 7cf289ca 2004-04-06 devnull "alink",
140 7cf289ca 2004-04-06 devnull "alt",
141 7cf289ca 2004-04-06 devnull "archive",
142 7cf289ca 2004-04-06 devnull "axis",
143 7cf289ca 2004-04-06 devnull "background",
144 7cf289ca 2004-04-06 devnull "bgcolor",
145 7cf289ca 2004-04-06 devnull "border",
146 7cf289ca 2004-04-06 devnull "cellpadding",
147 7cf289ca 2004-04-06 devnull "cellspacing",
148 7cf289ca 2004-04-06 devnull "char",
149 7cf289ca 2004-04-06 devnull "charoff",
150 7cf289ca 2004-04-06 devnull "charset",
151 7cf289ca 2004-04-06 devnull "checked",
152 7cf289ca 2004-04-06 devnull "cite",
153 7cf289ca 2004-04-06 devnull "class",
154 7cf289ca 2004-04-06 devnull "classid",
155 7cf289ca 2004-04-06 devnull "clear",
156 7cf289ca 2004-04-06 devnull "code",
157 7cf289ca 2004-04-06 devnull "codebase",
158 7cf289ca 2004-04-06 devnull "codetype",
159 7cf289ca 2004-04-06 devnull "color",
160 7cf289ca 2004-04-06 devnull "cols",
161 7cf289ca 2004-04-06 devnull "colspan",
162 7cf289ca 2004-04-06 devnull "compact",
163 7cf289ca 2004-04-06 devnull "content",
164 7cf289ca 2004-04-06 devnull "coords",
165 7cf289ca 2004-04-06 devnull "data",
166 7cf289ca 2004-04-06 devnull "datetime",
167 7cf289ca 2004-04-06 devnull "declare",
168 7cf289ca 2004-04-06 devnull "defer",
169 7cf289ca 2004-04-06 devnull "dir",
170 7cf289ca 2004-04-06 devnull "disabled",
171 7cf289ca 2004-04-06 devnull "enctype",
172 7cf289ca 2004-04-06 devnull "face",
173 7cf289ca 2004-04-06 devnull "for",
174 7cf289ca 2004-04-06 devnull "frame",
175 7cf289ca 2004-04-06 devnull "frameborder",
176 7cf289ca 2004-04-06 devnull "headers",
177 7cf289ca 2004-04-06 devnull "height",
178 7cf289ca 2004-04-06 devnull "href",
179 7cf289ca 2004-04-06 devnull "hreflang",
180 7cf289ca 2004-04-06 devnull "hspace",
181 7cf289ca 2004-04-06 devnull "http-equiv",
182 7cf289ca 2004-04-06 devnull "id",
183 7cf289ca 2004-04-06 devnull "ismap",
184 7cf289ca 2004-04-06 devnull "label",
185 7cf289ca 2004-04-06 devnull "lang",
186 7cf289ca 2004-04-06 devnull "link",
187 7cf289ca 2004-04-06 devnull "longdesc",
188 7cf289ca 2004-04-06 devnull "marginheight",
189 7cf289ca 2004-04-06 devnull "marginwidth",
190 7cf289ca 2004-04-06 devnull "maxlength",
191 7cf289ca 2004-04-06 devnull "media",
192 7cf289ca 2004-04-06 devnull "method",
193 7cf289ca 2004-04-06 devnull "multiple",
194 7cf289ca 2004-04-06 devnull "name",
195 7cf289ca 2004-04-06 devnull "nohref",
196 7cf289ca 2004-04-06 devnull "noresize",
197 7cf289ca 2004-04-06 devnull "noshade",
198 7cf289ca 2004-04-06 devnull "nowrap",
199 7cf289ca 2004-04-06 devnull "object",
200 7cf289ca 2004-04-06 devnull "onblur",
201 7cf289ca 2004-04-06 devnull "onchange",
202 7cf289ca 2004-04-06 devnull "onclick",
203 7cf289ca 2004-04-06 devnull "ondblclick",
204 7cf289ca 2004-04-06 devnull "onfocus",
205 7cf289ca 2004-04-06 devnull "onkeypress",
206 7cf289ca 2004-04-06 devnull "onkeyup",
207 7cf289ca 2004-04-06 devnull "onload",
208 7cf289ca 2004-04-06 devnull "onmousedown",
209 7cf289ca 2004-04-06 devnull "onmousemove",
210 7cf289ca 2004-04-06 devnull "onmouseout",
211 7cf289ca 2004-04-06 devnull "onmouseover",
212 7cf289ca 2004-04-06 devnull "onmouseup",
213 7cf289ca 2004-04-06 devnull "onreset",
214 7cf289ca 2004-04-06 devnull "onselect",
215 7cf289ca 2004-04-06 devnull "onsubmit",
216 7cf289ca 2004-04-06 devnull "onunload",
217 7cf289ca 2004-04-06 devnull "profile",
218 7cf289ca 2004-04-06 devnull "prompt",
219 7cf289ca 2004-04-06 devnull "readonly",
220 7cf289ca 2004-04-06 devnull "rel",
221 7cf289ca 2004-04-06 devnull "rev",
222 7cf289ca 2004-04-06 devnull "rows",
223 7cf289ca 2004-04-06 devnull "rowspan",
224 7cf289ca 2004-04-06 devnull "rules",
225 7cf289ca 2004-04-06 devnull "scheme",
226 7cf289ca 2004-04-06 devnull "scope",
227 7cf289ca 2004-04-06 devnull "scrolling",
228 7cf289ca 2004-04-06 devnull "selected",
229 7cf289ca 2004-04-06 devnull "shape",
230 7cf289ca 2004-04-06 devnull "size",
231 7cf289ca 2004-04-06 devnull "span",
232 7cf289ca 2004-04-06 devnull "src",
233 7cf289ca 2004-04-06 devnull "standby",
234 7cf289ca 2004-04-06 devnull "start",
235 7cf289ca 2004-04-06 devnull "style",
236 7cf289ca 2004-04-06 devnull "summary",
237 7cf289ca 2004-04-06 devnull "tabindex",
238 7cf289ca 2004-04-06 devnull "target",
239 7cf289ca 2004-04-06 devnull "text",
240 7cf289ca 2004-04-06 devnull "title",
241 7cf289ca 2004-04-06 devnull "type",
242 7cf289ca 2004-04-06 devnull "usemap",
243 7cf289ca 2004-04-06 devnull "valign",
244 7cf289ca 2004-04-06 devnull "value",
245 7cf289ca 2004-04-06 devnull "valuetype",
246 7cf289ca 2004-04-06 devnull "version",
247 7cf289ca 2004-04-06 devnull "vlink",
248 7cf289ca 2004-04-06 devnull "vspace",
249 7cf289ca 2004-04-06 devnull "width"
250 7cf289ca 2004-04-06 devnull };
251 7cf289ca 2004-04-06 devnull
252 7cf289ca 2004-04-06 devnull
253 cbeb0b26 2006-04-01 devnull /* Character entity to unicode character number map. */
254 cbeb0b26 2006-04-01 devnull /* Keep sorted by name. */
255 7cf289ca 2004-04-06 devnull StringInt *chartab;
256 5b76ae26 2005-09-19 devnull AsciiInt _chartab[] = {
257 7cf289ca 2004-04-06 devnull {"AElig", 198},
258 7cf289ca 2004-04-06 devnull {"Aacute", 193},
259 7cf289ca 2004-04-06 devnull {"Acirc", 194},
260 7cf289ca 2004-04-06 devnull {"Agrave", 192},
261 7cf289ca 2004-04-06 devnull {"Aring", 197},
262 7cf289ca 2004-04-06 devnull {"Atilde", 195},
263 7cf289ca 2004-04-06 devnull {"Auml", 196},
264 7cf289ca 2004-04-06 devnull {"Ccedil", 199},
265 7cf289ca 2004-04-06 devnull {"ETH", 208},
266 7cf289ca 2004-04-06 devnull {"Eacute", 201},
267 7cf289ca 2004-04-06 devnull {"Ecirc", 202},
268 7cf289ca 2004-04-06 devnull {"Egrave", 200},
269 7cf289ca 2004-04-06 devnull {"Euml", 203},
270 7cf289ca 2004-04-06 devnull {"Iacute", 205},
271 7cf289ca 2004-04-06 devnull {"Icirc", 206},
272 7cf289ca 2004-04-06 devnull {"Igrave", 204},
273 7cf289ca 2004-04-06 devnull {"Iuml", 207},
274 7cf289ca 2004-04-06 devnull {"Ntilde", 209},
275 7cf289ca 2004-04-06 devnull {"Oacute", 211},
276 7cf289ca 2004-04-06 devnull {"Ocirc", 212},
277 7cf289ca 2004-04-06 devnull {"Ograve", 210},
278 7cf289ca 2004-04-06 devnull {"Oslash", 216},
279 7cf289ca 2004-04-06 devnull {"Otilde", 213},
280 7cf289ca 2004-04-06 devnull {"Ouml", 214},
281 7cf289ca 2004-04-06 devnull {"THORN", 222},
282 7cf289ca 2004-04-06 devnull {"Uacute", 218},
283 7cf289ca 2004-04-06 devnull {"Ucirc", 219},
284 7cf289ca 2004-04-06 devnull {"Ugrave", 217},
285 7cf289ca 2004-04-06 devnull {"Uuml", 220},
286 7cf289ca 2004-04-06 devnull {"Yacute", 221},
287 7cf289ca 2004-04-06 devnull {"aacute", 225},
288 7cf289ca 2004-04-06 devnull {"acirc", 226},
289 7cf289ca 2004-04-06 devnull {"acute", 180},
290 7cf289ca 2004-04-06 devnull {"aelig", 230},
291 7cf289ca 2004-04-06 devnull {"agrave", 224},
292 7cf289ca 2004-04-06 devnull {"alpha", 945},
293 7cf289ca 2004-04-06 devnull {"amp", 38},
294 7cf289ca 2004-04-06 devnull {"aring", 229},
295 7cf289ca 2004-04-06 devnull {"atilde", 227},
296 7cf289ca 2004-04-06 devnull {"auml", 228},
297 7cf289ca 2004-04-06 devnull {"beta", 946},
298 7cf289ca 2004-04-06 devnull {"brvbar", 166},
299 7cf289ca 2004-04-06 devnull {"ccedil", 231},
300 7cf289ca 2004-04-06 devnull {"cdots", 8943},
301 7cf289ca 2004-04-06 devnull {"cedil", 184},
302 7cf289ca 2004-04-06 devnull {"cent", 162},
303 7cf289ca 2004-04-06 devnull {"chi", 967},
304 7cf289ca 2004-04-06 devnull {"copy", 169},
305 7cf289ca 2004-04-06 devnull {"curren", 164},
306 7cf289ca 2004-04-06 devnull {"ddots", 8945},
307 7cf289ca 2004-04-06 devnull {"deg", 176},
308 7cf289ca 2004-04-06 devnull {"delta", 948},
309 7cf289ca 2004-04-06 devnull {"divide", 247},
310 7cf289ca 2004-04-06 devnull {"eacute", 233},
311 7cf289ca 2004-04-06 devnull {"ecirc", 234},
312 7cf289ca 2004-04-06 devnull {"egrave", 232},
313 5b76ae26 2005-09-19 devnull {"emdash", 8212}, /* non-standard but commonly used */
314 7cf289ca 2004-04-06 devnull {"emsp", 8195},
315 5b76ae26 2005-09-19 devnull {"endash", 8211}, /* non-standard but commonly used */
316 7cf289ca 2004-04-06 devnull {"ensp", 8194},
317 7cf289ca 2004-04-06 devnull {"epsilon", 949},
318 7cf289ca 2004-04-06 devnull {"eta", 951},
319 7cf289ca 2004-04-06 devnull {"eth", 240},
320 7cf289ca 2004-04-06 devnull {"euml", 235},
321 7cf289ca 2004-04-06 devnull {"frac12", 189},
322 7cf289ca 2004-04-06 devnull {"frac14", 188},
323 7cf289ca 2004-04-06 devnull {"frac34", 190},
324 7cf289ca 2004-04-06 devnull {"gamma", 947},
325 7cf289ca 2004-04-06 devnull {"gt", 62},
326 7cf289ca 2004-04-06 devnull {"iacute", 237},
327 7cf289ca 2004-04-06 devnull {"icirc", 238},
328 7cf289ca 2004-04-06 devnull {"iexcl", 161},
329 7cf289ca 2004-04-06 devnull {"igrave", 236},
330 7cf289ca 2004-04-06 devnull {"iota", 953},
331 7cf289ca 2004-04-06 devnull {"iquest", 191},
332 7cf289ca 2004-04-06 devnull {"iuml", 239},
333 7cf289ca 2004-04-06 devnull {"kappa", 954},
334 7cf289ca 2004-04-06 devnull {"lambda", 955},
335 7cf289ca 2004-04-06 devnull {"laquo", 171},
336 431e32de 2005-09-30 devnull {"ldquo", 8220},
337 7cf289ca 2004-04-06 devnull {"ldots", 8230},
338 431e32de 2005-09-30 devnull {"lsquo", 8216},
339 7cf289ca 2004-04-06 devnull {"lt", 60},
340 7cf289ca 2004-04-06 devnull {"macr", 175},
341 5b76ae26 2005-09-19 devnull {"mdash", 8212},
342 7cf289ca 2004-04-06 devnull {"micro", 181},
343 7cf289ca 2004-04-06 devnull {"middot", 183},
344 7cf289ca 2004-04-06 devnull {"mu", 956},
345 7cf289ca 2004-04-06 devnull {"nbsp", 160},
346 5b76ae26 2005-09-19 devnull {"ndash", 8211},
347 7cf289ca 2004-04-06 devnull {"not", 172},
348 7cf289ca 2004-04-06 devnull {"ntilde", 241},
349 7cf289ca 2004-04-06 devnull {"nu", 957},
350 7cf289ca 2004-04-06 devnull {"oacute", 243},
351 7cf289ca 2004-04-06 devnull {"ocirc", 244},
352 7cf289ca 2004-04-06 devnull {"ograve", 242},
353 7cf289ca 2004-04-06 devnull {"omega", 969},
354 7cf289ca 2004-04-06 devnull {"omicron", 959},
355 7cf289ca 2004-04-06 devnull {"ordf", 170},
356 7cf289ca 2004-04-06 devnull {"ordm", 186},
357 7cf289ca 2004-04-06 devnull {"oslash", 248},
358 7cf289ca 2004-04-06 devnull {"otilde", 245},
359 7cf289ca 2004-04-06 devnull {"ouml", 246},
360 7cf289ca 2004-04-06 devnull {"para", 182},
361 7cf289ca 2004-04-06 devnull {"phi", 966},
362 7cf289ca 2004-04-06 devnull {"pi", 960},
363 7cf289ca 2004-04-06 devnull {"plusmn", 177},
364 7cf289ca 2004-04-06 devnull {"pound", 163},
365 7cf289ca 2004-04-06 devnull {"psi", 968},
366 7cf289ca 2004-04-06 devnull {"quad", 8193},
367 7cf289ca 2004-04-06 devnull {"quot", 34},
368 7cf289ca 2004-04-06 devnull {"raquo", 187},
369 431e32de 2005-09-30 devnull {"rdquo", 8221},
370 7cf289ca 2004-04-06 devnull {"reg", 174},
371 7cf289ca 2004-04-06 devnull {"rho", 961},
372 431e32de 2005-09-30 devnull {"rsquo", 8217},
373 7cf289ca 2004-04-06 devnull {"sect", 167},
374 7cf289ca 2004-04-06 devnull {"shy", 173},
375 7cf289ca 2004-04-06 devnull {"sigma", 963},
376 7cf289ca 2004-04-06 devnull {"sp", 8194},
377 7cf289ca 2004-04-06 devnull {"sup1", 185},
378 7cf289ca 2004-04-06 devnull {"sup2", 178},
379 7cf289ca 2004-04-06 devnull {"sup3", 179},
380 7cf289ca 2004-04-06 devnull {"szlig", 223},
381 7cf289ca 2004-04-06 devnull {"tau", 964},
382 7cf289ca 2004-04-06 devnull {"theta", 952},
383 7cf289ca 2004-04-06 devnull {"thinsp", 8201},
384 7cf289ca 2004-04-06 devnull {"thorn", 254},
385 7cf289ca 2004-04-06 devnull {"times", 215},
386 7cf289ca 2004-04-06 devnull {"trade", 8482},
387 7cf289ca 2004-04-06 devnull {"uacute", 250},
388 7cf289ca 2004-04-06 devnull {"ucirc", 251},
389 7cf289ca 2004-04-06 devnull {"ugrave", 249},
390 7cf289ca 2004-04-06 devnull {"uml", 168},
391 7cf289ca 2004-04-06 devnull {"upsilon", 965},
392 7cf289ca 2004-04-06 devnull {"uuml", 252},
393 7cf289ca 2004-04-06 devnull {"varepsilon", 8712},
394 7cf289ca 2004-04-06 devnull {"varphi", 981},
395 7cf289ca 2004-04-06 devnull {"varpi", 982},
396 7cf289ca 2004-04-06 devnull {"varrho", 1009},
397 7cf289ca 2004-04-06 devnull {"vdots", 8942},
398 7cf289ca 2004-04-06 devnull {"vsigma", 962},
399 7cf289ca 2004-04-06 devnull {"vtheta", 977},
400 7cf289ca 2004-04-06 devnull {"xi", 958},
401 7cf289ca 2004-04-06 devnull {"yacute", 253},
402 7cf289ca 2004-04-06 devnull {"yen", 165},
403 7cf289ca 2004-04-06 devnull {"yuml", 255},
404 7cf289ca 2004-04-06 devnull {"zeta", 950}
405 7cf289ca 2004-04-06 devnull };
406 5b76ae26 2005-09-19 devnull #define NCHARTAB (sizeof(_chartab)/sizeof(_chartab[0]))
407 7cf289ca 2004-04-06 devnull
/*
 * Winstart..Winend (inclusive) is the range of character codes
 * that Windows interpolates into the Latin1 set.
 * They aren't supposed to appear in HTML, but they do....
 */
enum
{
	Winstart = 127,
	Winend = 159
};
415 7cf289ca 2004-04-06 devnull
/*
 * Replacement code points, one per code in 127..159 (33 entries).
 * 8226 is a bullet; it fills the slots that have no better mapping.
 * NOTE(review): presumably indexed by c - Winstart -- confirm at the use site.
 */
static int winchars[] = {
	8226,							/* 127 */
	8226, 8226, 8218,  402, 8222, 8230, 8224, 8225,		/* 128-135 */
	 710, 8240,  352, 8249,  338, 8226, 8226, 8226,		/* 136-143 */
	8226, 8216, 8217, 8220, 8221, 8226, 8211, 8212,		/* 144-151 */
	 732, 8482,  353, 8250,  339, 8226, 8226,  376		/* 152-159 */
};
421 7cf289ca 2004-04-06 devnull
422 cbeb0b26 2006-04-01 devnull static StringInt* tagtable; /* initialized from tagnames */
423 cbeb0b26 2006-04-01 devnull static StringInt* attrtable; /* initialized from attrnames */
424 7cf289ca 2004-04-06 devnull
425 2b604081 2005-05-07 devnull static void lexinit(void);
426 7cf289ca 2004-04-06 devnull static int getplaindata(TokenSource* ts, Token* a, int* pai);
427 7cf289ca 2004-04-06 devnull static int getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
428 7cf289ca 2004-04-06 devnull static int getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai);
429 7cf289ca 2004-04-06 devnull static int gettag(TokenSource* ts, int starti, Token* a, int* pai);
430 7cf289ca 2004-04-06 devnull static Rune* buftostr(Rune* s, Rune* buf, int j);
431 7cf289ca 2004-04-06 devnull static int comment(TokenSource* ts);
432 7cf289ca 2004-04-06 devnull static int findstr(TokenSource* ts, Rune* s);
433 7cf289ca 2004-04-06 devnull static int ampersand(TokenSource* ts);
434 cbeb0b26 2006-04-01 devnull /*static int lowerc(int c); */
435 7cf289ca 2004-04-06 devnull static int getchar(TokenSource* ts);
436 7cf289ca 2004-04-06 devnull static void ungetchar(TokenSource* ts, int c);
437 7cf289ca 2004-04-06 devnull static void backup(TokenSource* ts, int savei);
438 cbeb0b26 2006-04-01 devnull /*static void freeinsidetoken(Token* t); */
439 7cf289ca 2004-04-06 devnull static void freeattrs(Attr* ahead);
440 7cf289ca 2004-04-06 devnull static Attr* newattr(int attid, Rune* value, Attr* link);
441 7cf289ca 2004-04-06 devnull static int Tconv(Fmt* f);
442 7cf289ca 2004-04-06 devnull
443 7cf289ca 2004-04-06 devnull int dbglex = 0;
444 7cf289ca 2004-04-06 devnull static int lexinited = 0;
445 7cf289ca 2004-04-06 devnull
446 7cf289ca 2004-04-06 devnull static void
447 7cf289ca 2004-04-06 devnull lexinit(void)
448 7cf289ca 2004-04-06 devnull {
449 7e19561a 2005-01-04 devnull chartab = _cvtstringinttab(_chartab, nelem(_chartab));
450 7e19561a 2005-01-04 devnull tagnames = _cvtstringtab(_tagnames, nelem(_tagnames));
451 7cf289ca 2004-04-06 devnull tagtable = _makestrinttab(tagnames, Numtags);
452 7e19561a 2005-01-04 devnull attrnames = _cvtstringtab(_attrnames, nelem(_attrnames));
453 7cf289ca 2004-04-06 devnull attrtable = _makestrinttab(attrnames, Numattrs);
454 7cf289ca 2004-04-06 devnull fmtinstall('T', Tconv);
455 7cf289ca 2004-04-06 devnull lexinited = 1;
456 7cf289ca 2004-04-06 devnull }
457 7cf289ca 2004-04-06 devnull
458 7cf289ca 2004-04-06 devnull static TokenSource*
459 7cf289ca 2004-04-06 devnull newtokensource(uchar* data, int edata, int chset, int mtype)
460 7cf289ca 2004-04-06 devnull {
461 7cf289ca 2004-04-06 devnull TokenSource* ans;
462 7cf289ca 2004-04-06 devnull
463 7cf289ca 2004-04-06 devnull assert(chset == US_Ascii || chset == ISO_8859_1 ||
464 7cf289ca 2004-04-06 devnull chset == UTF_8 || chset == Unicode);
465 7cf289ca 2004-04-06 devnull ans = (TokenSource*)emalloc(sizeof(TokenSource));
466 7cf289ca 2004-04-06 devnull ans->i = 0;
467 7cf289ca 2004-04-06 devnull ans->data = data;
468 7cf289ca 2004-04-06 devnull ans->edata = edata;
469 7cf289ca 2004-04-06 devnull ans->chset = chset;
470 7cf289ca 2004-04-06 devnull ans->mtype = mtype;
471 7cf289ca 2004-04-06 devnull return ans;
472 7cf289ca 2004-04-06 devnull }
473 7cf289ca 2004-04-06 devnull
/* The token array is allocated, and later grown, this many Tokens at a time. */
enum
{
	ToksChunk = 500
};
477 7cf289ca 2004-04-06 devnull
478 cbeb0b26 2006-04-01 devnull /* Call this to get the tokens. */
479 cbeb0b26 2006-04-01 devnull /* The number of returned tokens is returned in *plen. */
480 7cf289ca 2004-04-06 devnull Token*
481 7cf289ca 2004-04-06 devnull _gettoks(uchar* data, int datalen, int chset, int mtype, int* plen)
482 7cf289ca 2004-04-06 devnull {
483 7cf289ca 2004-04-06 devnull TokenSource* ts;
484 7cf289ca 2004-04-06 devnull Token* a;
485 7cf289ca 2004-04-06 devnull int alen;
486 7cf289ca 2004-04-06 devnull int ai;
487 7cf289ca 2004-04-06 devnull int starti;
488 7cf289ca 2004-04-06 devnull int c;
489 7cf289ca 2004-04-06 devnull int tag;
490 7cf289ca 2004-04-06 devnull
491 7cf289ca 2004-04-06 devnull if(!lexinited)
492 7cf289ca 2004-04-06 devnull lexinit();
493 7cf289ca 2004-04-06 devnull ts = newtokensource(data, datalen, chset, mtype);
494 7cf289ca 2004-04-06 devnull alen = ToksChunk;
495 7cf289ca 2004-04-06 devnull a = (Token*)emalloc(alen * sizeof(Token));
496 7cf289ca 2004-04-06 devnull ai = 0;
497 7cf289ca 2004-04-06 devnull if(dbglex)
498 7cf289ca 2004-04-06 devnull fprint(2, "_gettoks starts, ts.i=%d, ts.edata=%d\n", ts->i, ts->edata);
499 431e32de 2005-09-30 devnull if(ts->mtype == TextHtml){
500 431e32de 2005-09-30 devnull for(;;){
501 431e32de 2005-09-30 devnull if(ai == alen){
502 7cf289ca 2004-04-06 devnull a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
503 7cf289ca 2004-04-06 devnull alen += ToksChunk;
504 7cf289ca 2004-04-06 devnull }
505 7cf289ca 2004-04-06 devnull starti = ts->i;
506 7cf289ca 2004-04-06 devnull c = getchar(ts);
507 7cf289ca 2004-04-06 devnull if(c < 0)
508 7cf289ca 2004-04-06 devnull break;
509 431e32de 2005-09-30 devnull if(c == '<'){
510 7cf289ca 2004-04-06 devnull tag = gettag(ts, starti, a, &ai);
511 431e32de 2005-09-30 devnull if(tag == Tscript){
512 cbeb0b26 2006-04-01 devnull /* special rules for getting Data after.... */
513 7cf289ca 2004-04-06 devnull starti = ts->i;
514 7cf289ca 2004-04-06 devnull c = getchar(ts);
515 7cf289ca 2004-04-06 devnull tag = getscriptdata(ts, c, starti, a, &ai);
516 7cf289ca 2004-04-06 devnull }
517 7cf289ca 2004-04-06 devnull }
518 7cf289ca 2004-04-06 devnull else
519 7cf289ca 2004-04-06 devnull tag = getdata(ts, c, starti, a, &ai);
520 7cf289ca 2004-04-06 devnull if(tag == -1)
521 7cf289ca 2004-04-06 devnull break;
522 7cf289ca 2004-04-06 devnull else if(dbglex > 1 && tag != Comment)
523 7cf289ca 2004-04-06 devnull fprint(2, "lex: got token %T\n", &a[ai-1]);
524 7cf289ca 2004-04-06 devnull }
525 7cf289ca 2004-04-06 devnull }
526 7cf289ca 2004-04-06 devnull else {
527 cbeb0b26 2006-04-01 devnull /* plain text (non-html) tokens */
528 431e32de 2005-09-30 devnull for(;;){
529 431e32de 2005-09-30 devnull if(ai == alen){
530 7cf289ca 2004-04-06 devnull a = (Token*)erealloc(a, (alen+ToksChunk)*sizeof(Token));
531 7cf289ca 2004-04-06 devnull alen += ToksChunk;
532 7cf289ca 2004-04-06 devnull }
533 7cf289ca 2004-04-06 devnull tag = getplaindata(ts, a, &ai);
534 7cf289ca 2004-04-06 devnull if(tag == -1)
535 7cf289ca 2004-04-06 devnull break;
536 7cf289ca 2004-04-06 devnull if(dbglex > 1)
537 7cf289ca 2004-04-06 devnull fprint(2, "lex: got token %T\n", &a[ai]);
538 7cf289ca 2004-04-06 devnull }
539 7cf289ca 2004-04-06 devnull }
540 7cf289ca 2004-04-06 devnull if(dbglex)
541 7cf289ca 2004-04-06 devnull fprint(2, "lex: returning %d tokens\n", ai);
542 7cf289ca 2004-04-06 devnull *plen = ai;
543 7cf289ca 2004-04-06 devnull if(ai == 0)
544 7cf289ca 2004-04-06 devnull return nil;
545 7cf289ca 2004-04-06 devnull return a;
546 7cf289ca 2004-04-06 devnull }
547 7cf289ca 2004-04-06 devnull
548 cbeb0b26 2006-04-01 devnull /* For case where source isn't HTML. */
549 cbeb0b26 2006-04-01 devnull /* Just make data tokens, one per line (or partial line, */
550 cbeb0b26 2006-04-01 devnull /* at end of buffer), ignoring non-whitespace control */
551 cbeb0b26 2006-04-01 devnull /* characters and dumping \r's. */
552 cbeb0b26 2006-04-01 devnull /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
553 cbeb0b26 2006-04-01 devnull /* Otherwise return -1; */
554 7cf289ca 2004-04-06 devnull static int
555 7cf289ca 2004-04-06 devnull getplaindata(TokenSource* ts, Token* a, int* pai)
556 7cf289ca 2004-04-06 devnull {
557 7cf289ca 2004-04-06 devnull Rune* s;
558 7cf289ca 2004-04-06 devnull int j;
559 7cf289ca 2004-04-06 devnull int starti;
560 7cf289ca 2004-04-06 devnull int c;
561 7cf289ca 2004-04-06 devnull Token* tok;
562 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
563 7cf289ca 2004-04-06 devnull
564 7cf289ca 2004-04-06 devnull s = nil;
565 7cf289ca 2004-04-06 devnull j = 0;
566 7cf289ca 2004-04-06 devnull starti = ts->i;
567 431e32de 2005-09-30 devnull for(c = getchar(ts); c >= 0; c = getchar(ts)){
568 431e32de 2005-09-30 devnull if(c < ' '){
569 431e32de 2005-09-30 devnull if(isspace(c)){
570 431e32de 2005-09-30 devnull if(c == '\r'){
571 cbeb0b26 2006-04-01 devnull /* ignore it unless no following '\n', */
572 cbeb0b26 2006-04-01 devnull /* in which case treat it like '\n' */
573 7cf289ca 2004-04-06 devnull c = getchar(ts);
574 431e32de 2005-09-30 devnull if(c != '\n'){
575 7cf289ca 2004-04-06 devnull if(c >= 0)
576 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
577 7cf289ca 2004-04-06 devnull c = '\n';
578 7cf289ca 2004-04-06 devnull }
579 7cf289ca 2004-04-06 devnull }
580 7cf289ca 2004-04-06 devnull }
581 7cf289ca 2004-04-06 devnull else
582 7cf289ca 2004-04-06 devnull c = 0;
583 7cf289ca 2004-04-06 devnull }
584 431e32de 2005-09-30 devnull if(c != 0){
585 7cf289ca 2004-04-06 devnull buf[j++] = c;
586 431e32de 2005-09-30 devnull if(j == sizeof(buf)-1){
587 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
588 7cf289ca 2004-04-06 devnull j = 0;
589 7cf289ca 2004-04-06 devnull }
590 7cf289ca 2004-04-06 devnull }
591 7cf289ca 2004-04-06 devnull if(c == '\n')
592 7cf289ca 2004-04-06 devnull break;
593 7cf289ca 2004-04-06 devnull }
594 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
595 7cf289ca 2004-04-06 devnull if(s == nil)
596 7cf289ca 2004-04-06 devnull return -1;
597 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
598 7cf289ca 2004-04-06 devnull tok->tag = Data;
599 7cf289ca 2004-04-06 devnull tok->text = s;
600 7cf289ca 2004-04-06 devnull tok->attr = nil;
601 7cf289ca 2004-04-06 devnull tok->starti = starti;
602 7cf289ca 2004-04-06 devnull return Data;
603 7cf289ca 2004-04-06 devnull }
604 7cf289ca 2004-04-06 devnull
605 cbeb0b26 2006-04-01 devnull /* Return concatenation of s and buf[0:j] */
606 7cf289ca 2004-04-06 devnull static Rune*
607 7cf289ca 2004-04-06 devnull buftostr(Rune* s, Rune* buf, int j)
608 7cf289ca 2004-04-06 devnull {
609 7cf289ca 2004-04-06 devnull buf[j] = 0;
610 7cf289ca 2004-04-06 devnull if(s == nil)
611 7cf289ca 2004-04-06 devnull s = _Strndup(buf, j);
612 7cf289ca 2004-04-06 devnull else
613 7cf289ca 2004-04-06 devnull s = _Strdup2(s, buf);
614 7cf289ca 2004-04-06 devnull return s;
615 7cf289ca 2004-04-06 devnull }
616 7cf289ca 2004-04-06 devnull
617 cbeb0b26 2006-04-01 devnull /* Gather data up to next start-of-tag or end-of-buffer. */
618 cbeb0b26 2006-04-01 devnull /* Translate entity references (&amp;). */
619 cbeb0b26 2006-04-01 devnull /* Ignore non-whitespace control characters and get rid of \r's. */
620 cbeb0b26 2006-04-01 devnull /* If find non-empty token, fill in a[*pai], bump *pai, and return Data. */
621 cbeb0b26 2006-04-01 devnull /* Otherwise return -1; */
622 7cf289ca 2004-04-06 devnull static int
623 7cf289ca 2004-04-06 devnull getdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
624 7cf289ca 2004-04-06 devnull {
625 7cf289ca 2004-04-06 devnull Rune* s;
626 7cf289ca 2004-04-06 devnull int j;
627 7cf289ca 2004-04-06 devnull int c;
628 7cf289ca 2004-04-06 devnull Token* tok;
629 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
630 7cf289ca 2004-04-06 devnull
631 7cf289ca 2004-04-06 devnull s = nil;
632 7cf289ca 2004-04-06 devnull j = 0;
633 7cf289ca 2004-04-06 devnull c = firstc;
634 431e32de 2005-09-30 devnull while(c >= 0){
635 431e32de 2005-09-30 devnull if(c == '&'){
636 7cf289ca 2004-04-06 devnull c = ampersand(ts);
637 7cf289ca 2004-04-06 devnull if(c < 0)
638 7cf289ca 2004-04-06 devnull break;
639 7cf289ca 2004-04-06 devnull }
640 431e32de 2005-09-30 devnull else if(c < ' '){
641 431e32de 2005-09-30 devnull if(isspace(c)){
642 431e32de 2005-09-30 devnull if(c == '\r'){
643 cbeb0b26 2006-04-01 devnull /* ignore it unless no following '\n', */
644 cbeb0b26 2006-04-01 devnull /* in which case treat it like '\n' */
645 7cf289ca 2004-04-06 devnull c = getchar(ts);
646 431e32de 2005-09-30 devnull if(c != '\n'){
647 7cf289ca 2004-04-06 devnull if(c >= 0)
648 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
649 7cf289ca 2004-04-06 devnull c = '\n';
650 7cf289ca 2004-04-06 devnull }
651 7cf289ca 2004-04-06 devnull }
652 7cf289ca 2004-04-06 devnull }
653 7cf289ca 2004-04-06 devnull else {
654 7cf289ca 2004-04-06 devnull if(warn)
655 7cf289ca 2004-04-06 devnull fprint(2, "warning: non-whitespace control character %d ignored\n", c);
656 7cf289ca 2004-04-06 devnull c = 0;
657 7cf289ca 2004-04-06 devnull }
658 7cf289ca 2004-04-06 devnull }
659 431e32de 2005-09-30 devnull else if(c == '<'){
660 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
661 7cf289ca 2004-04-06 devnull break;
662 7cf289ca 2004-04-06 devnull }
663 431e32de 2005-09-30 devnull if(c != 0){
664 7cf289ca 2004-04-06 devnull buf[j++] = c;
665 431e32de 2005-09-30 devnull if(j == BIGBUFSIZE-1){
666 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
667 7cf289ca 2004-04-06 devnull j = 0;
668 7cf289ca 2004-04-06 devnull }
669 7cf289ca 2004-04-06 devnull }
670 7cf289ca 2004-04-06 devnull c = getchar(ts);
671 7cf289ca 2004-04-06 devnull }
672 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
673 7cf289ca 2004-04-06 devnull if(s == nil)
674 7cf289ca 2004-04-06 devnull return -1;
675 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
676 7cf289ca 2004-04-06 devnull tok->tag = Data;
677 7cf289ca 2004-04-06 devnull tok->text = s;
678 7cf289ca 2004-04-06 devnull tok->attr = nil;
679 7cf289ca 2004-04-06 devnull tok->starti = starti;
680 7cf289ca 2004-04-06 devnull return Data;
681 7cf289ca 2004-04-06 devnull }
682 7cf289ca 2004-04-06 devnull
683 cbeb0b26 2006-04-01 devnull /* The rules for lexing scripts are different (ugh). */
684 cbeb0b26 2006-04-01 devnull /* Gather up everything until see a </SCRIPT>. */
685 7cf289ca 2004-04-06 devnull static int
686 7cf289ca 2004-04-06 devnull getscriptdata(TokenSource* ts, int firstc, int starti, Token* a, int* pai)
687 7cf289ca 2004-04-06 devnull {
688 7cf289ca 2004-04-06 devnull Rune* s;
689 7cf289ca 2004-04-06 devnull int j;
690 7cf289ca 2004-04-06 devnull int tstarti;
691 7cf289ca 2004-04-06 devnull int savei;
692 7cf289ca 2004-04-06 devnull int c;
693 7cf289ca 2004-04-06 devnull int tag;
694 7cf289ca 2004-04-06 devnull int done;
695 7cf289ca 2004-04-06 devnull Token* tok;
696 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
697 7cf289ca 2004-04-06 devnull
698 7cf289ca 2004-04-06 devnull s = nil;
699 7cf289ca 2004-04-06 devnull j = 0;
700 7cf289ca 2004-04-06 devnull tstarti = starti;
701 7cf289ca 2004-04-06 devnull c = firstc;
702 7cf289ca 2004-04-06 devnull done = 0;
703 431e32de 2005-09-30 devnull while(c >= 0){
704 431e32de 2005-09-30 devnull if(c == '<'){
705 cbeb0b26 2006-04-01 devnull /* other browsers ignore stuff to end of line after <! */
706 7cf289ca 2004-04-06 devnull savei = ts->i;
707 7cf289ca 2004-04-06 devnull c = getchar(ts);
708 431e32de 2005-09-30 devnull if(c == '!'){
709 7cf289ca 2004-04-06 devnull while(c >= 0 && c != '\n' && c != '\r')
710 7cf289ca 2004-04-06 devnull c = getchar(ts);
711 7cf289ca 2004-04-06 devnull if(c == '\r')
712 7cf289ca 2004-04-06 devnull c = getchar(ts);
713 7cf289ca 2004-04-06 devnull if(c == '\n')
714 7cf289ca 2004-04-06 devnull c = getchar(ts);
715 7cf289ca 2004-04-06 devnull }
716 431e32de 2005-09-30 devnull else if(c >= 0){
717 7cf289ca 2004-04-06 devnull backup(ts, savei);
718 7cf289ca 2004-04-06 devnull tag = gettag(ts, tstarti, a, pai);
719 7cf289ca 2004-04-06 devnull if(tag == -1)
720 7cf289ca 2004-04-06 devnull break;
721 7cf289ca 2004-04-06 devnull if(tag != Comment)
722 7cf289ca 2004-04-06 devnull (*pai)--;
723 7cf289ca 2004-04-06 devnull backup(ts, tstarti);
724 431e32de 2005-09-30 devnull if(tag == Tscript + RBRA){
725 7cf289ca 2004-04-06 devnull done = 1;
726 7cf289ca 2004-04-06 devnull break;
727 7cf289ca 2004-04-06 devnull }
728 cbeb0b26 2006-04-01 devnull /* here tag was not </SCRIPT>, so take as regular data */
729 7cf289ca 2004-04-06 devnull c = getchar(ts);
730 7cf289ca 2004-04-06 devnull }
731 7cf289ca 2004-04-06 devnull }
732 7cf289ca 2004-04-06 devnull if(c < 0)
733 7cf289ca 2004-04-06 devnull break;
734 431e32de 2005-09-30 devnull if(c != 0){
735 7cf289ca 2004-04-06 devnull buf[j++] = c;
736 431e32de 2005-09-30 devnull if(j == BIGBUFSIZE-1){
737 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
738 7cf289ca 2004-04-06 devnull j = 0;
739 7cf289ca 2004-04-06 devnull }
740 7cf289ca 2004-04-06 devnull }
741 7cf289ca 2004-04-06 devnull tstarti = ts->i;
742 7cf289ca 2004-04-06 devnull c = getchar(ts);
743 7cf289ca 2004-04-06 devnull }
744 431e32de 2005-09-30 devnull if(done || ts->i == ts->edata){
745 7cf289ca 2004-04-06 devnull s = buftostr(s, buf, j);
746 7cf289ca 2004-04-06 devnull tok = &a[(*pai)++];
747 7cf289ca 2004-04-06 devnull tok->tag = Data;
748 7cf289ca 2004-04-06 devnull tok->text = s;
749 7cf289ca 2004-04-06 devnull tok->attr = nil;
750 7cf289ca 2004-04-06 devnull tok->starti = starti;
751 7cf289ca 2004-04-06 devnull return Data;
752 7cf289ca 2004-04-06 devnull }
753 7cf289ca 2004-04-06 devnull backup(ts, starti);
754 7cf289ca 2004-04-06 devnull return -1;
755 7cf289ca 2004-04-06 devnull }
756 7cf289ca 2004-04-06 devnull
757 cbeb0b26 2006-04-01 devnull /* We've just seen a '<'. Gather up stuff to closing '>' (if buffer */
758 cbeb0b26 2006-04-01 devnull /* ends before then, return -1). */
759 cbeb0b26 2006-04-01 devnull /* If it's a tag, look up the name, gather the attributes, and return */
760 cbeb0b26 2006-04-01 devnull /* the appropriate token. */
761 cbeb0b26 2006-04-01 devnull /* Else it's either just plain data or some kind of ignorable stuff: */
762 cbeb0b26 2006-04-01 devnull /* return Data or Comment as appropriate. */
763 cbeb0b26 2006-04-01 devnull /* If it's not a Comment, put it in a[*pai] and bump *pai. */
764 7cf289ca 2004-04-06 devnull static int
765 7cf289ca 2004-04-06 devnull gettag(TokenSource* ts, int starti, Token* a, int* pai)
766 7cf289ca 2004-04-06 devnull {
767 7cf289ca 2004-04-06 devnull int rbra;
768 7cf289ca 2004-04-06 devnull int ans;
769 7cf289ca 2004-04-06 devnull Attr* al;
770 7cf289ca 2004-04-06 devnull int nexti;
771 7cf289ca 2004-04-06 devnull int c;
772 7cf289ca 2004-04-06 devnull int ti;
773 7cf289ca 2004-04-06 devnull int afnd;
774 7cf289ca 2004-04-06 devnull int attid;
775 7cf289ca 2004-04-06 devnull int quote;
776 7cf289ca 2004-04-06 devnull Rune* val;
777 7cf289ca 2004-04-06 devnull int nv;
778 7cf289ca 2004-04-06 devnull int i;
779 7cf289ca 2004-04-06 devnull int tag;
780 7cf289ca 2004-04-06 devnull Token* tok;
781 7cf289ca 2004-04-06 devnull Rune buf[BIGBUFSIZE];
782 7cf289ca 2004-04-06 devnull
783 7cf289ca 2004-04-06 devnull rbra = 0;
784 7cf289ca 2004-04-06 devnull nexti = ts->i;
785 7cf289ca 2004-04-06 devnull tok = &a[*pai];
786 7cf289ca 2004-04-06 devnull tok->tag = Notfound;
787 7cf289ca 2004-04-06 devnull tok->text = nil;
788 7cf289ca 2004-04-06 devnull tok->attr = nil;
789 7cf289ca 2004-04-06 devnull tok->starti = starti;
790 7cf289ca 2004-04-06 devnull c = getchar(ts);
791 431e32de 2005-09-30 devnull if(c == '/'){
792 7cf289ca 2004-04-06 devnull rbra = RBRA;
793 7cf289ca 2004-04-06 devnull c = getchar(ts);
794 7cf289ca 2004-04-06 devnull }
795 7cf289ca 2004-04-06 devnull if(c < 0)
796 7cf289ca 2004-04-06 devnull goto eob_done;
797 431e32de 2005-09-30 devnull if(c >= 256 || !isalpha(c)){
798 cbeb0b26 2006-04-01 devnull /* not a tag */
799 431e32de 2005-09-30 devnull if(c == '!'){
800 7cf289ca 2004-04-06 devnull ans = comment(ts);
801 7cf289ca 2004-04-06 devnull if(ans != -1)
802 7cf289ca 2004-04-06 devnull return ans;
803 7cf289ca 2004-04-06 devnull goto eob_done;
804 7cf289ca 2004-04-06 devnull }
805 7cf289ca 2004-04-06 devnull else {
806 7cf289ca 2004-04-06 devnull backup(ts, nexti);
807 7cf289ca 2004-04-06 devnull tok->tag = Data;
808 7cf289ca 2004-04-06 devnull tok->text = _Strdup(L(Llt));
809 7cf289ca 2004-04-06 devnull (*pai)++;
810 7cf289ca 2004-04-06 devnull return Data;
811 7cf289ca 2004-04-06 devnull }
812 7cf289ca 2004-04-06 devnull }
813 cbeb0b26 2006-04-01 devnull /* c starts a tagname */
814 7cf289ca 2004-04-06 devnull buf[0] = c;
815 7cf289ca 2004-04-06 devnull i = 1;
816 431e32de 2005-09-30 devnull for(;;){
817 7cf289ca 2004-04-06 devnull c = getchar(ts);
818 7cf289ca 2004-04-06 devnull if(c < 0)
819 7cf289ca 2004-04-06 devnull goto eob_done;
820 7cf289ca 2004-04-06 devnull if(!ISNAMCHAR(c))
821 7cf289ca 2004-04-06 devnull break;
822 cbeb0b26 2006-04-01 devnull /* if name is bigger than buf it won't be found anyway... */
823 7cf289ca 2004-04-06 devnull if(i < BIGBUFSIZE)
824 7cf289ca 2004-04-06 devnull buf[i++] = c;
825 7cf289ca 2004-04-06 devnull }
826 7cf289ca 2004-04-06 devnull if(_lookup(tagtable, Numtags, buf, i, &tag))
827 7cf289ca 2004-04-06 devnull tok->tag = tag + rbra;
828 7cf289ca 2004-04-06 devnull else
829 cbeb0b26 2006-04-01 devnull tok->text = _Strndup(buf, i); /* for warning print, in build */
830 7cf289ca 2004-04-06 devnull
831 cbeb0b26 2006-04-01 devnull /* attribute gathering loop */
832 7cf289ca 2004-04-06 devnull al = nil;
833 431e32de 2005-09-30 devnull for(;;){
834 cbeb0b26 2006-04-01 devnull /* look for "ws name" or "ws name ws = ws val" (ws=whitespace) */
835 cbeb0b26 2006-04-01 devnull /* skip whitespace */
836 7cf289ca 2004-04-06 devnull attrloop_continue:
837 431e32de 2005-09-30 devnull while(c < 256 && isspace(c)){
838 7cf289ca 2004-04-06 devnull c = getchar(ts);
839 7cf289ca 2004-04-06 devnull if(c < 0)
840 7cf289ca 2004-04-06 devnull goto eob_done;
841 7cf289ca 2004-04-06 devnull }
842 7cf289ca 2004-04-06 devnull if(c == '>')
843 7cf289ca 2004-04-06 devnull goto attrloop_done;
844 431e32de 2005-09-30 devnull if(c == '<'){
845 7cf289ca 2004-04-06 devnull if(warn)
846 7cf289ca 2004-04-06 devnull fprint(2, "warning: unclosed tag\n");
847 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
848 7cf289ca 2004-04-06 devnull goto attrloop_done;
849 7cf289ca 2004-04-06 devnull }
850 431e32de 2005-09-30 devnull if(c >= 256 || !isalpha(c)){
851 7cf289ca 2004-04-06 devnull if(warn)
852 7cf289ca 2004-04-06 devnull fprint(2, "warning: expected attribute name\n");
853 cbeb0b26 2006-04-01 devnull /* skipt to next attribute name */
854 431e32de 2005-09-30 devnull for(;;){
855 7cf289ca 2004-04-06 devnull c = getchar(ts);
856 7cf289ca 2004-04-06 devnull if(c < 0)
857 7cf289ca 2004-04-06 devnull goto eob_done;
858 7cf289ca 2004-04-06 devnull if(c < 256 && isalpha(c))
859 7cf289ca 2004-04-06 devnull goto attrloop_continue;
860 431e32de 2005-09-30 devnull if(c == '<'){
861 7cf289ca 2004-04-06 devnull if(warn)
862 7cf289ca 2004-04-06 devnull fprint(2, "warning: unclosed tag\n");
863 7cf289ca 2004-04-06 devnull ungetchar(ts, 60);
864 7cf289ca 2004-04-06 devnull goto attrloop_done;
865 7cf289ca 2004-04-06 devnull }
866 7cf289ca 2004-04-06 devnull if(c == '>')
867 7cf289ca 2004-04-06 devnull goto attrloop_done;
868 7cf289ca 2004-04-06 devnull }
869 7cf289ca 2004-04-06 devnull }
870 cbeb0b26 2006-04-01 devnull /* gather attribute name */
871 7cf289ca 2004-04-06 devnull buf[0] = c;
872 7cf289ca 2004-04-06 devnull i = 1;
873 431e32de 2005-09-30 devnull for(;;){
874 7cf289ca 2004-04-06 devnull c = getchar(ts);
875 7cf289ca 2004-04-06 devnull if(c < 0)
876 7cf289ca 2004-04-06 devnull goto eob_done;
877 7cf289ca 2004-04-06 devnull if(!ISNAMCHAR(c))
878 7cf289ca 2004-04-06 devnull break;
879 7cf289ca 2004-04-06 devnull if(i < BIGBUFSIZE-1)
880 7cf289ca 2004-04-06 devnull buf[i++] = c;
881 7cf289ca 2004-04-06 devnull }
882 7cf289ca 2004-04-06 devnull afnd = _lookup(attrtable, Numattrs, buf, i, &attid);
883 431e32de 2005-09-30 devnull if(warn && !afnd){
884 7cf289ca 2004-04-06 devnull buf[i] = 0;
885 7cf289ca 2004-04-06 devnull fprint(2, "warning: unknown attribute name %S\n", buf);
886 7cf289ca 2004-04-06 devnull }
887 cbeb0b26 2006-04-01 devnull /* skip whitespace */
888 431e32de 2005-09-30 devnull while(c < 256 && isspace(c)){
889 7cf289ca 2004-04-06 devnull c = getchar(ts);
890 7cf289ca 2004-04-06 devnull if(c < 0)
891 7cf289ca 2004-04-06 devnull goto eob_done;
892 7cf289ca 2004-04-06 devnull }
893 431e32de 2005-09-30 devnull if(c != '='){
894 7cf289ca 2004-04-06 devnull if(afnd)
895 7cf289ca 2004-04-06 devnull al = newattr(attid, nil, al);
896 7cf289ca 2004-04-06 devnull goto attrloop_continue;
897 7cf289ca 2004-04-06 devnull }
898 cbeb0b26 2006-04-01 devnull /*# c is '=' here; skip whitespace */
899 431e32de 2005-09-30 devnull for(;;){
900 7cf289ca 2004-04-06 devnull c = getchar(ts);
901 7cf289ca 2004-04-06 devnull if(c < 0)
902 7cf289ca 2004-04-06 devnull goto eob_done;
903 7cf289ca 2004-04-06 devnull if(c >= 256 || !isspace(c))
904 7cf289ca 2004-04-06 devnull break;
905 7cf289ca 2004-04-06 devnull }
906 7cf289ca 2004-04-06 devnull quote = 0;
907 431e32de 2005-09-30 devnull if(c == '\'' || c == '"'){
908 7cf289ca 2004-04-06 devnull quote = c;
909 7cf289ca 2004-04-06 devnull c = getchar(ts);
910 7cf289ca 2004-04-06 devnull if(c < 0)
911 7cf289ca 2004-04-06 devnull goto eob_done;
912 7cf289ca 2004-04-06 devnull }
913 7cf289ca 2004-04-06 devnull val = nil;
914 7cf289ca 2004-04-06 devnull nv = 0;
915 431e32de 2005-09-30 devnull for(;;){
916 7cf289ca 2004-04-06 devnull valloop_continue:
917 7cf289ca 2004-04-06 devnull if(c < 0)
918 7cf289ca 2004-04-06 devnull goto eob_done;
919 431e32de 2005-09-30 devnull if(c == '>'){
920 431e32de 2005-09-30 devnull if(quote){
921 cbeb0b26 2006-04-01 devnull /* c might be part of string (though not good style) */
922 cbeb0b26 2006-04-01 devnull /* but if line ends before close quote, assume */
923 cbeb0b26 2006-04-01 devnull /* there was an unmatched quote */
924 7cf289ca 2004-04-06 devnull ti = ts->i;
925 431e32de 2005-09-30 devnull for(;;){
926 7cf289ca 2004-04-06 devnull c = getchar(ts);
927 7cf289ca 2004-04-06 devnull if(c < 0)
928 7cf289ca 2004-04-06 devnull goto eob_done;
929 431e32de 2005-09-30 devnull if(c == quote){
930 7cf289ca 2004-04-06 devnull backup(ts, ti);
931 7cf289ca 2004-04-06 devnull buf[nv++] = '>';
932 431e32de 2005-09-30 devnull if(nv == BIGBUFSIZE-1){
933 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
934 7cf289ca 2004-04-06 devnull nv = 0;
935 7cf289ca 2004-04-06 devnull }
936 7cf289ca 2004-04-06 devnull c = getchar(ts);
937 7cf289ca 2004-04-06 devnull goto valloop_continue;
938 7cf289ca 2004-04-06 devnull }
939 431e32de 2005-09-30 devnull if(c == '\n'){
940 7cf289ca 2004-04-06 devnull if(warn)
941 7cf289ca 2004-04-06 devnull fprint(2, "warning: apparent unmatched quote\n");
942 7cf289ca 2004-04-06 devnull backup(ts, ti);
943 7cf289ca 2004-04-06 devnull c = '>';
944 7cf289ca 2004-04-06 devnull goto valloop_done;
945 7cf289ca 2004-04-06 devnull }
946 7cf289ca 2004-04-06 devnull }
947 7cf289ca 2004-04-06 devnull }
948 7cf289ca 2004-04-06 devnull else
949 7cf289ca 2004-04-06 devnull goto valloop_done;
950 7cf289ca 2004-04-06 devnull }
951 431e32de 2005-09-30 devnull if(quote){
952 431e32de 2005-09-30 devnull if(c == quote){
953 7cf289ca 2004-04-06 devnull c = getchar(ts);
954 7cf289ca 2004-04-06 devnull if(c < 0)
955 7cf289ca 2004-04-06 devnull goto eob_done;
956 7cf289ca 2004-04-06 devnull goto valloop_done;
957 7cf289ca 2004-04-06 devnull }
958 431e32de 2005-09-30 devnull if(c == '\r'){
959 7cf289ca 2004-04-06 devnull c = getchar(ts);
960 7cf289ca 2004-04-06 devnull goto valloop_continue;
961 7cf289ca 2004-04-06 devnull }
962 7cf289ca 2004-04-06 devnull if(c == '\t' || c == '\n')
963 7cf289ca 2004-04-06 devnull c = ' ';
964 7cf289ca 2004-04-06 devnull }
965 7cf289ca 2004-04-06 devnull else {
966 7cf289ca 2004-04-06 devnull if(c < 256 && isspace(c))
967 7cf289ca 2004-04-06 devnull goto valloop_done;
968 7cf289ca 2004-04-06 devnull }
969 431e32de 2005-09-30 devnull if(c == '&'){
970 7cf289ca 2004-04-06 devnull c = ampersand(ts);
971 7cf289ca 2004-04-06 devnull if(c == -1)
972 7cf289ca 2004-04-06 devnull goto eob_done;
973 7cf289ca 2004-04-06 devnull }
974 7cf289ca 2004-04-06 devnull buf[nv++] = c;
975 431e32de 2005-09-30 devnull if(nv == BIGBUFSIZE-1){
976 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
977 7cf289ca 2004-04-06 devnull nv = 0;
978 7cf289ca 2004-04-06 devnull }
979 7cf289ca 2004-04-06 devnull c = getchar(ts);
980 7cf289ca 2004-04-06 devnull }
981 7cf289ca 2004-04-06 devnull valloop_done:
982 431e32de 2005-09-30 devnull if(afnd){
983 7cf289ca 2004-04-06 devnull val = buftostr(val, buf, nv);
984 7cf289ca 2004-04-06 devnull al = newattr(attid, val, al);
985 7cf289ca 2004-04-06 devnull }
986 7cf289ca 2004-04-06 devnull }
987 7cf289ca 2004-04-06 devnull
988 7cf289ca 2004-04-06 devnull attrloop_done:
989 7cf289ca 2004-04-06 devnull tok->attr = al;
990 7cf289ca 2004-04-06 devnull (*pai)++;
991 7cf289ca 2004-04-06 devnull return tok->tag;
992 7cf289ca 2004-04-06 devnull
993 7cf289ca 2004-04-06 devnull eob_done:
994 7cf289ca 2004-04-06 devnull if(warn)
995 7cf289ca 2004-04-06 devnull fprint(2, "warning: incomplete tag at end of page\n");
996 7cf289ca 2004-04-06 devnull backup(ts, nexti);
997 7cf289ca 2004-04-06 devnull tok->tag = Data;
998 7cf289ca 2004-04-06 devnull tok->text = _Strdup(L(Llt));
999 7cf289ca 2004-04-06 devnull return Data;
1000 7cf289ca 2004-04-06 devnull }
1001 7cf289ca 2004-04-06 devnull
1002 cbeb0b26 2006-04-01 devnull /* We've just read a '<!' at position starti, */
1003 cbeb0b26 2006-04-01 devnull /* so this may be a comment or other ignored section, or it may */
1004 cbeb0b26 2006-04-01 devnull /* be just a literal string if there is no close before end of file */
1005 cbeb0b26 2006-04-01 devnull /* (other browsers do that). */
1006 cbeb0b26 2006-04-01 devnull /* The accepted practice seems to be (note: contrary to SGML spec!): */
1007 cbeb0b26 2006-04-01 devnull /* If see <!--, look for --> to close, or if none, > to close. */
1008 cbeb0b26 2006-04-01 devnull /* If see <!(not --), look for > to close. */
1009 cbeb0b26 2006-04-01 devnull /* If no close before end of file, leave original characters in as literal data. */
1010 cbeb0b26 2006-04-01 devnull /* */
1011 cbeb0b26 2006-04-01 devnull /* If we see ignorable stuff, return Comment. */
1012 cbeb0b26 2006-04-01 devnull /* Else return nil (caller should back up and try again when more data arrives, */
1013 cbeb0b26 2006-04-01 devnull /* unless at end of file, in which case caller should just make '<' a data token). */
1014 7cf289ca 2004-04-06 devnull static int
1015 7cf289ca 2004-04-06 devnull comment(TokenSource* ts)
1016 7cf289ca 2004-04-06 devnull {
1017 7cf289ca 2004-04-06 devnull int nexti;
1018 7cf289ca 2004-04-06 devnull int havecomment;
1019 7cf289ca 2004-04-06 devnull int c;
1020 7cf289ca 2004-04-06 devnull
1021 7cf289ca 2004-04-06 devnull nexti = ts->i;
1022 7cf289ca 2004-04-06 devnull havecomment = 0;
1023 7cf289ca 2004-04-06 devnull c = getchar(ts);
1024 431e32de 2005-09-30 devnull if(c == '-'){
1025 7cf289ca 2004-04-06 devnull c = getchar(ts);
1026 431e32de 2005-09-30 devnull if(c == '-'){
1027 7cf289ca 2004-04-06 devnull if(findstr(ts, L(Larrow)))
1028 7cf289ca 2004-04-06 devnull havecomment = 1;
1029 7cf289ca 2004-04-06 devnull else
1030 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1031 7cf289ca 2004-04-06 devnull }
1032 7cf289ca 2004-04-06 devnull }
1033 431e32de 2005-09-30 devnull if(!havecomment){
1034 7cf289ca 2004-04-06 devnull if(c == '>')
1035 7cf289ca 2004-04-06 devnull havecomment = 1;
1036 431e32de 2005-09-30 devnull else if(c >= 0){
1037 7cf289ca 2004-04-06 devnull if(findstr(ts, L(Lgt)))
1038 7cf289ca 2004-04-06 devnull havecomment = 1;
1039 7cf289ca 2004-04-06 devnull }
1040 7cf289ca 2004-04-06 devnull }
1041 7cf289ca 2004-04-06 devnull if(havecomment)
1042 7cf289ca 2004-04-06 devnull return Comment;
1043 7cf289ca 2004-04-06 devnull return -1;
1044 7cf289ca 2004-04-06 devnull }
1045 7cf289ca 2004-04-06 devnull
1046 cbeb0b26 2006-04-01 devnull /* Look for string s in token source. */
1047 cbeb0b26 2006-04-01 devnull /* If found, return 1, with buffer at next char after s, */
1048 cbeb0b26 2006-04-01 devnull /* else return 0 (caller should back up). */
1049 7cf289ca 2004-04-06 devnull static int
1050 7cf289ca 2004-04-06 devnull findstr(TokenSource* ts, Rune* s)
1051 7cf289ca 2004-04-06 devnull {
1052 7cf289ca 2004-04-06 devnull int c0;
1053 7cf289ca 2004-04-06 devnull int n;
1054 7cf289ca 2004-04-06 devnull int nexti;
1055 7cf289ca 2004-04-06 devnull int i;
1056 7cf289ca 2004-04-06 devnull int c;
1057 7cf289ca 2004-04-06 devnull
1058 7cf289ca 2004-04-06 devnull c0 = s[0];
1059 7cf289ca 2004-04-06 devnull n = runestrlen(s);
1060 431e32de 2005-09-30 devnull for(;;){
1061 7cf289ca 2004-04-06 devnull c = getchar(ts);
1062 7cf289ca 2004-04-06 devnull if(c < 0)
1063 7cf289ca 2004-04-06 devnull break;
1064 431e32de 2005-09-30 devnull if(c == c0){
1065 7cf289ca 2004-04-06 devnull if(n == 1)
1066 7cf289ca 2004-04-06 devnull return 1;
1067 7cf289ca 2004-04-06 devnull nexti = ts->i;
1068 431e32de 2005-09-30 devnull for(i = 1; i < n; i++){
1069 7cf289ca 2004-04-06 devnull c = getchar(ts);
1070 7cf289ca 2004-04-06 devnull if(c < 0)
1071 7cf289ca 2004-04-06 devnull goto mainloop_done;
1072 7cf289ca 2004-04-06 devnull if(c != s[i])
1073 7cf289ca 2004-04-06 devnull break;
1074 7cf289ca 2004-04-06 devnull }
1075 7cf289ca 2004-04-06 devnull if(i == n)
1076 7cf289ca 2004-04-06 devnull return 1;
1077 7cf289ca 2004-04-06 devnull backup(ts, nexti);
1078 7cf289ca 2004-04-06 devnull }
1079 7cf289ca 2004-04-06 devnull }
1080 7cf289ca 2004-04-06 devnull mainloop_done:
1081 7cf289ca 2004-04-06 devnull return 0;
1082 7cf289ca 2004-04-06 devnull }
1083 7cf289ca 2004-04-06 devnull
/*
 * Value of the hexadecimal digit c (0-15; either letter case is
 * accepted), or -1 if c is not a hexadecimal digit.
 */
static int
xdigit(int c)
{
	/* fold upper-case letters onto lower-case ones */
	if(c >= 'A' && c <= 'F')
		c += 'a' - 'A';
	if(c >= 'a' && c <= 'f')
		return 10 + (c - 'a');
	if(c >= '0' && c <= '9')
		return c - '0';
	return -1;
}
1095 431e32de 2005-09-30 devnull
1096 cbeb0b26 2006-04-01 devnull /* We've just read an '&'; look for an entity reference */
1097 cbeb0b26 2006-04-01 devnull /* name, and if found, return translated char. */
1098 cbeb0b26 2006-04-01 devnull /* if there is a complete entity name but it isn't known, */
1099 cbeb0b26 2006-04-01 devnull /* try prefixes (gets around some buggy HTML out there), */
1100 cbeb0b26 2006-04-01 devnull /* and if that fails, back up to just past the '&' and return '&'. */
1101 cbeb0b26 2006-04-01 devnull /* If the entity can't be completed in the current buffer, back up */
1102 cbeb0b26 2006-04-01 devnull /* to the '&' and return -1. */
1103 7cf289ca 2004-04-06 devnull static int
1104 7cf289ca 2004-04-06 devnull ampersand(TokenSource* ts)
1105 7cf289ca 2004-04-06 devnull {
1106 7cf289ca 2004-04-06 devnull int savei;
1107 7cf289ca 2004-04-06 devnull int c;
1108 7cf289ca 2004-04-06 devnull int fnd;
1109 7cf289ca 2004-04-06 devnull int ans;
1110 7cf289ca 2004-04-06 devnull int v;
1111 7cf289ca 2004-04-06 devnull int i;
1112 7cf289ca 2004-04-06 devnull int k;
1113 7cf289ca 2004-04-06 devnull Rune buf[SMALLBUFSIZE];
1114 7cf289ca 2004-04-06 devnull
1115 7cf289ca 2004-04-06 devnull savei = ts->i;
1116 7cf289ca 2004-04-06 devnull c = getchar(ts);
1117 7cf289ca 2004-04-06 devnull fnd = 0;
1118 7cf289ca 2004-04-06 devnull ans = -1;
1119 431e32de 2005-09-30 devnull if(c == '#'){
1120 7cf289ca 2004-04-06 devnull c = getchar(ts);
1121 7cf289ca 2004-04-06 devnull v = 0;
1122 431e32de 2005-09-30 devnull if(c == 'x'){
1123 7cf289ca 2004-04-06 devnull c = getchar(ts);
1124 431e32de 2005-09-30 devnull while((i=xdigit(c)) != -1){
1125 431e32de 2005-09-30 devnull v = v*16 + i;
1126 431e32de 2005-09-30 devnull c = getchar(ts);
1127 431e32de 2005-09-30 devnull }
1128 431e32de 2005-09-30 devnull }else{
1129 431e32de 2005-09-30 devnull while('0' <= c && c <= '9'){
1130 431e32de 2005-09-30 devnull v = v*10 + c - '0';
1131 431e32de 2005-09-30 devnull c = getchar(ts);
1132 431e32de 2005-09-30 devnull }
1133 7cf289ca 2004-04-06 devnull }
1134 431e32de 2005-09-30 devnull if(c >= 0){
1135 7cf289ca 2004-04-06 devnull if(!(c == ';' || c == '\n' || c == '\r'))
1136 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1137 7cf289ca 2004-04-06 devnull c = v;
1138 7cf289ca 2004-04-06 devnull if(c == 160)
1139 7cf289ca 2004-04-06 devnull c = 160;
1140 431e32de 2005-09-30 devnull if(c >= Winstart && c <= Winend){
1141 7cf289ca 2004-04-06 devnull c = winchars[c - Winstart];
1142 7cf289ca 2004-04-06 devnull }
1143 7cf289ca 2004-04-06 devnull ans = c;
1144 7cf289ca 2004-04-06 devnull fnd = 1;
1145 7cf289ca 2004-04-06 devnull }
1146 7cf289ca 2004-04-06 devnull }
1147 431e32de 2005-09-30 devnull else if(c < 256 && isalpha(c)){
1148 7cf289ca 2004-04-06 devnull buf[0] = c;
1149 7cf289ca 2004-04-06 devnull k = 1;
1150 431e32de 2005-09-30 devnull for(;;){
1151 7cf289ca 2004-04-06 devnull c = getchar(ts);
1152 7cf289ca 2004-04-06 devnull if(c < 0)
1153 7cf289ca 2004-04-06 devnull break;
1154 431e32de 2005-09-30 devnull if(ISNAMCHAR(c)){
1155 7cf289ca 2004-04-06 devnull if(k < SMALLBUFSIZE-1)
1156 7cf289ca 2004-04-06 devnull buf[k++] = c;
1157 7cf289ca 2004-04-06 devnull }
1158 7cf289ca 2004-04-06 devnull else {
1159 7cf289ca 2004-04-06 devnull if(!(c == ';' || c == '\n' || c == '\r'))
1160 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1161 7cf289ca 2004-04-06 devnull break;
1162 7cf289ca 2004-04-06 devnull }
1163 7cf289ca 2004-04-06 devnull }
1164 431e32de 2005-09-30 devnull if(c >= 0){
1165 7cf289ca 2004-04-06 devnull fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1166 431e32de 2005-09-30 devnull if(!fnd){
1167 cbeb0b26 2006-04-01 devnull /* Try prefixes of s */
1168 7cf289ca 2004-04-06 devnull if(c == ';' || c == '\n' || c == '\r')
1169 7cf289ca 2004-04-06 devnull ungetchar(ts, c);
1170 7cf289ca 2004-04-06 devnull i = k;
1171 431e32de 2005-09-30 devnull while(--k > 0){
1172 7cf289ca 2004-04-06 devnull fnd = _lookup(chartab, NCHARTAB, buf, k, &ans);
1173 431e32de 2005-09-30 devnull if(fnd){
1174 431e32de 2005-09-30 devnull while(i > k){
1175 7cf289ca 2004-04-06 devnull i--;
1176 7cf289ca 2004-04-06 devnull ungetchar(ts, buf[i]);
1177 7cf289ca 2004-04-06 devnull }
1178 7cf289ca 2004-04-06 devnull break;
1179 7cf289ca 2004-04-06 devnull }
1180 7cf289ca 2004-04-06 devnull }
1181 7cf289ca 2004-04-06 devnull }
1182 7cf289ca 2004-04-06 devnull }
1183 7cf289ca 2004-04-06 devnull }
1184 431e32de 2005-09-30 devnull if(!fnd){
1185 7cf289ca 2004-04-06 devnull backup(ts, savei);
1186 7cf289ca 2004-04-06 devnull ans = '&';
1187 7cf289ca 2004-04-06 devnull }
1188 7cf289ca 2004-04-06 devnull return ans;
1189 7cf289ca 2004-04-06 devnull }
1190 7cf289ca 2004-04-06 devnull
1191 cbeb0b26 2006-04-01 devnull /* Get next char, obeying ts.chset. */
1192 cbeb0b26 2006-04-01 devnull /* Returns -1 if no complete character left before current end of data. */
1193 7cf289ca 2004-04-06 devnull static int
1194 7cf289ca 2004-04-06 devnull getchar(TokenSource* ts)
1195 7cf289ca 2004-04-06 devnull {
1196 7cf289ca 2004-04-06 devnull uchar* buf;
1197 7cf289ca 2004-04-06 devnull int c;
1198 7cf289ca 2004-04-06 devnull int n;
1199 7cf289ca 2004-04-06 devnull int ok;
1200 7cf289ca 2004-04-06 devnull Rune r;
1201 7cf289ca 2004-04-06 devnull
1202 7cf289ca 2004-04-06 devnull if(ts->i >= ts->edata)
1203 7cf289ca 2004-04-06 devnull return -1;
1204 7cf289ca 2004-04-06 devnull buf = ts->data;
1205 7cf289ca 2004-04-06 devnull c = buf[ts->i];
1206 431e32de 2005-09-30 devnull switch(ts->chset){
1207 7cf289ca 2004-04-06 devnull case ISO_8859_1:
1208 7cf289ca 2004-04-06 devnull if(c >= Winstart && c <= Winend)
1209 7cf289ca 2004-04-06 devnull c = winchars[c - Winstart];
1210 7cf289ca 2004-04-06 devnull ts->i++;
1211 7cf289ca 2004-04-06 devnull break;
1212 7cf289ca 2004-04-06 devnull case US_Ascii:
1213 431e32de 2005-09-30 devnull if(c > 127){
1214 7cf289ca 2004-04-06 devnull if(warn)
1215 7cf289ca 2004-04-06 devnull fprint(2, "non-ascii char (%x) when US-ASCII specified\n", c);
1216 7cf289ca 2004-04-06 devnull }
1217 7cf289ca 2004-04-06 devnull ts->i++;
1218 7cf289ca 2004-04-06 devnull break;
1219 7cf289ca 2004-04-06 devnull case UTF_8:
1220 7cf289ca 2004-04-06 devnull ok = fullrune((char*)(buf+ts->i), ts->edata-ts->i);
1221 7cf289ca 2004-04-06 devnull n = chartorune(&r, (char*)(buf+ts->i));
1222 431e32de 2005-09-30 devnull if(ok){
1223 7cf289ca 2004-04-06 devnull if(warn && c == 0x80)
1224 7cf289ca 2004-04-06 devnull fprint(2, "warning: invalid utf-8 sequence (starts with %x)\n", ts->data[ts->i]);
1225 7cf289ca 2004-04-06 devnull ts->i += n;
1226 7cf289ca 2004-04-06 devnull c = r;
1227 7cf289ca 2004-04-06 devnull }
1228 7cf289ca 2004-04-06 devnull else {
1229 cbeb0b26 2006-04-01 devnull /* not enough bytes in buf to complete utf-8 char */
1230 cbeb0b26 2006-04-01 devnull ts->i = ts->edata; /* mark "all used" */
1231 7cf289ca 2004-04-06 devnull c = -1;
1232 7cf289ca 2004-04-06 devnull }
1233 7cf289ca 2004-04-06 devnull break;
1234 7cf289ca 2004-04-06 devnull case Unicode:
1235 431e32de 2005-09-30 devnull if(ts->i < ts->edata - 1){
1236 cbeb0b26 2006-04-01 devnull /*standards say most-significant byte first */
1237 7cf289ca 2004-04-06 devnull c = (c << 8)|(buf[ts->i + 1]);
1238 7cf289ca 2004-04-06 devnull ts->i += 2;
1239 7cf289ca 2004-04-06 devnull }
1240 7cf289ca 2004-04-06 devnull else {
1241 cbeb0b26 2006-04-01 devnull ts->i = ts->edata; /* mark "all used" */
1242 7cf289ca 2004-04-06 devnull c = -1;
1243 7cf289ca 2004-04-06 devnull }
1244 7cf289ca 2004-04-06 devnull break;
1245 7cf289ca 2004-04-06 devnull }
1246 7cf289ca 2004-04-06 devnull return c;
1247 7cf289ca 2004-04-06 devnull }
1248 7cf289ca 2004-04-06 devnull
1249 cbeb0b26 2006-04-01 devnull /* Assuming c was the last character returned by getchar, set */
1250 cbeb0b26 2006-04-01 devnull /* things up so that next getchar will get that same character */
1251 cbeb0b26 2006-04-01 devnull /* followed by the current 'next character', etc. */
1252 7cf289ca 2004-04-06 devnull static void
1253 7cf289ca 2004-04-06 devnull ungetchar(TokenSource* ts, int c)
1254 7cf289ca 2004-04-06 devnull {
1255 7cf289ca 2004-04-06 devnull int n;
1256 7cf289ca 2004-04-06 devnull Rune r;
1257 7cf289ca 2004-04-06 devnull char a[UTFmax];
1258 7cf289ca 2004-04-06 devnull
1259 7cf289ca 2004-04-06 devnull n = 1;
1260 431e32de 2005-09-30 devnull switch(ts->chset){
1261 7cf289ca 2004-04-06 devnull case UTF_8:
1262 431e32de 2005-09-30 devnull if(c >= 128){
1263 7cf289ca 2004-04-06 devnull r = c;
1264 7cf289ca 2004-04-06 devnull n = runetochar(a, &r);
1265 7cf289ca 2004-04-06 devnull }
1266 7cf289ca 2004-04-06 devnull break;
1267 7cf289ca 2004-04-06 devnull case Unicode:
1268 7cf289ca 2004-04-06 devnull n = 2;
1269 7cf289ca 2004-04-06 devnull break;
1270 7cf289ca 2004-04-06 devnull }
1271 7cf289ca 2004-04-06 devnull ts->i -= n;
1272 7cf289ca 2004-04-06 devnull }
1273 7cf289ca 2004-04-06 devnull
1274 cbeb0b26 2006-04-01 devnull /* Restore ts so that it is at the state where the index was savei. */
1275 7cf289ca 2004-04-06 devnull static void
1276 7cf289ca 2004-04-06 devnull backup(TokenSource* ts, int savei)
1277 7cf289ca 2004-04-06 devnull {
1278 7cf289ca 2004-04-06 devnull if(dbglex)
1279 7cf289ca 2004-04-06 devnull fprint(2, "lex: backup; i=%d, savei=%d\n", ts->i, savei);
1280 7cf289ca 2004-04-06 devnull ts->i = savei;
1281 7cf289ca 2004-04-06 devnull }
1282 7cf289ca 2004-04-06 devnull
1283 7cf289ca 2004-04-06 devnull
1284 cbeb0b26 2006-04-01 devnull /* Look for value associated with attribute attid in token t. */
1285 cbeb0b26 2006-04-01 devnull /* If there is one, return 1 and put the value in *pans, */
1286 cbeb0b26 2006-04-01 devnull /* else return 0. */
1287 cbeb0b26 2006-04-01 devnull /* If xfer is true, transfer ownership of the string to the caller */
1288 cbeb0b26 2006-04-01 devnull /* (nil it out here); otherwise, caller must duplicate the answer */
1289 cbeb0b26 2006-04-01 devnull /* if it needs to save it. */
1290 cbeb0b26 2006-04-01 devnull /* OK to have pans==0, in which case this is just looking */
1291 cbeb0b26 2006-04-01 devnull /* to see if token is present. */
1292 7cf289ca 2004-04-06 devnull int
1293 7cf289ca 2004-04-06 devnull _tokaval(Token* t, int attid, Rune** pans, int xfer)
1294 7cf289ca 2004-04-06 devnull {
1295 7cf289ca 2004-04-06 devnull Attr* attr;
1296 7cf289ca 2004-04-06 devnull
1297 7cf289ca 2004-04-06 devnull attr = t->attr;
1298 431e32de 2005-09-30 devnull while(attr != nil){
1299 431e32de 2005-09-30 devnull if(attr->attid == attid){
1300 7cf289ca 2004-04-06 devnull if(pans != nil)
1301 7cf289ca 2004-04-06 devnull *pans = attr->value;
1302 7cf289ca 2004-04-06 devnull if(xfer)
1303 7cf289ca 2004-04-06 devnull attr->value = nil;
1304 7cf289ca 2004-04-06 devnull return 1;
1305 7cf289ca 2004-04-06 devnull }
1306 7cf289ca 2004-04-06 devnull attr = attr->next;
1307 7cf289ca 2004-04-06 devnull }
1308 7cf289ca 2004-04-06 devnull if(pans != nil)
1309 7cf289ca 2004-04-06 devnull *pans = nil;
1310 7cf289ca 2004-04-06 devnull return 0;
1311 7cf289ca 2004-04-06 devnull }
1312 7cf289ca 2004-04-06 devnull
1313 7cf289ca 2004-04-06 devnull static int
1314 7cf289ca 2004-04-06 devnull Tconv(Fmt *f)
1315 7cf289ca 2004-04-06 devnull {
1316 7cf289ca 2004-04-06 devnull Token* t;
1317 7cf289ca 2004-04-06 devnull int i;
1318 7cf289ca 2004-04-06 devnull int tag;
1319 7cf289ca 2004-04-06 devnull char* srbra;
1320 7cf289ca 2004-04-06 devnull Rune* aname;
1321 7cf289ca 2004-04-06 devnull Rune* tname;
1322 7cf289ca 2004-04-06 devnull Attr* a;
1323 7cf289ca 2004-04-06 devnull char buf[BIGBUFSIZE];
1324 7cf289ca 2004-04-06 devnull
1325 7cf289ca 2004-04-06 devnull t = va_arg(f->args, Token*);
1326 7cf289ca 2004-04-06 devnull if(t == nil)
1327 7cf289ca 2004-04-06 devnull sprint(buf, "<null>");
1328 7cf289ca 2004-04-06 devnull else {
1329 7cf289ca 2004-04-06 devnull i = 0;
1330 7cf289ca 2004-04-06 devnull if(dbglex > 1)
1331 7cf289ca 2004-04-06 devnull i = snprint(buf, sizeof(buf), "[%d]", t->starti);
1332 7cf289ca 2004-04-06 devnull tag = t->tag;
1333 431e32de 2005-09-30 devnull if(tag == Data){
1334 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "'%S'", t->text);
1335 7cf289ca 2004-04-06 devnull }
1336 7cf289ca 2004-04-06 devnull else {
1337 7cf289ca 2004-04-06 devnull srbra = "";
1338 431e32de 2005-09-30 devnull if(tag >= RBRA){
1339 7cf289ca 2004-04-06 devnull tag -= RBRA;
1340 7cf289ca 2004-04-06 devnull srbra = "/";
1341 7cf289ca 2004-04-06 devnull }
1342 7cf289ca 2004-04-06 devnull tname = tagnames[tag];
1343 7cf289ca 2004-04-06 devnull if(tag == Notfound)
1344 7cf289ca 2004-04-06 devnull tname = L(Lquestion);
1345 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "<%s%S", srbra, tname);
1346 431e32de 2005-09-30 devnull for(a = t->attr; a != nil; a = a->next){
1347 7cf289ca 2004-04-06 devnull aname = attrnames[a->attid];
1348 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, " %S", aname);
1349 7cf289ca 2004-04-06 devnull if(a->value != nil)
1350 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, "=%S", a->value);
1351 7cf289ca 2004-04-06 devnull }
1352 7cf289ca 2004-04-06 devnull i += snprint(buf+i, sizeof(buf)-i-1, ">");
1353 7cf289ca 2004-04-06 devnull }
1354 7cf289ca 2004-04-06 devnull buf[i] = 0;
1355 7cf289ca 2004-04-06 devnull }
1356 7cf289ca 2004-04-06 devnull return fmtstrcpy(f, buf);
1357 7cf289ca 2004-04-06 devnull }
1358 7cf289ca 2004-04-06 devnull
1359 cbeb0b26 2006-04-01 devnull /* Attrs own their constituent strings, but build may eventually */
1360 cbeb0b26 2006-04-01 devnull /* transfer some values to its items and nil them out in the Attr. */
1361 7cf289ca 2004-04-06 devnull static Attr*
1362 7cf289ca 2004-04-06 devnull newattr(int attid, Rune* value, Attr* link)
1363 7cf289ca 2004-04-06 devnull {
1364 7cf289ca 2004-04-06 devnull Attr* ans;
1365 7cf289ca 2004-04-06 devnull
1366 7cf289ca 2004-04-06 devnull ans = (Attr*)emalloc(sizeof(Attr));
1367 7cf289ca 2004-04-06 devnull ans->attid = attid;
1368 7cf289ca 2004-04-06 devnull ans->value = value;
1369 7cf289ca 2004-04-06 devnull ans->next = link;
1370 7cf289ca 2004-04-06 devnull return ans;
1371 7cf289ca 2004-04-06 devnull }
1372 7cf289ca 2004-04-06 devnull
1373 cbeb0b26 2006-04-01 devnull /* Free list of Attrs linked through next field */
1374 7cf289ca 2004-04-06 devnull static void
1375 7cf289ca 2004-04-06 devnull freeattrs(Attr* ahead)
1376 7cf289ca 2004-04-06 devnull {
1377 7cf289ca 2004-04-06 devnull Attr* a;
1378 7cf289ca 2004-04-06 devnull Attr* nexta;
1379 7cf289ca 2004-04-06 devnull
1380 7cf289ca 2004-04-06 devnull a = ahead;
1381 431e32de 2005-09-30 devnull while(a != nil){
1382 7cf289ca 2004-04-06 devnull nexta = a->next;
1383 7cf289ca 2004-04-06 devnull free(a->value);
1384 7cf289ca 2004-04-06 devnull free(a);
1385 7cf289ca 2004-04-06 devnull a = nexta;
1386 7cf289ca 2004-04-06 devnull }
1387 7cf289ca 2004-04-06 devnull }
1388 7cf289ca 2004-04-06 devnull
1389 cbeb0b26 2006-04-01 devnull /* Free array of Tokens. */
1390 cbeb0b26 2006-04-01 devnull /* Allocated space might have room for more than n tokens, */
1391 cbeb0b26 2006-04-01 devnull /* but only n of them are initialized. */
1392 cbeb0b26 2006-04-01 devnull /* If caller has transferred ownership of constitutent strings */
1393 cbeb0b26 2006-04-01 devnull /* or attributes, it must have nil'd out the pointers in the Tokens. */
1394 7cf289ca 2004-04-06 devnull void
1395 7cf289ca 2004-04-06 devnull _freetokens(Token* tarray, int n)
1396 7cf289ca 2004-04-06 devnull {
1397 7cf289ca 2004-04-06 devnull int i;
1398 7cf289ca 2004-04-06 devnull Token* t;
1399 7cf289ca 2004-04-06 devnull
1400 7cf289ca 2004-04-06 devnull if(tarray == nil)
1401 7cf289ca 2004-04-06 devnull return;
1402 431e32de 2005-09-30 devnull for(i = 0; i < n; i++){
1403 7cf289ca 2004-04-06 devnull t = &tarray[i];
1404 7cf289ca 2004-04-06 devnull free(t->text);
1405 7cf289ca 2004-04-06 devnull freeattrs(t->attr);
1406 7cf289ca 2004-04-06 devnull }
1407 7cf289ca 2004-04-06 devnull free(tarray);
1408 7cf289ca 2004-04-06 devnull }