Blame


1 7cf289ca 2004-04-06 devnull #include <u.h>
2 7cf289ca 2004-04-06 devnull #include <libc.h>
3 7cf289ca 2004-04-06 devnull #include <bio.h>
4 7cf289ca 2004-04-06 devnull #include <draw.h>
5 7cf289ca 2004-04-06 devnull #include <regexp.h>
6 7cf289ca 2004-04-06 devnull #include <html.h>
7 7cf289ca 2004-04-06 devnull #include <ctype.h>
8 7cf289ca 2004-04-06 devnull #include "dat.h"
9 7cf289ca 2004-04-06 devnull
10 7cf289ca 2004-04-06 devnull char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
11 7cf289ca 2004-04-06 devnull Reprog *urlprog;
12 7cf289ca 2004-04-06 devnull
13 7cf289ca 2004-04-06 devnull int inword = 0;
14 7cf289ca 2004-04-06 devnull int col = 0;
15 7cf289ca 2004-04-06 devnull int wordi = 0;
16 7cf289ca 2004-04-06 devnull
17 7cf289ca 2004-04-06 devnull char*
18 7cf289ca 2004-04-06 devnull loadhtml(int fd)
19 7cf289ca 2004-04-06 devnull {
20 7cf289ca 2004-04-06 devnull URLwin *u;
21 7cf289ca 2004-04-06 devnull Bytes *b;
22 7cf289ca 2004-04-06 devnull int n;
23 7cf289ca 2004-04-06 devnull char buf[4096];
24 7cf289ca 2004-04-06 devnull
25 7cf289ca 2004-04-06 devnull u = emalloc(sizeof(URLwin));
26 7cf289ca 2004-04-06 devnull u->infd = fd;
27 7cf289ca 2004-04-06 devnull u->outfd = 1;
28 7cf289ca 2004-04-06 devnull u->url = estrdup(url);
29 7cf289ca 2004-04-06 devnull u->type = TextHtml;
30 7cf289ca 2004-04-06 devnull
31 7cf289ca 2004-04-06 devnull b = emalloc(sizeof(Bytes));
32 7cf289ca 2004-04-06 devnull while((n = read(fd, buf, sizeof buf)) > 0)
33 7cf289ca 2004-04-06 devnull growbytes(b, buf, n);
34 7cf289ca 2004-04-06 devnull if(b->b == nil)
35 7cf289ca 2004-04-06 devnull return nil; /* empty file */
36 7cf289ca 2004-04-06 devnull rendertext(u, b);
37 7cf289ca 2004-04-06 devnull freeurlwin(u);
38 7cf289ca 2004-04-06 devnull return nil;
39 7cf289ca 2004-04-06 devnull }
40 7cf289ca 2004-04-06 devnull
41 7cf289ca 2004-04-06 devnull char*
42 7cf289ca 2004-04-06 devnull runetobyte(Rune *r, int n)
43 7cf289ca 2004-04-06 devnull {
44 7cf289ca 2004-04-06 devnull char *s;
45 7cf289ca 2004-04-06 devnull
46 7cf289ca 2004-04-06 devnull if(n == 0)
47 7cf289ca 2004-04-06 devnull return emalloc(1);
48 7cf289ca 2004-04-06 devnull s = smprint("%.*S", n, r);
49 7cf289ca 2004-04-06 devnull if(s == nil)
50 7cf289ca 2004-04-06 devnull error("malloc failed");
51 7cf289ca 2004-04-06 devnull return s;
52 7cf289ca 2004-04-06 devnull }
53 7cf289ca 2004-04-06 devnull
/*
 * Report whether c is a closing punctuation character, i.e. one that
 * should attach to the preceding word without an intervening space.
 * Returns 1 if so, 0 otherwise.
 *
 * c may be any rune value.  Only ASCII characters are considered, so
 * that a wide rune whose low byte happens to equal a punctuation
 * character is not misclassified, and 0 is never reported as
 * punctuation.
 */
int
closingpunct(int c)
{
	static char punct[] = ".,:;'\")]}>!?";
	char *p;

	if(c <= 0 || c >= 0x80)
		return 0;
	for(p = punct; *p != '\0'; p++)
		if(*p == c)
			return 1;
	return 0;
}
59 7cf289ca 2004-04-06 devnull
60 7cf289ca 2004-04-06 devnull void
61 7cf289ca 2004-04-06 devnull emitword(Bytes *b, Rune *r, int nr)
62 7cf289ca 2004-04-06 devnull {
63 7cf289ca 2004-04-06 devnull char *s;
64 7cf289ca 2004-04-06 devnull int space;
65 7cf289ca 2004-04-06 devnull
66 7cf289ca 2004-04-06 devnull if(nr == 0)
67 7cf289ca 2004-04-06 devnull return;
68 7cf289ca 2004-04-06 devnull s = smprint("%.*S", nr, r);
69 7cf289ca 2004-04-06 devnull space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
70 7cf289ca 2004-04-06 devnull if(col>0 && col+space+nr > width){
71 7cf289ca 2004-04-06 devnull growbytes(b, "\n", 1);
72 7cf289ca 2004-04-06 devnull space = 0;
73 7cf289ca 2004-04-06 devnull col = 0;
74 7cf289ca 2004-04-06 devnull }
75 7cf289ca 2004-04-06 devnull if(space && col>0){
76 7cf289ca 2004-04-06 devnull growbytes(b, " ", 1);
77 7cf289ca 2004-04-06 devnull col++;
78 7cf289ca 2004-04-06 devnull }
79 7cf289ca 2004-04-06 devnull growbytes(b, s, strlen(s));
80 7cf289ca 2004-04-06 devnull col += nr;
81 7cf289ca 2004-04-06 devnull free(s);
82 7cf289ca 2004-04-06 devnull inword = 0;
83 7cf289ca 2004-04-06 devnull }
84 7cf289ca 2004-04-06 devnull
85 7cf289ca 2004-04-06 devnull void
86 7cf289ca 2004-04-06 devnull renderrunes(Bytes *b, Rune *r)
87 7cf289ca 2004-04-06 devnull {
88 7cf289ca 2004-04-06 devnull int i, n;
89 7cf289ca 2004-04-06 devnull
90 7cf289ca 2004-04-06 devnull n = runestrlen(r);
91 7cf289ca 2004-04-06 devnull for(i=0; i<n; i++){
92 7cf289ca 2004-04-06 devnull switch(r[i]){
93 7cf289ca 2004-04-06 devnull case '\n':
94 7cf289ca 2004-04-06 devnull if(inword)
95 7cf289ca 2004-04-06 devnull emitword(b, r+wordi, i-wordi);
96 7cf289ca 2004-04-06 devnull col = 0;
97 7cf289ca 2004-04-06 devnull if(b->n == 0)
98 7cf289ca 2004-04-06 devnull break; /* don't start with blank lines */
99 7cf289ca 2004-04-06 devnull if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
100 7cf289ca 2004-04-06 devnull growbytes(b, "\n", 1);
101 7cf289ca 2004-04-06 devnull break;
102 7cf289ca 2004-04-06 devnull case ' ':
103 7cf289ca 2004-04-06 devnull if(inword)
104 7cf289ca 2004-04-06 devnull emitword(b, r+wordi, i-wordi);
105 7cf289ca 2004-04-06 devnull break;
106 7cf289ca 2004-04-06 devnull default:
107 7cf289ca 2004-04-06 devnull if(!inword)
108 7cf289ca 2004-04-06 devnull wordi = i;
109 7cf289ca 2004-04-06 devnull inword = 1;
110 7cf289ca 2004-04-06 devnull break;
111 7cf289ca 2004-04-06 devnull }
112 7cf289ca 2004-04-06 devnull }
113 7cf289ca 2004-04-06 devnull if(inword)
114 7cf289ca 2004-04-06 devnull emitword(b, r+wordi, i-wordi);
115 7cf289ca 2004-04-06 devnull }
116 7cf289ca 2004-04-06 devnull
117 7cf289ca 2004-04-06 devnull void
118 7cf289ca 2004-04-06 devnull renderbytes(Bytes *b, char *fmt, ...)
119 7cf289ca 2004-04-06 devnull {
120 7cf289ca 2004-04-06 devnull Rune *r;
121 7cf289ca 2004-04-06 devnull va_list arg;
122 7cf289ca 2004-04-06 devnull
123 7cf289ca 2004-04-06 devnull va_start(arg, fmt);
124 7cf289ca 2004-04-06 devnull r = runevsmprint(fmt, arg);
125 7cf289ca 2004-04-06 devnull va_end(arg);
126 7cf289ca 2004-04-06 devnull renderrunes(b, r);
127 7cf289ca 2004-04-06 devnull free(r);
128 7cf289ca 2004-04-06 devnull }
129 7cf289ca 2004-04-06 devnull
130 7cf289ca 2004-04-06 devnull char*
131 7cf289ca 2004-04-06 devnull baseurl(char *url)
132 7cf289ca 2004-04-06 devnull {
133 7cf289ca 2004-04-06 devnull char *base, *slash;
134 7cf289ca 2004-04-06 devnull Resub rs[10];
135 7cf289ca 2004-04-06 devnull
136 7cf289ca 2004-04-06 devnull if(url == nil)
137 7cf289ca 2004-04-06 devnull return nil;
138 7cf289ca 2004-04-06 devnull if(urlprog == nil){
139 7cf289ca 2004-04-06 devnull urlprog = regcomp(urlexpr);
140 7cf289ca 2004-04-06 devnull if(urlprog == nil)
141 7cf289ca 2004-04-06 devnull error("can't compile URL regexp");
142 7cf289ca 2004-04-06 devnull }
143 7cf289ca 2004-04-06 devnull memset(rs, 0, sizeof rs);
144 7cf289ca 2004-04-06 devnull if(regexec(urlprog, url, rs, nelem(rs)) == 0)
145 7cf289ca 2004-04-06 devnull return nil;
146 7cf289ca 2004-04-06 devnull base = estrdup(url);
147 7cf289ca 2004-04-06 devnull slash = strrchr(base, '/');
148 650deb79 2004-04-08 devnull if(slash!=nil && slash>=&base[rs[0].e.ep-rs[0].s.sp])
149 7cf289ca 2004-04-06 devnull *slash = '\0';
150 7cf289ca 2004-04-06 devnull else
151 650deb79 2004-04-08 devnull base[rs[0].e.ep-rs[0].s.sp] = '\0';
152 7cf289ca 2004-04-06 devnull return base;
153 7cf289ca 2004-04-06 devnull }
154 7cf289ca 2004-04-06 devnull
155 7cf289ca 2004-04-06 devnull char*
156 7cf289ca 2004-04-06 devnull fullurl(URLwin *u, Rune *rhref)
157 7cf289ca 2004-04-06 devnull {
158 7cf289ca 2004-04-06 devnull char *base, *href, *hrefbase;
159 7cf289ca 2004-04-06 devnull char *result;
160 7cf289ca 2004-04-06 devnull
161 7cf289ca 2004-04-06 devnull if(rhref == nil)
162 7cf289ca 2004-04-06 devnull return estrdup("NULL URL");
163 7cf289ca 2004-04-06 devnull href = runetobyte(rhref, runestrlen(rhref));
164 7cf289ca 2004-04-06 devnull hrefbase = baseurl(href);
165 7cf289ca 2004-04-06 devnull result = nil;
166 7cf289ca 2004-04-06 devnull if(hrefbase==nil && (base = baseurl(u->url))!=nil){
167 7cf289ca 2004-04-06 devnull result = estrdup(base);
168 7cf289ca 2004-04-06 devnull if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
169 7cf289ca 2004-04-06 devnull result = eappend(result, "/", "");
170 7cf289ca 2004-04-06 devnull free(base);
171 7cf289ca 2004-04-06 devnull }
172 7cf289ca 2004-04-06 devnull if(href){
173 7cf289ca 2004-04-06 devnull if(result)
174 7cf289ca 2004-04-06 devnull result = eappend(result, "", href);
175 7cf289ca 2004-04-06 devnull else
176 7cf289ca 2004-04-06 devnull result = estrdup(href);
177 7cf289ca 2004-04-06 devnull }
178 7cf289ca 2004-04-06 devnull free(hrefbase);
179 7cf289ca 2004-04-06 devnull if(result == nil)
180 7cf289ca 2004-04-06 devnull return estrdup("***unknown***");
181 7cf289ca 2004-04-06 devnull return result;
182 7cf289ca 2004-04-06 devnull }
183 7cf289ca 2004-04-06 devnull
184 7cf289ca 2004-04-06 devnull void
185 7cf289ca 2004-04-06 devnull render(URLwin *u, Bytes *t, Item *items, int curanchor)
186 7cf289ca 2004-04-06 devnull {
187 7cf289ca 2004-04-06 devnull Item *il;
188 7cf289ca 2004-04-06 devnull Itext *it;
189 7cf289ca 2004-04-06 devnull Ifloat *ifl;
190 7cf289ca 2004-04-06 devnull Ispacer *is;
191 7cf289ca 2004-04-06 devnull Itable *ita;
192 7cf289ca 2004-04-06 devnull Iimage *im;
193 7cf289ca 2004-04-06 devnull Anchor *a;
194 7cf289ca 2004-04-06 devnull Table *tab;
195 7cf289ca 2004-04-06 devnull Tablecell *cell;
196 7cf289ca 2004-04-06 devnull char *href;
197 7cf289ca 2004-04-06 devnull
198 7cf289ca 2004-04-06 devnull inword = 0;
199 7cf289ca 2004-04-06 devnull col = 0;
200 7cf289ca 2004-04-06 devnull wordi = 0;
201 7cf289ca 2004-04-06 devnull
202 7cf289ca 2004-04-06 devnull for(il=items; il!=nil; il=il->next){
203 7cf289ca 2004-04-06 devnull if(il->state & IFbrk)
204 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
205 7cf289ca 2004-04-06 devnull if(il->state & IFbrksp)
206 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
207 7cf289ca 2004-04-06 devnull
208 7cf289ca 2004-04-06 devnull switch(il->tag){
209 7cf289ca 2004-04-06 devnull case Itexttag:
210 7cf289ca 2004-04-06 devnull it = (Itext*)il;
211 7cf289ca 2004-04-06 devnull renderrunes(t, it->s);
212 7cf289ca 2004-04-06 devnull break;
213 7cf289ca 2004-04-06 devnull case Iruletag:
214 7cf289ca 2004-04-06 devnull if(t->n>0 && t->b[t->n-1]!='\n')
215 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
216 7cf289ca 2004-04-06 devnull renderbytes(t, "=======\n");
217 7cf289ca 2004-04-06 devnull break;
218 7cf289ca 2004-04-06 devnull case Iimagetag:
219 7cf289ca 2004-04-06 devnull if(!aflag)
220 7cf289ca 2004-04-06 devnull break;
221 7cf289ca 2004-04-06 devnull im = (Iimage*)il;
222 7cf289ca 2004-04-06 devnull if(im->imsrc){
223 7cf289ca 2004-04-06 devnull href = fullurl(u, im->imsrc);
224 7cf289ca 2004-04-06 devnull renderbytes(t, "[image %s]", href);
225 7cf289ca 2004-04-06 devnull free(href);
226 7cf289ca 2004-04-06 devnull }
227 7cf289ca 2004-04-06 devnull break;
228 7cf289ca 2004-04-06 devnull case Iformfieldtag:
229 7cf289ca 2004-04-06 devnull if(aflag)
230 7cf289ca 2004-04-06 devnull renderbytes(t, "[formfield]");
231 7cf289ca 2004-04-06 devnull break;
232 7cf289ca 2004-04-06 devnull case Itabletag:
233 7cf289ca 2004-04-06 devnull ita = (Itable*)il;
234 7cf289ca 2004-04-06 devnull tab = ita->table;
235 7cf289ca 2004-04-06 devnull for(cell=tab->cells; cell!=nil; cell=cell->next){
236 7cf289ca 2004-04-06 devnull render(u, t, cell->content, curanchor);
237 7cf289ca 2004-04-06 devnull }
238 7cf289ca 2004-04-06 devnull if(t->n>0 && t->b[t->n-1]!='\n')
239 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
240 7cf289ca 2004-04-06 devnull break;
241 7cf289ca 2004-04-06 devnull case Ifloattag:
242 7cf289ca 2004-04-06 devnull ifl = (Ifloat*)il;
243 7cf289ca 2004-04-06 devnull render(u, t, ifl->item, curanchor);
244 7cf289ca 2004-04-06 devnull break;
245 7cf289ca 2004-04-06 devnull case Ispacertag:
246 7cf289ca 2004-04-06 devnull is = (Ispacer*)il;
247 7cf289ca 2004-04-06 devnull if(is->spkind != ISPnull)
248 7cf289ca 2004-04-06 devnull renderbytes(t, " ");
249 7cf289ca 2004-04-06 devnull break;
250 7cf289ca 2004-04-06 devnull default:
251 7cf289ca 2004-04-06 devnull error("unknown item tag %d\n", il->tag);
252 7cf289ca 2004-04-06 devnull }
253 7cf289ca 2004-04-06 devnull if(il->anchorid != 0 && il->anchorid!=curanchor){
254 7cf289ca 2004-04-06 devnull for(a=u->docinfo->anchors; a!=nil; a=a->next)
255 7cf289ca 2004-04-06 devnull if(aflag && a->index == il->anchorid){
256 7cf289ca 2004-04-06 devnull href = fullurl(u, a->href);
257 7cf289ca 2004-04-06 devnull renderbytes(t, "[%s]", href);
258 7cf289ca 2004-04-06 devnull free(href);
259 7cf289ca 2004-04-06 devnull break;
260 7cf289ca 2004-04-06 devnull }
261 7cf289ca 2004-04-06 devnull curanchor = il->anchorid;
262 7cf289ca 2004-04-06 devnull }
263 7cf289ca 2004-04-06 devnull }
264 7cf289ca 2004-04-06 devnull if(t->n>0 && t->b[t->n-1]!='\n')
265 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
266 7cf289ca 2004-04-06 devnull }
267 7cf289ca 2004-04-06 devnull
268 7cf289ca 2004-04-06 devnull void
269 7cf289ca 2004-04-06 devnull rerender(URLwin *u)
270 7cf289ca 2004-04-06 devnull {
271 7cf289ca 2004-04-06 devnull Bytes *t;
272 7cf289ca 2004-04-06 devnull
273 7cf289ca 2004-04-06 devnull t = emalloc(sizeof(Bytes));
274 7cf289ca 2004-04-06 devnull
275 7cf289ca 2004-04-06 devnull render(u, t, u->items, 0);
276 7cf289ca 2004-04-06 devnull
277 7cf289ca 2004-04-06 devnull if(t->n)
278 7cf289ca 2004-04-06 devnull write(u->outfd, (char*)t->b, t->n);
279 7cf289ca 2004-04-06 devnull free(t->b);
280 7cf289ca 2004-04-06 devnull free(t);
281 7cf289ca 2004-04-06 devnull }
282 7cf289ca 2004-04-06 devnull
283 7cf289ca 2004-04-06 devnull /*
284 7cf289ca 2004-04-06 devnull * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
285 7cf289ca 2004-04-06 devnull * of the document (cistrstr only looks at first somewhat bytes).
286 7cf289ca 2004-04-06 devnull */
287 7cf289ca 2004-04-06 devnull int
288 7cf289ca 2004-04-06 devnull charset(char *s)
289 7cf289ca 2004-04-06 devnull {
290 7cf289ca 2004-04-06 devnull char *meta, *emeta, *charset;
291 7cf289ca 2004-04-06 devnull
292 7cf289ca 2004-04-06 devnull if(defcharset == 0)
293 7cf289ca 2004-04-06 devnull defcharset = ISO_8859_1;
294 7cf289ca 2004-04-06 devnull meta = cistrstr(s, "<meta");
295 7cf289ca 2004-04-06 devnull if(meta == nil)
296 7cf289ca 2004-04-06 devnull return defcharset;
297 7cf289ca 2004-04-06 devnull for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
298 7cf289ca 2004-04-06 devnull ;
299 7cf289ca 2004-04-06 devnull charset = cistrstr(s, "charset=");
300 7cf289ca 2004-04-06 devnull if(charset == nil)
301 7cf289ca 2004-04-06 devnull return defcharset;
302 7cf289ca 2004-04-06 devnull charset += 8;
303 7cf289ca 2004-04-06 devnull if(*charset == '"')
304 7cf289ca 2004-04-06 devnull charset++;
305 7cf289ca 2004-04-06 devnull if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
306 7cf289ca 2004-04-06 devnull return UTF_8;
307 7cf289ca 2004-04-06 devnull return defcharset;
308 7cf289ca 2004-04-06 devnull }
309 7cf289ca 2004-04-06 devnull
310 7cf289ca 2004-04-06 devnull void
311 7cf289ca 2004-04-06 devnull rendertext(URLwin *u, Bytes *b)
312 7cf289ca 2004-04-06 devnull {
313 7cf289ca 2004-04-06 devnull Rune *rurl;
314 7cf289ca 2004-04-06 devnull
315 7cf289ca 2004-04-06 devnull rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
316 7cf289ca 2004-04-06 devnull u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
317 cbeb0b26 2006-04-01 devnull /* free(rurl); */
318 7cf289ca 2004-04-06 devnull
319 7cf289ca 2004-04-06 devnull rerender(u);
320 7cf289ca 2004-04-06 devnull }
321 7cf289ca 2004-04-06 devnull
322 7cf289ca 2004-04-06 devnull
323 7cf289ca 2004-04-06 devnull void
324 7cf289ca 2004-04-06 devnull freeurlwin(URLwin *u)
325 7cf289ca 2004-04-06 devnull {
326 7cf289ca 2004-04-06 devnull freeitems(u->items);
327 7cf289ca 2004-04-06 devnull u->items = nil;
328 7cf289ca 2004-04-06 devnull freedocinfo(u->docinfo);
329 7cf289ca 2004-04-06 devnull u->docinfo = nil;
330 7cf289ca 2004-04-06 devnull free(u);
331 7cf289ca 2004-04-06 devnull }