1 7cf289ca 2004-04-06 devnull #include <u.h>
2 7cf289ca 2004-04-06 devnull #include <libc.h>
3 7cf289ca 2004-04-06 devnull #include <bio.h>
4 7cf289ca 2004-04-06 devnull #include <draw.h>
5 7cf289ca 2004-04-06 devnull #include <regexp.h>
6 7cf289ca 2004-04-06 devnull #include <html.h>
7 7cf289ca 2004-04-06 devnull #include <ctype.h>
8 7cf289ca 2004-04-06 devnull #include "dat.h"
/*
 * Pattern for the leading part of an absolute URL: one of the listed
 * schemes, "://", then one or more name components separated by '.'
 * or ':'.  baseurl() compiles it into urlprog the first time it runs.
 */
10 7cf289ca 2004-04-06 devnull char urlexpr[] = "^(https?|ftp|file|gopher|mailto|news|nntp|telnet|wais|prospero)://([a-zA-Z0-9_@\\-]+([.:][a-zA-Z0-9_@\\-]+)*)";
/* Compiled form of urlexpr; stays nil until baseurl() compiles it. */
11 7cf289ca 2004-04-06 devnull Reprog *urlprog;
/*
 * Word-wrapping state shared by renderrunes() and emitword();
 * render() resets all three on entry.
 */
/* inword: set while renderrunes() has seen the start of a word that has not been emitted yet. */
13 7cf289ca 2004-04-06 devnull int inword = 0;
/* col: running width of the current output line; emitword() adds the rune count of each word. */
14 7cf289ca 2004-04-06 devnull int col = 0;
/* wordi: index in the rune string being scanned at which the pending word starts. */
15 7cf289ca 2004-04-06 devnull int wordi = 0;
/*
 * loadhtml reads everything available on fd into a Bytes buffer and
 * hands it to rendertext() as HTML (u->type is TextHtml).  The URLwin
 * built here writes its output to file descriptor 1 and records a copy
 * of url, a variable that is not declared in the visible part of this
 * file.  Both visible return statements return nil.
 *
 * The embedded line numbers skip here: the return-type line and the
 * declaration of n are not part of this listing.
 *
 * NOTE(review): the empty-file return does not release u, u->url or b,
 * and no visible line frees b or b->b after rendering -- confirm
 * whether that is intentional.
 */
18 7cf289ca 2004-04-06 devnull loadhtml(int fd)
20 7cf289ca 2004-04-06 devnull URLwin *u;
21 7cf289ca 2004-04-06 devnull Bytes *b;
23 7cf289ca 2004-04-06 devnull char buf[4096];
25 7cf289ca 2004-04-06 devnull u = emalloc(sizeof(URLwin));
26 7cf289ca 2004-04-06 devnull u->infd = fd;
27 7cf289ca 2004-04-06 devnull u->outfd = 1;
28 7cf289ca 2004-04-06 devnull u->url = estrdup(url);
29 7cf289ca 2004-04-06 devnull u->type = TextHtml;
/* Accumulate the whole input; the loop also stops on a read error (n < 0), which is not reported. */
31 7cf289ca 2004-04-06 devnull b = emalloc(sizeof(Bytes));
32 7cf289ca 2004-04-06 devnull while((n = read(fd, buf, sizeof buf)) > 0)
33 7cf289ca 2004-04-06 devnull growbytes(b, buf, n);
/* b->b is still nil when nothing was appended (this relies on emalloc returning zeroed memory -- TODO confirm). */
34 7cf289ca 2004-04-06 devnull if(b->b == nil)
35 7cf289ca 2004-04-06 devnull return nil; /* empty file */
/* Parse and write the text, then release the parsed items, the docinfo and u itself. */
36 7cf289ca 2004-04-06 devnull rendertext(u, b);
37 7cf289ca 2004-04-06 devnull freeurlwin(u);
38 7cf289ca 2004-04-06 devnull return nil;
/*
 * runetobyte converts the first n runes of r into a newly allocated
 * string using the %S print verb with precision n.
 * For n == 0 it returns a fresh 1-byte allocation instead (an empty
 * string if emalloc zero-fills -- TODO confirm).
 * A nil result from smprint is reported through error("malloc failed").
 *
 * The return-type line and the declaration of s are not part of this
 * listing.
 */
42 7cf289ca 2004-04-06 devnull runetobyte(Rune *r, int n)
46 7cf289ca 2004-04-06 devnull if(n == 0)
47 7cf289ca 2004-04-06 devnull return emalloc(1);
48 7cf289ca 2004-04-06 devnull s = smprint("%.*S", n, r);
49 7cf289ca 2004-04-06 devnull if(s == nil)
50 7cf289ca 2004-04-06 devnull error("malloc failed");
51 7cf289ca 2004-04-06 devnull return s;
/*
 * closingpunct reports whether c is a punctuation character that
 * closes or trails a word, i.e. one of
 *	. , : ; ' " ) ] } > ! ?
 * emitword() uses it to avoid putting a space in front of such a
 * character.  Returns 1 for those characters and 0 for anything else.
 *
 * c may be a full Rune value (emitword passes r[0]), so the comparison
 * is made on the int itself.  The earlier strchr()-based test converted
 * c to char, which made 0 match the string terminator and made runes
 * whose low byte is a punctuation code (0x12E, 0x222C, ...) look like
 * '.' or ','.
 */
int
closingpunct(int c)
{
	switch(c){
	case '.':
	case ',':
	case ':':
	case ';':
	case '\'':
	case '"':
	case ')':
	case ']':
	case '}':
	case '>':
	case '!':
	case '?':
		return 1;
	}
	return 0;
}
/*
 * emitword appends one word -- the nr runes starting at r -- to b,
 * wrapping the output at the global width.
 *
 * The embedded line numbers skip several times inside this function:
 * the return-type line, the declaration of s, the statement under
 * "if(nr == 0)", the ends of both brace blocks and the line before
 * "inword = 0" are not part of this listing.
 * NOTE(review): no visible line frees s or adjusts col inside the two
 * branches; presumably the omitted lines do -- confirm against the
 * full file.
 */
61 7cf289ca 2004-04-06 devnull emitword(Bytes *b, Rune *r, int nr)
64 7cf289ca 2004-04-06 devnull int space;
66 7cf289ca 2004-04-06 devnull if(nr == 0)
/* UTF-8 text of the word; col is advanced by nr (runes), while strlen(s) bytes are appended. */
68 7cf289ca 2004-04-06 devnull s = smprint("%.*S", nr, r);
/* A separating space is wanted only if b is non-empty, does not already end in white space, and the word does not start with closing punctuation. */
69 7cf289ca 2004-04-06 devnull space = (b->n>0) && !isspace(b->b[b->n-1]) && !closingpunct(r[0]);
/* The word (plus separator) would run past width: start a new line and drop the separator. */
70 7cf289ca 2004-04-06 devnull if(col>0 && col+space+nr > width){
71 7cf289ca 2004-04-06 devnull growbytes(b, "\n", 1);
72 7cf289ca 2004-04-06 devnull space = 0;
/* Otherwise separate the word from the text already on this line. */
75 7cf289ca 2004-04-06 devnull if(space && col>0){
76 7cf289ca 2004-04-06 devnull growbytes(b, " ", 1);
79 7cf289ca 2004-04-06 devnull growbytes(b, s, strlen(s));
80 7cf289ca 2004-04-06 devnull col += nr;
/* The pending word has been written; renderrunes() starts a new one at the next non-separator rune. */
82 7cf289ca 2004-04-06 devnull inword = 0;
/*
 * renderrunes scans the NUL-terminated rune string r, splits it into
 * words at ' ' and '\n', and appends the words to b through emitword().
 * The word boundaries are tracked in the globals inword and wordi.
 *
 * The embedded line numbers skip between the cases of the switch, so
 * the statements that end each case (and one line after the first
 * emitword call) are not part of this listing; the per-case comments
 * below describe only the visible lines.
 */
86 7cf289ca 2004-04-06 devnull renderrunes(Bytes *b, Rune *r)
88 7cf289ca 2004-04-06 devnull int i, n;
90 7cf289ca 2004-04-06 devnull n = runestrlen(r);
91 7cf289ca 2004-04-06 devnull for(i=0; i<n; i++){
92 7cf289ca 2004-04-06 devnull switch(r[i]){
/* Newline: flush the pending word, then add a line break unless b is empty or already ends in two newlines. */
93 7cf289ca 2004-04-06 devnull case '\n':
94 7cf289ca 2004-04-06 devnull if(inword)
95 7cf289ca 2004-04-06 devnull emitword(b, r+wordi, i-wordi);
97 7cf289ca 2004-04-06 devnull if(b->n == 0)
98 7cf289ca 2004-04-06 devnull break; /* don't start with blank lines */
99 7cf289ca 2004-04-06 devnull if(b->n<2 || b->b[b->n-1]!='\n' || b->b[b->n-2]!='\n')
100 7cf289ca 2004-04-06 devnull growbytes(b, "\n", 1);
/* Space: ends the pending word; the space itself is not copied (emitword decides about separators). */
102 7cf289ca 2004-04-06 devnull case ' ':
103 7cf289ca 2004-04-06 devnull if(inword)
104 7cf289ca 2004-04-06 devnull emitword(b, r+wordi, i-wordi);
/* Any other rune belongs to a word; remember where the word started. */
106 7cf289ca 2004-04-06 devnull default:
107 7cf289ca 2004-04-06 devnull if(!inword)
108 7cf289ca 2004-04-06 devnull wordi = i;
109 7cf289ca 2004-04-06 devnull inword = 1;
/* Flush a word that runs up to the end of the string (i == n here). */
113 7cf289ca 2004-04-06 devnull if(inword)
114 7cf289ca 2004-04-06 devnull emitword(b, r+wordi, i-wordi);
118 7cf289ca 2004-04-06 devnull renderbytes(Bytes *b, char *fmt, ...)
120 7cf289ca 2004-04-06 devnull Rune *r;
121 7cf289ca 2004-04-06 devnull va_list arg;
123 7cf289ca 2004-04-06 devnull va_start(arg, fmt);
124 7cf289ca 2004-04-06 devnull r = runevsmprint(fmt, arg);
125 7cf289ca 2004-04-06 devnull va_end(arg);
126 7cf289ca 2004-04-06 devnull renderrunes(b, r);
127 7cf289ca 2004-04-06 devnull free(r);
/*
 * baseurl returns a newly allocated copy of url cut down to its base,
 * or nil when url is nil or does not match urlexpr.
 * urlprog is compiled from urlexpr on the first call; a compile failure
 * is reported through error().
 *
 * rs[0].e.ep - rs[0].s.sp is the length of the text matched by the
 * whole expression (the scheme://host prefix).
 *
 * The embedded line numbers skip between the two truncations below, so
 * one line there (and the return-type line) is not part of this listing.
 * NOTE(review): presumably the missing line is an "else", making the
 * second truncation the fallback when no usable slash exists -- confirm.
 */
131 7cf289ca 2004-04-06 devnull baseurl(char *url)
133 7cf289ca 2004-04-06 devnull char *base, *slash;
134 7cf289ca 2004-04-06 devnull Resub rs[10];
136 7cf289ca 2004-04-06 devnull if(url == nil)
137 7cf289ca 2004-04-06 devnull return nil;
138 7cf289ca 2004-04-06 devnull if(urlprog == nil){
139 7cf289ca 2004-04-06 devnull urlprog = regcomp(urlexpr);
140 7cf289ca 2004-04-06 devnull if(urlprog == nil)
141 7cf289ca 2004-04-06 devnull error("can't compile URL regexp");
143 7cf289ca 2004-04-06 devnull memset(rs, 0, sizeof rs);
144 7cf289ca 2004-04-06 devnull if(regexec(urlprog, url, rs, nelem(rs)) == 0)
145 7cf289ca 2004-04-06 devnull return nil;
146 7cf289ca 2004-04-06 devnull base = estrdup(url);
147 7cf289ca 2004-04-06 devnull slash = strrchr(base, '/');
/* Cut at the last slash only when it lies at or beyond the end of the matched prefix, i.e. it is not one of the slashes in "://". */
148 650deb79 2004-04-08 devnull if(slash!=nil && slash>=&base[rs[0].e.ep-rs[0].s.sp])
149 7cf289ca 2004-04-06 devnull *slash = '\0';
/* Keep just the matched scheme://host prefix. */
151 650deb79 2004-04-08 devnull base[rs[0].e.ep-rs[0].s.sp] = '\0';
152 7cf289ca 2004-04-06 devnull return base;
/*
 * fullurl turns the rune string rhref into a newly allocated byte
 * string, prefixing it with the base of u->url when rhref is not itself
 * an absolute URL (baseurl(href) returns nil) and u->url has a base.
 * A "/" is inserted between base and href unless the base already ends
 * in one or href starts with one.
 * Returns "NULL URL" for a nil rhref and "***unknown***" if no result
 * was built; every return value is a fresh allocation.
 *
 * The embedded line numbers skip between the two assignments to result
 * near the end, so one line there (and the return-type line) is not
 * part of this listing.
 * NOTE(review): presumably the missing line is an "else" -- confirm.
 * NOTE(review): href comes from runetobyte() and is not freed by any
 * visible line; eappend is also called with string literals as its
 * second and third arguments, so it cannot be what releases href.
 */
156 7cf289ca 2004-04-06 devnull fullurl(URLwin *u, Rune *rhref)
158 7cf289ca 2004-04-06 devnull char *base, *href, *hrefbase;
159 7cf289ca 2004-04-06 devnull char *result;
161 7cf289ca 2004-04-06 devnull if(rhref == nil)
162 7cf289ca 2004-04-06 devnull return estrdup("NULL URL");
163 7cf289ca 2004-04-06 devnull href = runetobyte(rhref, runestrlen(rhref));
/* hrefbase is non-nil exactly when href already matches the absolute-URL pattern. */
164 7cf289ca 2004-04-06 devnull hrefbase = baseurl(href);
165 7cf289ca 2004-04-06 devnull result = nil;
/* Relative link: start the result with the base of the document's own URL. */
166 7cf289ca 2004-04-06 devnull if(hrefbase==nil && (base = baseurl(u->url))!=nil){
167 7cf289ca 2004-04-06 devnull result = estrdup(base);
168 7cf289ca 2004-04-06 devnull if(base[strlen(base)-1]!='/' && (href==nil || href[0]!='/'))
169 7cf289ca 2004-04-06 devnull result = eappend(result, "/", "");
170 7cf289ca 2004-04-06 devnull free(base);
172 7cf289ca 2004-04-06 devnull if(href){
173 7cf289ca 2004-04-06 devnull if(result)
174 7cf289ca 2004-04-06 devnull result = eappend(result, "", href);
176 7cf289ca 2004-04-06 devnull result = estrdup(href);
178 7cf289ca 2004-04-06 devnull free(hrefbase);
179 7cf289ca 2004-04-06 devnull if(result == nil)
180 7cf289ca 2004-04-06 devnull return estrdup("***unknown***");
181 7cf289ca 2004-04-06 devnull return result;
/*
 * render walks the item list starting at items and appends a plain-text
 * rendering of each item to t.
 *
 * u		supplies the document URL (for fullurl) and the anchor list in u->docinfo
 * t		output buffer
 * items	first item of the list to render
 * curanchor	anchor id considered already announced; an item whose
 *		non-zero anchorid differs from it triggers the anchor lookup
 *
 * The word-wrapping globals are reset on entry, which also happens on
 * the recursive calls made for table cells and floats.
 *
 * The embedded line numbers skip after every case of the switch and
 * around the anchor handling, so the statements that end each case and
 * close the blocks are not part of this listing; the comments below
 * describe only the visible lines.
 */
185 7cf289ca 2004-04-06 devnull render(URLwin *u, Bytes *t, Item *items, int curanchor)
187 7cf289ca 2004-04-06 devnull Item *il;
188 7cf289ca 2004-04-06 devnull Itext *it;
189 7cf289ca 2004-04-06 devnull Ifloat *ifl;
190 7cf289ca 2004-04-06 devnull Ispacer *is;
191 7cf289ca 2004-04-06 devnull Itable *ita;
192 7cf289ca 2004-04-06 devnull Iimage *im;
193 7cf289ca 2004-04-06 devnull Anchor *a;
194 7cf289ca 2004-04-06 devnull Table *tab;
195 7cf289ca 2004-04-06 devnull Tablecell *cell;
196 7cf289ca 2004-04-06 devnull char *href;
198 7cf289ca 2004-04-06 devnull inword = 0;
199 7cf289ca 2004-04-06 devnull col = 0;
200 7cf289ca 2004-04-06 devnull wordi = 0;
202 7cf289ca 2004-04-06 devnull for(il=items; il!=nil; il=il->next){
/* Each of the two break flags on the item contributes one "\n" before the item. */
203 7cf289ca 2004-04-06 devnull if(il->state & IFbrk)
204 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
205 7cf289ca 2004-04-06 devnull if(il->state & IFbrksp)
206 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
208 7cf289ca 2004-04-06 devnull switch(il->tag){
/* Text: word-wrap the item's rune string. */
209 7cf289ca 2004-04-06 devnull case Itexttag:
210 7cf289ca 2004-04-06 devnull it = (Itext*)il;
211 7cf289ca 2004-04-06 devnull renderrunes(t, it->s);
/* Rule: a line of '=' on a line of its own. */
213 7cf289ca 2004-04-06 devnull case Iruletag:
214 7cf289ca 2004-04-06 devnull if(t->n>0 && t->b[t->n-1]!='\n')
215 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
216 7cf289ca 2004-04-06 devnull renderbytes(t, "=======\n");
/* Image: "[image URL]" for images with a source; the statement controlled by "if(!aflag)" is not shown in this listing. */
218 7cf289ca 2004-04-06 devnull case Iimagetag:
219 7cf289ca 2004-04-06 devnull if(!aflag)
221 7cf289ca 2004-04-06 devnull im = (Iimage*)il;
222 7cf289ca 2004-04-06 devnull if(im->imsrc){
223 7cf289ca 2004-04-06 devnull href = fullurl(u, im->imsrc);
224 7cf289ca 2004-04-06 devnull renderbytes(t, "[image %s]", href);
225 7cf289ca 2004-04-06 devnull free(href);
/* Form field: only a placeholder, and only when aflag is set. */
228 7cf289ca 2004-04-06 devnull case Iformfieldtag:
229 7cf289ca 2004-04-06 devnull if(aflag)
230 7cf289ca 2004-04-06 devnull renderbytes(t, "[formfield]");
/* Table: render the contents of every cell recursively, in list order; a newline is added when the output does not already end with one. */
232 7cf289ca 2004-04-06 devnull case Itabletag:
233 7cf289ca 2004-04-06 devnull ita = (Itable*)il;
234 7cf289ca 2004-04-06 devnull tab = ita->table;
235 7cf289ca 2004-04-06 devnull for(cell=tab->cells; cell!=nil; cell=cell->next){
236 7cf289ca 2004-04-06 devnull render(u, t, cell->content, curanchor);
238 7cf289ca 2004-04-06 devnull if(t->n>0 && t->b[t->n-1]!='\n')
239 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
/* Float: render the wrapped item list recursively. */
241 7cf289ca 2004-04-06 devnull case Ifloattag:
242 7cf289ca 2004-04-06 devnull ifl = (Ifloat*)il;
243 7cf289ca 2004-04-06 devnull render(u, t, ifl->item, curanchor);
/* Spacer: any kind other than ISPnull becomes a single space. */
245 7cf289ca 2004-04-06 devnull case Ispacertag:
246 7cf289ca 2004-04-06 devnull is = (Ispacer*)il;
247 7cf289ca 2004-04-06 devnull if(is->spkind != ISPnull)
248 7cf289ca 2004-04-06 devnull renderbytes(t, " ");
250 7cf289ca 2004-04-06 devnull default:
251 7cf289ca 2004-04-06 devnull error("unknown item tag %d\n", il->tag);
/* The item belongs to an anchor other than curanchor: with aflag set, print the target of the matching anchor as "[URL]". */
253 7cf289ca 2004-04-06 devnull if(il->anchorid != 0 && il->anchorid!=curanchor){
254 7cf289ca 2004-04-06 devnull for(a=u->docinfo->anchors; a!=nil; a=a->next)
255 7cf289ca 2004-04-06 devnull if(aflag && a->index == il->anchorid){
256 7cf289ca 2004-04-06 devnull href = fullurl(u, a->href);
257 7cf289ca 2004-04-06 devnull renderbytes(t, "[%s]", href);
258 7cf289ca 2004-04-06 devnull free(href);
261 7cf289ca 2004-04-06 devnull curanchor = il->anchorid;
/* Make sure non-empty output ends with a newline. */
264 7cf289ca 2004-04-06 devnull if(t->n>0 && t->b[t->n-1]!='\n')
265 7cf289ca 2004-04-06 devnull renderbytes(t, "\n");
/*
 * rerender renders u->items into a temporary Bytes buffer, writes the
 * resulting text to u->outfd in a single write() call when it is
 * non-empty, and frees the buffer.
 * NOTE(review): the return value of write() is not checked, so a short
 * or failed write goes unnoticed.
 * The return-type line is not part of this listing.
 */
269 7cf289ca 2004-04-06 devnull rerender(URLwin *u)
271 7cf289ca 2004-04-06 devnull Bytes *t;
273 7cf289ca 2004-04-06 devnull t = emalloc(sizeof(Bytes));
/* Start with no current anchor (curanchor 0). */
275 7cf289ca 2004-04-06 devnull render(u, t, u->items, 0);
277 7cf289ca 2004-04-06 devnull if(t->n)
278 7cf289ca 2004-04-06 devnull write(u->outfd, (char*)t->b, t->n);
279 7cf289ca 2004-04-06 devnull free(t->b);
280 7cf289ca 2004-04-06 devnull free(t);
284 7cf289ca 2004-04-06 devnull * Somewhat of a hack. Not a full parse, just looks for strings in the beginning
285 7cf289ca 2004-04-06 devnull * of the document (cistrstr only looks at first somewhat bytes).
288 7cf289ca 2004-04-06 devnull charset(char *s)
290 7cf289ca 2004-04-06 devnull char *meta, *emeta, *charset;
292 7cf289ca 2004-04-06 devnull if(defcharset == 0)
293 7cf289ca 2004-04-06 devnull defcharset = ISO_8859_1;
294 7cf289ca 2004-04-06 devnull meta = cistrstr(s, "<meta");
295 7cf289ca 2004-04-06 devnull if(meta == nil)
296 7cf289ca 2004-04-06 devnull return defcharset;
297 7cf289ca 2004-04-06 devnull for(emeta=meta; *emeta!='>' && *emeta!='\0'; emeta++)
299 7cf289ca 2004-04-06 devnull charset = cistrstr(s, "charset=");
300 7cf289ca 2004-04-06 devnull if(charset == nil)
301 7cf289ca 2004-04-06 devnull return defcharset;
302 7cf289ca 2004-04-06 devnull charset += 8;
303 7cf289ca 2004-04-06 devnull if(*charset == '"')
304 7cf289ca 2004-04-06 devnull charset++;
305 7cf289ca 2004-04-06 devnull if(cistrncmp(charset, "utf-8", 5) || cistrncmp(charset, "utf8", 4))
306 7cf289ca 2004-04-06 devnull return UTF_8;
307 7cf289ca 2004-04-06 devnull return defcharset;
/*
 * rendertext parses the document text in b and writes its rendering:
 * it converts u->url to a rune string (as ISO_8859_1), calls parsehtml
 * with the character set guessed by charset(), stores the resulting
 * item list in u->items and the document info in u->docinfo, and then
 * calls rerender(u).
 * The return-type line is not part of this listing.
 */
311 7cf289ca 2004-04-06 devnull rendertext(URLwin *u, Bytes *b)
313 7cf289ca 2004-04-06 devnull Rune *rurl;
315 7cf289ca 2004-04-06 devnull rurl = toStr((uchar*)u->url, strlen(u->url), ISO_8859_1);
/* charset() treats b->b as a NUL-terminated string -- presumably growbytes keeps it terminated; verify. */
316 7cf289ca 2004-04-06 devnull u->items = parsehtml(b->b, b->n, rurl, u->type, charset((char*)b->b), &u->docinfo);
/* The free of rurl is deliberately left disabled; the reason is not shown here (NOTE(review): possibly parsehtml keeps the pointer -- confirm). */
317 cbeb0b26 2006-04-01 devnull /* free(rurl); */
319 7cf289ca 2004-04-06 devnull rerender(u);
/*
 * freeurlwin releases the parsed item list and the document info held
 * by u, clears both fields, and then frees u itself.
 * NOTE(review): u->url is not freed here although loadhtml() fills it
 * with estrdup() -- confirm who owns that string.
 * The return-type line is not part of this listing.
 */
324 7cf289ca 2004-04-06 devnull freeurlwin(URLwin *u)
326 7cf289ca 2004-04-06 devnull freeitems(u->items);
327 7cf289ca 2004-04-06 devnull u->items = nil;
328 7cf289ca 2004-04-06 devnull freedocinfo(u->docinfo);
329 7cf289ca 2004-04-06 devnull u->docinfo = nil;
330 7cf289ca 2004-04-06 devnull free(u);