Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include "spam.h"
/*
 * Sizing constants used while reading and scanning a message.
 */
enum {
	Quanta	= 8*1024,	/* bytes requested from Bread per pass in readmsg */
	Minbody	= 6000,		/* readmsg tries to have at least this much body */
	HdrMax	= 15		/* size of endofhdr's header-keyword scratch buffer */
};
/*
 * A fixed string and its length; isword() compares the first
 * n bytes of string against candidate text.
 */
typedef struct word {
	char	*string;	/* text to match; 0 terminates a table */
	int	n;		/* number of bytes of string to compare */
} Word;

/*
 * An action name from the pattern file and the value
 * findkey() returns for it.
 */
typedef struct keyword {
	char	*string;	/* keyword text; 0 terminates the table */
	int	value;		/* action code associated with the keyword */
} Keyword;
26 Word htmlcmds[] =
27 {
28 "html", 4,
29 "!doctype html", 13,
30 0,
32 };
34 Word hrefs[] =
35 {
36 "a href=", 7,
37 "a title=", 8,
38 "a target=", 9,
39 "base href=", 10,
40 "img src=", 8,
41 "img border=", 11,
42 "form action=", 12,
43 "!--", 3,
44 0,
46 };
48 /*
49 * RFC822 header keywords to look for for fractured header.
50 * all lengths must be less than HdrMax defined above.
51 */
52 Word hdrwords[] =
53 {
54 "cc:", 3,
55 "bcc:", 4,
56 "to:", 3,
57 0, 0,
59 };
61 Keyword keywords[] =
62 {
63 "header", HoldHeader,
64 "line", SaveLine,
65 "hold", Hold,
66 "dump", Dump,
67 "loff", Lineoff,
68 0, Nactions
69 };
71 Patterns patterns[] = {
72 [Dump] { "DUMP:", 0, 0 },
73 [HoldHeader] { "HEADER:", 0, 0 },
74 [Hold] { "HOLD:", 0, 0 },
75 [SaveLine] { "LINE:", 0, 0 },
76 [Lineoff] { "LINEOFF:", 0, 0 },
77 [Nactions] { 0, 0, 0 }
78 };
80 static char* endofhdr(char*, char*);
81 static int escape(char**);
82 static int extract(char*);
83 static int findkey(char*);
84 static int hash(int);
85 static int isword(Word*, char*, int);
86 static void parsealt(Biobuf*, char*, Spat**);
88 /*
89 * The canonicalizer: convert input to canonical representation
90 */
91 char*
92 readmsg(Biobuf *bp, int *hsize, int *bufsize)
93 {
94 char *p, *buf;
95 int n, offset, eoh, bsize, delta;
97 buf = 0;
98 offset = 0;
99 if(bufsize)
100 *bufsize = 0;
101 if(hsize)
102 *hsize = 0;
103 for(;;) {
104 buf = Realloc(buf, offset+Quanta+1);
105 n = Bread(bp, buf+offset, Quanta);
106 if(n < 0){
107 free(buf);
108 return 0;
110 p = buf+offset; /* start of this chunk */
111 offset += n; /* end of this chunk */
112 buf[offset] = 0;
113 if(n == 0){
114 if(offset == 0)
115 return 0;
116 break;
119 if(hsize == 0) /* don't process header */
120 break;
121 if(p != buf && p[-1] == '\n') /* check for EOH across buffer split */
122 p--;
123 p = endofhdr(p, buf+offset);
124 if(p)
125 break;
126 if(offset >= Maxread) /* gargantuan header - just punt*/
128 if(hsize)
129 *hsize = offset;
130 if(bufsize)
131 *bufsize = offset;
132 return buf;
135 eoh = p-buf; /* End of header */
136 bsize = offset - eoh; /* amount of body already read */
138 /* Read at least Minbody bytes of the body */
139 if (bsize < Minbody){
140 delta = Minbody-bsize;
141 buf = Realloc(buf, offset+delta+1);
142 n = Bread(bp, buf+offset, delta);
143 if(n > 0) {
144 offset += n;
145 buf[offset] = 0;
148 if(hsize)
149 *hsize = eoh;
150 if(bufsize)
151 *bufsize = offset;
152 return buf;
155 static int
156 isword(Word *wp, char *text, int len)
158 for(;wp->string; wp++)
159 if(len >= wp->n && strncmp(text, wp->string, wp->n) == 0)
160 return 1;
161 return 0;
164 static char*
165 endofhdr(char *raw, char *end)
167 int i;
168 char *p, *q;
169 char buf[HdrMax];
171 /*
172 * can't use strchr to search for newlines because
173 * there may be embedded NULL's.
174 */
175 for(p = raw; p < end; p++){
176 if(*p != '\n' || p[1] != '\n')
177 continue;
178 p++;
179 for(i = 0, q = p+1; i < sizeof(buf) && *q; q++){
180 buf[i++] = tolower(*q);
181 if(*q == ':' || *q == '\n')
182 break;
184 if(!isword(hdrwords, buf, i))
185 return p+1;
187 return 0;
190 static int
191 htmlmatch(Word *wp, char *text, char *end, int *n)
193 char *cp;
194 int i, c, lastc;
195 char buf[MaxHtml];
197 /*
198 * extract a string up to '>'
199 */
201 i = lastc = 0;
202 cp = text;
203 while (cp < end && i < sizeof(buf)-1){
204 c = *cp++;
205 if(c == '=')
206 c = escape(&cp);
207 switch(c){
208 case 0:
209 case '\r':
210 continue;
211 case '>':
212 goto out;
213 case '\n':
214 case ' ':
215 case '\t':
216 if(lastc == ' ')
217 continue;
218 c = ' ';
219 break;
220 default:
221 c = tolower(c);
222 break;
224 buf[i++] = lastc = c;
226 out:
227 buf[i] = 0;
228 if(n)
229 *n = cp-text;
230 return isword(wp, buf, i);
/*
 * Decode the sequence following an '=' that the caller has already
 * consumed; *msg points just past the '='.
 *
 *	=<newline>x	soft line break: yields x, consumes both bytes
 *	=2E =2F =20	yield '.', '/', ' ' and consume two bytes
 *	=3D		yields '=' and consumes two bytes
 *	anything else	yields '=' and consumes nothing
 *
 * The hex letter is matched without regard to case.
 * *msg is advanced past whatever was consumed.
 */
static int
escape(char **msg)
{
	char *cur;
	int ch;

	cur = *msg;
	switch(*cur){
	case '\n':
		ch = cur[1];
		cur += 2;
		break;
	case '2':
		switch(tolower(cur[1])){
		case 'e':
			ch = '.';
			cur += 2;
			break;
		case 'f':
			ch = '/';
			cur += 2;
			break;
		case '0':
			ch = ' ';
			cur += 2;
			break;
		default:
			ch = '=';
			break;
		}
		break;
	case '3':
		if(tolower(cur[1]) == 'd')
			cur += 2;
		ch = '=';
		break;
	default:
		ch = '=';
		break;
	}
	*msg = cur;
	return ch;
}
269 static int
270 htmlchk(char **msg, char *end)
272 int n;
273 char *p;
275 static int ishtml;
277 p = *msg;
278 if(ishtml == 0){
279 ishtml = htmlmatch(htmlcmds, p, end, &n);
281 /* If not an HTML keyword, check if it's
282 * an HTML comment (<!comment>). if so,
283 * skip over it; otherwise copy it in.
284 */
285 if(ishtml == 0 && *p != '!') /* not comment */
286 return '<'; /* copy it */
288 } else if(htmlmatch(hrefs, p, end, &n)) /* if special HTML string */
289 return '<'; /* copy it */
291 /*
292 * this is an uninteresting HTML command; skip over it.
293 */
294 p += n;
295 *msg = p+1;
296 return *p;
299 /*
300 * decode a base 64 encode body
301 */
302 void
303 conv64(char *msg, char *end, char *buf, int bufsize)
305 int len, i;
306 char *cp;
308 len = end - msg;
309 i = (len*3)/4+1; /* room for max chars + null */
310 cp = Malloc(i);
311 len = dec64((uchar*)cp, i, msg, len);
312 convert(cp, cp+len, buf, bufsize, 1);
313 free(cp);
/*
 * Copy [msg, end) into buf in canonical form: NULs and carriage
 * returns dropped, runs of white space collapsed to a single blank,
 * letters lower-cased.  When isbody is set, HTML tags are filtered
 * through htmlchk() and "=xx" sequences decoded by escape().
 *
 * At most bufsize characters are stored, followed by a terminating
 * NUL, so buf needs room for bufsize+1 bytes.
 *
 * Returns 1 if, with isbody clear, the text contains
 * "content-transfer-encoding: base64" (matched without regard to
 * case after the first letter); 0 otherwise.
 */
int
convert(char *msg, char *end, char *buf, int bufsize, int isbody)
{
	char *mark;
	int ch, prev, isb64;

	isb64 = 0;
	prev = 0;
	while(msg < end && bufsize > 0){
		ch = *msg++;

		/*
		 * In the body only, try to strip most HTML and
		 * replace certain MIME escape sequences with the character
		 */
		if(isbody){
			do{
				mark = msg;
				if(ch == '<')
					ch = htmlchk(&msg, end);
				if(ch == '=')
					ch = escape(&msg);
			}while(mark != msg && mark < end);
		}

		if(ch == 0 || ch == '\r')
			continue;
		if(ch == '\t' || ch == ' ' || ch == '\n'){
			if(prev == ' ')
				continue;
			ch = ' ';
		}else if(ch == 'C' || ch == 'c'){
			/* check for MIME base 64 encoding in header */
			if(isbody == 0
			&& msg < end-32 && *msg == 'o' && msg[1] == 'n'
			&& cistrncmp(msg+2, "tent-transfer-encoding: base64", 30) == 0)
				isb64 = 1;
			ch = 'c';
		}else
			ch = tolower(ch);

		*buf++ = ch;
		prev = ch;
		bufsize--;
	}
	*buf = 0;
	return isb64;
}
/*
 *	The pattern parser: build data structures from the pattern file
 */

/*
 * Bucket index for a string pattern: the low seven bits of its
 * first character.
 */
static int
hash(int c)
{
	return c & 0x7F;
}
382 static int
383 findkey(char *val)
385 Keyword *kp;
387 for(kp = keywords; kp->string; kp++)
388 if(strcmp(val, kp->string) == 0)
389 break;
390 return kp->value;
/* True when c is a blank or a tab.  c may be evaluated twice. */
#define	whitespace(c)	((c) == ' ' || (c) == '\t')
395 void
396 parsepats(Biobuf *bp)
398 Pattern *p, *new;
399 char *cp, *qp;
400 int type, action, n, h;
401 Spat *spat;
403 for(;;){
404 cp = Brdline(bp, '\n');
405 if(cp == 0)
406 break;
407 cp[Blinelen(bp)-1] = 0;
408 while(*cp == ' ' || *cp == '\t')
409 cp++;
410 if(*cp == '#' || *cp == 0)
411 continue;
412 type = regexp;
413 if(*cp == '*'){
414 type = string;
415 cp++;
417 qp = strchr(cp, ':');
418 if(qp == 0)
419 continue;
420 *qp = 0;
421 if(debug)
422 fprint(2, "action = %s\n", cp);
423 action = findkey(cp);
424 if(action >= Nactions)
425 continue;
426 cp = qp+1;
427 n = extract(cp);
428 if(n <= 0 || *cp == 0)
429 continue;
431 qp = strstr(cp, "~~");
432 if(qp){
433 *qp = 0;
434 n = strlen(cp);
436 if(debug)
437 fprint(2, " Pattern: `%s'\n", cp);
439 /* Hook regexps into a chain */
440 if(type == regexp) {
441 new = Malloc(sizeof(Pattern));
442 new->action = action;
443 new->pat = regcomp(cp);
444 if(new->pat == 0){
445 free(new);
446 continue;
448 new->type = regexp;
449 new->alt = 0;
450 new->next = 0;
452 if(qp)
453 parsealt(bp, qp+2, &new->alt);
455 new->next = patterns[action].regexps;
456 patterns[action].regexps = new;
457 continue;
460 /* not a Regexp - hook strings into Pattern hash chain */
461 spat = Malloc(sizeof(*spat));
462 spat->next = 0;
463 spat->alt = 0;
464 spat->len = n;
465 spat->string = Malloc(n+1);
466 spat->c1 = cp[1];
467 strcpy(spat->string, cp);
469 if(qp)
470 parsealt(bp, qp+2, &spat->alt);
472 p = patterns[action].strings;
473 if(p == 0) {
474 p = Malloc(sizeof(Pattern));
475 memset(p, 0, sizeof(*p));
476 p->action = action;
477 p->type = string;
478 patterns[action].strings = p;
480 h = hash(*spat->string);
481 spat->next = p->spat[h];
482 p->spat[h] = spat;
486 static void
487 parsealt(Biobuf *bp, char *cp, Spat** head)
489 char *p;
490 Spat *alt;
492 while(cp){
493 if(*cp == 0){ /*escaped newline*/
494 do{
495 cp = Brdline(bp, '\n');
496 if(cp == 0)
497 return;
498 cp[Blinelen(bp)-1] = 0;
499 } while(extract(cp) <= 0 || *cp == 0);
502 p = cp;
503 cp = strstr(p, "~~");
504 if(cp){
505 *cp = 0;
506 cp += 2;
508 if(strlen(p)){
509 alt = Malloc(sizeof(*alt));
510 alt->string = strdup(p);
511 alt->next = *head;
512 *head = alt;
/*
 * Clean up pattern text in place and return its new length.
 *
 * Leading blanks and tabs are dropped, everything from an unquoted
 * '#' on is discarded, upper-case ASCII letters are lower-cased,
 * and double quotes are removed while their contents are kept
 * (inside quotes, \" stands for a literal quote and '#' is ordinary).
 * Trailing blanks and tabs are trimmed, but never back into text
 * that came from a quoted string.
 */
static int
extract(char *cp)
{
	char *src, *dst, *keep;
	int ch;

	src = cp;
	dst = cp;
	keep = cp;		/* trimming may not go below this point */
	while(*src == ' ' || *src == '\t')
		src++;
	for(;;){
		ch = *src++;
		if(ch == 0 || ch == '#')
			break;
		if(ch != '"'){
			if(ch >= 'A' && ch <= 'Z')
				ch += 'a'-'A';
			*dst++ = ch;
			continue;
		}

		/* quoted string: copy up to the closing quote */
		while(*src != 0 && *src != '"'){
			if(src[0] == '\\' && src[1] == '"')
				src++;
			if(*src >= 'A' && *src <= 'Z')
				*dst++ = *src++ + ('a'-'A');
			else
				*dst++ = *src++;
		}
		if(*src != 0)
			src++;
		keep = dst;	/* never back up over a quoted string */
	}
	while(dst > keep && (dst[-1] == ' ' || dst[-1] == '\t'))
		dst--;
	*dst = 0;
	return dst-cp;
}
553 /*
554 * The matching engine: compare canonical input to pattern structures
555 */
557 static Spat*
558 isalt(char *message, Spat *alt)
560 while(alt) {
561 if(*cmd)
562 if(message != cmd && strstr(cmd, alt->string))
563 break;
564 if(message != header+1 && strstr(header+1, alt->string))
565 break;
566 if(strstr(message, alt->string))
567 break;
568 alt = alt->next;
570 return alt;
573 int
574 matchpat(Pattern *p, char *message, Resub *m)
576 Spat *spat;
577 char *s;
578 int c, c1;
580 if(p->type == string){
581 c1 = *message;
582 for(s=message; c=c1; s++){
583 c1 = s[1];
584 for(spat=p->spat[hash(c)]; spat; spat=spat->next){
585 if(c1 == spat->c1)
586 if(memcmp(s, spat->string, spat->len) == 0)
587 if(!isalt(message, spat->alt)){
588 m->s.sp = s;
589 m->e.ep = s + spat->len;
590 return 1;
594 return 0;
596 m->s.sp = m->e.ep = 0;
597 if(regexec(p->pat, message, m, 1) == 0)
598 return 0;
599 if(isalt(message, p->alt))
600 return 0;
601 return 1;
605 void
606 xprint(int fd, char *type, Resub *m)
608 char *p, *q;
609 int i;
611 if(m->s.sp == 0 || m->e.ep == 0)
612 return;
614 /* back up approx 30 characters to whitespace */
615 for(p = m->s.sp, i = 0; *p && i < 30; i++, p--)
617 while(*p && *p != ' ')
618 p--;
619 p++;
621 /* grab about 30 more chars beyond the end of the match */
622 for(q = m->e.ep, i = 0; *q && i < 30; i++, q++)
624 while(*q && *q != ' ')
625 q++;
627 fprint(fd, "%s %.*s~%.*s~%.*s\n", type, (int)(m->s.sp-p), p, (int)(m->e.ep-m->s.sp), m->s.sp, (int)(q-m->e.ep), m->e.ep);
/* Entry value in t64d for bytes outside the base-64 alphabet. */
enum {
	INVAL = 0xFF
};
/*
 * Base-64 decoding table, indexed by byte value.  'A'-'Z' map to
 * 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62 and '/' to 63;
 * every other byte, including '=', maps to INVAL.  Presumably
 * consulted by the dec64 decoder that conv64 calls -- the user of
 * the table is not visible in this part of the file.
 */
634 static uchar t64d[256] = {
635 /*00 */ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
636 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
637 /*10*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
638 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
639 /*20*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
640 INVAL, INVAL, INVAL, 62, INVAL, INVAL, INVAL, 63,
641 /*30*/ 52, 53, 54, 55, 56, 57, 58, 59,
642 60, 61, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
643 /*40*/ INVAL, 0, 1, 2, 3, 4, 5, 6,
644 7, 8, 9, 10, 11, 12, 13, 14,
645 /*50*/ 15, 16, 17, 18, 19, 20, 21, 22,
646 23, 24, 25, INVAL, INVAL, INVAL, INVAL, INVAL,
647 /*60*/ INVAL, 26, 27, 28, 29, 30, 31, 32,
648 33, 34, 35, 36, 37, 38, 39, 40,
649 /*70*/ 41, 42, 43, 44, 45, 46, 47, 48,
650 49, 50, 51, INVAL, INVAL, INVAL, INVAL, INVAL,
651 /*80*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
652 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
653 /*90*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
654 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
655 /*A0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
656 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
657 /*B0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
658 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
659 /*C0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
660 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
661 /*D0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
662 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
663 /*E0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
664 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
665 /*F0*/ INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL,
666 INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL, INVAL
667 };