Blob
1 #include <u.h>2 #include <libc.h>3 #include <bio.h>4 #include <regexp.h>5 #include "dfa.h"7 /***8 * Regular expression for matching.9 */11 char *ignore[] =12 {13 /* HTML that isn't A, IMG, or FONT */14 /* Must have a space somewhere to avoid catching <email@address> */15 "<[ \n\r]*("16 "[^aif]|"17 "a[^> \t\r\n]|"18 "i[^mM \t\r\n]|"19 "im[^gG \t\r\n]|"20 "img[^> \t\r\n]|"21 "f[^oO \t\r\n]|"22 "fo[^Nn \t\r\n]|"23 "fon[^tT \t\r\n]|"24 "font[^> \r\t\n]"25 ")[^>]*[ \t\n\r][^>]*>",26 "<[ \n\r]*("27 "i|im|f|fo|fon"28 ")[ \t\r\n][^>]*>",30 /* ignore html comments */31 "<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",33 /* random mail strings */34 "^message-id:.*\n([ ].*\n)*",35 "^in-reply-to:.*\n([ ].*\n)*",36 "^references:.*\n([ ].*\n)*",37 "^date:.*\n([ ].*\n)*",38 "^delivery-date:.*\n([ ].*\n)*",39 "e?smtp id .*",40 "^ id.*",41 "boundary=.*",42 "name=\"",43 "filename=\"",44 "news:<[^>]+>",45 "^--[^ ]*$",47 /* base64 encoding */48 "^[0-9a-zA-Z+\\-=/]+$",50 /* uu encoding */51 "^[!-Z]+$",53 /* little things */54 ".",55 "\n",56 };58 char *keywords[] =59 {60 "([a-zA-Z'`$!¡-]|[0-9]([.,][0-9])*)+",61 };63 int debug;65 Dreprog*66 dregcomp(char *buf)67 {68 Reprog *r;69 Dreprog *d;71 if(debug)72 print(">>> '%s'\n", buf);74 r = regcomp(buf);75 if(r == nil)76 sysfatal("regcomp");77 d = dregcvt(r);78 if(d == nil)79 sysfatal("dregcomp");80 free(r);81 return d;82 }84 char*85 strcpycase(char *d, char *s)86 {87 int cc, esc;89 cc = 0;90 esc = 0;91 while(*s){92 if(*s == '[')93 cc++;94 if(*s == ']')95 cc--;96 if(!cc && 'a' <= *s && *s <= 'z'){97 *d++ = '[';98 *d++ = *s;99 *d++ = *s+'A'-'a';100 *d++ = ']';101 }else102 *d++ = *s;103 if(*s == '\\')104 esc++;105 else if(esc)106 esc--;107 s++;108 }109 return d;110 }112 void113 regerror(char *msg)114 {115 sysfatal("regerror: %s", msg);116 }118 void119 buildre(Dreprog *re[3])120 {121 int i;122 static char buf[16384], *s;124 re[0] = dregcomp("^From ");126 s = buf;127 for(i=0; i<nelem(keywords); i++){128 if(i != 0)129 *s++ = '|';130 s = strcpycase(s, keywords[i]);131 }132 *s = 0;133 re[1] = dregcomp(buf);135 s = buf;136 for(i=0; i<nelem(ignore); i++){137 if(i != 0)138 *s++ = '|';139 s = strcpycase(s, ignore[i]);140 }141 *s = 0;142 re[2] = dregcomp(buf);143 }145 void146 usage(void)147 {148 fprint(2, "usage: regen [-d]\n");149 exits("usage");150 }152 void153 main(int argc, char **argv)154 {155 Dreprog *re[3];156 Biobuf b;158 ARGBEGIN{159 default:160 usage();161 case 'd':162 debug = 1;163 }ARGEND165 if(argc != 0)166 usage();168 buildre(re);169 Binit(&b, 1, OWRITE);170 Bprintdfa(&b, re[0]);171 Bprintdfa(&b, re[1]);172 Bprintdfa(&b, re[2]);173 exits(0);174 }