Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include "dfa.h"
/*
 * Regular expression tables.  buildre() joins the entries of each
 * table with '|' into a single expression.
 */
11 char *ignore[] =
12 {
13 /* HTML that isn't A, IMG, or FONT */
14 /* Must have a space somewhere to avoid catching <email@address> */
15 "<[ \n\r]*("
16 "[^aif]|"
17 "a[^> \t\r\n]|"
18 "i[^mM \t\r\n]|"
19 "im[^gG \t\r\n]|"
20 "img[^> \t\r\n]|"
21 "f[^oO \t\r\n]|"
22 "fo[^Nn \t\r\n]|"
23 "fon[^tT \t\r\n]|"
24 "font[^> \r\t\n]"
25 ")[^>]*[ \t\n\r][^>]*>",
26 "<[ \n\r]*("
27 "i|im|f|fo|fon"
28 ")[ \t\r\n][^>]*>",
30 /* ignore html comments */
31 "<!--([^\\-]|-[^\\-]|--[^>]|\n)*-->",
33 /* random mail strings */
34 "^message-id:.*\n([ ].*\n)*",
35 "^in-reply-to:.*\n([ ].*\n)*",
36 "^references:.*\n([ ].*\n)*",
37 "^date:.*\n([ ].*\n)*",
38 "^delivery-date:.*\n([ ].*\n)*",
39 "e?smtp id .*",
40 "^ id.*",
41 "boundary=.*",
42 "name=\"",
43 "filename=\"",
44 "news:<[^>]+>",
45 "^--[^ ]*$",
47 /* base64 encoding */
48 "^[0-9a-zA-Z+\\-=/]+$",
50 /* uu encoding */
51 "^[!-Z]+$",
53 /* little things */
54 ".",
55 "\n",
56 };
/*
 * Patterns collected under the name "keywords": a run of letters and
 * selected punctuation, or digits with embedded '.' or ',' separators.
 * buildre() joins the entries with '|' after strcpycase() expansion.
 */
char *keywords[] = {
	"([a-zA-Z'`$!¡-￿]|[0-9]([.,][0-9])*)+",
};
/* Non-zero when -d was given; dregcomp() then prints each expression it compiles. */
int debug;
65 Dreprog*
66 dregcomp(char *buf)
67 {
68 Reprog *r;
69 Dreprog *d;
71 if(debug)
72 print(">>> '%s'\n", buf);
74 r = regcomp(buf);
75 if(r == nil)
76 sysfatal("regcomp");
77 d = dregcvt(r);
78 if(d == nil)
79 sysfatal("dregcomp");
80 free(r);
81 return d;
82 }
/*
 * Copy the regular expression s to d, rewriting it to match without
 * regard to case: every lower-case ASCII letter outside a character
 * class becomes the two-letter class [xX].
 *
 * A character preceded by a backslash is copied unchanged and is never
 * treated as a class bracket or as a letter to expand, so quoted
 * characters keep their literal meaning.  Text inside [...] is copied
 * unchanged.
 *
 * d must have room for up to four bytes per byte of s.  The output is
 * not NUL-terminated; the return value points just past the last byte
 * written so that calls can be chained.
 */
char*
strcpycase(char *d, char *s)
{
	int inclass, quoted;

	inclass = 0;
	quoted = 0;
	for(; *s; s++){
		if(quoted){
			/* the previous byte was an unquoted backslash */
			*d++ = *s;
			quoted = 0;
			continue;
		}
		switch(*s){
		case '\\':
			quoted = 1;
			break;
		case '[':
			/* a '[' inside a class is an ordinary member, not a nested class */
			inclass = 1;
			break;
		case ']':
			/* a stray ']' outside a class must not disable folding */
			inclass = 0;
			break;
		}
		if(!inclass && 'a' <= *s && *s <= 'z'){
			*d++ = '[';
			*d++ = *s;
			*d++ = *s+'A'-'a';
			*d++ = ']';
		}else
			*d++ = *s;
	}
	return d;
}
/*
 * Report a regular expression error and end the program.
 * NOTE(review): presumably the hook the regexp library calls when
 * regcomp() rejects an expression -- confirm against regexp(2).
 */
void
regerror(char *msg)
{
	sysfatal("regerror: %s", msg);
}
118 void
119 buildre(Dreprog *re[3])
121 int i;
122 static char buf[16384], *s;
124 re[0] = dregcomp("^From ");
126 s = buf;
127 for(i=0; i<nelem(keywords); i++){
128 if(i != 0)
129 *s++ = '|';
130 s = strcpycase(s, keywords[i]);
132 *s = 0;
133 re[1] = dregcomp(buf);
135 s = buf;
136 for(i=0; i<nelem(ignore); i++){
137 if(i != 0)
138 *s++ = '|';
139 s = strcpycase(s, ignore[i]);
141 *s = 0;
142 re[2] = dregcomp(buf);
/*
 * Print the command synopsis on file descriptor 2 and exit with
 * the status string "usage".
 */
void
usage(void)
{
	fprint(2, "usage: regen [-d]\n");
	exits("usage");
}
152 void
153 main(int argc, char **argv)
155 Dreprog *re[3];
156 Biobuf b;
158 ARGBEGIN{
159 default:
160 usage();
161 case 'd':
162 debug = 1;
163 }ARGEND
165 if(argc != 0)
166 usage();
168 buildre(re);
169 Binit(&b, 1, OWRITE);
170 Bprintdfa(&b, re[0]);
171 Bprintdfa(&b, re[1]);
172 Bprintdfa(&b, re[2]);
173 exits(0);