Blob


/*
 * RFC822 message tokenizer (really feature generator) for spam filter.
 *
 * See Paul Graham's musings on spam filtering for theory.
 */
7 #include <u.h>
8 #include <libc.h>
9 #include <bio.h>
10 #include <regexp.h>
11 #include <ctype.h>
12 #include "dfa.h"
14 void buildre(Dreprog*[3]);
15 int debug;
16 char *refile = "#9/mail/lib/classify.re";
17 int maxtoklen = 20;
18 int trim(char*);
/*
 * Print the command synopsis on standard error and exit with
 * status "usage".  Lists every option main accepts, including -n.
 */
void
usage(void)
{
	fprint(2, "usage: msgtok [-D] [-n maxtoklen] [-r /mail/lib/classify.re] [file]\n");
	exits("usage");
}
27 void
28 main(int argc, char **argv)
29 {
30 int i, hdr, n, eof, off;
31 Dreprog *re[3];
32 int m[3];
33 char *p, *ep, *tag;
34 Biobuf bout, bin;
35 char msg[1024+1];
36 char buf[1024];
38 refile = unsharp(refile);
39 buildre(re);
40 ARGBEGIN{
41 case 'D':
42 debug = 1;
43 break;
44 case 'n':
45 maxtoklen = atoi(EARGF(usage()));
46 break;
47 case 'r':
48 refile = EARGF(usage());
49 break;
50 default:
51 usage();
52 }ARGEND;
54 if(argc > 1)
55 usage();
56 if(argc == 1){
57 close(0);
58 if(open(argv[0], OREAD) < 0)
59 sysfatal("open %s: %r", argv[0]);
60 }
62 tag = nil;
63 Binit(&bin, 0, OREAD);
64 Binit(&bout, 1, OWRITE);
65 ep = msg;
66 p = msg;
67 eof = 0;
68 off = 0;
69 hdr = 1;
70 for(;;){
71 /* replenish buffer */
72 if(ep - p < 512 && !eof){
73 if(p > msg + 1){
74 n = ep - p;
75 memmove(msg, p-1, ep-(p-1));
76 off += (p-1) - msg;
77 p = msg+1;
78 ep = p + n;
79 }
80 n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);
81 if(n < 0)
82 sysfatal("read error: %r");
83 if(n == 0)
84 eof = 1;
85 ep += n;
86 *ep = 0;
87 }
88 if(p >= ep)
89 break;
91 if(*p == 0){
92 p++;
93 continue;
94 }
96 if(hdr && p[-1]=='\n'){
97 if(p[0]=='\n')
98 hdr = 0;
99 else if(cistrncmp(p-1, "\nfrom:", 6) == 0)
100 tag = "From*";
101 else if(cistrncmp(p-1, "\nto:", 4) == 0)
102 tag = "To*";
103 else if(cistrncmp(p-1, "\nsubject:", 9) == 0)
104 tag = "Subject*";
105 else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)
106 tag = "Return-Path*";
107 else
108 tag = nil;
110 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');
111 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');
112 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');
114 n = m[0];
115 if(n < m[1])
116 n = m[1];
117 if(n < m[2])
118 n = m[2];
119 if(n <= 0){
120 fprint(2, "«%s» %.2ux", p, p[0]);
121 sysfatal("no regexps matched at %ld", off + (p-msg));
124 if(m[0] >= m[1] && m[0] >= m[2]){
125 /* "From " marks start of new message */
126 Bprint(&bout, "*From*\n");
127 n = m[0];
128 hdr = 1;
129 }else if(m[2] > 1){
130 /* ignore */
131 n = m[2];
132 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){
133 /* keyword */
134 /* should do UTF-aware lowercasing, too much bother */
135 /*
136 for(i=0; i<n; i++)
137 if('A' <= p[i] && p[i] <= 'Z')
138 p[i] += 'a' - 'A';
139 */
140 if(tag){
141 i = strlen(tag);
142 memmove(buf, tag, i);
143 memmove(buf+i, p, m[1]);
144 buf[i+m[1]] = 0;
145 }else{
146 memmove(buf, p, m[1]);
147 buf[m[1]] = 0;
149 Bprint(&bout, "%s\n", buf);
150 while(trim(buf) >= 0)
151 Bprint(&bout, "stem*%s\n", buf);
152 n = m[1];
153 }else
154 n = m[2];
155 if(debug)
156 fprint(2, "%.*s¦", utfnlen(p, n), p);
157 p += n;
159 Bterm(&bout);
160 exits(0);
163 void
164 buildre(Dreprog *re[3])
166 Biobuf *b;
168 if((b = Bopen(refile, OREAD)) == nil)
169 sysfatal("open %s: %r", refile);
171 re[0] = Breaddfa(b);
172 re[1] = Breaddfa(b);
173 re[2] = Breaddfa(b);
175 if(re[0]==nil || re[1]==nil || re[2]==nil)
176 sysfatal("Breaddfa: %r");
177 Bterm(b);
/*
 * Apply one stemming step to the token in s, in place.
 * (Perhaps this belongs in the tokenizer.)
 *
 * Everything up to the first letter at or after the first '*' (or
 * from the start of s if there is no '*') is removed first.  Then
 * exactly one of the following is done, in this order of preference:
 *
 *	free!!! -> free!	cut trailing ASCII non-letters down to one
 *	free!   -> free		cut the last trailing ASCII non-letter
 *	FREE    -> Free		lower-case every letter after the first
 *	Free    -> free		lower-case the first letter
 *
 * Returns 0 if a step was applied and -1 if none applies.  s is left
 * untouched when it begins with '*' or has fewer than two bytes from
 * its first letter on; otherwise the leading part may have been
 * removed even when -1 is returned.
 *
 * Bytes are cast to unsigned char before reaching the <ctype.h>
 * tests: passing a negative char (any UTF-8 byte where char is
 * signed) to them is undefined.
 */
int
trim(char *s)
{
	char *p, *end;
	size_t len;
	int lowered;

	if(*s == '*')
		return -1;

	/* strip leading punctuation */
	p = strchr(s, '*');
	if(!p)
		p = s;
	while(*p && !isalpha((unsigned char)*p))
		p++;
	len = strlen(p);
	if(len < 2)
		return -1;
	memmove(s, p, len+1);

	/* find the trailing run of ASCII non-letters; bytes >= 0x80 stop the scan */
	end = s + len;
	p = end;
	while(p > s && (unsigned char)p[-1] < 0x80 && !isalpha((unsigned char)p[-1]))
		p--;

	/* chop punctuation */
	if(p > s){
		/* free!!! -> free! */
		if(p+1 < end){
			p[1] = 0;
			return 0;
		}
		/* free! -> free */
		if(p < end){
			p[0] = 0;
			return 0;
		}
	}

	/* turn FREE into Free */
	lowered = 0;
	for(p = s+1; *p; p++){
		if(isupper((unsigned char)*p)){
			*p += 'a'-'A';
			lowered = 1;
		}
	}
	if(lowered)
		return 0;

	/* turn Free into free */
	if(isupper((unsigned char)s[0])){
		*s += 'a'-'A';
		return 0;
	}
	return -1;
}