Blob
1 /*2 * RFC822 message tokenizer (really feature generator) for spam filter.3 *4 * See Paul Graham's musings on spam filtering for theory.5 */7 #include <u.h>8 #include <libc.h>9 #include <bio.h>10 #include <regexp.h>11 #include <ctype.h>12 #include "dfa.h"14 void buildre(Dreprog*[3]);15 int debug;16 char *refile = "#9/mail/lib/classify.re";17 int maxtoklen = 20;18 int trim(char*);20 void21 usage(void)22 {23 fprint(2, "usage: msgtok [-D] [-r /mail/lib/classify.re] [file]\n");24 exits("usage");25 }27 void28 main(int argc, char **argv)29 {30 int i, hdr, n, eof, off;31 Dreprog *re[3];32 int m[3];33 char *p, *ep, *tag;34 Biobuf bout, bin;35 char msg[1024+1];36 char buf[1024];38 refile = unsharp(refile);39 buildre(re);40 ARGBEGIN{41 case 'D':42 debug = 1;43 break;44 case 'n':45 maxtoklen = atoi(EARGF(usage()));46 break;47 case 'r':48 refile = EARGF(usage());49 break;50 default:51 usage();52 }ARGEND;54 if(argc > 1)55 usage();56 if(argc == 1){57 close(0);58 if(open(argv[0], OREAD) < 0)59 sysfatal("open %s: %r", argv[0]);60 }62 tag = nil;63 Binit(&bin, 0, OREAD);64 Binit(&bout, 1, OWRITE);65 ep = msg;66 p = msg;67 eof = 0;68 off = 0;69 hdr = 1;70 for(;;){71 /* replenish buffer */72 if(ep - p < 512 && !eof){73 if(p > msg + 1){74 n = ep - p;75 memmove(msg, p-1, ep-(p-1));76 off += (p-1) - msg;77 p = msg+1;78 ep = p + n;79 }80 n = Bread(&bin, ep, msg+(sizeof msg - 1)- ep);81 if(n < 0)82 sysfatal("read error: %r");83 if(n == 0)84 eof = 1;85 ep += n;86 *ep = 0;87 }88 if(p >= ep)89 break;91 if(*p == 0){92 p++;93 continue;94 }96 if(hdr && p[-1]=='\n'){97 if(p[0]=='\n')98 hdr = 0;99 else if(cistrncmp(p-1, "\nfrom:", 6) == 0)100 tag = "From*";101 else if(cistrncmp(p-1, "\nto:", 4) == 0)102 tag = "To*";103 else if(cistrncmp(p-1, "\nsubject:", 9) == 0)104 tag = "Subject*";105 else if(cistrncmp(p-1, "\nreturn-path:", 13) == 0)106 tag = "Return-Path*";107 else108 tag = nil;109 }110 m[0] = dregexec(re[0], p, p==msg || p[-1]=='\n');111 m[1] = dregexec(re[1], p, p==msg || p[-1]=='\n');112 m[2] = dregexec(re[2], p, p==msg || p[-1]=='\n');114 n = m[0];115 if(n < m[1])116 n = m[1];117 if(n < m[2])118 n = m[2];119 if(n <= 0){120 fprint(2, "«%s» %.2ux", p, p[0]);121 sysfatal("no regexps matched at %ld", off + (p-msg));122 }124 if(m[0] >= m[1] && m[0] >= m[2]){125 /* "From " marks start of new message */126 Bprint(&bout, "*From*\n");127 n = m[0];128 hdr = 1;129 }else if(m[2] > 1){130 /* ignore */131 n = m[2];132 }else if(m[1] >= m[0] && m[1] >= m[2] && m[1] > 2 && m[1] <= maxtoklen){133 /* keyword */134 /* should do UTF-aware lowercasing, too much bother */135 /*136 for(i=0; i<n; i++)137 if('A' <= p[i] && p[i] <= 'Z')138 p[i] += 'a' - 'A';139 */140 if(tag){141 i = strlen(tag);142 memmove(buf, tag, i);143 memmove(buf+i, p, m[1]);144 buf[i+m[1]] = 0;145 }else{146 memmove(buf, p, m[1]);147 buf[m[1]] = 0;148 }149 Bprint(&bout, "%s\n", buf);150 while(trim(buf) >= 0)151 Bprint(&bout, "stem*%s\n", buf);152 n = m[1];153 }else154 n = m[2];155 if(debug)156 fprint(2, "%.*s¦", utfnlen(p, n), p);157 p += n;158 }159 Bterm(&bout);160 exits(0);161 }163 void164 buildre(Dreprog *re[3])165 {166 Biobuf *b;168 if((b = Bopen(refile, OREAD)) == nil)169 sysfatal("open %s: %r", refile);171 re[0] = Breaddfa(b);172 re[1] = Breaddfa(b);173 re[2] = Breaddfa(b);175 if(re[0]==nil || re[1]==nil || re[2]==nil)176 sysfatal("Breaddfa: %r");177 Bterm(b);178 }180 /* perhaps this belongs in the tokenizer */181 int182 trim(char *s)183 {184 char *p, *op;185 int mix, mix1;187 if(*s == '*')188 return -1;190 /* strip leading punctuation */191 p = strchr(s, '*');192 if(p == nil)193 p = s;194 while(*p && !isalpha(*p))195 p++;196 if(strlen(p) < 2)197 {198 return -1;199 }200 memmove(s, p, strlen(p)+1);202 /* strip suffix of punctuation */203 p = s+strlen(s);204 op = p;205 while(p > s && (uchar)p[-1]<0x80 && !isalpha(p[-1]))206 p--;208 /* chop punctuation */209 if(p > s){210 /* free!!! -> free! */211 if(p+1 < op){212 p[1] = 0;213 return 0;214 }215 /* free! -> free */216 if(p < op){217 p[0] = 0;218 return 0;219 }220 }222 mix = mix1 = 0;223 if(isupper(s[0]))224 mix = 1;225 for(p=s+1; *p; p++)226 if(isupper(*p)){227 mix1 = 1;228 break;229 }231 /* turn FREE into Free */232 if(mix1){233 for(p=s+1; *p; p++)234 if(isupper(*p))235 *p += 'a'-'A';236 return 0;237 }239 /* turn Free into free */240 if(mix){241 *s += 'a'-'A';242 return 0;243 }244 return -1;245 }