Blob
1 #include <u.h>2 #include <libc.h>3 #include <bio.h>4 #include <regexp.h>5 #include "hash.h"7 enum8 {9 MAXTAB = 256,10 MAXBEST = 3211 };13 typedef struct Table Table;14 struct Table15 {16 char *file;17 Hash *hash;18 int nmsg;19 };21 typedef struct Word Word;22 struct Word23 {24 Stringtab *s; /* from hmsg */25 int count[MAXTAB]; /* counts from each table */26 double p[MAXTAB]; /* probabilities from each table */27 double mp; /* max probability */28 int mi; /* w.p[w.mi] = w.mp */29 };31 Table tab[MAXTAB];32 int ntab;34 Word best[MAXBEST];35 int mbest;36 int nbest;38 int debug;40 void41 usage(void)42 {43 fprint(2, "usage: bayes [-D] [-m maxword] boxhash ... ~ msghash ...\n");44 exits("usage");45 }47 void*48 emalloc(int n)49 {50 void *v;52 v = mallocz(n, 1);53 if(v == nil)54 sysfatal("out of memory");55 return v;56 }58 void59 noteword(Word *w)60 {61 int i;63 for(i=nbest-1; i>=0; i--)64 if(w->mp < best[i].mp)65 break;66 i++;68 if(i >= mbest)69 return;70 if(nbest == mbest)71 nbest--;72 if(i < nbest)73 memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));74 best[i] = *w;75 nbest++;76 }78 Hash*79 hread(char *s)80 {81 Hash *h;82 Biobuf *b;84 if((b = Bopenlock(s, OREAD)) == nil)85 sysfatal("open %s: %r", s);87 h = emalloc(sizeof(Hash));88 Breadhash(b, h, 1);89 Bterm(b);90 return h;91 }93 void94 main(int argc, char **argv)95 {96 int i, j, a, mi, oi, tot, keywords;97 double totp, p, xp[MAXTAB];98 Hash *hmsg;99 Word w;100 Stringtab *s, *t;101 Biobuf bout;103 mbest = 15;104 keywords = 0;105 ARGBEGIN{106 case 'D':107 debug = 1;108 break;109 case 'k':110 keywords = 1;111 break;112 case 'm':113 mbest = atoi(EARGF(usage()));114 if(mbest > MAXBEST)115 sysfatal("cannot keep more than %d words", MAXBEST);116 break;117 default:118 usage();119 }ARGEND121 for(i=0; i<argc; i++)122 if(strcmp(argv[i], "~") == 0)123 break;125 if(i > MAXTAB)126 sysfatal("cannot handle more than %d tables", MAXTAB);128 if(i+1 >= argc)129 usage();131 for(i=0; i<argc; i++){132 if(strcmp(argv[i], "~") == 0)133 break;134 tab[ntab].file = argv[i];135 tab[ntab].hash = hread(argv[i]);136 s = findstab(tab[ntab].hash, "*nmsg*", 6, 1);137 if(s == nil || s->count == 0)138 tab[ntab].nmsg = 1;139 else140 tab[ntab].nmsg = s->count;141 ntab++;142 }144 Binit(&bout, 1, OWRITE);146 oi = ++i;147 for(a=i; a<argc; a++){148 hmsg = hread(argv[a]);149 nbest = 0;150 for(s=hmsg->all; s; s=s->link){151 w.s = s;152 tot = 0;153 totp = 0.0;154 for(i=0; i<ntab; i++){155 t = findstab(tab[i].hash, s->str, s->n, 0);156 if(t == nil)157 w.count[i] = 0;158 else159 w.count[i] = t->count;160 tot += w.count[i];161 p = w.count[i]/(double)tab[i].nmsg;162 if(p >= 1.0)163 p = 1.0;164 w.p[i] = p;165 totp += p;166 }168 if(tot < 5){ /* word does not appear enough; give to box 0 */169 w.p[0] = 0.5;170 for(i=1; i<ntab; i++)171 w.p[i] = 0.1;172 w.mp = 0.5;173 w.mi = 0;174 noteword(&w);175 continue;176 }178 w.mp = 0.0;179 for(i=0; i<ntab; i++){180 p = w.p[i];181 p /= totp;182 if(p < 0.01)183 p = 0.01;184 else if(p > 0.99)185 p = 0.99;186 if(p > w.mp){187 w.mp = p;188 w.mi = i;189 }190 w.p[i] = p;191 }192 noteword(&w);193 }195 totp = 0.0;196 for(i=0; i<ntab; i++){197 p = 1.0;198 for(j=0; j<nbest; j++)199 p *= best[j].p[i];200 xp[i] = p;201 totp += p;202 }203 for(i=0; i<ntab; i++)204 xp[i] /= totp;205 mi = 0;206 for(i=1; i<ntab; i++)207 if(xp[i] > xp[mi])208 mi = i;209 if(oi != argc-1)210 Bprint(&bout, "%s: ", argv[a]);211 Bprint(&bout, "%s %f", tab[mi].file, xp[mi]);212 if(keywords){213 for(i=0; i<nbest; i++){214 Bprint(&bout, " ");215 Bwrite(&bout, best[i].s->str, best[i].s->n);216 Bprint(&bout, " %f", best[i].p[mi]);217 }218 }219 freehash(hmsg);220 Bprint(&bout, "\n");221 if(debug){222 for(i=0; i<nbest; i++){223 Bwrite(&bout, best[i].s->str, best[i].s->n);224 Bprint(&bout, " %f", best[i].p[mi]);225 if(best[i].p[mi] < best[i].mp)226 Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file);227 Bprint(&bout, "\n");228 }229 }230 }231 Bterm(&bout);232 }