Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include "hash.h"
7 enum
8 {
9 MAXTAB = 256,
10 MAXBEST = 32,
11 };
13 typedef struct Table Table;
14 struct Table
15 {
16 char *file;
17 Hash *hash;
18 int nmsg;
19 };
21 typedef struct Word Word;
22 struct Word
23 {
24 Stringtab *s; /* from hmsg */
25 int count[MAXTAB]; /* counts from each table */
26 double p[MAXTAB]; /* probabilities from each table */
27 double mp; /* max probability */
28 int mi; /* w.p[w.mi] = w.mp */
29 };
31 Table tab[MAXTAB];
32 int ntab;
34 Word best[MAXBEST];
35 int mbest;
36 int nbest;
38 int debug;
40 void
41 usage(void)
42 {
43 fprint(2, "usage: bayes [-D] [-m maxword] boxhash ... ~ msghash ...\n");
44 exits("usage");
45 }
47 void*
48 emalloc(int n)
49 {
50 void *v;
52 v = mallocz(n, 1);
53 if(v == nil)
54 sysfatal("out of memory");
55 return v;
56 }
58 void
59 noteword(Word *w)
60 {
61 int i;
63 for(i=nbest-1; i>=0; i--)
64 if(w->mp < best[i].mp)
65 break;
66 i++;
68 if(i >= mbest)
69 return;
70 if(nbest == mbest)
71 nbest--;
72 if(i < nbest)
73 memmove(&best[i+1], &best[i], (nbest-i)*sizeof(best[0]));
74 best[i] = *w;
75 nbest++;
76 }
78 Hash*
79 hread(char *s)
80 {
81 Hash *h;
82 Biobuf *b;
84 if((b = Bopenlock(s, OREAD)) == nil)
85 sysfatal("open %s: %r", s);
87 h = emalloc(sizeof(Hash));
88 Breadhash(b, h, 1);
89 Bterm(b);
90 return h;
91 }
93 void
94 main(int argc, char **argv)
95 {
96 int i, j, a, mi, oi, tot, keywords;
97 double totp, p, xp[MAXTAB];
98 Hash *hmsg;
99 Word w;
100 Stringtab *s, *t;
101 Biobuf bout;
103 mbest = 15;
104 keywords = 0;
105 ARGBEGIN{
106 case 'D':
107 debug = 1;
108 break;
109 case 'k':
110 keywords = 1;
111 break;
112 case 'm':
113 mbest = atoi(EARGF(usage()));
114 if(mbest > MAXBEST)
115 sysfatal("cannot keep more than %d words", MAXBEST);
116 break;
117 default:
118 usage();
119 }ARGEND
121 for(i=0; i<argc; i++)
122 if(strcmp(argv[i], "~") == 0)
123 break;
125 if(i > MAXTAB)
126 sysfatal("cannot handle more than %d tables", MAXTAB);
128 if(i+1 >= argc)
129 usage();
131 for(i=0; i<argc; i++){
132 if(strcmp(argv[i], "~") == 0)
133 break;
134 tab[ntab].file = argv[i];
135 tab[ntab].hash = hread(argv[i]);
136 s = findstab(tab[ntab].hash, "*nmsg*", 6, 1);
137 if(s == nil || s->count == 0)
138 tab[ntab].nmsg = 1;
139 else
140 tab[ntab].nmsg = s->count;
141 ntab++;
144 Binit(&bout, 1, OWRITE);
146 oi = ++i;
147 for(a=i; a<argc; a++){
148 hmsg = hread(argv[a]);
149 nbest = 0;
150 for(s=hmsg->all; s; s=s->link){
151 w.s = s;
152 tot = 0;
153 totp = 0.0;
154 for(i=0; i<ntab; i++){
155 t = findstab(tab[i].hash, s->str, s->n, 0);
156 if(t == nil)
157 w.count[i] = 0;
158 else
159 w.count[i] = t->count;
160 tot += w.count[i];
161 p = w.count[i]/(double)tab[i].nmsg;
162 if(p >= 1.0)
163 p = 1.0;
164 w.p[i] = p;
165 totp += p;
168 if(tot < 5){ /* word does not appear enough; give to box 0 */
169 w.p[0] = 0.5;
170 for(i=1; i<ntab; i++)
171 w.p[i] = 0.1;
172 w.mp = 0.5;
173 w.mi = 0;
174 noteword(&w);
175 continue;
178 w.mp = 0.0;
179 for(i=0; i<ntab; i++){
180 p = w.p[i];
181 p /= totp;
182 if(p < 0.01)
183 p = 0.01;
184 else if(p > 0.99)
185 p = 0.99;
186 if(p > w.mp){
187 w.mp = p;
188 w.mi = i;
190 w.p[i] = p;
192 noteword(&w);
195 totp = 0.0;
196 for(i=0; i<ntab; i++){
197 p = 1.0;
198 for(j=0; j<nbest; j++)
199 p *= best[j].p[i];
200 xp[i] = p;
201 totp += p;
203 for(i=0; i<ntab; i++)
204 xp[i] /= totp;
205 mi = 0;
206 for(i=1; i<ntab; i++)
207 if(xp[i] > xp[mi])
208 mi = i;
209 if(oi != argc-1)
210 Bprint(&bout, "%s: ", argv[a]);
211 Bprint(&bout, "%s %f", tab[mi].file, xp[mi]);
212 if(keywords){
213 for(i=0; i<nbest; i++){
214 Bprint(&bout, " ");
215 Bwrite(&bout, best[i].s->str, best[i].s->n);
216 Bprint(&bout, " %f", best[i].p[mi]);
219 freehash(hmsg);
220 Bprint(&bout, "\n");
221 if(debug){
222 for(i=0; i<nbest; i++){
223 Bwrite(&bout, best[i].s->str, best[i].s->n);
224 Bprint(&bout, " %f", best[i].p[mi]);
225 if(best[i].p[mi] < best[i].mp)
226 Bprint(&bout, " (%f %s)", best[i].mp, tab[best[i].mi].file);
227 Bprint(&bout, "\n");
231 Bterm(&bout);