Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include <ctype.h>
6 #include "dict.h"
8 /*
9 * Assumed index file structure: lines of form
10 * [^\t]+\t[0-9]+
11 * First field is key, second is byte offset into dictionary.
12 * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
13 */
14 typedef struct Addr Addr;
16 struct Addr {
17 int n; /* number of offsets */
18 int cur; /* current position within doff array */
19 int maxn; /* actual current size of doff array */
20 ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
21 };
23 Biobuf binbuf;
24 Biobuf boutbuf;
25 Biobuf *bin = &binbuf; /* user cmd input */
26 Biobuf *bout = &boutbuf; /* output */
27 Biobuf *bdict; /* dictionary */
28 Biobuf *bindex; /* index file */
29 long indextop; /* index offset at end of file */
30 int lastcmd; /* last executed command */
31 Addr *dot; /* "current" address */
32 Dict *dict; /* current dictionary */
33 int linelen;
34 int breaklen = 60;
35 int outinhibit;
36 int debug;
38 void execcmd(int);
39 int getpref(char*, Rune*);
40 Entry getentry(int);
41 int getfield(Rune*);
42 long locate(Rune*);
43 int parseaddr(char*, char**);
44 int parsecmd(char*);
45 int search(char*, int);
46 long seeknextline(Biobuf*, long);
47 void setdotnext(void);
48 void setdotprev(void);
49 void sortaddr(Addr*);
50 void usage(void);
52 enum {
53 Plen=300, /* max length of a search pattern */
54 Fieldlen=200, /* max length of an index field */
55 Aslots=10, /* initial number of slots in an address */
56 };
58 void
59 main(int argc, char **argv)
60 {
61 int i, cmd, kflag;
62 char *line, *p;
64 Binit(&binbuf, 0, OREAD);
65 Binit(&boutbuf, 1, OWRITE);
66 kflag = 0;
67 line = 0;
68 dict = 0;
70 for(i=0; dicts[i].name; i++){
71 if(access(unsharp(dicts[i].path), 0)>=0 && access(unsharp(dicts[i].indexpath), 0)>=0){
72 dict = &dicts[i];
73 break;
74 }
75 }
76 ARGBEGIN {
77 case 'd':
78 p = ARGF();
79 dict = 0;
80 if(p) {
81 for(i=0; dicts[i].name; i++)
82 if(strcmp(p, dicts[i].name)==0) {
83 dict = &dicts[i];
84 break;
85 }
86 }
87 if(!dict)
88 usage();
89 break;
90 case 'c':
91 line = ARGF();
92 if(!line)
93 usage();
94 break;
95 case 'k':
96 kflag++;
97 break;
98 case 'D':
99 debug++;
100 break;
101 default:
102 usage();
103 ARGEND }
104 if(dict == 0){
105 err("no dictionaries present on this system");
106 exits("nodict");
109 if(kflag) {
110 (*dict->printkey)();
111 exits(0);
113 if(argc > 1)
114 usage();
115 else if(argc == 1) {
116 if(line)
117 usage();
118 p = argv[0];
119 line = malloc(strlen(p)+5);
120 sprint(line, "/%s/P\n", p);
122 dict->path = unsharp(dict->path);
123 dict->indexpath = unsharp(dict->indexpath);
124 bdict = Bopen(dict->path, OREAD);
125 if(!bdict) {
126 err("can't open dictionary %s", dict->path);
127 exits("nodict");
129 bindex = Bopen(dict->indexpath, OREAD);
130 if(!bindex) {
131 err("can't open index %s", dict->indexpath);
132 exits("noindex");
134 indextop = Bseek(bindex, 0L, 2);
136 dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
137 dot->n = 0;
138 dot->cur = 0;
139 dot->maxn = Aslots;
140 lastcmd = 0;
142 if(line) {
143 cmd = parsecmd(line);
144 if(cmd)
145 execcmd(cmd);
146 } else {
147 for(;;) {
148 Bprint(bout, "*");
149 Bflush(bout);
150 line = Brdline(bin, '\n');
151 linelen = 0;
152 if(!line)
153 break;
154 cmd = parsecmd(line);
155 if(cmd) {
156 execcmd(cmd);
157 lastcmd = cmd;
161 exits(0);
164 void
165 usage(void)
167 int i;
168 char *a, *b;
170 Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
171 Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
172 for(i = 0; dicts[i].name; i++){
173 a = b = "";
174 if(access(unsharp(dicts[i].path), 0)<0 || access(unsharp(dicts[i].indexpath), 0)<0){
175 a = "[";
176 b = "]";
178 Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
180 exits("usage");
183 int
184 parsecmd(char *line)
186 char *e;
187 int cmd, ans;
189 if(parseaddr(line, &e) >= 0)
190 line = e;
191 else
192 return 0;
193 cmd = *line;
194 ans = cmd;
195 if(isupper(cmd))
196 cmd = tolower(cmd);
197 if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
198 cmd == '\n')) {
199 err("unknown command %c", cmd);
200 return 0;
202 if(cmd == '\n')
203 switch(lastcmd) {
204 case 0: ans = 'H'; break;
205 case 'H': ans = 'p'; break;
206 default : ans = lastcmd; break;
208 else if(line[1] != '\n' && line[1] != 0)
209 err("extra stuff after command %c ignored", cmd);
210 return ans;
213 void
214 execcmd(int cmd)
216 Entry e;
217 int cur, doall;
219 if(isupper(cmd)) {
220 doall = 1;
221 cmd = tolower(cmd);
222 cur = 0;
223 } else {
224 doall = 0;
225 cur = dot->cur;
227 if(debug && doall && cmd == 'a')
228 Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
229 for(;;){
230 if(cur >= dot->n)
231 break;
232 if(doall) {
233 Bprint(bout, "%d\t", cur+1);
234 linelen += 4 + (cur >= 10);
236 switch(cmd) {
237 case 'a':
238 Bprint(bout, "#%lud\n", dot->doff[cur]);
239 break;
240 case 'h':
241 case 'p':
242 case 'r':
243 e = getentry(cur);
244 (*dict->printentry)(e, cmd);
245 break;
247 cur++;
248 if(doall) {
249 if(cmd == 'p' || cmd == 'r') {
250 Bputc(bout, '\n');
251 linelen = 0;
253 } else
254 break;
256 if(cur >= dot->n)
257 cur = 0;
258 dot->cur = cur;
261 /*
262 * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
263 * Answer goes in dot.
264 * Return -1 if address starts, but get error.
265 * Return 0 if no address.
266 */
267 int
268 parseaddr(char *line, char **eptr)
270 int delim, plen;
271 ulong v;
272 char *e;
273 char pat[Plen];
275 if(*line == '/' || *line == '!') {
276 /* anchored regular expression match; '!' means no folding */
277 if(*line == '/') {
278 delim = '/';
279 e = strpbrk(line+1, "/\n");
280 } else {
281 delim = '!';
282 e = strpbrk(line+1, "!\n");
284 plen = e-line-1;
285 if(plen >= Plen-3) {
286 err("pattern too big");
287 return -1;
289 pat[0] = '^';
290 memcpy(pat+1, line+1, plen);
291 pat[plen+1] = '$';
292 pat[plen+2] = 0;
293 if(*e == '\n')
294 line = e;
295 else
296 line = e+1;
297 if(!search(pat, delim == '/')) {
298 err("pattern not found");
299 return -1;
301 } else if(*line == '#') {
302 /* absolute byte offset into dictionary */
303 line++;
304 if(!isdigit(*line))
305 return -1;
306 v = strtoul(line, &e, 10);
307 line = e;
308 dot->doff[0] = v;
309 dot->n = 1;
310 dot->cur = 0;
311 } else if(isdigit(*line)) {
312 v = strtoul(line, &e, 10);
313 line = e;
314 if(v < 1 || v > dot->n)
315 err(".%d not in range [1,%d], ignored",
316 v, dot->n);
317 else
318 dot->cur = v-1;
319 } else if(*line == '.') {
320 line++;
321 } else {
322 *eptr = line;
323 return 0;
325 while(*line == '+' || *line == '-') {
326 if(*line == '+')
327 setdotnext();
328 else
329 setdotprev();
330 line++;
332 *eptr = line;
333 return 1;
336 /*
337 * Index file is sorted by folded field1.
338 * Method: find pre, a folded prefix of r.e. pat,
339 * and then low = offset to beginning of
340 * line in index file where first match of prefix occurs.
341 * Then go through index until prefix no longer matches,
342 * adding each line that matches real pattern to dot.
343 * Finally, sort dot offsets (uniquing).
344 * We know pat len < Plen, and that it is surrounded by ^..$
345 */
346 int
347 search(char *pat, int dofold)
349 int needre, prelen, match, n;
350 Reprog *re;
351 long ioff, v;
352 Rune pre[Plen];
353 Rune lit[Plen];
354 Rune entry[Fieldlen];
355 char fpat[Plen];
357 prelen = getpref(pat+1, pre);
358 if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
359 runescpy(lit, pre);
360 if(dofold)
361 fold(lit);
362 needre = 0;
363 SET(re);
364 } else {
365 needre = 1;
366 if(dofold) {
367 foldre(fpat, pat);
368 re = regcomp(fpat);
369 } else
370 re = regcomp(pat);
372 fold(pre);
373 ioff = locate(pre);
374 if(ioff < 0)
375 return 0;
376 dot->n = 0;
377 Bseek(bindex, ioff, 0);
378 for(;;) {
379 if(!getfield(entry))
380 break;
381 if(dofold)
382 fold(entry);
383 if(needre)
384 match = rregexec(re, entry, 0, 0);
385 else
386 match = (acomp(lit, entry) == 0);
387 if(match) {
388 if(!getfield(entry))
389 break;
390 v = runetol(entry);
391 if(dot->n >= dot->maxn) {
392 n = 2*dot->maxn;
393 dot = realloc(dot,
394 sizeof(Addr)+(n-1)*sizeof(long));
395 if(!dot) {
396 err("out of memory");
397 exits("nomem");
399 dot->maxn = n;
401 dot->doff[dot->n++] = v;
402 } else {
403 if(!dofold)
404 fold(entry);
405 if(*pre) {
406 n = acomp(pre, entry);
407 if(n < -1 || (!needre && n < 0))
408 break;
410 /* get to next index entry */
411 if(!getfield(entry))
412 break;
415 sortaddr(dot);
416 dot->cur = 0;
417 return dot->n;
420 /*
421 * Return offset in index file of first line whose folded
422 * first field has pre as a prefix. -1 if none found.
423 */
424 long
425 locate(Rune *pre)
427 long top, bot, mid;
428 Rune entry[Fieldlen];
430 if(*pre == 0)
431 return 0;
432 bot = 0;
433 top = indextop;
434 if(debug>1)
435 fprint(2, "locate looking for prefix %S\n", pre);
436 for(;;) {
437 /*
438 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
439 * and bot < top, and bot,top point at beginning of lines
440 */
441 mid = (top+bot) / 2;
442 mid = seeknextline(bindex, mid);
443 if(debug > 1)
444 fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
445 bot, (top+bot) / 2, mid, top);
446 if(mid == top || !getfield(entry))
447 break;
448 if(debug > 1)
449 fprint(2, "key=%S\n", entry);
450 /*
451 * here mid is strictly between bot and top
452 */
453 fold(entry);
454 if(acomp(pre, entry) <= 0)
455 top = mid;
456 else
457 bot = mid;
459 /*
460 * bot < top, but they don't necessarily point at successive lines
461 * Use linear search from bot to find first line that pre is a
462 * prefix of
463 */
464 while((bot = seeknextline(bindex, bot)) <= top) {
465 if(!getfield(entry))
466 return -1;
467 if(debug > 1)
468 fprint(2, "key=%S\n", entry);
469 fold(entry);
470 switch(acomp(pre, entry)) {
471 case -2:
472 return -1;
473 case -1:
474 case 0:
475 return bot;
476 case 1:
477 case 2:
478 continue;
481 return -1;
485 /*
486 * Get prefix of non re-metacharacters, runified, into pre,
487 * and return length
488 */
489 int
490 getpref(char *pat, Rune *pre)
492 int n, r;
493 char *p;
495 p = pat;
496 while(*p) {
497 n = chartorune(pre, p);
498 r = *pre;
499 switch(r) {
500 case 0x2e: case 0x2a: case 0x2b: case 0x3f:
501 case 0x5b: case 0x5d: case 0x28: case ')':
502 case 0x7c: case 0x5e: case 0x24:
503 *pre = 0;
504 return p-pat;
505 case L'\\':
506 p += n;
507 p += chartorune(++pre, p);
508 pre++;
509 break;
510 default:
511 p += n;
512 pre++;
515 return p-pat;
518 long
519 seeknextline(Biobuf *b, long off)
521 long c;
523 Bseek(b, off, 0);
524 do {
525 c = Bgetrune(b);
526 } while(c>=0 && c!='\n');
527 return Boffset(b);
530 /*
531 * Get next field out of index file (either tab- or nl- terminated)
532 * Answer in *rp, assumed to be Fieldlen long.
533 * Return 0 if read error first.
534 */
535 int
536 getfield(Rune *rp)
538 long c;
539 int n;
541 for(n=Fieldlen; n-- > 0; ) {
542 if ((c = Bgetrune(bindex)) < 0)
543 return 0;
544 if(c == '\t' || c == '\n') {
545 *rp = L'\0';
546 return 1;
548 *rp++ = c;
550 err("word too long");
551 return 0;
554 /*
555 * A compare longs function suitable for qsort
556 */
557 static int
558 longcmp(const void *av, const void *bv)
560 long v;
561 long *a, *b;
563 a = (long*)av;
564 b = (long*)bv;
566 v = *a - *b;
567 if(v < 0)
568 return -1;
569 else if(v == 0)
570 return 0;
571 else
572 return 1;
575 void
576 sortaddr(Addr *a)
578 int i, j;
579 long v;
581 if(a->n <= 1)
582 return;
584 qsort(a->doff, a->n, sizeof(long), longcmp);
586 /* remove duplicates */
587 for(i=0, j=0; j < a->n; j++) {
588 v = a->doff[j];
589 if(i > 0 && v == a->doff[i-1])
590 continue;
591 a->doff[i++] = v;
593 a->n = i;
596 Entry
597 getentry(int i)
599 long b, e, n;
600 static Entry ans;
601 static int anslen = 0;
603 b = dot->doff[i];
604 e = (*dict->nextoff)(b+1);
605 ans.doff = b;
606 if(e < 0) {
607 err("couldn't seek to entry");
608 ans.start = 0;
609 ans.end = 0;
610 } else {
611 n = e-b;
612 if(n+1 > anslen) {
613 ans.start = realloc(ans.start, n+1);
614 if(!ans.start) {
615 err("out of memory");
616 exits("nomem");
618 anslen = n+1;
620 Bseek(bdict, b, 0);
621 n = Bread(bdict, ans.start, n);
622 ans.end = ans.start + n;
623 *ans.end = 0;
625 return ans;
628 void
629 setdotnext(void)
631 long b;
633 b = (*dict->nextoff)(dot->doff[dot->cur]+1);
634 if(b < 0) {
635 err("couldn't find a next entry");
636 return;
638 dot->doff[0] = b;
639 dot->n = 1;
640 dot->cur = 0;
643 void
644 setdotprev(void)
646 int tryback;
647 long here, last, p;
649 if(dot->cur < 0 || dot->cur >= dot->n)
650 return;
651 tryback = 2000;
652 here = dot->doff[dot->cur];
653 last = 0;
654 while(last == 0) {
655 p = here - tryback;
656 if(p < 0)
657 p = 0;
658 for(;;) {
659 p = (*dict->nextoff)(p+1);
660 if(p < 0)
661 return; /* shouldn't happen */
662 if(p >= here)
663 break;
664 last = p;
666 if(!last) {
667 if(here - tryback < 0) {
668 err("can't find a previous entry");
669 return;
671 tryback = 2*tryback;
674 dot->doff[0] = last;
675 dot->n = 1;
676 dot->cur = 0;