Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <regexp.h>
5 #include <ctype.h>
6 #include "dict.h"
8 /*
9 * Assumed index file structure: lines of form
10 * [^\t]+\t[0-9]+
11 * First field is key, second is byte offset into dictionary.
12 * Should be sorted with args -u -t' ' +0f -1 +0 -1 +1n -2
13 */
14 typedef struct Addr Addr;
16 struct Addr {
17 int n; /* number of offsets */
18 int cur; /* current position within doff array */
19 int maxn; /* actual current size of doff array */
20 ulong doff[1]; /* doff[maxn], with 0..n-1 significant */
21 };
23 Biobuf binbuf;
24 Biobuf boutbuf;
25 Biobuf *bin = &binbuf; /* user cmd input */
26 Biobuf *bout = &boutbuf; /* output */
27 Biobuf *bdict; /* dictionary */
28 Biobuf *bindex; /* index file */
29 long indextop; /* index offset at end of file */
30 int lastcmd; /* last executed command */
31 Addr *dot; /* "current" address */
32 Dict *dict; /* current dictionary */
33 int linelen;
34 int breaklen = 60;
35 int outinhibit;
36 int debug;
38 void execcmd(int);
39 int getpref(char*, Rune*);
40 Entry getentry(int);
41 int getfield(Rune*);
42 long locate(Rune*);
43 int parseaddr(char*, char**);
44 int parsecmd(char*);
45 int search(char*, int);
46 long seeknextline(Biobuf*, long);
47 void setdotnext(void);
48 void setdotprev(void);
49 void sortaddr(Addr*);
50 void usage(void);
52 enum {
53 Plen=300, /* max length of a search pattern */
54 Fieldlen=200, /* max length of an index field */
55 Aslots=10, /* initial number of slots in an address */
56 };
58 void
59 main(int argc, char **argv)
60 {
61 int i, cmd, kflag;
62 char *line, *p;
64 Binit(&binbuf, 0, OREAD);
65 Binit(&boutbuf, 1, OWRITE);
66 kflag = 0;
67 line = 0;
68 dict = 0;
70 for(i=0; dicts[i].name; i++){
71 if(access(dicts[i].path, 0)>=0 && access(dicts[i].indexpath, 0)>=0){
72 dict = &dicts[i];
73 break;
74 }
75 }
76 ARGBEGIN {
77 case 'd':
78 p = ARGF();
79 dict = 0;
80 if(p) {
81 for(i=0; dicts[i].name; i++)
82 if(strcmp(p, dicts[i].name)==0) {
83 dict = &dicts[i];
84 break;
85 }
86 }
87 if(!dict)
88 usage();
89 break;
90 case 'c':
91 line = ARGF();
92 if(!line)
93 usage();
94 break;
95 case 'k':
96 kflag++;
97 break;
98 case 'D':
99 debug++;
100 break;
101 default:
102 usage();
103 ARGEND }
104 if(dict == 0){
105 err("no dictionaries present on this system");
106 exits("nodict");
109 if(kflag) {
110 (*dict->printkey)();
111 exits(0);
113 if(argc > 1)
114 usage();
115 else if(argc == 1) {
116 if(line)
117 usage();
118 p = argv[0];
119 line = malloc(strlen(p)+5);
120 sprint(line, "/%s/P\n", p);
122 bdict = Bopen(dict->path, OREAD);
123 if(!bdict) {
124 err("can't open dictionary %s", dict->path);
125 exits("nodict");
127 bindex = Bopen(dict->indexpath, OREAD);
128 if(!bindex) {
129 err("can't open index %s", dict->indexpath);
130 exits("noindex");
132 indextop = Bseek(bindex, 0L, 2);
134 dot = malloc(sizeof(Addr)+(Aslots-1)*sizeof(ulong));
135 dot->n = 0;
136 dot->cur = 0;
137 dot->maxn = Aslots;
138 lastcmd = 0;
140 if(line) {
141 cmd = parsecmd(line);
142 if(cmd)
143 execcmd(cmd);
144 } else {
145 for(;;) {
146 Bprint(bout, "*");
147 Bflush(bout);
148 line = Brdline(bin, '\n');
149 linelen = 0;
150 if(!line)
151 break;
152 cmd = parsecmd(line);
153 if(cmd) {
154 execcmd(cmd);
155 lastcmd = cmd;
159 exits(0);
162 void
163 usage(void)
165 int i;
166 char *a, *b;
168 Bprint(bout, "Usage: %s [-d dict] [-k] [-c cmd] [word]\n", argv0);
169 Bprint(bout, "dictionaries (brackets mark dictionaries not present on this system):\n");
170 for(i = 0; dicts[i].name; i++){
171 a = b = "";
172 if(access(dicts[i].path, 0)<0 || access(dicts[i].indexpath, 0)<0){
173 a = "[";
174 b = "]";
176 Bprint(bout, " %s%s\t%s%s\n", a, dicts[i].name, dicts[i].desc, b);
178 exits("usage");
181 int
182 parsecmd(char *line)
184 char *e;
185 int cmd, ans;
187 if(parseaddr(line, &e) >= 0)
188 line = e;
189 else
190 return 0;
191 cmd = *line;
192 ans = cmd;
193 if(isupper(cmd))
194 cmd = tolower(cmd);
195 if(!(cmd == 'a' || cmd == 'h' || cmd == 'p' || cmd == 'r' ||
196 cmd == '\n')) {
197 err("unknown command %c", cmd);
198 return 0;
200 if(cmd == '\n')
201 switch(lastcmd) {
202 case 0: ans = 'H'; break;
203 case 'H': ans = 'p'; break;
204 default : ans = lastcmd; break;
206 else if(line[1] != '\n' && line[1] != 0)
207 err("extra stuff after command %c ignored", cmd);
208 return ans;
211 void
212 execcmd(int cmd)
214 Entry e;
215 int cur, doall;
217 if(isupper(cmd)) {
218 doall = 1;
219 cmd = tolower(cmd);
220 cur = 0;
221 } else {
222 doall = 0;
223 cur = dot->cur;
225 if(debug && doall && cmd == 'a')
226 Bprint(bout, "%d entries, cur=%d\n", dot->n, cur+1);
227 for(;;){
228 if(cur >= dot->n)
229 break;
230 if(doall) {
231 Bprint(bout, "%d\t", cur+1);
232 linelen += 4 + (cur >= 10);
234 switch(cmd) {
235 case 'a':
236 Bprint(bout, "#%lud\n", dot->doff[cur]);
237 break;
238 case 'h':
239 case 'p':
240 case 'r':
241 e = getentry(cur);
242 (*dict->printentry)(e, cmd);
243 break;
245 cur++;
246 if(doall) {
247 if(cmd == 'p' || cmd == 'r') {
248 Bputc(bout, '\n');
249 linelen = 0;
251 } else
252 break;
254 if(cur >= dot->n)
255 cur = 0;
256 dot->cur = cur;
259 /*
260 * Address syntax: ('.' | '/' re '/' | '!' re '!' | number | '#' number) ('+' | '-')*
261 * Answer goes in dot.
262 * Return -1 if address starts, but get error.
263 * Return 0 if no address.
264 */
265 int
266 parseaddr(char *line, char **eptr)
268 int delim, plen;
269 ulong v;
270 char *e;
271 char pat[Plen];
273 if(*line == '/' || *line == '!') {
274 /* anchored regular expression match; '!' means no folding */
275 if(*line == '/') {
276 delim = '/';
277 e = strpbrk(line+1, "/\n");
278 } else {
279 delim = '!';
280 e = strpbrk(line+1, "!\n");
282 plen = e-line-1;
283 if(plen >= Plen-3) {
284 err("pattern too big");
285 return -1;
287 pat[0] = '^';
288 memcpy(pat+1, line+1, plen);
289 pat[plen+1] = '$';
290 pat[plen+2] = 0;
291 if(*e == '\n')
292 line = e;
293 else
294 line = e+1;
295 if(!search(pat, delim == '/')) {
296 err("pattern not found");
297 return -1;
299 } else if(*line == '#') {
300 /* absolute byte offset into dictionary */
301 line++;
302 if(!isdigit(*line))
303 return -1;
304 v = strtoul(line, &e, 10);
305 line = e;
306 dot->doff[0] = v;
307 dot->n = 1;
308 dot->cur = 0;
309 } else if(isdigit(*line)) {
310 v = strtoul(line, &e, 10);
311 line = e;
312 if(v < 1 || v > dot->n)
313 err(".%d not in range [1,%d], ignored",
314 v, dot->n);
315 else
316 dot->cur = v-1;
317 } else if(*line == '.') {
318 line++;
319 } else {
320 *eptr = line;
321 return 0;
323 while(*line == '+' || *line == '-') {
324 if(*line == '+')
325 setdotnext();
326 else
327 setdotprev();
328 line++;
330 *eptr = line;
331 return 1;
334 /*
335 * Index file is sorted by folded field1.
336 * Method: find pre, a folded prefix of r.e. pat,
337 * and then low = offset to beginning of
338 * line in index file where first match of prefix occurs.
339 * Then go through index until prefix no longer matches,
340 * adding each line that matches real pattern to dot.
341 * Finally, sort dot offsets (uniquing).
342 * We know pat len < Plen, and that it is surrounded by ^..$
343 */
344 int
345 search(char *pat, int dofold)
347 int needre, prelen, match, n;
348 Reprog *re;
349 long ioff, v;
350 Rune pre[Plen];
351 Rune lit[Plen];
352 Rune entry[Fieldlen];
353 char fpat[Plen];
355 prelen = getpref(pat+1, pre);
356 if(pat[prelen+1] == 0 || pat[prelen+1] == '$') {
357 runescpy(lit, pre);
358 if(dofold)
359 fold(lit);
360 needre = 0;
361 SET(re);
362 } else {
363 needre = 1;
364 if(dofold) {
365 foldre(fpat, pat);
366 re = regcomp(fpat);
367 } else
368 re = regcomp(pat);
370 fold(pre);
371 ioff = locate(pre);
372 if(ioff < 0)
373 return 0;
374 dot->n = 0;
375 Bseek(bindex, ioff, 0);
376 for(;;) {
377 if(!getfield(entry))
378 break;
379 if(dofold)
380 fold(entry);
381 if(needre)
382 match = rregexec(re, entry, 0, 0);
383 else
384 match = (acomp(lit, entry) == 0);
385 if(match) {
386 if(!getfield(entry))
387 break;
388 v = runetol(entry);
389 if(dot->n >= dot->maxn) {
390 n = 2*dot->maxn;
391 dot = realloc(dot,
392 sizeof(Addr)+(n-1)*sizeof(long));
393 if(!dot) {
394 err("out of memory");
395 exits("nomem");
397 dot->maxn = n;
399 dot->doff[dot->n++] = v;
400 } else {
401 if(!dofold)
402 fold(entry);
403 if(*pre) {
404 n = acomp(pre, entry);
405 if(n < -1 || (!needre && n < 0))
406 break;
408 /* get to next index entry */
409 if(!getfield(entry))
410 break;
413 sortaddr(dot);
414 dot->cur = 0;
415 return dot->n;
418 /*
419 * Return offset in index file of first line whose folded
420 * first field has pre as a prefix. -1 if none found.
421 */
422 long
423 locate(Rune *pre)
425 long top, bot, mid;
426 Rune entry[Fieldlen];
428 if(*pre == 0)
429 return 0;
430 bot = 0;
431 top = indextop;
432 if(debug>1)
433 fprint(2, "locate looking for prefix %S\n", pre);
434 for(;;) {
435 /*
436 * Loop invariant: foldkey(bot) < pre <= foldkey(top)
437 * and bot < top, and bot,top point at beginning of lines
438 */
439 mid = (top+bot) / 2;
440 mid = seeknextline(bindex, mid);
441 if(debug > 1)
442 fprint(2, "bot=%ld, mid=%ld->%ld, top=%ld\n",
443 bot, (top+bot) / 2, mid, top);
444 if(mid == top || !getfield(entry))
445 break;
446 if(debug > 1)
447 fprint(2, "key=%S\n", entry);
448 /*
449 * here mid is strictly between bot and top
450 */
451 fold(entry);
452 if(acomp(pre, entry) <= 0)
453 top = mid;
454 else
455 bot = mid;
457 /*
458 * bot < top, but they don't necessarily point at successive lines
459 * Use linear search from bot to find first line that pre is a
460 * prefix of
461 */
462 while((bot = seeknextline(bindex, bot)) <= top) {
463 if(!getfield(entry))
464 return -1;
465 if(debug > 1)
466 fprint(2, "key=%S\n", entry);
467 fold(entry);
468 switch(acomp(pre, entry)) {
469 case -2:
470 return -1;
471 case -1:
472 case 0:
473 return bot;
474 case 1:
475 case 2:
476 continue;
479 return -1;
483 /*
484 * Get prefix of non re-metacharacters, runified, into pre,
485 * and return length
486 */
487 int
488 getpref(char *pat, Rune *pre)
490 int n, r;
491 char *p;
493 p = pat;
494 while(*p) {
495 n = chartorune(pre, p);
496 r = *pre;
497 switch(r) {
498 case 0x2e: case 0x2a: case 0x2b: case 0x3f:
499 case 0x5b: case 0x5d: case 0x28: case ')':
500 case 0x7c: case 0x5e: case 0x24:
501 *pre = 0;
502 return p-pat;
503 case L'\\':
504 p += n;
505 p += chartorune(++pre, p);
506 pre++;
507 break;
508 default:
509 p += n;
510 pre++;
513 return p-pat;
516 long
517 seeknextline(Biobuf *b, long off)
519 long c;
521 Bseek(b, off, 0);
522 do {
523 c = Bgetrune(b);
524 } while(c>=0 && c!='\n');
525 return Boffset(b);
528 /*
529 * Get next field out of index file (either tab- or nl- terminated)
530 * Answer in *rp, assumed to be Fieldlen long.
531 * Return 0 if read error first.
532 */
533 int
534 getfield(Rune *rp)
536 long c;
537 int n;
539 for(n=Fieldlen; n-- > 0; ) {
540 if ((c = Bgetrune(bindex)) < 0)
541 return 0;
542 if(c == '\t' || c == '\n') {
543 *rp = L'\0';
544 return 1;
546 *rp++ = c;
548 err("word too long");
549 return 0;
552 /*
553 * A compare longs function suitable for qsort
554 */
555 static int
556 longcmp(const void *av, const void *bv)
558 long v;
559 long *a, *b;
561 a = (long*)av;
562 b = (long*)bv;
564 v = *a - *b;
565 if(v < 0)
566 return -1;
567 else if(v == 0)
568 return 0;
569 else
570 return 1;
573 void
574 sortaddr(Addr *a)
576 int i, j;
577 long v;
579 if(a->n <= 1)
580 return;
582 qsort(a->doff, a->n, sizeof(long), longcmp);
584 /* remove duplicates */
585 for(i=0, j=0; j < a->n; j++) {
586 v = a->doff[j];
587 if(i > 0 && v == a->doff[i-1])
588 continue;
589 a->doff[i++] = v;
591 a->n = i;
594 Entry
595 getentry(int i)
597 long b, e, n;
598 static Entry ans;
599 static int anslen = 0;
601 b = dot->doff[i];
602 e = (*dict->nextoff)(b+1);
603 ans.doff = b;
604 if(e < 0) {
605 err("couldn't seek to entry");
606 ans.start = 0;
607 ans.end = 0;
608 } else {
609 n = e-b;
610 if(n+1 > anslen) {
611 ans.start = realloc(ans.start, n+1);
612 if(!ans.start) {
613 err("out of memory");
614 exits("nomem");
616 anslen = n+1;
618 Bseek(bdict, b, 0);
619 n = Bread(bdict, ans.start, n);
620 ans.end = ans.start + n;
621 *ans.end = 0;
623 return ans;
626 void
627 setdotnext(void)
629 long b;
631 b = (*dict->nextoff)(dot->doff[dot->cur]+1);
632 if(b < 0) {
633 err("couldn't find a next entry");
634 return;
636 dot->doff[0] = b;
637 dot->n = 1;
638 dot->cur = 0;
641 void
642 setdotprev(void)
644 int tryback;
645 long here, last, p;
647 if(dot->cur < 0 || dot->cur >= dot->n)
648 return;
649 tryback = 2000;
650 here = dot->doff[dot->cur];
651 last = 0;
652 while(last == 0) {
653 p = here - tryback;
654 if(p < 0)
655 p = 0;
656 for(;;) {
657 p = (*dict->nextoff)(p+1);
658 if(p < 0)
659 return; /* shouldn't happen */
660 if(p >= here)
661 break;
662 last = p;
664 if(!last) {
665 if(here - tryback < 0) {
666 err("can't find a previous entry");
667 return;
669 tryback = 2*tryback;
672 dot->doff[0] = last;
673 dot->n = 1;
674 dot->cur = 0;