Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bin.h>
4 #include <bio.h>
5 #include <regexp.h>
6 #include "../../../libregexp/regcomp.h"
7 #include "dfa.h"
9 void rdump(Reprog*);
10 void dump(Dreprog*);
12 /*
13 * Standard NFA determinization and DFA minimization.
14 */
15 typedef struct Deter Deter;
16 typedef struct Reiset Reiset;
18 void ddump(Deter*);
20 /* state of determinization */
21 struct Deter
22 {
23 jmp_buf kaboom; /* jmp on error */
25 Bin *bin; /* bin for temporary allocations */
27 Reprog *p; /* program being determinized */
28 uint ninst; /* number of instructions in program */
30 Reiset *alloc; /* chain of all Reisets */
31 Reiset **last;
33 Reiset **hash; /* hash of all Reisets */
34 uint nhash;
36 Reiset *tmp; /* temporaries for walk */
37 uchar *bits;
39 Rune *c; /* ``interesting'' characters */
40 uint nc;
41 };
43 /* set of Reinsts: perhaps we should use a bit list instead of the indices? */
44 struct Reiset
45 {
46 uint *inst; /* indices of instructions in set */
47 uint ninst; /* size of set */
49 Reiset *next; /* d.alloc chain */
50 Reiset *hash; /* d.hash chain */
51 Reiset **delta; /* where to go on each interesting char */
52 uint id; /* assigned id during minimization */
53 uint isfinal; /* is an accepting (final) state */
54 };
56 static Reiset*
57 ralloc(Deter *d, int ninst)
58 {
59 Reiset *t;
61 t = binalloc(&d->bin, sizeof(Reiset)+2*d->nc*sizeof(Reiset*)+sizeof(uint)*ninst, 0);
62 if(t == nil)
63 longjmp(d->kaboom, 1);
64 t->delta = (Reiset**)&t[1];
65 t->inst = (uint*)&t->delta[2*d->nc];
66 return t;
67 }
69 /* find the canonical form a given Reiset */
70 static Reiset*
71 findreiset(Deter *d, Reiset *s)
72 {
73 int i, szinst;
74 uint h;
75 Reiset *t;
77 h = 0;
78 for(i=0; i<s->ninst; i++)
79 h = h*1000003 + s->inst[i];
80 h %= d->nhash;
82 szinst = s->ninst*sizeof(s->inst[0]);
83 for(t=d->hash[h]; t; t=t->hash)
84 if(t->ninst==s->ninst && memcmp(t->inst, s->inst, szinst)==0)
85 return t;
87 t = ralloc(d, s->ninst);
88 t->hash = d->hash[h];
89 d->hash[h] = t;
91 *d->last = t;
92 d->last = &t->next;
93 t->next = 0;
95 t->ninst = s->ninst;
96 memmove(t->inst, s->inst, szinst);
98 /* delta is filled in later */
100 return t;
103 /* convert bits to a real reiset */
104 static Reiset*
105 bits2reiset(Deter *d, uchar *bits)
107 int k;
108 Reiset *s;
110 s = d->tmp;
111 s->ninst = 0;
112 for(k=0; k<d->ninst; k++)
113 if(bits[k])
114 s->inst[s->ninst++] = k;
115 return findreiset(d, s);
118 /* add n to state set; if n < k, need to go around again */
119 static int
120 add(int n, uchar *bits, int k)
122 if(bits[n])
123 return 0;
124 bits[n] = 1;
125 return n < k;
128 /* update bits to follow all the empty (non-character-related) transitions possible */
129 static void
130 followempty(Deter *d, uchar *bits, int bol, int eol)
132 int again, k;
133 Reinst *i;
135 do{
136 again = 0;
137 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
138 if(!bits[k])
139 continue;
140 switch(i->type){
141 case RBRA:
142 case LBRA:
143 again |= add(i->u2.next - d->p->firstinst, bits, k);
144 break;
145 case OR:
146 again |= add(i->u2.left - d->p->firstinst, bits, k);
147 again |= add(i->u1.right - d->p->firstinst, bits, k);
148 break;
149 case BOL:
150 if(bol)
151 again |= add(i->u2.next - d->p->firstinst, bits, k);
152 break;
153 case EOL:
154 if(eol)
155 again |= add(i->u2.next - d->p->firstinst, bits, k);
156 break;
159 }while(again);
161 /*
162 * Clear bits for useless transitions. We could do this during
163 * the switch above, but then we have no guarantee of termination
164 * if we get a loop in the regexp.
165 */
166 for(i=d->p->firstinst, k=0; k < d->ninst; i++, k++){
167 if(!bits[k])
168 continue;
169 switch(i->type){
170 case RBRA:
171 case LBRA:
172 case OR:
173 case BOL:
174 case EOL:
175 bits[k] = 0;
176 break;
181 /*
182 * Where does s go if it sees rune r?
183 * Eol is true if a $ matches the string at the position just after r.
184 */
185 static Reiset*
186 transition(Deter *d, Reiset *s, Rune r, uint eol)
188 int k;
189 uchar *bits;
190 Reinst *i, *inst0;
191 Rune *rp, *ep;
193 bits = d->bits;
194 memset(bits, 0, d->ninst);
196 inst0 = d->p->firstinst;
197 for(k=0; k < s->ninst; k++){
198 i = inst0 + s->inst[k];
199 switch(i->type){
200 default:
201 werrstr("bad reprog: got type %d", i->type);
202 longjmp(d->kaboom, 1);
203 case RBRA:
204 case LBRA:
205 case OR:
206 case BOL:
207 case EOL:
208 werrstr("internal error: got type %d", i->type);
209 longjmp(d->kaboom, 1);
211 case RUNE:
212 if(r == i->u1.r)
213 bits[i->u2.next - inst0] = 1;
214 break;
215 case ANY:
216 if(r != L'\n')
217 bits[i->u2.next - inst0] = 1;
218 break;
219 case ANYNL:
220 bits[i->u2.next - inst0] = 1;
221 break;
222 case NCCLASS:
223 if(r == L'\n')
224 break;
225 /* fall through */
226 case CCLASS:
227 ep = i->u1.cp->end;
228 for(rp = i->u1.cp->spans; rp < ep; rp += 2)
229 if(rp[0] <= r && r <= rp[1])
230 break;
231 if((rp < ep) ^! (i->type == CCLASS))
232 bits[i->u2.next - inst0] = 1;
233 break;
234 case END:
235 break;
239 followempty(d, bits, r=='\n', eol);
240 return bits2reiset(d, bits);
243 static int
244 countinst(Reprog *pp)
246 int n;
247 Reinst *l;
249 n = 0;
250 l = pp->firstinst;
251 while(l++->type)
252 n++;
253 return n;
256 static void
257 set(Deter *d, u32int **tab, Rune r)
259 u32int *u;
261 if((u = tab[r/4096]) == nil){
262 u = binalloc(&d->bin, 4096/8, 1);
263 if(u == nil)
264 longjmp(d->kaboom, 1);
265 tab[r/4096] = u;
267 u[(r%4096)/32] |= 1<<(r%32);
270 /*
271 * Compute the list of important characters.
272 * Other characters behave like the ones that surround them.
273 */
274 static void
275 findchars(Deter *d, Reprog *p)
277 u32int *tab[65536/4096], *u, x;
278 Reinst *i;
279 Rune *rp, *ep;
280 int k, m, n, a;
282 memset(tab, 0, sizeof tab);
283 set(d, tab, 0);
284 set(d, tab, 0xFFFF);
285 for(i=p->firstinst; i->type; i++){
286 switch(i->type){
287 case ANY:
288 set(d, tab, L'\n'-1);
289 set(d, tab, L'\n');
290 set(d, tab, L'\n'+1);
291 break;
292 case RUNE:
293 set(d, tab, i->u1.r-1);
294 set(d, tab, i->u1.r);
295 set(d, tab, i->u1.r+1);
296 break;
297 case NCCLASS:
298 set(d, tab, L'\n'-1);
299 set(d, tab, L'\n');
300 set(d, tab, L'\n'+1);
301 /* fall through */
302 case CCLASS:
303 ep = i->u1.cp->end;
304 for(rp = i->u1.cp->spans; rp < ep; rp += 2){
305 set(d, tab, rp[0]-1);
306 set(d, tab, rp[0]);
307 set(d, tab, rp[1]);
308 set(d, tab, rp[1]+1);
310 break;
314 n = 0;
315 for(k=0; k<nelem(tab); k++){
316 if((u = tab[k]) == nil)
317 continue;
318 for(m=0; m<4096/32; m++){
319 if((x = u[m]) == 0)
320 continue;
321 for(a=0; a<32; a++)
322 if(x&(1<<a))
323 n++;
327 d->c = binalloc(&d->bin, (n+1)*sizeof(Rune), 0);
328 if(d->c == 0)
329 longjmp(d->kaboom, 1);
330 d->nc = n;
332 n = 0;
333 for(k=0; k<nelem(tab); k++){
334 if((u = tab[k]) == nil)
335 continue;
336 for(m=0; m<4096/32; m++){
337 if((x = u[m]) == 0)
338 continue;
339 for(a=0; a<32; a++)
340 if(x&(1<<a))
341 d->c[n++] = k*4096+m*32+a;
345 d->c[n] = 0;
346 if(n != d->nc)
347 abort();
350 /*
351 * convert the Deter and Reisets into a Dreprog.
352 * if dp and c are nil, just return the count of Drecases needed.
353 */
354 static int
355 buildprog(Deter *d, Reiset **id2set, int nid, Dreprog *dp, Drecase *c)
357 int i, j, id, n, nn;
358 Dreinst *di;
359 Reiset *s;
361 nn = 0;
362 di = 0;
363 for(i=0; i<nid; i++){
364 s = id2set[i];
365 if(c){
366 di = &dp->inst[i];
367 di->isfinal = s->isfinal;
369 n = 0;
370 id = -1;
371 for(j=0; j<2*d->nc; j++){
372 if(s->delta[j]->id != id){
373 id = s->delta[j]->id;
374 if(c){
375 c[n].start = ((j/d->nc)<<16) | d->c[j%d->nc];
376 c[n].next = &dp->inst[id];
378 n++;
381 if(c){
382 if(n == 1 && c[0].next == di)
383 di->isloop = 1;
384 di->c = c;
385 di->nc = n;
386 c += n;
388 nn += n;
390 return nn;
393 Dreprog*
394 dregcvt(Reprog *p)
396 uchar *bits;
397 uint again, n, nid, id;
398 Deter d;
399 Reiset **id2set, *s, *t, *start[4];
400 Dreprog *dp;
401 Drecase *c;
403 memset(&d, 0, sizeof d);
405 if(setjmp(d.kaboom)){
406 binfree(&d.bin);
407 return nil;
410 d.p = p;
411 d.ninst = countinst(p);
413 d.last = &d.alloc;
415 n = d.ninst;
416 /* round up to power of two; this loop is the least of our efficiency problems */
417 while(n&(n-1))
418 n++;
419 d.nhash = n;
420 d.hash = binalloc(&d.bin, d.nhash*sizeof(Reinst*), 1);
422 /* get list of important runes */
423 findchars(&d, p);
425 #ifdef DUMP
426 print("relevant chars are: «%S»\n", d.c+1);
427 #endif
429 d.bits = bits = binalloc(&d.bin, d.ninst, 0);
430 d.tmp = ralloc(&d, d.ninst);
432 /*
433 * Convert to DFA
434 */
436 /* 4 start states, depending on initial bol, eol */
437 for(n=0; n<4; n++){
438 memset(bits, 0, d.ninst);
439 bits[p->startinst - p->firstinst] = 1;
440 followempty(&d, bits, n&1, n&2);
441 start[n] = bits2reiset(&d, bits);
444 /* explore the reiset space */
445 for(s=d.alloc; s; s=s->next)
446 for(n=0; n<2*d.nc; n++)
447 s->delta[n] = transition(&d, s, d.c[n%d.nc], n/d.nc);
449 #ifdef DUMP
450 nid = 0;
451 for(s=d.alloc; s; s=s->next)
452 s->id = nid++;
453 ddump(&d);
454 #endif
456 /*
457 * Minimize.
458 */
460 /* first class division is final or not */
461 for(s=d.alloc; s; s=s->next){
462 s->isfinal = 0;
463 for(n=0; n<s->ninst; n++)
464 if(p->firstinst[s->inst[n]].type == END)
465 s->isfinal = 1;
466 s->id = s->isfinal;
469 /* divide states with different transition tables in id space */
470 nid = 2;
471 do{
472 again = 0;
473 for(s=d.alloc; s; s=s->next){
474 id = -1;
475 for(t=s->next; t; t=t->next){
476 if(s->id != t->id)
477 continue;
478 for(n=0; n<2*d.nc; n++){
479 /* until we finish the for(t) loop, s->id and id are same */
480 if((s->delta[n]->id == t->delta[n]->id)
481 || (s->delta[n]->id == s->id && t->delta[n]->id == id)
482 || (s->delta[n]->id == id && t->delta[n]->id == s->id))
483 continue;
484 break;
486 if(n == 2*d.nc)
487 continue;
488 if(id == -1)
489 id = nid++;
490 t->id = id;
491 again = 1;
494 }while(again);
496 #ifdef DUMP
497 ddump(&d);
498 #endif
500 /* build dreprog */
501 id2set = binalloc(&d.bin, nid*sizeof(Reiset*), 1);
502 if(id2set == nil)
503 longjmp(d.kaboom, 1);
504 for(s=d.alloc; s; s=s->next)
505 id2set[s->id] = s;
507 n = buildprog(&d, id2set, nid, nil, nil);
508 dp = mallocz(sizeof(Dreprog)+nid*sizeof(Dreinst)+n*sizeof(Drecase), 1);
509 if(dp == nil)
510 longjmp(d.kaboom, 1);
511 c = (Drecase*)&dp->inst[nid];
512 buildprog(&d, id2set, nid, dp, c);
514 for(n=0; n<4; n++)
515 dp->start[n] = &dp->inst[start[n]->id];
516 dp->ninst = nid;
518 binfree(&d.bin);
519 return dp;
522 int
523 dregexec(Dreprog *p, char *s, int bol)
525 Rune r;
526 ulong rr;
527 Dreinst *i;
528 Drecase *c, *ec;
529 int best, n;
530 char *os;
532 i = p->start[(bol ? 1 : 0) | (s[1]=='\n' ? 2 : 0)];
533 best = -1;
534 os = s;
535 for(; *s; s+=n){
536 if(i->isfinal)
537 best = s - os;
538 if(i->isloop){
539 if(i->isfinal)
540 return strlen(os);
541 else
542 return best;
544 if((*s&0xFF) < Runeself){
545 r = *s;
546 n = 1;
547 }else
548 n = chartorune(&r, s);
549 c = i->c;
550 ec = c+i->nc;
551 rr = r;
552 if(s[n] == '\n' || s[n] == '\0')
553 rr |= 0x10000;
554 for(; c<ec; c++){
555 if(c->start > rr){
556 i = c[-1].next;
557 goto Out;
560 i = ec[-1].next;
561 Out:;
563 if(i->isfinal)
564 best = s - os;
565 return best;
569 #ifdef DUMP
570 void
571 ddump(Deter *d)
573 int i, id;
574 Reiset *s;
576 for(s=d->alloc; s; s=s->next){
577 print("%d ", s->id);
578 id = -1;
579 for(i=0; i<2*d->nc; i++){
580 if(id != s->delta[i]->id){
581 if(i==0)
582 print(" [");
583 else if(i/d->nc)
584 print(" [%C$", d->c[i%d->nc]);
585 else
586 print(" [%C", d->c[i%d->nc]);
587 print(" %d]", s->delta[i]->id);
588 id = s->delta[i]->id;
591 print("\n");
595 void
596 rdump(Reprog *pp)
598 Reinst *l;
599 Rune *p;
601 l = pp->firstinst;
602 do{
603 print("%ld:\t0%o\t%ld\t%ld", l-pp->firstinst, l->type,
604 l->left-pp->firstinst, l->right-pp->firstinst);
605 if(l->type == RUNE)
606 print("\t%C\n", l->r);
607 else if(l->type == CCLASS || l->type == NCCLASS){
608 print("\t[");
609 if(l->type == NCCLASS)
610 print("^");
611 for(p = l->cp->spans; p < l->cp->end; p += 2)
612 if(p[0] == p[1])
613 print("%C", p[0]);
614 else
615 print("%C-%C", p[0], p[1]);
616 print("]\n");
617 } else
618 print("\n");
619 }while(l++->type);
622 void
623 dump(Dreprog *pp)
625 int i, j;
626 Dreinst *l;
628 print("start %ld %ld %ld %ld\n",
629 pp->start[0]-pp->inst,
630 pp->start[1]-pp->inst,
631 pp->start[2]-pp->inst,
632 pp->start[3]-pp->inst);
634 for(i=0; i<pp->ninst; i++){
635 l = &pp->inst[i];
636 print("%d:", i);
637 for(j=0; j<l->nc; j++){
638 print(" [");
639 if(j == 0)
640 if(l->c[j].start != 1)
641 abort();
642 if(j != 0)
643 print("%C%s", l->c[j].start&0xFFFF, (l->c[j].start&0x10000) ? "$" : "");
644 print("-");
645 if(j != l->nc-1)
646 print("%C%s", (l->c[j+1].start&0xFFFF)-1, (l->c[j+1].start&0x10000) ? "$" : "");
647 print("] %ld", l->c[j].next - pp->inst);
649 if(l->isfinal)
650 print(" final");
651 if(l->isloop)
652 print(" loop");
653 print("\n");
658 void
659 main(int argc, char **argv)
661 int i;
662 Reprog *p;
663 Dreprog *dp;
665 i = 1;
666 p = regcomp(argv[i]);
667 if(p == 0){
668 print("=== %s: bad regexp\n", argv[i]);
670 // print("=== %s\n", argv[i]);
671 // rdump(p);
672 dp = dregcvt(p);
673 print("=== dfa\n");
674 dump(dp);
676 for(i=2; i<argc; i++)
677 print("match %d\n", dregexec(dp, argv[i], 0));
678 exits(0);
680 #endif
682 void
683 Bprintdfa(Biobuf *b, Dreprog *p)
685 int i, j, nc;
687 Bprint(b, "# dreprog\n");
688 nc = 0;
689 for(i=0; i<p->ninst; i++)
690 nc += p->inst[i].nc;
691 Bprint(b, "%d %d %ld %ld %ld %ld\n", p->ninst, nc,
692 p->start[0]-p->inst, p->start[1]-p->inst,
693 p->start[2]-p->inst, p->start[3]-p->inst);
694 for(i=0; i<p->ninst; i++){
695 Bprint(b, "%d %d %d", p->inst[i].isfinal, p->inst[i].isloop, p->inst[i].nc);
696 for(j=0; j<p->inst[i].nc; j++)
697 Bprint(b, " %d %ld", p->inst[i].c[j].start, p->inst[i].c[j].next-p->inst);
698 Bprint(b, "\n");
702 static char*
703 egetline(Biobuf *b, int c, jmp_buf jb)
705 char *p;
707 p = Brdline(b, c);
708 if(p == nil)
709 longjmp(jb, 1);
710 p[Blinelen(b)-1] = '\0';
711 return p;
714 static void
715 egetc(Biobuf *b, int c, jmp_buf jb)
717 if(Bgetc(b) != c)
718 longjmp(jb, 1);
721 static int
722 egetnum(Biobuf *b, int want, jmp_buf jb)
724 int c;
725 int n, first;
727 n = 0;
728 first = 1;
729 while((c = Bgetc(b)) != Beof){
730 if(c < '0' || c > '9'){
731 if(want == 0){
732 Bungetc(b);
733 c = 0;
735 if(first || c != want){
736 werrstr("format error");
737 longjmp(jb, 1);
739 return n;
741 n = n*10 + c - '0';
742 first = 0;
744 werrstr("unexpected eof");
745 longjmp(jb, 1);
746 return -1;
749 Dreprog*
750 Breaddfa(Biobuf *b)
752 char *s;
753 int ninst, nc;
754 jmp_buf jb;
755 Dreprog *p;
756 Drecase *c;
757 Dreinst *l;
758 int j, k;
760 p = nil;
761 if(setjmp(jb)){
762 free(p);
763 return nil;
766 s = egetline(b, '\n', jb);
767 if(strcmp(s, "# dreprog") != 0){
768 werrstr("format error");
769 longjmp(jb, 1);
772 ninst = egetnum(b, ' ', jb);
773 nc = egetnum(b, ' ', jb);
775 p = mallocz(sizeof(Dreprog)+ninst*sizeof(Dreinst)+nc*sizeof(Drecase), 1);
776 if(p == nil)
777 longjmp(jb, 1);
778 c = (Drecase*)&p->inst[ninst];
780 p->start[0] = &p->inst[egetnum(b, ' ', jb)];
781 p->start[1] = &p->inst[egetnum(b, ' ', jb)];
782 p->start[2] = &p->inst[egetnum(b, ' ', jb)];
783 p->start[3] = &p->inst[egetnum(b, '\n', jb)];
785 for(j=0; j<ninst; j++){
786 l = &p->inst[j];
787 l->isfinal = egetnum(b, ' ', jb);
788 l->isloop = egetnum(b, ' ', jb);
789 l->nc = egetnum(b, 0, jb);
790 l->c = c;
791 for(k=0; k<l->nc; k++){
792 egetc(b, ' ', jb);
793 c->start = egetnum(b, ' ', jb);
794 c->next = &p->inst[egetnum(b, 0, jb)];
795 c++;
797 egetc(b, '\n', jb);
799 return p;