Blob


1 #include "sam.h"
3 Rangeset sel;
4 String lastregexp;
5 /*
6 * Machine Information
7 */
8 typedef struct Inst Inst;
10 struct Inst
11 {
12 long type; /* < OPERATOR ==> literal, otherwise action */
13 union {
14 int rsid;
15 int rsubid;
16 int class;
17 struct Inst *rother;
18 struct Inst *rright;
19 } r;
20 union{
21 struct Inst *lleft;
22 struct Inst *lnext;
23 } l;
24 };
25 #define sid r.rsid
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
30 #define left l.lleft
31 #define next l.lnext
33 #define NPROG 1024
34 Inst program[NPROG];
35 Inst *progp;
36 Inst *startinst; /* First inst. of program; might not be program[0] */
37 Inst *bstartinst; /* same for backwards machine */
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 Inst *inst; /* Instruction of the thread */
43 Rangeset se;
44 Posn startp; /* first char of match */
45 };
47 #define NLIST 127
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
53 /*
54 * Actions and Tokens
55 *
56 * 0x10000xx are operators, value == precedence
57 * 0x20000xx are tokens, i.e. operands for operators
58 */
59 #define OPERATOR 0x1000000 /* Bit set in all operators */
60 #define START (OPERATOR+0) /* Start, used for marker on stack */
61 #define RBRA (OPERATOR+1) /* Right bracket, ) */
62 #define LBRA (OPERATOR+2) /* Left bracket, ( */
63 #define OR (OPERATOR+3) /* Alternation, | */
64 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
65 #define STAR (OPERATOR+5) /* Closure, * */
66 #define PLUS (OPERATOR+6) /* a+ == aa* */
67 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
68 #define ANY 0x2000000 /* Any character but newline, . */
69 #define NOP (ANY+1) /* No operation, internal use only */
70 #define BOL (ANY+2) /* Beginning of line, ^ */
71 #define EOL (ANY+3) /* End of line, $ */
72 #define CCLASS (ANY+4) /* Character class, [] */
73 #define NCCLASS (ANY+5) /* Negated character class, [^] */
74 #define END (ANY+0x77) /* Terminate: match found */
76 #define ISATOR OPERATOR
77 #define ISAND ANY
79 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
81 /*
82 * Parser Information
83 */
84 typedef struct Node Node;
85 struct Node
86 {
87 Inst *first;
88 Inst *last;
89 };
91 #define NSTACK 20
92 Node andstack[NSTACK];
93 Node *andp;
94 int atorstack[NSTACK];
95 int *atorp;
96 int lastwasand; /* Last token was operand */
97 int cursubid;
98 int subidstack[NSTACK];
99 int *subidp;
100 int backwards;
101 int nbra;
102 Rune *exprp; /* pointer to next character in source expression */
103 #define DCLASS 10 /* allocation increment */
104 int nclass; /* number active */
105 int Nclass; /* high water mark */
106 Rune **class;
107 int negateclass;
109 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
110 void newmatch(Rangeset*);
111 void bnewmatch(Rangeset*);
112 void pushand(Inst*, Inst*);
113 void pushator(int);
114 Node *popand(int);
115 int popator(void);
116 void startlex(Rune*);
117 int lex(void);
118 void operator(int);
119 void operand(int);
120 void evaluntil(int);
121 void optimize(Inst*);
122 void bldcclass(void);
124 void
125 regerror(Err e)
127 Strzero(&lastregexp);
128 error(e);
131 void
132 regerror_c(Err e, int c)
134 Strzero(&lastregexp);
135 error_c(e, c);
138 Inst *
139 newinst(int t)
141 if(progp >= &program[NPROG])
142 regerror(Etoolong);
143 progp->type = t;
144 progp->left = 0;
145 progp->right = 0;
146 return progp++;
149 Inst *
150 realcompile(Rune *s)
152 int token;
154 startlex(s);
155 atorp = atorstack;
156 andp = andstack;
157 subidp = subidstack;
158 cursubid = 0;
159 lastwasand = FALSE;
160 /* Start with a low priority operator to prime parser */
161 pushator(START-1);
162 while((token=lex()) != END){
163 if((token&ISATOR) == OPERATOR)
164 operator(token);
165 else
166 operand(token);
168 /* Close with a low priority operator */
169 evaluntil(START);
170 /* Force END */
171 operand(END);
172 evaluntil(START);
173 if(nbra)
174 regerror(Eleftpar);
175 --andp; /* points to first and only operand */
176 return andp->first;
179 void
180 compile(String *s)
182 int i;
183 Inst *oprogp;
185 if(Strcmp(s, &lastregexp)==0)
186 return;
187 for(i=0; i<nclass; i++)
188 free(class[i]);
189 nclass = 0;
190 progp = program;
191 backwards = FALSE;
192 startinst = realcompile(s->s);
193 optimize(program);
194 oprogp = progp;
195 backwards = TRUE;
196 bstartinst = realcompile(s->s);
197 optimize(oprogp);
198 Strduplstr(&lastregexp, s);
201 void
202 operand(int t)
204 Inst *i;
205 if(lastwasand)
206 operator(CAT); /* catenate is implicit */
207 i = newinst(t);
208 if(t == CCLASS){
209 if(negateclass)
210 i->type = NCCLASS; /* UGH */
211 i->rclass = nclass-1; /* UGH */
213 pushand(i, i);
214 lastwasand = TRUE;
217 void
218 operator(int t)
220 if(t==RBRA && --nbra<0)
221 regerror(Erightpar);
222 if(t==LBRA){
223 /*
224 * if(++cursubid >= NSUBEXP)
225 * regerror(Esubexp);
226 */
227 cursubid++; /* silently ignored */
228 nbra++;
229 if(lastwasand)
230 operator(CAT);
231 }else
232 evaluntil(t);
233 if(t!=RBRA)
234 pushator(t);
235 lastwasand = FALSE;
236 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
237 lastwasand = TRUE; /* these look like operands */
240 void
241 cant(char *s)
243 char buf[100];
245 sprint(buf, "regexp: can't happen: %s", s);
246 panic(buf);
249 void
250 pushand(Inst *f, Inst *l)
252 if(andp >= &andstack[NSTACK])
253 cant("operand stack overflow");
254 andp->first = f;
255 andp->last = l;
256 andp++;
259 void
260 pushator(int t)
262 if(atorp >= &atorstack[NSTACK])
263 cant("operator stack overflow");
264 *atorp++=t;
265 if(cursubid >= NSUBEXP)
266 *subidp++= -1;
267 else
268 *subidp++=cursubid;
271 Node *
272 popand(int op)
274 if(andp <= &andstack[0])
275 if(op)
276 regerror_c(Emissop, op);
277 else
278 regerror(Ebadregexp);
279 return --andp;
282 int
283 popator(void)
285 if(atorp <= &atorstack[0])
286 cant("operator stack underflow");
287 --subidp;
288 return *--atorp;
291 void
292 evaluntil(int pri)
294 Node *op1, *op2, *t;
295 Inst *inst1, *inst2;
297 while(pri==RBRA || atorp[-1]>=pri){
298 switch(popator()){
299 case LBRA:
300 op1 = popand('(');
301 inst2 = newinst(RBRA);
302 inst2->subid = *subidp;
303 op1->last->next = inst2;
304 inst1 = newinst(LBRA);
305 inst1->subid = *subidp;
306 inst1->next = op1->first;
307 pushand(inst1, inst2);
308 return; /* must have been RBRA */
309 default:
310 panic("unknown regexp operator");
311 break;
312 case OR:
313 op2 = popand('|');
314 op1 = popand('|');
315 inst2 = newinst(NOP);
316 op2->last->next = inst2;
317 op1->last->next = inst2;
318 inst1 = newinst(OR);
319 inst1->right = op1->first;
320 inst1->left = op2->first;
321 pushand(inst1, inst2);
322 break;
323 case CAT:
324 op2 = popand(0);
325 op1 = popand(0);
326 if(backwards && op2->first->type!=END)
327 t = op1, op1 = op2, op2 = t;
328 op1->last->next = op2->first;
329 pushand(op1->first, op2->last);
330 break;
331 case STAR:
332 op2 = popand('*');
333 inst1 = newinst(OR);
334 op2->last->next = inst1;
335 inst1->right = op2->first;
336 pushand(inst1, inst1);
337 break;
338 case PLUS:
339 op2 = popand('+');
340 inst1 = newinst(OR);
341 op2->last->next = inst1;
342 inst1->right = op2->first;
343 pushand(op2->first, inst1);
344 break;
345 case QUEST:
346 op2 = popand('?');
347 inst1 = newinst(OR);
348 inst2 = newinst(NOP);
349 inst1->left = inst2;
350 inst1->right = op2->first;
351 op2->last->next = inst2;
352 pushand(inst1, inst2);
353 break;
359 void
360 optimize(Inst *start)
362 Inst *inst, *target;
364 for(inst=start; inst->type!=END; inst++){
365 target = inst->next;
366 while(target->type == NOP)
367 target = target->next;
368 inst->next = target;
372 #ifdef DEBUG
373 void
374 dumpstack(void){
375 Node *stk;
376 int *ip;
378 dprint("operators\n");
379 for(ip = atorstack; ip<atorp; ip++)
380 dprint("0%o\n", *ip);
381 dprint("operands\n");
382 for(stk = andstack; stk<andp; stk++)
383 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
385 void
386 dump(void){
387 Inst *l;
389 l = program;
390 do{
391 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
392 l->left-program, l->right-program);
393 }while(l++->type);
395 #endif
397 void
398 startlex(Rune *s)
400 exprp = s;
401 nbra = 0;
405 int
406 lex(void){
407 int c= *exprp++;
409 switch(c){
410 case '\\':
411 if(*exprp)
412 if((c= *exprp++)=='n')
413 c='\n';
414 break;
415 case 0:
416 c = END;
417 --exprp; /* In case we come here again */
418 break;
419 case '*':
420 c = STAR;
421 break;
422 case '?':
423 c = QUEST;
424 break;
425 case '+':
426 c = PLUS;
427 break;
428 case '|':
429 c = OR;
430 break;
431 case '.':
432 c = ANY;
433 break;
434 case '(':
435 c = LBRA;
436 break;
437 case ')':
438 c = RBRA;
439 break;
440 case '^':
441 c = BOL;
442 break;
443 case '$':
444 c = EOL;
445 break;
446 case '[':
447 c = CCLASS;
448 bldcclass();
449 break;
451 return c;
454 long
455 nextrec(void){
456 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
457 regerror(Ebadclass);
458 if(exprp[0] == '\\'){
459 exprp++;
460 if(*exprp=='n'){
461 exprp++;
462 return '\n';
464 return *exprp++|QUOTED;
466 return *exprp++;
469 void
470 bldcclass(void)
472 long c1, c2, n, na;
473 Rune *classp;
475 classp = emalloc(DCLASS*RUNESIZE);
476 n = 0;
477 na = DCLASS;
478 /* we have already seen the '[' */
479 if(*exprp == '^'){
480 classp[n++] = '\n'; /* don't match newline in negate case */
481 negateclass = TRUE;
482 exprp++;
483 }else
484 negateclass = FALSE;
485 while((c1 = nextrec()) != ']'){
486 if(c1 == '-'){
487 Error:
488 free(classp);
489 regerror(Ebadclass);
491 if(n+4 >= na){ /* 3 runes plus NUL */
492 na += DCLASS;
493 classp = erealloc(classp, na*RUNESIZE);
495 if(*exprp == '-'){
496 exprp++; /* eat '-' */
497 if((c2 = nextrec()) == ']')
498 goto Error;
499 classp[n+0] = Runemax;
500 classp[n+1] = c1;
501 classp[n+2] = c2;
502 n += 3;
503 }else
504 classp[n++] = c1 & ~QUOTED;
506 classp[n] = 0;
507 if(nclass == Nclass){
508 Nclass += DCLASS;
509 class = erealloc(class, Nclass*sizeof(Rune*));
511 class[nclass++] = classp;
514 int
515 classmatch(int classno, int c, int negate)
517 Rune *p;
519 p = class[classno];
520 while(*p){
521 if(*p == Runemax){
522 if(p[1]<=c && c<=p[2])
523 return !negate;
524 p += 3;
525 }else if(*p++ == c)
526 return !negate;
528 return negate;
531 /*
532 * Note optimization in addinst:
533 * *l must be pending when addinst called; if *l has been looked
534 * at already, the optimization is a bug.
535 */
536 int
537 addinst(Ilist *l, Inst *inst, Rangeset *sep)
539 Ilist *p;
541 for(p = l; p->inst; p++){
542 if(p->inst==inst){
543 if((sep)->p[0].p1 < p->se.p[0].p1)
544 p->se= *sep; /* this would be bug */
545 return 0; /* It's already there */
548 p->inst = inst;
549 p->se= *sep;
550 (p+1)->inst = 0;
551 return 1;
554 int
555 execute(File *f, Posn startp, Posn eof)
557 int flag = 0;
558 Inst *inst;
559 Ilist *tlp;
560 Posn p = startp;
561 int nnl = 0, ntl;
562 int c;
563 int wrapped = 0;
564 int startchar = startinst->type<OPERATOR? startinst->type : 0;
566 list[0][0].inst = list[1][0].inst = 0;
567 sel.p[0].p1 = -1;
568 /* Execute machine once for each character */
569 for(;;p++){
570 doloop:
571 c = filereadc(f, p);
572 if(p>=eof || c<0){
573 switch(wrapped++){
574 case 0: /* let loop run one more click */
575 case 2:
576 break;
577 case 1: /* expired; wrap to beginning */
578 if(sel.p[0].p1>=0 || eof!=INFINITY)
579 goto Return;
580 list[0][0].inst = list[1][0].inst = 0;
581 p = 0;
582 goto doloop;
583 default:
584 goto Return;
586 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
587 break;
588 /* fast check for first char */
589 if(startchar && nnl==0 && c!=startchar)
590 continue;
591 tl = list[flag];
592 nl = list[flag^=1];
593 nl->inst = 0;
594 ntl = nnl;
595 nnl = 0;
596 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
597 /* Add first instruction to this list */
598 sempty.p[0].p1 = p;
599 if(addinst(tl, startinst, &sempty))
600 if(++ntl >= NLIST)
601 Overflow:
602 error(Eoverflow);
604 /* Execute machine until this list is empty */
605 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
606 Switchstmt:
607 switch(inst->type){
608 default: /* regular character */
609 if(inst->type==c){
610 Addinst:
611 if(addinst(nl, inst->next, &tlp->se))
612 if(++nnl >= NLIST)
613 goto Overflow;
615 break;
616 case LBRA:
617 if(inst->subid>=0)
618 tlp->se.p[inst->subid].p1 = p;
619 inst = inst->next;
620 goto Switchstmt;
621 case RBRA:
622 if(inst->subid>=0)
623 tlp->se.p[inst->subid].p2 = p;
624 inst = inst->next;
625 goto Switchstmt;
626 case ANY:
627 if(c!='\n')
628 goto Addinst;
629 break;
630 case BOL:
631 if(p==0 || filereadc(f, p - 1)=='\n'){
632 Step:
633 inst = inst->next;
634 goto Switchstmt;
636 break;
637 case EOL:
638 if(c == '\n')
639 goto Step;
640 break;
641 case CCLASS:
642 if(c>=0 && classmatch(inst->rclass, c, 0))
643 goto Addinst;
644 break;
645 case NCCLASS:
646 if(c>=0 && classmatch(inst->rclass, c, 1))
647 goto Addinst;
648 break;
649 case OR:
650 /* evaluate right choice later */
651 if(addinst(tl, inst->right, &tlp->se))
652 if(++ntl >= NLIST)
653 goto Overflow;
654 /* efficiency: advance and re-evaluate */
655 inst = inst->left;
656 goto Switchstmt;
657 case END: /* Match! */
658 tlp->se.p[0].p2 = p;
659 newmatch(&tlp->se);
660 break;
664 Return:
665 return sel.p[0].p1>=0;
668 void
669 newmatch(Rangeset *sp)
671 int i;
673 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
674 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
675 for(i = 0; i<NSUBEXP; i++)
676 sel.p[i] = sp->p[i];
679 int
680 bexecute(File *f, Posn startp)
682 int flag = 0;
683 Inst *inst;
684 Ilist *tlp;
685 Posn p = startp;
686 int nnl = 0, ntl;
687 int c;
688 int wrapped = 0;
689 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
691 list[0][0].inst = list[1][0].inst = 0;
692 sel.p[0].p1= -1;
693 /* Execute machine once for each character, including terminal NUL */
694 for(;;--p){
695 doloop:
696 if((c = filereadc(f, p - 1))==-1){
697 switch(wrapped++){
698 case 0: /* let loop run one more click */
699 case 2:
700 break;
701 case 1: /* expired; wrap to end */
702 if(sel.p[0].p1>=0)
703 goto Return;
704 list[0][0].inst = list[1][0].inst = 0;
705 p = f->b.nc;
706 goto doloop;
707 case 3:
708 default:
709 goto Return;
711 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
712 break;
713 /* fast check for first char */
714 if(startchar && nnl==0 && c!=startchar)
715 continue;
716 tl = list[flag];
717 nl = list[flag^=1];
718 nl->inst = 0;
719 ntl = nnl;
720 nnl = 0;
721 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
722 /* Add first instruction to this list */
723 /* the minus is so the optimizations in addinst work */
724 sempty.p[0].p1 = -p;
725 if(addinst(tl, bstartinst, &sempty))
726 if(++ntl >= NLIST)
727 Overflow:
728 error(Eoverflow);
730 /* Execute machine until this list is empty */
731 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
732 Switchstmt:
733 switch(inst->type){
734 default: /* regular character */
735 if(inst->type == c){
736 Addinst:
737 if(addinst(nl, inst->next, &tlp->se))
738 if(++nnl >= NLIST)
739 goto Overflow;
741 break;
742 case LBRA:
743 if(inst->subid>=0)
744 tlp->se.p[inst->subid].p1 = p;
745 inst = inst->next;
746 goto Switchstmt;
747 case RBRA:
748 if(inst->subid >= 0)
749 tlp->se.p[inst->subid].p2 = p;
750 inst = inst->next;
751 goto Switchstmt;
752 case ANY:
753 if(c != '\n')
754 goto Addinst;
755 break;
756 case BOL:
757 if(c=='\n' || p==0){
758 Step:
759 inst = inst->next;
760 goto Switchstmt;
762 break;
763 case EOL:
764 if(p==f->b.nc || filereadc(f, p)=='\n')
765 goto Step;
766 break;
767 case CCLASS:
768 if(c>=0 && classmatch(inst->rclass, c, 0))
769 goto Addinst;
770 break;
771 case NCCLASS:
772 if(c>=0 && classmatch(inst->rclass, c, 1))
773 goto Addinst;
774 break;
775 case OR:
776 /* evaluate right choice later */
777 if(addinst(tlp, inst->right, &tlp->se))
778 if(++ntl >= NLIST)
779 goto Overflow;
780 /* efficiency: advance and re-evaluate */
781 inst = inst->left;
782 goto Switchstmt;
783 case END: /* Match! */
784 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
785 tlp->se.p[0].p2 = p;
786 bnewmatch(&tlp->se);
787 break;
791 Return:
792 return sel.p[0].p1>=0;
795 void
796 bnewmatch(Rangeset *sp)
798 int i;
799 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
800 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
801 sel.p[i].p1 = sp->p[i].p2;
802 sel.p[i].p2 = sp->p[i].p1;