Blob


1 #include "sam.h"
3 Rangeset sel;
4 String lastregexp;
5 /*
6 * Machine Information
7 */
8 typedef struct Inst Inst;
10 struct Inst
11 {
12 long type; /* < 0x10000 ==> literal, otherwise action */
13 union {
14 int rsid;
15 int rsubid;
16 int class;
17 struct Inst *rother;
18 struct Inst *rright;
19 } r;
20 union{
21 struct Inst *lleft;
22 struct Inst *lnext;
23 } l;
24 };
25 #define sid r.rsid
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
30 #define left l.lleft
31 #define next l.lnext
33 #define NPROG 1024
34 Inst program[NPROG];
35 Inst *progp;
36 Inst *startinst; /* First inst. of program; might not be program[0] */
37 Inst *bstartinst; /* same for backwards machine */
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 Inst *inst; /* Instruction of the thread */
43 Rangeset se;
44 Posn startp; /* first char of match */
45 };
47 #define NLIST 127
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
53 /*
54 * Actions and Tokens
55 *
56 * 0x100xx are operators, value == precedence
57 * 0x200xx are tokens, i.e. operands for operators
58 */
59 #define OPERATOR 0x10000 /* Bitmask of all operators */
60 #define START 0x10000 /* Start, used for marker on stack */
61 #define RBRA 0x10001 /* Right bracket, ) */
62 #define LBRA 0x10002 /* Left bracket, ( */
63 #define OR 0x10003 /* Alternation, | */
64 #define CAT 0x10004 /* Concatentation, implicit operator */
65 #define STAR 0x10005 /* Closure, * */
66 #define PLUS 0x10006 /* a+ == aa* */
67 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
68 #define ANY 0x20000 /* Any character but newline, . */
69 #define NOP 0x20001 /* No operation, internal use only */
70 #define BOL 0x20002 /* Beginning of line, ^ */
71 #define EOL 0x20003 /* End of line, $ */
72 #define CCLASS 0x20004 /* Character class, [] */
73 #define NCCLASS 0x20005 /* Negated character class, [^] */
74 #define END 0x20077 /* Terminate: match found */
76 #define ISATOR 0x10000
77 #define ISAND 0x20000
79 /*
80 * Parser Information
81 */
82 typedef struct Node Node;
83 struct Node
84 {
85 Inst *first;
86 Inst *last;
87 };
89 #define NSTACK 20
90 Node andstack[NSTACK];
91 Node *andp;
92 int atorstack[NSTACK];
93 int *atorp;
94 int lastwasand; /* Last token was operand */
95 int cursubid;
96 int subidstack[NSTACK];
97 int *subidp;
98 int backwards;
99 int nbra;
100 Rune *exprp; /* pointer to next character in source expression */
101 #define DCLASS 10 /* allocation increment */
102 int nclass; /* number active */
103 int Nclass; /* high water mark */
104 Rune **class;
105 int negateclass;
107 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void newmatch(Rangeset*);
109 void bnewmatch(Rangeset*);
110 void pushand(Inst*, Inst*);
111 void pushator(int);
112 Node *popand(int);
113 int popator(void);
114 void startlex(Rune*);
115 int lex(void);
116 void operator(int);
117 void operand(int);
118 void evaluntil(int);
119 void optimize(Inst*);
120 void bldcclass(void);
122 void
123 regerror(Err e)
125 Strzero(&lastregexp);
126 error(e);
129 void
130 regerror_c(Err e, int c)
132 Strzero(&lastregexp);
133 error_c(e, c);
136 Inst *
137 newinst(int t)
139 if(progp >= &program[NPROG])
140 regerror(Etoolong);
141 progp->type = t;
142 progp->left = 0;
143 progp->right = 0;
144 return progp++;
147 Inst *
148 realcompile(Rune *s)
150 int token;
152 startlex(s);
153 atorp = atorstack;
154 andp = andstack;
155 subidp = subidstack;
156 cursubid = 0;
157 lastwasand = FALSE;
158 /* Start with a low priority operator to prime parser */
159 pushator(START-1);
160 while((token=lex()) != END){
161 if((token&ISATOR) == OPERATOR)
162 operator(token);
163 else
164 operand(token);
166 /* Close with a low priority operator */
167 evaluntil(START);
168 /* Force END */
169 operand(END);
170 evaluntil(START);
171 if(nbra)
172 regerror(Eleftpar);
173 --andp; /* points to first and only operand */
174 return andp->first;
177 void
178 compile(String *s)
180 int i;
181 Inst *oprogp;
183 if(Strcmp(s, &lastregexp)==0)
184 return;
185 for(i=0; i<nclass; i++)
186 free(class[i]);
187 nclass = 0;
188 progp = program;
189 backwards = FALSE;
190 startinst = realcompile(s->s);
191 optimize(program);
192 oprogp = progp;
193 backwards = TRUE;
194 bstartinst = realcompile(s->s);
195 optimize(oprogp);
196 Strduplstr(&lastregexp, s);
199 void
200 operand(int t)
202 Inst *i;
203 if(lastwasand)
204 operator(CAT); /* catenate is implicit */
205 i = newinst(t);
206 if(t == CCLASS){
207 if(negateclass)
208 i->type = NCCLASS; /* UGH */
209 i->rclass = nclass-1; /* UGH */
211 pushand(i, i);
212 lastwasand = TRUE;
215 void
216 operator(int t)
218 if(t==RBRA && --nbra<0)
219 regerror(Erightpar);
220 if(t==LBRA){
221 /*
222 * if(++cursubid >= NSUBEXP)
223 * regerror(Esubexp);
224 */
225 cursubid++; /* silently ignored */
226 nbra++;
227 if(lastwasand)
228 operator(CAT);
229 }else
230 evaluntil(t);
231 if(t!=RBRA)
232 pushator(t);
233 lastwasand = FALSE;
234 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235 lastwasand = TRUE; /* these look like operands */
238 void
239 cant(char *s)
241 char buf[100];
243 sprint(buf, "regexp: can't happen: %s", s);
244 panic(buf);
247 void
248 pushand(Inst *f, Inst *l)
250 if(andp >= &andstack[NSTACK])
251 cant("operand stack overflow");
252 andp->first = f;
253 andp->last = l;
254 andp++;
257 void
258 pushator(int t)
260 if(atorp >= &atorstack[NSTACK])
261 cant("operator stack overflow");
262 *atorp++=t;
263 if(cursubid >= NSUBEXP)
264 *subidp++= -1;
265 else
266 *subidp++=cursubid;
269 Node *
270 popand(int op)
272 if(andp <= &andstack[0])
273 if(op)
274 regerror_c(Emissop, op);
275 else
276 regerror(Ebadregexp);
277 return --andp;
280 int
281 popator(void)
283 if(atorp <= &atorstack[0])
284 cant("operator stack underflow");
285 --subidp;
286 return *--atorp;
289 void
290 evaluntil(int pri)
292 Node *op1, *op2, *t;
293 Inst *inst1, *inst2;
295 while(pri==RBRA || atorp[-1]>=pri){
296 switch(popator()){
297 case LBRA:
298 op1 = popand('(');
299 inst2 = newinst(RBRA);
300 inst2->subid = *subidp;
301 op1->last->next = inst2;
302 inst1 = newinst(LBRA);
303 inst1->subid = *subidp;
304 inst1->next = op1->first;
305 pushand(inst1, inst2);
306 return; /* must have been RBRA */
307 default:
308 panic("unknown regexp operator");
309 break;
310 case OR:
311 op2 = popand('|');
312 op1 = popand('|');
313 inst2 = newinst(NOP);
314 op2->last->next = inst2;
315 op1->last->next = inst2;
316 inst1 = newinst(OR);
317 inst1->right = op1->first;
318 inst1->left = op2->first;
319 pushand(inst1, inst2);
320 break;
321 case CAT:
322 op2 = popand(0);
323 op1 = popand(0);
324 if(backwards && op2->first->type!=END)
325 t = op1, op1 = op2, op2 = t;
326 op1->last->next = op2->first;
327 pushand(op1->first, op2->last);
328 break;
329 case STAR:
330 op2 = popand('*');
331 inst1 = newinst(OR);
332 op2->last->next = inst1;
333 inst1->right = op2->first;
334 pushand(inst1, inst1);
335 break;
336 case PLUS:
337 op2 = popand('+');
338 inst1 = newinst(OR);
339 op2->last->next = inst1;
340 inst1->right = op2->first;
341 pushand(op2->first, inst1);
342 break;
343 case QUEST:
344 op2 = popand('?');
345 inst1 = newinst(OR);
346 inst2 = newinst(NOP);
347 inst1->left = inst2;
348 inst1->right = op2->first;
349 op2->last->next = inst2;
350 pushand(inst1, inst2);
351 break;
357 void
358 optimize(Inst *start)
360 Inst *inst, *target;
362 for(inst=start; inst->type!=END; inst++){
363 target = inst->next;
364 while(target->type == NOP)
365 target = target->next;
366 inst->next = target;
370 #ifdef DEBUG
371 void
372 dumpstack(void){
373 Node *stk;
374 int *ip;
376 dprint("operators\n");
377 for(ip = atorstack; ip<atorp; ip++)
378 dprint("0%o\n", *ip);
379 dprint("operands\n");
380 for(stk = andstack; stk<andp; stk++)
381 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
383 void
384 dump(void){
385 Inst *l;
387 l = program;
388 do{
389 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390 l->left-program, l->right-program);
391 }while(l++->type);
393 #endif
395 void
396 startlex(Rune *s)
398 exprp = s;
399 nbra = 0;
403 int
404 lex(void){
405 int c= *exprp++;
407 switch(c){
408 case '\\':
409 if(*exprp)
410 if((c= *exprp++)=='n')
411 c='\n';
412 break;
413 case 0:
414 c = END;
415 --exprp; /* In case we come here again */
416 break;
417 case '*':
418 c = STAR;
419 break;
420 case '?':
421 c = QUEST;
422 break;
423 case '+':
424 c = PLUS;
425 break;
426 case '|':
427 c = OR;
428 break;
429 case '.':
430 c = ANY;
431 break;
432 case '(':
433 c = LBRA;
434 break;
435 case ')':
436 c = RBRA;
437 break;
438 case '^':
439 c = BOL;
440 break;
441 case '$':
442 c = EOL;
443 break;
444 case '[':
445 c = CCLASS;
446 bldcclass();
447 break;
449 return c;
452 long
453 nextrec(void){
454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 regerror(Ebadclass);
456 if(exprp[0] == '\\'){
457 exprp++;
458 if(*exprp=='n'){
459 exprp++;
460 return '\n';
462 return *exprp++|0x10000;
464 return *exprp++;
467 void
468 bldcclass(void)
470 long c1, c2, n, na;
471 Rune *classp;
473 classp = emalloc(DCLASS*RUNESIZE);
474 n = 0;
475 na = DCLASS;
476 /* we have already seen the '[' */
477 if(*exprp == '^'){
478 classp[n++] = '\n'; /* don't match newline in negate case */
479 negateclass = TRUE;
480 exprp++;
481 }else
482 negateclass = FALSE;
483 while((c1 = nextrec()) != ']'){
484 if(c1 == '-'){
485 Error:
486 free(classp);
487 regerror(Ebadclass);
489 if(n+4 >= na){ /* 3 runes plus NUL */
490 na += DCLASS;
491 classp = erealloc(classp, na*RUNESIZE);
493 if(*exprp == '-'){
494 exprp++; /* eat '-' */
495 if((c2 = nextrec()) == ']')
496 goto Error;
497 classp[n+0] = 0xFFFF;
498 classp[n+1] = c1;
499 classp[n+2] = c2;
500 n += 3;
501 }else
502 classp[n++] = c1;
504 classp[n] = 0;
505 if(nclass == Nclass){
506 Nclass += DCLASS;
507 class = erealloc(class, Nclass*sizeof(Rune*));
509 class[nclass++] = classp;
512 int
513 classmatch(int classno, int c, int negate)
515 Rune *p;
517 p = class[classno];
518 while(*p){
519 if(*p == 0xFFFF){
520 if(p[1]<=c && c<=p[2])
521 return !negate;
522 p += 3;
523 }else if(*p++ == c)
524 return !negate;
526 return negate;
529 /*
530 * Note optimization in addinst:
531 * *l must be pending when addinst called; if *l has been looked
532 * at already, the optimization is a bug.
533 */
534 int
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
537 Ilist *p;
539 for(p = l; p->inst; p++){
540 if(p->inst==inst){
541 if((sep)->p[0].p1 < p->se.p[0].p1)
542 p->se= *sep; /* this would be bug */
543 return 0; /* It's already there */
546 p->inst = inst;
547 p->se= *sep;
548 (p+1)->inst = 0;
549 return 1;
552 int
553 execute(File *f, Posn startp, Posn eof)
555 int flag = 0;
556 Inst *inst;
557 Ilist *tlp;
558 Posn p = startp;
559 int nnl = 0, ntl;
560 int c;
561 int wrapped = 0;
562 int startchar = startinst->type<OPERATOR? startinst->type : 0;
564 list[0][0].inst = list[1][0].inst = 0;
565 sel.p[0].p1 = -1;
566 /* Execute machine once for each character */
567 for(;;p++){
568 doloop:
569 c = filereadc(f, p);
570 if(p>=eof || c<0){
571 switch(wrapped++){
572 case 0: /* let loop run one more click */
573 case 2:
574 break;
575 case 1: /* expired; wrap to beginning */
576 if(sel.p[0].p1>=0 || eof!=INFINITY)
577 goto Return;
578 list[0][0].inst = list[1][0].inst = 0;
579 p = 0;
580 goto doloop;
581 default:
582 goto Return;
584 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
585 break;
586 /* fast check for first char */
587 if(startchar && nnl==0 && c!=startchar)
588 continue;
589 tl = list[flag];
590 nl = list[flag^=1];
591 nl->inst = 0;
592 ntl = nnl;
593 nnl = 0;
594 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
595 /* Add first instruction to this list */
596 sempty.p[0].p1 = p;
597 if(addinst(tl, startinst, &sempty))
598 if(++ntl >= NLIST)
599 Overflow:
600 error(Eoverflow);
602 /* Execute machine until this list is empty */
603 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
604 Switchstmt:
605 switch(inst->type){
606 default: /* regular character */
607 if(inst->type==c){
608 Addinst:
609 if(addinst(nl, inst->next, &tlp->se))
610 if(++nnl >= NLIST)
611 goto Overflow;
613 break;
614 case LBRA:
615 if(inst->subid>=0)
616 tlp->se.p[inst->subid].p1 = p;
617 inst = inst->next;
618 goto Switchstmt;
619 case RBRA:
620 if(inst->subid>=0)
621 tlp->se.p[inst->subid].p2 = p;
622 inst = inst->next;
623 goto Switchstmt;
624 case ANY:
625 if(c!='\n')
626 goto Addinst;
627 break;
628 case BOL:
629 if(p==0 || filereadc(f, p - 1)=='\n'){
630 Step:
631 inst = inst->next;
632 goto Switchstmt;
634 break;
635 case EOL:
636 if(c == '\n')
637 goto Step;
638 break;
639 case CCLASS:
640 if(c>=0 && classmatch(inst->rclass, c, 0))
641 goto Addinst;
642 break;
643 case NCCLASS:
644 if(c>=0 && classmatch(inst->rclass, c, 1))
645 goto Addinst;
646 break;
647 case OR:
648 /* evaluate right choice later */
649 if(addinst(tl, inst->right, &tlp->se))
650 if(++ntl >= NLIST)
651 goto Overflow;
652 /* efficiency: advance and re-evaluate */
653 inst = inst->left;
654 goto Switchstmt;
655 case END: /* Match! */
656 tlp->se.p[0].p2 = p;
657 newmatch(&tlp->se);
658 break;
662 Return:
663 return sel.p[0].p1>=0;
666 void
667 newmatch(Rangeset *sp)
669 int i;
671 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
672 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
673 for(i = 0; i<NSUBEXP; i++)
674 sel.p[i] = sp->p[i];
677 int
678 bexecute(File *f, Posn startp)
680 int flag = 0;
681 Inst *inst;
682 Ilist *tlp;
683 Posn p = startp;
684 int nnl = 0, ntl;
685 int c;
686 int wrapped = 0;
687 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
689 list[0][0].inst = list[1][0].inst = 0;
690 sel.p[0].p1= -1;
691 /* Execute machine once for each character, including terminal NUL */
692 for(;;--p){
693 doloop:
694 if((c = filereadc(f, p - 1))==-1){
695 switch(wrapped++){
696 case 0: /* let loop run one more click */
697 case 2:
698 break;
699 case 1: /* expired; wrap to end */
700 if(sel.p[0].p1>=0)
701 case 3:
702 goto Return;
703 list[0][0].inst = list[1][0].inst = 0;
704 p = f->b.nc;
705 goto doloop;
706 default:
707 goto Return;
709 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
710 break;
711 /* fast check for first char */
712 if(startchar && nnl==0 && c!=startchar)
713 continue;
714 tl = list[flag];
715 nl = list[flag^=1];
716 nl->inst = 0;
717 ntl = nnl;
718 nnl = 0;
719 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
720 /* Add first instruction to this list */
721 /* the minus is so the optimizations in addinst work */
722 sempty.p[0].p1 = -p;
723 if(addinst(tl, bstartinst, &sempty))
724 if(++ntl >= NLIST)
725 Overflow:
726 error(Eoverflow);
728 /* Execute machine until this list is empty */
729 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
730 Switchstmt:
731 switch(inst->type){
732 default: /* regular character */
733 if(inst->type == c){
734 Addinst:
735 if(addinst(nl, inst->next, &tlp->se))
736 if(++nnl >= NLIST)
737 goto Overflow;
739 break;
740 case LBRA:
741 if(inst->subid>=0)
742 tlp->se.p[inst->subid].p1 = p;
743 inst = inst->next;
744 goto Switchstmt;
745 case RBRA:
746 if(inst->subid >= 0)
747 tlp->se.p[inst->subid].p2 = p;
748 inst = inst->next;
749 goto Switchstmt;
750 case ANY:
751 if(c != '\n')
752 goto Addinst;
753 break;
754 case BOL:
755 if(c=='\n' || p==0){
756 Step:
757 inst = inst->next;
758 goto Switchstmt;
760 break;
761 case EOL:
762 if(p==f->b.nc || filereadc(f, p)=='\n')
763 goto Step;
764 break;
765 case CCLASS:
766 if(c>=0 && classmatch(inst->rclass, c, 0))
767 goto Addinst;
768 break;
769 case NCCLASS:
770 if(c>=0 && classmatch(inst->rclass, c, 1))
771 goto Addinst;
772 break;
773 case OR:
774 /* evaluate right choice later */
775 if(addinst(tl, inst->right, &tlp->se))
776 if(++ntl >= NLIST)
777 goto Overflow;
778 /* efficiency: advance and re-evaluate */
779 inst = inst->left;
780 goto Switchstmt;
781 case END: /* Match! */
782 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
783 tlp->se.p[0].p2 = p;
784 bnewmatch(&tlp->se);
785 break;
789 Return:
790 return sel.p[0].p1>=0;
793 void
794 bnewmatch(Rangeset *sp)
796 int i;
797 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
798 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
799 sel.p[i].p1 = sp->p[i].p2;
800 sel.p[i].p2 = sp->p[i].p1;