8 typedef struct Inst Inst;
12 long type; /* < OPERATOR ==> literal, otherwise action */
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
36 Inst *startinst; /* First inst. of program; might not be program[0] */
37 Inst *bstartinst; /* same for backwards machine */
39 typedef struct Ilist Ilist;
42 Inst *inst; /* Instruction of the thread */
44 Posn startp; /* first char of match */
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
56 * 0x10000xx are operators, value == precedence
57 * 0x20000xx are tokens, i.e. operands for operators
59 #define OPERATOR 0x1000000 /* Bit set in all operators */
60 #define START (OPERATOR+0) /* Start, used for marker on stack */
61 #define RBRA (OPERATOR+1) /* Right bracket, ) */
62 #define LBRA (OPERATOR+2) /* Left bracket, ( */
63 #define OR (OPERATOR+3) /* Alternation, | */
64 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
65 #define STAR (OPERATOR+5) /* Closure, * */
66 #define PLUS (OPERATOR+6) /* a+ == aa* */
67 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
68 #define ANY 0x2000000 /* Any character but newline, . */
69 #define NOP (ANY+1) /* No operation, internal use only */
70 #define BOL (ANY+2) /* Beginning of line, ^ */
71 #define EOL (ANY+3) /* End of line, $ */
72 #define CCLASS (ANY+4) /* Character class, [] */
73 #define NCCLASS (ANY+5) /* Negated character class, [^] */
74 #define END (ANY+0x77) /* Terminate: match found */
76 #define ISATOR OPERATOR
79 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
84 typedef struct Node Node;
92 Node andstack[NSTACK];
94 int atorstack[NSTACK];
96 int lastwasand; /* Last token was operand */
98 int subidstack[NSTACK];
102 Rune *exprp; /* pointer to next character in source expression */
103 #define DCLASS 10 /* allocation increment */
104 int nclass; /* number active */
105 int Nclass; /* high water mark */
109 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
110 void newmatch(Rangeset*);
111 void bnewmatch(Rangeset*);
112 void pushand(Inst*, Inst*);
116 void startlex(Rune*);
121 void optimize(Inst*);
122 void bldcclass(void);
127 Strzero(&lastregexp);
132 regerror_c(Err e, int c)
134 Strzero(&lastregexp);
141 if(progp >= &program[NPROG])
160 /* Start with a low priority operator to prime parser */
162 while((token=lex()) != END){
163 if((token&ISATOR) == OPERATOR)
168 /* Close with a low priority operator */
175 --andp; /* points to first and only operand */
185 if(Strcmp(s, &lastregexp)==0)
187 for(i=0; i<nclass; i++)
192 startinst = realcompile(s->s);
196 bstartinst = realcompile(s->s);
198 Strduplstr(&lastregexp, s);
206 operator(CAT); /* catenate is implicit */
210 i->type = NCCLASS; /* UGH */
211 i->rclass = nclass-1; /* UGH */
220 if(t==RBRA && --nbra<0)
224 * if(++cursubid >= NSUBEXP)
227 cursubid++; /* silently ignored */
236 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
237 lastwasand = TRUE; /* these look like operands */
245 sprint(buf, "regexp: can't happen: %s", s);
250 pushand(Inst *f, Inst *l)
252 if(andp >= &andstack[NSTACK])
253 cant("operand stack overflow");
262 if(atorp >= &atorstack[NSTACK])
263 cant("operator stack overflow");
265 if(cursubid >= NSUBEXP)
274 if(andp <= &andstack[0])
276 regerror_c(Emissop, op);
278 regerror(Ebadregexp);
285 if(atorp <= &atorstack[0])
286 cant("operator stack underflow");
297 while(pri==RBRA || atorp[-1]>=pri){
301 inst2 = newinst(RBRA);
302 inst2->subid = *subidp;
303 op1->last->next = inst2;
304 inst1 = newinst(LBRA);
305 inst1->subid = *subidp;
306 inst1->next = op1->first;
307 pushand(inst1, inst2);
308 return; /* must have been RBRA */
310 panic("unknown regexp operator");
315 inst2 = newinst(NOP);
316 op2->last->next = inst2;
317 op1->last->next = inst2;
319 inst1->right = op1->first;
320 inst1->left = op2->first;
321 pushand(inst1, inst2);
326 if(backwards && op2->first->type!=END)
327 t = op1, op1 = op2, op2 = t;
328 op1->last->next = op2->first;
329 pushand(op1->first, op2->last);
334 op2->last->next = inst1;
335 inst1->right = op2->first;
336 pushand(inst1, inst1);
341 op2->last->next = inst1;
342 inst1->right = op2->first;
343 pushand(op2->first, inst1);
348 inst2 = newinst(NOP);
350 inst1->right = op2->first;
351 op2->last->next = inst2;
352 pushand(inst1, inst2);
360 optimize(Inst *start)
364 for(inst=start; inst->type!=END; inst++){
366 while(target->type == NOP)
367 target = target->next;
378 dprint("operators\n");
379 for(ip = atorstack; ip<atorp; ip++)
380 dprint("0%o\n", *ip);
381 dprint("operands\n");
382 for(stk = andstack; stk<andp; stk++)
383 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
391 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
392 l->left-program, l->right-program);
412 if((c= *exprp++)=='n')
417 --exprp; /* In case we come here again */
456 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
458 if(exprp[0] == '\\'){
464 return *exprp++|QUOTED;
475 classp = emalloc(DCLASS*RUNESIZE);
478 /* we have already seen the '[' */
480 classp[n++] = '\n'; /* don't match newline in negate case */
485 while((c1 = nextrec()) != ']'){
491 if(n+4 >= na){ /* 3 runes plus NUL */
493 classp = erealloc(classp, na*RUNESIZE);
496 exprp++; /* eat '-' */
497 if((c2 = nextrec()) == ']')
499 classp[n+0] = Runemax;
504 classp[n++] = c1 & ~QUOTED;
507 if(nclass == Nclass){
509 class = erealloc(class, Nclass*sizeof(Rune*));
511 class[nclass++] = classp;
515 classmatch(int classno, int c, int negate)
522 if(p[1]<=c && c<=p[2])
532 * Note optimization in addinst:
533 * *l must be pending when addinst called; if *l has been looked
534 * at already, the optimization is a bug.
537 addinst(Ilist *l, Inst *inst, Rangeset *sep)
541 for(p = l; p->inst; p++){
543 if((sep)->p[0].p1 < p->se.p[0].p1)
544 p->se= *sep; /* this would be bug */
545 return 0; /* It's already there */
555 execute(File *f, Posn startp, Posn eof)
564 int startchar = startinst->type<OPERATOR? startinst->type : 0;
566 list[0][0].inst = list[1][0].inst = 0;
568 /* Execute machine once for each character */
574 case 0: /* let loop run one more click */
577 case 1: /* expired; wrap to beginning */
578 if(sel.p[0].p1>=0 || eof!=INFINITY)
580 list[0][0].inst = list[1][0].inst = 0;
586 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
588 /* fast check for first char */
589 if(startchar && nnl==0 && c!=startchar)
596 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
597 /* Add first instruction to this list */
599 if(addinst(tl, startinst, &sempty))
604 /* Execute machine until this list is empty */
605 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
608 default: /* regular character */
611 if(addinst(nl, inst->next, &tlp->se))
618 tlp->se.p[inst->subid].p1 = p;
623 tlp->se.p[inst->subid].p2 = p;
631 if(p==0 || filereadc(f, p - 1)=='\n'){
642 if(c>=0 && classmatch(inst->rclass, c, 0))
646 if(c>=0 && classmatch(inst->rclass, c, 1))
650 /* evaluate right choice later */
651 if(addinst(tl, inst->right, &tlp->se))
654 /* efficiency: advance and re-evaluate */
657 case END: /* Match! */
665 return sel.p[0].p1>=0;
669 newmatch(Rangeset *sp)
673 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
674 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
675 for(i = 0; i<NSUBEXP; i++)
680 bexecute(File *f, Posn startp)
689 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
691 list[0][0].inst = list[1][0].inst = 0;
693 /* Execute machine once for each character, including terminal NUL */
696 if((c = filereadc(f, p - 1))==-1){
698 case 0: /* let loop run one more click */
701 case 1: /* expired; wrap to end */
705 list[0][0].inst = list[1][0].inst = 0;
711 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
713 /* fast check for first char */
714 if(startchar && nnl==0 && c!=startchar)
721 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
722 /* Add first instruction to this list */
723 /* the minus is so the optimizations in addinst work */
725 if(addinst(tl, bstartinst, &sempty))
730 /* Execute machine until this list is empty */
731 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
734 default: /* regular character */
737 if(addinst(nl, inst->next, &tlp->se))
744 tlp->se.p[inst->subid].p1 = p;
749 tlp->se.p[inst->subid].p2 = p;
764 if(p==f->b.nc || filereadc(f, p)=='\n')
768 if(c>=0 && classmatch(inst->rclass, c, 0))
772 if(c>=0 && classmatch(inst->rclass, c, 1))
776 /* evaluate right choice later */
777 if(addinst(tlp, inst->right, &tlp->se))
780 /* efficiency: advance and re-evaluate */
783 case END: /* Match! */
784 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
792 return sel.p[0].p1>=0;
796 bnewmatch(Rangeset *sp)
799 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
800 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
801 sel.p[i].p1 = sp->p[i].p2;
802 sel.p[i].p2 = sp->p[i].p1;