21 typedef struct Inst Inst;
24 uint type; /* < OPERATOR ==> literal, otherwise action */
41 Inst *startinst; /* First inst. of program; might not be program[0] */
42 Inst *bstartinst; /* same for backwards machine */
43 Channel *rechan; /* chan(Inst*) */
45 typedef struct Ilist Ilist;
48 Inst *inst; /* Instruction of the thread */
50 uint startp; /* first char of match */
55 Ilist *tl, *nl; /* This list, next list */
56 Ilist list[2][NLIST+1]; /* +1 for trailing null */
57 static Rangeset sempty;
62 * 0x10000xx are operators, value == precedence
63 * 0x20000xx are tokens, i.e. operands for operators
65 #define OPERATOR 0x1000000 /* Bit set in all operators */
66 #define START (OPERATOR+0) /* Start, used for marker on stack */
67 #define RBRA (OPERATOR+1) /* Right bracket, ) */
68 #define LBRA (OPERATOR+2) /* Left bracket, ( */
69 #define OR (OPERATOR+3) /* Alternation, | */
70 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
71 #define STAR (OPERATOR+5) /* Closure, * */
72 #define PLUS (OPERATOR+6) /* a+ == aa* */
73 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
74 #define ANY 0x2000000 /* Any character but newline, . */
75 #define NOP (ANY+1) /* No operation, internal use only */
76 #define BOL (ANY+2) /* Beginning of line, ^ */
77 #define EOL (ANY+3) /* End of line, $ */
78 #define CCLASS (ANY+4) /* Character class, [] */
79 #define NCCLASS (ANY+5) /* Negated character class, [^] */
80 #define END (ANY+0x77) /* Terminate: match found */
82 #define ISATOR OPERATOR
85 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
90 typedef struct Node Node;
98 Node andstack[NSTACK];
100 int atorstack[NSTACK];
102 int lastwasand; /* Last token was operand */
104 int subidstack[NSTACK];
108 Rune *exprp; /* pointer to next character in source expression */
109 #define DCLASS 10 /* allocation increment */
110 int nclass; /* number active */
111 int Nclass; /* high water mark */
115 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
116 void newmatch(Rangeset*);
117 void bnewmatch(Rangeset*);
118 void pushand(Inst*, Inst*);
122 void startlex(Rune*);
127 void optimize(Inst*);
128 void bldcclass(void);
133 rechan = chancreate(sizeof(Inst*), 0);
134 chansetname(rechan, "rechan");
135 lastregexp = runemalloc(1);
142 warning(nil, "regexp: %s\n", e);
150 if(progp >= &program[NPROG])
151 regerror("expression too long");
153 progp->u1.left = nil;
154 progp->u.right = nil;
159 realcompile(void *arg)
164 threadsetname("regcomp");
172 /* Start with a low priority operator to prime parser */
174 while((token=lex()) != END){
175 if((token&ISATOR) == OPERATOR)
180 /* Close with a low priority operator */
186 regerror("unmatched `('");
187 --andp; /* points to first and only operand */
188 sendp(rechan, andp->first);
192 /* r is null terminated */
199 nr = runestrlen(r)+1;
200 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
203 for(i=0; i<nclass; i++)
209 threadcreate(realcompile, r, STACK);
210 startinst = recvp(rechan);
216 threadcreate(realcompile, r, STACK);
217 bstartinst = recvp(rechan);
218 if(bstartinst == nil)
221 lastregexp = runerealloc(lastregexp, nr);
222 runemove(lastregexp, r, nr);
231 operator(CAT); /* catenate is implicit */
235 i->type = NCCLASS; /* UGH */
236 i->u.class = nclass-1; /* UGH */
245 if(t==RBRA && --nbra<0)
246 regerror("unmatched `)'");
248 cursubid++; /* silently ignored */
257 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
258 lastwasand = TRUE; /* these look like operands */
262 pushand(Inst *f, Inst *l)
264 if(andp >= &andstack[NSTACK])
265 error("operand stack overflow");
274 if(atorp >= &atorstack[NSTACK])
275 error("operator stack overflow");
277 if(cursubid >= NRange)
288 if(andp <= &andstack[0])
290 sprint(buf, "missing operand for %c", op);
293 regerror("malformed regexp");
300 if(atorp <= &atorstack[0])
301 error("operator stack underflow");
312 while(pri==RBRA || atorp[-1]>=pri){
316 inst2 = newinst(RBRA);
317 inst2->u.subid = *subidp;
318 op1->last->u1.next = inst2;
319 inst1 = newinst(LBRA);
320 inst1->u.subid = *subidp;
321 inst1->u1.next = op1->first;
322 pushand(inst1, inst2);
323 return; /* must have been RBRA */
325 error("unknown regexp operator");
330 inst2 = newinst(NOP);
331 op2->last->u1.next = inst2;
332 op1->last->u1.next = inst2;
334 inst1->u.right = op1->first;
335 inst1->u1.left = op2->first;
336 pushand(inst1, inst2);
341 if(backwards && op2->first->type!=END){
346 op1->last->u1.next = op2->first;
347 pushand(op1->first, op2->last);
352 op2->last->u1.next = inst1;
353 inst1->u.right = op2->first;
354 pushand(inst1, inst1);
359 op2->last->u1.next = inst1;
360 inst1->u.right = op2->first;
361 pushand(op2->first, inst1);
366 inst2 = newinst(NOP);
367 inst1->u1.left = inst2;
368 inst1->u.right = op2->first;
369 op2->last->u1.next = inst2;
370 pushand(inst1, inst2);
378 optimize(Inst *start)
382 for(inst=start; inst->type!=END; inst++){
383 target = inst->u1.next;
384 while(target->type == NOP)
385 target = target->u1.next;
386 inst->u1.next = target;
406 if((c= *exprp++)=='n')
411 --exprp; /* In case we come here again */
451 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
452 regerror("malformed `[]'");
453 if(exprp[0] == '\\'){
459 return *exprp++|QUOTED;
470 classp = runemalloc(DCLASS);
473 /* we have already seen the '[' */
475 classp[n++] = '\n'; /* don't match newline in negate case */
480 while((c1 = nextrec()) != ']'){
484 regerror("malformed `[]'");
486 if(n+4 >= na){ /* 3 runes plus NUL */
488 classp = runerealloc(classp, na);
491 exprp++; /* eat '-' */
492 if((c2 = nextrec()) == ']')
494 classp[n+0] = Runemax;
499 classp[n++] = c1 & ~QUOTED;
502 if(nclass == Nclass){
504 class = realloc(class, Nclass*sizeof(Rune*));
506 class[nclass++] = classp;
510 classmatch(int classno, int c, int negate)
517 if(p[1]<=c && c<=p[2])
527 * Note optimization in addinst:
528 * *l must be pending when addinst called; if *l has been looked
529 * at already, the optimization is a bug.
532 addinst(Ilist *l, Inst *inst, Rangeset *sep)
536 for(p = l; p->inst; p++){
538 if((sep)->r[0].q0 < p->se.r[0].q0)
539 p->se= *sep; /* this would be bug */
540 return 0; /* It's already there */
552 return startinst==nil || bstartinst==nil;
555 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
557 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
573 if(startinst->type<OPERATOR)
574 startchar = startinst->type;
575 list[0][0].inst = list[1][0].inst = nil;
581 /* Execute machine once for each character */
586 case 0: /* let loop run one more click */
589 case 1: /* expired; wrap to beginning */
590 if(sel.r[0].q0>=0 || eof!=Infinity)
592 list[0][0].inst = list[1][0].inst = nil;
600 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
607 /* fast check for first char */
608 if(startchar && nnl==0 && c!=startchar)
615 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
616 /* Add first instruction to this list */
618 if(addinst(tl, startinst, &sempty))
621 warning(nil, "regexp list overflow\n");
626 /* Execute machine until this list is empty */
627 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
630 default: /* regular character */
633 if(addinst(nl, inst->u1.next, &tlp->se))
640 tlp->se.r[inst->u.subid].q0 = p;
641 inst = inst->u1.next;
645 tlp->se.r[inst->u.subid].q1 = p;
646 inst = inst->u1.next;
653 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
655 inst = inst->u1.next;
664 if(c>=0 && classmatch(inst->u.class, c, 0))
668 if(c>=0 && classmatch(inst->u.class, c, 1))
672 /* evaluate right choice later */
673 if(addinst(tlp, inst->u.right, &tlp->se))
676 /* efficiency: advance and re-evaluate */
677 inst = inst->u1.left;
679 case END: /* Match! */
688 return sel.r[0].q0 >= 0;
692 newmatch(Rangeset *sp)
694 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
695 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
700 rxbexecute(Text *t, uint startp, Rangeset *rp)
716 if(bstartinst->type<OPERATOR)
717 startchar = bstartinst->type;
718 list[0][0].inst = list[1][0].inst = nil;
720 /* Execute machine once for each character, including terminal NUL */
725 case 0: /* let loop run one more click */
728 case 1: /* expired; wrap to end */
731 list[0][0].inst = list[1][0].inst = nil;
740 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
742 c = textreadc(t, p-1);
744 /* fast check for first char */
745 if(startchar && nnl==0 && c!=startchar)
752 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
753 /* Add first instruction to this list */
754 /* the minus is so the optimizations in addinst work */
756 if(addinst(tl, bstartinst, &sempty))
759 warning(nil, "regexp list overflow\n");
764 /* Execute machine until this list is empty */
765 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
768 default: /* regular character */
771 if(addinst(nl, inst->u1.next, &tlp->se))
778 tlp->se.r[inst->u.subid].q0 = p;
779 inst = inst->u1.next;
782 if(inst->u.subid >= 0)
783 tlp->se.r[inst->u.subid].q1 = p;
784 inst = inst->u1.next;
793 inst = inst->u1.next;
798 if(p<t->file->b.nc && textreadc(t, p)=='\n')
802 if(c>0 && classmatch(inst->u.class, c, 0))
806 if(c>0 && classmatch(inst->u.class, c, 1))
810 /* evaluate right choice later */
811 if(addinst(tl, inst->u.right, &tlp->se))
814 /* efficiency: advance and re-evaluate */
815 inst = inst->u1.left;
817 case END: /* Match! */
818 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
827 return sel.r[0].q0 >= 0;
831 bnewmatch(Rangeset *sp)
835 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
836 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
837 sel.r[i].q0 = sp->r[i].q1;
838 sel.r[i].q1 = sp->r[i].q0;