19 #define class regxclass /* some systems declare "class" in system headers */
24 typedef struct Inst Inst;
27 uint type; /* < OPERATOR ==> literal, otherwise action */
44 Inst *startinst; /* First inst. of program; might not be program[0] */
45 Inst *bstartinst; /* same for backwards machine */
46 Channel *rechan; /* chan(Inst*) */
48 typedef struct Ilist Ilist;
51 Inst *inst; /* Instruction of the thread */
53 uint startp; /* first char of match */
58 Ilist *tl, *nl; /* This list, next list */
59 Ilist list[2][NLIST+1]; /* +1 for trailing null */
60 static Rangeset sempty;
65 * 0x10000xx are operators, value == precedence
66 * 0x20000xx are tokens, i.e. operands for operators
68 #define OPERATOR 0x1000000 /* Bit set in all operators */
69 #define START (OPERATOR+0) /* Start, used for marker on stack */
70 #define RBRA (OPERATOR+1) /* Right bracket, ) */
71 #define LBRA (OPERATOR+2) /* Left bracket, ( */
72 #define OR (OPERATOR+3) /* Alternation, | */
73 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
74 #define STAR (OPERATOR+5) /* Closure, * */
75 #define PLUS (OPERATOR+6) /* a+ == aa* */
76 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
77 #define ANY 0x2000000 /* Any character but newline, . */
78 #define NOP (ANY+1) /* No operation, internal use only */
79 #define BOL (ANY+2) /* Beginning of line, ^ */
80 #define EOL (ANY+3) /* End of line, $ */
81 #define CCLASS (ANY+4) /* Character class, [] */
82 #define NCCLASS (ANY+5) /* Negated character class, [^] */
83 #define END (ANY+0x77) /* Terminate: match found */
85 #define ISATOR OPERATOR
88 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
93 typedef struct Node Node;
101 Node andstack[NSTACK];
103 int atorstack[NSTACK];
105 int lastwasand; /* Last token was operand */
107 int subidstack[NSTACK];
111 Rune *exprp; /* pointer to next character in source expression */
112 #define DCLASS 10 /* allocation increment */
113 int nclass; /* number active */
114 int Nclass; /* high water mark */
118 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
119 void newmatch(Rangeset*);
120 void bnewmatch(Rangeset*);
121 void pushand(Inst*, Inst*);
125 void startlex(Rune*);
130 void optimize(Inst*);
131 void bldcclass(void);
136 rechan = chancreate(sizeof(Inst*), 0);
137 chansetname(rechan, "rechan");
138 lastregexp = runemalloc(1);
145 warning(nil, "regexp: %s\n", e);
153 if(progp >= &program[NPROG])
154 regerror("expression too long");
156 progp->u1.left = nil;
157 progp->u.right = nil;
162 realcompile(void *arg)
167 threadsetname("regcomp");
175 /* Start with a low priority operator to prime parser */
177 while((token=lex()) != END){
178 if((token&ISATOR) == OPERATOR)
183 /* Close with a low priority operator */
189 regerror("unmatched `('");
190 --andp; /* points to first and only operand */
191 sendp(rechan, andp->first);
195 /* r is null terminated */
202 nr = runestrlen(r)+1;
203 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
206 for(i=0; i<nclass; i++)
212 threadcreate(realcompile, r, STACK);
213 startinst = recvp(rechan);
219 threadcreate(realcompile, r, STACK);
220 bstartinst = recvp(rechan);
221 if(bstartinst == nil)
224 lastregexp = runerealloc(lastregexp, nr);
225 runemove(lastregexp, r, nr);
234 operator(CAT); /* catenate is implicit */
238 i->type = NCCLASS; /* UGH */
239 i->u.class = nclass-1; /* UGH */
248 if(t==RBRA && --nbra<0)
249 regerror("unmatched `)'");
251 cursubid++; /* silently ignored */
260 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
261 lastwasand = TRUE; /* these look like operands */
265 pushand(Inst *f, Inst *l)
267 if(andp >= &andstack[NSTACK])
268 error("operand stack overflow");
277 if(atorp >= &atorstack[NSTACK])
278 error("operator stack overflow");
280 if(cursubid >= NRange)
291 if(andp <= &andstack[0])
293 sprint(buf, "missing operand for %c", op);
296 regerror("malformed regexp");
303 if(atorp <= &atorstack[0])
304 error("operator stack underflow");
315 while(pri==RBRA || atorp[-1]>=pri){
319 inst2 = newinst(RBRA);
320 inst2->u.subid = *subidp;
321 op1->last->u1.next = inst2;
322 inst1 = newinst(LBRA);
323 inst1->u.subid = *subidp;
324 inst1->u1.next = op1->first;
325 pushand(inst1, inst2);
326 return; /* must have been RBRA */
328 error("unknown regexp operator");
333 inst2 = newinst(NOP);
334 op2->last->u1.next = inst2;
335 op1->last->u1.next = inst2;
337 inst1->u.right = op1->first;
338 inst1->u1.left = op2->first;
339 pushand(inst1, inst2);
344 if(backwards && op2->first->type!=END){
349 op1->last->u1.next = op2->first;
350 pushand(op1->first, op2->last);
355 op2->last->u1.next = inst1;
356 inst1->u.right = op2->first;
357 pushand(inst1, inst1);
362 op2->last->u1.next = inst1;
363 inst1->u.right = op2->first;
364 pushand(op2->first, inst1);
369 inst2 = newinst(NOP);
370 inst1->u1.left = inst2;
371 inst1->u.right = op2->first;
372 op2->last->u1.next = inst2;
373 pushand(inst1, inst2);
381 optimize(Inst *start)
385 for(inst=start; inst->type!=END; inst++){
386 target = inst->u1.next;
387 while(target->type == NOP)
388 target = target->u1.next;
389 inst->u1.next = target;
409 if((c= *exprp++)=='n')
414 --exprp; /* In case we come here again */
454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 regerror("malformed `[]'");
456 if(exprp[0] == '\\'){
462 return *exprp++|QUOTED;
473 classp = runemalloc(DCLASS);
476 /* we have already seen the '[' */
478 classp[n++] = '\n'; /* don't match newline in negate case */
483 while((c1 = nextrec()) != ']'){
487 regerror("malformed `[]'");
489 if(n+4 >= na){ /* 3 runes plus NUL */
491 classp = runerealloc(classp, na);
494 exprp++; /* eat '-' */
495 if((c2 = nextrec()) == ']')
497 classp[n+0] = Runemax;
502 classp[n++] = c1 & ~QUOTED;
505 if(nclass == Nclass){
507 class = realloc(class, Nclass*sizeof(Rune*));
509 class[nclass++] = classp;
513 classmatch(int classno, int c, int negate)
520 if(p[1]<=c && c<=p[2])
530 * Note optimization in addinst:
531 * *l must be pending when addinst called; if *l has been looked
532 * at already, the optimization is a bug.
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
539 for(p = l; p->inst; p++){
541 if((sep)->r[0].q0 < p->se.r[0].q0)
542 p->se= *sep; /* this would be bug */
543 return 0; /* It's already there */
555 return startinst==nil || bstartinst==nil;
558 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
560 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
576 if(startinst->type<OPERATOR)
577 startchar = startinst->type;
578 list[0][0].inst = list[1][0].inst = nil;
584 /* Execute machine once for each character */
589 case 0: /* let loop run one more click */
592 case 1: /* expired; wrap to beginning */
593 if(sel.r[0].q0>=0 || eof!=Infinity)
595 list[0][0].inst = list[1][0].inst = nil;
603 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
610 /* fast check for first char */
611 if(startchar && nnl==0 && c!=startchar)
618 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
619 /* Add first instruction to this list */
621 if(addinst(tl, startinst, &sempty))
624 warning(nil, "regexp list overflow\n");
629 /* Execute machine until this list is empty */
630 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
633 default: /* regular character */
636 if(addinst(nl, inst->u1.next, &tlp->se))
643 tlp->se.r[inst->u.subid].q0 = p;
644 inst = inst->u1.next;
648 tlp->se.r[inst->u.subid].q1 = p;
649 inst = inst->u1.next;
656 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
658 inst = inst->u1.next;
667 if(c>=0 && classmatch(inst->u.class, c, 0))
671 if(c>=0 && classmatch(inst->u.class, c, 1))
675 /* evaluate right choice later */
676 if(addinst(tlp, inst->u.right, &tlp->se))
679 /* efficiency: advance and re-evaluate */
680 inst = inst->u1.left;
682 case END: /* Match! */
691 return sel.r[0].q0 >= 0;
695 newmatch(Rangeset *sp)
697 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
698 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
703 rxbexecute(Text *t, uint startp, Rangeset *rp)
719 if(bstartinst->type<OPERATOR)
720 startchar = bstartinst->type;
721 list[0][0].inst = list[1][0].inst = nil;
723 /* Execute machine once for each character, including terminal NUL */
728 case 0: /* let loop run one more click */
731 case 1: /* expired; wrap to end */
734 list[0][0].inst = list[1][0].inst = nil;
743 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
745 c = textreadc(t, p-1);
747 /* fast check for first char */
748 if(startchar && nnl==0 && c!=startchar)
755 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
756 /* Add first instruction to this list */
757 /* the minus is so the optimizations in addinst work */
759 if(addinst(tl, bstartinst, &sempty))
762 warning(nil, "regexp list overflow\n");
767 /* Execute machine until this list is empty */
768 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
771 default: /* regular character */
774 if(addinst(nl, inst->u1.next, &tlp->se))
781 tlp->se.r[inst->u.subid].q0 = p;
782 inst = inst->u1.next;
785 if(inst->u.subid >= 0)
786 tlp->se.r[inst->u.subid].q1 = p;
787 inst = inst->u1.next;
796 inst = inst->u1.next;
801 if(p<t->file->b.nc && textreadc(t, p)=='\n')
805 if(c>0 && classmatch(inst->u.class, c, 0))
809 if(c>0 && classmatch(inst->u.class, c, 1))
813 /* evaluate right choice later */
814 if(addinst(tl, inst->u.right, &tlp->se))
817 /* efficiency: advance and re-evaluate */
818 inst = inst->u1.left;
820 case END: /* Match! */
821 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
830 return sel.r[0].q0 >= 0;
834 bnewmatch(Rangeset *sp)
838 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
839 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
840 sel.r[i].q0 = sp->r[i].q1;
841 sel.r[i].q1 = sp->r[i].q0;