20 typedef struct Inst Inst;
23 uint type; /* < 0x10000 ==> literal, otherwise action */
40 Inst *startinst; /* First inst. of program; might not be program[0] */
41 Inst *bstartinst; /* same for backwards machine */
42 Channel *rechan; /* chan(Inst*) */
44 typedef struct Ilist Ilist;
47 Inst *inst; /* Instruction of the thread */
49 uint startp; /* first char of match */
54 Ilist *tl, *nl; /* This list, next list */
56 static Rangeset sempty;
61 * 0x100xx are operators, value == precedence
62 * 0x200xx are tokens, i.e. operands for operators
64 #define OPERATOR 0x10000 /* Bitmask of all operators */
65 #define START 0x10000 /* Start, used for marker on stack */
66 #define RBRA 0x10001 /* Right bracket, ) */
67 #define LBRA 0x10002 /* Left bracket, ( */
68 #define OR 0x10003 /* Alternation, | */
69 #define CAT 0x10004 /* Concatentation, implicit operator */
70 #define STAR 0x10005 /* Closure, * */
71 #define PLUS 0x10006 /* a+ == aa* */
72 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
73 #define ANY 0x20000 /* Any character but newline, . */
74 #define NOP 0x20001 /* No operation, internal use only */
75 #define BOL 0x20002 /* Beginning of line, ^ */
76 #define EOL 0x20003 /* End of line, $ */
77 #define CCLASS 0x20004 /* Character class, [] */
78 #define NCCLASS 0x20005 /* Negated character class, [^] */
79 #define END 0x20077 /* Terminate: match found */
81 #define ISATOR 0x10000
87 typedef struct Node Node;
95 Node andstack[NSTACK];
97 int atorstack[NSTACK];
99 int lastwasand; /* Last token was operand */
101 int subidstack[NSTACK];
105 Rune *exprp; /* pointer to next character in source expression */
106 #define DCLASS 10 /* allocation increment */
107 int nclass; /* number active */
108 int Nclass; /* high water mark */
112 void addinst(Ilist *l, Inst *inst, Rangeset *sep);
113 void newmatch(Rangeset*);
114 void bnewmatch(Rangeset*);
115 void pushand(Inst*, Inst*);
119 void startlex(Rune*);
124 void optimize(Inst*);
125 void bldcclass(void);
130 rechan = chancreate(sizeof(Inst*), 0);
131 chansetname(rechan, "rechan");
132 lastregexp = runemalloc(1);
139 warning(nil, "regexp: %s\n", e);
147 if(progp >= &program[NPROG])
148 regerror("expression too long");
150 progp->u1.left = nil;
151 progp->u.right = nil;
156 realcompile(void *arg)
161 threadsetname("regcomp");
169 /* Start with a low priority operator to prime parser */
171 while((token=lex()) != END){
172 if((token&ISATOR) == OPERATOR)
177 /* Close with a low priority operator */
183 regerror("unmatched `('");
184 --andp; /* points to first and only operand */
185 sendp(rechan, andp->first);
189 /* r is null terminated */
196 nr = runestrlen(r)+1;
197 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
200 for(i=0; i<nclass; i++)
206 threadcreate(realcompile, r, STACK);
207 startinst = recvp(rechan);
213 threadcreate(realcompile, r, STACK);
214 bstartinst = recvp(rechan);
215 if(bstartinst == nil)
218 lastregexp = runerealloc(lastregexp, nr);
219 runemove(lastregexp, r, nr);
228 operator(CAT); /* catenate is implicit */
232 i->type = NCCLASS; /* UGH */
233 i->u.class = nclass-1; /* UGH */
242 if(t==RBRA && --nbra<0)
243 regerror("unmatched `)'");
245 cursubid++; /* silently ignored */
254 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
255 lastwasand = TRUE; /* these look like operands */
259 pushand(Inst *f, Inst *l)
261 if(andp >= &andstack[NSTACK])
262 error("operand stack overflow");
271 if(atorp >= &atorstack[NSTACK])
272 error("operator stack overflow");
274 if(cursubid >= NRange)
285 if(andp <= &andstack[0])
287 sprint(buf, "missing operand for %c", op);
290 regerror("malformed regexp");
297 if(atorp <= &atorstack[0])
298 error("operator stack underflow");
309 while(pri==RBRA || atorp[-1]>=pri){
313 inst2 = newinst(RBRA);
314 inst2->u.subid = *subidp;
315 op1->last->u1.next = inst2;
316 inst1 = newinst(LBRA);
317 inst1->u.subid = *subidp;
318 inst1->u1.next = op1->first;
319 pushand(inst1, inst2);
320 return; /* must have been RBRA */
322 error("unknown regexp operator");
327 inst2 = newinst(NOP);
328 op2->last->u1.next = inst2;
329 op1->last->u1.next = inst2;
331 inst1->u.right = op1->first;
332 inst1->u1.left = op2->first;
333 pushand(inst1, inst2);
338 if(backwards && op2->first->type!=END){
343 op1->last->u1.next = op2->first;
344 pushand(op1->first, op2->last);
349 op2->last->u1.next = inst1;
350 inst1->u.right = op2->first;
351 pushand(inst1, inst1);
356 op2->last->u1.next = inst1;
357 inst1->u.right = op2->first;
358 pushand(op2->first, inst1);
363 inst2 = newinst(NOP);
364 inst1->u1.left = inst2;
365 inst1->u.right = op2->first;
366 op2->last->u1.next = inst2;
367 pushand(inst1, inst2);
375 optimize(Inst *start)
379 for(inst=start; inst->type!=END; inst++){
380 target = inst->u1.next;
381 while(target->type == NOP)
382 target = target->u1.next;
383 inst->u1.next = target;
403 if((c= *exprp++)=='n')
408 --exprp; /* In case we come here again */
448 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
449 regerror("malformed `[]'");
450 if(exprp[0] == '\\'){
456 return *exprp++|0x10000;
467 classp = runemalloc(DCLASS);
470 /* we have already seen the '[' */
472 classp[n++] = '\n'; /* don't match newline in negate case */
477 while((c1 = nextrec()) != ']'){
481 regerror("malformed `[]'");
483 if(n+4 >= na){ /* 3 runes plus NUL */
485 classp = runerealloc(classp, na);
488 exprp++; /* eat '-' */
489 if((c2 = nextrec()) == ']')
491 classp[n+0] = 0xFFFF;
499 if(nclass == Nclass){
501 class = realloc(class, Nclass*sizeof(Rune*));
503 class[nclass++] = classp;
507 classmatch(int classno, int c, int negate)
514 if(p[1]<=c && c<=p[2])
524 * Note optimization in addinst:
525 * *l must be pending when addinst called; if *l has been looked
526 * at already, the optimization is a bug.
529 addinst(Ilist *l, Inst *inst, Rangeset *sep)
533 for(p = l; p->inst; p++){
535 if((sep)->r[0].q0 < p->se.r[0].q0)
536 p->se= *sep; /* this would be bug */
537 return; /* It's already there */
548 return startinst==nil || bstartinst==nil;
551 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
553 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
569 if(startinst->type<OPERATOR)
570 startchar = startinst->type;
571 list[0][0].inst = list[1][0].inst = nil;
577 /* Execute machine once for each character */
582 case 0: /* let loop run one more click */
585 case 1: /* expired; wrap to beginning */
586 if(sel.r[0].q0>=0 || eof!=Infinity)
588 list[0][0].inst = list[1][0].inst = nil;
596 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
603 /* fast check for first char */
604 if(startchar && nnl==0 && c!=startchar)
611 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
612 /* Add first instruction to this list */
615 warning(nil, "regexp list overflow\n");
620 addinst(tl, startinst, &sempty);
622 /* Execute machine until this list is empty */
623 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
626 default: /* regular character */
631 addinst(nl, inst->u1.next, &tlp->se);
636 tlp->se.r[inst->u.subid].q0 = p;
637 inst = inst->u1.next;
641 tlp->se.r[inst->u.subid].q1 = p;
642 inst = inst->u1.next;
649 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
651 inst = inst->u1.next;
660 if(c>=0 && classmatch(inst->u.class, c, 0))
664 if(c>=0 && classmatch(inst->u.class, c, 1))
668 /* evaluate right choice later */
671 addinst(tlp, inst->u.right, &tlp->se);
672 /* efficiency: advance and re-evaluate */
673 inst = inst->u1.left;
675 case END: /* Match! */
684 return sel.r[0].q0 >= 0;
688 newmatch(Rangeset *sp)
690 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
691 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
696 rxbexecute(Text *t, uint startp, Rangeset *rp)
712 if(bstartinst->type<OPERATOR)
713 startchar = bstartinst->type;
714 list[0][0].inst = list[1][0].inst = nil;
716 /* Execute machine once for each character, including terminal NUL */
721 case 0: /* let loop run one more click */
724 case 1: /* expired; wrap to end */
727 list[0][0].inst = list[1][0].inst = nil;
736 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
738 c = textreadc(t, p-1);
740 /* fast check for first char */
741 if(startchar && nnl==0 && c!=startchar)
748 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
749 /* Add first instruction to this list */
752 warning(nil, "regexp list overflow\n");
756 /* the minus is so the optimizations in addinst work */
758 addinst(tl, bstartinst, &sempty);
760 /* Execute machine until this list is empty */
761 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
764 default: /* regular character */
769 addinst(nl, inst->u1.next, &tlp->se);
774 tlp->se.r[inst->u.subid].q0 = p;
775 inst = inst->u1.next;
778 if(inst->u.subid >= 0)
779 tlp->se.r[inst->u.subid].q1 = p;
780 inst = inst->u1.next;
789 inst = inst->u1.next;
794 if(p<t->file->b.nc && textreadc(t, p)=='\n')
798 if(c>0 && classmatch(inst->u.class, c, 0))
802 if(c>0 && classmatch(inst->u.class, c, 1))
806 /* evaluate right choice later */
809 addinst(tlp, inst->u.right, &tlp->se);
810 /* efficiency: advance and re-evaluate */
811 inst = inst->u1.left;
813 case END: /* Match! */
814 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
823 return sel.r[0].q0 >= 0;
827 bnewmatch(Rangeset *sp)
831 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
832 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
833 sel.r[i].q0 = sp->r[i].q1;
834 sel.r[i].q1 = sp->r[i].q0;