Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include "dat.h"
12 #include "fns.h"
14 Rangeset sel;
15 Rune *lastregexp;
17 /*
18 * Machine Information
19 */
20 typedef struct Inst Inst;
21 struct Inst
22 {
23 uint type; /* < OPERATOR ==> literal, otherwise action */
24 union {
25 int sid;
26 int subid;
27 int class;
28 Inst *other;
29 Inst *right;
30 } u;
31 union{
32 Inst *left;
33 Inst *next;
34 } u1;
35 };
37 #define NPROG 1024
38 Inst program[NPROG];
39 Inst *progp;
40 Inst *startinst; /* First inst. of program; might not be program[0] */
41 Inst *bstartinst; /* same for backwards machine */
42 Channel *rechan; /* chan(Inst*) */
44 typedef struct Ilist Ilist;
45 struct Ilist
46 {
47 Inst *inst; /* Instruction of the thread */
48 Rangeset se;
49 uint startp; /* first char of match */
50 };
52 #define NLIST 127
54 Ilist *tl, *nl; /* This list, next list */
55 Ilist list[2][NLIST+1]; /* +1 for trailing null */
56 static Rangeset sempty;
58 /*
59 * Actions and Tokens
60 *
61 * 0x10000xx are operators, value == precedence
62 * 0x20000xx are tokens, i.e. operands for operators
63 */
64 #define OPERATOR 0x1000000 /* Bit set in all operators */
65 #define START (OPERATOR+0) /* Start, used for marker on stack */
66 #define RBRA (OPERATOR+1) /* Right bracket, ) */
67 #define LBRA (OPERATOR+2) /* Left bracket, ( */
68 #define OR (OPERATOR+3) /* Alternation, | */
69 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
70 #define STAR (OPERATOR+5) /* Closure, * */
71 #define PLUS (OPERATOR+6) /* a+ == aa* */
72 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
73 #define ANY 0x2000000 /* Any character but newline, . */
74 #define NOP (ANY+1) /* No operation, internal use only */
75 #define BOL (ANY+2) /* Beginning of line, ^ */
76 #define EOL (ANY+3) /* End of line, $ */
77 #define CCLASS (ANY+4) /* Character class, [] */
78 #define NCCLASS (ANY+5) /* Negated character class, [^] */
79 #define END (ANY+0x77) /* Terminate: match found */
81 #define ISATOR OPERATOR
82 #define ISAND ANY
84 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
86 /*
87 * Parser Information
88 */
89 typedef struct Node Node;
90 struct Node
91 {
92 Inst *first;
93 Inst *last;
94 };
96 #define NSTACK 20
97 Node andstack[NSTACK];
98 Node *andp;
99 int atorstack[NSTACK];
100 int *atorp;
101 int lastwasand; /* Last token was operand */
102 int cursubid;
103 int subidstack[NSTACK];
104 int *subidp;
105 int backwards;
106 int nbra;
107 Rune *exprp; /* pointer to next character in source expression */
108 #define DCLASS 10 /* allocation increment */
109 int nclass; /* number active */
110 int Nclass; /* high water mark */
111 Rune **class;
112 int negateclass;
114 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
115 void newmatch(Rangeset*);
116 void bnewmatch(Rangeset*);
117 void pushand(Inst*, Inst*);
118 void pushator(int);
119 Node *popand(int);
120 int popator(void);
121 void startlex(Rune*);
122 int lex(void);
123 void operator(int);
124 void operand(int);
125 void evaluntil(int);
126 void optimize(Inst*);
127 void bldcclass(void);
129 void
130 rxinit(void)
132 rechan = chancreate(sizeof(Inst*), 0);
133 chansetname(rechan, "rechan");
134 lastregexp = runemalloc(1);
137 void
138 regerror(char *e)
140 lastregexp[0] = 0;
141 warning(nil, "regexp: %s\n", e);
142 sendp(rechan, nil);
143 threadexits(nil);
146 Inst *
147 newinst(int t)
149 if(progp >= &program[NPROG])
150 regerror("expression too long");
151 progp->type = t;
152 progp->u1.left = nil;
153 progp->u.right = nil;
154 return progp++;
157 void
158 realcompile(void *arg)
160 int token;
161 Rune *s;
163 threadsetname("regcomp");
164 s = arg;
165 startlex(s);
166 atorp = atorstack;
167 andp = andstack;
168 subidp = subidstack;
169 cursubid = 0;
170 lastwasand = FALSE;
171 /* Start with a low priority operator to prime parser */
172 pushator(START-1);
173 while((token=lex()) != END){
174 if((token&ISATOR) == OPERATOR)
175 operator(token);
176 else
177 operand(token);
179 /* Close with a low priority operator */
180 evaluntil(START);
181 /* Force END */
182 operand(END);
183 evaluntil(START);
184 if(nbra)
185 regerror("unmatched `('");
186 --andp; /* points to first and only operand */
187 sendp(rechan, andp->first);
188 threadexits(nil);
191 /* r is null terminated */
192 int
193 rxcompile(Rune *r)
195 int i, nr;
196 Inst *oprogp;
198 nr = runestrlen(r)+1;
199 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
200 return TRUE;
201 lastregexp[0] = 0;
202 for(i=0; i<nclass; i++)
203 free(class[i]);
204 nclass = 0;
205 progp = program;
206 backwards = FALSE;
207 bstartinst = nil;
208 threadcreate(realcompile, r, STACK);
209 startinst = recvp(rechan);
210 if(startinst == nil)
211 return FALSE;
212 optimize(program);
213 oprogp = progp;
214 backwards = TRUE;
215 threadcreate(realcompile, r, STACK);
216 bstartinst = recvp(rechan);
217 if(bstartinst == nil)
218 return FALSE;
219 optimize(oprogp);
220 lastregexp = runerealloc(lastregexp, nr);
221 runemove(lastregexp, r, nr);
222 return TRUE;
225 void
226 operand(int t)
228 Inst *i;
229 if(lastwasand)
230 operator(CAT); /* catenate is implicit */
231 i = newinst(t);
232 if(t == CCLASS){
233 if(negateclass)
234 i->type = NCCLASS; /* UGH */
235 i->u.class = nclass-1; /* UGH */
237 pushand(i, i);
238 lastwasand = TRUE;
241 void
242 operator(int t)
244 if(t==RBRA && --nbra<0)
245 regerror("unmatched `)'");
246 if(t==LBRA){
247 cursubid++; /* silently ignored */
248 nbra++;
249 if(lastwasand)
250 operator(CAT);
251 }else
252 evaluntil(t);
253 if(t!=RBRA)
254 pushator(t);
255 lastwasand = FALSE;
256 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
257 lastwasand = TRUE; /* these look like operands */
260 void
261 pushand(Inst *f, Inst *l)
263 if(andp >= &andstack[NSTACK])
264 error("operand stack overflow");
265 andp->first = f;
266 andp->last = l;
267 andp++;
270 void
271 pushator(int t)
273 if(atorp >= &atorstack[NSTACK])
274 error("operator stack overflow");
275 *atorp++=t;
276 if(cursubid >= NRange)
277 *subidp++= -1;
278 else
279 *subidp++=cursubid;
282 Node *
283 popand(int op)
285 char buf[64];
287 if(andp <= &andstack[0])
288 if(op){
289 sprint(buf, "missing operand for %c", op);
290 regerror(buf);
291 }else
292 regerror("malformed regexp");
293 return --andp;
296 int
297 popator()
299 if(atorp <= &atorstack[0])
300 error("operator stack underflow");
301 --subidp;
302 return *--atorp;
305 void
306 evaluntil(int pri)
308 Node *op1, *op2, *t;
309 Inst *inst1, *inst2;
311 while(pri==RBRA || atorp[-1]>=pri){
312 switch(popator()){
313 case LBRA:
314 op1 = popand('(');
315 inst2 = newinst(RBRA);
316 inst2->u.subid = *subidp;
317 op1->last->u1.next = inst2;
318 inst1 = newinst(LBRA);
319 inst1->u.subid = *subidp;
320 inst1->u1.next = op1->first;
321 pushand(inst1, inst2);
322 return; /* must have been RBRA */
323 default:
324 error("unknown regexp operator");
325 break;
326 case OR:
327 op2 = popand('|');
328 op1 = popand('|');
329 inst2 = newinst(NOP);
330 op2->last->u1.next = inst2;
331 op1->last->u1.next = inst2;
332 inst1 = newinst(OR);
333 inst1->u.right = op1->first;
334 inst1->u1.left = op2->first;
335 pushand(inst1, inst2);
336 break;
337 case CAT:
338 op2 = popand(0);
339 op1 = popand(0);
340 if(backwards && op2->first->type!=END){
341 t = op1;
342 op1 = op2;
343 op2 = t;
345 op1->last->u1.next = op2->first;
346 pushand(op1->first, op2->last);
347 break;
348 case STAR:
349 op2 = popand('*');
350 inst1 = newinst(OR);
351 op2->last->u1.next = inst1;
352 inst1->u.right = op2->first;
353 pushand(inst1, inst1);
354 break;
355 case PLUS:
356 op2 = popand('+');
357 inst1 = newinst(OR);
358 op2->last->u1.next = inst1;
359 inst1->u.right = op2->first;
360 pushand(op2->first, inst1);
361 break;
362 case QUEST:
363 op2 = popand('?');
364 inst1 = newinst(OR);
365 inst2 = newinst(NOP);
366 inst1->u1.left = inst2;
367 inst1->u.right = op2->first;
368 op2->last->u1.next = inst2;
369 pushand(inst1, inst2);
370 break;
376 void
377 optimize(Inst *start)
379 Inst *inst, *target;
381 for(inst=start; inst->type!=END; inst++){
382 target = inst->u1.next;
383 while(target->type == NOP)
384 target = target->u1.next;
385 inst->u1.next = target;
389 void
390 startlex(Rune *s)
392 exprp = s;
393 nbra = 0;
397 int
398 lex(void){
399 int c;
401 c = *exprp++;
402 switch(c){
403 case '\\':
404 if(*exprp)
405 if((c= *exprp++)=='n')
406 c='\n';
407 break;
408 case 0:
409 c = END;
410 --exprp; /* In case we come here again */
411 break;
412 case '*':
413 c = STAR;
414 break;
415 case '?':
416 c = QUEST;
417 break;
418 case '+':
419 c = PLUS;
420 break;
421 case '|':
422 c = OR;
423 break;
424 case '.':
425 c = ANY;
426 break;
427 case '(':
428 c = LBRA;
429 break;
430 case ')':
431 c = RBRA;
432 break;
433 case '^':
434 c = BOL;
435 break;
436 case '$':
437 c = EOL;
438 break;
439 case '[':
440 c = CCLASS;
441 bldcclass();
442 break;
444 return c;
447 int
448 nextrec(void)
450 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
451 regerror("malformed `[]'");
452 if(exprp[0] == '\\'){
453 exprp++;
454 if(*exprp=='n'){
455 exprp++;
456 return '\n';
458 return *exprp++|QUOTED;
460 return *exprp++;
463 void
464 bldcclass(void)
466 int c1, c2, n, na;
467 Rune *classp;
469 classp = runemalloc(DCLASS);
470 n = 0;
471 na = DCLASS;
472 /* we have already seen the '[' */
473 if(*exprp == '^'){
474 classp[n++] = '\n'; /* don't match newline in negate case */
475 negateclass = TRUE;
476 exprp++;
477 }else
478 negateclass = FALSE;
479 while((c1 = nextrec()) != ']'){
480 if(c1 == '-'){
481 Error:
482 free(classp);
483 regerror("malformed `[]'");
485 if(n+4 >= na){ /* 3 runes plus NUL */
486 na += DCLASS;
487 classp = runerealloc(classp, na);
489 if(*exprp == '-'){
490 exprp++; /* eat '-' */
491 if((c2 = nextrec()) == ']')
492 goto Error;
493 classp[n+0] = Runemax;
494 classp[n+1] = c1;
495 classp[n+2] = c2;
496 n += 3;
497 }else
498 classp[n++] = c1 & ~QUOTED;
500 classp[n] = 0;
501 if(nclass == Nclass){
502 Nclass += DCLASS;
503 class = realloc(class, Nclass*sizeof(Rune*));
505 class[nclass++] = classp;
508 int
509 classmatch(int classno, int c, int negate)
511 Rune *p;
513 p = class[classno];
514 while(*p){
515 if(*p == Runemax){
516 if(p[1]<=c && c<=p[2])
517 return !negate;
518 p += 3;
519 }else if(*p++ == c)
520 return !negate;
522 return negate;
525 /*
526 * Note optimization in addinst:
527 * *l must be pending when addinst called; if *l has been looked
528 * at already, the optimization is a bug.
529 */
530 int
531 addinst(Ilist *l, Inst *inst, Rangeset *sep)
533 Ilist *p;
535 for(p = l; p->inst; p++){
536 if(p->inst==inst){
537 if((sep)->r[0].q0 < p->se.r[0].q0)
538 p->se= *sep; /* this would be bug */
539 return 0; /* It's already there */
542 p->inst = inst;
543 p->se= *sep;
544 (p+1)->inst = nil;
545 return 1;
548 int
549 rxnull(void)
551 return startinst==nil || bstartinst==nil;
554 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
555 int
556 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
558 int flag;
559 Inst *inst;
560 Ilist *tlp;
561 uint p;
562 int nnl, ntl;
563 int nc, c;
564 int wrapped;
565 int startchar;
567 flag = 0;
568 p = startp;
569 startchar = 0;
570 wrapped = 0;
571 nnl = 0;
572 if(startinst->type<OPERATOR)
573 startchar = startinst->type;
574 list[0][0].inst = list[1][0].inst = nil;
575 sel.r[0].q0 = -1;
576 if(t != nil)
577 nc = t->file->b.nc;
578 else
579 nc = runestrlen(r);
580 /* Execute machine once for each character */
581 for(;;p++){
582 doloop:
583 if(p>=eof || p>=nc){
584 switch(wrapped++){
585 case 0: /* let loop run one more click */
586 case 2:
587 break;
588 case 1: /* expired; wrap to beginning */
589 if(sel.r[0].q0>=0 || eof!=Infinity)
590 goto Return;
591 list[0][0].inst = list[1][0].inst = nil;
592 p = 0;
593 goto doloop;
594 default:
595 goto Return;
597 c = 0;
598 }else{
599 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
600 break;
601 if(t != nil)
602 c = textreadc(t, p);
603 else
604 c = r[p];
606 /* fast check for first char */
607 if(startchar && nnl==0 && c!=startchar)
608 continue;
609 tl = list[flag];
610 nl = list[flag^=1];
611 nl->inst = nil;
612 ntl = nnl;
613 nnl = 0;
614 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
615 /* Add first instruction to this list */
616 sempty.r[0].q0 = p;
617 if(addinst(tl, startinst, &sempty))
618 if(++ntl >= NLIST){
619 Overflow:
620 warning(nil, "regexp list overflow\n");
621 sel.r[0].q0 = -1;
622 goto Return;
625 /* Execute machine until this list is empty */
626 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
627 Switchstmt:
628 switch(inst->type){
629 default: /* regular character */
630 if(inst->type==c){
631 Addinst:
632 if(addinst(nl, inst->u1.next, &tlp->se))
633 if(++nnl >= NLIST)
634 goto Overflow;
636 break;
637 case LBRA:
638 if(inst->u.subid>=0)
639 tlp->se.r[inst->u.subid].q0 = p;
640 inst = inst->u1.next;
641 goto Switchstmt;
642 case RBRA:
643 if(inst->u.subid>=0)
644 tlp->se.r[inst->u.subid].q1 = p;
645 inst = inst->u1.next;
646 goto Switchstmt;
647 case ANY:
648 if(c!='\n')
649 goto Addinst;
650 break;
651 case BOL:
652 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
653 Step:
654 inst = inst->u1.next;
655 goto Switchstmt;
657 break;
658 case EOL:
659 if(c == '\n')
660 goto Step;
661 break;
662 case CCLASS:
663 if(c>=0 && classmatch(inst->u.class, c, 0))
664 goto Addinst;
665 break;
666 case NCCLASS:
667 if(c>=0 && classmatch(inst->u.class, c, 1))
668 goto Addinst;
669 break;
670 case OR:
671 /* evaluate right choice later */
672 if(addinst(tlp, inst->u.right, &tlp->se))
673 if(++ntl >= NLIST)
674 goto Overflow;
675 /* efficiency: advance and re-evaluate */
676 inst = inst->u1.left;
677 goto Switchstmt;
678 case END: /* Match! */
679 tlp->se.r[0].q1 = p;
680 newmatch(&tlp->se);
681 break;
685 Return:
686 *rp = sel;
687 return sel.r[0].q0 >= 0;
690 void
691 newmatch(Rangeset *sp)
693 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
694 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
695 sel = *sp;
698 int
699 rxbexecute(Text *t, uint startp, Rangeset *rp)
701 int flag;
702 Inst *inst;
703 Ilist *tlp;
704 int p;
705 int nnl, ntl;
706 int c;
707 int wrapped;
708 int startchar;
710 flag = 0;
711 nnl = 0;
712 wrapped = 0;
713 p = startp;
714 startchar = 0;
715 if(bstartinst->type<OPERATOR)
716 startchar = bstartinst->type;
717 list[0][0].inst = list[1][0].inst = nil;
718 sel.r[0].q0= -1;
719 /* Execute machine once for each character, including terminal NUL */
720 for(;;--p){
721 doloop:
722 if(p <= 0){
723 switch(wrapped++){
724 case 0: /* let loop run one more click */
725 case 2:
726 break;
727 case 1: /* expired; wrap to end */
728 if(sel.r[0].q0>=0)
729 goto Return;
730 list[0][0].inst = list[1][0].inst = nil;
731 p = t->file->b.nc;
732 goto doloop;
733 case 3:
734 default:
735 goto Return;
737 c = 0;
738 }else{
739 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
740 break;
741 c = textreadc(t, p-1);
743 /* fast check for first char */
744 if(startchar && nnl==0 && c!=startchar)
745 continue;
746 tl = list[flag];
747 nl = list[flag^=1];
748 nl->inst = nil;
749 ntl = nnl;
750 nnl = 0;
751 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
752 /* Add first instruction to this list */
753 /* the minus is so the optimizations in addinst work */
754 sempty.r[0].q0 = -p;
755 if(addinst(tl, bstartinst, &sempty))
756 if(++ntl >= NLIST){
757 Overflow:
758 warning(nil, "regexp list overflow\n");
759 sel.r[0].q0 = -1;
760 goto Return;
763 /* Execute machine until this list is empty */
764 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
765 Switchstmt:
766 switch(inst->type){
767 default: /* regular character */
768 if(inst->type == c){
769 Addinst:
770 if(addinst(nl, inst->u1.next, &tlp->se))
771 if(++nnl >= NLIST)
772 goto Overflow;
774 break;
775 case LBRA:
776 if(inst->u.subid>=0)
777 tlp->se.r[inst->u.subid].q0 = p;
778 inst = inst->u1.next;
779 goto Switchstmt;
780 case RBRA:
781 if(inst->u.subid >= 0)
782 tlp->se.r[inst->u.subid].q1 = p;
783 inst = inst->u1.next;
784 goto Switchstmt;
785 case ANY:
786 if(c != '\n')
787 goto Addinst;
788 break;
789 case BOL:
790 if(c=='\n' || p==0){
791 Step:
792 inst = inst->u1.next;
793 goto Switchstmt;
795 break;
796 case EOL:
797 if(p<t->file->b.nc && textreadc(t, p)=='\n')
798 goto Step;
799 break;
800 case CCLASS:
801 if(c>0 && classmatch(inst->u.class, c, 0))
802 goto Addinst;
803 break;
804 case NCCLASS:
805 if(c>0 && classmatch(inst->u.class, c, 1))
806 goto Addinst;
807 break;
808 case OR:
809 /* evaluate right choice later */
810 if(addinst(tl, inst->u.right, &tlp->se))
811 if(++ntl >= NLIST)
812 goto Overflow;
813 /* efficiency: advance and re-evaluate */
814 inst = inst->u1.left;
815 goto Switchstmt;
816 case END: /* Match! */
817 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
818 tlp->se.r[0].q1 = p;
819 bnewmatch(&tlp->se);
820 break;
824 Return:
825 *rp = sel;
826 return sel.r[0].q0 >= 0;
829 void
830 bnewmatch(Rangeset *sp)
832 int i;
834 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
835 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
836 sel.r[i].q0 = sp->r[i].q1;
837 sel.r[i].q1 = sp->r[i].q0;