Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include <libsec.h>
12 #include "dat.h"
13 #include "fns.h"
15 Rangeset sel;
16 Rune *lastregexp;
18 /*
19 * Machine Information
20 */
21 typedef struct Inst Inst;
22 struct Inst
23 {
24 uint type; /* < OPERATOR ==> literal, otherwise action */
25 union {
26 int sid;
27 int subid;
28 int class;
29 Inst *other;
30 Inst *right;
31 } u;
32 union{
33 Inst *left;
34 Inst *next;
35 } u1;
36 };
38 #define NPROG 1024
39 Inst program[NPROG];
40 Inst *progp;
41 Inst *startinst; /* First inst. of program; might not be program[0] */
42 Inst *bstartinst; /* same for backwards machine */
43 Channel *rechan; /* chan(Inst*) */
45 typedef struct Ilist Ilist;
46 struct Ilist
47 {
48 Inst *inst; /* Instruction of the thread */
49 Rangeset se;
50 uint startp; /* first char of match */
51 };
53 #define NLIST 127
55 Ilist *tl, *nl; /* This list, next list */
56 Ilist list[2][NLIST+1]; /* +1 for trailing null */
57 static Rangeset sempty;
59 /*
60 * Actions and Tokens
61 *
62 * 0x10000xx are operators, value == precedence
63 * 0x20000xx are tokens, i.e. operands for operators
64 */
65 #define OPERATOR 0x1000000 /* Bit set in all operators */
66 #define START (OPERATOR+0) /* Start, used for marker on stack */
67 #define RBRA (OPERATOR+1) /* Right bracket, ) */
68 #define LBRA (OPERATOR+2) /* Left bracket, ( */
69 #define OR (OPERATOR+3) /* Alternation, | */
70 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
71 #define STAR (OPERATOR+5) /* Closure, * */
72 #define PLUS (OPERATOR+6) /* a+ == aa* */
73 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
74 #define ANY 0x2000000 /* Any character but newline, . */
75 #define NOP (ANY+1) /* No operation, internal use only */
76 #define BOL (ANY+2) /* Beginning of line, ^ */
77 #define EOL (ANY+3) /* End of line, $ */
78 #define CCLASS (ANY+4) /* Character class, [] */
79 #define NCCLASS (ANY+5) /* Negated character class, [^] */
80 #define END (ANY+0x77) /* Terminate: match found */
82 #define ISATOR OPERATOR
83 #define ISAND ANY
85 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
87 /*
88 * Parser Information
89 */
90 typedef struct Node Node;
91 struct Node
92 {
93 Inst *first;
94 Inst *last;
95 };
97 #define NSTACK 20
98 Node andstack[NSTACK];
99 Node *andp;
100 int atorstack[NSTACK];
101 int *atorp;
102 int lastwasand; /* Last token was operand */
103 int cursubid;
104 int subidstack[NSTACK];
105 int *subidp;
106 int backwards;
107 int nbra;
108 Rune *exprp; /* pointer to next character in source expression */
109 #define DCLASS 10 /* allocation increment */
110 int nclass; /* number active */
111 int Nclass; /* high water mark */
112 Rune **class;
113 int negateclass;
115 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
116 void newmatch(Rangeset*);
117 void bnewmatch(Rangeset*);
118 void pushand(Inst*, Inst*);
119 void pushator(int);
120 Node *popand(int);
121 int popator(void);
122 void startlex(Rune*);
123 int lex(void);
124 void operator(int);
125 void operand(int);
126 void evaluntil(int);
127 void optimize(Inst*);
128 void bldcclass(void);
130 void
131 rxinit(void)
133 rechan = chancreate(sizeof(Inst*), 0);
134 chansetname(rechan, "rechan");
135 lastregexp = runemalloc(1);
138 void
139 regerror(char *e)
141 lastregexp[0] = 0;
142 warning(nil, "regexp: %s\n", e);
143 sendp(rechan, nil);
144 threadexits(nil);
147 Inst *
148 newinst(int t)
150 if(progp >= &program[NPROG])
151 regerror("expression too long");
152 progp->type = t;
153 progp->u1.left = nil;
154 progp->u.right = nil;
155 return progp++;
158 void
159 realcompile(void *arg)
161 int token;
162 Rune *s;
164 threadsetname("regcomp");
165 s = arg;
166 startlex(s);
167 atorp = atorstack;
168 andp = andstack;
169 subidp = subidstack;
170 cursubid = 0;
171 lastwasand = FALSE;
172 /* Start with a low priority operator to prime parser */
173 pushator(START-1);
174 while((token=lex()) != END){
175 if((token&ISATOR) == OPERATOR)
176 operator(token);
177 else
178 operand(token);
180 /* Close with a low priority operator */
181 evaluntil(START);
182 /* Force END */
183 operand(END);
184 evaluntil(START);
185 if(nbra)
186 regerror("unmatched `('");
187 --andp; /* points to first and only operand */
188 sendp(rechan, andp->first);
189 threadexits(nil);
192 /* r is null terminated */
193 int
194 rxcompile(Rune *r)
196 int i, nr;
197 Inst *oprogp;
199 nr = runestrlen(r)+1;
200 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
201 return TRUE;
202 lastregexp[0] = 0;
203 for(i=0; i<nclass; i++)
204 free(class[i]);
205 nclass = 0;
206 progp = program;
207 backwards = FALSE;
208 bstartinst = nil;
209 threadcreate(realcompile, r, STACK);
210 startinst = recvp(rechan);
211 if(startinst == nil)
212 return FALSE;
213 optimize(program);
214 oprogp = progp;
215 backwards = TRUE;
216 threadcreate(realcompile, r, STACK);
217 bstartinst = recvp(rechan);
218 if(bstartinst == nil)
219 return FALSE;
220 optimize(oprogp);
221 lastregexp = runerealloc(lastregexp, nr);
222 runemove(lastregexp, r, nr);
223 return TRUE;
226 void
227 operand(int t)
229 Inst *i;
230 if(lastwasand)
231 operator(CAT); /* catenate is implicit */
232 i = newinst(t);
233 if(t == CCLASS){
234 if(negateclass)
235 i->type = NCCLASS; /* UGH */
236 i->u.class = nclass-1; /* UGH */
238 pushand(i, i);
239 lastwasand = TRUE;
242 void
243 operator(int t)
245 if(t==RBRA && --nbra<0)
246 regerror("unmatched `)'");
247 if(t==LBRA){
248 cursubid++; /* silently ignored */
249 nbra++;
250 if(lastwasand)
251 operator(CAT);
252 }else
253 evaluntil(t);
254 if(t!=RBRA)
255 pushator(t);
256 lastwasand = FALSE;
257 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
258 lastwasand = TRUE; /* these look like operands */
261 void
262 pushand(Inst *f, Inst *l)
264 if(andp >= &andstack[NSTACK])
265 error("operand stack overflow");
266 andp->first = f;
267 andp->last = l;
268 andp++;
271 void
272 pushator(int t)
274 if(atorp >= &atorstack[NSTACK])
275 error("operator stack overflow");
276 *atorp++=t;
277 if(cursubid >= NRange)
278 *subidp++= -1;
279 else
280 *subidp++=cursubid;
283 Node *
284 popand(int op)
286 char buf[64];
288 if(andp <= &andstack[0])
289 if(op){
290 sprint(buf, "missing operand for %c", op);
291 regerror(buf);
292 }else
293 regerror("malformed regexp");
294 return --andp;
297 int
298 popator()
300 if(atorp <= &atorstack[0])
301 error("operator stack underflow");
302 --subidp;
303 return *--atorp;
306 void
307 evaluntil(int pri)
309 Node *op1, *op2, *t;
310 Inst *inst1, *inst2;
312 while(pri==RBRA || atorp[-1]>=pri){
313 switch(popator()){
314 case LBRA:
315 op1 = popand('(');
316 inst2 = newinst(RBRA);
317 inst2->u.subid = *subidp;
318 op1->last->u1.next = inst2;
319 inst1 = newinst(LBRA);
320 inst1->u.subid = *subidp;
321 inst1->u1.next = op1->first;
322 pushand(inst1, inst2);
323 return; /* must have been RBRA */
324 default:
325 error("unknown regexp operator");
326 break;
327 case OR:
328 op2 = popand('|');
329 op1 = popand('|');
330 inst2 = newinst(NOP);
331 op2->last->u1.next = inst2;
332 op1->last->u1.next = inst2;
333 inst1 = newinst(OR);
334 inst1->u.right = op1->first;
335 inst1->u1.left = op2->first;
336 pushand(inst1, inst2);
337 break;
338 case CAT:
339 op2 = popand(0);
340 op1 = popand(0);
341 if(backwards && op2->first->type!=END){
342 t = op1;
343 op1 = op2;
344 op2 = t;
346 op1->last->u1.next = op2->first;
347 pushand(op1->first, op2->last);
348 break;
349 case STAR:
350 op2 = popand('*');
351 inst1 = newinst(OR);
352 op2->last->u1.next = inst1;
353 inst1->u.right = op2->first;
354 pushand(inst1, inst1);
355 break;
356 case PLUS:
357 op2 = popand('+');
358 inst1 = newinst(OR);
359 op2->last->u1.next = inst1;
360 inst1->u.right = op2->first;
361 pushand(op2->first, inst1);
362 break;
363 case QUEST:
364 op2 = popand('?');
365 inst1 = newinst(OR);
366 inst2 = newinst(NOP);
367 inst1->u1.left = inst2;
368 inst1->u.right = op2->first;
369 op2->last->u1.next = inst2;
370 pushand(inst1, inst2);
371 break;
377 void
378 optimize(Inst *start)
380 Inst *inst, *target;
382 for(inst=start; inst->type!=END; inst++){
383 target = inst->u1.next;
384 while(target->type == NOP)
385 target = target->u1.next;
386 inst->u1.next = target;
390 void
391 startlex(Rune *s)
393 exprp = s;
394 nbra = 0;
398 int
399 lex(void){
400 int c;
402 c = *exprp++;
403 switch(c){
404 case '\\':
405 if(*exprp)
406 if((c= *exprp++)=='n')
407 c='\n';
408 break;
409 case 0:
410 c = END;
411 --exprp; /* In case we come here again */
412 break;
413 case '*':
414 c = STAR;
415 break;
416 case '?':
417 c = QUEST;
418 break;
419 case '+':
420 c = PLUS;
421 break;
422 case '|':
423 c = OR;
424 break;
425 case '.':
426 c = ANY;
427 break;
428 case '(':
429 c = LBRA;
430 break;
431 case ')':
432 c = RBRA;
433 break;
434 case '^':
435 c = BOL;
436 break;
437 case '$':
438 c = EOL;
439 break;
440 case '[':
441 c = CCLASS;
442 bldcclass();
443 break;
445 return c;
448 int
449 nextrec(void)
451 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
452 regerror("malformed `[]'");
453 if(exprp[0] == '\\'){
454 exprp++;
455 if(*exprp=='n'){
456 exprp++;
457 return '\n';
459 return *exprp++|QUOTED;
461 return *exprp++;
464 void
465 bldcclass(void)
467 int c1, c2, n, na;
468 Rune *classp;
470 classp = runemalloc(DCLASS);
471 n = 0;
472 na = DCLASS;
473 /* we have already seen the '[' */
474 if(*exprp == '^'){
475 classp[n++] = '\n'; /* don't match newline in negate case */
476 negateclass = TRUE;
477 exprp++;
478 }else
479 negateclass = FALSE;
480 while((c1 = nextrec()) != ']'){
481 if(c1 == '-'){
482 Error:
483 free(classp);
484 regerror("malformed `[]'");
486 if(n+4 >= na){ /* 3 runes plus NUL */
487 na += DCLASS;
488 classp = runerealloc(classp, na);
490 if(*exprp == '-'){
491 exprp++; /* eat '-' */
492 if((c2 = nextrec()) == ']')
493 goto Error;
494 classp[n+0] = Runemax;
495 classp[n+1] = c1;
496 classp[n+2] = c2;
497 n += 3;
498 }else
499 classp[n++] = c1 & ~QUOTED;
501 classp[n] = 0;
502 if(nclass == Nclass){
503 Nclass += DCLASS;
504 class = realloc(class, Nclass*sizeof(Rune*));
506 class[nclass++] = classp;
509 int
510 classmatch(int classno, int c, int negate)
512 Rune *p;
514 p = class[classno];
515 while(*p){
516 if(*p == Runemax){
517 if(p[1]<=c && c<=p[2])
518 return !negate;
519 p += 3;
520 }else if(*p++ == c)
521 return !negate;
523 return negate;
526 /*
527 * Note optimization in addinst:
528 * *l must be pending when addinst called; if *l has been looked
529 * at already, the optimization is a bug.
530 */
531 int
532 addinst(Ilist *l, Inst *inst, Rangeset *sep)
534 Ilist *p;
536 for(p = l; p->inst; p++){
537 if(p->inst==inst){
538 if((sep)->r[0].q0 < p->se.r[0].q0)
539 p->se= *sep; /* this would be bug */
540 return 0; /* It's already there */
543 p->inst = inst;
544 p->se= *sep;
545 (p+1)->inst = nil;
546 return 1;
549 int
550 rxnull(void)
552 return startinst==nil || bstartinst==nil;
555 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
556 int
557 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
559 int flag;
560 Inst *inst;
561 Ilist *tlp;
562 uint p;
563 int nnl, ntl;
564 int nc, c;
565 int wrapped;
566 int startchar;
568 flag = 0;
569 p = startp;
570 startchar = 0;
571 wrapped = 0;
572 nnl = 0;
573 if(startinst->type<OPERATOR)
574 startchar = startinst->type;
575 list[0][0].inst = list[1][0].inst = nil;
576 sel.r[0].q0 = -1;
577 if(t != nil)
578 nc = t->file->b.nc;
579 else
580 nc = runestrlen(r);
581 /* Execute machine once for each character */
582 for(;;p++){
583 doloop:
584 if(p>=eof || p>=nc){
585 switch(wrapped++){
586 case 0: /* let loop run one more click */
587 case 2:
588 break;
589 case 1: /* expired; wrap to beginning */
590 if(sel.r[0].q0>=0 || eof!=Infinity)
591 goto Return;
592 list[0][0].inst = list[1][0].inst = nil;
593 p = 0;
594 goto doloop;
595 default:
596 goto Return;
598 c = 0;
599 }else{
600 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
601 break;
602 if(t != nil)
603 c = textreadc(t, p);
604 else
605 c = r[p];
607 /* fast check for first char */
608 if(startchar && nnl==0 && c!=startchar)
609 continue;
610 tl = list[flag];
611 nl = list[flag^=1];
612 nl->inst = nil;
613 ntl = nnl;
614 nnl = 0;
615 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
616 /* Add first instruction to this list */
617 sempty.r[0].q0 = p;
618 if(addinst(tl, startinst, &sempty))
619 if(++ntl >= NLIST){
620 Overflow:
621 warning(nil, "regexp list overflow\n");
622 sel.r[0].q0 = -1;
623 goto Return;
626 /* Execute machine until this list is empty */
627 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
628 Switchstmt:
629 switch(inst->type){
630 default: /* regular character */
631 if(inst->type==c){
632 Addinst:
633 if(addinst(nl, inst->u1.next, &tlp->se))
634 if(++nnl >= NLIST)
635 goto Overflow;
637 break;
638 case LBRA:
639 if(inst->u.subid>=0)
640 tlp->se.r[inst->u.subid].q0 = p;
641 inst = inst->u1.next;
642 goto Switchstmt;
643 case RBRA:
644 if(inst->u.subid>=0)
645 tlp->se.r[inst->u.subid].q1 = p;
646 inst = inst->u1.next;
647 goto Switchstmt;
648 case ANY:
649 if(c!='\n')
650 goto Addinst;
651 break;
652 case BOL:
653 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
654 Step:
655 inst = inst->u1.next;
656 goto Switchstmt;
658 break;
659 case EOL:
660 if(c == '\n')
661 goto Step;
662 break;
663 case CCLASS:
664 if(c>=0 && classmatch(inst->u.class, c, 0))
665 goto Addinst;
666 break;
667 case NCCLASS:
668 if(c>=0 && classmatch(inst->u.class, c, 1))
669 goto Addinst;
670 break;
671 case OR:
672 /* evaluate right choice later */
673 if(addinst(tlp, inst->u.right, &tlp->se))
674 if(++ntl >= NLIST)
675 goto Overflow;
676 /* efficiency: advance and re-evaluate */
677 inst = inst->u1.left;
678 goto Switchstmt;
679 case END: /* Match! */
680 tlp->se.r[0].q1 = p;
681 newmatch(&tlp->se);
682 break;
686 Return:
687 *rp = sel;
688 return sel.r[0].q0 >= 0;
691 void
692 newmatch(Rangeset *sp)
694 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
695 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
696 sel = *sp;
699 int
700 rxbexecute(Text *t, uint startp, Rangeset *rp)
702 int flag;
703 Inst *inst;
704 Ilist *tlp;
705 int p;
706 int nnl, ntl;
707 int c;
708 int wrapped;
709 int startchar;
711 flag = 0;
712 nnl = 0;
713 wrapped = 0;
714 p = startp;
715 startchar = 0;
716 if(bstartinst->type<OPERATOR)
717 startchar = bstartinst->type;
718 list[0][0].inst = list[1][0].inst = nil;
719 sel.r[0].q0= -1;
720 /* Execute machine once for each character, including terminal NUL */
721 for(;;--p){
722 doloop:
723 if(p <= 0){
724 switch(wrapped++){
725 case 0: /* let loop run one more click */
726 case 2:
727 break;
728 case 1: /* expired; wrap to end */
729 if(sel.r[0].q0>=0)
730 goto Return;
731 list[0][0].inst = list[1][0].inst = nil;
732 p = t->file->b.nc;
733 goto doloop;
734 case 3:
735 default:
736 goto Return;
738 c = 0;
739 }else{
740 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
741 break;
742 c = textreadc(t, p-1);
744 /* fast check for first char */
745 if(startchar && nnl==0 && c!=startchar)
746 continue;
747 tl = list[flag];
748 nl = list[flag^=1];
749 nl->inst = nil;
750 ntl = nnl;
751 nnl = 0;
752 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
753 /* Add first instruction to this list */
754 /* the minus is so the optimizations in addinst work */
755 sempty.r[0].q0 = -p;
756 if(addinst(tl, bstartinst, &sempty))
757 if(++ntl >= NLIST){
758 Overflow:
759 warning(nil, "regexp list overflow\n");
760 sel.r[0].q0 = -1;
761 goto Return;
764 /* Execute machine until this list is empty */
765 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
766 Switchstmt:
767 switch(inst->type){
768 default: /* regular character */
769 if(inst->type == c){
770 Addinst:
771 if(addinst(nl, inst->u1.next, &tlp->se))
772 if(++nnl >= NLIST)
773 goto Overflow;
775 break;
776 case LBRA:
777 if(inst->u.subid>=0)
778 tlp->se.r[inst->u.subid].q0 = p;
779 inst = inst->u1.next;
780 goto Switchstmt;
781 case RBRA:
782 if(inst->u.subid >= 0)
783 tlp->se.r[inst->u.subid].q1 = p;
784 inst = inst->u1.next;
785 goto Switchstmt;
786 case ANY:
787 if(c != '\n')
788 goto Addinst;
789 break;
790 case BOL:
791 if(c=='\n' || p==0){
792 Step:
793 inst = inst->u1.next;
794 goto Switchstmt;
796 break;
797 case EOL:
798 if(p<t->file->b.nc && textreadc(t, p)=='\n')
799 goto Step;
800 break;
801 case CCLASS:
802 if(c>0 && classmatch(inst->u.class, c, 0))
803 goto Addinst;
804 break;
805 case NCCLASS:
806 if(c>0 && classmatch(inst->u.class, c, 1))
807 goto Addinst;
808 break;
809 case OR:
810 /* evaluate right choice later */
811 if(addinst(tl, inst->u.right, &tlp->se))
812 if(++ntl >= NLIST)
813 goto Overflow;
814 /* efficiency: advance and re-evaluate */
815 inst = inst->u1.left;
816 goto Switchstmt;
817 case END: /* Match! */
818 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
819 tlp->se.r[0].q1 = p;
820 bnewmatch(&tlp->se);
821 break;
825 Return:
826 *rp = sel;
827 return sel.r[0].q0 >= 0;
830 void
831 bnewmatch(Rangeset *sp)
833 int i;
835 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
836 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
837 sel.r[i].q0 = sp->r[i].q1;
838 sel.r[i].q1 = sp->r[i].q0;