Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include "dat.h"
12 #include "fns.h"
14 Rangeset sel;
15 Rune *lastregexp;
17 /*
18 * Machine Information
19 */
20 typedef struct Inst Inst;
21 struct Inst
22 {
23 uint type; /* < 0x10000 ==> literal, otherwise action */
24 union {
25 int sid;
26 int subid;
27 int class;
28 Inst *other;
29 Inst *right;
30 } u;
31 union{
32 Inst *left;
33 Inst *next;
34 } u1;
35 };
37 #define NPROG 1024
38 Inst program[NPROG];
39 Inst *progp;
40 Inst *startinst; /* First inst. of program; might not be program[0] */
41 Inst *bstartinst; /* same for backwards machine */
42 Channel *rechan; /* chan(Inst*) */
44 typedef struct Ilist Ilist;
45 struct Ilist
46 {
47 Inst *inst; /* Instruction of the thread */
48 Rangeset se;
49 uint startp; /* first char of match */
50 };
52 #define NLIST 127
54 Ilist *tl, *nl; /* This list, next list */
55 Ilist list[2][NLIST+1]; /* +1 for trailing null */
56 static Rangeset sempty;
58 /*
59 * Actions and Tokens
60 *
61 * 0x100xx are operators, value == precedence
62 * 0x200xx are tokens, i.e. operands for operators
63 */
64 #define OPERATOR 0x10000 /* Bitmask of all operators */
65 #define START 0x10000 /* Start, used for marker on stack */
66 #define RBRA 0x10001 /* Right bracket, ) */
67 #define LBRA 0x10002 /* Left bracket, ( */
68 #define OR 0x10003 /* Alternation, | */
69 #define CAT 0x10004 /* Concatentation, implicit operator */
70 #define STAR 0x10005 /* Closure, * */
71 #define PLUS 0x10006 /* a+ == aa* */
72 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
73 #define ANY 0x20000 /* Any character but newline, . */
74 #define NOP 0x20001 /* No operation, internal use only */
75 #define BOL 0x20002 /* Beginning of line, ^ */
76 #define EOL 0x20003 /* End of line, $ */
77 #define CCLASS 0x20004 /* Character class, [] */
78 #define NCCLASS 0x20005 /* Negated character class, [^] */
79 #define END 0x20077 /* Terminate: match found */
81 #define ISATOR 0x10000
82 #define ISAND 0x20000
84 /*
85 * Parser Information
86 */
87 typedef struct Node Node;
88 struct Node
89 {
90 Inst *first;
91 Inst *last;
92 };
94 #define NSTACK 20
95 Node andstack[NSTACK];
96 Node *andp;
97 int atorstack[NSTACK];
98 int *atorp;
99 int lastwasand; /* Last token was operand */
100 int cursubid;
101 int subidstack[NSTACK];
102 int *subidp;
103 int backwards;
104 int nbra;
105 Rune *exprp; /* pointer to next character in source expression */
106 #define DCLASS 10 /* allocation increment */
107 int nclass; /* number active */
108 int Nclass; /* high water mark */
109 Rune **class;
110 int negateclass;
112 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
113 void newmatch(Rangeset*);
114 void bnewmatch(Rangeset*);
115 void pushand(Inst*, Inst*);
116 void pushator(int);
117 Node *popand(int);
118 int popator(void);
119 void startlex(Rune*);
120 int lex(void);
121 void operator(int);
122 void operand(int);
123 void evaluntil(int);
124 void optimize(Inst*);
125 void bldcclass(void);
127 void
128 rxinit(void)
130 rechan = chancreate(sizeof(Inst*), 0);
131 chansetname(rechan, "rechan");
132 lastregexp = runemalloc(1);
135 void
136 regerror(char *e)
138 lastregexp[0] = 0;
139 warning(nil, "regexp: %s\n", e);
140 sendp(rechan, nil);
141 threadexits(nil);
144 Inst *
145 newinst(int t)
147 if(progp >= &program[NPROG])
148 regerror("expression too long");
149 progp->type = t;
150 progp->u1.left = nil;
151 progp->u.right = nil;
152 return progp++;
155 void
156 realcompile(void *arg)
158 int token;
159 Rune *s;
161 threadsetname("regcomp");
162 s = arg;
163 startlex(s);
164 atorp = atorstack;
165 andp = andstack;
166 subidp = subidstack;
167 cursubid = 0;
168 lastwasand = FALSE;
169 /* Start with a low priority operator to prime parser */
170 pushator(START-1);
171 while((token=lex()) != END){
172 if((token&ISATOR) == OPERATOR)
173 operator(token);
174 else
175 operand(token);
177 /* Close with a low priority operator */
178 evaluntil(START);
179 /* Force END */
180 operand(END);
181 evaluntil(START);
182 if(nbra)
183 regerror("unmatched `('");
184 --andp; /* points to first and only operand */
185 sendp(rechan, andp->first);
186 threadexits(nil);
189 /* r is null terminated */
190 int
191 rxcompile(Rune *r)
193 int i, nr;
194 Inst *oprogp;
196 nr = runestrlen(r)+1;
197 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
198 return TRUE;
199 lastregexp[0] = 0;
200 for(i=0; i<nclass; i++)
201 free(class[i]);
202 nclass = 0;
203 progp = program;
204 backwards = FALSE;
205 bstartinst = nil;
206 threadcreate(realcompile, r, STACK);
207 startinst = recvp(rechan);
208 if(startinst == nil)
209 return FALSE;
210 optimize(program);
211 oprogp = progp;
212 backwards = TRUE;
213 threadcreate(realcompile, r, STACK);
214 bstartinst = recvp(rechan);
215 if(bstartinst == nil)
216 return FALSE;
217 optimize(oprogp);
218 lastregexp = runerealloc(lastregexp, nr);
219 runemove(lastregexp, r, nr);
220 return TRUE;
223 void
224 operand(int t)
226 Inst *i;
227 if(lastwasand)
228 operator(CAT); /* catenate is implicit */
229 i = newinst(t);
230 if(t == CCLASS){
231 if(negateclass)
232 i->type = NCCLASS; /* UGH */
233 i->u.class = nclass-1; /* UGH */
235 pushand(i, i);
236 lastwasand = TRUE;
239 void
240 operator(int t)
242 if(t==RBRA && --nbra<0)
243 regerror("unmatched `)'");
244 if(t==LBRA){
245 cursubid++; /* silently ignored */
246 nbra++;
247 if(lastwasand)
248 operator(CAT);
249 }else
250 evaluntil(t);
251 if(t!=RBRA)
252 pushator(t);
253 lastwasand = FALSE;
254 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
255 lastwasand = TRUE; /* these look like operands */
258 void
259 pushand(Inst *f, Inst *l)
261 if(andp >= &andstack[NSTACK])
262 error("operand stack overflow");
263 andp->first = f;
264 andp->last = l;
265 andp++;
268 void
269 pushator(int t)
271 if(atorp >= &atorstack[NSTACK])
272 error("operator stack overflow");
273 *atorp++=t;
274 if(cursubid >= NRange)
275 *subidp++= -1;
276 else
277 *subidp++=cursubid;
280 Node *
281 popand(int op)
283 char buf[64];
285 if(andp <= &andstack[0])
286 if(op){
287 sprint(buf, "missing operand for %c", op);
288 regerror(buf);
289 }else
290 regerror("malformed regexp");
291 return --andp;
294 int
295 popator()
297 if(atorp <= &atorstack[0])
298 error("operator stack underflow");
299 --subidp;
300 return *--atorp;
303 void
304 evaluntil(int pri)
306 Node *op1, *op2, *t;
307 Inst *inst1, *inst2;
309 while(pri==RBRA || atorp[-1]>=pri){
310 switch(popator()){
311 case LBRA:
312 op1 = popand('(');
313 inst2 = newinst(RBRA);
314 inst2->u.subid = *subidp;
315 op1->last->u1.next = inst2;
316 inst1 = newinst(LBRA);
317 inst1->u.subid = *subidp;
318 inst1->u1.next = op1->first;
319 pushand(inst1, inst2);
320 return; /* must have been RBRA */
321 default:
322 error("unknown regexp operator");
323 break;
324 case OR:
325 op2 = popand('|');
326 op1 = popand('|');
327 inst2 = newinst(NOP);
328 op2->last->u1.next = inst2;
329 op1->last->u1.next = inst2;
330 inst1 = newinst(OR);
331 inst1->u.right = op1->first;
332 inst1->u1.left = op2->first;
333 pushand(inst1, inst2);
334 break;
335 case CAT:
336 op2 = popand(0);
337 op1 = popand(0);
338 if(backwards && op2->first->type!=END){
339 t = op1;
340 op1 = op2;
341 op2 = t;
343 op1->last->u1.next = op2->first;
344 pushand(op1->first, op2->last);
345 break;
346 case STAR:
347 op2 = popand('*');
348 inst1 = newinst(OR);
349 op2->last->u1.next = inst1;
350 inst1->u.right = op2->first;
351 pushand(inst1, inst1);
352 break;
353 case PLUS:
354 op2 = popand('+');
355 inst1 = newinst(OR);
356 op2->last->u1.next = inst1;
357 inst1->u.right = op2->first;
358 pushand(op2->first, inst1);
359 break;
360 case QUEST:
361 op2 = popand('?');
362 inst1 = newinst(OR);
363 inst2 = newinst(NOP);
364 inst1->u1.left = inst2;
365 inst1->u.right = op2->first;
366 op2->last->u1.next = inst2;
367 pushand(inst1, inst2);
368 break;
374 void
375 optimize(Inst *start)
377 Inst *inst, *target;
379 for(inst=start; inst->type!=END; inst++){
380 target = inst->u1.next;
381 while(target->type == NOP)
382 target = target->u1.next;
383 inst->u1.next = target;
387 void
388 startlex(Rune *s)
390 exprp = s;
391 nbra = 0;
395 int
396 lex(void){
397 int c;
399 c = *exprp++;
400 switch(c){
401 case '\\':
402 if(*exprp)
403 if((c= *exprp++)=='n')
404 c='\n';
405 break;
406 case 0:
407 c = END;
408 --exprp; /* In case we come here again */
409 break;
410 case '*':
411 c = STAR;
412 break;
413 case '?':
414 c = QUEST;
415 break;
416 case '+':
417 c = PLUS;
418 break;
419 case '|':
420 c = OR;
421 break;
422 case '.':
423 c = ANY;
424 break;
425 case '(':
426 c = LBRA;
427 break;
428 case ')':
429 c = RBRA;
430 break;
431 case '^':
432 c = BOL;
433 break;
434 case '$':
435 c = EOL;
436 break;
437 case '[':
438 c = CCLASS;
439 bldcclass();
440 break;
442 return c;
445 int
446 nextrec(void)
448 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
449 regerror("malformed `[]'");
450 if(exprp[0] == '\\'){
451 exprp++;
452 if(*exprp=='n'){
453 exprp++;
454 return '\n';
456 return *exprp++|0x10000;
458 return *exprp++;
461 void
462 bldcclass(void)
464 int c1, c2, n, na;
465 Rune *classp;
467 classp = runemalloc(DCLASS);
468 n = 0;
469 na = DCLASS;
470 /* we have already seen the '[' */
471 if(*exprp == '^'){
472 classp[n++] = '\n'; /* don't match newline in negate case */
473 negateclass = TRUE;
474 exprp++;
475 }else
476 negateclass = FALSE;
477 while((c1 = nextrec()) != ']'){
478 if(c1 == '-'){
479 Error:
480 free(classp);
481 regerror("malformed `[]'");
483 if(n+4 >= na){ /* 3 runes plus NUL */
484 na += DCLASS;
485 classp = runerealloc(classp, na);
487 if(*exprp == '-'){
488 exprp++; /* eat '-' */
489 if((c2 = nextrec()) == ']')
490 goto Error;
491 classp[n+0] = 0xFFFF;
492 classp[n+1] = c1;
493 classp[n+2] = c2;
494 n += 3;
495 }else
496 classp[n++] = c1;
498 classp[n] = 0;
499 if(nclass == Nclass){
500 Nclass += DCLASS;
501 class = realloc(class, Nclass*sizeof(Rune*));
503 class[nclass++] = classp;
506 int
507 classmatch(int classno, int c, int negate)
509 Rune *p;
511 p = class[classno];
512 while(*p){
513 if(*p == 0xFFFF){
514 if(p[1]<=c && c<=p[2])
515 return !negate;
516 p += 3;
517 }else if(*p++ == c)
518 return !negate;
520 return negate;
523 /*
524 * Note optimization in addinst:
525 * *l must be pending when addinst called; if *l has been looked
526 * at already, the optimization is a bug.
527 */
528 int
529 addinst(Ilist *l, Inst *inst, Rangeset *sep)
531 Ilist *p;
533 for(p = l; p->inst; p++){
534 if(p->inst==inst){
535 if((sep)->r[0].q0 < p->se.r[0].q0)
536 p->se= *sep; /* this would be bug */
537 return 0; /* It's already there */
540 p->inst = inst;
541 p->se= *sep;
542 (p+1)->inst = nil;
543 return 1;
546 int
547 rxnull(void)
549 return startinst==nil || bstartinst==nil;
552 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
553 int
554 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
556 int flag;
557 Inst *inst;
558 Ilist *tlp;
559 uint p;
560 int nnl, ntl;
561 int nc, c;
562 int wrapped;
563 int startchar;
565 flag = 0;
566 p = startp;
567 startchar = 0;
568 wrapped = 0;
569 nnl = 0;
570 if(startinst->type<OPERATOR)
571 startchar = startinst->type;
572 list[0][0].inst = list[1][0].inst = nil;
573 sel.r[0].q0 = -1;
574 if(t != nil)
575 nc = t->file->b.nc;
576 else
577 nc = runestrlen(r);
578 /* Execute machine once for each character */
579 for(;;p++){
580 doloop:
581 if(p>=eof || p>=nc){
582 switch(wrapped++){
583 case 0: /* let loop run one more click */
584 case 2:
585 break;
586 case 1: /* expired; wrap to beginning */
587 if(sel.r[0].q0>=0 || eof!=Infinity)
588 goto Return;
589 list[0][0].inst = list[1][0].inst = nil;
590 p = 0;
591 goto doloop;
592 default:
593 goto Return;
595 c = 0;
596 }else{
597 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
598 break;
599 if(t != nil)
600 c = textreadc(t, p);
601 else
602 c = r[p];
604 /* fast check for first char */
605 if(startchar && nnl==0 && c!=startchar)
606 continue;
607 tl = list[flag];
608 nl = list[flag^=1];
609 nl->inst = nil;
610 ntl = nnl;
611 nnl = 0;
612 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
613 /* Add first instruction to this list */
614 sempty.r[0].q0 = p;
615 if(addinst(tl, startinst, &sempty))
616 if(++ntl >= NLIST){
617 Overflow:
618 warning(nil, "regexp list overflow\n");
619 sel.r[0].q0 = -1;
620 goto Return;
623 /* Execute machine until this list is empty */
624 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
625 Switchstmt:
626 switch(inst->type){
627 default: /* regular character */
628 if(inst->type==c){
629 Addinst:
630 if(addinst(nl, inst->u1.next, &tlp->se))
631 if(++nnl >= NLIST)
632 goto Overflow;
634 break;
635 case LBRA:
636 if(inst->u.subid>=0)
637 tlp->se.r[inst->u.subid].q0 = p;
638 inst = inst->u1.next;
639 goto Switchstmt;
640 case RBRA:
641 if(inst->u.subid>=0)
642 tlp->se.r[inst->u.subid].q1 = p;
643 inst = inst->u1.next;
644 goto Switchstmt;
645 case ANY:
646 if(c!='\n')
647 goto Addinst;
648 break;
649 case BOL:
650 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
651 Step:
652 inst = inst->u1.next;
653 goto Switchstmt;
655 break;
656 case EOL:
657 if(c == '\n')
658 goto Step;
659 break;
660 case CCLASS:
661 if(c>=0 && classmatch(inst->u.class, c, 0))
662 goto Addinst;
663 break;
664 case NCCLASS:
665 if(c>=0 && classmatch(inst->u.class, c, 1))
666 goto Addinst;
667 break;
668 case OR:
669 /* evaluate right choice later */
670 if(addinst(tl, inst->u.right, &tlp->se))
671 if(++ntl >= NLIST)
672 goto Overflow;
673 /* efficiency: advance and re-evaluate */
674 inst = inst->u1.left;
675 goto Switchstmt;
676 case END: /* Match! */
677 tlp->se.r[0].q1 = p;
678 newmatch(&tlp->se);
679 break;
683 Return:
684 *rp = sel;
685 return sel.r[0].q0 >= 0;
688 void
689 newmatch(Rangeset *sp)
691 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
692 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
693 sel = *sp;
696 int
697 rxbexecute(Text *t, uint startp, Rangeset *rp)
699 int flag;
700 Inst *inst;
701 Ilist *tlp;
702 int p;
703 int nnl, ntl;
704 int c;
705 int wrapped;
706 int startchar;
708 flag = 0;
709 nnl = 0;
710 wrapped = 0;
711 p = startp;
712 startchar = 0;
713 if(bstartinst->type<OPERATOR)
714 startchar = bstartinst->type;
715 list[0][0].inst = list[1][0].inst = nil;
716 sel.r[0].q0= -1;
717 /* Execute machine once for each character, including terminal NUL */
718 for(;;--p){
719 doloop:
720 if(p <= 0){
721 switch(wrapped++){
722 case 0: /* let loop run one more click */
723 case 2:
724 break;
725 case 1: /* expired; wrap to end */
726 if(sel.r[0].q0>=0)
727 goto Return;
728 list[0][0].inst = list[1][0].inst = nil;
729 p = t->file->b.nc;
730 goto doloop;
731 case 3:
732 default:
733 goto Return;
735 c = 0;
736 }else{
737 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
738 break;
739 c = textreadc(t, p-1);
741 /* fast check for first char */
742 if(startchar && nnl==0 && c!=startchar)
743 continue;
744 tl = list[flag];
745 nl = list[flag^=1];
746 nl->inst = nil;
747 ntl = nnl;
748 nnl = 0;
749 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
750 /* Add first instruction to this list */
751 /* the minus is so the optimizations in addinst work */
752 sempty.r[0].q0 = -p;
753 if(addinst(tl, bstartinst, &sempty))
754 if(++ntl >= NLIST){
755 Overflow:
756 warning(nil, "regexp list overflow\n");
757 sel.r[0].q0 = -1;
758 goto Return;
761 /* Execute machine until this list is empty */
762 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
763 Switchstmt:
764 switch(inst->type){
765 default: /* regular character */
766 if(inst->type == c){
767 Addinst:
768 if(addinst(nl, inst->u1.next, &tlp->se))
769 if(++nnl >= NLIST)
770 goto Overflow;
772 break;
773 case LBRA:
774 if(inst->u.subid>=0)
775 tlp->se.r[inst->u.subid].q0 = p;
776 inst = inst->u1.next;
777 goto Switchstmt;
778 case RBRA:
779 if(inst->u.subid >= 0)
780 tlp->se.r[inst->u.subid].q1 = p;
781 inst = inst->u1.next;
782 goto Switchstmt;
783 case ANY:
784 if(c != '\n')
785 goto Addinst;
786 break;
787 case BOL:
788 if(c=='\n' || p==0){
789 Step:
790 inst = inst->u1.next;
791 goto Switchstmt;
793 break;
794 case EOL:
795 if(p<t->file->b.nc && textreadc(t, p)=='\n')
796 goto Step;
797 break;
798 case CCLASS:
799 if(c>0 && classmatch(inst->u.class, c, 0))
800 goto Addinst;
801 break;
802 case NCCLASS:
803 if(c>0 && classmatch(inst->u.class, c, 1))
804 goto Addinst;
805 break;
806 case OR:
807 /* evaluate right choice later */
808 if(addinst(tl, inst->u.right, &tlp->se))
809 if(++ntl >= NLIST)
810 goto Overflow;
811 /* efficiency: advance and re-evaluate */
812 inst = inst->u1.left;
813 goto Switchstmt;
814 case END: /* Match! */
815 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
816 tlp->se.r[0].q1 = p;
817 bnewmatch(&tlp->se);
818 break;
822 Return:
823 *rp = sel;
824 return sel.r[0].q0 >= 0;
827 void
828 bnewmatch(Rangeset *sp)
830 int i;
832 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
833 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
834 sel.r[i].q0 = sp->r[i].q1;
835 sel.r[i].q1 = sp->r[i].q0;