Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include <libsec.h>
12 #include "dat.h"
13 #include "fns.h"
15 Rangeset sel;
16 Rune *lastregexp;
18 #undef class
19 #define class regxclass /* some systems declare "class" in system headers */
21 /*
22 * Machine Information
23 */
24 typedef struct Inst Inst;
25 struct Inst
26 {
27 uint type; /* < OPERATOR ==> literal, otherwise action */
28 union {
29 int sid;
30 int subid;
31 int class;
32 Inst *other;
33 Inst *right;
34 } u;
35 union{
36 Inst *left;
37 Inst *next;
38 } u1;
39 };
41 #define NPROG 1024
42 Inst program[NPROG];
43 Inst *progp;
44 Inst *startinst; /* First inst. of program; might not be program[0] */
45 Inst *bstartinst; /* same for backwards machine */
46 Channel *rechan; /* chan(Inst*) */
48 typedef struct Ilist Ilist;
49 struct Ilist
50 {
51 Inst *inst; /* Instruction of the thread */
52 Rangeset se;
53 uint startp; /* first char of match */
54 };
56 #define NLIST 127
58 Ilist *tl, *nl; /* This list, next list */
59 Ilist list[2][NLIST+1]; /* +1 for trailing null */
60 static Rangeset sempty;
62 /*
63 * Actions and Tokens
64 *
65 * 0x10000xx are operators, value == precedence
66 * 0x20000xx are tokens, i.e. operands for operators
67 */
68 #define OPERATOR 0x1000000 /* Bit set in all operators */
69 #define START (OPERATOR+0) /* Start, used for marker on stack */
70 #define RBRA (OPERATOR+1) /* Right bracket, ) */
71 #define LBRA (OPERATOR+2) /* Left bracket, ( */
72 #define OR (OPERATOR+3) /* Alternation, | */
73 #define CAT (OPERATOR+4) /* Concatentation, implicit operator */
74 #define STAR (OPERATOR+5) /* Closure, * */
75 #define PLUS (OPERATOR+6) /* a+ == aa* */
76 #define QUEST (OPERATOR+7) /* a? == a|nothing, i.e. 0 or 1 a's */
77 #define ANY 0x2000000 /* Any character but newline, . */
78 #define NOP (ANY+1) /* No operation, internal use only */
79 #define BOL (ANY+2) /* Beginning of line, ^ */
80 #define EOL (ANY+3) /* End of line, $ */
81 #define CCLASS (ANY+4) /* Character class, [] */
82 #define NCCLASS (ANY+5) /* Negated character class, [^] */
83 #define END (ANY+0x77) /* Terminate: match found */
85 #define ISATOR OPERATOR
86 #define ISAND ANY
88 #define QUOTED 0x4000000 /* Bit set for \-ed lex characters */
90 /*
91 * Parser Information
92 */
93 typedef struct Node Node;
94 struct Node
95 {
96 Inst *first;
97 Inst *last;
98 };
100 #define NSTACK 20
101 Node andstack[NSTACK];
102 Node *andp;
103 int atorstack[NSTACK];
104 int *atorp;
105 int lastwasand; /* Last token was operand */
106 int cursubid;
107 int subidstack[NSTACK];
108 int *subidp;
109 int backwards;
110 int nbra;
111 Rune *exprp; /* pointer to next character in source expression */
112 #define DCLASS 10 /* allocation increment */
113 int nclass; /* number active */
114 int Nclass; /* high water mark */
115 Rune **class;
116 int negateclass;
118 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
119 void newmatch(Rangeset*);
120 void bnewmatch(Rangeset*);
121 void pushand(Inst*, Inst*);
122 void pushator(int);
123 Node *popand(int);
124 int popator(void);
125 void startlex(Rune*);
126 int lex(void);
127 void operator(int);
128 void operand(int);
129 void evaluntil(int);
130 void optimize(Inst*);
131 void bldcclass(void);
133 void
134 rxinit(void)
136 rechan = chancreate(sizeof(Inst*), 0);
137 chansetname(rechan, "rechan");
138 lastregexp = runemalloc(1);
141 void
142 regerror(char *e)
144 lastregexp[0] = 0;
145 warning(nil, "regexp: %s\n", e);
146 sendp(rechan, nil);
147 threadexits(nil);
150 Inst *
151 newinst(int t)
153 if(progp >= &program[NPROG])
154 regerror("expression too long");
155 progp->type = t;
156 progp->u1.left = nil;
157 progp->u.right = nil;
158 return progp++;
161 void
162 realcompile(void *arg)
164 int token;
165 Rune *s;
167 threadsetname("regcomp");
168 s = arg;
169 startlex(s);
170 atorp = atorstack;
171 andp = andstack;
172 subidp = subidstack;
173 cursubid = 0;
174 lastwasand = FALSE;
175 /* Start with a low priority operator to prime parser */
176 pushator(START-1);
177 while((token=lex()) != END){
178 if((token&ISATOR) == OPERATOR)
179 operator(token);
180 else
181 operand(token);
183 /* Close with a low priority operator */
184 evaluntil(START);
185 /* Force END */
186 operand(END);
187 evaluntil(START);
188 if(nbra)
189 regerror("unmatched `('");
190 --andp; /* points to first and only operand */
191 sendp(rechan, andp->first);
192 threadexits(nil);
195 /* r is null terminated */
196 int
197 rxcompile(Rune *r)
199 int i, nr;
200 Inst *oprogp;
202 nr = runestrlen(r)+1;
203 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
204 return TRUE;
205 lastregexp[0] = 0;
206 for(i=0; i<nclass; i++)
207 free(class[i]);
208 nclass = 0;
209 progp = program;
210 backwards = FALSE;
211 bstartinst = nil;
212 threadcreate(realcompile, r, STACK);
213 startinst = recvp(rechan);
214 if(startinst == nil)
215 return FALSE;
216 optimize(program);
217 oprogp = progp;
218 backwards = TRUE;
219 threadcreate(realcompile, r, STACK);
220 bstartinst = recvp(rechan);
221 if(bstartinst == nil)
222 return FALSE;
223 optimize(oprogp);
224 lastregexp = runerealloc(lastregexp, nr);
225 runemove(lastregexp, r, nr);
226 return TRUE;
229 void
230 operand(int t)
232 Inst *i;
233 if(lastwasand)
234 operator(CAT); /* catenate is implicit */
235 i = newinst(t);
236 if(t == CCLASS){
237 if(negateclass)
238 i->type = NCCLASS; /* UGH */
239 i->u.class = nclass-1; /* UGH */
241 pushand(i, i);
242 lastwasand = TRUE;
245 void
246 operator(int t)
248 if(t==RBRA && --nbra<0)
249 regerror("unmatched `)'");
250 if(t==LBRA){
251 cursubid++; /* silently ignored */
252 nbra++;
253 if(lastwasand)
254 operator(CAT);
255 }else
256 evaluntil(t);
257 if(t!=RBRA)
258 pushator(t);
259 lastwasand = FALSE;
260 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
261 lastwasand = TRUE; /* these look like operands */
264 void
265 pushand(Inst *f, Inst *l)
267 if(andp >= &andstack[NSTACK])
268 error("operand stack overflow");
269 andp->first = f;
270 andp->last = l;
271 andp++;
274 void
275 pushator(int t)
277 if(atorp >= &atorstack[NSTACK])
278 error("operator stack overflow");
279 *atorp++=t;
280 if(cursubid >= NRange)
281 *subidp++= -1;
282 else
283 *subidp++=cursubid;
286 Node *
287 popand(int op)
289 char buf[64];
291 if(andp <= &andstack[0])
292 if(op){
293 sprint(buf, "missing operand for %c", op);
294 regerror(buf);
295 }else
296 regerror("malformed regexp");
297 return --andp;
300 int
301 popator()
303 if(atorp <= &atorstack[0])
304 error("operator stack underflow");
305 --subidp;
306 return *--atorp;
309 void
310 evaluntil(int pri)
312 Node *op1, *op2, *t;
313 Inst *inst1, *inst2;
315 while(pri==RBRA || atorp[-1]>=pri){
316 switch(popator()){
317 case LBRA:
318 op1 = popand('(');
319 inst2 = newinst(RBRA);
320 inst2->u.subid = *subidp;
321 op1->last->u1.next = inst2;
322 inst1 = newinst(LBRA);
323 inst1->u.subid = *subidp;
324 inst1->u1.next = op1->first;
325 pushand(inst1, inst2);
326 return; /* must have been RBRA */
327 default:
328 error("unknown regexp operator");
329 break;
330 case OR:
331 op2 = popand('|');
332 op1 = popand('|');
333 inst2 = newinst(NOP);
334 op2->last->u1.next = inst2;
335 op1->last->u1.next = inst2;
336 inst1 = newinst(OR);
337 inst1->u.right = op1->first;
338 inst1->u1.left = op2->first;
339 pushand(inst1, inst2);
340 break;
341 case CAT:
342 op2 = popand(0);
343 op1 = popand(0);
344 if(backwards && op2->first->type!=END){
345 t = op1;
346 op1 = op2;
347 op2 = t;
349 op1->last->u1.next = op2->first;
350 pushand(op1->first, op2->last);
351 break;
352 case STAR:
353 op2 = popand('*');
354 inst1 = newinst(OR);
355 op2->last->u1.next = inst1;
356 inst1->u.right = op2->first;
357 pushand(inst1, inst1);
358 break;
359 case PLUS:
360 op2 = popand('+');
361 inst1 = newinst(OR);
362 op2->last->u1.next = inst1;
363 inst1->u.right = op2->first;
364 pushand(op2->first, inst1);
365 break;
366 case QUEST:
367 op2 = popand('?');
368 inst1 = newinst(OR);
369 inst2 = newinst(NOP);
370 inst1->u1.left = inst2;
371 inst1->u.right = op2->first;
372 op2->last->u1.next = inst2;
373 pushand(inst1, inst2);
374 break;
380 void
381 optimize(Inst *start)
383 Inst *inst, *target;
385 for(inst=start; inst->type!=END; inst++){
386 target = inst->u1.next;
387 while(target->type == NOP)
388 target = target->u1.next;
389 inst->u1.next = target;
393 void
394 startlex(Rune *s)
396 exprp = s;
397 nbra = 0;
401 int
402 lex(void){
403 int c;
405 c = *exprp++;
406 switch(c){
407 case '\\':
408 if(*exprp)
409 if((c= *exprp++)=='n')
410 c='\n';
411 break;
412 case 0:
413 c = END;
414 --exprp; /* In case we come here again */
415 break;
416 case '*':
417 c = STAR;
418 break;
419 case '?':
420 c = QUEST;
421 break;
422 case '+':
423 c = PLUS;
424 break;
425 case '|':
426 c = OR;
427 break;
428 case '.':
429 c = ANY;
430 break;
431 case '(':
432 c = LBRA;
433 break;
434 case ')':
435 c = RBRA;
436 break;
437 case '^':
438 c = BOL;
439 break;
440 case '$':
441 c = EOL;
442 break;
443 case '[':
444 c = CCLASS;
445 bldcclass();
446 break;
448 return c;
451 int
452 nextrec(void)
454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 regerror("malformed `[]'");
456 if(exprp[0] == '\\'){
457 exprp++;
458 if(*exprp=='n'){
459 exprp++;
460 return '\n';
462 return *exprp++|QUOTED;
464 return *exprp++;
467 void
468 bldcclass(void)
470 int c1, c2, n, na;
471 Rune *classp;
473 classp = runemalloc(DCLASS);
474 n = 0;
475 na = DCLASS;
476 /* we have already seen the '[' */
477 if(*exprp == '^'){
478 classp[n++] = '\n'; /* don't match newline in negate case */
479 negateclass = TRUE;
480 exprp++;
481 }else
482 negateclass = FALSE;
483 while((c1 = nextrec()) != ']'){
484 if(c1 == '-'){
485 Error:
486 free(classp);
487 regerror("malformed `[]'");
489 if(n+4 >= na){ /* 3 runes plus NUL */
490 na += DCLASS;
491 classp = runerealloc(classp, na);
493 if(*exprp == '-'){
494 exprp++; /* eat '-' */
495 if((c2 = nextrec()) == ']')
496 goto Error;
497 classp[n+0] = Runemax;
498 classp[n+1] = c1;
499 classp[n+2] = c2;
500 n += 3;
501 }else
502 classp[n++] = c1 & ~QUOTED;
504 classp[n] = 0;
505 if(nclass == Nclass){
506 Nclass += DCLASS;
507 class = realloc(class, Nclass*sizeof(Rune*));
509 class[nclass++] = classp;
512 int
513 classmatch(int classno, int c, int negate)
515 Rune *p;
517 p = class[classno];
518 while(*p){
519 if(*p == Runemax){
520 if(p[1]<=c && c<=p[2])
521 return !negate;
522 p += 3;
523 }else if(*p++ == c)
524 return !negate;
526 return negate;
529 /*
530 * Note optimization in addinst:
531 * *l must be pending when addinst called; if *l has been looked
532 * at already, the optimization is a bug.
533 */
534 int
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
537 Ilist *p;
539 for(p = l; p->inst; p++){
540 if(p->inst==inst){
541 if((sep)->r[0].q0 < p->se.r[0].q0)
542 p->se= *sep; /* this would be bug */
543 return 0; /* It's already there */
546 p->inst = inst;
547 p->se= *sep;
548 (p+1)->inst = nil;
549 return 1;
552 int
553 rxnull(void)
555 return startinst==nil || bstartinst==nil;
558 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
559 int
560 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
562 int flag;
563 Inst *inst;
564 Ilist *tlp;
565 uint p;
566 int nnl, ntl;
567 int nc, c;
568 int wrapped;
569 int startchar;
571 flag = 0;
572 p = startp;
573 startchar = 0;
574 wrapped = 0;
575 nnl = 0;
576 if(startinst->type<OPERATOR)
577 startchar = startinst->type;
578 list[0][0].inst = list[1][0].inst = nil;
579 sel.r[0].q0 = -1;
580 if(t != nil)
581 nc = t->file->b.nc;
582 else
583 nc = runestrlen(r);
584 /* Execute machine once for each character */
585 for(;;p++){
586 doloop:
587 if(p>=eof || p>=nc){
588 switch(wrapped++){
589 case 0: /* let loop run one more click */
590 case 2:
591 break;
592 case 1: /* expired; wrap to beginning */
593 if(sel.r[0].q0>=0 || eof!=Infinity)
594 goto Return;
595 list[0][0].inst = list[1][0].inst = nil;
596 p = 0;
597 goto doloop;
598 default:
599 goto Return;
601 c = 0;
602 }else{
603 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
604 break;
605 if(t != nil)
606 c = textreadc(t, p);
607 else
608 c = r[p];
610 /* fast check for first char */
611 if(startchar && nnl==0 && c!=startchar)
612 continue;
613 tl = list[flag];
614 nl = list[flag^=1];
615 nl->inst = nil;
616 ntl = nnl;
617 nnl = 0;
618 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
619 /* Add first instruction to this list */
620 sempty.r[0].q0 = p;
621 if(addinst(tl, startinst, &sempty))
622 if(++ntl >= NLIST){
623 Overflow:
624 warning(nil, "regexp list overflow\n");
625 sel.r[0].q0 = -1;
626 goto Return;
629 /* Execute machine until this list is empty */
630 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
631 Switchstmt:
632 switch(inst->type){
633 default: /* regular character */
634 if(inst->type==c){
635 Addinst:
636 if(addinst(nl, inst->u1.next, &tlp->se))
637 if(++nnl >= NLIST)
638 goto Overflow;
640 break;
641 case LBRA:
642 if(inst->u.subid>=0)
643 tlp->se.r[inst->u.subid].q0 = p;
644 inst = inst->u1.next;
645 goto Switchstmt;
646 case RBRA:
647 if(inst->u.subid>=0)
648 tlp->se.r[inst->u.subid].q1 = p;
649 inst = inst->u1.next;
650 goto Switchstmt;
651 case ANY:
652 if(c!='\n')
653 goto Addinst;
654 break;
655 case BOL:
656 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
657 Step:
658 inst = inst->u1.next;
659 goto Switchstmt;
661 break;
662 case EOL:
663 if(c == '\n')
664 goto Step;
665 break;
666 case CCLASS:
667 if(c>=0 && classmatch(inst->u.class, c, 0))
668 goto Addinst;
669 break;
670 case NCCLASS:
671 if(c>=0 && classmatch(inst->u.class, c, 1))
672 goto Addinst;
673 break;
674 case OR:
675 /* evaluate right choice later */
676 if(addinst(tlp, inst->u.right, &tlp->se))
677 if(++ntl >= NLIST)
678 goto Overflow;
679 /* efficiency: advance and re-evaluate */
680 inst = inst->u1.left;
681 goto Switchstmt;
682 case END: /* Match! */
683 tlp->se.r[0].q1 = p;
684 newmatch(&tlp->se);
685 break;
689 Return:
690 *rp = sel;
691 return sel.r[0].q0 >= 0;
694 void
695 newmatch(Rangeset *sp)
697 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
698 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
699 sel = *sp;
702 int
703 rxbexecute(Text *t, uint startp, Rangeset *rp)
705 int flag;
706 Inst *inst;
707 Ilist *tlp;
708 int p;
709 int nnl, ntl;
710 int c;
711 int wrapped;
712 int startchar;
714 flag = 0;
715 nnl = 0;
716 wrapped = 0;
717 p = startp;
718 startchar = 0;
719 if(bstartinst->type<OPERATOR)
720 startchar = bstartinst->type;
721 list[0][0].inst = list[1][0].inst = nil;
722 sel.r[0].q0= -1;
723 /* Execute machine once for each character, including terminal NUL */
724 for(;;--p){
725 doloop:
726 if(p <= 0){
727 switch(wrapped++){
728 case 0: /* let loop run one more click */
729 case 2:
730 break;
731 case 1: /* expired; wrap to end */
732 if(sel.r[0].q0>=0)
733 goto Return;
734 list[0][0].inst = list[1][0].inst = nil;
735 p = t->file->b.nc;
736 goto doloop;
737 case 3:
738 default:
739 goto Return;
741 c = 0;
742 }else{
743 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
744 break;
745 c = textreadc(t, p-1);
747 /* fast check for first char */
748 if(startchar && nnl==0 && c!=startchar)
749 continue;
750 tl = list[flag];
751 nl = list[flag^=1];
752 nl->inst = nil;
753 ntl = nnl;
754 nnl = 0;
755 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
756 /* Add first instruction to this list */
757 /* the minus is so the optimizations in addinst work */
758 sempty.r[0].q0 = -p;
759 if(addinst(tl, bstartinst, &sempty))
760 if(++ntl >= NLIST){
761 Overflow:
762 warning(nil, "regexp list overflow\n");
763 sel.r[0].q0 = -1;
764 goto Return;
767 /* Execute machine until this list is empty */
768 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
769 Switchstmt:
770 switch(inst->type){
771 default: /* regular character */
772 if(inst->type == c){
773 Addinst:
774 if(addinst(nl, inst->u1.next, &tlp->se))
775 if(++nnl >= NLIST)
776 goto Overflow;
778 break;
779 case LBRA:
780 if(inst->u.subid>=0)
781 tlp->se.r[inst->u.subid].q0 = p;
782 inst = inst->u1.next;
783 goto Switchstmt;
784 case RBRA:
785 if(inst->u.subid >= 0)
786 tlp->se.r[inst->u.subid].q1 = p;
787 inst = inst->u1.next;
788 goto Switchstmt;
789 case ANY:
790 if(c != '\n')
791 goto Addinst;
792 break;
793 case BOL:
794 if(c=='\n' || p==0){
795 Step:
796 inst = inst->u1.next;
797 goto Switchstmt;
799 break;
800 case EOL:
801 if(p<t->file->b.nc && textreadc(t, p)=='\n')
802 goto Step;
803 break;
804 case CCLASS:
805 if(c>0 && classmatch(inst->u.class, c, 0))
806 goto Addinst;
807 break;
808 case NCCLASS:
809 if(c>0 && classmatch(inst->u.class, c, 1))
810 goto Addinst;
811 break;
812 case OR:
813 /* evaluate right choice later */
814 if(addinst(tl, inst->u.right, &tlp->se))
815 if(++ntl >= NLIST)
816 goto Overflow;
817 /* efficiency: advance and re-evaluate */
818 inst = inst->u1.left;
819 goto Switchstmt;
820 case END: /* Match! */
821 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
822 tlp->se.r[0].q1 = p;
823 bnewmatch(&tlp->se);
824 break;
828 Return:
829 *rp = sel;
830 return sel.r[0].q0 >= 0;
833 void
834 bnewmatch(Rangeset *sp)
836 int i;
838 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
839 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
840 sel.r[i].q0 = sp->r[i].q1;
841 sel.r[i].q1 = sp->r[i].q0;