Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include "dat.h"
12 #include "fns.h"
14 Rangeset sel;
15 Rune *lastregexp;
17 /*
18 * Machine Information
19 */
20 typedef struct Inst Inst;
21 struct Inst
22 {
23 uint type; /* < 0x10000 ==> literal, otherwise action */
24 union {
25 int sid;
26 int subid;
27 int class;
28 Inst *other;
29 Inst *right;
30 } u;
31 union{
32 Inst *left;
33 Inst *next;
34 } u1;
35 };
37 #define NPROG 1024
38 Inst program[NPROG];
39 Inst *progp;
40 Inst *startinst; /* First inst. of program; might not be program[0] */
41 Inst *bstartinst; /* same for backwards machine */
42 Channel *rechan; /* chan(Inst*) */
44 typedef struct Ilist Ilist;
45 struct Ilist
46 {
47 Inst *inst; /* Instruction of the thread */
48 Rangeset se;
49 uint startp; /* first char of match */
50 };
52 #define NLIST 128
54 Ilist *tl, *nl; /* This list, next list */
55 Ilist list[2][NLIST];
56 static Rangeset sempty;
58 /*
59 * Actions and Tokens
60 *
61 * 0x100xx are operators, value == precedence
62 * 0x200xx are tokens, i.e. operands for operators
63 */
64 #define OPERATOR 0x10000 /* Bitmask of all operators */
65 #define START 0x10000 /* Start, used for marker on stack */
66 #define RBRA 0x10001 /* Right bracket, ) */
67 #define LBRA 0x10002 /* Left bracket, ( */
68 #define OR 0x10003 /* Alternation, | */
69 #define CAT 0x10004 /* Concatentation, implicit operator */
70 #define STAR 0x10005 /* Closure, * */
71 #define PLUS 0x10006 /* a+ == aa* */
72 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */
73 #define ANY 0x20000 /* Any character but newline, . */
74 #define NOP 0x20001 /* No operation, internal use only */
75 #define BOL 0x20002 /* Beginning of line, ^ */
76 #define EOL 0x20003 /* End of line, $ */
77 #define CCLASS 0x20004 /* Character class, [] */
78 #define NCCLASS 0x20005 /* Negated character class, [^] */
79 #define END 0x20077 /* Terminate: match found */
81 #define ISATOR 0x10000
82 #define ISAND 0x20000
84 /*
85 * Parser Information
86 */
87 typedef struct Node Node;
88 struct Node
89 {
90 Inst *first;
91 Inst *last;
92 };
94 #define NSTACK 20
95 Node andstack[NSTACK];
96 Node *andp;
97 int atorstack[NSTACK];
98 int *atorp;
99 int lastwasand; /* Last token was operand */
100 int cursubid;
101 int subidstack[NSTACK];
102 int *subidp;
103 int backwards;
104 int nbra;
105 Rune *exprp; /* pointer to next character in source expression */
106 #define DCLASS 10 /* allocation increment */
107 int nclass; /* number active */
108 int Nclass; /* high water mark */
109 Rune **class;
110 int negateclass;
112 void addinst(Ilist *l, Inst *inst, Rangeset *sep);
113 void newmatch(Rangeset*);
114 void bnewmatch(Rangeset*);
115 void pushand(Inst*, Inst*);
116 void pushator(int);
117 Node *popand(int);
118 int popator(void);
119 void startlex(Rune*);
120 int lex(void);
121 void operator(int);
122 void operand(int);
123 void evaluntil(int);
124 void optimize(Inst*);
125 void bldcclass(void);
127 void
128 rxinit(void)
130 rechan = chancreate(sizeof(Inst*), 0);
131 lastregexp = runemalloc(1);
134 void
135 regerror(char *e)
137 lastregexp[0] = 0;
138 warning(nil, "regexp: %s\n", e);
139 sendp(rechan, nil);
140 threadexits(nil);
143 Inst *
144 newinst(int t)
146 if(progp >= &program[NPROG])
147 regerror("expression too long");
148 progp->type = t;
149 progp->u1.left = nil;
150 progp->u.right = nil;
151 return progp++;
154 void
155 realcompile(void *arg)
157 int token;
158 Rune *s;
160 threadsetname("regcomp");
161 s = arg;
162 startlex(s);
163 atorp = atorstack;
164 andp = andstack;
165 subidp = subidstack;
166 cursubid = 0;
167 lastwasand = FALSE;
168 /* Start with a low priority operator to prime parser */
169 pushator(START-1);
170 while((token=lex()) != END){
171 if((token&ISATOR) == OPERATOR)
172 operator(token);
173 else
174 operand(token);
176 /* Close with a low priority operator */
177 evaluntil(START);
178 /* Force END */
179 operand(END);
180 evaluntil(START);
181 if(nbra)
182 regerror("unmatched `('");
183 --andp; /* points to first and only operand */
184 sendp(rechan, andp->first);
185 threadexits(nil);
188 /* r is null terminated */
189 int
190 rxcompile(Rune *r)
192 int i, nr;
193 Inst *oprogp;
195 nr = runestrlen(r)+1;
196 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
197 return TRUE;
198 lastregexp[0] = 0;
199 for(i=0; i<nclass; i++)
200 free(class[i]);
201 nclass = 0;
202 progp = program;
203 backwards = FALSE;
204 bstartinst = nil;
205 threadcreate(realcompile, r, STACK);
206 startinst = recvp(rechan);
207 if(startinst == nil)
208 return FALSE;
209 optimize(program);
210 oprogp = progp;
211 backwards = TRUE;
212 threadcreate(realcompile, r, STACK);
213 bstartinst = recvp(rechan);
214 if(bstartinst == nil)
215 return FALSE;
216 optimize(oprogp);
217 lastregexp = runerealloc(lastregexp, nr);
218 runemove(lastregexp, r, nr);
219 return TRUE;
222 void
223 operand(int t)
225 Inst *i;
226 if(lastwasand)
227 operator(CAT); /* catenate is implicit */
228 i = newinst(t);
229 if(t == CCLASS){
230 if(negateclass)
231 i->type = NCCLASS; /* UGH */
232 i->u.class = nclass-1; /* UGH */
234 pushand(i, i);
235 lastwasand = TRUE;
238 void
239 operator(int t)
241 if(t==RBRA && --nbra<0)
242 regerror("unmatched `)'");
243 if(t==LBRA){
244 cursubid++; /* silently ignored */
245 nbra++;
246 if(lastwasand)
247 operator(CAT);
248 }else
249 evaluntil(t);
250 if(t!=RBRA)
251 pushator(t);
252 lastwasand = FALSE;
253 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
254 lastwasand = TRUE; /* these look like operands */
257 void
258 pushand(Inst *f, Inst *l)
260 if(andp >= &andstack[NSTACK])
261 error("operand stack overflow");
262 andp->first = f;
263 andp->last = l;
264 andp++;
267 void
268 pushator(int t)
270 if(atorp >= &atorstack[NSTACK])
271 error("operator stack overflow");
272 *atorp++=t;
273 if(cursubid >= NRange)
274 *subidp++= -1;
275 else
276 *subidp++=cursubid;
279 Node *
280 popand(int op)
282 char buf[64];
284 if(andp <= &andstack[0])
285 if(op){
286 sprint(buf, "missing operand for %c", op);
287 regerror(buf);
288 }else
289 regerror("malformed regexp");
290 return --andp;
293 int
294 popator()
296 if(atorp <= &atorstack[0])
297 error("operator stack underflow");
298 --subidp;
299 return *--atorp;
302 void
303 evaluntil(int pri)
305 Node *op1, *op2, *t;
306 Inst *inst1, *inst2;
308 while(pri==RBRA || atorp[-1]>=pri){
309 switch(popator()){
310 case LBRA:
311 op1 = popand('(');
312 inst2 = newinst(RBRA);
313 inst2->u.subid = *subidp;
314 op1->last->u1.next = inst2;
315 inst1 = newinst(LBRA);
316 inst1->u.subid = *subidp;
317 inst1->u1.next = op1->first;
318 pushand(inst1, inst2);
319 return; /* must have been RBRA */
320 default:
321 error("unknown regexp operator");
322 break;
323 case OR:
324 op2 = popand('|');
325 op1 = popand('|');
326 inst2 = newinst(NOP);
327 op2->last->u1.next = inst2;
328 op1->last->u1.next = inst2;
329 inst1 = newinst(OR);
330 inst1->u.right = op1->first;
331 inst1->u1.left = op2->first;
332 pushand(inst1, inst2);
333 break;
334 case CAT:
335 op2 = popand(0);
336 op1 = popand(0);
337 if(backwards && op2->first->type!=END){
338 t = op1;
339 op1 = op2;
340 op2 = t;
342 op1->last->u1.next = op2->first;
343 pushand(op1->first, op2->last);
344 break;
345 case STAR:
346 op2 = popand('*');
347 inst1 = newinst(OR);
348 op2->last->u1.next = inst1;
349 inst1->u.right = op2->first;
350 pushand(inst1, inst1);
351 break;
352 case PLUS:
353 op2 = popand('+');
354 inst1 = newinst(OR);
355 op2->last->u1.next = inst1;
356 inst1->u.right = op2->first;
357 pushand(op2->first, inst1);
358 break;
359 case QUEST:
360 op2 = popand('?');
361 inst1 = newinst(OR);
362 inst2 = newinst(NOP);
363 inst1->u1.left = inst2;
364 inst1->u.right = op2->first;
365 op2->last->u1.next = inst2;
366 pushand(inst1, inst2);
367 break;
373 void
374 optimize(Inst *start)
376 Inst *inst, *target;
378 for(inst=start; inst->type!=END; inst++){
379 target = inst->u1.next;
380 while(target->type == NOP)
381 target = target->u1.next;
382 inst->u1.next = target;
386 void
387 startlex(Rune *s)
389 exprp = s;
390 nbra = 0;
394 int
395 lex(void){
396 int c;
398 c = *exprp++;
399 switch(c){
400 case '\\':
401 if(*exprp)
402 if((c= *exprp++)=='n')
403 c='\n';
404 break;
405 case 0:
406 c = END;
407 --exprp; /* In case we come here again */
408 break;
409 case '*':
410 c = STAR;
411 break;
412 case '?':
413 c = QUEST;
414 break;
415 case '+':
416 c = PLUS;
417 break;
418 case '|':
419 c = OR;
420 break;
421 case '.':
422 c = ANY;
423 break;
424 case '(':
425 c = LBRA;
426 break;
427 case ')':
428 c = RBRA;
429 break;
430 case '^':
431 c = BOL;
432 break;
433 case '$':
434 c = EOL;
435 break;
436 case '[':
437 c = CCLASS;
438 bldcclass();
439 break;
441 return c;
444 int
445 nextrec(void)
447 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
448 regerror("malformed `[]'");
449 if(exprp[0] == '\\'){
450 exprp++;
451 if(*exprp=='n'){
452 exprp++;
453 return '\n';
455 return *exprp++|0x10000;
457 return *exprp++;
460 void
461 bldcclass(void)
463 int c1, c2, n, na;
464 Rune *classp;
466 classp = runemalloc(DCLASS);
467 n = 0;
468 na = DCLASS;
469 /* we have already seen the '[' */
470 if(*exprp == '^'){
471 classp[n++] = '\n'; /* don't match newline in negate case */
472 negateclass = TRUE;
473 exprp++;
474 }else
475 negateclass = FALSE;
476 while((c1 = nextrec()) != ']'){
477 if(c1 == '-'){
478 Error:
479 free(classp);
480 regerror("malformed `[]'");
482 if(n+4 >= na){ /* 3 runes plus NUL */
483 na += DCLASS;
484 classp = runerealloc(classp, na);
486 if(*exprp == '-'){
487 exprp++; /* eat '-' */
488 if((c2 = nextrec()) == ']')
489 goto Error;
490 classp[n+0] = 0xFFFF;
491 classp[n+1] = c1;
492 classp[n+2] = c2;
493 n += 3;
494 }else
495 classp[n++] = c1;
497 classp[n] = 0;
498 if(nclass == Nclass){
499 Nclass += DCLASS;
500 class = realloc(class, Nclass*sizeof(Rune*));
502 class[nclass++] = classp;
505 int
506 classmatch(int classno, int c, int negate)
508 Rune *p;
510 p = class[classno];
511 while(*p){
512 if(*p == 0xFFFF){
513 if(p[1]<=c && c<=p[2])
514 return !negate;
515 p += 3;
516 }else if(*p++ == c)
517 return !negate;
519 return negate;
522 /*
523 * Note optimization in addinst:
524 * *l must be pending when addinst called; if *l has been looked
525 * at already, the optimization is a bug.
526 */
527 void
528 addinst(Ilist *l, Inst *inst, Rangeset *sep)
530 Ilist *p;
532 for(p = l; p->inst; p++){
533 if(p->inst==inst){
534 if((sep)->r[0].q0 < p->se.r[0].q0)
535 p->se= *sep; /* this would be bug */
536 return; /* It's already there */
539 p->inst = inst;
540 p->se= *sep;
541 (p+1)->inst = nil;
544 int
545 rxnull(void)
547 return startinst==nil || bstartinst==nil;
550 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
551 int
552 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
554 int flag;
555 Inst *inst;
556 Ilist *tlp;
557 uint p;
558 int nnl, ntl;
559 int nc, c;
560 int wrapped;
561 int startchar;
563 flag = 0;
564 p = startp;
565 startchar = 0;
566 wrapped = 0;
567 nnl = 0;
568 if(startinst->type<OPERATOR)
569 startchar = startinst->type;
570 list[0][0].inst = list[1][0].inst = nil;
571 sel.r[0].q0 = -1;
572 if(t != nil)
573 nc = t->file->b.nc;
574 else
575 nc = runestrlen(r);
576 /* Execute machine once for each character */
577 for(;;p++){
578 doloop:
579 if(p>=eof || p>=nc){
580 switch(wrapped++){
581 case 0: /* let loop run one more click */
582 case 2:
583 break;
584 case 1: /* expired; wrap to beginning */
585 if(sel.r[0].q0>=0 || eof!=Infinity)
586 goto Return;
587 list[0][0].inst = list[1][0].inst = nil;
588 p = 0;
589 goto doloop;
590 default:
591 goto Return;
593 c = 0;
594 }else{
595 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
596 break;
597 if(t != nil)
598 c = textreadc(t, p);
599 else
600 c = r[p];
602 /* fast check for first char */
603 if(startchar && nnl==0 && c!=startchar)
604 continue;
605 tl = list[flag];
606 nl = list[flag^=1];
607 nl->inst = nil;
608 ntl = nnl;
609 nnl = 0;
610 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
611 /* Add first instruction to this list */
612 if(++ntl >= NLIST){
613 Overflow:
614 warning(nil, "regexp list overflow\n");
615 sel.r[0].q0 = -1;
616 goto Return;
618 sempty.r[0].q0 = p;
619 addinst(tl, startinst, &sempty);
621 /* Execute machine until this list is empty */
622 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
623 Switchstmt:
624 switch(inst->type){
625 default: /* regular character */
626 if(inst->type==c){
627 Addinst:
628 if(++nnl >= NLIST)
629 goto Overflow;
630 addinst(nl, inst->u1.next, &tlp->se);
632 break;
633 case LBRA:
634 if(inst->u.subid>=0)
635 tlp->se.r[inst->u.subid].q0 = p;
636 inst = inst->u1.next;
637 goto Switchstmt;
638 case RBRA:
639 if(inst->u.subid>=0)
640 tlp->se.r[inst->u.subid].q1 = p;
641 inst = inst->u1.next;
642 goto Switchstmt;
643 case ANY:
644 if(c!='\n')
645 goto Addinst;
646 break;
647 case BOL:
648 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
649 Step:
650 inst = inst->u1.next;
651 goto Switchstmt;
653 break;
654 case EOL:
655 if(c == '\n')
656 goto Step;
657 break;
658 case CCLASS:
659 if(c>=0 && classmatch(inst->u.class, c, 0))
660 goto Addinst;
661 break;
662 case NCCLASS:
663 if(c>=0 && classmatch(inst->u.class, c, 1))
664 goto Addinst;
665 break;
666 case OR:
667 /* evaluate right choice later */
668 if(++ntl >= NLIST)
669 goto Overflow;
670 addinst(tlp, inst->u.right, &tlp->se);
671 /* efficiency: advance and re-evaluate */
672 inst = inst->u1.left;
673 goto Switchstmt;
674 case END: /* Match! */
675 tlp->se.r[0].q1 = p;
676 newmatch(&tlp->se);
677 break;
681 Return:
682 *rp = sel;
683 return sel.r[0].q0 >= 0;
686 void
687 newmatch(Rangeset *sp)
689 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
690 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
691 sel = *sp;
694 int
695 rxbexecute(Text *t, uint startp, Rangeset *rp)
697 int flag;
698 Inst *inst;
699 Ilist *tlp;
700 int p;
701 int nnl, ntl;
702 int c;
703 int wrapped;
704 int startchar;
706 flag = 0;
707 nnl = 0;
708 wrapped = 0;
709 p = startp;
710 startchar = 0;
711 if(bstartinst->type<OPERATOR)
712 startchar = bstartinst->type;
713 list[0][0].inst = list[1][0].inst = nil;
714 sel.r[0].q0= -1;
715 /* Execute machine once for each character, including terminal NUL */
716 for(;;--p){
717 doloop:
718 if(p <= 0){
719 switch(wrapped++){
720 case 0: /* let loop run one more click */
721 case 2:
722 break;
723 case 1: /* expired; wrap to end */
724 if(sel.r[0].q0>=0)
725 goto Return;
726 list[0][0].inst = list[1][0].inst = nil;
727 p = t->file->b.nc;
728 goto doloop;
729 case 3:
730 default:
731 goto Return;
733 c = 0;
734 }else{
735 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
736 break;
737 c = textreadc(t, p-1);
739 /* fast check for first char */
740 if(startchar && nnl==0 && c!=startchar)
741 continue;
742 tl = list[flag];
743 nl = list[flag^=1];
744 nl->inst = nil;
745 ntl = nnl;
746 nnl = 0;
747 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
748 /* Add first instruction to this list */
749 if(++ntl >= NLIST){
750 Overflow:
751 warning(nil, "regexp list overflow\n");
752 sel.r[0].q0 = -1;
753 goto Return;
755 /* the minus is so the optimizations in addinst work */
756 sempty.r[0].q0 = -p;
757 addinst(tl, bstartinst, &sempty);
759 /* Execute machine until this list is empty */
760 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
761 Switchstmt:
762 switch(inst->type){
763 default: /* regular character */
764 if(inst->type == c){
765 Addinst:
766 if(++nnl >= NLIST)
767 goto Overflow;
768 addinst(nl, inst->u1.next, &tlp->se);
770 break;
771 case LBRA:
772 if(inst->u.subid>=0)
773 tlp->se.r[inst->u.subid].q0 = p;
774 inst = inst->u1.next;
775 goto Switchstmt;
776 case RBRA:
777 if(inst->u.subid >= 0)
778 tlp->se.r[inst->u.subid].q1 = p;
779 inst = inst->u1.next;
780 goto Switchstmt;
781 case ANY:
782 if(c != '\n')
783 goto Addinst;
784 break;
785 case BOL:
786 if(c=='\n' || p==0){
787 Step:
788 inst = inst->u1.next;
789 goto Switchstmt;
791 break;
792 case EOL:
793 if(p<t->file->b.nc && textreadc(t, p)=='\n')
794 goto Step;
795 break;
796 case CCLASS:
797 if(c>0 && classmatch(inst->u.class, c, 0))
798 goto Addinst;
799 break;
800 case NCCLASS:
801 if(c>0 && classmatch(inst->u.class, c, 1))
802 goto Addinst;
803 break;
804 case OR:
805 /* evaluate right choice later */
806 if(++ntl >= NLIST)
807 goto Overflow;
808 addinst(tlp, inst->u.right, &tlp->se);
809 /* efficiency: advance and re-evaluate */
810 inst = inst->u1.left;
811 goto Switchstmt;
812 case END: /* Match! */
813 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
814 tlp->se.r[0].q1 = p;
815 bnewmatch(&tlp->se);
816 break;
820 Return:
821 *rp = sel;
822 return sel.r[0].q0 >= 0;
825 void
826 bnewmatch(Rangeset *sp)
828 int i;
830 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
831 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
832 sel.r[i].q0 = sp->r[i].q1;
833 sel.r[i].q1 = sp->r[i].q0;