Blob


1 #include "lib9.h"
2 #include "regexp9.h"
3 #include "regcomp.h"
5 #define TRUE 1
6 #define FALSE 0
8 /*
9 * Parser Information
10 */
11 typedef
12 struct Node
13 {
14 Reinst* first;
15 Reinst* last;
16 }Node;
18 #define NSTACK 20
19 static Node andstack[NSTACK];
20 static Node *andp;
21 static int atorstack[NSTACK];
22 static int* atorp;
23 static int cursubid; /* id of current subexpression */
24 static int subidstack[NSTACK]; /* parallel to atorstack */
25 static int* subidp;
26 static int lastwasand; /* Last token was operand */
27 static int nbra;
28 static char* exprp; /* pointer to next character in source expression */
29 static int lexdone;
30 static int nclass;
31 static Reclass*classp;
32 static Reinst* freep;
33 static int errors;
34 static Rune yyrune; /* last lex'd rune */
35 static Reclass*yyclassp; /* last lex'd class */
37 /* predeclared crap */
38 static void operator(int);
39 static void pushand(Reinst*, Reinst*);
40 static void pushator(int);
41 static void evaluntil(int);
42 static int bldcclass(void);
44 static jmp_buf regkaboom;
46 static void
47 rcerror(char *s)
48 {
49 errors++;
50 regerror(s);
51 longjmp(regkaboom, 1);
52 }
54 static Reinst*
55 newinst(int t)
56 {
57 freep->type = t;
58 freep->u2.left = 0;
59 freep->u1.right = 0;
60 return freep++;
61 }
63 static void
64 operand(int t)
65 {
66 Reinst *i;
68 if(lastwasand)
69 operator(CAT); /* catenate is implicit */
70 i = newinst(t);
72 if(t == CCLASS || t == NCCLASS)
73 i->u1.cp = yyclassp;
74 if(t == RUNE)
75 i->u1.r = yyrune;
77 pushand(i, i);
78 lastwasand = TRUE;
79 }
81 static void
82 operator(int t)
83 {
84 if(t==RBRA && --nbra<0)
85 rcerror("unmatched right paren");
86 if(t==LBRA){
87 if(++cursubid >= NSUBEXP)
88 rcerror ("too many subexpressions");
89 nbra++;
90 if(lastwasand)
91 operator(CAT);
92 } else
93 evaluntil(t);
94 if(t != RBRA)
95 pushator(t);
96 lastwasand = FALSE;
97 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
98 lastwasand = TRUE; /* these look like operands */
99 }
101 static void
102 regerr2(char *s, int c)
104 char buf[100];
105 char *cp = buf;
106 while(*s)
107 *cp++ = *s++;
108 *cp++ = c;
109 *cp = '\0';
110 rcerror(buf);
113 static void
114 cant(char *s)
116 char buf[100];
117 strcpy(buf, "can't happen: ");
118 strcat(buf, s);
119 rcerror(buf);
122 static void
123 pushand(Reinst *f, Reinst *l)
125 if(andp >= &andstack[NSTACK])
126 cant("operand stack overflow");
127 andp->first = f;
128 andp->last = l;
129 andp++;
132 static void
133 pushator(int t)
135 if(atorp >= &atorstack[NSTACK])
136 cant("operator stack overflow");
137 *atorp++ = t;
138 *subidp++ = cursubid;
141 static Node*
142 popand(int op)
144 Reinst *inst;
146 if(andp <= &andstack[0]){
147 regerr2("missing operand for ", op);
148 inst = newinst(NOP);
149 pushand(inst,inst);
151 return --andp;
154 static int
155 popator(void)
157 if(atorp <= &atorstack[0])
158 cant("operator stack underflow");
159 --subidp;
160 return *--atorp;
163 static void
164 evaluntil(int pri)
166 Node *op1, *op2;
167 Reinst *inst1, *inst2;
169 while(pri==RBRA || atorp[-1]>=pri){
170 switch(popator()){
171 default:
172 rcerror("unknown operator in evaluntil");
173 break;
174 case LBRA: /* must have been RBRA */
175 op1 = popand('(');
176 inst2 = newinst(RBRA);
177 inst2->u1.subid = *subidp;
178 op1->last->u2.next = inst2;
179 inst1 = newinst(LBRA);
180 inst1->u1.subid = *subidp;
181 inst1->u2.next = op1->first;
182 pushand(inst1, inst2);
183 return;
184 case OR:
185 op2 = popand('|');
186 op1 = popand('|');
187 inst2 = newinst(NOP);
188 op2->last->u2.next = inst2;
189 op1->last->u2.next = inst2;
190 inst1 = newinst(OR);
191 inst1->u1.right = op1->first;
192 inst1->u2.left = op2->first;
193 pushand(inst1, inst2);
194 break;
195 case CAT:
196 op2 = popand(0);
197 op1 = popand(0);
198 op1->last->u2.next = op2->first;
199 pushand(op1->first, op2->last);
200 break;
201 case STAR:
202 op2 = popand('*');
203 inst1 = newinst(OR);
204 op2->last->u2.next = inst1;
205 inst1->u1.right = op2->first;
206 pushand(inst1, inst1);
207 break;
208 case PLUS:
209 op2 = popand('+');
210 inst1 = newinst(OR);
211 op2->last->u2.next = inst1;
212 inst1->u1.right = op2->first;
213 pushand(op2->first, inst1);
214 break;
215 case QUEST:
216 op2 = popand('?');
217 inst1 = newinst(OR);
218 inst2 = newinst(NOP);
219 inst1->u2.left = inst2;
220 inst1->u1.right = op2->first;
221 op2->last->u2.next = inst2;
222 pushand(inst1, inst2);
223 break;
228 static Reprog*
229 optimize(Reprog *pp)
231 Reinst *inst, *target;
232 int size;
233 Reprog *npp;
234 Reclass *cl;
235 int diff;
237 /*
238 * get rid of NOOP chains
239 */
240 for(inst=pp->firstinst; inst->type!=END; inst++){
241 target = inst->u2.next;
242 while(target->type == NOP)
243 target = target->u2.next;
244 inst->u2.next = target;
247 /*
248 * The original allocation is for an area larger than
249 * necessary. Reallocate to the actual space used
250 * and then relocate the code.
251 */
252 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
253 npp = realloc(pp, size);
254 if(npp==0 || npp==pp)
255 return pp;
256 diff = (char *)npp - (char *)pp;
257 freep = (Reinst *)((char *)freep + diff);
258 for(inst=npp->firstinst; inst<freep; inst++){
259 switch(inst->type){
260 case OR:
261 case STAR:
262 case PLUS:
263 case QUEST:
264 *(char**)(void*)&inst->u1.right += diff;
265 break;
266 case CCLASS:
267 case NCCLASS:
268 *(char**)(void*)&inst->u1.right += diff;
269 cl = inst->u1.cp;
270 *(char**)(void*)&cl->end += diff;
271 break;
273 *(char**)(void*)&inst->u2.left += diff;
275 *(char**)(void*)&npp->startinst += diff;
276 return npp;
279 #ifdef DEBUG
280 static void
281 dumpstack(void){
282 Node *stk;
283 int *ip;
285 print("operators\n");
286 for(ip=atorstack; ip<atorp; ip++)
287 print("0%o\n", *ip);
288 print("operands\n");
289 for(stk=andstack; stk<andp; stk++)
290 print("0%o\t0%o\n", stk->first->type, stk->last->type);
293 static void
294 dump(Reprog *pp)
296 Reinst *l;
297 Rune *p;
299 l = pp->firstinst;
300 do{
301 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
302 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
303 if(l->type == RUNE)
304 print("\t%C\n", l->u1.r);
305 else if(l->type == CCLASS || l->type == NCCLASS){
306 print("\t[");
307 if(l->type == NCCLASS)
308 print("^");
309 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
310 if(p[0] == p[1])
311 print("%C", p[0]);
312 else
313 print("%C-%C", p[0], p[1]);
314 print("]\n");
315 } else
316 print("\n");
317 }while(l++->type);
319 #endif
321 static Reclass*
322 newclass(void)
324 if(nclass >= NCLASS)
325 regerr2("too many character classes; limit", NCLASS+'0');
326 return &(classp[nclass++]);
329 static int
330 nextc(Rune *rp)
332 if(lexdone){
333 *rp = 0;
334 return 1;
336 exprp += chartorune(rp, exprp);
337 if(*rp == '\\'){
338 exprp += chartorune(rp, exprp);
339 return 1;
341 if(*rp == 0)
342 lexdone = 1;
343 return 0;
346 static int
347 lex(int literal, int dot_type)
349 int quoted;
351 quoted = nextc(&yyrune);
352 if(literal || quoted){
353 if(yyrune == 0)
354 return END;
355 return RUNE;
358 switch(yyrune){
359 case 0:
360 return END;
361 case '*':
362 return STAR;
363 case '?':
364 return QUEST;
365 case '+':
366 return PLUS;
367 case '|':
368 return OR;
369 case '.':
370 return dot_type;
371 case '(':
372 return LBRA;
373 case ')':
374 return RBRA;
375 case '^':
376 return BOL;
377 case '$':
378 return EOL;
379 case '[':
380 return bldcclass();
382 return RUNE;
385 static int
386 bldcclass(void)
388 int type;
389 Rune r[NCCRUNE];
390 Rune *p, *ep, *np;
391 Rune rune;
392 int quoted;
394 /* we have already seen the '[' */
395 type = CCLASS;
396 yyclassp = newclass();
398 /* look ahead for negation */
399 /* SPECIAL CASE!!! negated classes don't match \n */
400 ep = r;
401 quoted = nextc(&rune);
402 if(!quoted && rune == '^'){
403 type = NCCLASS;
404 quoted = nextc(&rune);
405 *ep++ = '\n';
406 *ep++ = '\n';
409 /* parse class into a set of spans */
410 for(; ep<&r[NCCRUNE];){
411 if(rune == 0){
412 rcerror("malformed '[]'");
413 return 0;
415 if(!quoted && rune == ']')
416 break;
417 if(!quoted && rune == '-'){
418 if(ep == r){
419 rcerror("malformed '[]'");
420 return 0;
422 quoted = nextc(&rune);
423 if((!quoted && rune == ']') || rune == 0){
424 rcerror("malformed '[]'");
425 return 0;
427 *(ep-1) = rune;
428 } else {
429 *ep++ = rune;
430 *ep++ = rune;
432 quoted = nextc(&rune);
435 /* sort on span start */
436 for(p = r; p < ep; p += 2){
437 for(np = p; np < ep; np += 2)
438 if(*np < *p){
439 rune = np[0];
440 np[0] = p[0];
441 p[0] = rune;
442 rune = np[1];
443 np[1] = p[1];
444 p[1] = rune;
448 /* merge spans */
449 np = yyclassp->spans;
450 p = r;
451 if(r == ep)
452 yyclassp->end = np;
453 else {
454 np[0] = *p++;
455 np[1] = *p++;
456 for(; p < ep; p += 2)
457 if(p[0] <= np[1]){
458 if(p[1] > np[1])
459 np[1] = p[1];
460 } else {
461 np += 2;
462 np[0] = p[0];
463 np[1] = p[1];
465 yyclassp->end = np+2;
468 return type;
471 static Reprog*
472 regcomp1(char *s, int literal, int dot_type)
474 int token;
475 Reprog *volatile pp;
477 /* get memory for the program */
478 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
479 if(pp == 0){
480 regerror("out of memory");
481 return 0;
483 freep = pp->firstinst;
484 classp = pp->class;
485 errors = 0;
487 if(setjmp(regkaboom))
488 goto out;
490 /* go compile the sucker */
491 lexdone = 0;
492 exprp = s;
493 nclass = 0;
494 nbra = 0;
495 atorp = atorstack;
496 andp = andstack;
497 subidp = subidstack;
498 lastwasand = FALSE;
499 cursubid = 0;
501 /* Start with a low priority operator to prime parser */
502 pushator(START-1);
503 while((token = lex(literal, dot_type)) != END){
504 if((token&0300) == OPERATOR)
505 operator(token);
506 else
507 operand(token);
510 /* Close with a low priority operator */
511 evaluntil(START);
513 /* Force END */
514 operand(END);
515 evaluntil(START);
516 #ifdef DEBUG
517 dumpstack();
518 #endif
519 if(nbra)
520 rcerror("unmatched left paren");
521 --andp; /* points to first and only operand */
522 pp->startinst = andp->first;
523 #ifdef DEBUG
524 dump(pp);
525 #endif
526 pp = optimize(pp);
527 #ifdef DEBUG
528 print("start: %d\n", andp->first-pp->firstinst);
529 dump(pp);
530 #endif
531 out:
532 if(errors){
533 free(pp);
534 pp = 0;
536 return pp;
539 extern Reprog*
540 regcomp(char *s)
542 return regcomp1(s, 0, ANY);
545 extern Reprog*
546 regcomplit(char *s)
548 return regcomp1(s, 1, ANY);
551 extern Reprog*
552 regcompnl(char *s)
554 return regcomp1(s, 0, ANYNL);