Blob


1 #include "lib9.h"
2 #include "regexp9.h"
3 #include "regcomp.h"
5 #define TRUE 1
6 #define FALSE 0
8 /*
9 * Parser Information
10 */
11 typedef
12 struct Node
13 {
14 Reinst* first;
15 Reinst* last;
16 }Node;
18 /* max character classes per program is nelem(reprog->class) */
19 static Reprog *reprog;
21 /* max rune ranges per character class is nelem(classp->spans)/2 */
22 #define NCCRUNE nelem(classp->spans)
24 #define NSTACK 20
25 static Node andstack[NSTACK];
26 static Node *andp;
27 static int atorstack[NSTACK];
28 static int* atorp;
29 static int cursubid; /* id of current subexpression */
30 static int subidstack[NSTACK]; /* parallel to atorstack */
31 static int* subidp;
32 static int lastwasand; /* Last token was operand */
33 static int nbra;
34 static char* exprp; /* pointer to next character in source expression */
35 static int lexdone;
36 static int nclass;
37 static Reclass*classp;
38 static Reinst* freep;
39 static int errors;
40 static Rune yyrune; /* last lex'd rune */
41 static Reclass*yyclassp; /* last lex'd class */
43 /* predeclared crap */
44 static void operator(int);
45 static void pushand(Reinst*, Reinst*);
46 static void pushator(int);
47 static void evaluntil(int);
48 static int bldcclass(void);
50 static jmp_buf regkaboom;
52 static void
53 rcerror(char *s)
54 {
55 errors++;
56 regerror(s);
57 longjmp(regkaboom, 1);
58 }
60 static Reinst*
61 newinst(int t)
62 {
63 freep->type = t;
64 freep->u2.left = 0;
65 freep->u1.right = 0;
66 return freep++;
67 }
69 static void
70 operand(int t)
71 {
72 Reinst *i;
74 if(lastwasand)
75 operator(CAT); /* catenate is implicit */
76 i = newinst(t);
78 if(t == CCLASS || t == NCCLASS)
79 i->u1.cp = yyclassp;
80 if(t == RUNE)
81 i->u1.r = yyrune;
83 pushand(i, i);
84 lastwasand = TRUE;
85 }
87 static void
88 operator(int t)
89 {
90 if(t==RBRA && --nbra<0)
91 rcerror("unmatched right paren");
92 if(t==LBRA){
93 if(++cursubid >= NSUBEXP)
94 rcerror ("too many subexpressions");
95 nbra++;
96 if(lastwasand)
97 operator(CAT);
98 } else
99 evaluntil(t);
100 if(t != RBRA)
101 pushator(t);
102 lastwasand = FALSE;
103 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
104 lastwasand = TRUE; /* these look like operands */
107 static void
108 regerr2(char *s, int c)
110 char buf[100];
111 char *cp = buf;
112 while(*s)
113 *cp++ = *s++;
114 *cp++ = c;
115 *cp = '\0';
116 rcerror(buf);
119 static void
120 cant(char *s)
122 char buf[100];
123 strcpy(buf, "can't happen: ");
124 strcat(buf, s);
125 rcerror(buf);
128 static void
129 pushand(Reinst *f, Reinst *l)
131 if(andp >= &andstack[NSTACK])
132 cant("operand stack overflow");
133 andp->first = f;
134 andp->last = l;
135 andp++;
138 static void
139 pushator(int t)
141 if(atorp >= &atorstack[NSTACK])
142 cant("operator stack overflow");
143 *atorp++ = t;
144 *subidp++ = cursubid;
147 static Node*
148 popand(int op)
150 Reinst *inst;
152 if(andp <= &andstack[0]){
153 regerr2("missing operand for ", op);
154 inst = newinst(NOP);
155 pushand(inst,inst);
157 return --andp;
160 static int
161 popator(void)
163 if(atorp <= &atorstack[0])
164 cant("operator stack underflow");
165 --subidp;
166 return *--atorp;
169 static void
170 evaluntil(int pri)
172 Node *op1, *op2;
173 Reinst *inst1, *inst2;
175 while(pri==RBRA || atorp[-1]>=pri){
176 switch(popator()){
177 default:
178 rcerror("unknown operator in evaluntil");
179 break;
180 case LBRA: /* must have been RBRA */
181 op1 = popand('(');
182 inst2 = newinst(RBRA);
183 inst2->u1.subid = *subidp;
184 op1->last->u2.next = inst2;
185 inst1 = newinst(LBRA);
186 inst1->u1.subid = *subidp;
187 inst1->u2.next = op1->first;
188 pushand(inst1, inst2);
189 return;
190 case OR:
191 op2 = popand('|');
192 op1 = popand('|');
193 inst2 = newinst(NOP);
194 op2->last->u2.next = inst2;
195 op1->last->u2.next = inst2;
196 inst1 = newinst(OR);
197 inst1->u1.right = op1->first;
198 inst1->u2.left = op2->first;
199 pushand(inst1, inst2);
200 break;
201 case CAT:
202 op2 = popand(0);
203 op1 = popand(0);
204 op1->last->u2.next = op2->first;
205 pushand(op1->first, op2->last);
206 break;
207 case STAR:
208 op2 = popand('*');
209 inst1 = newinst(OR);
210 op2->last->u2.next = inst1;
211 inst1->u1.right = op2->first;
212 pushand(inst1, inst1);
213 break;
214 case PLUS:
215 op2 = popand('+');
216 inst1 = newinst(OR);
217 op2->last->u2.next = inst1;
218 inst1->u1.right = op2->first;
219 pushand(op2->first, inst1);
220 break;
221 case QUEST:
222 op2 = popand('?');
223 inst1 = newinst(OR);
224 inst2 = newinst(NOP);
225 inst1->u2.left = inst2;
226 inst1->u1.right = op2->first;
227 op2->last->u2.next = inst2;
228 pushand(inst1, inst2);
229 break;
234 static Reprog*
235 optimize(Reprog *pp)
237 Reinst *inst, *target;
238 int size;
239 Reprog *npp;
240 Reclass *cl;
241 int diff;
243 /*
244 * get rid of NOOP chains
245 */
246 for(inst=pp->firstinst; inst->type!=END; inst++){
247 target = inst->u2.next;
248 while(target->type == NOP)
249 target = target->u2.next;
250 inst->u2.next = target;
253 /*
254 * The original allocation is for an area larger than
255 * necessary. Reallocate to the actual space used
256 * and then relocate the code.
257 */
258 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
259 npp = realloc(pp, size);
260 if(npp==0 || npp==pp)
261 return pp;
262 diff = (char *)npp - (char *)pp;
263 freep = (Reinst *)((char *)freep + diff);
264 for(inst=npp->firstinst; inst<freep; inst++){
265 switch(inst->type){
266 case OR:
267 case STAR:
268 case PLUS:
269 case QUEST:
270 inst->u1.right = (void*)((char*)inst->u1.right + diff);
271 break;
272 case CCLASS:
273 case NCCLASS:
274 inst->u1.right = (void*)((char*)inst->u1.right + diff);
275 cl = inst->u1.cp;
276 cl->end = (void*)((char*)cl->end + diff);
277 break;
279 inst->u2.left = (void*)((char*)inst->u2.left + diff);
281 npp->startinst = (void*)((char*)npp->startinst + diff);
282 return npp;
285 #ifdef DEBUG
286 static void
287 dumpstack(void){
288 Node *stk;
289 int *ip;
291 print("operators\n");
292 for(ip=atorstack; ip<atorp; ip++)
293 print("0%o\n", *ip);
294 print("operands\n");
295 for(stk=andstack; stk<andp; stk++)
296 print("0%o\t0%o\n", stk->first->type, stk->last->type);
299 static void
300 dump(Reprog *pp)
302 Reinst *l;
303 Rune *p;
305 l = pp->firstinst;
306 do{
307 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
308 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
309 if(l->type == RUNE)
310 print("\t%C\n", l->u1.r);
311 else if(l->type == CCLASS || l->type == NCCLASS){
312 print("\t[");
313 if(l->type == NCCLASS)
314 print("^");
315 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
316 if(p[0] == p[1])
317 print("%C", p[0]);
318 else
319 print("%C-%C", p[0], p[1]);
320 print("]\n");
321 } else
322 print("\n");
323 }while(l++->type);
325 #endif
327 static Reclass*
328 newclass(void)
330 if(nclass >= nelem(reprog->class))
331 rcerror("too many character classes; increase Reprog.class size");
332 return &(classp[nclass++]);
335 static int
336 nextc(Rune *rp)
338 if(lexdone){
339 *rp = 0;
340 return 1;
342 exprp += chartorune(rp, exprp);
343 if(*rp == '\\'){
344 exprp += chartorune(rp, exprp);
345 return 1;
347 if(*rp == 0)
348 lexdone = 1;
349 return 0;
352 static int
353 lex(int literal, int dot_type)
355 int quoted;
357 quoted = nextc(&yyrune);
358 if(literal || quoted){
359 if(yyrune == 0)
360 return END;
361 return RUNE;
364 switch(yyrune){
365 case 0:
366 return END;
367 case '*':
368 return STAR;
369 case '?':
370 return QUEST;
371 case '+':
372 return PLUS;
373 case '|':
374 return OR;
375 case '.':
376 return dot_type;
377 case '(':
378 return LBRA;
379 case ')':
380 return RBRA;
381 case '^':
382 return BOL;
383 case '$':
384 return EOL;
385 case '[':
386 return bldcclass();
388 return RUNE;
391 static int
392 bldcclass(void)
394 int type;
395 Rune r[NCCRUNE];
396 Rune *p, *ep, *np;
397 Rune rune;
398 int quoted;
400 /* we have already seen the '[' */
401 type = CCLASS;
402 yyclassp = newclass();
404 /* look ahead for negation */
405 /* SPECIAL CASE!!! negated classes don't match \n */
406 ep = r;
407 quoted = nextc(&rune);
408 if(!quoted && rune == '^'){
409 type = NCCLASS;
410 quoted = nextc(&rune);
411 *ep++ = '\n';
412 *ep++ = '\n';
415 /* parse class into a set of spans */
416 while(ep < &r[NCCRUNE-1]){
417 if(rune == 0){
418 rcerror("malformed '[]'");
419 return 0;
421 if(!quoted && rune == ']')
422 break;
423 if(!quoted && rune == '-'){
424 if(ep == r){
425 rcerror("malformed '[]'");
426 return 0;
428 quoted = nextc(&rune);
429 if((!quoted && rune == ']') || rune == 0){
430 rcerror("malformed '[]'");
431 return 0;
433 *(ep-1) = rune;
434 } else {
435 *ep++ = rune;
436 *ep++ = rune;
438 quoted = nextc(&rune);
440 if(ep >= &r[NCCRUNE-1]) {
441 rcerror("char class too large; increase Reclass.spans size");
442 return 0;
445 /* sort on span start */
446 for(p = r; p < ep; p += 2){
447 for(np = p; np < ep; np += 2)
448 if(*np < *p){
449 rune = np[0];
450 np[0] = p[0];
451 p[0] = rune;
452 rune = np[1];
453 np[1] = p[1];
454 p[1] = rune;
458 /* merge spans */
459 np = yyclassp->spans;
460 p = r;
461 if(r == ep)
462 yyclassp->end = np;
463 else {
464 np[0] = *p++;
465 np[1] = *p++;
466 for(; p < ep; p += 2)
467 /* overlapping or adjacent ranges? */
468 if(p[0] <= np[1] + 1){
469 if(p[1] >= np[1])
470 np[1] = p[1]; /* coalesce */
471 } else {
472 np += 2;
473 np[0] = p[0];
474 np[1] = p[1];
476 yyclassp->end = np+2;
479 return type;
482 static Reprog*
483 regcomp1(char *s, int literal, int dot_type)
485 int token;
486 Reprog *volatile pp;
488 /* get memory for the program */
489 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
490 if(pp == 0){
491 regerror("out of memory");
492 return 0;
494 freep = pp->firstinst;
495 classp = pp->class;
496 errors = 0;
498 if(setjmp(regkaboom))
499 goto out;
501 /* go compile the sucker */
502 lexdone = 0;
503 exprp = s;
504 nclass = 0;
505 nbra = 0;
506 atorp = atorstack;
507 andp = andstack;
508 subidp = subidstack;
509 lastwasand = FALSE;
510 cursubid = 0;
512 /* Start with a low priority operator to prime parser */
513 pushator(START-1);
514 while((token = lex(literal, dot_type)) != END){
515 if((token&0300) == OPERATOR)
516 operator(token);
517 else
518 operand(token);
521 /* Close with a low priority operator */
522 evaluntil(START);
524 /* Force END */
525 operand(END);
526 evaluntil(START);
527 #ifdef DEBUG
528 dumpstack();
529 #endif
530 if(nbra)
531 rcerror("unmatched left paren");
532 --andp; /* points to first and only operand */
533 pp->startinst = andp->first;
534 #ifdef DEBUG
535 dump(pp);
536 #endif
537 pp = optimize(pp);
538 #ifdef DEBUG
539 print("start: %d\n", andp->first-pp->firstinst);
540 dump(pp);
541 #endif
542 out:
543 if(errors){
544 free(pp);
545 pp = 0;
547 return pp;
550 extern Reprog*
551 regcomp(char *s)
553 return regcomp1(s, 0, ANY);
556 extern Reprog*
557 regcomplit(char *s)
559 return regcomp1(s, 1, ANY);
562 extern Reprog*
563 regcompnl(char *s)
565 return regcomp1(s, 0, ANYNL);