Blob


1 #include "lib9.h"
2 #include "regexp9.h"
3 #include "regcomp.h"
5 #define TRUE 1
6 #define FALSE 0
8 /*
9 * Parser Information
10 */
11 typedef
12 struct Node
13 {
14 Reinst* first;
15 Reinst* last;
16 }Node;
18 /* max rune ranges per character class is nelem(classp->spans)/2 */
19 #define NCCRUNE nelem(classp->spans)
21 #define NSTACK 20
22 static Node andstack[NSTACK];
23 static Node *andp;
24 static int atorstack[NSTACK];
25 static int* atorp;
26 static int cursubid; /* id of current subexpression */
27 static int subidstack[NSTACK]; /* parallel to atorstack */
28 static int* subidp;
29 static int lastwasand; /* Last token was operand */
30 static int nbra;
31 static char* exprp; /* pointer to next character in source expression */
32 static int lexdone;
33 static int nclass;
34 static Reclass*classp;
35 static Reinst* freep;
36 static int errors;
37 static Rune yyrune; /* last lex'd rune */
38 static Reclass*yyclassp; /* last lex'd class */
40 /* predeclared crap */
41 static void operator(int);
42 static void pushand(Reinst*, Reinst*);
43 static void pushator(int);
44 static void evaluntil(int);
45 static int bldcclass(void);
47 static jmp_buf regkaboom;
49 static void
50 rcerror(char *s)
51 {
52 errors++;
53 regerror(s);
54 longjmp(regkaboom, 1);
55 }
57 static Reinst*
58 newinst(int t)
59 {
60 freep->type = t;
61 freep->u2.left = 0;
62 freep->u1.right = 0;
63 return freep++;
64 }
66 static void
67 operand(int t)
68 {
69 Reinst *i;
71 if(lastwasand)
72 operator(CAT); /* catenate is implicit */
73 i = newinst(t);
75 if(t == CCLASS || t == NCCLASS)
76 i->u1.cp = yyclassp;
77 if(t == RUNE)
78 i->u1.r = yyrune;
80 pushand(i, i);
81 lastwasand = TRUE;
82 }
84 static void
85 operator(int t)
86 {
87 if(t==RBRA && --nbra<0)
88 rcerror("unmatched right paren");
89 if(t==LBRA){
90 if(++cursubid >= NSUBEXP)
91 rcerror ("too many subexpressions");
92 nbra++;
93 if(lastwasand)
94 operator(CAT);
95 } else
96 evaluntil(t);
97 if(t != RBRA)
98 pushator(t);
99 lastwasand = FALSE;
100 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
101 lastwasand = TRUE; /* these look like operands */
104 static void
105 regerr2(char *s, int c)
107 char buf[100];
108 char *cp = buf;
109 while(*s)
110 *cp++ = *s++;
111 *cp++ = c;
112 *cp = '\0';
113 rcerror(buf);
116 static void
117 cant(char *s)
119 char buf[100];
120 strcpy(buf, "can't happen: ");
121 strcat(buf, s);
122 rcerror(buf);
125 static void
126 pushand(Reinst *f, Reinst *l)
128 if(andp >= &andstack[NSTACK])
129 cant("operand stack overflow");
130 andp->first = f;
131 andp->last = l;
132 andp++;
135 static void
136 pushator(int t)
138 if(atorp >= &atorstack[NSTACK])
139 cant("operator stack overflow");
140 *atorp++ = t;
141 *subidp++ = cursubid;
144 static Node*
145 popand(int op)
147 Reinst *inst;
149 if(andp <= &andstack[0]){
150 regerr2("missing operand for ", op);
151 inst = newinst(NOP);
152 pushand(inst,inst);
154 return --andp;
157 static int
158 popator(void)
160 if(atorp <= &atorstack[0])
161 cant("operator stack underflow");
162 --subidp;
163 return *--atorp;
166 static void
167 evaluntil(int pri)
169 Node *op1, *op2;
170 Reinst *inst1, *inst2;
172 while(pri==RBRA || atorp[-1]>=pri){
173 switch(popator()){
174 default:
175 rcerror("unknown operator in evaluntil");
176 break;
177 case LBRA: /* must have been RBRA */
178 op1 = popand('(');
179 inst2 = newinst(RBRA);
180 inst2->u1.subid = *subidp;
181 op1->last->u2.next = inst2;
182 inst1 = newinst(LBRA);
183 inst1->u1.subid = *subidp;
184 inst1->u2.next = op1->first;
185 pushand(inst1, inst2);
186 return;
187 case OR:
188 op2 = popand('|');
189 op1 = popand('|');
190 inst2 = newinst(NOP);
191 op2->last->u2.next = inst2;
192 op1->last->u2.next = inst2;
193 inst1 = newinst(OR);
194 inst1->u1.right = op1->first;
195 inst1->u2.left = op2->first;
196 pushand(inst1, inst2);
197 break;
198 case CAT:
199 op2 = popand(0);
200 op1 = popand(0);
201 op1->last->u2.next = op2->first;
202 pushand(op1->first, op2->last);
203 break;
204 case STAR:
205 op2 = popand('*');
206 inst1 = newinst(OR);
207 op2->last->u2.next = inst1;
208 inst1->u1.right = op2->first;
209 pushand(inst1, inst1);
210 break;
211 case PLUS:
212 op2 = popand('+');
213 inst1 = newinst(OR);
214 op2->last->u2.next = inst1;
215 inst1->u1.right = op2->first;
216 pushand(op2->first, inst1);
217 break;
218 case QUEST:
219 op2 = popand('?');
220 inst1 = newinst(OR);
221 inst2 = newinst(NOP);
222 inst1->u2.left = inst2;
223 inst1->u1.right = op2->first;
224 op2->last->u2.next = inst2;
225 pushand(inst1, inst2);
226 break;
231 static Reprog*
232 optimize(Reprog *pp)
234 Reinst *inst, *target;
235 int size;
236 Reprog *npp;
237 Reclass *cl;
238 ptrdiff_t diff;
240 /*
241 * get rid of NOOP chains
242 */
243 for(inst=pp->firstinst; inst->type!=END; inst++){
244 target = inst->u2.next;
245 while(target->type == NOP)
246 target = target->u2.next;
247 inst->u2.next = target;
250 /*
251 * The original allocation is for an area larger than
252 * necessary. Reallocate to the actual space used
253 * and then relocate the code.
254 */
255 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
256 npp = realloc(pp, size);
257 if(npp==0 || npp==pp)
258 return pp;
259 diff = (char *)npp - (char *)pp;
260 freep = (Reinst *)((char *)freep + diff);
261 for(inst=npp->firstinst; inst<freep; inst++){
262 switch(inst->type){
263 case OR:
264 case STAR:
265 case PLUS:
266 case QUEST:
267 inst->u1.right = (void*)((char*)inst->u1.right + diff);
268 break;
269 case CCLASS:
270 case NCCLASS:
271 inst->u1.right = (void*)((char*)inst->u1.right + diff);
272 cl = inst->u1.cp;
273 cl->end = (void*)((char*)cl->end + diff);
274 break;
276 inst->u2.left = (void*)((char*)inst->u2.left + diff);
278 npp->startinst = (void*)((char*)npp->startinst + diff);
279 return npp;
282 #ifdef DEBUG
283 static void
284 dumpstack(void){
285 Node *stk;
286 int *ip;
288 print("operators\n");
289 for(ip=atorstack; ip<atorp; ip++)
290 print("0%o\n", *ip);
291 print("operands\n");
292 for(stk=andstack; stk<andp; stk++)
293 print("0%o\t0%o\n", stk->first->type, stk->last->type);
296 static void
297 dump(Reprog *pp)
299 Reinst *l;
300 Rune *p;
302 l = pp->firstinst;
303 do{
304 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
305 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
306 if(l->type == RUNE)
307 print("\t%C\n", l->u1.r);
308 else if(l->type == CCLASS || l->type == NCCLASS){
309 print("\t[");
310 if(l->type == NCCLASS)
311 print("^");
312 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)
313 if(p[0] == p[1])
314 print("%C", p[0]);
315 else
316 print("%C-%C", p[0], p[1]);
317 print("]\n");
318 } else
319 print("\n");
320 }while(l++->type);
322 #endif
324 static Reclass*
325 newclass(void)
327 if(nclass >= nelem(((Reprog*)0)->class))
328 rcerror("too many character classes; increase Reprog.class size");
329 return &(classp[nclass++]);
332 static int
333 nextc(Rune *rp)
335 if(lexdone){
336 *rp = 0;
337 return 1;
339 exprp += chartorune(rp, exprp);
340 if(*rp == '\\'){
341 exprp += chartorune(rp, exprp);
342 return 1;
344 if(*rp == 0)
345 lexdone = 1;
346 return 0;
349 static int
350 lex(int literal, int dot_type)
352 int quoted;
354 quoted = nextc(&yyrune);
355 if(literal || quoted){
356 if(yyrune == 0)
357 return END;
358 return RUNE;
361 switch(yyrune){
362 case 0:
363 return END;
364 case '*':
365 return STAR;
366 case '?':
367 return QUEST;
368 case '+':
369 return PLUS;
370 case '|':
371 return OR;
372 case '.':
373 return dot_type;
374 case '(':
375 return LBRA;
376 case ')':
377 return RBRA;
378 case '^':
379 return BOL;
380 case '$':
381 return EOL;
382 case '[':
383 return bldcclass();
385 return RUNE;
388 static int
389 bldcclass(void)
391 int type;
392 Rune r[NCCRUNE];
393 Rune *p, *ep, *np;
394 Rune rune;
395 int quoted;
397 /* we have already seen the '[' */
398 type = CCLASS;
399 yyclassp = newclass();
401 /* look ahead for negation */
402 /* SPECIAL CASE!!! negated classes don't match \n */
403 ep = r;
404 quoted = nextc(&rune);
405 if(!quoted && rune == '^'){
406 type = NCCLASS;
407 quoted = nextc(&rune);
408 *ep++ = '\n';
409 *ep++ = '\n';
412 /* parse class into a set of spans */
413 while(ep < &r[NCCRUNE-1]){
414 if(rune == 0){
415 rcerror("malformed '[]'");
416 return 0;
418 if(!quoted && rune == ']')
419 break;
420 if(!quoted && rune == '-'){
421 if(ep == r){
422 rcerror("malformed '[]'");
423 return 0;
425 quoted = nextc(&rune);
426 if((!quoted && rune == ']') || rune == 0){
427 rcerror("malformed '[]'");
428 return 0;
430 *(ep-1) = rune;
431 } else {
432 *ep++ = rune;
433 *ep++ = rune;
435 quoted = nextc(&rune);
437 if(ep >= &r[NCCRUNE-1]) {
438 rcerror("char class too large; increase Reclass.spans size");
439 return 0;
442 /* sort on span start */
443 for(p = r; p < ep; p += 2){
444 for(np = p; np < ep; np += 2)
445 if(*np < *p){
446 rune = np[0];
447 np[0] = p[0];
448 p[0] = rune;
449 rune = np[1];
450 np[1] = p[1];
451 p[1] = rune;
455 /* merge spans */
456 np = yyclassp->spans;
457 p = r;
458 if(r == ep)
459 yyclassp->end = np;
460 else {
461 np[0] = *p++;
462 np[1] = *p++;
463 for(; p < ep; p += 2)
464 /* overlapping or adjacent ranges? */
465 if(p[0] <= np[1] + 1){
466 if(p[1] >= np[1])
467 np[1] = p[1]; /* coalesce */
468 } else {
469 np += 2;
470 np[0] = p[0];
471 np[1] = p[1];
473 yyclassp->end = np+2;
476 return type;
479 static Reprog*
480 regcomp1(char *s, int literal, int dot_type)
482 int token;
483 Reprog *volatile pp;
485 /* get memory for the program */
486 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
487 if(pp == 0){
488 regerror("out of memory");
489 return 0;
491 freep = pp->firstinst;
492 classp = pp->class;
493 errors = 0;
495 if(setjmp(regkaboom))
496 goto out;
498 /* go compile the sucker */
499 lexdone = 0;
500 exprp = s;
501 nclass = 0;
502 nbra = 0;
503 atorp = atorstack;
504 andp = andstack;
505 subidp = subidstack;
506 lastwasand = FALSE;
507 cursubid = 0;
509 /* Start with a low priority operator to prime parser */
510 pushator(START-1);
511 while((token = lex(literal, dot_type)) != END){
512 if((token&0300) == OPERATOR)
513 operator(token);
514 else
515 operand(token);
518 /* Close with a low priority operator */
519 evaluntil(START);
521 /* Force END */
522 operand(END);
523 evaluntil(START);
524 #ifdef DEBUG
525 dumpstack();
526 #endif
527 if(nbra)
528 rcerror("unmatched left paren");
529 --andp; /* points to first and only operand */
530 pp->startinst = andp->first;
531 #ifdef DEBUG
532 dump(pp);
533 #endif
534 pp = optimize(pp);
535 #ifdef DEBUG
536 print("start: %d\n", andp->first-pp->firstinst);
537 dump(pp);
538 #endif
539 out:
540 if(errors){
541 free(pp);
542 pp = 0;
544 return pp;
547 extern Reprog*
548 regcomp(char *s)
550 return regcomp1(s, 0, ANY);
553 extern Reprog*
554 regcomplit(char *s)
556 return regcomp1(s, 1, ANY);
559 extern Reprog*
560 regcompnl(char *s)
562 return regcomp1(s, 0, ANYNL);