Blob


1 /* From libregexp but leaks extra classes when it runs out */
4 #include <u.h>
5 #include <libc.h>
6 #include "regexp.h"
7 #include "/sys/src/libregexp/regcomp.h"
9 #define TRUE 1
10 #define FALSE 0
12 /*
13 * Parser Information
14 */
15 typedef
16 struct Node
17 {
18 Reinst* first;
19 Reinst* last;
20 }Node;
22 #define NSTACK 20
23 static Node andstack[NSTACK];
24 static Node *andp;
25 static int atorstack[NSTACK];
26 static int* atorp;
27 static int cursubid; /* id of current subexpression */
28 static int subidstack[NSTACK]; /* parallel to atorstack */
29 static int* subidp;
30 static int lastwasand; /* Last token was operand */
31 static int nbra;
32 static char* exprp; /* pointer to next character in source expression */
33 static int lexdone;
34 static int nclass;
35 static Reclass*classp;
36 static Reinst* freep;
37 static int errors;
38 static Rune yyrune; /* last lex'd rune */
39 static Reclass*yyclassp; /* last lex'd class */
41 /* predeclared crap */
42 static void operator(int);
43 static void pushand(Reinst*, Reinst*);
44 static void pushator(int);
45 static void evaluntil(int);
46 static int bldcclass(void);
48 static jmp_buf regkaboom;
50 static void
51 rcerror(char *s)
52 {
53 errors++;
54 regerror(s);
55 longjmp(regkaboom, 1);
56 }
58 static Reinst*
59 newinst(int t)
60 {
61 freep->type = t;
62 freep->left = 0;
63 freep->right = 0;
64 return freep++;
65 }
67 static void
68 operand(int t)
69 {
70 Reinst *i;
72 if(lastwasand)
73 operator(CAT); /* catenate is implicit */
74 i = newinst(t);
76 if(t == CCLASS || t == NCCLASS)
77 i->cp = yyclassp;
78 if(t == RUNE)
79 i->r = yyrune;
81 pushand(i, i);
82 lastwasand = TRUE;
83 }
85 static void
86 operator(int t)
87 {
88 if(t==RBRA && --nbra<0)
89 rcerror("unmatched right paren");
90 if(t==LBRA){
91 if(++cursubid >= NSUBEXP)
92 rcerror ("too many subexpressions");
93 nbra++;
94 if(lastwasand)
95 operator(CAT);
96 } else
97 evaluntil(t);
98 if(t != RBRA)
99 pushator(t);
100 lastwasand = FALSE;
101 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
102 lastwasand = TRUE; /* these look like operands */
105 static void
106 regerr2(char *s, int c)
108 char buf[100];
109 char *cp = buf;
110 while(*s)
111 *cp++ = *s++;
112 *cp++ = c;
113 *cp = '\0';
114 rcerror(buf);
117 static void
118 cant(char *s)
120 char buf[100];
121 strcpy(buf, "can't happen: ");
122 strcat(buf, s);
123 rcerror(buf);
126 static void
127 pushand(Reinst *f, Reinst *l)
129 if(andp >= &andstack[NSTACK])
130 cant("operand stack overflow");
131 andp->first = f;
132 andp->last = l;
133 andp++;
136 static void
137 pushator(int t)
139 if(atorp >= &atorstack[NSTACK])
140 cant("operator stack overflow");
141 *atorp++ = t;
142 *subidp++ = cursubid;
145 static Node*
146 popand(int op)
148 Reinst *inst;
150 if(andp <= &andstack[0]){
151 regerr2("missing operand for ", op);
152 inst = newinst(NOP);
153 pushand(inst,inst);
155 return --andp;
158 static int
159 popator(void)
161 if(atorp <= &atorstack[0])
162 cant("operator stack underflow");
163 --subidp;
164 return *--atorp;
167 static void
168 evaluntil(int pri)
170 Node *op1, *op2;
171 Reinst *inst1, *inst2;
173 while(pri==RBRA || atorp[-1]>=pri){
174 switch(popator()){
175 default:
176 rcerror("unknown operator in evaluntil");
177 break;
178 case LBRA: /* must have been RBRA */
179 op1 = popand('(');
180 inst2 = newinst(RBRA);
181 inst2->subid = *subidp;
182 op1->last->next = inst2;
183 inst1 = newinst(LBRA);
184 inst1->subid = *subidp;
185 inst1->next = op1->first;
186 pushand(inst1, inst2);
187 return;
188 case OR:
189 op2 = popand('|');
190 op1 = popand('|');
191 inst2 = newinst(NOP);
192 op2->last->next = inst2;
193 op1->last->next = inst2;
194 inst1 = newinst(OR);
195 inst1->right = op1->first;
196 inst1->left = op2->first;
197 pushand(inst1, inst2);
198 break;
199 case CAT:
200 op2 = popand(0);
201 op1 = popand(0);
202 op1->last->next = op2->first;
203 pushand(op1->first, op2->last);
204 break;
205 case STAR:
206 op2 = popand('*');
207 inst1 = newinst(OR);
208 op2->last->next = inst1;
209 inst1->right = op2->first;
210 pushand(inst1, inst1);
211 break;
212 case PLUS:
213 op2 = popand('+');
214 inst1 = newinst(OR);
215 op2->last->next = inst1;
216 inst1->right = op2->first;
217 pushand(op2->first, inst1);
218 break;
219 case QUEST:
220 op2 = popand('?');
221 inst1 = newinst(OR);
222 inst2 = newinst(NOP);
223 inst1->left = inst2;
224 inst1->right = op2->first;
225 op2->last->next = inst2;
226 pushand(inst1, inst2);
227 break;
232 static Reprog*
233 optimize(Reprog *pp)
235 Reinst *inst, *target;
236 int size;
237 Reprog *npp;
238 Reclass *cl;
239 int diff;
241 /*
242 * get rid of NOOP chains
243 */
244 for(inst=pp->firstinst; inst->type!=END; inst++){
245 target = inst->next;
246 while(target->type == NOP)
247 target = target->next;
248 inst->next = target;
251 /*
252 * The original allocation is for an area larger than
253 * necessary. Reallocate to the actual space used
254 * and then relocate the code.
255 */
256 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
257 npp = realloc(pp, size);
258 if(npp==0 || npp==pp)
259 return pp;
260 diff = (char *)npp - (char *)pp;
261 freep = (Reinst *)((char *)freep + diff);
262 for(inst=npp->firstinst; inst<freep; inst++){
263 switch(inst->type){
264 case OR:
265 case STAR:
266 case PLUS:
267 case QUEST:
268 *(char **)&inst->right += diff;
269 break;
270 case CCLASS:
271 case NCCLASS:
272 *(char **)&inst->right += diff;
273 cl = inst->cp;
274 *(char **)&cl->end += diff;
275 break;
277 *(char **)&inst->left += diff;
279 *(char **)&npp->startinst += diff;
280 return npp;
283 #ifdef DEBUG
284 static void
285 dumpstack(void){
286 Node *stk;
287 int *ip;
289 print("operators\n");
290 for(ip=atorstack; ip<atorp; ip++)
291 print("0%o\n", *ip);
292 print("operands\n");
293 for(stk=andstack; stk<andp; stk++)
294 print("0%o\t0%o\n", stk->first->type, stk->last->type);
297 static void
298 dump(Reprog *pp)
300 Reinst *l;
301 Rune *p;
303 l = pp->firstinst;
304 do{
305 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
306 l->left-pp->firstinst, l->right-pp->firstinst);
307 if(l->type == RUNE)
308 print("\t%C\n", l->r);
309 else if(l->type == CCLASS || l->type == NCCLASS){
310 print("\t[");
311 if(l->type == NCCLASS)
312 print("^");
313 for(p = l->cp->spans; p < l->cp->end; p += 2)
314 if(p[0] == p[1])
315 print("%C", p[0]);
316 else
317 print("%C-%C", p[0], p[1]);
318 print("]\n");
319 } else
320 print("\n");
321 }while(l++->type);
323 #endif
325 static Reclass*
326 newclass(void)
328 if(nclass <= 0){
329 classp = mallocz(128*sizeof(Reclass), 1);
330 if(classp == nil)
331 regerror("out of memory");
332 nclass = 128;
334 return &classp[--nclass];
337 static int
338 nextc(Rune *rp)
340 if(lexdone){
341 *rp = 0;
342 return 1;
344 exprp += chartorune(rp, exprp);
345 if(*rp == L'\\'){
346 exprp += chartorune(rp, exprp);
347 return 1;
349 if(*rp == 0)
350 lexdone = 1;
351 return 0;
354 static int
355 lex(int literal, int dot_type)
357 int quoted;
359 quoted = nextc(&yyrune);
360 if(literal || quoted){
361 if(yyrune == 0)
362 return END;
363 return RUNE;
366 switch(yyrune){
367 case 0:
368 return END;
369 case L'*':
370 return STAR;
371 case L'?':
372 return QUEST;
373 case L'+':
374 return PLUS;
375 case L'|':
376 return OR;
377 case L'.':
378 return dot_type;
379 case L'(':
380 return LBRA;
381 case L')':
382 return RBRA;
383 case L'^':
384 return BOL;
385 case L'$':
386 return EOL;
387 case L'[':
388 return bldcclass();
390 return RUNE;
393 static int
394 bldcclass(void)
396 int type;
397 Rune r[NCCRUNE];
398 Rune *p, *ep, *np;
399 Rune rune;
400 int quoted;
402 /* we have already seen the '[' */
403 type = CCLASS;
404 yyclassp = newclass();
406 /* look ahead for negation */
407 /* SPECIAL CASE!!! negated classes don't match \n */
408 ep = r;
409 quoted = nextc(&rune);
410 if(!quoted && rune == L'^'){
411 type = NCCLASS;
412 quoted = nextc(&rune);
413 *ep++ = L'\n';
414 *ep++ = L'\n';
417 /* parse class into a set of spans */
418 for(; ep<&r[NCCRUNE];){
419 if(rune == 0){
420 rcerror("malformed '[]'");
421 return 0;
423 if(!quoted && rune == L']')
424 break;
425 if(!quoted && rune == L'-'){
426 if(ep == r){
427 rcerror("malformed '[]'");
428 return 0;
430 quoted = nextc(&rune);
431 if((!quoted && rune == L']') || rune == 0){
432 rcerror("malformed '[]'");
433 return 0;
435 *(ep-1) = rune;
436 } else {
437 *ep++ = rune;
438 *ep++ = rune;
440 quoted = nextc(&rune);
443 /* sort on span start */
444 for(p = r; p < ep; p += 2){
445 for(np = p; np < ep; np += 2)
446 if(*np < *p){
447 rune = np[0];
448 np[0] = p[0];
449 p[0] = rune;
450 rune = np[1];
451 np[1] = p[1];
452 p[1] = rune;
456 /* merge spans */
457 np = yyclassp->spans;
458 p = r;
459 if(r == ep)
460 yyclassp->end = np;
461 else {
462 np[0] = *p++;
463 np[1] = *p++;
464 for(; p < ep; p += 2)
465 if(p[0] <= np[1]){
466 if(p[1] > np[1])
467 np[1] = p[1];
468 } else {
469 np += 2;
470 np[0] = p[0];
471 np[1] = p[1];
473 yyclassp->end = np+2;
476 return type;
479 static Reprog*
480 regcomp1(char *s, int literal, int dot_type)
482 int token;
483 Reprog *pp;
485 /* get memory for the program */
486 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
487 if(pp == 0){
488 regerror("out of memory");
489 return 0;
491 freep = pp->firstinst;
492 classp = pp->class;
493 errors = 0;
495 if(setjmp(regkaboom))
496 goto out;
498 /* go compile the sucker */
499 lexdone = 0;
500 exprp = s;
501 nclass = NCLASS;
502 nbra = 0;
503 atorp = atorstack;
504 andp = andstack;
505 subidp = subidstack;
506 lastwasand = FALSE;
507 cursubid = 0;
509 /* Start with a low priority operator to prime parser */
510 pushator(START-1);
511 while((token = lex(literal, dot_type)) != END){
512 if((token&0300) == OPERATOR)
513 operator(token);
514 else
515 operand(token);
518 /* Close with a low priority operator */
519 evaluntil(START);
521 /* Force END */
522 operand(END);
523 evaluntil(START);
524 #ifdef DEBUG
525 dumpstack();
526 #endif
527 if(nbra)
528 rcerror("unmatched left paren");
529 --andp; /* points to first and only operand */
530 pp->startinst = andp->first;
531 #ifdef DEBUG
532 dump(pp);
533 #endif
534 pp = optimize(pp);
535 #ifdef DEBUG
536 print("start: %d\n", andp->first-pp->firstinst);
537 dump(pp);
538 #endif
539 out:
540 if(errors){
541 free(pp);
542 pp = 0;
544 return pp;
547 extern Reprog*
548 regcomp(char *s)
550 return regcomp1(s, 0, ANY);
553 extern Reprog*
554 regcomplit(char *s)
556 return regcomp1(s, 1, ANY);
559 extern Reprog*
560 regcompnl(char *s)
562 return regcomp1(s, 0, ANYNL);