Blob
1 /* From libregexp but leaks extra classes when it runs out */4 #include <u.h>5 #include <libc.h>6 #include "regexp.h"7 #include "/sys/src/libregexp/regcomp.h"9 #define TRUE 110 #define FALSE 012 /*13 * Parser Information14 */15 typedef16 struct Node17 {18 Reinst* first;19 Reinst* last;20 }Node;22 #define NSTACK 2023 static Node andstack[NSTACK];24 static Node *andp;25 static int atorstack[NSTACK];26 static int* atorp;27 static int cursubid; /* id of current subexpression */28 static int subidstack[NSTACK]; /* parallel to atorstack */29 static int* subidp;30 static int lastwasand; /* Last token was operand */31 static int nbra;32 static char* exprp; /* pointer to next character in source expression */33 static int lexdone;34 static int nclass;35 static Reclass*classp;36 static Reinst* freep;37 static int errors;38 static Rune yyrune; /* last lex'd rune */39 static Reclass*yyclassp; /* last lex'd class */41 /* predeclared crap */42 static void operator(int);43 static void pushand(Reinst*, Reinst*);44 static void pushator(int);45 static void evaluntil(int);46 static int bldcclass(void);48 static jmp_buf regkaboom;50 static void51 rcerror(char *s)52 {53 errors++;54 regerror(s);55 longjmp(regkaboom, 1);56 }58 static Reinst*59 newinst(int t)60 {61 freep->type = t;62 freep->left = 0;63 freep->right = 0;64 return freep++;65 }67 static void68 operand(int t)69 {70 Reinst *i;72 if(lastwasand)73 operator(CAT); /* catenate is implicit */74 i = newinst(t);76 if(t == CCLASS || t == NCCLASS)77 i->cp = yyclassp;78 if(t == RUNE)79 i->r = yyrune;81 pushand(i, i);82 lastwasand = TRUE;83 }85 static void86 operator(int t)87 {88 if(t==RBRA && --nbra<0)89 rcerror("unmatched right paren");90 if(t==LBRA){91 if(++cursubid >= NSUBEXP)92 rcerror ("too many subexpressions");93 nbra++;94 if(lastwasand)95 operator(CAT);96 } else97 evaluntil(t);98 if(t != RBRA)99 pushator(t);100 lastwasand = FALSE;101 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)102 lastwasand = TRUE; /* these look like operands */103 }105 static void106 regerr2(char *s, int c)107 {108 char buf[100];109 char *cp = buf;110 while(*s)111 *cp++ = *s++;112 *cp++ = c;113 *cp = '\0';114 rcerror(buf);115 }117 static void118 cant(char *s)119 {120 char buf[100];121 strcpy(buf, "can't happen: ");122 strcat(buf, s);123 rcerror(buf);124 }126 static void127 pushand(Reinst *f, Reinst *l)128 {129 if(andp >= &andstack[NSTACK])130 cant("operand stack overflow");131 andp->first = f;132 andp->last = l;133 andp++;134 }136 static void137 pushator(int t)138 {139 if(atorp >= &atorstack[NSTACK])140 cant("operator stack overflow");141 *atorp++ = t;142 *subidp++ = cursubid;143 }145 static Node*146 popand(int op)147 {148 Reinst *inst;150 if(andp <= &andstack[0]){151 regerr2("missing operand for ", op);152 inst = newinst(NOP);153 pushand(inst,inst);154 }155 return --andp;156 }158 static int159 popator(void)160 {161 if(atorp <= &atorstack[0])162 cant("operator stack underflow");163 --subidp;164 return *--atorp;165 }167 static void168 evaluntil(int pri)169 {170 Node *op1, *op2;171 Reinst *inst1, *inst2;173 while(pri==RBRA || atorp[-1]>=pri){174 switch(popator()){175 default:176 rcerror("unknown operator in evaluntil");177 break;178 case LBRA: /* must have been RBRA */179 op1 = popand('(');180 inst2 = newinst(RBRA);181 inst2->subid = *subidp;182 op1->last->next = inst2;183 inst1 = newinst(LBRA);184 inst1->subid = *subidp;185 inst1->next = op1->first;186 pushand(inst1, inst2);187 return;188 case OR:189 op2 = popand('|');190 op1 = popand('|');191 inst2 = newinst(NOP);192 op2->last->next = inst2;193 op1->last->next = inst2;194 inst1 = newinst(OR);195 inst1->right = op1->first;196 inst1->left = op2->first;197 pushand(inst1, inst2);198 break;199 case CAT:200 op2 = popand(0);201 op1 = popand(0);202 op1->last->next = op2->first;203 pushand(op1->first, op2->last);204 break;205 case STAR:206 op2 = popand('*');207 inst1 = newinst(OR);208 op2->last->next = inst1;209 inst1->right = op2->first;210 pushand(inst1, inst1);211 break;212 case PLUS:213 op2 = popand('+');214 inst1 = newinst(OR);215 op2->last->next = inst1;216 inst1->right = op2->first;217 pushand(op2->first, inst1);218 break;219 case QUEST:220 op2 = popand('?');221 inst1 = newinst(OR);222 inst2 = newinst(NOP);223 inst1->left = inst2;224 inst1->right = op2->first;225 op2->last->next = inst2;226 pushand(inst1, inst2);227 break;228 }229 }230 }232 static Reprog*233 optimize(Reprog *pp)234 {235 Reinst *inst, *target;236 int size;237 Reprog *npp;238 Reclass *cl;239 int diff;241 /*242 * get rid of NOOP chains243 */244 for(inst=pp->firstinst; inst->type!=END; inst++){245 target = inst->next;246 while(target->type == NOP)247 target = target->next;248 inst->next = target;249 }251 /*252 * The original allocation is for an area larger than253 * necessary. Reallocate to the actual space used254 * and then relocate the code.255 */256 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);257 npp = realloc(pp, size);258 if(npp==0 || npp==pp)259 return pp;260 diff = (char *)npp - (char *)pp;261 freep = (Reinst *)((char *)freep + diff);262 for(inst=npp->firstinst; inst<freep; inst++){263 switch(inst->type){264 case OR:265 case STAR:266 case PLUS:267 case QUEST:268 *(char **)&inst->right += diff;269 break;270 case CCLASS:271 case NCCLASS:272 *(char **)&inst->right += diff;273 cl = inst->cp;274 *(char **)&cl->end += diff;275 break;276 }277 *(char **)&inst->left += diff;278 }279 *(char **)&npp->startinst += diff;280 return npp;281 }283 #ifdef DEBUG284 static void285 dumpstack(void){286 Node *stk;287 int *ip;289 print("operators\n");290 for(ip=atorstack; ip<atorp; ip++)291 print("0%o\n", *ip);292 print("operands\n");293 for(stk=andstack; stk<andp; stk++)294 print("0%o\t0%o\n", stk->first->type, stk->last->type);295 }297 static void298 dump(Reprog *pp)299 {300 Reinst *l;301 Rune *p;303 l = pp->firstinst;304 do{305 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,306 l->left-pp->firstinst, l->right-pp->firstinst);307 if(l->type == RUNE)308 print("\t%C\n", l->r);309 else if(l->type == CCLASS || l->type == NCCLASS){310 print("\t[");311 if(l->type == NCCLASS)312 print("^");313 for(p = l->cp->spans; p < l->cp->end; p += 2)314 if(p[0] == p[1])315 print("%C", p[0]);316 else317 print("%C-%C", p[0], p[1]);318 print("]\n");319 } else320 print("\n");321 }while(l++->type);322 }323 #endif325 static Reclass*326 newclass(void)327 {328 if(nclass <= 0){329 classp = mallocz(128*sizeof(Reclass), 1);330 if(classp == nil)331 regerror("out of memory");332 nclass = 128;333 }334 return &classp[--nclass];335 }337 static int338 nextc(Rune *rp)339 {340 if(lexdone){341 *rp = 0;342 return 1;343 }344 exprp += chartorune(rp, exprp);345 if(*rp == L'\\'){346 exprp += chartorune(rp, exprp);347 return 1;348 }349 if(*rp == 0)350 lexdone = 1;351 return 0;352 }354 static int355 lex(int literal, int dot_type)356 {357 int quoted;359 quoted = nextc(&yyrune);360 if(literal || quoted){361 if(yyrune == 0)362 return END;363 return RUNE;364 }366 switch(yyrune){367 case 0:368 return END;369 case L'*':370 return STAR;371 case L'?':372 return QUEST;373 case L'+':374 return PLUS;375 case L'|':376 return OR;377 case L'.':378 return dot_type;379 case L'(':380 return LBRA;381 case L')':382 return RBRA;383 case L'^':384 return BOL;385 case L'$':386 return EOL;387 case L'[':388 return bldcclass();389 }390 return RUNE;391 }393 static int394 bldcclass(void)395 {396 int type;397 Rune r[NCCRUNE];398 Rune *p, *ep, *np;399 Rune rune;400 int quoted;402 /* we have already seen the '[' */403 type = CCLASS;404 yyclassp = newclass();406 /* look ahead for negation */407 /* SPECIAL CASE!!! negated classes don't match \n */408 ep = r;409 quoted = nextc(&rune);410 if(!quoted && rune == L'^'){411 type = NCCLASS;412 quoted = nextc(&rune);413 *ep++ = L'\n';414 *ep++ = L'\n';415 }417 /* parse class into a set of spans */418 for(; ep<&r[NCCRUNE];){419 if(rune == 0){420 rcerror("malformed '[]'");421 return 0;422 }423 if(!quoted && rune == L']')424 break;425 if(!quoted && rune == L'-'){426 if(ep == r){427 rcerror("malformed '[]'");428 return 0;429 }430 quoted = nextc(&rune);431 if((!quoted && rune == L']') || rune == 0){432 rcerror("malformed '[]'");433 return 0;434 }435 *(ep-1) = rune;436 } else {437 *ep++ = rune;438 *ep++ = rune;439 }440 quoted = nextc(&rune);441 }443 /* sort on span start */444 for(p = r; p < ep; p += 2){445 for(np = p; np < ep; np += 2)446 if(*np < *p){447 rune = np[0];448 np[0] = p[0];449 p[0] = rune;450 rune = np[1];451 np[1] = p[1];452 p[1] = rune;453 }454 }456 /* merge spans */457 np = yyclassp->spans;458 p = r;459 if(r == ep)460 yyclassp->end = np;461 else {462 np[0] = *p++;463 np[1] = *p++;464 for(; p < ep; p += 2)465 if(p[0] <= np[1]){466 if(p[1] > np[1])467 np[1] = p[1];468 } else {469 np += 2;470 np[0] = p[0];471 np[1] = p[1];472 }473 yyclassp->end = np+2;474 }476 return type;477 }479 static Reprog*480 regcomp1(char *s, int literal, int dot_type)481 {482 int token;483 Reprog *pp;485 /* get memory for the program */486 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));487 if(pp == 0){488 regerror("out of memory");489 return 0;490 }491 freep = pp->firstinst;492 classp = pp->class;493 errors = 0;495 if(setjmp(regkaboom))496 goto out;498 /* go compile the sucker */499 lexdone = 0;500 exprp = s;501 nclass = NCLASS;502 nbra = 0;503 atorp = atorstack;504 andp = andstack;505 subidp = subidstack;506 lastwasand = FALSE;507 cursubid = 0;509 /* Start with a low priority operator to prime parser */510 pushator(START-1);511 while((token = lex(literal, dot_type)) != END){512 if((token&0300) == OPERATOR)513 operator(token);514 else515 operand(token);516 }518 /* Close with a low priority operator */519 evaluntil(START);521 /* Force END */522 operand(END);523 evaluntil(START);524 #ifdef DEBUG525 dumpstack();526 #endif527 if(nbra)528 rcerror("unmatched left paren");529 --andp; /* points to first and only operand */530 pp->startinst = andp->first;531 #ifdef DEBUG532 dump(pp);533 #endif534 pp = optimize(pp);535 #ifdef DEBUG536 print("start: %d\n", andp->first-pp->firstinst);537 dump(pp);538 #endif539 out:540 if(errors){541 free(pp);542 pp = 0;543 }544 return pp;545 }547 extern Reprog*548 regcomp(char *s)549 {550 return regcomp1(s, 0, ANY);551 }553 extern Reprog*554 regcomplit(char *s)555 {556 return regcomp1(s, 1, ANY);557 }559 extern Reprog*560 regcompnl(char *s)561 {562 return regcomp1(s, 0, ANYNL);563 }