Blob
1 #include "lib9.h"2 #include "regexp9.h"3 #include "regcomp.h"5 #define TRUE 16 #define FALSE 08 /*9 * Parser Information10 */11 typedef12 struct Node13 {14 Reinst* first;15 Reinst* last;16 }Node;18 Reprog RePrOg;20 #define NSTACK 2021 static Node andstack[NSTACK];22 static Node *andp;23 static int atorstack[NSTACK];24 static int* atorp;25 static int cursubid; /* id of current subexpression */26 static int subidstack[NSTACK]; /* parallel to atorstack */27 static int* subidp;28 static int lastwasand; /* Last token was operand */29 static int nbra;30 static char* exprp; /* pointer to next character in source expression */31 static int lexdone;32 static int nclass;33 static Reclass*classp;34 static Reinst* freep;35 static int errors;36 static Rune yyrune; /* last lex'd rune */37 static Reclass*yyclassp; /* last lex'd class */39 /* predeclared crap */40 static void operator(int);41 static void pushand(Reinst*, Reinst*);42 static void pushator(int);43 static void evaluntil(int);44 static int bldcclass(void);46 static jmp_buf regkaboom;48 static void49 rcerror(char *s)50 {51 errors++;52 regerror(s);53 longjmp(regkaboom, 1);54 }56 static Reinst*57 newinst(int t)58 {59 freep->type = t;60 freep->u2.left = 0;61 freep->u1.right = 0;62 return freep++;63 }65 static void66 operand(int t)67 {68 Reinst *i;70 if(lastwasand)71 operator(CAT); /* catenate is implicit */72 i = newinst(t);74 if(t == CCLASS || t == NCCLASS)75 i->u1.cp = yyclassp;76 if(t == RUNE)77 i->u1.r = yyrune;79 pushand(i, i);80 lastwasand = TRUE;81 }83 static void84 operator(int t)85 {86 if(t==RBRA && --nbra<0)87 rcerror("unmatched right paren");88 if(t==LBRA){89 if(++cursubid >= NSUBEXP)90 rcerror ("too many subexpressions");91 nbra++;92 if(lastwasand)93 operator(CAT);94 } else95 evaluntil(t);96 if(t != RBRA)97 pushator(t);98 lastwasand = FALSE;99 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)100 lastwasand = TRUE; /* these look like operands */101 }103 static void104 regerr2(char *s, int c)105 {106 char buf[100];107 char *cp = buf;108 while(*s)109 *cp++ = *s++;110 *cp++ = c;111 *cp = '\0';112 rcerror(buf);113 }115 static void116 cant(char *s)117 {118 char buf[100];119 strcpy(buf, "can't happen: ");120 strcat(buf, s);121 rcerror(buf);122 }124 static void125 pushand(Reinst *f, Reinst *l)126 {127 if(andp >= &andstack[NSTACK])128 cant("operand stack overflow");129 andp->first = f;130 andp->last = l;131 andp++;132 }134 static void135 pushator(int t)136 {137 if(atorp >= &atorstack[NSTACK])138 cant("operator stack overflow");139 *atorp++ = t;140 *subidp++ = cursubid;141 }143 static Node*144 popand(int op)145 {146 Reinst *inst;148 if(andp <= &andstack[0]){149 regerr2("missing operand for ", op);150 inst = newinst(NOP);151 pushand(inst,inst);152 }153 return --andp;154 }156 static int157 popator(void)158 {159 if(atorp <= &atorstack[0])160 cant("operator stack underflow");161 --subidp;162 return *--atorp;163 }165 static void166 evaluntil(int pri)167 {168 Node *op1, *op2;169 Reinst *inst1, *inst2;171 while(pri==RBRA || atorp[-1]>=pri){172 switch(popator()){173 default:174 rcerror("unknown operator in evaluntil");175 break;176 case LBRA: /* must have been RBRA */177 op1 = popand('(');178 inst2 = newinst(RBRA);179 inst2->u1.subid = *subidp;180 op1->last->u2.next = inst2;181 inst1 = newinst(LBRA);182 inst1->u1.subid = *subidp;183 inst1->u2.next = op1->first;184 pushand(inst1, inst2);185 return;186 case OR:187 op2 = popand('|');188 op1 = popand('|');189 inst2 = newinst(NOP);190 op2->last->u2.next = inst2;191 op1->last->u2.next = inst2;192 inst1 = newinst(OR);193 inst1->u1.right = op1->first;194 inst1->u2.left = op2->first;195 pushand(inst1, inst2);196 break;197 case CAT:198 op2 = popand(0);199 op1 = popand(0);200 op1->last->u2.next = op2->first;201 pushand(op1->first, op2->last);202 break;203 case STAR:204 op2 = popand('*');205 inst1 = newinst(OR);206 op2->last->u2.next = inst1;207 inst1->u1.right = op2->first;208 pushand(inst1, inst1);209 break;210 case PLUS:211 op2 = popand('+');212 inst1 = newinst(OR);213 op2->last->u2.next = inst1;214 inst1->u1.right = op2->first;215 pushand(op2->first, inst1);216 break;217 case QUEST:218 op2 = popand('?');219 inst1 = newinst(OR);220 inst2 = newinst(NOP);221 inst1->u2.left = inst2;222 inst1->u1.right = op2->first;223 op2->last->u2.next = inst2;224 pushand(inst1, inst2);225 break;226 }227 }228 }230 static Reprog*231 optimize(Reprog *pp)232 {233 Reinst *inst, *target;234 int size;235 Reprog *npp;236 Reclass *cl;237 int diff;239 /*240 * get rid of NOOP chains241 */242 for(inst=pp->firstinst; inst->type!=END; inst++){243 target = inst->u2.next;244 while(target->type == NOP)245 target = target->u2.next;246 inst->u2.next = target;247 }249 /*250 * The original allocation is for an area larger than251 * necessary. Reallocate to the actual space used252 * and then relocate the code.253 */254 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);255 npp = (Reprog *)realloc(pp, size);256 if(npp==0 || npp==pp)257 return pp;258 diff = (char *)npp - (char *)pp;259 freep = (Reinst *)((char *)freep + diff);260 for(inst=npp->firstinst; inst<freep; inst++){261 switch(inst->type){262 case OR:263 case STAR:264 case PLUS:265 case QUEST:266 *(char **)&inst->u1.right += diff;267 break;268 case CCLASS:269 case NCCLASS:270 *(char **)&inst->u1.right += diff;271 cl = inst->u1.cp;272 *(char **)&cl->end += diff;273 break;274 }275 *(char **)&inst->u2.left += diff;276 }277 *(char **)&npp->startinst += diff;278 return npp;279 }281 #ifdef DEBUG282 static void283 dumpstack(void){284 Node *stk;285 int *ip;287 print("operators\n");288 for(ip=atorstack; ip<atorp; ip++)289 print("0%o\n", *ip);290 print("operands\n");291 for(stk=andstack; stk<andp; stk++)292 print("0%o\t0%o\n", stk->first->type, stk->last->type);293 }295 static void296 dump(Reprog *pp)297 {298 Reinst *l;299 Rune *p;301 l = pp->firstinst;302 do{303 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,304 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);305 if(l->type == RUNE)306 print("\t%C\n", l->r);307 else if(l->type == CCLASS || l->type == NCCLASS){308 print("\t[");309 if(l->type == NCCLASS)310 print("^");311 for(p = l->cp->spans; p < l->cp->end; p += 2)312 if(p[0] == p[1])313 print("%C", p[0]);314 else315 print("%C-%C", p[0], p[1]);316 print("]\n");317 } else318 print("\n");319 }while(l++->type);320 }321 #endif323 static Reclass*324 newclass(void)325 {326 if(nclass >= NCLASS)327 regerr2("too many character classes; limit", NCLASS+'0');328 return &(classp[nclass++]);329 }331 static int332 nextc(Rune *rp)333 {334 if(lexdone){335 *rp = 0;336 return 1;337 }338 exprp += chartorune(rp, exprp);339 if(*rp == L'\\'){340 exprp += chartorune(rp, exprp);341 return 1;342 }343 if(*rp == 0)344 lexdone = 1;345 return 0;346 }348 static int349 lex(int literal, int dot_type)350 {351 int quoted;353 quoted = nextc(&yyrune);354 if(literal || quoted){355 if(yyrune == 0)356 return END;357 return RUNE;358 }360 switch(yyrune){361 case 0:362 return END;363 case L'*':364 return STAR;365 case L'?':366 return QUEST;367 case L'+':368 return PLUS;369 case L'|':370 return OR;371 case L'.':372 return dot_type;373 case L'(':374 return LBRA;375 case L')':376 return RBRA;377 case L'^':378 return BOL;379 case L'$':380 return EOL;381 case L'[':382 return bldcclass();383 }384 return RUNE;385 }387 static int388 bldcclass(void)389 {390 int type;391 Rune r[NCCRUNE];392 Rune *p, *ep, *np;393 Rune rune;394 int quoted;396 /* we have already seen the '[' */397 type = CCLASS;398 yyclassp = newclass();400 /* look ahead for negation */401 /* SPECIAL CASE!!! negated classes don't match \n */402 ep = r;403 quoted = nextc(&rune);404 if(!quoted && rune == L'^'){405 type = NCCLASS;406 quoted = nextc(&rune);407 *ep++ = L'\n';408 *ep++ = L'\n';409 }411 /* parse class into a set of spans */412 for(; ep<&r[NCCRUNE];){413 if(rune == 0){414 rcerror("malformed '[]'");415 return 0;416 }417 if(!quoted && rune == L']')418 break;419 if(!quoted && rune == L'-'){420 if(ep == r){421 rcerror("malformed '[]'");422 return 0;423 }424 quoted = nextc(&rune);425 if((!quoted && rune == L']') || rune == 0){426 rcerror("malformed '[]'");427 return 0;428 }429 *(ep-1) = rune;430 } else {431 *ep++ = rune;432 *ep++ = rune;433 }434 quoted = nextc(&rune);435 }437 /* sort on span start */438 for(p = r; p < ep; p += 2){439 for(np = p; np < ep; np += 2)440 if(*np < *p){441 rune = np[0];442 np[0] = p[0];443 p[0] = rune;444 rune = np[1];445 np[1] = p[1];446 p[1] = rune;447 }448 }450 /* merge spans */451 np = yyclassp->spans;452 p = r;453 if(r == ep)454 yyclassp->end = np;455 else {456 np[0] = *p++;457 np[1] = *p++;458 for(; p < ep; p += 2)459 if(p[0] <= np[1]){460 if(p[1] > np[1])461 np[1] = p[1];462 } else {463 np += 2;464 np[0] = p[0];465 np[1] = p[1];466 }467 yyclassp->end = np+2;468 }470 return type;471 }473 static Reprog*474 regcomp1(char *s, int literal, int dot_type)475 {476 int token;477 Reprog *pp;479 /* get memory for the program */480 pp = (Reprog *)malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));481 if(pp == 0){482 regerror("out of memory");483 return 0;484 }485 freep = pp->firstinst;486 classp = pp->class;487 errors = 0;489 if(setjmp(regkaboom))490 goto out;492 /* go compile the sucker */493 lexdone = 0;494 exprp = s;495 nclass = 0;496 nbra = 0;497 atorp = atorstack;498 andp = andstack;499 subidp = subidstack;500 lastwasand = FALSE;501 cursubid = 0;503 /* Start with a low priority operator to prime parser */504 pushator(START-1);505 while((token = lex(literal, dot_type)) != END){506 if((token&0300) == OPERATOR)507 operator(token);508 else509 operand(token);510 }512 /* Close with a low priority operator */513 evaluntil(START);515 /* Force END */516 operand(END);517 evaluntil(START);518 #ifdef DEBUG519 dumpstack();520 #endif521 if(nbra)522 rcerror("unmatched left paren");523 --andp; /* points to first and only operand */524 pp->startinst = andp->first;525 #ifdef DEBUG526 dump(pp);527 #endif528 pp = optimize(pp);529 #ifdef DEBUG530 print("start: %d\n", andp->first-pp->firstinst);531 dump(pp);532 #endif533 out:534 if(errors){535 free(pp);536 pp = 0;537 }538 return pp;539 }541 extern Reprog*542 regcomp(char *s)543 {544 return regcomp1(s, 0, ANY);545 }547 extern Reprog*548 regcomplit(char *s)549 {550 return regcomp1(s, 1, ANY);551 }553 extern Reprog*554 regcompnl(char *s)555 {556 return regcomp1(s, 0, ANYNL);557 }