Blob
1 #include "lib9.h"2 #include "regexp9.h"3 #include "regcomp.h"5 #define TRUE 16 #define FALSE 08 /*9 * Parser Information10 */11 typedef12 struct Node13 {14 Reinst* first;15 Reinst* last;16 }Node;18 #define NSTACK 2019 static Node andstack[NSTACK];20 static Node *andp;21 static int atorstack[NSTACK];22 static int* atorp;23 static int cursubid; /* id of current subexpression */24 static int subidstack[NSTACK]; /* parallel to atorstack */25 static int* subidp;26 static int lastwasand; /* Last token was operand */27 static int nbra;28 static char* exprp; /* pointer to next character in source expression */29 static int lexdone;30 static int nclass;31 static Reclass*classp;32 static Reinst* freep;33 static int errors;34 static Rune yyrune; /* last lex'd rune */35 static Reclass*yyclassp; /* last lex'd class */37 /* predeclared crap */38 static void operator(int);39 static void pushand(Reinst*, Reinst*);40 static void pushator(int);41 static void evaluntil(int);42 static int bldcclass(void);44 static jmp_buf regkaboom;46 static void47 rcerror(char *s)48 {49 errors++;50 regerror(s);51 longjmp(regkaboom, 1);52 }54 static Reinst*55 newinst(int t)56 {57 freep->type = t;58 freep->u2.left = 0;59 freep->u1.right = 0;60 return freep++;61 }63 static void64 operand(int t)65 {66 Reinst *i;68 if(lastwasand)69 operator(CAT); /* catenate is implicit */70 i = newinst(t);72 if(t == CCLASS || t == NCCLASS)73 i->u1.cp = yyclassp;74 if(t == RUNE)75 i->u1.r = yyrune;77 pushand(i, i);78 lastwasand = TRUE;79 }81 static void82 operator(int t)83 {84 if(t==RBRA && --nbra<0)85 rcerror("unmatched right paren");86 if(t==LBRA){87 if(++cursubid >= NSUBEXP)88 rcerror ("too many subexpressions");89 nbra++;90 if(lastwasand)91 operator(CAT);92 } else93 evaluntil(t);94 if(t != RBRA)95 pushator(t);96 lastwasand = FALSE;97 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)98 lastwasand = TRUE; /* these look like operands */99 }101 static void102 regerr2(char *s, int c)103 {104 char buf[100];105 char *cp = buf;106 while(*s)107 *cp++ = *s++;108 *cp++ = c;109 *cp = '\0';110 rcerror(buf);111 }113 static void114 cant(char *s)115 {116 char buf[100];117 strcpy(buf, "can't happen: ");118 strcat(buf, s);119 rcerror(buf);120 }122 static void123 pushand(Reinst *f, Reinst *l)124 {125 if(andp >= &andstack[NSTACK])126 cant("operand stack overflow");127 andp->first = f;128 andp->last = l;129 andp++;130 }132 static void133 pushator(int t)134 {135 if(atorp >= &atorstack[NSTACK])136 cant("operator stack overflow");137 *atorp++ = t;138 *subidp++ = cursubid;139 }141 static Node*142 popand(int op)143 {144 Reinst *inst;146 if(andp <= &andstack[0]){147 regerr2("missing operand for ", op);148 inst = newinst(NOP);149 pushand(inst,inst);150 }151 return --andp;152 }154 static int155 popator(void)156 {157 if(atorp <= &atorstack[0])158 cant("operator stack underflow");159 --subidp;160 return *--atorp;161 }163 static void164 evaluntil(int pri)165 {166 Node *op1, *op2;167 Reinst *inst1, *inst2;169 while(pri==RBRA || atorp[-1]>=pri){170 switch(popator()){171 default:172 rcerror("unknown operator in evaluntil");173 break;174 case LBRA: /* must have been RBRA */175 op1 = popand('(');176 inst2 = newinst(RBRA);177 inst2->u1.subid = *subidp;178 op1->last->u2.next = inst2;179 inst1 = newinst(LBRA);180 inst1->u1.subid = *subidp;181 inst1->u2.next = op1->first;182 pushand(inst1, inst2);183 return;184 case OR:185 op2 = popand('|');186 op1 = popand('|');187 inst2 = newinst(NOP);188 op2->last->u2.next = inst2;189 op1->last->u2.next = inst2;190 inst1 = newinst(OR);191 inst1->u1.right = op1->first;192 inst1->u2.left = op2->first;193 pushand(inst1, inst2);194 break;195 case CAT:196 op2 = popand(0);197 op1 = popand(0);198 op1->last->u2.next = op2->first;199 pushand(op1->first, op2->last);200 break;201 case STAR:202 op2 = popand('*');203 inst1 = newinst(OR);204 op2->last->u2.next = inst1;205 inst1->u1.right = op2->first;206 pushand(inst1, inst1);207 break;208 case PLUS:209 op2 = popand('+');210 inst1 = newinst(OR);211 op2->last->u2.next = inst1;212 inst1->u1.right = op2->first;213 pushand(op2->first, inst1);214 break;215 case QUEST:216 op2 = popand('?');217 inst1 = newinst(OR);218 inst2 = newinst(NOP);219 inst1->u2.left = inst2;220 inst1->u1.right = op2->first;221 op2->last->u2.next = inst2;222 pushand(inst1, inst2);223 break;224 }225 }226 }228 static Reprog*229 optimize(Reprog *pp)230 {231 Reinst *inst, *target;232 int size;233 Reprog *npp;234 Reclass *cl;235 int diff;237 /*238 * get rid of NOOP chains239 */240 for(inst=pp->firstinst; inst->type!=END; inst++){241 target = inst->u2.next;242 while(target->type == NOP)243 target = target->u2.next;244 inst->u2.next = target;245 }247 /*248 * The original allocation is for an area larger than249 * necessary. Reallocate to the actual space used250 * and then relocate the code.251 */252 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);253 npp = realloc(pp, size);254 if(npp==0 || npp==pp)255 return pp;256 diff = (char *)npp - (char *)pp;257 freep = (Reinst *)((char *)freep + diff);258 for(inst=npp->firstinst; inst<freep; inst++){259 switch(inst->type){260 case OR:261 case STAR:262 case PLUS:263 case QUEST:264 *(char **)&inst->u1.right += diff;265 break;266 case CCLASS:267 case NCCLASS:268 *(char **)&inst->u1.right += diff;269 cl = inst->u1.cp;270 *(char **)&cl->end += diff;271 break;272 }273 *(char **)&inst->u2.left += diff;274 }275 *(char **)&npp->startinst += diff;276 return npp;277 }279 #ifdef DEBUG280 static void281 dumpstack(void){282 Node *stk;283 int *ip;285 print("operators\n");286 for(ip=atorstack; ip<atorp; ip++)287 print("0%o\n", *ip);288 print("operands\n");289 for(stk=andstack; stk<andp; stk++)290 print("0%o\t0%o\n", stk->first->type, stk->last->type);291 }293 static void294 dump(Reprog *pp)295 {296 Reinst *l;297 Rune *p;299 l = pp->firstinst;300 do{301 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,302 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);303 if(l->type == RUNE)304 print("\t%C\n", l->u1.r);305 else if(l->type == CCLASS || l->type == NCCLASS){306 print("\t[");307 if(l->type == NCCLASS)308 print("^");309 for(p = l->u1.cp->spans; p < l->u1.cp->end; p += 2)310 if(p[0] == p[1])311 print("%C", p[0]);312 else313 print("%C-%C", p[0], p[1]);314 print("]\n");315 } else316 print("\n");317 }while(l++->type);318 }319 #endif321 static Reclass*322 newclass(void)323 {324 if(nclass >= NCLASS)325 regerr2("too many character classes; limit", NCLASS+'0');326 return &(classp[nclass++]);327 }329 static int330 nextc(Rune *rp)331 {332 if(lexdone){333 *rp = 0;334 return 1;335 }336 exprp += chartorune(rp, exprp);337 if(*rp == L'\\'){338 exprp += chartorune(rp, exprp);339 return 1;340 }341 if(*rp == 0)342 lexdone = 1;343 return 0;344 }346 static int347 lex(int literal, int dot_type)348 {349 int quoted;351 quoted = nextc(&yyrune);352 if(literal || quoted){353 if(yyrune == 0)354 return END;355 return RUNE;356 }358 switch(yyrune){359 case 0:360 return END;361 case L'*':362 return STAR;363 case L'?':364 return QUEST;365 case L'+':366 return PLUS;367 case L'|':368 return OR;369 case L'.':370 return dot_type;371 case L'(':372 return LBRA;373 case L')':374 return RBRA;375 case L'^':376 return BOL;377 case L'$':378 return EOL;379 case L'[':380 return bldcclass();381 }382 return RUNE;383 }385 static int386 bldcclass(void)387 {388 int type;389 Rune r[NCCRUNE];390 Rune *p, *ep, *np;391 Rune rune;392 int quoted;394 /* we have already seen the '[' */395 type = CCLASS;396 yyclassp = newclass();398 /* look ahead for negation */399 /* SPECIAL CASE!!! negated classes don't match \n */400 ep = r;401 quoted = nextc(&rune);402 if(!quoted && rune == L'^'){403 type = NCCLASS;404 quoted = nextc(&rune);405 *ep++ = L'\n';406 *ep++ = L'\n';407 }409 /* parse class into a set of spans */410 for(; ep<&r[NCCRUNE];){411 if(rune == 0){412 rcerror("malformed '[]'");413 return 0;414 }415 if(!quoted && rune == L']')416 break;417 if(!quoted && rune == L'-'){418 if(ep == r){419 rcerror("malformed '[]'");420 return 0;421 }422 quoted = nextc(&rune);423 if((!quoted && rune == L']') || rune == 0){424 rcerror("malformed '[]'");425 return 0;426 }427 *(ep-1) = rune;428 } else {429 *ep++ = rune;430 *ep++ = rune;431 }432 quoted = nextc(&rune);433 }435 /* sort on span start */436 for(p = r; p < ep; p += 2){437 for(np = p; np < ep; np += 2)438 if(*np < *p){439 rune = np[0];440 np[0] = p[0];441 p[0] = rune;442 rune = np[1];443 np[1] = p[1];444 p[1] = rune;445 }446 }448 /* merge spans */449 np = yyclassp->spans;450 p = r;451 if(r == ep)452 yyclassp->end = np;453 else {454 np[0] = *p++;455 np[1] = *p++;456 for(; p < ep; p += 2)457 if(p[0] <= np[1]){458 if(p[1] > np[1])459 np[1] = p[1];460 } else {461 np += 2;462 np[0] = p[0];463 np[1] = p[1];464 }465 yyclassp->end = np+2;466 }468 return type;469 }471 static Reprog*472 regcomp1(char *s, int literal, int dot_type)473 {474 int token;475 Reprog *pp;477 /* get memory for the program */478 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));479 if(pp == 0){480 regerror("out of memory");481 return 0;482 }483 freep = pp->firstinst;484 classp = pp->class;485 errors = 0;487 if(setjmp(regkaboom))488 goto out;490 /* go compile the sucker */491 lexdone = 0;492 exprp = s;493 nclass = 0;494 nbra = 0;495 atorp = atorstack;496 andp = andstack;497 subidp = subidstack;498 lastwasand = FALSE;499 cursubid = 0;501 /* Start with a low priority operator to prime parser */502 pushator(START-1);503 while((token = lex(literal, dot_type)) != END){504 if((token&0300) == OPERATOR)505 operator(token);506 else507 operand(token);508 }510 /* Close with a low priority operator */511 evaluntil(START);513 /* Force END */514 operand(END);515 evaluntil(START);516 #ifdef DEBUG517 dumpstack();518 #endif519 if(nbra)520 rcerror("unmatched left paren");521 --andp; /* points to first and only operand */522 pp->startinst = andp->first;523 #ifdef DEBUG524 dump(pp);525 #endif526 pp = optimize(pp);527 #ifdef DEBUG528 print("start: %d\n", andp->first-pp->firstinst);529 dump(pp);530 #endif531 out:532 if(errors){533 free(pp);534 pp = 0;535 }536 return pp;537 }539 extern Reprog*540 regcomp(char *s)541 {542 return regcomp1(s, 0, ANY);543 }545 extern Reprog*546 regcomplit(char *s)547 {548 return regcomp1(s, 1, ANY);549 }551 extern Reprog*552 regcompnl(char *s)553 {554 return regcomp1(s, 0, ANYNL);555 }