Blob


1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
26 #define DEBUG
27 #include <stdio.h>
28 #include <u.h>
29 #include <libc.h>
30 #include <ctype.h>
31 #include <bio.h>
32 #include <regexp.h>
33 #include "awk.h"
34 #include "y.tab.h"
36 /* This file provides the interface between the main body of
37 * awk and the pattern matching package. It preprocesses
38 * patterns prior to compilation to provide awk-like semantics
39 * to character sequences not supported by the pattern package.
40 * The following conversions are performed:
41 *
42 * "()" -> "[]*"
43 * "[-" -> "[\-"
44 * "[^-" -> "[^\-"
45 * "-]" -> "\-]"
46 * "[]" -> "[]*"
47 * "\xdddd" -> "\z" where 'z' is the UTF sequence
48 * for the hex value
49 * "\ddd" -> "\o" where 'o' is a char octal value
50 * "\b" -> "\B" where 'B' is backspace
51 * "\t" -> "\T" where 'T' is tab
52 * "\f" -> "\F" where 'F' is form feed
53 * "\n" -> "\N" where 'N' is newline
54 * "\r" -> "\C" where 'C' is cr
55 */
57 #define MAXRE 512
59 static char re[MAXRE]; /* copy buffer */
61 char *patbeg;
62 int patlen; /* number of chars in pattern */
64 #define NPATS 20 /* number of slots in pattern cache */
66 static struct pat_list /* dynamic pattern cache */
67 {
68 char *re;
69 int use;
70 Reprog *program;
71 } pattern[NPATS];
73 static int npats; /* cache fill level */
75 /* Compile a pattern */
76 void
77 *compre(char *pat)
78 {
79 int i, j, inclass;
80 char c, *p, *s;
81 Reprog *program;
83 if (!compile_time) { /* search cache for dynamic pattern */
84 for (i = 0; i < npats; i++)
85 if (!strcmp(pat, pattern[i].re)) {
86 pattern[i].use++;
87 return((void *) pattern[i].program);
88 }
89 }
90 /* Preprocess Pattern for compilation */
91 p = re;
92 s = pat;
93 inclass = 0;
94 while (c = *s++) {
95 if (c == '\\') {
96 quoted(&s, &p, re+MAXRE);
97 continue;
98 }
99 else if (!inclass && c == '(' && *s == ')') {
100 if (p < re+MAXRE-2) { /* '()' -> '[]*' */
101 *p++ = '[';
102 *p++ = ']';
103 c = '*';
104 s++;
106 else overflow();
108 else if (c == '['){ /* '[-' -> '[\-' */
109 inclass = 1;
110 if (*s == '-') {
111 if (p < re+MAXRE-2) {
112 *p++ = '[';
113 *p++ = '\\';
114 c = *s++;
116 else overflow();
117 } /* '[^-' -> '[^\-'*/
118 else if (*s == '^' && s[1] == '-'){
119 if (p < re+MAXRE-3) {
120 *p++ = '[';
121 *p++ = *s++;
122 *p++ = '\\';
123 c = *s++;
125 else overflow();
127 else if (*s == '['){ /* skip '[[' */
128 if (p < re+MAXRE-1)
129 *p++ = c;
130 else overflow();
131 c = *s++;
133 else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
134 if (p < re+MAXRE-2) {
135 *p++ = c;
136 *p++ = *s++;
137 c = *s++;
139 else overflow();
141 else if (*s == ']') { /* '[]' -> '[]*' */
142 if (p < re+MAXRE-2) {
143 *p++ = c;
144 *p++ = *s++;
145 c = '*';
146 inclass = 0;
148 else overflow();
151 else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
152 if (p < re+MAXRE-1)
153 *p++ = '\\';
154 else overflow();
156 else if (c == ']')
157 inclass = 0;
158 if (p < re+MAXRE-1)
159 *p++ = c;
160 else overflow();
162 *p = 0;
163 program = regcomp(re); /* compile pattern */
164 if (!compile_time) {
165 if (npats < NPATS) /* Room in cache */
166 i = npats++;
167 else { /* Throw out least used */
168 int use = pattern[0].use;
169 i = 0;
170 for (j = 1; j < NPATS; j++) {
171 if (pattern[j].use < use) {
172 use = pattern[j].use;
173 i = j;
176 xfree(pattern[i].program);
177 xfree(pattern[i].re);
179 pattern[i].re = tostring(pat);
180 pattern[i].program = program;
181 pattern[i].use = 1;
183 return((void *) program);
186 /* T/F match indication - matched string not exported */
187 int
188 match(void *p, char *s, char *start)
190 return regexec((Reprog *) p, (char *) s, 0, 0);
193 /* match and delimit the matched string */
194 int
195 pmatch(void *p, char *s, char *start)
197 Resub m;
199 m.s.sp = start;
200 m.e.ep = 0;
201 if (regexec((Reprog *) p, (char *) s, &m, 1)) {
202 patbeg = m.s.sp;
203 patlen = m.e.ep-m.s.sp;
204 return 1;
206 patlen = -1;
207 patbeg = start;
208 return 0;
211 /* perform a non-empty match */
212 int
213 nematch(void *p, char *s, char *start)
215 if (pmatch(p, s, start) == 1 && patlen > 0)
216 return 1;
217 patlen = -1;
218 patbeg = start;
219 return 0;
/* in the parsing of regular expressions, metacharacters like . have */
/* to be seen literally; \056 is not a metacharacter. */

/*
 * hexstr: find and evaluate the hex string at *pp.
 * At most four hexadecimal digits are consumed.  Returns their value
 * (0 when *pp does not start with a hex digit) and advances *pp past
 * the digits that were used.
 */
int
hexstr(char **pp)
{
	char *p = *pp;
	int n = 0;
	int i;

	for (i = 0; i < 4; i++) {
		/* <ctype.h> classifiers are undefined for negative arguments */
		unsigned char c = (unsigned char) p[i];

		if (!isxdigit(c))
			break;
		if (isdigit(c))
			n = 16 * n + c - '0';
		else if ('a' <= c && c <= 'f')
			n = 16 * n + c - 'a' + 10;
		else if ('A' <= c && c <= 'F')
			n = 16 * n + c - 'A' + 10;
	}
	*pp += i;
	return n;
}
/* look for awk-specific escape sequences */

#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */

/*
 * quoted: handle one escaped sequence.
 *
 * On entry *s points at the character following a backslash and *to at
 * the next free byte of the output buffer; end is one past the end of
 * that buffer.  The translated sequence is appended to the output and
 * both cursors are advanced.
 *
 *	\t \n \f \r \b	are replaced by the control character itself
 *	\xdddd		becomes a backslash followed by the multibyte
 *			encoding of the hex value
 *	\d \dd \ddd	becomes a backslash followed by the byte with
 *			that octal value
 *	anything else	is copied through with its backslash
 *
 * A backslash that is the last character of the pattern is emitted as
 * an escaped backslash and *s is left pointing at the terminating NUL,
 * so the caller's scan loop stops there instead of running past the
 * end of the pattern.
 *
 * overflow() is called whenever the output does not fit.
 */
void
quoted(char **s, char **to, char *end)	/* handle escaped sequence */
{
	char *p = *s;
	char *t = *to;
	wchar_t c;
	int len;

	switch(c = *p++) {
	case 't':
		c = '\t';
		break;
	case 'n':
		c = '\n';
		break;
	case 'f':
		c = '\f';
		break;
	case 'r':
		c = '\r';
		break;
	case 'b':
		c = '\b';
		break;
	case '\0':		/* pattern ends in a lone backslash */
		p--;		/* leave the terminator for the caller to see */
		if (t < end-2) {
			*t++ = '\\';
			*t++ = '\\';
		} else
			overflow();
		*to = t;
		*s = p;
		return;
	default:
		if (t < end-1)		/* all else must be escaped */
			*t++ = '\\';
		else
			overflow();
		if (c == 'x') {		/* hexadecimal goo follows */
			c = hexstr(&p);
			if (t < end-MB_CUR_MAX) {
				len = wctomb(t, c);
				if (len < 0)
					t--;	/* not encodable: drop the backslash as well */
				else
					t += len;
			} else
				overflow();
			*to = t;
			*s = p;
			return;
		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
			c -= '0';
			if (isoctdigit(*p)) {
				c = 8 * c + *p++ - '0';
				if (isoctdigit(*p))
					c = 8 * c + *p++ - '0';
			}
		}
		break;
	}
	if (t < end-1)
		*t++ = c;
	else
		overflow();
	*s = p;
	*to = t;
}
/*
 * count rune positions: the number of characters found in the first n
 * bytes of s, stopping early at a NUL.  A byte that mblen() cannot
 * decode counts as one position.
 */
int
countposn(char *s, int n)
{
	int i, j;
	char *end;

	/* test the bound before dereferencing so nothing at or past end is read */
	for (i = 0, end = s+n; s < end && *s; i++){
		/* let mblen() examine only the bytes still inside the range */
		j = mblen(s, (size_t)(end - s));
		if(j <= 0)
			j = 1;
		s += j;
	}
	return(i);
}
/*
 * pattern package error handler: reports the message s through FATAL.
 */
void
regerror(char *s)
{
	char *msg = s;

	FATAL("%s", msg);
}
/*
 * overflow: report through FATAL that the rewritten pattern does not
 * fit in the copy buffer.
 */
void
overflow(void)
{
	char *msg = "regular expression too big";

	FATAL("%s", msg);
}