Blame


1 63a68686 2008-11-03 jas /****************************************************************
2 63a68686 2008-11-03 jas Copyright (C) Lucent Technologies 1997
3 63a68686 2008-11-03 jas All Rights Reserved
4 63a68686 2008-11-03 jas
5 63a68686 2008-11-03 jas Permission to use, copy, modify, and distribute this software and
6 63a68686 2008-11-03 jas its documentation for any purpose and without fee is hereby
7 63a68686 2008-11-03 jas granted, provided that the above copyright notice appear in all
8 63a68686 2008-11-03 jas copies and that both that the copyright notice and this
9 63a68686 2008-11-03 jas permission notice and warranty disclaimer appear in supporting
10 63a68686 2008-11-03 jas documentation, and that the name Lucent Technologies or any of
11 63a68686 2008-11-03 jas its entities not be used in advertising or publicity pertaining
12 63a68686 2008-11-03 jas to distribution of the software without specific, written prior
13 63a68686 2008-11-03 jas permission.
14 63a68686 2008-11-03 jas
15 63a68686 2008-11-03 jas LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 63a68686 2008-11-03 jas INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 63a68686 2008-11-03 jas IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 63a68686 2008-11-03 jas SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 63a68686 2008-11-03 jas WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 63a68686 2008-11-03 jas IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 63a68686 2008-11-03 jas ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 63a68686 2008-11-03 jas THIS SOFTWARE.
23 63a68686 2008-11-03 jas ****************************************************************/
24 63a68686 2008-11-03 jas
25 63a68686 2008-11-03 jas
26 63a68686 2008-11-03 jas #define DEBUG
27 63a68686 2008-11-03 jas #include <stdio.h>
28 63a68686 2008-11-03 jas #include <u.h>
29 63a68686 2008-11-03 jas #include <libc.h>
30 63a68686 2008-11-03 jas #include <ctype.h>
31 63a68686 2008-11-03 jas #include <bio.h>
32 63a68686 2008-11-03 jas #include <regexp.h>
33 63a68686 2008-11-03 jas #include "awk.h"
34 63a68686 2008-11-03 jas #include "y.tab.h"
35 63a68686 2008-11-03 jas
36 63a68686 2008-11-03 jas /* This file provides the interface between the main body of
37 63a68686 2008-11-03 jas * awk and the pattern matching package. It preprocesses
38 63a68686 2008-11-03 jas * patterns prior to compilation to provide awk-like semantics
39 63a68686 2008-11-03 jas * to character sequences not supported by the pattern package.
40 63a68686 2008-11-03 jas * The following conversions are performed:
41 63a68686 2008-11-03 jas *
42 63a68686 2008-11-03 jas * "()" -> "[]*"
43 63a68686 2008-11-03 jas * "[-" -> "[\-"
44 63a68686 2008-11-03 jas * "[^-" -> "[^\-"
45 63a68686 2008-11-03 jas * "-]" -> "\-]"
46 63a68686 2008-11-03 jas * "[]" -> "[]*"
47 63a68686 2008-11-03 jas * "\xdddd" -> "\z" where 'z' is the UTF sequence
48 63a68686 2008-11-03 jas * for the hex value
49 63a68686 2008-11-03 jas * "\ddd" -> "\o" where 'o' is a char octal value
50 63a68686 2008-11-03 jas * "\b" -> "\B" where 'B' is backspace
51 63a68686 2008-11-03 jas * "\t" -> "\T" where 'T' is tab
52 63a68686 2008-11-03 jas * "\f" -> "\F" where 'F' is form feed
53 63a68686 2008-11-03 jas * "\n" -> "\N" where 'N' is newline
54 63a68686 2008-11-03 jas * "\r" -> "\C" where 'C' is cr
55 63a68686 2008-11-03 jas */
56 63a68686 2008-11-03 jas
57 63a68686 2008-11-03 jas #define MAXRE 512
58 63a68686 2008-11-03 jas
59 63a68686 2008-11-03 jas static char re[MAXRE]; /* copy buffer */
60 63a68686 2008-11-03 jas
61 63a68686 2008-11-03 jas char *patbeg;
62 63a68686 2008-11-03 jas int patlen; /* number of chars in pattern */
63 63a68686 2008-11-03 jas
64 63a68686 2008-11-03 jas #define NPATS 20 /* number of slots in pattern cache */
65 63a68686 2008-11-03 jas
66 63a68686 2008-11-03 jas static struct pat_list /* dynamic pattern cache */
67 63a68686 2008-11-03 jas {
68 63a68686 2008-11-03 jas char *re;
69 63a68686 2008-11-03 jas int use;
70 63a68686 2008-11-03 jas Reprog *program;
71 63a68686 2008-11-03 jas } pattern[NPATS];
72 63a68686 2008-11-03 jas
73 63a68686 2008-11-03 jas static int npats; /* cache fill level */
74 63a68686 2008-11-03 jas
75 63a68686 2008-11-03 jas /* Compile a pattern */
76 63a68686 2008-11-03 jas void
77 63a68686 2008-11-03 jas *compre(char *pat)
78 63a68686 2008-11-03 jas {
79 63a68686 2008-11-03 jas int i, j, inclass;
80 63a68686 2008-11-03 jas char c, *p, *s;
81 63a68686 2008-11-03 jas Reprog *program;
82 63a68686 2008-11-03 jas
83 63a68686 2008-11-03 jas if (!compile_time) { /* search cache for dynamic pattern */
84 63a68686 2008-11-03 jas for (i = 0; i < npats; i++)
85 63a68686 2008-11-03 jas if (!strcmp(pat, pattern[i].re)) {
86 63a68686 2008-11-03 jas pattern[i].use++;
87 63a68686 2008-11-03 jas return((void *) pattern[i].program);
88 63a68686 2008-11-03 jas }
89 63a68686 2008-11-03 jas }
90 63a68686 2008-11-03 jas /* Preprocess Pattern for compilation */
91 63a68686 2008-11-03 jas p = re;
92 63a68686 2008-11-03 jas s = pat;
93 63a68686 2008-11-03 jas inclass = 0;
94 63a68686 2008-11-03 jas while (c = *s++) {
95 63a68686 2008-11-03 jas if (c == '\\') {
96 63a68686 2008-11-03 jas quoted(&s, &p, re+MAXRE);
97 63a68686 2008-11-03 jas continue;
98 63a68686 2008-11-03 jas }
99 63a68686 2008-11-03 jas else if (!inclass && c == '(' && *s == ')') {
100 63a68686 2008-11-03 jas if (p < re+MAXRE-2) { /* '()' -> '[]*' */
101 63a68686 2008-11-03 jas *p++ = '[';
102 63a68686 2008-11-03 jas *p++ = ']';
103 63a68686 2008-11-03 jas c = '*';
104 63a68686 2008-11-03 jas s++;
105 63a68686 2008-11-03 jas }
106 63a68686 2008-11-03 jas else overflow();
107 63a68686 2008-11-03 jas }
108 63a68686 2008-11-03 jas else if (c == '['){ /* '[-' -> '[\-' */
109 63a68686 2008-11-03 jas inclass = 1;
110 63a68686 2008-11-03 jas if (*s == '-') {
111 63a68686 2008-11-03 jas if (p < re+MAXRE-2) {
112 63a68686 2008-11-03 jas *p++ = '[';
113 63a68686 2008-11-03 jas *p++ = '\\';
114 63a68686 2008-11-03 jas c = *s++;
115 63a68686 2008-11-03 jas }
116 63a68686 2008-11-03 jas else overflow();
117 63a68686 2008-11-03 jas } /* '[^-' -> '[^\-'*/
118 63a68686 2008-11-03 jas else if (*s == '^' && s[1] == '-'){
119 63a68686 2008-11-03 jas if (p < re+MAXRE-3) {
120 63a68686 2008-11-03 jas *p++ = '[';
121 63a68686 2008-11-03 jas *p++ = *s++;
122 63a68686 2008-11-03 jas *p++ = '\\';
123 63a68686 2008-11-03 jas c = *s++;
124 63a68686 2008-11-03 jas }
125 63a68686 2008-11-03 jas else overflow();
126 63a68686 2008-11-03 jas }
127 63a68686 2008-11-03 jas else if (*s == '['){ /* skip '[[' */
128 63a68686 2008-11-03 jas if (p < re+MAXRE-1)
129 63a68686 2008-11-03 jas *p++ = c;
130 63a68686 2008-11-03 jas else overflow();
131 63a68686 2008-11-03 jas c = *s++;
132 63a68686 2008-11-03 jas }
133 63a68686 2008-11-03 jas else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
134 63a68686 2008-11-03 jas if (p < re+MAXRE-2) {
135 63a68686 2008-11-03 jas *p++ = c;
136 63a68686 2008-11-03 jas *p++ = *s++;
137 63a68686 2008-11-03 jas c = *s++;
138 63a68686 2008-11-03 jas }
139 63a68686 2008-11-03 jas else overflow();
140 63a68686 2008-11-03 jas }
141 63a68686 2008-11-03 jas else if (*s == ']') { /* '[]' -> '[]*' */
142 63a68686 2008-11-03 jas if (p < re+MAXRE-2) {
143 63a68686 2008-11-03 jas *p++ = c;
144 63a68686 2008-11-03 jas *p++ = *s++;
145 63a68686 2008-11-03 jas c = '*';
146 63a68686 2008-11-03 jas inclass = 0;
147 63a68686 2008-11-03 jas }
148 63a68686 2008-11-03 jas else overflow();
149 63a68686 2008-11-03 jas }
150 63a68686 2008-11-03 jas }
151 63a68686 2008-11-03 jas else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
152 63a68686 2008-11-03 jas if (p < re+MAXRE-1)
153 63a68686 2008-11-03 jas *p++ = '\\';
154 63a68686 2008-11-03 jas else overflow();
155 63a68686 2008-11-03 jas }
156 63a68686 2008-11-03 jas else if (c == ']')
157 63a68686 2008-11-03 jas inclass = 0;
158 63a68686 2008-11-03 jas if (p < re+MAXRE-1)
159 63a68686 2008-11-03 jas *p++ = c;
160 63a68686 2008-11-03 jas else overflow();
161 63a68686 2008-11-03 jas }
162 63a68686 2008-11-03 jas *p = 0;
163 63a68686 2008-11-03 jas program = regcomp(re); /* compile pattern */
164 63a68686 2008-11-03 jas if (!compile_time) {
165 63a68686 2008-11-03 jas if (npats < NPATS) /* Room in cache */
166 63a68686 2008-11-03 jas i = npats++;
167 63a68686 2008-11-03 jas else { /* Throw out least used */
168 63a68686 2008-11-03 jas int use = pattern[0].use;
169 63a68686 2008-11-03 jas i = 0;
170 63a68686 2008-11-03 jas for (j = 1; j < NPATS; j++) {
171 63a68686 2008-11-03 jas if (pattern[j].use < use) {
172 63a68686 2008-11-03 jas use = pattern[j].use;
173 63a68686 2008-11-03 jas i = j;
174 63a68686 2008-11-03 jas }
175 63a68686 2008-11-03 jas }
176 63a68686 2008-11-03 jas xfree(pattern[i].program);
177 63a68686 2008-11-03 jas xfree(pattern[i].re);
178 63a68686 2008-11-03 jas }
179 63a68686 2008-11-03 jas pattern[i].re = tostring(pat);
180 63a68686 2008-11-03 jas pattern[i].program = program;
181 63a68686 2008-11-03 jas pattern[i].use = 1;
182 63a68686 2008-11-03 jas }
183 63a68686 2008-11-03 jas return((void *) program);
184 63a68686 2008-11-03 jas }
185 63a68686 2008-11-03 jas
186 63a68686 2008-11-03 jas /* T/F match indication - matched string not exported */
187 63a68686 2008-11-03 jas int
188 63a68686 2008-11-03 jas match(void *p, char *s, char *start)
189 63a68686 2008-11-03 jas {
190 63a68686 2008-11-03 jas return regexec((Reprog *) p, (char *) s, 0, 0);
191 63a68686 2008-11-03 jas }
192 63a68686 2008-11-03 jas
193 63a68686 2008-11-03 jas /* match and delimit the matched string */
194 63a68686 2008-11-03 jas int
195 63a68686 2008-11-03 jas pmatch(void *p, char *s, char *start)
196 63a68686 2008-11-03 jas {
197 63a68686 2008-11-03 jas Resub m;
198 63a68686 2008-11-03 jas
199 63a68686 2008-11-03 jas m.s.sp = start;
200 63a68686 2008-11-03 jas m.e.ep = 0;
201 63a68686 2008-11-03 jas if (regexec((Reprog *) p, (char *) s, &m, 1)) {
202 63a68686 2008-11-03 jas patbeg = m.s.sp;
203 63a68686 2008-11-03 jas patlen = m.e.ep-m.s.sp;
204 63a68686 2008-11-03 jas return 1;
205 63a68686 2008-11-03 jas }
206 63a68686 2008-11-03 jas patlen = -1;
207 63a68686 2008-11-03 jas patbeg = start;
208 63a68686 2008-11-03 jas return 0;
209 63a68686 2008-11-03 jas }
210 63a68686 2008-11-03 jas
211 63a68686 2008-11-03 jas /* perform a non-empty match */
212 63a68686 2008-11-03 jas int
213 63a68686 2008-11-03 jas nematch(void *p, char *s, char *start)
214 63a68686 2008-11-03 jas {
215 63a68686 2008-11-03 jas if (pmatch(p, s, start) == 1 && patlen > 0)
216 63a68686 2008-11-03 jas return 1;
217 63a68686 2008-11-03 jas patlen = -1;
218 fa325e9b 2020-01-10 cross patbeg = start;
219 63a68686 2008-11-03 jas return 0;
220 63a68686 2008-11-03 jas }
221 63a68686 2008-11-03 jas /* in the parsing of regular expressions, metacharacters like . have */
222 63a68686 2008-11-03 jas /* to be seen literally; \056 is not a metacharacter. */
223 63a68686 2008-11-03 jas
/*
 * Evaluate the hexadecimal digits found at *pp.
 *
 * At most four digits are consumed.  *pp is advanced past the digits
 * that were used and their value is returned; when *pp does not start
 * with a hex digit, 0 is returned and *pp is left unchanged.
 */
int
hexstr(char **pp)
{
	int c, i;
	int n = 0;

	for (i = 0; i < 4; i++) {
		/* the <ctype.h> functions need an unsigned char value */
		c = (unsigned char) (*pp)[i];
		if (!isxdigit(c))
			break;
		if (isdigit(c))
			n = 16 * n + c - '0';
		else if ('a' <= c && c <= 'f')
			n = 16 * n + c - 'a' + 10;
		else if ('A' <= c && c <= 'F')
			n = 16 * n + c - 'A' + 10;
	}
	*pp += i;
	return n;
}
242 63a68686 2008-11-03 jas
/* look for awk-specific escape sequences */

#define isoctdigit(c) ((c) >= '0' && (c) <= '7')	/* multiple use of arg */

/*
 * Translate the escape sequence whose backslash has just been consumed.
 *
 *	s	in/out: points just past the backslash on entry and past
 *		the whole escape on return
 *	to	in/out: write position in the output buffer
 *	end	one past the last byte of the output buffer
 *
 * \t \n \f \r \b are replaced by the control character itself, without
 * a backslash.  Every other escape keeps its backslash: \x followed by
 * up to four hex digits becomes backslash + the multibyte encoding of
 * that value, \d, \dd and \ddd become backslash + the byte with that
 * octal value, and any other character is copied unchanged after the
 * backslash.  A backslash that is the last character of the pattern is
 * copied by itself and *s is left on the terminating NUL, so the
 * caller's scan stops there instead of running past the end of the
 * string.  Output that does not fit is reported through overflow(),
 * as the caller does for ordinary characters.
 */
void
quoted(char **s, char **to, char *end)	/* handle escaped sequence */
{
	char *p = *s;
	char *t = *to;
	wchar_t c;
	int n;

	switch(c = *p++) {
	case 't':
		c = '\t';
		break;
	case 'n':
		c = '\n';
		break;
	case 'f':
		c = '\f';
		break;
	case 'r':
		c = '\r';
		break;
	case 'b':
		c = '\b';
		break;
	case '\0':			/* pattern ends in a lone backslash */
		p--;			/* stay on the NUL for the caller */
		c = '\\';
		break;
	default:
		if (t < end-1)		/* all else must be escaped */
			*t++ = '\\';
		else overflow();
		if (c == 'x') {		/* hexadecimal goo follows */
			c = hexstr(&p);
			if (t < end-MB_CUR_MAX) {
				n = wctomb(t, c);
				if (n > 0)
					t += n;
				else
					t--;	/* not encodable: take the backslash back */
			}
			else overflow();
			*to = t;
			*s = p;
			return;
		} else if (isoctdigit(c)) {	/* \d \dd \ddd */
			c -= '0';
			if (isoctdigit(*p)) {
				c = 8 * c + *p++ - '0';
				if (isoctdigit(*p))
					c = 8 * c + *p++ - '0';
			}
		}
		break;
	}
	if (t < end-1)
		*t++ = c;
	else overflow();
	*s = p;
	*to = t;
}
/*
 * Count the character positions in the first n bytes of s.
 *
 * Each multibyte character, as measured by mblen, counts as one
 * position; a byte that mblen rejects counts as one position by itself.
 * Counting stops at a NUL byte or after n bytes, whichever comes first.
 * A non-positive n yields 0.
 */
int
countposn(char *s, int n)
{
	char *end;
	int count = 0;
	int len;

	if (n <= 0)
		return 0;
	end = s + n;
	/* test the bound first so no byte at or beyond end is examined */
	while (s < end && *s) {
		/* only the bytes remaining before end may be inspected */
		len = mblen(s, end - s);
		if (len <= 0)
			len = 1;
		s += len;
		count++;
	}
	return count;
}
311 63a68686 2008-11-03 jas
/*
 * Error handler for the pattern package: passes the message s on to
 * FATAL.
 */
void
regerror(char *s)
{
	FATAL("%s", s);
}
319 63a68686 2008-11-03 jas
/*
 * Report through FATAL that a pattern does not fit in the copy buffer.
 */
void
overflow(void)
{
	FATAL("%s", "regular expression too big");
}