1 63a68686 2008-11-03 jas /****************************************************************
2 63a68686 2008-11-03 jas Copyright (C) Lucent Technologies 1997
3 63a68686 2008-11-03 jas All Rights Reserved
5 63a68686 2008-11-03 jas Permission to use, copy, modify, and distribute this software and
6 63a68686 2008-11-03 jas its documentation for any purpose and without fee is hereby
7 63a68686 2008-11-03 jas granted, provided that the above copyright notice appear in all
8 63a68686 2008-11-03 jas copies and that both that the copyright notice and this
9 63a68686 2008-11-03 jas permission notice and warranty disclaimer appear in supporting
10 63a68686 2008-11-03 jas documentation, and that the name Lucent Technologies or any of
11 63a68686 2008-11-03 jas its entities not be used in advertising or publicity pertaining
12 63a68686 2008-11-03 jas to distribution of the software without specific, written prior
15 63a68686 2008-11-03 jas LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 63a68686 2008-11-03 jas INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 63a68686 2008-11-03 jas IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 63a68686 2008-11-03 jas SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 63a68686 2008-11-03 jas WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 63a68686 2008-11-03 jas IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 63a68686 2008-11-03 jas ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 63a68686 2008-11-03 jas THIS SOFTWARE.
23 63a68686 2008-11-03 jas ****************************************************************/
26 63a68686 2008-11-03 jas #define DEBUG
27 63a68686 2008-11-03 jas #include <stdio.h>
28 63a68686 2008-11-03 jas #include <u.h>
29 63a68686 2008-11-03 jas #include <libc.h>
30 63a68686 2008-11-03 jas #include <ctype.h>
31 63a68686 2008-11-03 jas #include <bio.h>
32 63a68686 2008-11-03 jas #include <regexp.h>
33 63a68686 2008-11-03 jas #include "awk.h"
34 63a68686 2008-11-03 jas #include "y.tab.h"
36 63a68686 2008-11-03 jas /* This file provides the interface between the main body of
37 63a68686 2008-11-03 jas * awk and the pattern matching package. It preprocesses
38 63a68686 2008-11-03 jas * patterns prior to compilation to provide awk-like semantics
39 63a68686 2008-11-03 jas * to character sequences not supported by the pattern package.
40 63a68686 2008-11-03 jas * The following conversions are performed:
42 63a68686 2008-11-03 jas * "()" -> "[]*"
43 63a68686 2008-11-03 jas * "[-" -> "[\-"
44 63a68686 2008-11-03 jas * "[^-" -> "[^\-"
45 63a68686 2008-11-03 jas * "-]" -> "\-]"
46 63a68686 2008-11-03 jas * "[]" -> "[]*"
47 63a68686 2008-11-03 jas * "\xdddd" -> "\z" where 'z' is the UTF sequence
48 63a68686 2008-11-03 jas * for the hex value
49 63a68686 2008-11-03 jas * "\ddd" -> "\o" where 'o' is a char octal value
50 63a68686 2008-11-03 jas * "\b" -> "\B" where 'B' is backspace
51 63a68686 2008-11-03 jas * "\t" -> "\T" where 'T' is tab
52 63a68686 2008-11-03 jas * "\f" -> "\F" where 'F' is form feed
53 63a68686 2008-11-03 jas * "\n" -> "\N" where 'N' is newline
54 63a68686 2008-11-03 jas * "\r" -> "\C" where 'C' is cr
57 63a68686 2008-11-03 jas #define MAXRE 512
59 63a68686 2008-11-03 jas static char re[MAXRE]; /* copy buffer */
61 63a68686 2008-11-03 jas char *patbeg;
62 63a68686 2008-11-03 jas int patlen; /* number of chars in pattern */
64 63a68686 2008-11-03 jas #define NPATS 20 /* number of slots in pattern cache */
66 63a68686 2008-11-03 jas static struct pat_list /* dynamic pattern cache */
70 63a68686 2008-11-03 jas Reprog *program;
71 63a68686 2008-11-03 jas } pattern[NPATS];
73 63a68686 2008-11-03 jas static int npats; /* cache fill level */
75 63a68686 2008-11-03 jas /* Compile a pattern */
77 63a68686 2008-11-03 jas *compre(char *pat)
79 63a68686 2008-11-03 jas int i, j, inclass;
80 63a68686 2008-11-03 jas char c, *p, *s;
81 63a68686 2008-11-03 jas Reprog *program;
83 63a68686 2008-11-03 jas if (!compile_time) { /* search cache for dynamic pattern */
84 63a68686 2008-11-03 jas for (i = 0; i < npats; i++)
85 63a68686 2008-11-03 jas if (!strcmp(pat, pattern[i].re)) {
86 63a68686 2008-11-03 jas pattern[i].use++;
87 63a68686 2008-11-03 jas return((void *) pattern[i].program);
90 63a68686 2008-11-03 jas /* Preprocess Pattern for compilation */
94 63a68686 2008-11-03 jas while (c = *s++) {
95 63a68686 2008-11-03 jas if (c == '\\') {
96 63a68686 2008-11-03 jas quoted(&s, &p, re+MAXRE);
99 63a68686 2008-11-03 jas else if (!inclass && c == '(' && *s == ')') {
100 63a68686 2008-11-03 jas if (p < re+MAXRE-2) { /* '()' -> '[]*' */
106 63a68686 2008-11-03 jas else overflow();
108 63a68686 2008-11-03 jas else if (c == '['){ /* '[-' -> '[\-' */
109 63a68686 2008-11-03 jas inclass = 1;
110 63a68686 2008-11-03 jas if (*s == '-') {
111 63a68686 2008-11-03 jas if (p < re+MAXRE-2) {
113 63a68686 2008-11-03 jas *p++ = '\\';
116 63a68686 2008-11-03 jas else overflow();
117 63a68686 2008-11-03 jas } /* '[^-' -> '[^\-'*/
118 63a68686 2008-11-03 jas else if (*s == '^' && s[1] == '-'){
119 63a68686 2008-11-03 jas if (p < re+MAXRE-3) {
121 63a68686 2008-11-03 jas *p++ = *s++;
122 63a68686 2008-11-03 jas *p++ = '\\';
125 63a68686 2008-11-03 jas else overflow();
127 63a68686 2008-11-03 jas else if (*s == '['){ /* skip '[[' */
128 63a68686 2008-11-03 jas if (p < re+MAXRE-1)
130 63a68686 2008-11-03 jas else overflow();
133 63a68686 2008-11-03 jas else if (*s == '^' && s[1] == '[') { /* skip '[^['*/
134 63a68686 2008-11-03 jas if (p < re+MAXRE-2) {
136 63a68686 2008-11-03 jas *p++ = *s++;
139 63a68686 2008-11-03 jas else overflow();
141 63a68686 2008-11-03 jas else if (*s == ']') { /* '[]' -> '[]*' */
142 63a68686 2008-11-03 jas if (p < re+MAXRE-2) {
144 63a68686 2008-11-03 jas *p++ = *s++;
146 63a68686 2008-11-03 jas inclass = 0;
148 63a68686 2008-11-03 jas else overflow();
151 63a68686 2008-11-03 jas else if (c == '-' && *s == ']') { /* '-]' -> '\-]' */
152 63a68686 2008-11-03 jas if (p < re+MAXRE-1)
153 63a68686 2008-11-03 jas *p++ = '\\';
154 63a68686 2008-11-03 jas else overflow();
156 63a68686 2008-11-03 jas else if (c == ']')
157 63a68686 2008-11-03 jas inclass = 0;
158 63a68686 2008-11-03 jas if (p < re+MAXRE-1)
160 63a68686 2008-11-03 jas else overflow();
163 63a68686 2008-11-03 jas program = regcomp(re); /* compile pattern */
164 63a68686 2008-11-03 jas if (!compile_time) {
165 63a68686 2008-11-03 jas if (npats < NPATS) /* Room in cache */
166 63a68686 2008-11-03 jas i = npats++;
167 63a68686 2008-11-03 jas else { /* Throw out least used */
168 63a68686 2008-11-03 jas int use = pattern[0].use;
170 63a68686 2008-11-03 jas for (j = 1; j < NPATS; j++) {
171 63a68686 2008-11-03 jas if (pattern[j].use < use) {
172 63a68686 2008-11-03 jas use = pattern[j].use;
176 63a68686 2008-11-03 jas xfree(pattern[i].program);
177 63a68686 2008-11-03 jas xfree(pattern[i].re);
179 63a68686 2008-11-03 jas pattern[i].re = tostring(pat);
180 63a68686 2008-11-03 jas pattern[i].program = program;
181 63a68686 2008-11-03 jas pattern[i].use = 1;
183 63a68686 2008-11-03 jas return((void *) program);
186 63a68686 2008-11-03 jas /* T/F match indication - matched string not exported */
188 63a68686 2008-11-03 jas match(void *p, char *s, char *start)
190 63a68686 2008-11-03 jas return regexec((Reprog *) p, (char *) s, 0, 0);
193 63a68686 2008-11-03 jas /* match and delimit the matched string */
195 63a68686 2008-11-03 jas pmatch(void *p, char *s, char *start)
199 63a68686 2008-11-03 jas m.s.sp = start;
201 63a68686 2008-11-03 jas if (regexec((Reprog *) p, (char *) s, &m, 1)) {
202 63a68686 2008-11-03 jas patbeg = m.s.sp;
203 63a68686 2008-11-03 jas patlen = m.e.ep-m.s.sp;
206 63a68686 2008-11-03 jas patlen = -1;
207 63a68686 2008-11-03 jas patbeg = start;
211 63a68686 2008-11-03 jas /* perform a non-empty match */
213 63a68686 2008-11-03 jas nematch(void *p, char *s, char *start)
215 63a68686 2008-11-03 jas if (pmatch(p, s, start) == 1 && patlen > 0)
217 63a68686 2008-11-03 jas patlen = -1;
218 63a68686 2008-11-03 jas patbeg = start;
221 63a68686 2008-11-03 jas /* in the parsing of regular expressions, metacharacters like . have */
222 63a68686 2008-11-03 jas /* to be seen literally; \056 is not a metacharacter. */
225 63a68686 2008-11-03 jas hexstr(char **pp) /* find and eval hex string at pp, return new p */
231 63a68686 2008-11-03 jas for (i = 0, c = (*pp)[i]; i < 4 && isxdigit(c); i++, c = (*pp)[i]) {
232 63a68686 2008-11-03 jas if (isdigit(c))
233 63a68686 2008-11-03 jas n = 16 * n + c - '0';
234 63a68686 2008-11-03 jas else if ('a' <= c && c <= 'f')
235 63a68686 2008-11-03 jas n = 16 * n + c - 'a' + 10;
236 63a68686 2008-11-03 jas else if ('A' <= c && c <= 'F')
237 63a68686 2008-11-03 jas n = 16 * n + c - 'A' + 10;
243 63a68686 2008-11-03 jas /* look for awk-specific escape sequences */
245 63a68686 2008-11-03 jas #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* true for an octal digit; evaluates c twice, so the argument must have no side effects */
248 63a68686 2008-11-03 jas quoted(char **s, char **to, char *end) /* handle escaped sequence */
250 63a68686 2008-11-03 jas char *p = *s;
251 63a68686 2008-11-03 jas char *t = *to;
254 63a68686 2008-11-03 jas switch(c = *p++) {
271 63a68686 2008-11-03 jas if (t < end-1) /* all else must be escaped */
272 63a68686 2008-11-03 jas *t++ = '\\';
273 63a68686 2008-11-03 jas if (c == 'x') { /* hexadecimal goo follows */
274 63a68686 2008-11-03 jas c = hexstr(&p);
275 63a68686 2008-11-03 jas if (t < end-MB_CUR_MAX)
276 63a68686 2008-11-03 jas t += wctomb(t, c);
277 63a68686 2008-11-03 jas else overflow();
281 63a68686 2008-11-03 jas } else if (isoctdigit(c)) { /* \d \dd \ddd */
283 63a68686 2008-11-03 jas if (isoctdigit(*p)) {
284 63a68686 2008-11-03 jas c = 8 * c + *p++ - '0';
285 63a68686 2008-11-03 jas if (isoctdigit(*p))
286 63a68686 2008-11-03 jas c = 8 * c + *p++ - '0';
291 63a68686 2008-11-03 jas if (t < end-1)
296 63a68686 2008-11-03 jas /* count rune positions */
298 63a68686 2008-11-03 jas countposn(char *s, int n)
303 63a68686 2008-11-03 jas for (i = 0, end = s+n; *s && s < end; i++){
304 63a68686 2008-11-03 jas j = mblen(s, n);
312 63a68686 2008-11-03 jas /* pattern package error handler */
315 63a68686 2008-11-03 jas regerror(char *s)
317 63a68686 2008-11-03 jas FATAL("%s", s);
321 63a68686 2008-11-03 jas overflow(void)
323 63a68686 2008-11-03 jas FATAL("%s", "regular expression too big");