1. tre
Start python section to spkgs/tre.py[1
/1
]
1: #line 6 "./lpsrc/tre.pak"
2:
3: rtl_interfaces = [
4: 'tre/tre_gettext.hpp',
5: 'tre/tre_regex.hpp',
6: 'tre/tre_ast.hpp',
7: 'tre/tre_compile.hpp',
8: 'tre/tre_config.hpp',
9: 'tre/tre_filter.hpp',
10: 'tre/tre_match-utils.hpp',
11: 'tre/tre_mem.hpp',
12: 'tre/tre_parse.hpp',
13: ]
14:
15: TRE_CPPS = [
16: 'tre/tre_regcomp',
17: 'tre/tre_regerror',
18: 'tre/tre_regexec',
19: 'tre/tre_ast',
20: 'tre/tre_compile',
21: 'tre/tre_filter',
22: 'tre/tre_match-approx',
23: 'tre/tre_match-backtrack',
24: 'tre/tre_match-parallel',
25: 'tre/tre_mem',
26: 'tre/tre_parse',
27: 'tre/tre_stack'
28: ]
29:
30: cpp_cpps = TRE_CPPS
31: iscr_source = ['lpsrc/tre.pak']
32: weaver_directory = 'doc/rtl/tre/'
33: build_macro = "TRE"
34: unit_tests = glob.glob("test" + os.sep + "tre*.flx")
35: unit_tests.sort()
36:
Start data section to config/tre.fpc[1
/1
]
1:
2: provides_dlib: -ltre_dynamic
3: provides_slib: -ltre_static
Start felix section to lib/tre.flx[1
/1
]
1: #line 54 "./lpsrc/tre.pak"
2:
3: // Felix binding to the regex functions declared in tre_regex.hpp (regncomp / regnexec).
4: module Tre
5: {
6: open C_hack;
7: requires package 'tre';
8: header '#include "tre_regex.hpp"';
9: type tre_regex_t = "regex_t";
10: // Version string, bound to the C++ expression tre_version().
11: const tre_version: string = "tre_version()";
12: // Raw binding: hands the string's data()/size() to regncomp with REG_EXTENDED and yields its status code.
13: private fun _tre_regcomp: ptr[tre_regex_t] * string -> int =
14: "regncomp($1,$2.data(),$2.size(),REG_EXTENDED)"
15: ;
16: // Compiles pattern x: Some regex when the status is 0, None for any other status.
17: fun tre_regcomp (x:string): opt[tre_regex_t] = {
18: var cr: tre_regex_t;
19: var res = _tre_regcomp(addr cr, x);
20: return
21: if res == 0 then Some cr else None[tre_regex_t] endif
22: ;
23: }
24: // re_nsub + 1; presumably one slot per parenthesized group plus one for the whole match -- confirm.
25: fun nsub: ptr[tre_regex_t] -> int = "$1->re_nsub+1";
26: // Offset type and match record as declared on the C side.
27: ctypes regoff_t;
28: cstruct regmatch_t {
29: rm_so: regoff_t;
30: rm_eo: regoff_t;
31: }
32: // Small helpers for working with regoff_t values (pointer advance, difference, conversion to int).
33: fun add: ptr[char] * regoff_t -> ptr[char] = "$1+$2";
34: fun sub: regoff_t * regoff_t -> int = "(int)($1-$2)";
35: fun _ctor_int : regoff_t -> int = "int($1)";
36: // Raw binding to regnexec with eflags fixed to 0.
37: private fun _tre_regexec:
38: ptr[tre_regex_t] *
39: string *
40: int *
41: ptr[regmatch_t]
42: -> int
43: =
44: "regnexec($1,$2.data(),$2.size(),$3,$4,0);" // NOTE(review): trailing ';' inside an expression binding, unlike _tre_regcomp -- confirm it is intended
45: ;
46: // Runs re_in over x; returns (status, number of slots, slot array). Nothing in this function releases the array.
47: fun tre_regexec (re_in: tre_regex_t) (x:string): int * int * ptr[regmatch_t] =
48: {
49: var re = re_in; // local copy so that its address can be taken
50: val nmatches = nsub$ addr re;
51: var matches = Carray::array_alloc[regmatch_t] nmatches;
52: var res = _tre_regexec(addr re, x, nmatches, matches);
53: return res,nmatches,matches;
54: }
55: }
56:
Start felix section to test/tre_01.flx[1
/1
]
1: #line 111 "./lpsrc/tre.pak"
2: // Smoke test for the Tre module: compile one pattern, match one string, print every match slot.
3: include "tre.flx";
4: open Tre;
5: open C_hack;
6: open Carray;
7:
8: print$ "Using tre " tre_version; endl;
9:
10: var r = tre_regcomp("(a|b)*abb");
11: print "Done tre compile"; endl;
12: // Report whether compilation produced a regex.
13: print
14: match r with
15: | Some _ => "Compiled"
16: | None => "failed"
17: endmatch
18: ;
19: endl;
20:
21: // Unwrap the compiled regex.
22: var re : tre_regex_t =
23: match r with
24: | Some ?re => re
25: | None => re // NOTE(review): refers to the variable being defined; looks like a placeholder for the failure case -- confirm
26: endmatch
27: ;
28:
29: var s = "aabbabababb";
30: res,n,a := tre_regexec re s;
31: print "Result = "; print res; endl;
32: print "nmatches = "; print n; endl;
33: // Walk the n slots; a start offset of -1 is reported as "nomatch".
34: var i : int;
35: for_each { i=0; } { i<n } { ++i; }
36: {
37: if int(a.[i].rm_so) == -1 do
38: print i; print " -> nomatch\n";
39: else
40: print i; print "-> match '";
41: start := int(a.[i].rm_so);
42: finish := int(a.[i].rm_eo);
43: print s.[start to finish];
44: print "'"; endl;
45: done;
46: }
47: ;
48:
49: print "Finished"; endl;
50:
Start data section to test/tre_01.expect[1
/1
]
1: Using tre TRE 0.7.2 (GPL)
2: Done tre compile
3: Compiled
4: Result = 0
5: nmatches = 2
6: 0-> match 'aabbabababb'
7: 1-> match 'b'
8: Finished
Start cpp section to rtl/flx_target_tre_config.hpp[1
/1
]
1: #line 172 "./lpsrc/tre.pak"
2:
3:
4:
5:
6:
7:
8:
9:
10:
11:
12: /* config.h. Generated by configure. */
13: /* config.h.in. Generated from configure.ac by autoheader. */
14:
15: /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
16: systems. This function is required for `alloca.c' support on those systems.
17: */
18: /* #undef CRAY_STACKSEG_END */
19:
20: /* Define to 1 if using `alloca.c'. */
21: /* #undef C_ALLOCA */
22:
23: /* Define to 1 if translation of program messages to the user's native
24: language is requested. */
25:
26:
27: /* Define to 1 if you have `alloca', as a function or macro. */
28:
29:
30: /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
31: */
32:
33:
34: /* Define if the GNU dcgettext() function is already present or preinstalled.
35: */
36:
37:
38: /* Define to 1 if you have the <dlfcn.h> header file. */
39:
40:
41: /* Define to 1 if you have the <getopt.h> header file. */
42:
43:
44: /* Define to 1 if you have the `getopt_long' function. */
45:
46:
47: /* Define if the GNU gettext() function is already present or preinstalled. */
48:
49:
50: /* Define if you have the iconv() function. */
51: /* #undef HAVE_ICONV */
52:
53: /* Define to 1 if you have the <inttypes.h> header file. */
54:
55:
56: /* Define to 1 if you have the `isascii' function. */
57:
58:
59: /* Define to 1 if you have the `isblank' function. */
60: /* RF: had to comment this out to get cl.exe version working */
61: /* #define HAVE_ISBLANK 1 */
62:
63: /* Define to 1 if you have the `iswascii' function or macro. */
64: /* #undef HAVE_ISWASCII */
65:
66: /* Define to 1 if you have the `iswblank' function or macro. */
67:
68:
69: /* Define to 1 if you have the `iswctype' function or macro. */
70:
71:
72: /* Define to 1 if you have the `iswlower' function or macro. */
73:
74:
75: /* Define to 1 if you have the `iswupper' function or macro. */
76:
77:
78: /* Define to 1 if you have the <libutf8.h> header file. */
79: /* #undef HAVE_LIBUTF8_H */
80:
81: /* Define to 1 if you have the `mbrtowc' function or macro. */
82:
83:
84: /* Define to 1 if the system has the type `mbstate_t'. */
85:
86:
87: /* Define to 1 if you have the `mbtowc' function or macro. */
88: /* #undef HAVE_MBTOWC */
89:
90: /* Define to 1 if you have the <memory.h> header file. */
91:
92:
93: /* Define to 1 if you have the <regex.h> header file. */
94: /* #undef HAVE_REGEX_H */
95:
96: /* Define to 1 if the system has the type `reg_errcode_t'. */
97: /* #undef HAVE_REG_ERRCODE_T */
98:
99: /* Define to 1 if you have the <stdint.h> header file. */
100:
101:
102: /* Define to 1 if you have the <stdlib.h> header file. */
103:
104:
105: /* Define to 1 if you have the <strings.h> header file. */
106:
107:
108: /* Define to 1 if you have the <string.h> header file. */
109:
110:
111: /* Define to 1 if you have the <sys/stat.h> header file. */
112:
113:
114: /* Define to 1 if you have the <sys/types.h> header file. */
115:
116:
117: /* Define to 1 if you have the `towlower' function or macro. */
118:
119:
120: /* Define to 1 if you have the `towupper' function or macro. */
121:
122:
123: /* Define to 1 if you have the <unistd.h> header file. */
124:
125:
126: /* Define to 1 if you have the <wchar.h> header file. */
127:
128:
129: /* Define to 1 if the system has the type `wchar_t'. */
130:
131:
132: /* Define to 1 if you have the `wcschr' function or macro. */
133:
134:
135: /* Define to 1 if you have the `wcscpy' function or macro. */
136:
137:
138: /* Define to 1 if you have the `wcslen' function or macro. */
139:
140:
141: /* Define to 1 if you have the `wcsncpy' function or macro. */
142:
143:
144: /* Define to 1 if you have the `wcsrtombs' function or macro. */
145:
146:
147: /* Define to 1 if you have the `wcstombs' function or macro. */
148: /* #undef HAVE_WCSTOMBS */
149:
150: /* Define to 1 if you have the `wctype' function or macro. */
151:
152:
153: /* Define to 1 if you have the <wctype.h> header file. */
154:
155:
156: /* Define to 1 if the system has the type `wint_t'. */
157:
158:
159: /* Define if you want to disable debug assertions. */
160:
161:
162: /* Name of package */
163:
164:
165: /* Define to the address where bug reports for this package should be sent. */
166:
167:
168: /* Define to the full name of this package. */
169:
170:
171: /* Define to the full name and version of this package. */
172:
173:
174: /* Define to the one symbol short name of this package. */
175:
176:
177: /* Define to the version of this package. */
178:
179:
180: /* If using the C implementation of alloca, define if you know the
181: direction of stack growth for your system; otherwise it will be
182: automatically deduced at run-time.
183: STACK_DIRECTION > 0 => grows toward higher addresses
184: STACK_DIRECTION < 0 => grows toward lower addresses
185: STACK_DIRECTION = 0 => direction of growth unknown */
186: /* #undef STACK_DIRECTION */
187:
188: /* Define to 1 if you have the ANSI C header files. */
189:
190:
191: /* Define if you want to enable approximate matching functionality. */
192:
193:
194: /* Define if you want TRE to print debug messages to stdout. */
195: /* #undef TRE_DEBUG */
196:
197: /* Define to enable multibyte character set support. */
198:
199:
200: /* Define to a field in the regex_t struct where TRE should store a pointer to
201: the internal tre_tnfa_t structure */
202:
203:
204: /* Define to the absolute path to the system regex.h */
205: /* #undef TRE_SYSTEM_REGEX_H_PATH */
206:
207: /* Define if you want TRE to use alloca() instead of malloc() when allocating
208: memory needed for regexec operations. */
209:
210:
211: /* Define to include the system regex.h from TRE regex.h */
212: /* #undef TRE_USE_SYSTEM_REGEX_H */
213:
214: /* TRE version string. */
215:
216:
217: /* TRE version level 1. */
218:
219:
220: /* TRE version level 2. */
221:
222:
223: /* TRE version level 3. */
224:
225:
226: /* Define to enable wide character (wchar_t) support. */
227:
228:
229: /* Version number of package */
230:
231:
232: /* Define to the maximum value of wchar_t if not already defined elsewhere */
233: /* #undef WCHAR_MAX */
234:
235: /* Define if wchar_t is signed */
236: /* #undef WCHAR_T_SIGNED */
237:
238: /* Define if wchar_t is unsigned */
239: /* #undef WCHAR_T_UNSIGNED */
240:
241: /* Number of bits in a file offset, on hosts where this is settable. */
242: /* #undef _FILE_OFFSET_BITS */
243:
244: /* Define to enable GNU extensions in glibc */
245:
246:
247: /* Define for large files, on AIX-style hosts. */
248: /* #undef _LARGE_FILES */
249:
250: /* Define on IRIX */
251: /* #undef _REGCOMP_INTERNAL */
252:
253: /* Define to empty if `const' does not conform to ANSI C. */
254: /* #undef const */
255:
256: /* Define to `__inline__' or `__inline' if that's what the C compiler
257: calls it, or to nothing if 'inline' is not supported under any name. */
258:
259: /* #undef inline */
260:
261:
Start cpp section to tre/tre_gettext.hpp[1
/1
]
1: #line 434 "./lpsrc/tre.pak"
2: /* Convenience header for conditional use of GNU <libintl.h>.
3: Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc.
4:
5: This program is free software; you can redistribute it and/or modify it
6: under the terms of the GNU Library General Public License as published
7: by the Free Software Foundation; either version 2, or (at your option)
8: any later version.
9:
10: This program is distributed in the hope that it will be useful,
11: but WITHOUT ANY WARRANTY; without even the implied warranty of
12: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13: Library General Public License for more details.
14:
15: You should have received a copy of the GNU Library General Public
16: License along with this program; if not, write to the Free Software
17: Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
18: USA. */
19:
20:
21:
22:
23: /* NLS can be disabled through the configure --disable-nls option. */
24:
25:
26: /* Get declarations of GNU message catalog functions. */
27:
28:
29:
30:
31: /* Solaris /usr/include/locale.h includes /usr/include/libintl.h, which
32: chokes if dcgettext is defined as a macro. So include it now, to make
33: later inclusions of <locale.h> a NOP. We don't include <libintl.h>
34: as well because people using "gettext.h" will not include <libintl.h>,
35: and also including <libintl.h> would fail on SunOS 4, whereas <locale.h>
36: is OK. */
37:
38:
39:
40:
41: /* Disabled NLS.
42: The casts to 'const char *' serve the purpose of producing warnings
43: for invalid uses of the value returned from these functions.
44: On pre-ANSI systems without 'const', the config.h file is supposed to
45: contain "#define const". */
46:
47:
48:
49:
50: ((N) == 1 ? (const char *) (Msgid1) : (const char *) (Msgid2))
51:
52: ((N) == 1 ? (const char *) (Msgid1) : (const char *) (Msgid2))
53:
54: ((N) == 1 ? (const char *) (Msgid1) : (const char *) (Msgid2))
55:
56:
57:
58:
59:
60:
61: /* A pseudo function call that serves as a marker for the automated
62: extraction of messages, but does not call gettext(). The run-time
63: translation is done at a different place in the code.
64: The argument, String, should be a literal string. Concatenated strings
65: and other string expressions won't work.
66: The macro's expansion is not parenthesized, so that it is suitable as
67: initializer for static 'char[]' or 'const char[]' variables. */
68:
69:
70:
Start cpp section to tre/tre_regcomp.cpp[1
/1
]
1: #line 505 "./lpsrc/tre.pak"
2: /*
3: regcomp.c - TRE POSIX compatible regex compilation functions.
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27:
28:
29:
30: /* Compiles the first `n` bytes of `regex` into `preg` and returns the result of tre_compile().
31:    With TRE_WCHAR the bytes are first widened into a temporary tre_char_t buffer, which is freed before returning. */
32: int
33: regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
34: {
35: int ret;
36: #if TRE_WCHAR
37: tre_char_t *wregex;
38: int wlen;
39:
40: wregex = (tre_char_t*)xmalloc(sizeof(tre_char_t) * (n + 1)); /* n characters plus the terminator stored at wregex[wlen] below */
41: if (wregex == NULL)
42: return REG_ESPACE;
43:
44: /* If the current locale uses the standard single byte encoding of
45: characters, we don't do a multibyte string conversion. If we did,
46: many applications which use the default locale would break since
47: the default "C" locale uses the 7-bit ASCII character set, and
48: all characters with the eighth bit set would be considered invalid. */
49: #if TRE_MULTIBYTE
50: if (TRE_MB_CUR_MAX == 1)
51: #endif /* TRE_MULTIBYTE */
52: {
53: unsigned int i;
54: const unsigned char *str = (unsigned char *)regex;
55: tre_char_t *wstr = wregex;
56:
57: for (i = 0; i < n; i++)
58: *(wstr++) = *(str++); /* each byte, read as unsigned, becomes one tre_char_t */
59: wlen = n;
60: }
61: #if TRE_MULTIBYTE
62: else
63: {
64: int consumed;
65: tre_char_t *wcptr = wregex;
66: #ifdef HAVE_MBSTATE_T
67: mbstate_t state;
68: memset(&state, '\0', sizeof(state));
69: #endif /* HAVE_MBSTATE_T */
70: while (n > 0)
71: {
72: consumed = tre_mbrtowc(wcptr, regex, n, &state); /* NOTE(review): `state` is only declared under HAVE_MBSTATE_T; presumably tre_mbrtowc is a macro that drops it otherwise -- confirm */
73:
74: switch (consumed)
75: {
76: case 0:
77: if (*regex == '\0')
78: consumed = 1; /* a real NUL byte in the pattern: step over it */
79: else
80: {
81: xfree(wregex);
82: return REG_BADPAT;
83: }
84: break;
85: case -1:
86: DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno)));
87: xfree(wregex);
88: return REG_BADPAT;
89: case -2:
90: /* The last character wasn't complete. Let's not call it a
91: fatal error. */
92: consumed = n; /* swallow the rest of the input so the loop ends */
93: break;
94: }
95: regex += consumed;
96: n -= consumed;
97: wcptr++;
98: }
99: wlen = wcptr - wregex; /* number of wide characters produced */
100: }
101: #endif /* TRE_MULTIBYTE */
102:
103: wregex[wlen] = L'\0';
104: ret = tre_compile(preg, wregex, wlen, cflags);
105: xfree(wregex);
106: #else /* !TRE_WCHAR */
107: ret = tre_compile(preg, (const tre_char_t*)regex, n, cflags);
108: #endif /* !TRE_WCHAR */
109:
110: return ret;
111: }
112: /* POSIX regcomp(): measures the NUL-terminated pattern (a NULL pattern counts as length 0) and delegates to regncomp(). */
113: int regcomp(regex_t *preg, const char *regex, int cflags)
114: {
115:   const size_t pattern_len = (regex != NULL) ? strlen(regex) : 0;
116:   return regncomp(preg, regex, pattern_len, cflags);
117: }
118:
119:
120: /* Wide-character variant with an explicit length: `regex` and `n` go straight to tre_compile(). */
121: int regwncomp(regex_t *preg, const wchar_t *regex, size_t n, int cflags)
122: {
123:   const int status = tre_compile(preg, regex, n, cflags);
124:   return status;
125: }
126: /* Wide-character regcomp(): measures the pattern with wcslen() (a NULL pattern counts as length 0) and calls tre_compile(). */
127: int regwcomp(regex_t *preg, const wchar_t *regex, int cflags)
128: {
129:   const size_t pattern_len = (regex != NULL) ? wcslen(regex) : 0;
130:   return tre_compile(preg, regex, pattern_len, cflags);
131: }
132:
133: /* POSIX regfree(): hands `preg` to tre_free(), which does the actual
134:    release work. */
135: void regfree(regex_t *preg)
136: {
137:   tre_free(preg);
138: }
139:
140: /* EOF */
Start cpp section to tre/tre_regerror.cpp[1
/1
]
1: #line 646 "./lpsrc/tre.pak"
2: /*
3: regerror.c - POSIX regerror() implementation for TRE.
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38: /* Error message strings for error codes listed in `regex.h'. This list
39: needs to be in sync with the codes listed there, naturally. */
40: static const char *tre_error_messages[] =
41: { gettext_noop("No error"), /* REG_OK */
42: gettext_noop("No match"), /* REG_NOMATCH */
43: gettext_noop("Invalid regexp"), /* REG_BADPAT */
44: gettext_noop("Unknown collating element"), /* REG_ECOLLATE */
45: gettext_noop("Unknown character klass name"), /* REG_ECTYPE */
46: gettext_noop("Trailing backslash"), /* REG_EESCAPE */
47: gettext_noop("Invalid back reference"), /* REG_ESUBREG */
48: gettext_noop("Missing ']'"), /* REG_EBRACK */
49: gettext_noop("Missing ')'"), /* REG_EPAREN */
50: gettext_noop("Missing '}'"), /* REG_EBRACE */
51: gettext_noop("Invalid contents of {}"), /* REG_BADBR */
52: gettext_noop("Invalid character range"), /* REG_ERANGE */
53: gettext_noop("Out of memory"), /* REG_ESPACE */
54: gettext_noop("XXX") /* REG_BADRPT */
55: };
56:
57: /* Copies the message for `errcode` into `errbuf` (truncated to `errbuf_size` bytes, always NUL-terminated) and returns the size of the full message including its NUL; `preg` is not read. */
58: size_t regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
59: {
60:   const size_t known_codes = sizeof(tre_error_messages) / sizeof(*tre_error_messages);
61:   const char *message;
62:   size_t needed;
63:   /* Codes outside the table get a generic message. */
64:   if (errcode < 0 || (size_t)errcode >= known_codes)
65:     message = gettext("Unknown error");
66:   else
67:     message = gettext(tre_error_messages[errcode]);
68:
69:   needed = strlen(message) + 1;
70:   /* Without a usable buffer only the required size is reported. */
71:   if (errbuf == NULL || errbuf_size == 0)
72:     return needed;
73:
74:   if (needed <= errbuf_size)
75:     strcpy(errbuf, message);
76:   else
77:     {
78:       /* Message does not fit: copy what fits and terminate by hand. */
79:       strncpy(errbuf, message, errbuf_size - 1);
80:       errbuf[errbuf_size - 1] = '\0';
81:     }
82:   return needed;
83: }
84:
85: /* EOF */
Start cpp section to tre/tre_regexec.cpp[1
/1
]
1: #line 732 "./lpsrc/tre.pak"
2: /*
3: regexec.c - TRE POSIX compatible matching functions (and more).
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25: /* AIX requires this to be the first thing in the file. */
26:
27:
28:
29:
30:
31:
32:
33:
34: char *alloca ();
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61: /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
62:    endpoint values.
63:    Offsets are only computed when `match_eo` >= 0 and REG_NOSUB is not set in `cflags`;
64:    at most min(tnfa->num_submatches, nmatch) entries are derived from `tags`, and
65:    every remaining entry up to `nmatch` is set to {-1, -1}. */
66: void
67: tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
68: const tre_tnfa_t *tnfa, int *tags, int match_eo)
69: {
70: tre_submatch_data_t *submatch_data;
71: unsigned int i, j;
72: int *parents;
73:
74: i = 0;
75: if (match_eo >= 0 && !(cflags & REG_NOSUB))
76: {
77: /* Construct submatch offsets from the tags. */
78: DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
79: submatch_data = tnfa->submatch_data;
80: while (i < tnfa->num_submatches && i < nmatch)
81: {
82: if (submatch_data[i].so_tag == tnfa->end_tag)
83: pmatch[i].rm_so = match_eo; /* the end tag's value is `match_eo`, not an entry of `tags` */
84: else
85: pmatch[i].rm_so = tags[submatch_data[i].so_tag];
86:
87: if (submatch_data[i].eo_tag == tnfa->end_tag)
88: pmatch[i].rm_eo = match_eo;
89: else
90: pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
91:
92: /* If either of the endpoints were not used, this submatch
93: was not part of the match. */
94: if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
95: pmatch[i].rm_so = pmatch[i].rm_eo = -1;
96:
97: DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
98: submatch_data[i].so_tag, pmatch[i].rm_so,
99: submatch_data[i].eo_tag, pmatch[i].rm_eo));
100: i++;
101: }
102: /* Reset all submatches that are not within all of their parent
103: submatches. */
104: i = 0;
105: while (i < tnfa->num_submatches && i < nmatch)
106: {
107: if (pmatch[i].rm_eo == -1)
108: assert(pmatch[i].rm_so == -1);
109: assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
110:
111: parents = submatch_data[i].parents;
112: if (parents != NULL)
113: for (j = 0; parents[j] >= 0; j++) /* the parent list ends at the first negative entry */
114: {
115: DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
116: if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
117: || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
118: pmatch[i].rm_so = pmatch[i].rm_eo = -1;
119: }
120: i++;
121: }
122: }
123:
124: while (i < nmatch) /* `i` is 0 here when the block above was skipped, so every entry is cleared */
125: {
126: pmatch[i].rm_so = -1;
127: pmatch[i].rm_eo = -1;
128: i++;
129: }
130: }
131:
132:
133: /*
134: Wrapper functions for POSIX compatible regexp matching.
135: */
136:
137: /* Returns the have_backrefs flag of the TNFA stored in `preg`. */
138: int tre_have_backrefs(const regex_t *preg)
139: {
140:   const tre_tnfa_t *compiled = (const tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
141:   return compiled->have_backrefs;
142: }
143:
144: /* Returns the have_approx flag of the TNFA stored in `preg`. */
145: int tre_have_approx(const regex_t *preg)
146: {
147:   const tre_tnfa_t *compiled = (const tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
148:   return compiled->have_approx;
149: }
150:
151: /* Runs the matcher selected by `tnfa` and `eflags` over `string` and returns
152:    its status; on REG_OK the first `nmatch` entries of `pmatch` are filled in.
153:    Returns REG_ESPACE when the tag array cannot be allocated. */
154: static int
155: tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
156:           tre_str_type_t type, size_t nmatch, regmatch_t pmatch[], int eflags)
157: {
158:   reg_errcode_t status;
159:   int *tags = NULL, eo;
160:   if (tnfa->num_tags > 0 && nmatch > 0)
161:     {
162: #ifdef TRE_USE_ALLOCA
163:       tags = (int*)alloca(sizeof(*tags) * tnfa->num_tags);
164: #else /* !TRE_USE_ALLOCA */
165:       tags = (int*)xmalloc(sizeof(*tags) * tnfa->num_tags);
166: #endif /* !TRE_USE_ALLOCA */
167:       if (tags == NULL)
168:         return REG_ESPACE;
169:     }
170:
171:   /* Dispatch to the appropriate matcher. */
172:   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
173:     {
174:       /* Back references need the backtracking matcher, which in turn needs rewind and compare from a user-supplied stream. */
175:       const tre_str_source *source = (const tre_str_source*)string;
176:       /* Report the failure through `status` instead of returning here, so that a heap-allocated `tags` is released below. */
177:       if (type == STR_USER
178:           && (source->rewind == NULL || source->compare == NULL))
179:         status = REG_BADPAT;
180:       else
181:         status = tre_tnfa_run_backtrack(tnfa, string, len, type,
182:                                         tags, eflags, &eo);
183:     }
184: #ifdef TRE_APPROX
185:   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
186:     {
187:       /* The regex uses approximate matching, use the approximate matcher. */
188:       regamatch_t match;
189:       regaparams_t params;
190:       regaparams_default(&params);
191:       params.max_err = 0;
192:       params.max_cost = 0;
193:       status = tre_tnfa_run_approx(tnfa, string, len, type, tags,
194:                                    &match, params, eflags, &eo);
195:     }
196: #endif /* TRE_APPROX */
197:   else
198:     {
199:       /* Exact matching, no back references, use the parallel matcher. */
200:       status = tre_tnfa_run_parallel(tnfa, string, len, type,
201:                                      tags, eflags, &eo);
202:     }
203:
204:   if (status == REG_OK)
205:     /* A match was found, so fill the submatch registers. */
206:     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
207: #ifndef TRE_USE_ALLOCA
208:   if (tags)
209:     xfree(tags);
210: #endif /* !TRE_USE_ALLOCA */
211:   return status;
212: }
213:
214: /* Matches `str` (length argument `len`) against `preg`: the input is handled as STR_BYTE when TRE_MB_CUR_MAX is 1 and as STR_MBS otherwise. */
215: int regnexec(const regex_t *preg, const char *str, size_t len,
216:              size_t nmatch, regmatch_t pmatch[], int eflags)
217: {
218:   const tre_tnfa_t *compiled = (const tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
219:   const tre_str_type_t input_kind
220:     = (TRE_MB_CUR_MAX != 1) ? STR_MBS : STR_BYTE;
221:   return tre_match(compiled, str, len, input_kind, nmatch, pmatch, eflags);
222: }
223:
224: /* POSIX regexec(): forwards to regnexec() with the length argument set to -1 (converted to size_t). */
225: int regexec(const regex_t *preg, const char *str,
226:             size_t nmatch, regmatch_t pmatch[], int eflags)
227: {
228:   return regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
229: }
230:
231:
232:
233:
234: /* Wide-character variant of regnexec(): the input is always passed to tre_match() as STR_WIDE. */
235: int regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
236:               size_t nmatch, regmatch_t pmatch[], int eflags)
237: {
238:   const tre_tnfa_t *compiled = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
239:   return tre_match(compiled, str, len, STR_WIDE, nmatch, pmatch, eflags);
240: }
241:
242: /* Wide-character regexec(): forwards to regwnexec() with the length argument set to -1 (converted to size_t). */
243: int regwexec(const regex_t *preg, const wchar_t *str,
244:              size_t nmatch, regmatch_t pmatch[], int eflags)
245: {
246:   return regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
247: }
248:
249:
250:
251: /* Matches input supplied through a tre_str_source: calls tre_match() with type STR_USER and a length argument of -1. */
252: int reguexec(const regex_t *preg, const tre_str_source *str,
253:              size_t nmatch, regmatch_t pmatch[], int eflags)
254: {
255:   const tre_tnfa_t *compiled = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
256:   return tre_match(compiled, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags);
257: }
258:
259:
260:
261:
262: /*
263: Wrapper functions for approximate regexp matching.
264: */
265:
266: /* Approximate-matching counterpart of tre_match(): falls back to the exact matcher when no approximate feature is requested,
267:    rejects regexps with back references (REG_BADPAT), otherwise runs tre_tnfa_run_approx() and fills match->pmatch on REG_OK. */
268: static int tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
269:                             tre_str_type_t type, regamatch_t *match, regaparams_t params, int eflags)
270: {
271:   reg_errcode_t status;
272:   int *tags = NULL, eo;
273:
274:   /* If the regexp does not use approximate matching features, the
275:      maximum cost is zero, and the approximate matcher isn't forced,
276:      use the exact matcher instead. */
277:   if (params.max_cost == 0 && !tnfa->have_approx
278:       && !(eflags & REG_APPROX_MATCHER))
279:     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
280:                      eflags);
281:
282:   /* Back references are not supported by the approximate matcher. */
283:   if (tnfa->have_backrefs)
284:     return REG_BADPAT;
285:
286: #if 0
287:   {
288:     int pos;
289:     tre_filter_profile_t profile[] =
290:       { {'b', 2}, {'e', 3}, {'l', 1}, {'B', 1}, {'r', 1},
291:         {'o', 1}, {'x', 1}, {0, 0} };
292:     tre_filter_t filter = { 10, profile };
293:     pos = tre_filter_find((unsigned char *)string, len, &filter);
294:     if (pos < 0)
295:       return REG_NOMATCH;
296:   }
297: #endif
298:
299:   if (tnfa->num_tags > 0 && match->nmatch > 0)
300:     {
301: #ifdef TRE_USE_ALLOCA /* same test as the #ifndef guarding xfree() below, so allocation and release always agree */
302:       tags = (int*)alloca(sizeof(*tags) * tnfa->num_tags);
303: #else /* !TRE_USE_ALLOCA */
304:       tags = (int*)xmalloc(sizeof(*tags) * tnfa->num_tags);
305: #endif /* !TRE_USE_ALLOCA */
306:       if (tags == NULL)
307:         return REG_ESPACE;
308:     }
309:   status = tre_tnfa_run_approx(tnfa, string, len, type, tags,
310:                                match, params, eflags, &eo);
311:   if (status == REG_OK)
312:     tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
313: #ifndef TRE_USE_ALLOCA
314:   if (tags)
315:     xfree(tags);
316: #endif /* !TRE_USE_ALLOCA */
317:   return status;
318: }
319:
320: /* Approximate match of `str` (length argument `len`): the input is handled as STR_BYTE when TRE_MB_CUR_MAX is 1 and as STR_MBS otherwise. */
321: int reganexec(const regex_t *preg, const char *str, size_t len,
322:               regamatch_t *match, regaparams_t params, int eflags)
323: {
324:   const tre_tnfa_t *compiled = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
325:   const tre_str_type_t input_kind
326:     = (TRE_MB_CUR_MAX != 1) ? STR_MBS : STR_BYTE;
327:   return tre_match_approx(compiled, str, len, input_kind, match, params, eflags);
328: }
329:
330: /* Forwards to reganexec() with the length argument set to -1 (converted to size_t). */
331: int regaexec(const regex_t *preg, const char *str,
332:              regamatch_t *match, regaparams_t params, int eflags)
333: {
334:   return reganexec(preg, str, (size_t)-1, match, params, eflags);
335: }
336:
337:
338:
339: /* Wide-character approximate match: the input is always passed to tre_match_approx() as STR_WIDE. */
340: int regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
341:                regamatch_t *match, regaparams_t params, int eflags)
342: {
343:   const tre_tnfa_t *compiled = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
344:
345:   return tre_match_approx(compiled, str, len, STR_WIDE, match, params, eflags);
346: }
347:
348: /* Forwards to regawnexec() with the length argument set to -1 (converted to size_t). */
349: int regawexec(const regex_t *preg, const wchar_t *str,
350:               regamatch_t *match, regaparams_t params, int eflags)
351: {
352:   return regawnexec(preg, str, (size_t)-1, match, params, eflags);
353: }
354:
355:
356:
357: /* Resets `params`: all bytes zeroed, then a cost of 1 for each edit kind and INT_MAX for every limit. */
358: void regaparams_default(regaparams_t *params)
359: {
360:   memset(params, 0, sizeof(*params));
361:   /* Unit cost per insertion, deletion and substitution. */
362:   params->cost_ins = params->cost_del = params->cost_subst = 1;
363:   /* No effective limit on any counter. */
364:   params->max_ins = INT_MAX;
365:   params->max_del = INT_MAX;
366:   params->max_subst = INT_MAX;
367:   params->max_err = INT_MAX;
368:   params->max_cost = INT_MAX;
369: }
370:
371:
372:
373: /* EOF */
Start cpp section to tre/tre_regex.hpp[1
/1
]
1: #line 1106 "./lpsrc/tre.pak"
2: /*
3: regex.h - POSIX.2 compatible regexp interface and TRE extensions
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36: /* Include the system regex.h to make TRE ABI compatible with the
37: system regex. */
38:
39:
40:
41:
42: extern "C" {
43: #endif
44:
45: #ifdef TRE_USE_SYSTEM_REGEX_H
46:
47: #ifndef REG_OK
48: #define REG_OK 0
49: #endif /* !REG_OK */
50:
51: #ifndef HAVE_REG_ERRCODE_T
52: typedef int reg_errcode_t;
53: #endif /* !HAVE_REG_ERRCODE_T */
54:
55: #if !defined(REG_NOSPEC) && !defined(REG_LITERAL)
56: #define REG_LITERAL 0x1000
57: #endif
58:
59: /* Extra regcomp() flags. */
60: #define REG_RIGHT_ASSOC (REG_LITERAL << 1)
61:
62: /* Extra regexec() flags. */
63: #define REG_APPROX_MATCHER 0x1000
64: #define REG_BACKTRACKING_MATCHER (REG_APPROX_MATCHER << 1)
65:
66: #else /* !TRE_USE_SYSTEM_REGEX_H */
67:
68: /* If we're not using the system regex.h, we need to define the
69: structs and enums ourselves. */
70:
71: typedef int regoff_t; /* Type of the offsets stored in regmatch_t. */
72: typedef struct {
73: size_t re_nsub; /* Number of parenthesized subexpressions. */
74: void *value; /* For internal use only. NOTE(review): presumably the field TRE_REGEX_T_FIELD names -- confirm. */
75: } regex_t;
76:
77: typedef struct {
78: regoff_t rm_so; /* Start offset of the match; -1 when the entry is unused. */
79: regoff_t rm_eo; /* End offset of the match; -1 when the entry is unused. */
80: } regmatch_t;
81:
82:
83: typedef enum {
84: REG_OK = 0, /* No error. */
85: /* POSIX regcomp() return error codes. (In the order listed in the
86: standard.) */
87: REG_NOMATCH, /* No match. */
88: REG_BADPAT, /* Invalid regexp. */
89: REG_ECOLLATE, /* Unknown collating element. */
90: REG_ECTYPE, /* Unknown character class name. */
91: REG_EESCAPE, /* Trailing backslash. */
92: REG_ESUBREG, /* Invalid back reference. */
93: REG_EBRACK, /* "[]" imbalance */
94: REG_EPAREN, /* "\(\)" or "()" imbalance */
95: REG_EBRACE, /* "\{\}" or "{}" imbalance */
96: REG_BADBR, /* Invalid content of {} */
97: REG_ERANGE, /* Invalid use of range operator */
98: REG_ESPACE, /* Out of memory. */
99: REG_BADRPT /* Per POSIX: repetition operator not preceded by a valid expression. */
100: } reg_errcode_t;
101:
102: /* POSIX regcomp() flags. */
103: #define REG_EXTENDED 1
104: #define REG_ICASE (REG_EXTENDED << 1) /* 2 */
105: #define REG_NEWLINE (REG_ICASE << 1) /* 4 */
106: #define REG_NOSUB (REG_NEWLINE << 1) /* 8 */
107:
108: /* Extra regcomp() flags. */
109: #define REG_BASIC 0 /* no bit set, i.e. the absence of REG_EXTENDED */
110: #define REG_LITERAL (REG_NOSUB << 1) /* 16 */
111: #define REG_RIGHT_ASSOC (REG_LITERAL << 1) /* 32 */
112:
113: /* POSIX regexec() flags. */
114: #define REG_NOTBOL 1
115: #define REG_NOTEOL (REG_NOTBOL << 1) /* 2 */
116:
117: /* Extra regexec() flags. */
118: #define REG_APPROX_MATCHER (REG_NOTEOL << 1) /* 4 */
119: #define REG_BACKTRACKING_MATCHER (REG_APPROX_MATCHER << 1) /* 8 */
120:
121: #endif /* !TRE_USE_SYSTEM_REGEX_H */
122:
123: /* REG_NOSPEC and REG_LITERAL mean the same thing. */
124: #ifdef REG_LITERAL
125: #define REG_NOSPEC REG_LITERAL /* alias the name that is still missing */
126: #elif defined(REG_NOSPEC)
127: #define REG_LITERAL REG_NOSPEC
128: #endif /* defined(REG_NOSPEC) */
129:
130: /* The maximum number of iterations in a bound expression. */
131: #undef RE_DUP_MAX /* discard any earlier definition before redefining it */
132: #define RE_DUP_MAX 255
133:
134: /* The POSIX.2 regexp functions */
135: TRE_EXTERN int regcomp(regex_t *preg, const char *regex, int cflags); /* cflags: the regcomp() flags defined above */
136: TRE_EXTERN int regexec(const regex_t *preg, const char *string, size_t nmatch,
137: regmatch_t pmatch[], int eflags); /* eflags: the regexec() flags defined above */
138: TRE_EXTERN size_t regerror(int errcode, const regex_t *preg, char *errbuf,
139: size_t errbuf_size);
140: TRE_EXTERN void regfree(regex_t *preg);
141:
142: #ifdef TRE_WCHAR
143: #ifdef HAVE_WCHAR_H
144: #include <wchar.h>
145: #endif /* HAVE_WCHAR_H */
146:
147: /* Wide character versions (not in POSIX.2); TRE_EXTERN like regwncomp(). */
148: TRE_EXTERN int regwcomp(regex_t *preg, const wchar_t *regex, int cflags);
149: TRE_EXTERN int regwexec(const regex_t *preg, const wchar_t *string,
150:                         size_t nmatch, regmatch_t pmatch[], int eflags);
151: #endif /* TRE_WCHAR */
152:
153: /* Versions with a maximum length argument and therefore the capability to
154: handle null characters in the middle of the strings (not in POSIX.2). */
155: TRE_EXTERN int regncomp(regex_t *preg, const char *regex, size_t len,
156: int cflags); /* the Felix binding in lib/tre.flx calls this with $2.data(), $2.size() */
157: TRE_EXTERN int regnexec(const regex_t *preg, const char *string, size_t len,
158: size_t nmatch, regmatch_t pmatch[], int eflags);
159: #ifdef TRE_WCHAR
160: TRE_EXTERN int regwncomp(regex_t *preg, const wchar_t *regex, size_t len,
161: int cflags); /* wide-character counterpart of regncomp() */
162: TRE_EXTERN int regwnexec(const regex_t *preg, const wchar_t *string, size_t len,
163: size_t nmatch, regmatch_t pmatch[], int eflags); /* wide-character counterpart of regnexec() */
164: #endif /* TRE_WCHAR */
165:
166: #ifdef TRE_APPROX
167:
168: /* Approximate matching parameter struct; regaexec() and friends take it by value. */
169: typedef struct {
170: int cost_ins; /* Default cost of an inserted character. */
171: int cost_del; /* Default cost of a deleted character. */
172: int cost_subst; /* Default cost of a substituted character. */
173: int max_cost; /* Maximum allowed cost of a match. */
174: /* The fields below bound edit counts, the ones above deal in costs. */
175: int max_ins; /* Maximum allowed number of inserts. */
176: int max_del; /* Maximum allowed number of deletes. */
177: int max_subst; /* Maximum allowed number of substitutes. */
178: int max_err; /* Maximum allowed number of errors total. */
179: } regaparams_t;
180:
181: /* Approximate matching result struct; regaexec() and friends take a pointer to one. */
182: typedef struct {
183: size_t nmatch; /* Length of pmatch[] array. */
184: regmatch_t *pmatch; /* Submatch data. */
185: int cost; /* Cost of the match. */
186: int num_ins; /* Number of inserts in the match. */
187: int num_del; /* Number of deletes in the match. */
188: int num_subst; /* Number of substitutes in the match. */
189: } regamatch_t;
190:
191:
192: /* Approximate matching functions, marked TRE_EXTERN like the exact matchers. */
193: TRE_EXTERN int regaexec(const regex_t *preg, const char *string,
194:                         regamatch_t *match, regaparams_t params, int eflags);
195: TRE_EXTERN int reganexec(const regex_t *preg, const char *string, size_t len,
196:                          regamatch_t *match, regaparams_t params, int eflags);
197: #ifdef TRE_WCHAR
198: /* Wide character approximate matching. */
199: TRE_EXTERN int regawexec(const regex_t *preg, const wchar_t *string,
200:                          regamatch_t *match, regaparams_t params, int eflags);
201: TRE_EXTERN int regawnexec(const regex_t *preg, const wchar_t *string, size_t len,
202:                           regamatch_t *match, regaparams_t params, int eflags);
203: #endif /* TRE_WCHAR */
204:
205: /* Sets the parameters to default values. */
206: TRE_EXTERN void regaparams_default(regaparams_t *params);
207: #endif /* TRE_APPROX */
208:
209: #ifdef TRE_WCHAR
210: typedef wchar_t tre_char_t; /* character type written by tre_str_source.get_next_char */
211: #else /* !TRE_WCHAR */
212: typedef unsigned char tre_char_t; /* narrow build: one byte per character */
213: #endif /* !TRE_WCHAR */
214:
215: typedef struct { /* callback bundle consumed by reguexec() */
216: int (*get_next_char)(tre_char_t *c, unsigned int *pos_add, void *context); /* presumably stores the next character in *c -- confirm return convention */
217: void (*rewind)(size_t pos, void *context); /* presumably repositions the source at `pos' -- confirm */
218: int (*compare)(size_t pos1, size_t pos2, size_t len, void *context); /* presumably compares two `len'-long spans -- confirm */
219: void *context; /* passed back unchanged as the last argument of each callback */
220: } tre_str_source;
221: /* Matcher over a tre_str_source; marked TRE_EXTERN like the other matchers. */
222: TRE_EXTERN int reguexec(const regex_t *preg, const tre_str_source *string,
223:                         size_t nmatch, regmatch_t pmatch[], int eflags);
224:
225: /* Returns the version string. The returned string is static. */
226: TRE_EXTERN char *tre_version(void);
227:
228: /* Returns the value for a config parameter. The type to which `result'
229: must point to depends of the value of `query', see documentation for
230: more details. */
231: TRE_EXTERN int tre_config(int query, void *result);
232:
233: enum { /* presumably the accepted values of tre_config()'s `query' argument -- confirm */
234: TRE_CONFIG_APPROX, /* 0 */
235: TRE_CONFIG_WCHAR, /* 1 */
236: TRE_CONFIG_MULTIBYTE, /* 2 */
237: TRE_CONFIG_SYSTEM_ABI, /* 3 */
238: TRE_CONFIG_VERSION /* 4 */
239: };
240:
241: /* Returns 1 if the compiled pattern has back references, 0 if not. */
242: TRE_EXTERN int tre_have_backrefs(const regex_t *preg);
243:
244: /* Returns 1 if the compiled pattern uses approximate matching features,
245: 0 if not. */
246: TRE_EXTERN int tre_have_approx(const regex_t *preg);
247:
248: #ifdef __cplusplus
249: }
250:
251:
252:
253: /* EOF */
Start cpp section to tre/tre_ast.cpp[1
/1
]
1: #line 1360 "./lpsrc/tre.pak"
2: /*
3: tre-ast.c - Abstract syntax tree (AST) routines
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27: /* Allocate a generic AST node of `type' plus `size' bytes for its payload. */
28: tre_ast_node_t *
29: tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
30: {
31:   /* Both pieces come from `mem'; NULL is returned if either one fails. */
32:   tre_ast_node_t *fresh =
33:     (tre_ast_node_t*)tre_mem_calloc(mem, sizeof(tre_ast_node_t));
34:   if (fresh == NULL)
35:     return NULL;
36:   fresh->obj = tre_mem_calloc(mem, size);
37:   if (fresh->obj == NULL)
38:     return NULL;
39:   /* Callers see -1 in these two fields until something assigns them. */
40:   fresh->submatch_id = -1;
41:   fresh->nullable = -1;
42:   fresh->type = type;
43:   return fresh;
44: }
45: /* Build a LITERAL node holding code_min, code_max and `position'. */
46: tre_ast_node_t *
47: tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
48: {
49:   tre_literal_t *payload;
50:   tre_ast_node_t *result =
51:     tre_ast_new_node(mem, LITERAL, sizeof(tre_literal_t));
52:
53:   if (result == NULL)
54:     return NULL;
55:   /* obj was sized for a tre_literal_t by the call above. */
56:   payload = (tre_literal_t*)result->obj;
57:   payload->position = position;
58:   payload->code_max = code_max;
59:   payload->code_min = code_min;
60:   return result;
61: }
62: /* Build an ITERATION node wrapping `arg' with the given min/max/minimal. */
63: tre_ast_node_t *
64: tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
65:                  int minimal)
66: {
67:   tre_iteration_t *rep;
68:   tre_ast_node_t *result =
69:     tre_ast_new_node(mem, ITERATION, sizeof(tre_iteration_t));
70:
71:   if (result == NULL)
72:     return NULL;
73:   /* The wrapper reports the same submatch count as what it wraps. */
74:   result->num_submatches = arg->num_submatches;
75:   rep = (tre_iteration_t*)result->obj;
76:   rep->minimal = minimal;
77:   rep->max = max;
78:   rep->min = min;
79:   rep->arg = arg;
80:   return result;
81: }
82: /* Build a UNION node whose two alternatives are `left' and `right'. */
83: tre_ast_node_t *
84: tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
85: {
86:   tre_union_t *alt;
87:   tre_ast_node_t *result = tre_ast_new_node(mem, UNION, sizeof(tre_union_t));
88:
89:   if (!result)
90:     return NULL;
91:   alt = (tre_union_t *)result->obj;
92:   alt->left = left;
93:   alt->right = right;
94:   result->num_submatches = left->num_submatches + right->num_submatches;
95:   return result;
96: }
97: /* Build a CATENATION node with `left' and `right' as its two children. */
98: tre_ast_node_t *
99: tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
100:                        tre_ast_node_t *right)
101: {
102:   tre_catenation_t *seq;
103:   tre_ast_node_t *result =
104:     tre_ast_new_node(mem, CATENATION, sizeof(tre_catenation_t));
105:   if (!result)
106:     return NULL;
107:   seq = (tre_catenation_t *)result->obj;
108:   seq->left = left;
109:   seq->right = right;
110:   result->num_submatches = left->num_submatches + right->num_submatches;
111:   return result;
112: }
113:
114:
115: /* Write `i' spaces to `stream'; nothing is written when i <= 0. */
116: static void
117: tre_findent(FILE *stream, int i)
118: {
119:   for (; i > 0; i--)
120:     fputc(' ', stream);
121: }
122: /* Emit the TRE_PARAM_LAST entries of `params' through DPRINT as "params [...]". */
123: void
124: tre_print_params(int *params)
125: {
126:   int idx;
127:   if (params == NULL)
128:     return; /* a NULL array produces no output at all */
129:   DPRINT(("params ["));
130:   for (idx = 0; idx < TRE_PARAM_LAST; idx++)
131:     {
132:       int value = params[idx];
133:       if (idx > 0) /* separator goes between entries, never after the last */
134:         DPRINT((", "));
135:       if (value == TRE_PARAM_UNSET)
136:         DPRINT(("unset"));
137:       else if (value == TRE_PARAM_DEFAULT)
138:         DPRINT(("default"));
139:       else
140:         DPRINT(("%d", value));
141:     }
142:   DPRINT(("]"));
143: }
144: /* Recursively dump `ast' to `stream'; children are indented 2 more columns. */
145: static void
146: tre_do_print(FILE *stream, tre_ast_node_t *ast, int indent)
147: {
148:   int code_min, code_max, pos;
149:   int num_tags = ast->num_tags;
150:   tre_literal_t *lit;
151:   tre_iteration_t *iter;
152:   /* `obj' is a void *, so each use below is cast explicitly (required in C++). */
153:   tre_findent(stream, indent);
154:   switch (ast->type)
155:     {
156:     case LITERAL:
157:       lit = (tre_literal_t *)ast->obj;
158:       code_min = (int)lit->code_min; /* stored as long, printed as int */
159:       code_max = (int)lit->code_max;
160:       pos = lit->position;
161:       if (IS_EMPTY(lit))
162:         {
163:           fprintf(stream, "literal empty\n");
164:         }
165:       else if (IS_ASSERTION(lit))
166:         {
167:           int i;
168:           const char *assertions[] = { "bol", "eol", "ctype", "!ctype",
169:                                        "bow", "eow", "wb", "!wb" };
170:           if (code_max >= ASSERT_LAST << 1)
171:             assert(0);
172:           fprintf(stream, "assertions: ");
173:           for (i = 0; (1 << i) <= ASSERT_LAST; i++) /* one name per set bit */
174:             if (code_max & (1 << i))
175:               fprintf(stream, "%s ", assertions[i]);
176:           fprintf(stream, "\n");
177:         }
178:       else if (IS_TAG(lit))
179:         {
180:           fprintf(stream, "tag %d\n", code_max);
181:         }
182:       else if (IS_BACKREF(lit))
183:         {
184:           fprintf(stream, "backref %d, pos %d\n", code_max, pos);
185:         }
186:       else if (IS_PARAMETER(lit))
187:         {
188:           tre_print_params((int *)lit->u.params); /* field is unsigned int * */
189:           fprintf(stream, "\n");
190:         }
191:       else
192:         {
193:           fprintf(stream, "literal (%c, %c) (%d, %d), pos %d, sub %d, "
194:                   "%d tags\n", code_min, code_max, code_min, code_max, pos,
195:                   ast->submatch_id, num_tags);
196:         }
197:       break;
198:     case ITERATION:
199:       iter = (tre_iteration_t *)ast->obj;
200:       fprintf(stream, "iteration {%d, %d}, sub %d, %d tags, %s\n",
201:               iter->min, iter->max, ast->submatch_id, num_tags,
202:               iter->minimal ? "minimal" : "greedy");
203:       tre_do_print(stream, iter->arg, indent + 2);
204:       break;
205:     case UNION:
206:       fprintf(stream, "union, sub %d, %d tags\n", ast->submatch_id, num_tags);
207:       tre_do_print(stream, ((tre_union_t *)ast->obj)->left, indent + 2);
208:       tre_do_print(stream, ((tre_union_t *)ast->obj)->right, indent + 2);
209:       break;
210:     case CATENATION:
211:       fprintf(stream, "catenation, sub %d, %d tags\n", ast->submatch_id,
212:               num_tags);
213:       tre_do_print(stream, ((tre_catenation_t *)ast->obj)->left, indent + 2);
214:       tre_do_print(stream, ((tre_catenation_t *)ast->obj)->right, indent + 2);
215:       break;
216:     default:
217:       assert(0);
218:       break;
219:     }
220: }
221: /* Print `ast' to `stream', starting tre_do_print() at indentation 0. */
222: static void
223: tre_ast_fprint(FILE *stream, tre_ast_node_t *ast)
224: {
225: tre_do_print(stream, ast, 0);
226: }
227: /* Print an "AST:" heading followed by the whole tree on stdout. */
228: void
229: tre_ast_print(tre_ast_node_t *tree)
230: {
231: printf("AST:\n");
232: tre_ast_fprint(stdout, tree);
233: }
234:
235:
236:
237: /* EOF */
Start cpp section to tre/tre_ast.hpp[1
/1
]
1: #line 1598 "./lpsrc/tre.pak"
2: /*
3: tre-ast.h - Abstract syntax tree (AST) definitions
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19: */
20:
21:
22:
23:
24:
25:
26:
27:
28:
29: /* The different AST node types. */
30: typedef enum {
31: LITERAL, /* obj points to a tre_literal_t */
32: CATENATION, /* obj points to a tre_catenation_t */
33: ITERATION, /* obj points to a tre_iteration_t */
34: UNION /* obj points to a tre_union_t */
35: } tre_ast_type_t;
36:
37: /* Special subtypes of TRE_LITERAL. */
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52: /* A generic AST node. All AST nodes consist of this node on the top
53: level with `obj' pointing to the actual content. */
54: typedef struct {
55: tre_ast_type_t type; /* Type of the node. */
56: void *obj; /* Pointer to actual node. */
57: int nullable; /* tre_ast_new_node() starts this at -1 */
58: int submatch_id; /* -1 from tre_ast_new_node(); tre_add_tags() treats >= 0 as a submatch id */
59: int num_submatches; /* union/catenation: left + right; iteration: same as its arg */
60: int num_tags; /* assigned during the first pass of tre_add_tags() */
61: tre_pos_and_tags_t *firstpos;
62: tre_pos_and_tags_t *lastpos;
63: } tre_ast_node_t;
64:
65:
66: /* A "literal" node. These are created for assertions, back references,
67: tags, matching parameter settings, and all expressions that match one
68: character. */
69: typedef struct {
70: long code_min; /* tre_ast_new_literal() receives this and code_max as int */
71: long code_max;
72: int position;
73: union {
74: tre_ctype_t klass; /* presumably spelled `klass' because `class' is a C++ keyword */
75: unsigned int *params; /* what tre_do_print() hands to tre_print_params() for parameter literals */
76: } u;
77: tre_ctype_t *neg_klasses;
78: } tre_literal_t;
79:
80: /* A "catenation" node. These are created when two regexps are concatenated.
81: If there is more than one subexpression in sequence, the `left' part
82: holds all but the last, and the `right' part holds the last subexpression
83: (catenation is left associative). */
84: typedef struct {
85: tre_ast_node_t *left;
86: tre_ast_node_t *right;
87: } tre_catenation_t;
88:
89: /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
90: operators. */
91: typedef struct {
92: /* Subexpression to match. */
93: tre_ast_node_t *arg;
94: /* Minimum number of consecutive matches. */
95: int min;
96: /* Maximum number of consecutive matches. */
97: int max;
98: /* If 0, match as many characters as possible, if 1 match as few as
99: possible. Note that this does not always mean the same thing as
100: matching as many/few repetitions as possible. */
101: unsigned int minimal:1;
102: /* Approximate matching parameters (or NULL). tre_ast_new_iter() does not assign this field. */
103: unsigned int *params;
104: } tre_iteration_t;
105:
106: /* A "union" node. These are created for the "|" operator. */
107: typedef struct {
108: tre_ast_node_t *left;
109: tre_ast_node_t *right;
110: } tre_union_t;
111: /* Node constructors: each allocates from `mem' and returns NULL on failure. */
112: tre_ast_node_t *
113: tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
114: /* LITERAL node carrying code_min, code_max and position. */
115: tre_ast_node_t *
116: tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
117: /* ITERATION node around `arg'. */
118: tre_ast_node_t *
119: tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
120: int minimal);
121: /* UNION node over `left' and `right'. */
122: tre_ast_node_t *
123: tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
124: /* CATENATION node over `left' and `right'. */
125: tre_ast_node_t *
126: tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
127: tre_ast_node_t *right);
128:
129: /* Prints "AST:" and then the tree to stdout. */
130: void
131: tre_ast_print(tre_ast_node_t *tree);
132:
133: /* XXX - rethink AST printing API */
134: void
135: tre_print_params(int *params); /* NOTE(review): tre_literal_t.u.params is unsigned int *, so callers need a cast */
136:
137:
138:
139:
140: /* EOF */
Start cpp section to tre/tre_compile.cpp[1
/1
]
1: #line 1739 "./lpsrc/tre.pak"
2: /*
3: tre-compile.c - TRE regex compiler
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22: /*
23: TODO:
24: - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
25: function calls.
26: */
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43: /*
44: Algorithms to setup tags so that submatch addressing can be done.
45: */
46:
47:
48: /* Inserts a catenation node to the root of the tree given in `node'.
49: As the left child a new tag with number `tag_id' to `node' is added,
50: and the right child is the old root. */
51: static reg_errcode_t
52: tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
53: {
54:   tre_catenation_t *c;
55:   DPRINT(("add_tag_left: tag %d\n", tag_id));
56:   /* Returns REG_ESPACE if any allocation fails, REG_OK otherwise. */
57:   c = (tre_catenation_t*)tre_mem_alloc(mem, sizeof(*c));
58:   if (c == NULL)
59:     return REG_ESPACE;
60:   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
61:   if (c->left == NULL)
62:     return REG_ESPACE;
63:   c->right = (tre_ast_node_t*)tre_mem_alloc(mem, sizeof(tre_ast_node_t));
64:   if (c->right == NULL)
65:     return REG_ESPACE;
66:   /* The old root's payload moves into the new right child. */
67:   c->right->obj = node->obj;
68:   c->right->type = node->type;
69:   c->right->nullable = -1;
70:   c->right->submatch_id = -1;
71:   c->right->firstpos = NULL;
72:   c->right->lastpos = NULL;
73:   c->right->num_tags = 0;
74:   c->right->num_submatches = 0; /* presumably tre_mem_alloc does not zero-fill, so set every field */
75:   node->obj = c; /* `node' itself is reused as the new catenation root */
76:   node->type = CATENATION;
77:   return REG_OK;
78: }
79:
80: /* Inserts a catenation node to the root of the tree given in `node'.
81: As the right child a new tag with number `tag_id' to `node' is added,
82: and the left child is the old root. */
83: static reg_errcode_t
84: tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
85: {
86:   tre_catenation_t *c;
87:   DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
88:   /* Returns REG_ESPACE if any allocation fails, REG_OK otherwise. */
89:   c = (tre_catenation_t*)tre_mem_alloc(mem, sizeof(*c));
90:   if (c == NULL)
91:     return REG_ESPACE;
92:   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
93:   if (c->right == NULL)
94:     return REG_ESPACE;
95:   c->left = (tre_ast_node_t*)tre_mem_alloc(mem, sizeof(tre_ast_node_t));
96:   if (c->left == NULL)
97:     return REG_ESPACE;
98:   /* The old root's payload moves into the new left child. */
99:   c->left->obj = node->obj;
100:   c->left->type = node->type;
101:   c->left->nullable = -1;
102:   c->left->submatch_id = -1;
103:   c->left->firstpos = NULL;
104:   c->left->lastpos = NULL;
105:   c->left->num_tags = 0;
106:   c->left->num_submatches = 0; /* presumably tre_mem_alloc does not zero-fill, so set every field */
107:   node->obj = c; /* `node' itself is reused as the new catenation root */
108:   node->type = CATENATION;
109:   return REG_OK;
110: }
111: /* Work-item kinds that tre_add_tags() pushes on, and pops off, its stack. */
112: typedef enum {
113: ADDTAGS_RECURSE, /* pop a node and process it */
114: ADDTAGS_AFTER_ITERATION, /* pop the iteration node and its saved values */
115: ADDTAGS_AFTER_UNION_LEFT, /* hide the current regset entries from the right operand */
116: ADDTAGS_AFTER_UNION_RIGHT, /* pop both children, the union node, regset and the two tags */
117: ADDTAGS_AFTER_CAT_LEFT, /* pop the reserved tag and the saved next_tag */
118: ADDTAGS_AFTER_CAT_RIGHT, /* pop the catenation node; the first pass sums its children's num_tags */
119: ADDTAGS_SET_SUBMATCH_END /* pop a submatch id and add its end to regset */
120: } tre_addtags_symbol_t;
121:
122: /* tre_add_tags() allocates num_submatches + 1 of these and starts every .tag at -1. */
123: typedef struct {
124: int tag;
125: int next_tag;
126: } tre_tag_states_t;
127:
128: /* Adds tags to appropriate locations in the parse tree in `tree', so that
129: subexpressions marked for submatch addressing can be traced. */
130: static reg_errcode_t
131: tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
132: tre_tnfa_t *tnfa)
133: {
134: reg_errcode_t status = REG_OK;
135: tre_addtags_symbol_t symbol;
136: tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
137: int bottom = tre_stack_num_objects(stack);
138: /* True for first pass (counting number of needed tags) */
139: int first_pass = (mem == NULL || tnfa == NULL);
140: int *regset, *orig_regset;
141: int num_tags = 0; /* Total number of tags. */
142: int num_minimals = 0; /* Number of special minimal tags. */
143: int tag = 0; /* The tag that is to be added next. */
144: int next_tag = 1; /* Next tag to use after this one. */
145: int *parents; /* Stack of submatches the current submatch is
146: contained in. */
147: int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
148: tre_tag_states_t *saved_states;
149:
150: tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
151: if (!first_pass)
152: {
153: tnfa->end_tag = 0;
154: tnfa->minimal_tags[0] = -1;
155: }
156:
157: regset = (int*)xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
158: if (regset == NULL)
159: return REG_ESPACE;
160: regset[0] = -1;
161: orig_regset = regset;
162:
163: parents = (int*)xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
164: if (parents == NULL)
165: {
166: xfree(regset);
167: return REG_ESPACE;
168: }
169: parents[0] = -1;
170:
171: saved_states = (tre_tag_states_t*)xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
172: if (saved_states == NULL)
173: {
174: xfree(regset);
175: xfree(parents);
176: return REG_ESPACE;
177: }
178: else
179: {
180: unsigned int i;
181: for (i = 0; i <= tnfa->num_submatches; i++)
182: saved_states[i].tag = -1;
183: }
184:
185: STACK_PUSH(stack, node);
186: STACK_PUSH(stack, ADDTAGS_RECURSE);
187:
188: while (tre_stack_num_objects(stack) > bottom)
189: {
190: if (status != REG_OK)
191: break;
192:
193: symbol = (tre_addtags_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
194: switch (symbol)
195: {
196:
197: case ADDTAGS_SET_SUBMATCH_END:
198: {
199: int id = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
200: int i;
201:
202: /* Add end of this submatch to regset. */
203: for (i = 0; regset[i] >= 0; i++);
204: regset[i] = id * 2 + 1;
205: regset[i + 1] = -1;
206:
207: /* Pop this submatch from the parents stack. */
208: for (i = 0; parents[i] >= 0; i++);
209: parents[i - 1] = -1;
210: break;
211: }
212:
213: case ADDTAGS_RECURSE:
214: node = (tre_ast_node_t*)tre_stack_pop(stack);
215:
216: if (node->submatch_id >= 0)
217: {
218: int id = node->submatch_id;
219: int i;
220:
221:
222: /* Add start of this submatch to regset. */
223: for (i = 0; regset[i] >= 0; i++);
224: regset[i] = id * 2;
225: regset[i + 1] = -1;
226:
227: if (!first_pass)
228: {
229: for (i = 0; parents[i] >= 0; i++);
230: tnfa->submatch_data[id].parents = NULL;
231: if (i > 0)
232: {
233: int *p = (int*)xmalloc(sizeof(*p) * (i + 1));
234: if (p == NULL)
235: {
236: status = REG_ESPACE;
237: break;
238: }
239: assert(tnfa->submatch_data[id].parents == NULL);
240: tnfa->submatch_data[id].parents = p;
241: for (i = 0; parents[i] >= 0; i++)
242: p[i] = parents[i];
243: p[i] = -1;
244: }
245: }
246:
247: /* Add end of this submatch to regset after processing this
248: node. */
249: STACK_PUSHX(stack, node->submatch_id);
250: STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END);
251: }
252:
253: switch (node->type)
254: {
255: case LITERAL:
256: {
257: tre_literal_t *lit = (tre_literal_t*)node->obj;
258:
259: if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
260: {
261: int i;
262: DPRINT(("Literal %d-%d\n",
263: (int)lit->code_min, (int)lit->code_max));
264: if (regset[0] >= 0)
265: {
266: /* Regset is not empty, so add a tag before the
267: literal or backref. */
268: if (!first_pass)
269: {
270: status = tre_add_tag_left(mem, node, tag);
271: tnfa->tag_directions[tag] = direction;
272: if (minimal_tag >= 0)
273: {
274: DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
275: for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
276: tnfa->minimal_tags[i] = tag;
277: tnfa->minimal_tags[i + 1] = minimal_tag;
278: tnfa->minimal_tags[i + 2] = -1;
279: minimal_tag = -1;
280: num_minimals++;
281: }
282: /* Go through the regset and set submatch data for
283: submatches that are using this tag. */
284: for (i = 0; regset[i] >= 0; i++)
285: {
286: int id = regset[i] / 2;
287: int start = !(regset[i] % 2);
288: DPRINT((" Using tag %d for %s offset of "
289: "submatch %d\n", tag,
290: start ? "start" : "end", id));
291: if (start)
292: tnfa->submatch_data[id].so_tag = tag;
293: else
294: tnfa->submatch_data[id].eo_tag = tag;
295: }
296: }
297: else
298: {
299: DPRINT((" num_tags = 1\n"));
300: node->num_tags = 1;
301: }
302:
303: DPRINT((" num_tags++\n"));
304: regset[0] = -1;
305: tag = next_tag;
306: num_tags++;
307: next_tag++;
308: }
309: }
310: else
311: {
312: assert(!IS_TAG(lit));
313: }
314: break;
315: }
316: case CATENATION:
317: {
318: tre_catenation_t *cat = (tre_catenation_t*)node->obj;
319: tre_ast_node_t *left = (tre_ast_node_t*)cat->left;
320: tre_ast_node_t *right = (tre_ast_node_t*)cat->right;
321: int reserved_tag = -1;
322: DPRINT(("Catenation, next_tag = %d\n", next_tag));
323:
324:
325: /* After processing right child. */
326: STACK_PUSHX(stack, node);
327: STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT);
328:
329: /* Process right child. */
330: STACK_PUSHX(stack, right);
331: STACK_PUSHX(stack, ADDTAGS_RECURSE);
332:
333: /* After processing left child. */
334: STACK_PUSHX(stack, next_tag + left->num_tags);
335: DPRINT((" Pushing %d for after left\n",
336: next_tag + left->num_tags));
337: if (left->num_tags > 0 && right->num_tags > 0)
338: {
339: /* Reserve the next tag to the right child. */
340: DPRINT((" Reserving next_tag %d to right child\n",
341: next_tag));
342: reserved_tag = next_tag;
343: next_tag++;
344: }
345: STACK_PUSHX(stack, reserved_tag);
346: STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT);
347:
348: /* Process left child. */
349: STACK_PUSHX(stack, left);
350: STACK_PUSHX(stack, ADDTAGS_RECURSE);
351:
352: }
353: break;
354: case ITERATION:
355: {
356: tre_iteration_t *iter = (tre_iteration_t*)node->obj;
357: DPRINT(("Iteration\n"));
358:
359: if (first_pass)
360: {
361: STACK_PUSHX(stack, regset[0] >= 0 || iter->minimal);
362: }
363: else
364: {
365: STACK_PUSHX(stack, tag);
366: STACK_PUSHX(stack, iter->minimal);
367: }
368: STACK_PUSHX(stack, node);
369: STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION);
370:
371: STACK_PUSHX(stack, iter->arg);
372: STACK_PUSHX(stack, ADDTAGS_RECURSE);
373:
374: /* Regset is not empty, so add a tag here. */
375: if (regset[0] >= 0 || iter->minimal)
376: {
377: if (!first_pass)
378: {
379: int i;
380: status = tre_add_tag_left(mem, node, tag);
381: if (iter->minimal)
382: tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
383: else
384: tnfa->tag_directions[tag] = direction;
385: if (minimal_tag >= 0)
386: {
387: DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
388: for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
389: tnfa->minimal_tags[i] = tag;
390: tnfa->minimal_tags[i + 1] = minimal_tag;
391: tnfa->minimal_tags[i + 2] = -1;
392: minimal_tag = -1;
393: num_minimals++;
394: }
395: /* Go through the regset and set submatch data for
396: submatches that are using this tag. */
397: for (i = 0; regset[i] >= 0; i++)
398: {
399: int id = regset[i] / 2;
400: int start = !(regset[i] % 2);
401: DPRINT((" Using tag %d for %s offset of "
402: "submatch %d\n", tag,
403: start ? "start" : "end", id));
404: if (start)
405: tnfa->submatch_data[id].so_tag = tag;
406: else
407: tnfa->submatch_data[id].eo_tag = tag;
408: }
409: }
410:
411: DPRINT((" num_tags++\n"));
412: regset[0] = -1;
413: tag = next_tag;
414: num_tags++;
415: next_tag++;
416: }
417: direction = TRE_TAG_MINIMIZE;
418: }
419: break;
420: case UNION:
421: {
422: tre_union_t *uni = (tre_union_t*)node->obj;
423: tre_ast_node_t *left = (tre_ast_node_t*)uni->left;
424: tre_ast_node_t *right = (tre_ast_node_t*)uni->right;
425: int left_tag;
426: int right_tag;
427:
428: if (regset[0] >= 0)
429: {
430: left_tag = next_tag;
431: right_tag = next_tag + 1;
432: }
433: else
434: {
435: left_tag = tag;
436: right_tag = next_tag;
437: }
438:
439: DPRINT(("Union\n"));
440:
441: /* After processing right child. */
442: STACK_PUSHX(stack, right_tag);
443: STACK_PUSHX(stack, left_tag);
444: STACK_PUSHX(stack, regset);
445: STACK_PUSHX(stack, regset[0] >= 0);
446: STACK_PUSHX(stack, node);
447: STACK_PUSHX(stack, right);
448: STACK_PUSHX(stack, left);
449: STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT);
450:
451: /* Process right child. */
452: STACK_PUSHX(stack, right);
453: STACK_PUSHX(stack, ADDTAGS_RECURSE);
454:
455: /* After processing left child. */
456: STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT);
457:
458: /* Process left child. */
459: STACK_PUSHX(stack, left);
460: STACK_PUSHX(stack, ADDTAGS_RECURSE);
461:
462: /* Regset is not empty, so add a tag here. */
463: if (regset[0] >= 0)
464: {
465: if (!first_pass)
466: {
467: int i;
468: status = tre_add_tag_left(mem, node, tag);
469: tnfa->tag_directions[tag] = direction;
470: if (minimal_tag >= 0)
471: {
472: DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
473: for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
474: tnfa->minimal_tags[i] = tag;
475: tnfa->minimal_tags[i + 1] = minimal_tag;
476: tnfa->minimal_tags[i + 2] = -1;
477: minimal_tag = -1;
478: num_minimals++;
479: }
480: /* Go through the regset and set submatch data for
481: submatches that are using this tag. */
482: for (i = 0; regset[i] >= 0; i++)
483: {
484: int id = regset[i] / 2;
485: int start = !(regset[i] % 2);
486: DPRINT((" Using tag %d for %s offset of "
487: "submatch %d\n", tag,
488: start ? "start" : "end", id));
489: if (start)
490: tnfa->submatch_data[id].so_tag = tag;
491: else
492: tnfa->submatch_data[id].eo_tag = tag;
493: }
494: }
495:
496: DPRINT((" num_tags++\n"));
497: regset[0] = -1;
498: tag = next_tag;
499: num_tags++;
500: next_tag++;
501: }
502:
503: if (node->num_submatches > 0)
504: {
505: /* The next two tags are reserved for markers. */
506: next_tag++;
507: tag = next_tag;
508: next_tag++;
509: }
510:
511: break;
512: }
513: }
514:
515: if (node->submatch_id >= 0)
516: {
517: int i;
518: /* Push this submatch on the parents stack. */
519: for (i = 0; parents[i] >= 0; i++);
520: parents[i] = node->submatch_id;
521: parents[i + 1] = -1;
522: }
523:
524: break; /* end case: ADDTAGS_RECURSE */
525:
526: case ADDTAGS_AFTER_ITERATION:
527: {
528: int minimal = 0;
529: int enter_tag;
530: node = (tre_ast_node_t*)tre_stack_pop(stack);
531: if (first_pass)
532: {
533: node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
534: + (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
535: minimal_tag = -1;
536: }
537: else
538: {
539: minimal = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
540: enter_tag = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
541: if (minimal)
542: minimal_tag = enter_tag;
543: }
544:
545: DPRINT(("After iteration\n"));
546: if (!first_pass)
547: {
548: DPRINT((" Setting direction to %s\n",
549: minimal ? "minimize" : "maximize"));
550: if (minimal)
551: direction = TRE_TAG_MINIMIZE;
552: else
553: direction = TRE_TAG_MAXIMIZE;
554: }
555: break;
556: }
557:
558: case ADDTAGS_AFTER_CAT_LEFT:
559: {
560: int new_tag = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
561: next_tag = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
562: DPRINT(("After cat left, tag = %d, next_tag = %d\n",
563: tag, next_tag));
564: if (new_tag >= 0)
565: {
566: DPRINT((" Setting tag to %d\n", new_tag));
567: tag = new_tag;
568: }
569: break;
570: }
571:
572: case ADDTAGS_AFTER_CAT_RIGHT:
573: DPRINT(("After cat right\n"));
574: node = (tre_ast_node_t*)tre_stack_pop(stack);
575: if (first_pass)
576: node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
577: + ((tre_catenation_t *)node->obj)->right->num_tags;
578: break;
579:
580: case ADDTAGS_AFTER_UNION_LEFT:
581: DPRINT(("After union left\n"));
582: /* Lift the bottom of the `regset' array so that when processing
583: the right operand the items currently in the array are
584: invisible. The original bottom was saved at ADDTAGS_UNION and
585: will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
586: while (*regset >= 0)
587: regset++;
588: break;
589:
590: case ADDTAGS_AFTER_UNION_RIGHT:
591: {
592: int added_tags, tag_left, tag_right;
593: tre_ast_node_t *left = (tre_ast_node_t*)tre_stack_pop(stack);
594: tre_ast_node_t *right = (tre_ast_node_t*)tre_stack_pop(stack);
595: DPRINT(("After union right\n"));
596: node = (tre_ast_node_t*)tre_stack_pop(stack);
597: added_tags = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
598: if (first_pass)
599: {
600: node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
601: + ((tre_union_t *)node->obj)->right->num_tags + added_tags
602: + ((node->num_submatches > 0) ? 2 : 0);
603: }
604: regset = (int*)tre_stack_pop(stack);
605: tag_left = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
606: tag_right = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
607:
608: /* Add tags after both children, the left child gets a smaller
609: tag than the right child. This guarantees that we prefer
610: the left child over the right child. */
611: /* XXX - This is not always necessary (if the children have
612: tags which must be seen for every match of that child). */
613: /* XXX - Check if this is the only place where tre_add_tag_right
614: is used. If so, use tre_add_tag_left (putting the tag before
615: the child as opposed after the child) and throw away
616: tre_add_tag_right. */
617: if (node->num_submatches > 0)
618: {
619: if (!first_pass)
620: {
621: status = tre_add_tag_right(mem, left, tag_left);
622: tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
623: status = tre_add_tag_right(mem, right, tag_right);
624: tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
625: }
626: DPRINT((" num_tags += 2\n"));
627: num_tags += 2;
628: }
629: direction = TRE_TAG_MAXIMIZE;
630: break;
631: }
632:
633: default:
634: assert(0);
635: break;
636:
637: } /* end switch(symbol) */
638: } /* end while(tre_stack_num_objects(stack) > bottom) */
639:
640: if (!first_pass)
641: {
642: int i;
643: /* Go through the regset and set submatch data for
644: submatches that are using this tag. */
645: for (i = 0; regset[i] >= 0; i++)
646: {
647: int id = regset[i] / 2;
648: int start = !(regset[i] % 2);
649: DPRINT((" Using tag %d for %s offset of "
650: "submatch %d\n", num_tags,
651: start ? "start" : "end", id));
652: if (start)
653: tnfa->submatch_data[id].so_tag = num_tags;
654: else
655: tnfa->submatch_data[id].eo_tag = num_tags;
656: }
657: }
658:
659: if (!first_pass && minimal_tag >= 0)
660: {
661: int i;
662: DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
663: for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
664: tnfa->minimal_tags[i] = tag;
665: tnfa->minimal_tags[i + 1] = minimal_tag;
666: tnfa->minimal_tags[i + 2] = -1;
667: minimal_tag = -1;
668: num_minimals++;
669: }
670:
671: DPRINT(("tre_add_tags: %s complete. Number of tags %d.\n",
672: first_pass? "First pass" : "Second pass", num_tags));
673:
674: assert(tree->num_tags == num_tags);
675: tnfa->end_tag = num_tags;
676: tnfa->num_tags = num_tags;
677: tnfa->num_minimals = num_minimals;
678: xfree(orig_regset);
679: xfree(parents);
680: xfree(saved_states);
681: return status;
682: }
683:
684:
685:
686: /*
687: AST to TNFA compilation routines.
688: */
689:
690: typedef enum { /* Work-item kinds that tre_copy_ast() pushes on its stack. */
691: COPY_RECURSE, /* Pop an AST node and copy it into the current `result' slot. */
692: COPY_SET_RESULT_PTR /* Pop a `tre_ast_node_t **' and make it the current `result' slot. */
693: } tre_copyast_symbol_t;
694:
695: /* Flags for tre_copy_ast(): COPY_REMOVE_TAGS and COPY_MAXIMIZE_FIRST_TAG (both tested in the LITERAL case below; their definitions are not shown in this section). */
696: /* Copies the tree `ast' into `*copy' without recursion, using `stack' as the work list. Each copied ordinary literal or back reference gets `*pos_add' added to its position; `*max_pos' is raised to the largest literal position written. */
697: /* On return `*pos_add' has grown by the number of such literals copied. Returns REG_OK, or REG_ESPACE when a node allocation fails (the STACK_PUSH* macros are defined elsewhere and may report errors of their own). */
698:
699: static reg_errcode_t
700: tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
701: int flags, int *pos_add, tre_tag_direction_t *tag_directions,
702: tre_ast_node_t **copy, int *max_pos)
703: {
704: reg_errcode_t status = REG_OK;
705: int bottom = tre_stack_num_objects(stack); /* Stack depth on entry; the loop stops when we are back at it. */
706: int num_copied = 0; /* Ordinary literals and back references copied so far. */
707: int first_tag = 1; /* Cleared once COPY_MAXIMIZE_FIRST_TAG has been applied. */
708: tre_ast_node_t **result = copy; /* Slot that receives the next copied node. */
709: tre_copyast_symbol_t symbol;
710:
711: STACK_PUSH(stack, ast);
712: STACK_PUSH(stack, COPY_RECURSE);
713:
714: while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
715: {
716: tre_ast_node_t *node;
717: if (status != REG_OK) /* Same test as the loop condition; kept as an extra guard. */
718: break;
719:
720: symbol = (tre_copyast_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
721: switch (symbol)
722: {
723: case COPY_SET_RESULT_PTR:
724: result = (tre_ast_node_t**)tre_stack_pop(stack);
725: break;
726: case COPY_RECURSE:
727: node = (tre_ast_node_t*)tre_stack_pop(stack);
728: switch (node->type)
729: {
730: case LITERAL:
731: {
732: tre_literal_t *lit = (tre_literal_t*)node->obj;
733: int pos = lit->position;
734: int min = lit->code_min;
735: int max = lit->code_max;
736: if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
737: {
738: /* XXX - e.g. [ab] has only one position but two
739: nodes, so we are creating holes in the state space
740: here. Not fatal, just wastes memory. */
741: pos += *pos_add;
742: num_copied++;
743: }
744: else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
745: {
746: /* Change this tag to empty. */
747: min = EMPTY;
748: max = pos = -1;
749: }
750: else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
751: && first_tag)
752: {
753: /* Maximize the first tag. */
754: tag_directions[max] = TRE_TAG_MAXIMIZE; /* For a tag literal `max' (code_max) is used as the tag index. */
755: first_tag = 0;
756: }
757: *result = tre_ast_new_literal(mem, min, max, pos);
758: if (*result == NULL)
759: status = REG_ESPACE;
760: /* NOTE(review): only min/max/pos are passed on; `lit->u' and `lit->neg_klasses' are not copied here -- confirm that is intended. */
761: if (pos > *max_pos)
762: *max_pos = pos;
763: break;
764: }
765: case UNION:
766: {
767: tre_union_t *uni = (tre_union_t*)node->obj;
768: tre_union_t *copy; /* Shadows the `copy' parameter inside this case. */
769: *result = tre_ast_new_union(mem, uni->left, uni->right);
770: if (*result == NULL)
771: {
772: status = REG_ESPACE;
773: break;
774: }
775: copy = (tre_union_t*)(*result)->obj;
776: result = &copy->left; /* The left child is copied next (its work item is pushed last). */
777: STACK_PUSHX(stack, uni->right);
778: STACK_PUSHX(stack, COPY_RECURSE);
779: STACK_PUSHX(stack, &copy->right); /* Redirect `result' before the right child is copied. */
780: STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
781: STACK_PUSHX(stack, uni->left);
782: STACK_PUSHX(stack, COPY_RECURSE);
783: break;
784: }
785: case CATENATION:
786: {
787: tre_catenation_t *cat = (tre_catenation_t*)node->obj;
788: tre_catenation_t *copy; /* Shadows the `copy' parameter inside this case. */
789: *result = tre_ast_new_catenation(mem, cat->left, cat->right);
790: if (*result == NULL)
791: {
792: status = REG_ESPACE;
793: break;
794: }
795: copy = (tre_catenation_t*)(*result)->obj;
796: copy->left = NULL;
797: copy->right = NULL;
798: result = &copy->left; /* Same traversal order as UNION: left first, then right. */
799:
800: STACK_PUSHX(stack, cat->right);
801: STACK_PUSHX(stack, COPY_RECURSE);
802: STACK_PUSHX(stack, &copy->right);
803: STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
804: STACK_PUSHX(stack, cat->left);
805: STACK_PUSHX(stack, COPY_RECURSE);
806: break;
807: }
808: case ITERATION:
809: {
810: tre_iteration_t *iter = (tre_iteration_t*)node->obj;
811: STACK_PUSHX(stack, iter->arg);
812: STACK_PUSHX(stack, COPY_RECURSE);
813: *result = tre_ast_new_iter(mem, iter->arg, iter->min,
814: iter->max, iter->minimal);
815: if (*result == NULL)
816: {
817: status = REG_ESPACE;
818: break;
819: }
820: iter = (tre_iteration_t*)(*result)->obj; /* From here on `iter' is the new node. */
821: result = &iter->arg; /* The copied argument replaces the shared one. */
822: break;
823: }
824: default:
825: assert(0);
826: break;
827: }
828: break;
829: }
830: }
831: *pos_add += num_copied;
832: return status;
833: }
834:
835: typedef enum { /* Work-item kinds that tre_expand_ast() pushes on its stack. */
836: EXPAND_RECURSE, /* Visit the popped node and schedule its children. */
837: EXPAND_AFTER_ITER /* The argument of an ITERATION node has been visited; expand the node now. */
838: } tre_expand_ast_symbol_t;
839:
840: /* Expands each iteration node that has a finite nonzero minimum or maximum
841: iteration count to a catenated sequence of copies of the node. Concretely, nodes with `min > 1 || max > 1' are rewritten in place. `*position' is advanced by the positions the copies add, `*max_depth' is raised to the deepest parameter nesting created, and REG_ESPACE is returned when an allocation fails. */
842: static reg_errcode_t
843: tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
844: int *position, tre_tag_direction_t *tag_directions,
845: int *max_depth)
846: {
847: reg_errcode_t status = REG_OK;
848: int bottom = tre_stack_num_objects(stack); /* Stack depth on entry; the loop stops when we are back at it. */
849: int pos_add = 0; /* Offset currently added to literal positions. */
850: int pos_add_total = 0; /* Sum of the positions added by all expansions. */
851: int max_pos = 0; /* Largest literal position seen or produced. */
852: /* Current approximate matching parameters. */
853: int params[TRE_PARAM_LAST];
854: /* Approximate parameter nesting level. */
855: int params_depth = 0;
856: int iter_depth = 0; /* Number of ITERATION nodes currently being processed. */
857: int i;
858:
859: for (i = 0; i < TRE_PARAM_LAST; i++)
860: params[i] = TRE_PARAM_DEFAULT;
861:
862: STACK_PUSHR(stack, ast);
863: STACK_PUSHR(stack, EXPAND_RECURSE);
864: while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
865: {
866: tre_ast_node_t *node;
867: tre_expand_ast_symbol_t symbol;
868:
869: if (status != REG_OK) /* Same test as the loop condition; kept as an extra guard. */
870: break;
871:
872: DPRINT(("pos_add %d\n", pos_add));
873:
874: symbol = (tre_expand_ast_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
875: node = (tre_ast_node_t*)tre_stack_pop(stack);
876: switch (symbol)
877: {
878: case EXPAND_RECURSE:
879: switch (node->type)
880: {
881: case LITERAL:
882: {
883: tre_literal_t *lit= (tre_literal_t*)node->obj;
884: if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
885: {
886: lit->position += pos_add; /* Shift past the positions inserted by earlier expansions. */
887: if (lit->position > max_pos)
888: max_pos = lit->position;
889: }
890: break;
891: }
892: case UNION:
893: {
894: tre_union_t *uni = (tre_union_t*)node->obj;
895: STACK_PUSHX(stack, uni->right);
896: STACK_PUSHX(stack, EXPAND_RECURSE);
897: STACK_PUSHX(stack, uni->left);
898: STACK_PUSHX(stack, EXPAND_RECURSE);
899: break;
900: }
901: case CATENATION:
902: {
903: tre_catenation_t *cat = (tre_catenation_t*)node->obj;
904: STACK_PUSHX(stack, cat->right);
905: STACK_PUSHX(stack, EXPAND_RECURSE);
906: STACK_PUSHX(stack, cat->left);
907: STACK_PUSHX(stack, EXPAND_RECURSE);
908: break;
909: }
910: case ITERATION:
911: {
912: tre_iteration_t *iter = (tre_iteration_t*)node->obj;
913: STACK_PUSHX(stack, pos_add); /* Saved here, popped again at EXPAND_AFTER_ITER. */
914: STACK_PUSHX(stack, node);
915: STACK_PUSHX(stack, EXPAND_AFTER_ITER);
916: STACK_PUSHX(stack, iter->arg);
917: STACK_PUSHX(stack, EXPAND_RECURSE);
918: /* If we are going to expand this node at EXPAND_AFTER_ITER
919: then don't increase the `pos' fields of the nodes now, it
920: will get done when expanding. */
921: if (iter->min > 1 || iter->max > 1)
922: pos_add = 0;
923: iter_depth++;
924: DPRINT(("iter\n"));
925: break;
926: }
927: default:
928: assert(0);
929: break;
930: }
931: break;
932: case EXPAND_AFTER_ITER:
933: {
934: tre_iteration_t *iter = (tre_iteration_t*)node->obj;
935: int pos_add_last;
936: pos_add = (int)(FLX_RAWADDRESS)tre_stack_pop(stack); /* Value saved when the ITERATION node was entered. */
937: pos_add_last = pos_add;
938: if (iter->min > 1 || iter->max > 1)
939: {
940: tre_ast_node_t *seq1 = NULL, *seq2 = NULL; /* seq1: mandatory copies; seq2: optional tail. */
941: int i; /* Shadows the function-level `i' inside this block. */
942: int pos_add_save = pos_add;
943:
944: /* Create a catenated sequence of copies of the node. */
945: for (i = 0; i < iter->min; i++)
946: {
947: tre_ast_node_t *copy;
948: /* Remove tags from all but the last copy. */
949: int flags = ((i + 1 < iter->min)
950: ? COPY_REMOVE_TAGS
951: : COPY_MAXIMIZE_FIRST_TAG);
952: DPRINT((" pos_add %d\n", pos_add));
953: pos_add_save = pos_add;
954: status = tre_copy_ast(mem, stack, iter->arg, flags,
955: &pos_add, tag_directions, &copy,
956: &max_pos);
957: if (status != REG_OK)
958: return status;
959: if (seq1 != NULL)
960: seq1 = tre_ast_new_catenation(mem, seq1, copy);
961: else
962: seq1 = copy;
963: if (seq1 == NULL)
964: return REG_ESPACE;
965: }
966:
967: if (iter->max == -1)
968: {
969: /* No upper limit. */
970: pos_add_save = pos_add;
971: status = tre_copy_ast(mem, stack, iter->arg, 0,
972: &pos_add, NULL, &seq2, &max_pos);
973: if (status != REG_OK)
974: return status;
975: seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0); /* One more copy, repeated zero or more times. */
976: if (seq2 == NULL)
977: return REG_ESPACE;
978: }
979: else
980: {
981: for (i = iter->min; i < iter->max; i++) /* Each round wraps the tail as (EMPTY | copy seq2). */
982: {
983: tre_ast_node_t *tmp, *copy;
984: pos_add_save = pos_add;
985: status = tre_copy_ast(mem, stack, iter->arg, 0,
986: &pos_add, NULL, &copy, &max_pos);
987: if (status != REG_OK)
988: return status;
989: if (seq2 != NULL)
990: seq2 = tre_ast_new_catenation(mem, copy, seq2);
991: else
992: seq2 = copy;
993: if (seq2 == NULL)
994: return REG_ESPACE;
995: tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
996: if (tmp == NULL)
997: return REG_ESPACE;
998: seq2 = tre_ast_new_union(mem, tmp, seq2);
999: if (seq2 == NULL)
1000: return REG_ESPACE;
1001: }
1002: }
1003:
1004: pos_add = pos_add_save; /* Back to the value recorded just before the last copy was made. */
1005: if (seq1 == NULL)
1006: seq1 = seq2;
1007: else if (seq2 != NULL)
1008: seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1009: if (seq1 == NULL)
1010: return REG_ESPACE;
1011: node->obj = seq1->obj; /* Rewrite the iteration node in place so existing */
1012: node->type = seq1->type; /* pointers to it now see the expanded sequence. */
1013: }
1014:
1015: iter_depth--;
1016: pos_add_total += pos_add - pos_add_last;
1017: if (iter_depth == 0)
1018: pos_add = pos_add_total;
1019:
1020: /* If approximate parameters are specified, surround the result
1021: with two parameter setting nodes. The one on the left sets
1022: the specified parameters, and the one on the right restores
1023: the old parameters. */
1024: if (iter->params)
1025: {
1026: tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
1027: unsigned int *old_params;
1028:
1029: tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1030: if (!tmp_l)
1031: return REG_ESPACE;
1032: ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
1033: iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
1034: tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
1035: if (!tmp_r)
1036: return REG_ESPACE;
1037: old_params = (unsigned int*)tre_mem_alloc(mem, sizeof(*old_params)
1038: * TRE_PARAM_LAST);
1039: if (!old_params)
1040: return REG_ESPACE;
1041: for (i = 0; i < TRE_PARAM_LAST; i++) /* Uses the function-level `i'. */
1042: old_params[i] = params[i];
1043: ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
1044: old_params[TRE_PARAM_DEPTH] = params_depth;
1045: /* XXX - this is the only place where ast_new_node is
1046: needed -- should be moved inside AST module. */
1047: node_copy = tre_ast_new_node(mem, ITERATION,
1048: sizeof(tre_iteration_t));
1049: if (!node_copy)
1050: return REG_ESPACE;
1051: node_copy->obj = node->obj;
1052: tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
1053: if (!tmp_node)
1054: return REG_ESPACE;
1055: tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
1056: if (!tmp_node)
1057: return REG_ESPACE;
1058: /* Replace the contents of `node' with `tmp_node'. */
1059: memcpy(node, tmp_node, sizeof(*node));
1060: node->obj = tmp_node->obj; /* These two assignments repeat what the */
1061: node->type = tmp_node->type; /* memcpy above has already copied. */
1062: params_depth++;
1063: if (params_depth > *max_depth)
1064: *max_depth = params_depth;
1065: }
1066: break;
1067: }
1068: default:
1069: assert(0);
1070: break;
1071: }
1072: }
1073:
1074: *position += pos_add_total;
1075:
1076: /* `max_pos' should never be larger than `*position' if the above
1077: code works, but just an extra safeguard let's make sure
1078: `*position' is set large enough so enough memory will be
1079: allocated for the transition table. */
1080: if (max_pos > *position)
1081: *position = max_pos;
1082:
1083: #ifdef TRE_DEBUG
1084: DPRINT(("Expanded AST:\n"));
1085: tre_ast_print(ast);
1086: DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
1087: #endif
1088:
1089: return status;
1090: }
1091: /* Returns a newly allocated position set with no members (just the terminating entry), or NULL if the allocation fails. */
1092: static tre_pos_and_tags_t *
1093: tre_set_empty(tre_mem_t mem)
1094: {
1095: /* One zero-filled element that serves only as the terminator. */
1096: tre_pos_and_tags_t *terminator =
1097: (tre_pos_and_tags_t*)tre_mem_calloc(mem, sizeof(tre_pos_and_tags_t));
1098:
1099: if (terminator != NULL)
1100: {
1101: terminator->position = -1;
1102: terminator->code_min = -1;
1103: terminator->code_max = -1;
1104: }
1105: return terminator;
1106: }
1107: /* Returns a newly allocated position set with exactly one member built from the arguments, followed by the terminating entry; NULL if the allocation fails. */
1108: static tre_pos_and_tags_t *
1109: tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
1110: tre_ctype_t klass, tre_ctype_t *neg_klasses, int backref)
1111: {
1112: tre_pos_and_tags_t *pair, *member, *terminator;
1113:
1114: pair = (tre_pos_and_tags_t*)tre_mem_calloc(mem, 2 * sizeof(tre_pos_and_tags_t));
1115: if (!pair)
1116: return NULL;
1117: member = pair; terminator = pair + 1;
1118: member->position = position;
1119: member->code_min = code_min;
1120: member->code_max = code_max;
1121: member->klass = klass;
1122: member->neg_klasses = neg_klasses;
1123: member->backref = backref;
1124: terminator->position = -1;
1125: terminator->code_min = -1;
1126: terminator->code_max = -1;
1127:
1128: return pair;
1129: }
1130: /* Returns a newly allocated set holding the entries of `set1' followed by those of `set2', terminated by position -1. `tags' and `assertions' are merged into the `set1' entries only; `params' (if not NULL) is applied to the entries of both sets. Returns NULL if an allocation fails. */
1131: static tre_pos_and_tags_t *
1132: tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
1133: int *tags, int assertions, int *params)
1134: {
1135: int s1, s2, i, j;
1136: tre_pos_and_tags_t *new_set;
1137: int *new_tags;
1138: int num_tags;
1139:
1140: for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++); /* Count the entries of `tags'. */
1141: for (s1 = 0; set1[s1].position >= 0; s1++); /* Count the members of `set1'. */
1142: for (s2 = 0; set2[s2].position >= 0; s2++); /* Count the members of `set2'. */
1143: new_set = (tre_pos_and_tags_t*)tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
1144: if (!new_set )
1145: return NULL;
1146:
1147: for (s1 = 0; set1[s1].position >= 0; s1++)
1148: {
1149: new_set[s1].position = set1[s1].position;
1150: new_set[s1].code_min = set1[s1].code_min;
1151: new_set[s1].code_max = set1[s1].code_max;
1152: new_set[s1].assertions = set1[s1].assertions | assertions;
1153: new_set[s1].klass = set1[s1].klass;
1154: new_set[s1].neg_klasses = set1[s1].neg_klasses;
1155: new_set[s1].backref = set1[s1].backref;
1156: if (set1[s1].tags == NULL && tags == NULL)
1157: new_set[s1].tags = NULL;
1158: else
1159: {
1160: for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++); /* i = number of tags the entry already has. */
1161: new_tags = (int*)tre_mem_alloc(mem, (sizeof(*new_tags)
1162: * (i + num_tags + 1)));
1163: if (new_tags == NULL)
1164: return NULL;
1165: for (j = 0; j < i; j++)
1166: new_tags[j] = set1[s1].tags[j];
1167: for (i = 0; i < num_tags; i++)
1168: new_tags[j + i] = tags[i];
1169: new_tags[j + i] = -1;
1170: new_set[s1].tags = new_tags;
1171: }
1172: if (set1[s1].params)
1173: new_set[s1].params = set1[s1].params;
1174: if (params)
1175: {
1176: if (!new_set[s1].params)
1177: new_set[s1].params = params;
1178: else
1179: {
1180: /* Explicit `params' values win; unset slots keep the entry's own value, so no slot of the new array is left uninitialized. */
1181: int *merged = (int*)tre_mem_alloc(mem, sizeof(*params) * TRE_PARAM_LAST);
1182: if (!merged)
1183: return NULL;
1184: for (i = 0; i < TRE_PARAM_LAST; i++)
1185: merged[i] = (params[i] != TRE_PARAM_UNSET) ? params[i] : set1[s1].params[i];
1186: new_set[s1].params = merged;
1187: }
1188: }
1189: }
1190:
1191: for (s2 = 0; set2[s2].position >= 0; s2++)
1192: {
1193: new_set[s1 + s2].position = set2[s2].position;
1194: new_set[s1 + s2].code_min = set2[s2].code_min;
1195: new_set[s1 + s2].code_max = set2[s2].code_max;
1196: /* XXX - why not | assertions here as well? */
1197: new_set[s1 + s2].assertions = set2[s2].assertions;
1198: new_set[s1 + s2].klass = set2[s2].klass;
1199: new_set[s1 + s2].neg_klasses = set2[s2].neg_klasses;
1200: new_set[s1 + s2].backref = set2[s2].backref;
1201: if (set2[s2].tags == NULL)
1202: new_set[s1 + s2].tags = NULL;
1203: else
1204: {
1205: for (i = 0; set2[s2].tags[i] >= 0; i++);
1206: new_tags = (int*)tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
1207: if (new_tags == NULL)
1208: return NULL;
1209: for (j = 0; j < i; j++)
1210: new_tags[j] = set2[s2].tags[j];
1211: new_tags[j] = -1;
1212: new_set[s1 + s2].tags = new_tags;
1213: }
1214: if (set2[s2].params)
1215: new_set[s1 + s2].params = set2[s2].params;
1216: if (params)
1217: {
1218: if (!new_set[s1 + s2].params)
1219: new_set[s1 + s2].params = params;
1220: else
1221: {
1222: /* Same merge as for `set1': explicit `params' values win, unset slots keep the entry's own value. */
1223: int *merged = (int*)tre_mem_alloc(mem, sizeof(*params) * TRE_PARAM_LAST);
1224: if (!merged)
1225: return NULL;
1226: for (i = 0; i < TRE_PARAM_LAST; i++)
1227: merged[i] = (params[i] != TRE_PARAM_UNSET) ? params[i] : set2[s2].params[i];
1228: new_set[s1 + s2].params = merged;
1229: }
1230: }
1231: }
1232: new_set[s1 + s2].position = -1;
1233: return new_set;
1234: }
1235:
1236: /* Finds the empty path through `node' which is the one that should be
1237: taken according to POSIX.2 rules, and adds the tags on that path to
1238: `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
1239: set to the number of tags seen on the path. Likewise `*assertions' (if not NULL) accumulates the assertion bits found, `params' (if not NULL) receives the values of a PARAMETER literal, and `*params_seen' (if not NULL) is set to 1 when one is met. A non-NULL `tags' must already be terminated with -1 and be large enough for every tag on the path plus the terminator. */
1240: static reg_errcode_t
1241: tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
1242: int *assertions, int *params, int *num_tags_seen,
1243: int *params_seen)
1244: {
1245: tre_literal_t *lit;
1246: tre_union_t *uni;
1247: tre_catenation_t *cat;
1248: tre_iteration_t *iter;
1249: int i;
1250: int bottom = tre_stack_num_objects(stack); /* Stack depth on entry; the loop stops when we are back at it. */
1251: reg_errcode_t status = REG_OK;
1252: if (num_tags_seen)
1253: *num_tags_seen = 0;
1254: if (params_seen)
1255: *params_seen = 0;
1256:
1257: status = tre_stack_push(stack, node);
1258:
1259: /* Walk through the tree recursively. */
1260: while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1261: {
1262: node = (tre_ast_node_t*)tre_stack_pop(stack);
1263:
1264: switch (node->type)
1265: {
1266: case LITERAL:
1267: lit = (tre_literal_t *)node->obj;
1268: switch (lit->code_min)
1269: {
1270: case TAG:
1271: if (lit->code_max >= 0)
1272: {
1273: if (tags != NULL)
1274: {
1275: /* Add the tag to `tags'. */
1276: for (i = 0; tags[i] >= 0; i++) /* Stops at the tag if already present, else at the terminator. */
1277: if (tags[i] == lit->code_max)
1278: break;
1279: if (tags[i] < 0)
1280: {
1281: tags[i] = lit->code_max;
1282: tags[i + 1] = -1;
1283: }
1284: }
1285: if (num_tags_seen)
1286: (*num_tags_seen)++; /* Counts every tag literal met, including repeats. */
1287: }
1288: break;
1289: case ASSERTION:
1290: assert(lit->code_max >= 1 /* NOTE(review): with `||' this condition can never fail; `&&' was */
1291: || lit->code_max <= ASSERT_LAST); /* probably intended -- confirm against the definition of ASSERT_LAST. */
1292: if (assertions != NULL)
1293: *assertions |= lit->code_max;
1294: break;
1295: case PARAMETER:
1296: if (params != NULL)
1297: for (i = 0; i < TRE_PARAM_LAST; i++)
1298: params[i] = lit->u.params[i];
1299: if (params_seen != NULL)
1300: *params_seen = 1;
1301: break;
1302: case EMPTY:
1303: break;
1304: default:
1305: assert(0);
1306: break;
1307: }
1308: break;
1309:
1310: case UNION:
1311: /* Subexpressions starting earlier take priority over ones
1312: starting later, so we prefer the left subexpression over the
1313: right subexpression. */
1314: uni = (tre_union_t *)node->obj;
1315: if (uni->left->nullable)
1316: STACK_PUSHX(stack, uni->left)
1317: else if (uni->right->nullable)
1318: STACK_PUSHX(stack, uni->right)
1319: else
1320: assert(0);
1321: break;
1322:
1323: case CATENATION:
1324: /* The path must go through both children. */
1325: cat = (tre_catenation_t *)node->obj;
1326: assert(cat->left->nullable);
1327: assert(cat->right->nullable);
1328: STACK_PUSHX(stack, cat->left);
1329: STACK_PUSHX(stack, cat->right);
1330: break;
1331:
1332: case ITERATION:
1333: /* A match with an empty string is preferred over no match at
1334: all, so we go through the argument if possible. */
1335: iter = (tre_iteration_t *)node->obj;
1336: if (iter->arg->nullable)
1337: STACK_PUSHX(stack, iter->arg);
1338: break;
1339:
1340: default:
1341: assert(0);
1342: break;
1343: }
1344: }
1345:
1346: return status;
1347: }
1348:
1349:
1350: typedef enum { /* Work-item kinds that tre_compute_nfl() pushes on its stack. */
1351: NFL_RECURSE, /* Visit the popped node; literals are finished here, other nodes schedule their children. */
1352: NFL_POST_UNION, /* Both children of a UNION node are done; combine them. */
1353: NFL_POST_CATENATION, /* Both children of a CATENATION node are done; combine them. */
1354: NFL_POST_ITERATION /* The argument of an ITERATION node is done; derive the node's fields. */
1355: } tre_nfl_stack_symbol_t;
1356:
1357:
1358: /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
1359: the nodes of the AST `tree'. The tree is walked without recursion, using `stack' as the work list; children are completed before their parent. Returns REG_OK, REG_ESPACE when an allocation fails, or the error reported by tre_match_empty(). */
1360: static reg_errcode_t
1361: tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
1362: {
1363: int bottom = tre_stack_num_objects(stack); /* Stack depth on entry; the loop stops when we are back at it. */
1364:
1365: STACK_PUSHR(stack, tree);
1366: STACK_PUSHR(stack, NFL_RECURSE);
1367:
1368: while (tre_stack_num_objects(stack) > bottom)
1369: {
1370: tre_nfl_stack_symbol_t symbol;
1371: tre_ast_node_t *node;
1372:
1373: symbol = (tre_nfl_stack_symbol_t) (FLX_RAWADDRESS)tre_stack_pop(stack);
1374: node = (tre_ast_node_t*)tre_stack_pop(stack);
1375: switch (symbol)
1376: {
1377: case NFL_RECURSE:
1378: switch (node->type)
1379: {
1380: case LITERAL:
1381: {
1382: tre_literal_t *lit = (tre_literal_t *)node->obj;
1383: if (IS_BACKREF(lit))
1384: {
1385: /* Back references: nullable = false, firstpos = {i},
1386: lastpos = {i}. */
1387: node->nullable = 0;
1388: node->firstpos = tre_set_one(mem, lit->position, 0,
1389: TRE_CHAR_MAX, 0, NULL, -1);
1390: if (!node->firstpos)
1391: return REG_ESPACE;
1392: node->lastpos = tre_set_one(mem, lit->position, 0,
1393: TRE_CHAR_MAX, 0, NULL,
1394: lit->code_max); /* Only lastpos carries the back reference number. */
1395: if (!node->lastpos)
1396: return REG_ESPACE;
1397: }
1398: else if (lit->code_min < 0)
1399: {
1400: /* Tags, empty strings, params, and zero width assertions:
1401: nullable = true, firstpos = {}, and lastpos = {}. */
1402: node->nullable = 1;
1403: node->firstpos = tre_set_empty(mem);
1404: if (!node->firstpos)
1405: return REG_ESPACE;
1406: node->lastpos = tre_set_empty(mem);
1407: if (!node->lastpos)
1408: return REG_ESPACE;
1409: }
1410: else
1411: {
1412: /* Literal at position i: nullable = false, firstpos = {i},
1413: lastpos = {i}. */
1414: node->nullable = 0;
1415: node->firstpos =
1416: tre_set_one(mem, lit->position, lit->code_min,
1417: lit->code_max, 0, NULL, -1);
1418: if (!node->firstpos)
1419: return REG_ESPACE;
1420: node->lastpos = tre_set_one(mem, lit->position,
1421: lit->code_min, lit->code_max,
1422: lit->u.klass, lit->neg_klasses, /* Only lastpos carries the class data. */
1423: -1);
1424: if (!node->lastpos)
1425: return REG_ESPACE;
1426: }
1427: break;
1428: }
1429:
1430: case UNION:
1431: /* Compute the attributes for the two subtrees, and after that
1432: for this node. */
1433: STACK_PUSHR(stack, node);
1434: STACK_PUSHR(stack, NFL_POST_UNION);
1435: STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right);
1436: STACK_PUSHR(stack, NFL_RECURSE);
1437: STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left);
1438: STACK_PUSHR(stack, NFL_RECURSE);
1439: break;
1440:
1441: case CATENATION:
1442: /* Compute the attributes for the two subtrees, and after that
1443: for this node. */
1444: STACK_PUSHR(stack, node);
1445: STACK_PUSHR(stack, NFL_POST_CATENATION);
1446: STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right);
1447: STACK_PUSHR(stack, NFL_RECURSE);
1448: STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left);
1449: STACK_PUSHR(stack, NFL_RECURSE);
1450: break;
1451:
1452: case ITERATION:
1453: /* Compute the attributes for the subtree, and after that for
1454: this node. */
1455: STACK_PUSHR(stack, node);
1456: STACK_PUSHR(stack, NFL_POST_ITERATION);
1457: STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg);
1458: STACK_PUSHR(stack, NFL_RECURSE);
1459: break;
1460: }
1461: break; /* end case: NFL_RECURSE */
1462:
1463: case NFL_POST_UNION:
1464: {
1465: tre_union_t *uni = (tre_union_t *)node->obj;
1466: node->nullable = uni->left->nullable || uni->right->nullable;
1467: node->firstpos = tre_set_union(mem, uni->left->firstpos,
1468: uni->right->firstpos, NULL, 0, NULL);
1469: if (!node->firstpos)
1470: return REG_ESPACE;
1471: node->lastpos = tre_set_union(mem, uni->left->lastpos,
1472: uni->right->lastpos, NULL, 0, NULL);
1473: if (!node->lastpos)
1474: return REG_ESPACE;
1475: break;
1476: }
1477:
1478: case NFL_POST_ITERATION:
1479: {
1480: tre_iteration_t *iter = (tre_iteration_t *)node->obj;
1481:
1482: if (iter->min == 0 || iter->arg->nullable)
1483: node->nullable = 1;
1484: else
1485: node->nullable = 0;
1486: node->firstpos = iter->arg->firstpos; /* Shared with the argument, not copied. */
1487: node->lastpos = iter->arg->lastpos;
1488: break;
1489: }
1490:
1491: case NFL_POST_CATENATION:
1492: {
1493: int num_tags, *tags, assertions, params_seen;
1494: int *params;
1495: reg_errcode_t status;
1496: tre_catenation_t *cat = (tre_catenation_t*)node->obj;
1497: node->nullable = cat->left->nullable && cat->right->nullable;
1498:
1499: /* Compute firstpos. */
1500: if (cat->left->nullable)
1501: {
1502: /* The left side matches the empty string. Make a first pass
1503: with tre_match_empty() to get the number of tags and
1504: parameters. */
1505: status = tre_match_empty(stack, cat->left,
1506: NULL, NULL, NULL, &num_tags,
1507: &params_seen);
1508: if (status != REG_OK)
1509: return status;
1510: /* Allocate arrays for the tags and parameters. */
1511: tags = (int*)xmalloc(sizeof(*tags) * (num_tags + 1));
1512: if (!tags)
1513: return REG_ESPACE;
1514: tags[0] = -1;
1515: assertions = 0;
1516: params = NULL;
1517: if (params_seen)
1518: {
1519: params = (int*)tre_mem_alloc(mem, sizeof(*params)
1520: * TRE_PARAM_LAST);
1521: if (!params)
1522: {
1523: xfree(tags);
1524: return REG_ESPACE;
1525: }
1526: }
1527: /* Second pass with tre_match_empty() to get the list of
1528: tags and parameters. */
1529: status = tre_match_empty(stack, cat->left, tags,
1530: &assertions, params, NULL, NULL);
1531: if (status != REG_OK)
1532: {
1533: xfree(tags);
1534: return status;
1535: }
1536: node->firstpos = /* The right side's firstpos entries receive the tags, assertions and params of the empty left path. */
1537: tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
1538: tags, assertions, params);
1539: xfree(tags);
1540: if (!node->firstpos)
1541: return REG_ESPACE;
1542: }
1543: else
1544: {
1545: node->firstpos = cat->left->firstpos;
1546: }
1547:
1548: /* Compute lastpos. */
1549: if (cat->right->nullable)
1550: {
1551: /* The right side matches the empty string. Make a first pass
1552: with tre_match_empty() to get the number of tags and
1553: parameters. */
1554: status = tre_match_empty(stack, cat->right,
1555: NULL, NULL, NULL, &num_tags,
1556: &params_seen);
1557: if (status != REG_OK)
1558: return status;
1559: /* Allocate arrays for the tags and parameters. */
1560: tags = (int*)xmalloc(sizeof(int) * (num_tags + 1));
1561: if (!tags)
1562: return REG_ESPACE;
1563: tags[0] = -1;
1564: assertions = 0;
1565: params = NULL;
1566: if (params_seen)
1567: {
1568: params = (int*)tre_mem_alloc(mem, sizeof(*params)
1569: * TRE_PARAM_LAST);
1570: if (!params)
1571: {
1572: xfree(tags);
1573: return REG_ESPACE;
1574: }
1575: }
1576: /* Second pass with tre_match_empty() to get the list of
1577: tags and parameters. */
1578: status = tre_match_empty(stack, cat->right, tags,
1579: &assertions, params, NULL, NULL);
1580: if (status != REG_OK)
1581: {
1582: xfree(tags);
1583: return status;
1584: }
1585: node->lastpos = /* The left side's lastpos entries receive the tags, assertions and params of the empty right path. */
1586: tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
1587: tags, assertions, params);
1588: xfree(tags);
1589: if (!node->lastpos)
1590: return REG_ESPACE;
1591: }
1592: else
1593: {
1594: node->lastpos = cat->right->lastpos;
1595: }
1596: break;
1597: }
1598:
1599: default:
1600: assert(0);
1601: break;
1602: }
1603: }
1604:
1605: return REG_OK;
1606: }
1607:
1608:
1609: /* Adds a transition from each position in `p1' to each position in `p2'. */
1610: static reg_errcode_t
1611: tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
1612: tre_tnfa_transition_t *transitions,
1613: int *counts, int *offs)
1614: {
1615: tre_pos_and_tags_t *orig_p2 = p2;
1616: tre_tnfa_transition_t *trans;
1617: int i, j, k, l, dup, prev_p2_pos;
1618:
1619: if (transitions != NULL)
1620: while (p1->position >= 0)
1621: {
1622: p2 = orig_p2;
1623: prev_p2_pos = -1;
1624: while (p2->position >= 0)
1625: {
1626: /* Optimization: if this position was already handled, skip it. */
1627: if (p2->position == prev_p2_pos)
1628: {
1629: p2++;
1630: continue;
1631: }
1632: prev_p2_pos = p2->position;
1633: /* Set `trans' to point to the next unused transition from
1634: position `p1->position'. */
1635: trans = transitions + offs[p1->position];
1636: while (trans->state != NULL)
1637: {
1638: #if 0
1639: /* If we find a previous transition from `p1->position' to
1640: `p2->position', it is overwritten. This can happen only
1641: if there are nested loops in the regexp, like in "((a)*)*".
1642: In POSIX.2 repetition using the outer loop is always
1643: preferred over using the inner loop. Therefore the
1644: transition for the inner loop is useless and can be thrown
1645: away. */
1646: /* XXX - The same position is used for all nodes in a bracket
1647: expression, so this optimization cannot be used (it will
1648: break bracket expressions) unless I figure out a way to
1649: detect it here. */
1650: if (trans->state_id == p2->position)
1651: {
1652: DPRINT(("*"));
1653: break;
1654: }
1655: #endif
1656: trans++;
1657: }
1658:
1659: if (trans->state == NULL)
1660: (trans + 1)->state = NULL;
1661: /* Use the character ranges, assertions, etc. from `p1' for
1662: the transition from `p1' to `p2'. */
1663: trans->code_min = p1->code_min;
1664: trans->code_max = p1->code_max;
1665: trans->state = transitions + offs[p2->position];
1666: trans->state_id = p2->position;
1667: trans->assertions = p1->assertions | p2->assertions
1668: | (p1->klass ? ASSERT_CHAR_CLASS : 0)
1669: | (p1->neg_klasses != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
1670: if (p1->backref >= 0)
1671: {
1672: assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
1673: assert(p2->backref < 0);
1674: trans->u.backref = p1->backref;
1675: trans->assertions |= ASSERT_BACKREF;
1676: }
1677: else
1678: trans->u.klass = p1->klass;
1679: if (p1->neg_klasses != NULL)
1680: {
1681: for (i = 0; p1->neg_klasses[i] != (tre_ctype_t)0; i++);
1682: trans->neg_klasses =
1683: (tre_ctype_t*)xmalloc(sizeof(*trans->neg_klasses) * (i + 1));
1684: if (trans->neg_klasses == NULL)
1685: return REG_ESPACE;
1686: for (i = 0; p1->neg_klasses[i] != (tre_ctype_t)0; i++)
1687: trans->neg_klasses[i] = p1->neg_klasses[i];
1688: trans->neg_klasses[i] = (tre_ctype_t)0;
1689: }
1690: else
1691: trans->neg_klasses = NULL;
1692:
1693: /* Find out how many tags this transition has. */
1694: i = 0;
1695: if (p1->tags != NULL)
1696: while(p1->tags[i] >= 0)
1697: i++;
1698: j = 0;
1699: if (p2->tags != NULL)
1700: while(p2->tags[j] >= 0)
1701: j++;
1702:
1703: /* If we are overwriting a transition, free the old tag array. */
1704: if (trans->tags != NULL)
1705: xfree(trans->tags);
1706: trans->tags = NULL;
1707:
1708: /* If there were any tags, allocate an array and fill it. */
1709: if (i + j > 0)
1710: {
1711: trans->tags = (int*)xmalloc(sizeof(*trans->tags) * (i + j + 1));
1712: if (!trans->tags)
1713: return REG_ESPACE;
1714: i = 0;
1715: if (p1->tags != NULL)
1716: while(p1->tags[i] >= 0)
1717: {
1718: trans->tags[i] = p1->tags[i];
1719: i++;
1720: }
1721: l = i;
1722: j = 0;
1723: if (p2->tags != NULL)
1724: while (p2->tags[j] >= 0)
1725: {
1726: /* Don't add duplicates. */
1727: dup = 0;
1728: for (k = 0; k < i; k++)
1729: if (trans->tags[k] == p2->tags[j])
1730: {
1731: dup = 1;
1732: break;
1733: }
1734: if (!dup)
1735: trans->tags[l++] = p2->tags[j];
1736: j++;
1737: }
1738: trans->tags[l] = -1;
1739: }
1740:
1741: /* Set the parameter array. If both `p2' and `p1' have same
1742: parameters, the values in `p2' override those in `p1'. */
1743: if (p1->params || p2->params)
1744: {
1745: if (!trans->params)
1746: trans->params = (int*)xmalloc(sizeof(*trans->params)
1747: * TRE_PARAM_LAST);
1748: if (!trans->params)
1749: return REG_ESPACE;
1750: for (i = 0; i < TRE_PARAM_LAST; i++)
1751: {
1752: trans->params[i] = TRE_PARAM_UNSET;
1753: if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
1754: trans->params[i] = p1->params[i];
1755: if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
1756: trans->params[i] = p2->params[i];
1757: }
1758: }
1759: else
1760: {
1761: if (trans->params)
1762: xfree(trans->params);
1763: trans->params = NULL;
1764: }
1765:
1766:
1767: #ifdef TRE_DEBUG
1768: {
1769: int *tags;
1770:
1771: DPRINT((" %2d -> %2d on %3d", p1->position, p2->position,
1772: p1->code_min));
1773: if (p1->code_max != p1->code_min)
1774: DPRINT(("-%3d", p1->code_max));
1775: tags = trans->tags;
1776: if (tags)
1777: {
1778: DPRINT((", tags ["));
1779: while (*tags >= 0)
1780: {
1781: DPRINT(("%d", *tags));
1782: tags++;
1783: if (*tags >= 0)
1784: DPRINT((","));
1785: }
1786: DPRINT(("]"));
1787: }
1788: if (trans->assertions)
1789: DPRINT((", assert %d", trans->assertions));
1790: if (trans->assertions & ASSERT_BACKREF)
1791: DPRINT((", backref %d", trans->u.backref));
1792: else if (trans->u.klass)
1793: DPRINT((", klass %ld", (long)trans->u.klass));
1794: if (trans->neg_klasses)
1795: DPRINT((", neg_klasses %p", trans->neg_klasses));
1796: if (trans->params)
1797: {
1798: DPRINT((", "));
1799: tre_print_params(trans->params);
1800: }
1801: DPRINT(("\n"));
1802: }
1803: #endif /* TRE_DEBUG */
1804: p2++;
1805: }
1806: p1++;
1807: }
1808: else
1809: /* Compute a maximum limit for the number of transitions leaving
1810: from each state. */
1811: while (p1->position >= 0)
1812: {
1813: p2 = orig_p2;
1814: while (p2->position >= 0)
1815: {
1816: counts[p1->position]++;
1817: p2++;
1818: }
1819: p1++;
1820: }
1821: return REG_OK;
1822: }
1823:
1824: /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
1825: labelled with one character range (there are no transitions on empty
1826: strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
1827: the regexp. */
1828: static reg_errcode_t
1829: tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
1830: int *counts, int *offs)
1831: {
1832: tre_union_t *uni;
1833: tre_catenation_t *cat;
1834: tre_iteration_t *iter;
1835: reg_errcode_t errcode = REG_OK;
1836:
1837: /* XXX - recurse using a stack!. */
1838: switch (node->type)
1839: {
1840: case LITERAL:
1841: break;
1842: case UNION:
1843: uni = (tre_union_t *)node->obj;
1844: errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
1845: if (errcode != REG_OK)
1846: return errcode;
1847: errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
1848: break;
1849:
1850: case CATENATION:
1851: cat = (tre_catenation_t *)node->obj;
1852: /* Add a transition from each position in cat->left->lastpos
1853: to each position in cat->right->firstpos. */
1854: errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
1855: transitions, counts, offs);
1856: if (errcode != REG_OK)
1857: return errcode;
1858: errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
1859: if (errcode != REG_OK)
1860: return errcode;
1861: errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
1862: break;
1863:
1864: case ITERATION:
1865: iter = (tre_iteration_t *)node->obj;
1866: assert(iter->max == -1 || iter->max == 1);
1867:
1868: if (iter->max == -1)
1869: {
1870: assert(iter->min == 0 || iter->min == 1);
1871: /* Add a transition from each last position in the iterated
1872: expression to each first position. */
1873: errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
1874: transitions, counts, offs);
1875: if (errcode != REG_OK)
1876: return errcode;
1877: }
1878: errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
1879: break;
1880: }
1881: return errcode;
1882: }
1883:
1884:
1885:
1886: do \
1887: { \
1888: errcode = err; \
1889: if (1) goto error_exit; \
1890: } \
1891: while (0)
1892:
1893:
1894: int
1895: tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
1896: {
1897: tre_stack_t *stack;
1898: tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
1899: tre_pos_and_tags_t *p;
1900: int *counts = NULL, *offs = NULL;
1901: int i, add = 0;
1902: tre_tnfa_transition_t *transitions, *initial;
1903: tre_tnfa_t *tnfa = NULL;
1904: tre_submatch_data_t *submatch_data;
1905: tre_tag_direction_t *tag_directions = NULL;
1906: reg_errcode_t errcode;
1907: tre_mem_t mem;
1908:
1909: /* Parse context. */
1910: tre_parse_ctx_t parse_ctx;
1911:
1912: /* Allocate a stack used throughout the compilation process for various
1913: purposes. */
1914: stack = tre_stack_new(512, 10240, 128);
1915: if (!stack)
1916: return REG_ESPACE;
1917: /* Allocate a fast memory allocator. */
1918: mem = tre_mem_new();
1919: if (!mem)
1920: {
1921: tre_stack_destroy(stack);
1922: return REG_ESPACE;
1923: }
1924:
1925: /* Parse the regexp. */
1926: memset(&parse_ctx, 0, sizeof(parse_ctx));
1927: parse_ctx.mem = mem;
1928: parse_ctx.stack = stack;
1929: parse_ctx.re = regex;
1930: parse_ctx.len = n;
1931: parse_ctx.cflags = cflags;
1932: parse_ctx.max_backref = -1;
1933: DPRINT(("tre_compile: parsing '%.*" STRF "'\n", n, regex));
1934: errcode = tre_parse(&parse_ctx);
1935: if (errcode != REG_OK)
1936: ERROR_EXIT(errcode);
1937: preg->re_nsub = parse_ctx.submatch_id - 1;
1938: tree = parse_ctx.result;
1939:
1940: /* Back references and approximate matching cannot currently be used
1941: in the same regexp. */
1942: if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
1943: ERROR_EXIT(REG_BADPAT);
1944:
1945: #ifdef TRE_DEBUG
1946: tre_ast_print(tree);
1947: #endif /* TRE_DEBUG */
1948:
1949: /* Referring to nonexistent subexpressions is illegal. */
1950: if (parse_ctx.max_backref > (int)preg->re_nsub)
1951: ERROR_EXIT(REG_ESUBREG);
1952:
1953: /* Allocate the TNFA struct. */
1954: tnfa = (tre_tnfa_t*)xcalloc(1, sizeof(tre_tnfa_t));
1955: if (tnfa == NULL)
1956: ERROR_EXIT(REG_ESPACE);
1957: tnfa->have_backrefs = parse_ctx.max_backref >= 0;
1958: tnfa->have_approx = parse_ctx.have_approx;
1959: tnfa->num_submatches = parse_ctx.submatch_id;
1960:
1961: /* Set up tags for submatch addressing. If REG_NOSUB is set and the
1962: regexp does not have back references, this can be skipped. */
1963: if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
1964: {
1965: DPRINT(("tre_compile: setting up tags\n"));
1966:
1967: /* Figure out how many tags we will need. */
1968: errcode = tre_add_tags(NULL, stack, tree, tnfa);
1969: if (errcode != REG_OK)
1970: ERROR_EXIT(errcode);
1971: #ifdef TRE_DEBUG
1972: tre_ast_print(tree);
1973: #endif /* TRE_DEBUG */
1974:
1975: if (tnfa->num_tags > 0)
1976: {
1977: tag_directions = (tre_tag_direction_t*)xmalloc(sizeof(*tag_directions)
1978: * (tnfa->num_tags + 1));
1979: if (tag_directions == NULL)
1980: ERROR_EXIT(REG_ESPACE);
1981: tnfa->tag_directions = tag_directions;
1982: memset(tag_directions, -1,
1983: sizeof(*tag_directions) * (tnfa->num_tags + 1));
1984: }
1985: tnfa->minimal_tags = (int*)xcalloc(tnfa->num_tags * 2 + 1,
1986: sizeof(tnfa->minimal_tags));
1987: if (tnfa->minimal_tags == NULL)
1988: ERROR_EXIT(REG_ESPACE);
1989:
1990: submatch_data = (tre_submatch_data_t*)xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
1991: if (submatch_data == NULL)
1992: ERROR_EXIT(REG_ESPACE);
1993: tnfa->submatch_data = submatch_data;
1994:
1995: errcode = tre_add_tags(mem, stack, tree, tnfa);
1996: if (errcode != REG_OK)
1997: ERROR_EXIT(errcode);
1998:
1999: #ifdef TRE_DEBUG
2000: for (i = 0; i < parse_ctx.submatch_id; i++)
2001: DPRINT(("pmatch[%d] = {t%d, t%d}\n",
2002: i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
2003: for (i = 0; i < tnfa->num_tags; i++)
2004: DPRINT(("t%d is %s\n", i,
2005: tag_directions[i] == TRE_TAG_MINIMIZE ?
2006: "minimized" : "maximized"));
2007: #endif /* TRE_DEBUG */
2008: }
2009:
2010: /* Expand iteration nodes. */
2011: errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2012: tag_directions, &tnfa->params_depth);
2013: if (errcode != REG_OK)
2014: ERROR_EXIT(errcode);
2015:
2016: /* Add a dummy node for the final state.
2017: XXX - For certain patterns this dummy node can be optimized away,
2018: for example "a*" or "ab*". Figure out a simple way to detect
2019: this possibility. */
2020: tmp_ast_l = tree;
2021: tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2022: if (tmp_ast_r == NULL)
2023: ERROR_EXIT(REG_ESPACE);
2024:
2025: tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2026: if (tree == NULL)
2027: ERROR_EXIT(REG_ESPACE);
2028:
2029: #ifdef TRE_DEBUG
2030: tre_ast_print(tree);
2031: DPRINT(("Number of states: %d\n", parse_ctx.position));
2032: #endif /* TRE_DEBUG */
2033:
2034: errcode = tre_compute_nfl(mem, stack, tree);
2035: if (errcode != REG_OK)
2036: ERROR_EXIT(errcode);
2037:
2038: counts = (int*)xmalloc(sizeof(int) * parse_ctx.position);
2039: if (counts == NULL)
2040: ERROR_EXIT(REG_ESPACE);
2041:
2042: offs = (int*)xmalloc(sizeof(int) * parse_ctx.position);
2043: if (offs == NULL)
2044: ERROR_EXIT(REG_ESPACE);
2045:
2046: for (i = 0; i < parse_ctx.position; i++)
2047: counts[i] = 0;
2048: tre_ast_to_tnfa(tree, NULL, counts, NULL);
2049:
2050: add = 0;
2051: for (i = 0; i < parse_ctx.position; i++)
2052: {
2053: offs[i] = add;
2054: add += counts[i] + 1;
2055: counts[i] = 0;
2056: }
2057: transitions = (tre_tnfa_transition_t*)xcalloc(add + 1, sizeof(*transitions));
2058: if (transitions == NULL)
2059: ERROR_EXIT(REG_ESPACE);
2060: tnfa->transitions = transitions;
2061: tnfa->num_transitions = add;
2062:
2063: DPRINT(("Converting to TNFA:\n"));
2064: errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2065: if (errcode != REG_OK)
2066: ERROR_EXIT(errcode);
2067:
2068: /* If in eight bit mode, compute a table of characters that can be the
2069: first character of a match. */
2070: tnfa->first_char = -1;
2071: if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
2072: {
2073: int count = 0;
2074: int k;
2075: DPRINT(("Characters that can start a match:"));
2076: tnfa->firstpos_chars = (char*)xcalloc(256, sizeof(char));
2077: if (tnfa->firstpos_chars == NULL)
2078: ERROR_EXIT(REG_ESPACE);
2079: for (p = tree->firstpos; p->position >= 0; p++)
2080: {
2081: tre_tnfa_transition_t *j = transitions + offs[p->position];
2082: while (j->state != NULL)
2083: {
2084: for (k = j->code_min; k <= j->code_max && k < 256; k++)
2085: {
2086: DPRINT((" %d", k));
2087: tnfa->firstpos_chars[k] = 1;
2088: count++;
2089: }
2090: j++;
2091: }
2092: }
2093: DPRINT(("\n"));
2094: #define TRE_OPTIMIZE_FIRST_CHAR 1
2095: #if TRE_OPTIMIZE_FIRST_CHAR
2096: if (count == 1)
2097: {
2098: for (k = 0; k < 256; k++)
2099: if (tnfa->firstpos_chars[k])
2100: {
2101: DPRINT(("first char must be %d\n", k));
2102: tnfa->first_char = k;
2103: xfree(tnfa->firstpos_chars);
2104: tnfa->firstpos_chars = NULL;
2105: break;
2106: }
2107: }
2108: #endif
2109:
2110: }
2111: else
2112: tnfa->firstpos_chars = NULL;
2113:
2114:
2115: p = tree->firstpos;
2116: i = 0;
2117: while (p->position >= 0)
2118: {
2119: i++;
2120:
2121: #ifdef TRE_DEBUG
2122: {
2123: int *tags;
2124: DPRINT(("initial: %d", p->position));
2125: tags = p->tags;
2126: if (tags != NULL)
2127: {
2128: if (*tags >= 0)
2129: DPRINT(("/"));
2130: while (*tags >= 0)
2131: {
2132: DPRINT(("%d", *tags));
2133: tags++;
2134: if (*tags >= 0)
2135: DPRINT((","));
2136: }
2137: }
2138: DPRINT((", assert %d", p->assertions));
2139: if (p->params)
2140: {
2141: DPRINT((", "));
2142: tre_print_params(p->params);
2143: }
2144: DPRINT(("\n"));
2145: }
2146: #endif /* TRE_DEBUG */
2147:
2148: p++;
2149: }
2150:
2151: initial = (tre_tnfa_transition_t*)xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
2152: if (initial == NULL)
2153: ERROR_EXIT(REG_ESPACE);
2154: tnfa->initial = initial;
2155:
2156: i = 0;
2157: for (p = tree->firstpos; p->position >= 0; p++)
2158: {
2159: initial[i].state = transitions + offs[p->position];
2160: initial[i].state_id = p->position;
2161: initial[i].tags = NULL;
2162: /* Copy the arrays p->tags, and p->params, they are allocated
2163: from a tre_mem object. */
2164: if (p->tags)
2165: {
2166: int j;
2167: for (j = 0; p->tags[j] >= 0; j++);
2168: initial[i].tags = (int*)xmalloc(sizeof(*p->tags) * (j + 1));
2169: if (!initial[i].tags)
2170: ERROR_EXIT(REG_ESPACE);
2171: memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2172: }
2173: initial[i].params = NULL;
2174: if (p->params)
2175: {
2176: initial[i].params = (int*)xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
2177: if (!initial[i].params)
2178: ERROR_EXIT(REG_ESPACE);
2179: memcpy(initial[i].params, p->params,
2180: sizeof(*p->params) * TRE_PARAM_LAST);
2181: }
2182: initial[i].assertions = p->assertions;
2183: i++;
2184: }
2185: initial[i].state = NULL;
2186:
2187: tnfa->num_transitions = add;
2188: tnfa->final = transitions + offs[tree->lastpos[0].position];
2189: tnfa->num_states = parse_ctx.position;
2190: tnfa->cflags = cflags;
2191:
2192: DPRINT(("final state %p\n", (void *)tnfa->final));
2193:
2194: tre_mem_destroy(mem);
2195: tre_stack_destroy(stack);
2196: xfree(counts);
2197: xfree(offs);
2198:
2199: preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2200: return REG_OK;
2201:
2202: error_exit:
2203: /* Free everything that was allocated and return the error code. */
2204: tre_mem_destroy(mem);
2205: if (stack != NULL)
2206: tre_stack_destroy(stack);
2207: if (counts != NULL)
2208: xfree(counts);
2209: if (offs != NULL)
2210: xfree(offs);
2211: preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2212: tre_free(preg);
2213: return errcode;
2214: }
2215:
2216:
2217:
2218:
2219: void
2220: tre_free(regex_t *preg)
2221: {
2222: tre_tnfa_t *tnfa;
2223: unsigned int i;
2224: tre_tnfa_transition_t *trans;
2225:
2226: tnfa = (tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
2227: if (!tnfa)
2228: return;
2229:
2230: for (i = 0; i < tnfa->num_transitions; i++)
2231: if (tnfa->transitions[i].state)
2232: {
2233: if (tnfa->transitions[i].tags)
2234: xfree(tnfa->transitions[i].tags);
2235: if (tnfa->transitions[i].neg_klasses)
2236: xfree(tnfa->transitions[i].neg_klasses);
2237: if (tnfa->transitions[i].params)
2238: xfree(tnfa->transitions[i].params);
2239: }
2240: if (tnfa->transitions)
2241: xfree(tnfa->transitions);
2242:
2243: if (tnfa->initial)
2244: {
2245: for (trans = tnfa->initial; trans->state; trans++)
2246: {
2247: if (trans->tags)
2248: xfree(trans->tags);
2249: if (trans->params)
2250: xfree(trans->params);
2251: }
2252: xfree(tnfa->initial);
2253: }
2254:
2255: if (tnfa->submatch_data)
2256: {
2257: for (i = 0; i < tnfa->num_submatches; i++)
2258: if (tnfa->submatch_data[i].parents)
2259: xfree(tnfa->submatch_data[i].parents);
2260: xfree(tnfa->submatch_data);
2261: }
2262:
2263: if (tnfa->tag_directions)
2264: xfree(tnfa->tag_directions);
2265: if (tnfa->firstpos_chars)
2266: xfree(tnfa->firstpos_chars);
2267: if (tnfa->minimal_tags)
2268: xfree(tnfa->minimal_tags);
2269: xfree(tnfa);
2270: }
2271:
2272: char *
2273: tre_version(void)
2274: {
2275: static char str[256];
2276: char *version;
2277:
2278: if (str[0] == 0)
2279: {
2280: tre_config(TRE_CONFIG_VERSION, &version);
2281: sprintf(str, "TRE %s (GPL)", version);
2282: }
2283: return str;
2284: }
2285:
2286: int
2287: tre_config(int query, void *result)
2288: {
2289: int *int_result = (int*)result;
2290: char **string_result = (char**)result;
2291:
2292: switch (query)
2293: {
2294: case TRE_CONFIG_APPROX:
2295: #ifdef TRE_APPROX
2296: *int_result = 1;
2297: #else /* !TRE_APPROX */
2298: *int_result = 0;
2299: #endif /* !TRE_APPROX */
2300: return REG_OK;
2301:
2302: case TRE_CONFIG_WCHAR:
2303: #ifdef TRE_WCHAR
2304: *int_result = 1;
2305: #else /* !TRE_WCHAR */
2306: *int_result = 0;
2307: #endif /* !TRE_WCHAR */
2308: return REG_OK;
2309:
2310: case TRE_CONFIG_MULTIBYTE:
2311: #ifdef TRE_MULTIBYTE
2312: *int_result = 1;
2313: #else /* !TRE_MULTIBYTE */
2314: *int_result = 0;
2315: #endif /* !TRE_MULTIBYTE */
2316: return REG_OK;
2317:
2318: case TRE_CONFIG_SYSTEM_ABI:
2319: #ifdef TRE_CONFIG_SYSTEM_ABI
2320: *int_result = 1;
2321: #else /* !TRE_CONFIG_SYSTEM_ABI */
2322: *int_result = 0;
2323: #endif /* !TRE_CONFIG_SYSTEM_ABI */
2324: return REG_OK;
2325:
2326: case TRE_CONFIG_VERSION:
2327: *string_result = TRE_VERSION;
2328: return REG_OK;
2329: }
2330:
2331: return REG_NOMATCH;
2332: }
2333:
2334:
2335: /* EOF */
Start cpp section to tre/tre_compile.hpp[1
/1
]
1: #line 4075 "./lpsrc/tre.pak"
2: /*
3: tre-compile.h: Regex compilation definitions
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19: */
20:
21:
22:
23:
24:
25: typedef struct {
26: int position;
27: int code_min;
28: int code_max;
29: int *tags;
30: int assertions;
31: tre_ctype_t klass;
32: tre_ctype_t *neg_klasses;
33: int backref;
34: int *params;
35: } tre_pos_and_tags_t;
36:
37:
38:
39: /* EOF */
Start cpp section to tre/tre_config.hpp[1
/1
]
1: #line 4115 "./lpsrc/tre.pak"
2:
3: /* RF: need this for TRE_EXTERN */
4:
5:
6: /* lib/tre-config.h. Generated by configure. */
7: /* tre-config.h.in. This file has all definitions that are needed in
8: `regex.h'. Note that this file must contain only the bare minimum
9: of definitions without the TRE_ prefix to avoid conflicts between
10: definitions here and definitions included from somewhere else. */
11:
12: /* Define to 1 if you have the <libutf8.h> header file. */
13: /* #undef HAVE_LIBUTF8_H */
14:
15: /* Define to 1 if the system has the type `reg_errcode_t'. */
16: /* #undef HAVE_REG_ERRCODE_T */
17:
18: /* Define to 1 if you have the <sys/types.h> header file. */
19:
20:
21: /* Define to 1 if you have the <wchar.h> header file. */
22:
23:
24: /* Define if you want to enable approximate matching functionality. */
25:
26:
27: /* Define to enable multibyte character set support. */
28:
29:
30: /* Define to the absolute path to the system regex.h */
31: /* #undef TRE_SYSTEM_REGEX_H_PATH */
32:
33: /* Define to include the system regex.h from TRE regex.h */
34: /* #undef TRE_USE_SYSTEM_REGEX_H */
35:
36: /* Define to enable wide character (wchar_t) support. */
37:
38:
39: /* TRE version string. */
40:
41:
42: /* TRE version level 1. */
43:
44:
45: /* TRE version level 2. */
46:
47:
48: /* TRE version level 3. */
49:
Start cpp section to tre/tre_filter.cpp[1
/1
]
1: #line 4165 "./lpsrc/tre.pak"
2: /*
3: tre-filter.c: Histogram filter to quickly find regexp match candidates
4:
5: Copyright (C) 2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22: /* The idea of this filter is quite simple. First, let's assume the
23: search pattern is a simple string. In order for a substring of a
24: longer string to match the search pattern, it must have the same
25: numbers of different characters as the pattern, and those
26: characters must occur in the same order as they occur in pattern. */
27:
28:
29:
30:
31:
32:
33: int
34: tre_filter_find(const unsigned char *str, size_t len, tre_filter_t *filter)
35: {
36: unsigned short counts[256];
37: unsigned int i;
38: unsigned int window_len = filter->window_len;
39: tre_filter_profile_t *profile = filter->profile;
40: const unsigned char *str_orig = str;
41:
42: DPRINT(("tre_filter_find: %.*s\n", len, str));
43:
44: for (i = 0; i < elementsof(counts); i++)
45: counts[i] = 0;
46:
47: i = 0;
48: while (*str && i < window_len && i < len)
49: {
50: counts[*str]++;
51: i++;
52: str++;
53: len--;
54: }
55:
56: while (len > 0)
57: {
58: tre_filter_profile_t *p;
59: counts[*str]++;
60: counts[*(str - window_len)]--;
61:
62: p = profile;
63: while (p->ch)
64: {
65: if (counts[p->ch] < p->count)
66: break;
67: p++;
68: }
69: if (!p->ch)
70: {
71: DPRINT(("Found possible match at %d\n",
72: str - str_orig));
73: return str - str_orig;
74: }
75: else
76: {
77: DPRINT(("No match so far...\n"));
78: }
79: len--;
80: str++;
81: }
82: DPRINT(("This string cannot match.\n"));
83: return -1;
84: }
Start cpp section to tre/tre_filter.hpp[1
/1
]
1: #line 4250 "./lpsrc/tre.pak"
2:
3:
4:
5:
/* One requirement of a filter profile.  A profile is an array of these,
   terminated by an entry whose `ch' is zero. */
typedef struct {
  /* The byte value this requirement is about. */
  unsigned char ch;
  /* Minimum number of times `ch' must occur inside the window. */
  unsigned char count;
} tre_filter_profile_t;
10:
11: typedef struct {
12: /* Length of the window where the character counts are kept. */
13: int window_len;
14: /* Required character counts table. */
15: tre_filter_profile_t *profile;
16: } tre_filter_t;
17:
18:
19: int
20: tre_filter_find(const unsigned char *str, size_t len, tre_filter_t *filter);
Start cpp section to tre/tre_internal.hpp[1
/1
]
1: #line 4271 "./lpsrc/tre.pak"
2: /*
3: tre-internal.h - TRE internal definitions
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59: /* Define the character types and functions. */
60:
61:
62: /* Wide characters. */
63: typedef wint_t tre_cint_t;
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86:
87:
88:
89:
90:
91:
92:
93: /* 8 bit characters. */
94: typedef short tre_cint_t;
95:
96:
97:
98:
99:
100:
101:
102:
103:
104:
105:
106:
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127: /* Use system provided iswctype() and wctype(). */
128: typedef wctype_t tre_ctype_t;
129:
130:
131:
132: /* Define our own versions of iswctype() and wctype(). */
133: typedef int (*tre_ctype_t)(tre_cint_t);
134:
135: tre_ctype_t tre_ctype(const char *name);
136:
137:
138: typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
139:
140: /* Returns number of bytes to add to (char *)ptr to make it
141: properly aligned for the type. */
142:
143: ((((long)ptr) % sizeof(type)) \
144: ? (sizeof(type) - (((long)ptr) % sizeof(type))) \
145: : 0)
146:
147:
148:
149:
150:
151:
152: /* Define STRF to the correct printf formatter for strings. */
153:
154:
155:
156:
157:
158:
159: /* TNFA transition type. A TNFA state is an array of transitions,
160: the terminator is a transition with NULL `state'. */
161: typedef struct tnfa_transition tre_tnfa_transition_t;
162:
163: struct tnfa_transition {
164: /* Range of accepted characters. */
165: tre_cint_t code_min;
166: tre_cint_t code_max;
167: /* Pointer to the destination state. */
168: tre_tnfa_transition_t *state;
169: /* ID number of the destination state. */
170: int state_id;
171: /* -1 terminated array of tags (or NULL). */
172: int *tags;
173: /* Matching parameters settings (or NULL). */
174: int *params;
175: /* Assertion bitmap. */
176: int assertions;
177: /* Assertion parameters. */
178: union {
179: /* Character klass assertion. */
180: tre_ctype_t klass;
181: /* Back reference assertion. */
182: int backref;
183: } u;
184: /* Negative character klass assertions. */
185: tre_ctype_t *neg_klasses;
186: };
187:
188:
189: /* Assertions. */
190:
191:
192:
193:
194:
195:
196:
197:
198:
199:
200:
/* Direction in which a tag's value is optimised. */
typedef enum {
  TRE_TAG_MINIMIZE = 0,   /* Tag is minimized. */
  TRE_TAG_MAXIMIZE        /* == 1; tag is maximized. */
} tre_tag_direction_t;
206:
/* Indices into a matching-parameter array; these parameters can be changed
   dynamically while matching.  TRE_PARAM_LAST is not a parameter itself
   but the number of slots in such an array. */
typedef enum {
  TRE_PARAM_COST_INS = 0,
  TRE_PARAM_COST_DEL,     /* 1 */
  TRE_PARAM_COST_SUBST,   /* 2 */
  TRE_PARAM_COST_MAX,     /* 3 */
  TRE_PARAM_MAX_INS,      /* 4 */
  TRE_PARAM_MAX_DEL,      /* 5 */
  TRE_PARAM_MAX_SUBST,    /* 6 */
  TRE_PARAM_MAX_ERR,      /* 7 */
  TRE_PARAM_DEPTH,        /* 8 */
  TRE_PARAM_LAST          /* 9 */
} tre_param_t;
220:
221: /* Unset matching parameter */
222:
223:
224: /* Signifies the default matching parameter value. */
225:
226:
/* Describes how to derive one submatch register from the tag values once
   a match has succeeded. */
typedef struct tre_submatch_data {
  /* Tag whose value becomes rm_so, the start offset of the submatch. */
  int so_tag;
  /* Tag whose value becomes rm_eo, the end offset of the submatch. */
  int eo_tag;
  /* The submatches that enclose this one. */
  int *parents;
} tre_submatch_data_t;
239:
240:
241: /* TNFA definition. */
242: typedef struct tnfa tre_tnfa_t;
243:
244: struct tnfa {
245: tre_tnfa_transition_t *transitions;
246: unsigned int num_transitions;
247: tre_tnfa_transition_t *initial;
248: tre_tnfa_transition_t *final;
249: tre_submatch_data_t *submatch_data;
250: char *firstpos_chars;
251: int first_char;
252: unsigned int num_submatches;
253: tre_tag_direction_t *tag_directions;
254: int *minimal_tags;
255: int num_tags;
256: int num_minimals;
257: int end_tag;
258: int num_states;
259: int cflags;
260: int have_backrefs;
261: int have_approx;
262: int params_depth;
263: };
264:
265: int
266: tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
267:
268: void
269: tre_free(regex_t *preg);
270:
271: void
272: tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
273: const tre_tnfa_t *tnfa, int *tags, int match_eo);
274:
275: reg_errcode_t
276: tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
277: tre_str_type_t type, int *match_tags, int eflags,
278: int *match_end_ofs);
279:
280: reg_errcode_t
281: tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
282: tre_str_type_t type, int *match_tags, int eflags,
283: int *match_end_ofs);
284:
285: reg_errcode_t
286: tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
287: int len, tre_str_type_t type, int *match_tags,
288: int eflags, int *match_end_ofs);
289:
290:
291: reg_errcode_t
292: tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
293: tre_str_type_t type, int *match_tags,
294: regamatch_t *match, regaparams_t params,
295: int eflags, int *match_end_ofs);
296:
297:
298:
299:
300: /* EOF */
Start cpp section to tre/tre_match-approx.cpp[1
/1
]
1: #line 4572 "./lpsrc/tre.pak"
2: /*
3: tre-match-approx.c - TRE approximate regex matching engine
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24: /* AIX requires this to be the first thing in the file. */
25:
26:
27:
28:
29:
30:
31:
32:
33:
34: char *alloca ();
35:
36:
37:
38:
39:
40:
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75: typedef struct {
76: /* State in the TNFA transition table. */
77: tre_tnfa_transition_t *state;
78: /* Position in input string. */
79: int pos;
80: /* Tag values. */
81: int *tags;
82: /* Matching parameters. */
83: regaparams_t params;
84: /* Nesting depth of parameters. This is used as an index in
85: the `costs' array. */
86: int depth;
87: /* Costs and counter values for different parameter nesting depths. */
88: int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
89: } tre_tnfa_approx_reach_t;
90:
91:
92:
93: /* Prints the `reach' array in a readable fashion with DPRINT. */
94: static void
95: tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
96: int pos, int num_tags)
97: {
98: int id;
99:
100: /* Print each state on one line. */
101: DPRINT((" reach:\n"));
102: for (id = 0; id < tnfa->num_states; id++)
103: {
104: int i, j;
105: if (reach[id].pos < pos)
106: continue; /* Not reached. */
107: DPRINT((" %03d, costs ", id));
108: for (i = 0; i <= reach[id].depth; i++)
109: {
110: DPRINT(("["));
111: for (j = 0; j < TRE_M_LAST; j++)
112: {
113: DPRINT(("%2d", reach[id].costs[i][j]));
114: if (j + 1 < TRE_M_LAST)
115: DPRINT((","));
116: }
117: DPRINT(("]"));
118: if (i + 1 <= reach[id].depth)
119: DPRINT((", "));
120: }
121: DPRINT(("\n tags "));
122: for (i = 0; i < num_tags; i++)
123: {
124: DPRINT(("%02d", reach[id].tags[i]));
125: if (i + 1 < num_tags)
126: DPRINT((","));
127: }
128: DPRINT(("\n"));
129: }
130: DPRINT(("\n"));
131: }
132:
133:
134:
135: /* Sets the matching parameters in `reach' to the ones defined in the `pa'
136: array. Entries equal to TRE_PARAM_DEFAULT are taken from `default_params';
137: entries equal to TRE_PARAM_UNSET leave the value already in `reach' untouched. */
138: inline static void
139: tre_set_params(tre_tnfa_approx_reach_t *reach,
140: int *pa, regaparams_t default_params)
141: {
142: int value;
143:
144: /* If depth is increased reset costs and counters to zero for the
145: new levels. The depth itself is always taken from `pa'. */
146: value = pa[TRE_PARAM_DEPTH];
147: assert(value <= TRE_M_MAX_DEPTH);
148: if (value > reach->depth)
149: {
150: int i, j;
151: for (i = reach->depth + 1; i <= value; i++)
152: for (j = 0; j < TRE_M_LAST; j++)
153: reach->costs[i][j] = 0;
154: }
155: reach->depth = value;
156:
157: /* Set insert cost. */
158: value = pa[TRE_PARAM_COST_INS];
159: if (value == TRE_PARAM_DEFAULT)
160: reach->params.cost_ins = default_params.cost_ins;
161: else if (value != TRE_PARAM_UNSET)
162: reach->params.cost_ins = value;
163:
164: /* Set delete cost. */
165: value = pa[TRE_PARAM_COST_DEL];
166: if (value == TRE_PARAM_DEFAULT)
167: reach->params.cost_del = default_params.cost_del;
168: else if (value != TRE_PARAM_UNSET)
169: reach->params.cost_del = value;
170:
171: /* Set substitute cost; an unset entry keeps the current value, exactly as for every other parameter here. */
172: value = pa[TRE_PARAM_COST_SUBST];
173: if (value == TRE_PARAM_DEFAULT)
174: reach->params.cost_subst = default_params.cost_subst;
175: else if (value != TRE_PARAM_UNSET)
176: reach->params.cost_subst = value;
177:
178: /* Set maximum cost. */
179: value = pa[TRE_PARAM_COST_MAX];
180: if (value == TRE_PARAM_DEFAULT)
181: reach->params.max_cost = default_params.max_cost;
182: else if (value != TRE_PARAM_UNSET)
183: reach->params.max_cost = value;
184:
185: /* Set maximum inserts. */
186: value = pa[TRE_PARAM_MAX_INS];
187: if (value == TRE_PARAM_DEFAULT)
188: reach->params.max_ins = default_params.max_ins;
189: else if (value != TRE_PARAM_UNSET)
190: reach->params.max_ins = value;
191:
192: /* Set maximum deletes. */
193: value = pa[TRE_PARAM_MAX_DEL];
194: if (value == TRE_PARAM_DEFAULT)
195: reach->params.max_del = default_params.max_del;
196: else if (value != TRE_PARAM_UNSET)
197: reach->params.max_del = value;
198:
199: /* Set maximum substitutes. */
200: value = pa[TRE_PARAM_MAX_SUBST];
201: if (value == TRE_PARAM_DEFAULT)
202: reach->params.max_subst = default_params.max_subst;
203: else if (value != TRE_PARAM_UNSET)
204: reach->params.max_subst = value;
205:
206: /* Set maximum number of errors. */
207: value = pa[TRE_PARAM_MAX_ERR];
208: if (value == TRE_PARAM_DEFAULT)
209: reach->params.max_err = default_params.max_err;
210: else if (value != TRE_PARAM_UNSET)
211: reach->params.max_err = value;
212: }
213: /* Approximate TNFA matcher. Scans `string' while keeping two reach tables (current and next position) and, at each position, tries inserts, deletes, and exact or substituted transitions within the limits stored in each entry's parameters. The costs of the best match are written to `match', its end offset to `*match_end_ofs', and its tag values to `match_tags' when that pointer is non-NULL.
214: Returns REG_OK if a match was found, REG_NOMATCH if not, and REG_ESPACE if the work buffer could not be allocated. */
215: reg_errcode_t
216: tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
217: tre_str_type_t type, int *match_tags,
218: regamatch_t *match, regaparams_t default_params,
219: int eflags, int *match_end_ofs)
220: {
221: /* State variables required by GET_NEXT_WCHAR. */
222: tre_char_t prev_c = 0, next_c = 0;
223: const char *str_byte = (const char*)string;
224: int pos = -1;
225: unsigned int pos_add_next = 1;
226: #ifdef TRE_WCHAR
227: const wchar_t *str_wide = (const wchar_t*)string;
228: #ifdef TRE_MBSTATE
229: mbstate_t mbstate;
230: #endif /* TRE_MBSTATE */
231: #endif /* TRE_WCHAR */
232: int reg_notbol = eflags & REG_NOTBOL;
233: int reg_noteol = eflags & REG_NOTEOL;
234: int reg_newline = tnfa->cflags & REG_NEWLINE;
235: int str_user_end = 0;
236:
237: int prev_pos;
238:
239: /* Compilation flags for this regexp. */
240: int cflags = tnfa->cflags;
241:
242: /* Number of tags. */
243: int num_tags;
244: /* The reach tables. */
245: tre_tnfa_approx_reach_t *reach, *reach_next;
246: /* Tag array for temporary use. */
247: int *tmp_tags;
248:
249: /* End offset of best match so far, or -1 if no match found yet. */
250: int match_eo = -1;
251: /* Costs of the match. */
252: int match_costs[TRE_M_LAST];
253:
254: /* Space for temporary data required for matching. */
255: unsigned char *buf;
256:
257: int i, id;
258:
259: if (!match_tags)
260: num_tags = 0;
261: else
262: num_tags = tnfa->num_tags;
263:
264: #ifdef TRE_MBSTATE
265: memset(&mbstate, '\0', sizeof(mbstate));
266: #endif /* TRE_MBSTATE */
267:
268: DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
269: "match_tags %p\n",
270: type, len, eflags,
271: match_tags));
272: DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
273: default_params.max_cost,
274: default_params.cost_ins,
275: default_params.cost_del,
276: default_params.cost_subst));
277:
278: /* Allocate memory for temporary data required for matching. This needs to
279: be done for every matching operation to be thread safe. This allocates
280: everything in a single large block from the stack frame using alloca()
281: or with malloc() if alloca is unavailable. */
282: {
283: unsigned char *buf_cursor;
284: /* Space needed for one array of tags. */
285: int tag_bytes = sizeof(*tmp_tags) * num_tags;
286: /* Space needed for one reach table. */
287: int reach_bytes = sizeof(*reach_next) * tnfa->num_states;
288: /* Total space needed. */
289: int total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
290: /* Add some extra to make sure we can align the pointers. The multiplier
291: used here must be equal to the number of ALIGN calls below. */
292: total_bytes += (sizeof(long) - 1) * 3;
293:
294: /* Allocate the memory. */
295: #ifdef TRE_USE_ALLOCA
296: buf = (unsigned char*)alloca(total_bytes);
297: #else /* !TRE_USE_ALLOCA */
298: buf = (unsigned char*)xmalloc(total_bytes);
299: #endif /* !TRE_USE_ALLOCA */
300: if (!buf)
301: return REG_ESPACE;
302: memset(buf, 0, total_bytes);
303:
304: /* Allocate `tmp_tags' from `buf'. */
305: tmp_tags = (int*)(void *)buf;
306: buf_cursor = buf + tag_bytes;
307: buf_cursor += ALIGN(buf_cursor, long);
308:
309: /* Allocate `reach' from `buf'. */
310: reach = (tre_tnfa_approx_reach_t*)(void *)buf_cursor;
311: buf_cursor += reach_bytes;
312: buf_cursor += ALIGN(buf_cursor, long);
313:
314: /* Allocate `reach_next' from `buf'. */
315: reach_next = (tre_tnfa_approx_reach_t*)(void *)buf_cursor;
316: buf_cursor += reach_bytes;
317: buf_cursor += ALIGN(buf_cursor, long);
318:
319: /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
320: for (i = 0; i < tnfa->num_states; i++)
321: {
322: reach[i].tags = (int*)(void *)buf_cursor;
323: buf_cursor += tag_bytes;
324: reach_next[i].tags = (int*)(void *)buf_cursor;
325: buf_cursor += tag_bytes;
326: }
327: assert(buf_cursor <= buf + total_bytes);
328: }
329:
330: for (i = 0; i < TRE_M_LAST; i++)
331: match_costs[i] = INT_MAX;
332:
333: /* Mark the reach arrays empty. */
334: for (i = 0; i < tnfa->num_states; i++)
335: reach[i].pos = reach_next[i].pos = -2;
336:
337: prev_pos = pos;
338: GET_NEXT_WCHAR();
339: pos = 0;
340:
341: while (1)
342: {
343: DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
344:
345: /* Add initial states to `reach_next' if an exact match has not yet
346: been found. */
347: if (match_costs[TRE_M_COST] > 0)
348: {
349: tre_tnfa_transition_t *trans;
350: DPRINT((" init"));
351: for (trans = tnfa->initial; trans->state; trans++)
352: {
353: int id = trans->state_id;
354:
355: /* If this state is not currently in `reach_next', add it
356: there. */
357: if (reach_next[id].pos < pos)
358: {
359: if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
360: {
361: /* Assertions failed, don't add this state. */
362: DPRINT((" !%d (assert)", id));
363: continue;
364: }
365: DPRINT((" %d", id));
366: reach_next[id].state = trans->state;
367: reach_next[id].pos = pos;
368:
369: /* Compute tag values after this transition. */
370: for (i = 0; i < num_tags; i++)
371: reach_next[id].tags[i] = -1;
372:
373: if (trans->tags)
374: for (i = 0; trans->tags[i] >= 0; i++)
375: if (trans->tags[i] < num_tags)
376: reach_next[id].tags[trans->tags[i]] = pos;
377:
378: /* Set the parameters, depth, and costs. */
379: reach_next[id].params = default_params;
380: reach_next[id].depth = 0;
381: for (i = 0; i < TRE_M_LAST; i++)
382: reach_next[id].costs[0][i] = 0;
383: if (trans->params)
384: tre_set_params(&reach_next[id], trans->params,
385: default_params);
386:
387: /* If this is the final state, mark the exact match. */
388: if (trans->state == tnfa->final)
389: {
390: match_eo = pos;
391: for (i = 0; i < num_tags; i++)
392: match_tags[i] = reach_next[id].tags[i];
393: for (i = 0; i < TRE_M_LAST; i++)
394: match_costs[i] = 0;
395: }
396: }
397: }
398: DPRINT(("\n"));
399: }
400:
401:
402: /* Handle inserts. This is done by pretending there's an epsilon
403: transition from each state in `reach' back to the same state.
404: We don't need to worry about the final state here; this will never
405: give a better match than what we already have. */
406: for (id = 0; id < tnfa->num_states; id++)
407: {
408: int depth;
409: int cost, cost0;
410:
411: if (reach[id].pos != prev_pos)
412: {
413: DPRINT((" insert: %d not reached\n", id));
414: continue; /* Not reached. */
415: }
416:
417: depth = reach[id].depth;
418:
419: /* Compute and check cost at current depth. */
420: cost = reach[id].costs[depth][TRE_M_COST];
421: if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
422: cost += reach[id].params.cost_ins;
423: if (cost > reach[id].params.max_cost)
424: continue; /* Cost too large. */
425:
426: /* Check number of inserts at current depth. */
427: if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
428: > reach[id].params.max_ins)
429: continue; /* Too many inserts. */
430:
431: /* Check total number of errors at current depth. */
432: if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
433: > reach[id].params.max_err)
434: continue; /* Too many errors. */
435:
436: /* Compute overall cost. */
437: cost0 = cost;
438: if (depth > 0)
439: {
440: cost0 = reach[id].costs[0][TRE_M_COST];
441: if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
442: cost0 += reach[id].params.cost_ins;
443: else
444: cost0 += default_params.cost_ins;
445: }
446:
447: DPRINT((" insert: from %d to %d, cost %d: ", id, id,
448: reach[id].costs[depth][TRE_M_COST]));
449: if (reach_next[id].pos == pos
450: && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
451: {
452: DPRINT(("lose\n"));
453: continue;
454: }
455: DPRINT(("win\n"));
456:
457: /* Copy state, position, tags, parameters, and depth. */
458: reach_next[id].state = reach[id].state;
459: reach_next[id].pos = pos;
460: for (i = 0; i < num_tags; i++)
461: reach_next[id].tags[i] = reach[id].tags[i];
462: reach_next[id].params = reach[id].params;
463: reach_next[id].depth = reach[id].depth;
464:
465: /* Set the costs after this transition. */
466: memcpy(reach_next[id].costs, reach[id].costs,
467: sizeof(reach_next[id].costs[0][0])
468: * TRE_M_LAST * (depth + 1));
469: reach_next[id].costs[depth][TRE_M_COST] = cost;
470: reach_next[id].costs[depth][TRE_M_NUM_INS]++;
471: reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
472: if (depth > 0)
473: {
474: reach_next[id].costs[0][TRE_M_COST] = cost0;
475: reach_next[id].costs[0][TRE_M_NUM_INS]++;
476: reach_next[id].costs[0][TRE_M_NUM_ERR]++;
477: }
478:
479: }
480:
481:
482: /* Handle deletes. This is done by traversing through the whole TNFA
483: pretending that all transitions are epsilon transitions, until
484: no more states can be reached with better costs. */
485: {
486: /* XXX - dynamic ringbuffer size (the deque is fixed at 512 slots for now). */
487: tre_tnfa_approx_reach_t *ringbuffer[512];
488: tre_tnfa_approx_reach_t **deque_start, **deque_end;
489:
490: deque_start = deque_end = ringbuffer;
491:
492: /* Add all states in `reach_next' to the deque, wrapping at the end of `ringbuffer' like the push inside the loop below so a large TNFA cannot write past the array. */
493: for (id = 0; id < tnfa->num_states; id++)
494: {
495: if (reach_next[id].pos != pos)
496: continue;
497: *deque_end = &reach_next[id];
498: if (++deque_end >= (ringbuffer + 512)) deque_end = ringbuffer;
499: assert(deque_end != deque_start);
500: }
501:
502: /* Repeat until the deque is empty. */
503: while (deque_end != deque_start)
504: {
505: tre_tnfa_approx_reach_t *reach_p;
506: int id;
507: int depth;
508: int cost, cost0;
509: tre_tnfa_transition_t *trans;
510:
511: /* Pop the first item off the deque. */
512: reach_p = *deque_start;
513: id = reach_p - reach_next;
514: depth = reach_p->depth;
515:
516: /* Compute cost at current depth. */
517: cost = reach_p->costs[depth][TRE_M_COST];
518: if (reach_p->params.cost_del != TRE_PARAM_UNSET)
519: cost += reach_p->params.cost_del;
520:
521: /* Check cost, number of deletes, and total number of errors
522: at current depth. */
523: if (cost > reach_p->params.max_cost
524: || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
525: > reach_p->params.max_del)
526: || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
527: > reach_p->params.max_err))
528: {
529: /* Too many errors or cost too large. */
530: DPRINT((" delete: from %03d: cost too large\n", id));
531: deque_start++;
532: if (deque_start >= (ringbuffer + 512))
533: deque_start = ringbuffer;
534: continue;
535: }
536:
537: /* Compute overall cost. */
538: cost0 = cost;
539: if (depth > 0)
540: {
541: cost0 = reach_p->costs[0][TRE_M_COST];
542: if (reach_p->params.cost_del != TRE_PARAM_UNSET)
543: cost0 += reach_p->params.cost_del;
544: else
545: cost0 += default_params.cost_del;
546: }
547:
548: for (trans = reach_p->state; trans->state; trans++)
549: {
550: int dest_id = trans->state_id;
551: DPRINT((" delete: from %03d to %03d, cost %d (%d): ",
552: id, dest_id, cost0, reach_p->params.max_cost));
553:
554: if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
555: {
556: DPRINT(("assertion failed\n"));
557: continue;
558: }
559:
560: /* Compute tag values after this transition. */
561: for (i = 0; i < num_tags; i++)
562: tmp_tags[i] = reach_p->tags[i];
563: if (trans->tags)
564: for (i = 0; trans->tags[i] >= 0; i++)
565: if (trans->tags[i] < num_tags)
566: tmp_tags[trans->tags[i]] = pos;
567:
568: /* If another path has also reached this state, choose the one
569: with the smallest cost or best tags if costs are equal. */
570: if (reach_next[dest_id].pos == pos
571: && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
572: || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
573: && (!match_tags
574: || !tre_tag_order(num_tags,
575: tnfa->tag_directions,
576: tmp_tags,
577: reach_next[dest_id].tags)))))
578: {
579: DPRINT(("lose, cost0 %d, have %d\n",
580: cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
581: continue;
582: }
583: DPRINT(("win\n"));
584:
585: /* Set state, position, tags, parameters, depth, and costs. */
586: reach_next[dest_id].state = trans->state;
587: reach_next[dest_id].pos = pos;
588: for (i = 0; i < num_tags; i++)
589: reach_next[dest_id].tags[i] = tmp_tags[i];
590:
591: reach_next[dest_id].params = reach_p->params;
592: if (trans->params)
593: tre_set_params(&reach_next[dest_id], trans->params,
594: default_params);
595:
596: reach_next[dest_id].depth = reach_p->depth;
597: memcpy(&reach_next[dest_id].costs,
598: reach_p->costs,
599: sizeof(reach_p->costs[0][0])
600: * TRE_M_LAST * (depth + 1));
601: reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
602: reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
603: reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
604: if (depth > 0)
605: {
606: reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
607: reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
608: reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
609: }
610:
611: if (trans->state == tnfa->final
612: && (match_eo < 0
613: || match_costs[TRE_M_COST] > cost0
614: || (match_costs[TRE_M_COST] == cost0
615: && (num_tags > 0
616: && tmp_tags[0] <= match_tags[0]))))
617: {
618: DPRINT((" setting new match at %d, cost %d\n",
619: pos, cost0));
620: match_eo = pos;
621: memcpy(match_costs, reach_next[dest_id].costs[0],
622: sizeof(match_costs[0]) * TRE_M_LAST);
623: for (i = 0; i < num_tags; i++)
624: match_tags[i] = tmp_tags[i];
625: }
626:
627: /* Add to the end of the deque. */
628: *deque_end = &reach_next[dest_id];
629: deque_end++;
630: if (deque_end >= (ringbuffer + 512))
631: deque_end = ringbuffer;
632: assert(deque_end != deque_start);
633: }
634: deque_start++;
635: if (deque_start >= (ringbuffer + 512))
636: deque_start = ringbuffer;
637: }
638:
639: }
640:
641: #ifdef TRE_DEBUG
642: tre_print_reach(tnfa, reach_next, pos, num_tags);
643: #endif /* TRE_DEBUG */
644:
645: /* Check for end of string. */
646: if (len < 0)
647: {
648: if (next_c == L'\0')
649: break;
650: }
651: else
652: {
653: if (pos >= len)
654: break;
655: }
656:
657: prev_pos = pos;
658: GET_NEXT_WCHAR();
659:
660: /* Swap `reach' and `reach_next'. */
661: {
662: tre_tnfa_approx_reach_t *tmp;
663: tmp = reach;
664: reach = reach_next;
665: reach_next = tmp;
666: }
667:
668: /* Handle exact matches and substitutions. */
669: for (id = 0; id < tnfa->num_states; id++)
670: {
671: tre_tnfa_transition_t *trans;
672:
673: if (reach[id].pos < prev_pos)
674: continue; /* Not reached. */
675: for (trans = reach[id].state; trans->state; trans++)
676: {
677: int dest_id;
678: int depth;
679: int cost, cost0, err;
680:
681: if (trans->assertions
682: && (CHECK_ASSERTIONS(trans->assertions)
683: /* Handle character klass transitions. */
684: || ((trans->assertions & ASSERT_CHAR_CLASS)
685: && !(cflags & REG_ICASE)
686: && !tre_isctype((tre_cint_t)prev_c, trans->u.klass))
687: || ((trans->assertions & ASSERT_CHAR_CLASS)
688: && (cflags & REG_ICASE)
689: && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
690: trans->u.klass)
691: && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
692: trans->u.klass)))
693: || ((trans->assertions & ASSERT_CHAR_CLASS_NEG)
694: && tre_neg_char_klasses_match(trans->neg_klasses,
695: (tre_cint_t)prev_c,
696: cflags & REG_ICASE))))
697: {
698: DPRINT((" exact, from %d: assert failed\n", id));
699: continue;
700: }
701:
702: depth = reach[id].depth;
703: dest_id = trans->state_id;
704:
705: cost = reach[id].costs[depth][TRE_M_COST];
706: cost0 = reach[id].costs[0][TRE_M_COST];
707: err = 0;
708:
709: if (trans->code_min > prev_c ||
710: trans->code_max < prev_c)
711: {
712: /* Handle substitutes. The required character was not in
713: the string, so match it in place of whatever was supposed
714: to be there and increase costs accordingly. */
715: err = 1;
716:
717: /* Compute and check cost at current depth. */
718: cost = reach[id].costs[depth][TRE_M_COST];
719: if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
720: cost += reach[id].params.cost_subst;
721: if (cost > reach[id].params.max_cost)
722: continue; /* Cost too large. */
723:
724: /* Check number of substitutes at current depth. */
725: if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
726: > reach[id].params.max_subst)
727: continue; /* Too many substitutes. */
728:
729: /* Check total number of errors at current depth. */
730: if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
731: > reach[id].params.max_err)
732: continue; /* Too many errors. */
733:
734: /* Compute overall cost. */
735: cost0 = cost;
736: if (depth > 0)
737: {
738: cost0 = reach[id].costs[0][TRE_M_COST];
739: if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
740: cost0 += reach[id].params.cost_subst;
741: else
742: cost0 += default_params.cost_subst;
743: }
744: DPRINT((" subst, from %03d to %03d, cost %d: ",
745: id, dest_id, cost0));
746: }
747: else
748: DPRINT((" exact, from %03d to %03d, cost %d: ",
749: id, dest_id, cost0));
750:
751: /* Compute tag values after this transition. */
752: for (i = 0; i < num_tags; i++)
753: tmp_tags[i] = reach[id].tags[i];
754: if (trans->tags)
755: for (i = 0; trans->tags[i] >= 0; i++)
756: if (trans->tags[i] < num_tags)
757: tmp_tags[trans->tags[i]] = pos;
758:
759: /* If another path has also reached this state, choose the
760: one with the smallest cost or best tags if costs are equal. */
761: if (reach_next[dest_id].pos == pos
762: && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
763: || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
764: && !tre_tag_order(num_tags, tnfa->tag_directions,
765: tmp_tags,
766: reach_next[dest_id].tags))))
767: {
768: DPRINT(("lose\n"));
769: continue;
770: }
771: DPRINT(("win %d %d\n",
772: reach_next[dest_id].pos,
773: reach_next[dest_id].costs[0][TRE_M_COST]));
774:
775: /* Set state, position, tags, and depth. */
776: reach_next[dest_id].state = trans->state;
777: reach_next[dest_id].pos = pos;
778: for (i = 0; i < num_tags; i++)
779: reach_next[dest_id].tags[i] = tmp_tags[i];
780: reach_next[dest_id].depth = reach[id].depth;
781:
782: /* Set parameters. */
783: reach_next[dest_id].params = reach[id].params;
784: if (trans->params)
785: tre_set_params(&reach_next[dest_id], trans->params,
786: default_params);
787:
788: /* Set the costs after this transition. */
789: memcpy(&reach_next[dest_id].costs,
790: reach[id].costs,
791: sizeof(reach[id].costs[0][0])
792: * TRE_M_LAST * (depth + 1));
793: reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
794: reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
795: reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
796: if (depth > 0)
797: {
798: reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
799: reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
800: reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
801: }
802:
803: if (trans->state == tnfa->final
804: && (match_eo < 0
805: || cost0 < match_costs[TRE_M_COST]
806: || (cost0 == match_costs[TRE_M_COST]
807: && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
808: {
809: DPRINT((" setting new match at %d, cost %d\n",
810: pos, cost0));
811: match_eo = pos;
812: for (i = 0; i < TRE_M_LAST; i++)
813: match_costs[i] = reach_next[dest_id].costs[0][i];
814: for (i = 0; i < num_tags; i++)
815: match_tags[i] = tmp_tags[i];
816: }
817: }
818: }
819: }
820:
821: DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
822: match_costs[TRE_M_COST]));
823:
824: #ifndef TRE_USE_ALLOCA
825: if (buf)
826: xfree(buf);
827: #endif /* !TRE_USE_ALLOCA */
828:
829: match->cost = match_costs[TRE_M_COST];
830: match->num_ins = match_costs[TRE_M_NUM_INS];
831: match->num_del = match_costs[TRE_M_NUM_DEL];
832: match->num_subst = match_costs[TRE_M_NUM_SUBST];
833: *match_end_ofs = match_eo;
834:
835: return match_eo >= 0 ? REG_OK : REG_NOMATCH;
836: }
Start cpp section to tre/tre_match-backtrack.cpp[1
/1
]
1: #line 5409 "./lpsrc/tre.pak"
2: /*
3: tre-match-backtrack.c - TRE backtracking regex matching engine
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22: /*
23: This matcher is for regexps that use back referencing. Regexp matching
24: with back referencing is an NP-complete problem on the number of back
25: references. The easiest way to match them is to use a backtracking
26: routine which basically goes through all possible paths in the TNFA
27: and chooses the one which results in the best (leftmost and longest)
28: match. This can be spectacularly expensive and may run out of stack
29: space, but there really is no better known generic algorithm. Quoting
30: Henry Spencer from comp.compilers:
31: <URL: http://compilers.iecc.com/comparch/article/93-03-102>
32:
33: POSIX.2 REs require longest match, which is really exciting to
34: implement since the obsolete ("basic") variant also includes
35: \<digit>. I haven't found a better way of tackling this than doing
36: a preliminary match using a DFA (or simulation) on a modified RE
37: that just replicates subREs for \<digit>, and then doing a
38: backtracking match to determine whether the subRE matches were
39: right. This can be rather slow, but I console myself with the
40: thought that people who use \<digit> deserve very slow execution.
41: (Pun unintentional but very appropriate.)
42:
43: */
44:
45:
46:
47:
48:
49: /* AIX requires this to be the first thing in the file. */
50:
51:
52:
53:
54:
55:
56:
57:
58: char *alloca ();
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81:
82:
83:
84:
85:
86: /* Snapshot of the matcher's position; the push-macro body further down stores these fields and the pop-macro body reads them back. */
87: typedef struct {
88: int pos; /* Saved `pos'. */
89: const char *str_byte; /* Saved byte-string cursor. */
90: #ifdef TRE_WCHAR
91: const wchar_t *str_wide;
92: #endif /* TRE_WCHAR */
93: tre_tnfa_transition_t *state; /* TNFA state to resume from. */
94: int state_id;
95: int next_c; /* Saved `next_c'. */
96: int *tags; /* Copy of the tag values (tnfa->num_tags ints). */
97: #ifdef TRE_MBSTATE
98: mbstate_t mbstate;
99: #endif /* TRE_MBSTATE */
100: } tre_backtrack_item_t;
101: /* Node of the doubly linked backtracking stack. A push reuses `next' when it already exists and allocates a new node otherwise; a pop steps back through `prev'. */
102: typedef struct tre_backtrack_struct {
103: tre_backtrack_item_t item;
104: struct tre_backtrack_struct *prev;
105: struct tre_backtrack_struct *next;
106: } *tre_backtrack_t;
107:
108:
109:
110:
111:
112:
113:
114:
115:
116:
117:
118:
119:
120:
121:
122:
123:
124:
125:
126:
127:
128:
129:
130:
131:
132:
133:
134:
135:
136: /* Push-macro body; its #define line is absent from this listing (presumably BT_STACK_PUSH, whose call site passes pos, str_byte, str_wide, state, state_id, next_c, tags, mbstate). Moves `stack' to the next node, allocating the node and its tag array on first use, then stores the given values there. On allocation failure it destroys `mem', frees tags/pmatch/states_seen, and returns REG_ESPACE from the enclosing function. */
137: do \
138: { \
139: int i; \
140: if (!stack->next) \
141: { \
142: tre_backtrack_t s; \
143: s = (tre_backtrack_struct*)tre_bt_mem_alloc(mem, sizeof(*s)); \
144: if (!s) \
145: { \
146: tre_bt_mem_destroy(mem); \
147: if (tags) \
148: xfree(tags); \
149: if (pmatch) \
150: xfree(pmatch); \
151: if (states_seen) \
152: xfree(states_seen); \
153: return REG_ESPACE; \
154: } \
155: s->prev = stack; \
156: s->next = NULL; \
157: s->item.tags = (int*)tre_bt_mem_alloc(mem, \
158: sizeof(*tags) * tnfa->num_tags); \
159: if (!s->item.tags) \
160: { \
161: tre_bt_mem_destroy(mem); \
162: if (tags) \
163: xfree(tags); \
164: if (pmatch) \
165: xfree(pmatch); \
166: if (states_seen) \
167: xfree(states_seen); \
168: return REG_ESPACE; \
169: } \
170: stack->next = s; \
171: stack = s; \
172: } \
173: else \
174: stack = stack->next; \
175: stack->item.pos = (_pos); \
176: stack->item.str_byte = (_str_byte); \
177: BT_STACK_WIDE_IN; \
178: stack->item.state = (_state); \
179: stack->item.state_id = (_state_id); \
180: stack->item.next_c = (_next_c); \
181: for (i = 0; i < tnfa->num_tags; i++) \
182: stack->item.tags[i] = (_tags)[i]; \
183: BT_STACK_MBSTATE_IN; \
184: } \
185: while (0)
186:
187: /* Pop-macro body; its #define line is absent from this listing. Restores pos, str_byte, state, next_c and the tag values from the current stack node, then steps `stack' back to the previous node (which must exist). */
188: do \
189: { \
190: int i; \
191: assert(stack->prev); \
192: pos = stack->item.pos; \
193: str_byte = stack->item.str_byte; \
194: BT_STACK_WIDE_OUT; \
195: state = stack->item.state; \
196: next_c = stack->item.next_c; \
197: for (i = 0; i < tnfa->num_tags; i++) \
198: tags[i] = stack->item.tags[i]; \
199: BT_STACK_MBSTATE_OUT; \
200: stack = stack->prev; \
201: } \
202: while (0)
203:
204:
205:
206:
207: reg_errcode_t
208: tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
209: int len, tre_str_type_t type, int *match_tags,
210: int eflags, int *match_end_ofs)
211: {
212: /* State variables required by GET_NEXT_WCHAR. */
213: tre_char_t prev_c = 0, next_c = 0;
214: const char *str_byte = (const char*)string;
215: int pos = 0;
216: unsigned int pos_add_next = 1;
217: #ifdef TRE_WCHAR
218: const wchar_t *str_wide = (const wchar_t*)string;
219: #ifdef TRE_MBSTATE
220: mbstate_t mbstate;
221: #endif /* TRE_MBSTATE */
222: #endif /* TRE_WCHAR */
223: int reg_notbol = eflags & REG_NOTBOL;
224: int reg_noteol = eflags & REG_NOTEOL;
225: int reg_newline = tnfa->cflags & REG_NEWLINE;
226: int str_user_end = 0;
227:
228: /* These are used to remember the necessary values of the above
229: variables to return to the position where the current search
230: started from. */
231: int next_c_start;
232: const char *str_byte_start;
233: int pos_start = -1;
234: #ifdef TRE_WCHAR
235: const wchar_t *str_wide_start;
236: #endif /* TRE_WCHAR */
237: #ifdef TRE_MBSTATE
238: mbstate_t mbstate_start;
239: #endif /* TRE_MBSTATE */
240:
241: /* Compilation flags for this regexp. */
242: int cflags = tnfa->cflags;
243:
244: /* End offset of best match so far, or -1 if no match found yet. */
245: int match_eo = -1;
246: /* Tag arrays. */
247: int *next_tags, *tags = NULL;
248: /* Current TNFA state. */
249: tre_tnfa_transition_t *state;
250: int *states_seen = NULL;
251:
252: /* Memory allocator for allocating the backtracking stack. */
253: tre_mem_t mem = tre_bt_mem_new();
254:
255: /* The backtracking stack. */
256: tre_backtrack_t stack;
257:
258: tre_tnfa_transition_t *trans_i;
259: regmatch_t *pmatch = NULL;
260: int ret;
261:
262: #ifdef TRE_MBSTATE
263: memset(&mbstate, '\0', sizeof(mbstate));
264: #endif /* TRE_MBSTATE */
265:
266: if (!mem)
267: return REG_ESPACE;
268: stack = (tre_backtrack_struct*)tre_bt_mem_alloc(mem, sizeof(*stack));
269: if (!stack)
270: {
271: ret = REG_ESPACE;
272: goto error_exit;
273: }
274: stack->prev = NULL;
275: stack->next = NULL;
276:
277: DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
278: DPRINT(("len = %d\n", len));
279:
280: #ifdef TRE_USE_ALLOCA
281: tags = (int*)alloca(sizeof(*tags) * tnfa->num_tags);
282: pmatch = (regmatch_t*)alloca(sizeof(*pmatch) * tnfa->num_submatches);
283: states_seen = (int*)alloca(sizeof(*states_seen) * tnfa->num_states);
284: #else /* !TRE_USE_ALLOCA */
285: tags = (int*)xmalloc(sizeof(*tags) * tnfa->num_tags);
286: if (!tags)
287: {
288: ret = REG_ESPACE;
289: goto error_exit;
290: }
291: pmatch = (regmatch_t*)xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
292: if (!pmatch)
293: {
294: ret = REG_ESPACE;
295: goto error_exit;
296: }
297: states_seen = (int*)xmalloc(sizeof(*states_seen) * tnfa->num_states);
298: if (!states_seen)
299: {
300: ret = REG_ESPACE;
301: goto error_exit;
302: }
303: #endif /* !TRE_USE_ALLOCA */
304:
305: retry:
306: {
307: int i;
308: for (i = 0; i < tnfa->num_tags; i++)
309: {
310: tags[i] = -1;
311: if (match_tags)
312: match_tags[i] = -1;
313: }
314: for (i = 0; i < tnfa->num_states; i++)
315: states_seen[i] = 0;
316: }
317:
318: state = NULL;
319: pos = pos_start;
320: if (type == STR_USER)
321: str_source->rewind(pos + pos_add_next, str_source->context);
322: GET_NEXT_WCHAR();
323: pos_start = pos;
324: next_c_start = next_c;
325: str_byte_start = str_byte;
326: #ifdef TRE_WCHAR
327: str_wide_start = str_wide;
328: #endif /* TRE_WCHAR */
329: #ifdef TRE_MBSTATE
330: mbstate_start = mbstate;
331: #endif /* TRE_MBSTATE */
332:
333: /* Handle initial states. */
334: next_tags = NULL;
335: for (trans_i = tnfa->initial; trans_i->state; trans_i++)
336: {
337: DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
338: if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
339: {
340: DPRINT(("assert failed\n"));
341: continue;
342: }
343: if (state == NULL)
344: {
345: /* Start from this state. */
346: state = trans_i->state;
347: next_tags = trans_i->tags;
348: }
349: else
350: {
351: /* Backtrack to this state. */
352: DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
353: BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
354: trans_i->state_id, next_c, tags, mbstate);
355: {
356: int *tmp = trans_i->tags;
357: if (tmp)
358: while (*tmp >= 0)
359: stack->item.tags[*tmp++] = pos;
360: }
361: }
362: }
363:
364: if (next_tags)
365: for (; *next_tags >= 0; next_tags++)
366: tags[*next_tags] = pos;
367:
368:
369: DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
370: DPRINT(("pos:chr/code | state and tags\n"));
371: DPRINT(("-------------+------------------------------------------------\n"));
372:
373: if (state == NULL)
374: goto backtrack;
375:
376: while (1)
377: {
378: tre_tnfa_transition_t *trans_i, *next_state;
379: int empty_br_match;
380:
381: DPRINT(("start loop\n"));
382: if (state == tnfa->final)
383: {
384: DPRINT((" match found, %d %d\n", match_eo, pos));
385: if (match_eo < pos
386: || (match_eo == pos
387: && match_tags
388: && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
389: tags, match_tags)))
390: {
391: int i;
392: /* This match wins the previous match. */
393: DPRINT((" win previous\n"));
394: match_eo = pos;
395: if (match_tags)
396: for (i = 0; i < tnfa->num_tags; i++)
397: match_tags[i] = tags[i];
398: }
399: /* Our TNFAs never have transitions leaving from the final state,
400: so we jump right to backtracking. */
401: goto backtrack;
402: }
403:
404: #ifdef TRE_DEBUG
405: DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
406: state));
407: {
408: int i;
409: for (i = 0; i < tnfa->num_tags; i++)
410: DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
411: DPRINT(("\n"));
412: }
413: #endif /* TRE_DEBUG */
414:
415: /* Go to the next character in the input string. */
416: empty_br_match = 0;
417: trans_i = state;
418: if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
419: {
420: /* This is a back reference state. All transitions leaving from
421: this state have the same back reference "assertion". Instead
422: of reading the next character, we match the back reference. */
423: int so, eo, bt = trans_i->u.backref;
424: int bt_len;
425: int result;
426:
427: DPRINT((" should match back reference %d\n", bt));
428: /* Get the substring we need to match against. Remember to
429: turn off REG_NOSUB temporarily. */
430: tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & !REG_NOSUB,
431: tnfa, tags, pos);
432: so = pmatch[bt].rm_so;
433: eo = pmatch[bt].rm_eo;
434: bt_len = eo - so;
435:
436: #ifdef TRE_DEBUG
437: {
438: int slen;
439: if (len < 0)
440: slen = bt_len;
441: else
442: slen = MIN(bt_len, len - pos);
443:
444: if (type == STR_BYTE)
445: {
446: DPRINT((" substring (len %d) is [%d, %d[: '%.*s'\n",
447: bt_len, so, eo, bt_len, (char*)string + so));
448: DPRINT((" current string is '%.*s'\n", slen, str_byte - 1));
449: }
450: #ifdef TRE_WCHAR
451: else if (type == STR_WIDE)
452: {
453: DPRINT((" substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
454: bt_len, so, eo, bt_len, (wchar_t*)string + so));
455: DPRINT((" current string is '%.*" STRF "'\n",
456: slen, str_wide - 1));
457: }
458: #endif /* TRE_WCHAR */
459: }
460: #endif
461:
462: if (len < 0)
463: {
464: if (type == STR_USER)
465: result = str_source->compare(so, pos, bt_len,
466: str_source->context);
467: #ifdef TRE_WCHAR
468: else if (type == STR_WIDE)
469: result = wcsncmp((wchar_t*)string + so, str_wide - 1, bt_len);
470: #endif /* TRE_WCHAR */
471: else
472: result = strncmp((char*)string + so, str_byte - 1, bt_len);
473: }
474: else if (len - pos < bt_len)
475: result = 1;
476: else
477: result = memcmp((char*)string + so, str_byte - 1, bt_len);
478:
479: /* We can ignore multibyte characters here because the backref
480: string is already aligned at character boundaries. */
481: if (result == 0)
482: {
483: /* Back reference matched. Check for infinite loop. */
484: if (bt_len == 0)
485: empty_br_match = 1;
486: if (empty_br_match && states_seen[trans_i->state_id])
487: {
488: DPRINT((" avoid loop\n"));
489: goto backtrack;
490: }
491:
492: states_seen[trans_i->state_id] = empty_br_match;
493:
494: /* Advance in input string and resync `prev_c', `next_c'
495: and pos. */
496: DPRINT((" back reference matched\n"));
497: str_byte += bt_len - 1;
498: pos += bt_len - 1;
499: GET_NEXT_WCHAR();
500: DPRINT((" pos now %d\n", pos));
501: }
502: else
503: {
504: DPRINT((" back reference did not match\n"));
505: goto backtrack;
506: }
507: }
508: else
509: {
510: /* Check for end of string. */
511: if (len < 0)
512: {
513: if (next_c == L'\0')
514: goto backtrack;
515: }
516: else
517: {
518: if (pos >= len)
519: goto backtrack;
520: }
521:
522: /* Read the next character. */
523: GET_NEXT_WCHAR();
524: }
525:
526: next_state = NULL;
527: for (trans_i = state; trans_i->state; trans_i++)
528: {
529: DPRINT((" transition %d-%d (%c-%c) %d to %d\n",
530: trans_i->code_min, trans_i->code_max,
531: trans_i->code_min, trans_i->code_max,
532: trans_i->assertions, trans_i->state_id));
533: if (trans_i->code_min <= prev_c && trans_i->code_max >= prev_c)
534: {
535: if (trans_i->assertions
536: && (CHECK_ASSERTIONS(trans_i->assertions)
537: /* Handle character klass transitions. */
538: || ((trans_i->assertions & ASSERT_CHAR_CLASS)
539: && !(cflags & REG_ICASE)
540: && !tre_isctype((tre_cint_t)prev_c, trans_i->u.klass))
541: || ((trans_i->assertions & ASSERT_CHAR_CLASS)
542: && (cflags & REG_ICASE)
543: && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
544: trans_i->u.klass)
545: && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
546: trans_i->u.klass)))
547: || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
548: && tre_neg_char_klasses_match(trans_i->neg_klasses,
549: (tre_cint_t)prev_c,
550: cflags & REG_ICASE))))
551: {
552: DPRINT((" assertion failed\n"));
553: continue;
554: }
555:
556: if (next_state == NULL)
557: {
558: /* First matching transition. */
559: DPRINT((" Next state is %d\n", trans_i->state_id));
560: next_state = trans_i->state;
561: next_tags = trans_i->tags;
562: }
563: else
564: {
565: /* Second matching transition. We may need to backtrack here
566: to take this transition instead of the first one, so we
567: push this transition in the backtracking stack so we can
568: jump back here if needed. */
569: DPRINT((" saving state %d for backtracking\n",
570: trans_i->state_id));
571: BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
572: trans_i->state_id, next_c, tags, mbstate);
573: {
574: int *tmp;
575: for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
576: stack->item.tags[*tmp] = pos;
577: }
578: #if 0 /* XXX - it's important not to look at all transitions here to keep
579: the stack small! */
580: break;
581: #endif
582: }
583: }
584: }
585:
586: if (next_state != NULL)
587: {
588: /* Matching transitions were found. Take the first one. */
589: state = next_state;
590:
591: /* Update the tag values. */
592: if (next_tags)
593: while (*next_tags >= 0)
594: tags[*next_tags++] = pos;
595: }
596: else
597: {
598: backtrack:
599: /* A matching transition was not found. Try to backtrack. */
600: if (stack->prev)
601: {
602: DPRINT((" backtracking\n"));
603: if (stack->item.state->assertions && ASSERT_BACKREF)
604: {
605: DPRINT((" states_seen[%d] = 0\n",
606: stack->item.state_id));
607: states_seen[stack->item.state_id] = 0;
608: }
609:
610: BT_STACK_POP();
611: }
612: else if (match_eo < 0)
613: {
614: /* Try starting from a later position in the input string. */
615: /* Check for end of string. */
616: if (len < 0)
617: {
618: if (next_c == L'\0')
619: {
620: DPRINT(("end of string.\n"));
621: break;
622: }
623: }
624: else
625: {
626: if (pos >= len)
627: {
628: DPRINT(("end of string.\n"));
629: break;
630: }
631: }
632: DPRINT(("restarting from next start position\n"));
633: next_c = next_c_start;
634: #ifdef TRE_MBSTATE
635: mbstate = mbstate_start;
636: #endif /* TRE_MBSTATE */
637: str_byte = str_byte_start;
638: #ifdef TRE_WCHAR
639: str_wide = str_wide_start;
640: #endif /* TRE_WCHAR */
641: goto retry;
642: }
643: else
644: {
645: DPRINT(("finished\n"));
646: break;
647: }
648: }
649: }
650:
651: ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
652: *match_end_ofs = match_eo;
653:
654: error_exit:
655: tre_bt_mem_destroy(mem);
656: #ifndef TRE_USE_ALLOCA
657: if (tags)
658: xfree(tags);
659: if (pmatch)
660: xfree(pmatch);
661: if (states_seen)
662: xfree(states_seen);
663: #endif /* !TRE_USE_ALLOCA */
664:
665: return (reg_errcode_t)ret;
666: }
Start cpp section to tre/tre_match-parallel.cpp[1
/1
]
1: #line 6076 "./lpsrc/tre.pak"
2: /*
3: tre-match-parallel.c - TRE parallel regex matching engine
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22: /*
23: This algorithm searches for matches basically by reading characters
24: in the searched string one by one, starting at the beginning. All
25: matching paths in the TNFA are traversed in parallel. When two or
26: more paths reach the same state, exactly one is chosen according to
27: tag ordering rules; if returning submatches is not required it does
28: not matter which path is chosen.
29:
30: The worst case time required for finding the leftmost and longest
31: match, or determining that there is no match, is always linearly
32: dependent on the length of the text being searched.
33:
34: This algorithm cannot handle TNFAs with back referencing nodes.
35: See `tre-match-backtrack.c'.
36: */
37:
38:
39:
40:
41:
42: /* AIX requires this to be the first thing in the file. */
43:
44:
45:
46:
47:
48:
49:
50:
51: char *alloca ();
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63:
64:
65:
66:
67:
68:
69:
70:
71:
72:
73:
74:
75:
76:
77:
78:
79:
80:
81: typedef struct { /* One entry of a reach list; a list is terminated by an entry whose `state' is NULL. */
82: tre_tnfa_transition_t *state; /* first of the reached state's transitions (walked until a transition with a NULL state) */
83: int *tags; /* tag values carried by the path that reached this state; indexed 0..num_tags-1 */
84: } tre_tnfa_reach_t;
85:
86: typedef struct { /* Per-state bookkeeping, indexed by state_id, used to detect a state already present in the next reach list. */
87: int pos; /* input position at which the state was last added to a reach list; initialised to -1 */
88: int **tags; /* address of the `tags' pointer of the reach entry created at that position */
89: } tre_reach_pos_t;
90:
91:
92:
93: static void
94: tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
95: {
96:   /* Debug dump of a reach list: each entry up to the NULL-state
97:      terminator is printed as " <state>", followed (when tags are
98:      tracked) by "/" and its comma separated "index:value" tag pairs. */
99:   tre_tnfa_reach_t *entry;
100:   for (entry = reach; entry->state != NULL; entry++)
101:     {
102:       int t;
103:       DPRINT((" %p", (void *)entry->state));
104:       if (num_tags <= 0)
105:         continue;
106:       DPRINT(("/"));
107:       for (t = 0; t < num_tags; t++)
108:         {
109:           DPRINT(("%d:%d", t, entry->tags[t]));
110:           if (t + 1 < num_tags)
111:             DPRINT((","));
112:         }
113:     }
114:   DPRINT(("\n"));
115: }
116:
117:
118: reg_errcode_t /* REG_OK on a match, REG_NOMATCH otherwise, REG_ESPACE if the work buffer cannot be allocated */
119: tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
120: tre_str_type_t type, int *match_tags, int eflags,
121: int *match_end_ofs)
122: {
123: /* Parallel matcher entry point: at its final exit it stores the match end offset in *match_end_ofs (-1 if none) and, when match_tags is non-NULL, has copied the winning tag values into it. The variables below are the state required by GET_NEXT_WCHAR. */
124: tre_char_t prev_c = 0, next_c = 0;
125: const char *str_byte = (const char*)string;
126: int pos = -1; /* overwritten with the real start offset before the main loop */
127: unsigned int pos_add_next = 1;
128: #ifdef TRE_WCHAR
129: const wchar_t *str_wide = (const wchar_t*)string;
130: #ifdef TRE_MBSTATE
131: mbstate_t mbstate;
132: #endif /* TRE_MBSTATE */
133: #endif /* TRE_WCHAR */
134: int reg_notbol = eflags & REG_NOTBOL;
135: int reg_noteol = eflags & REG_NOTEOL;
136: int reg_newline = tnfa->cflags & REG_NEWLINE;
137: int str_user_end = 0;
138:
139: char *buf;
140: tre_tnfa_transition_t *trans_i;
141: tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
142: tre_reach_pos_t *reach_pos;
143: int *tag_i;
144: int num_tags, i;
145:
146: int match_eo = -1; /* end offset of match (-1 if no match found yet) */
147: int new_match = 0;
148: int *tmp_tags = NULL;
149: int *tmp_iptr;
150:
151: #ifdef TRE_MBSTATE
152: memset(&mbstate, '\0', sizeof(mbstate));
153: #endif /* TRE_MBSTATE */
154:
155: DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
156:
157: if (!match_tags)
158: num_tags = 0; /* no match_tags array to fill, so no tag values are tracked */
159: else
160: num_tags = tnfa->num_tags;
161:
162: /* Allocate memory for temporary data required for matching. This needs to
163: be done for every matching operation to be thread safe. This allocates
164: everything in a single large block from the stack frame using alloca()
165: or with malloc() if alloca is unavailable. */
166: {
167: int tbytes, rbytes, pbytes, xbytes, total_bytes;
168: char *tmp_buf;
169: /* Compute the length of the block we need. */
170: tbytes = sizeof(*tmp_tags) * num_tags;
171: rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
172: pbytes = sizeof(*reach_pos) * tnfa->num_states;
173: xbytes = sizeof(int) * num_tags;
174: total_bytes =
175: (sizeof(long) - 1) * 4 /* for alignment paddings */
176: + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
177:
178: /* Allocate the memory. */
179: #ifdef TRE_USE_ALLOCA
180: buf = (char*)alloca(total_bytes);
181: #else /* !TRE_USE_ALLOCA */
182: buf = (char*)xmalloc(total_bytes);
183: #endif /* !TRE_USE_ALLOCA */
184: if (buf == NULL)
185: return REG_ESPACE;
186: memset(buf, 0, total_bytes);
187:
188: /* Get the various pointers within tmp_buf (properly aligned). Layout: tmp_tags, reach_next, reach, reach_pos, then one tag array per state for each of reach and reach_next. */
189: tmp_tags = (int*)(void *)buf;
190: tmp_buf = buf + tbytes;
191: tmp_buf += ALIGN(tmp_buf, long);
192: reach_next = (tre_tnfa_reach_t*)(void *)tmp_buf;
193: tmp_buf += rbytes;
194: tmp_buf += ALIGN(tmp_buf, long);
195: reach = (tre_tnfa_reach_t*)(void *)tmp_buf;
196: tmp_buf += rbytes;
197: tmp_buf += ALIGN(tmp_buf, long);
198: reach_pos = (tre_reach_pos_t*)(void *)tmp_buf;
199: tmp_buf += pbytes;
200: tmp_buf += ALIGN(tmp_buf, long);
201: for (i = 0; i < tnfa->num_states; i++)
202: {
203: reach[i].tags = (int*)(void *)tmp_buf;
204: tmp_buf += xbytes;
205: reach_next[i].tags = (int*)(void *)tmp_buf;
206: tmp_buf += xbytes;
207: }
208: }
209:
210: for (i = 0; i < tnfa->num_states; i++)
211: reach_pos[i].pos = -1;
212:
213: /* If only one character can start a match, find it first (only done for STR_BYTE input with a non-NULL string). */
214: if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
215: {
216: const char *orig_str = str_byte;
217: int first = tnfa->first_char;
218:
219: if (len >= 0)
220: str_byte = (const char*)memchr(orig_str, first, len);
221: else
222: str_byte = strchr(orig_str, first);
223: if (str_byte == NULL)
224: {
225: #ifndef TRE_USE_ALLOCA
226: if (buf)
227: xfree(buf);
228: #endif /* !TRE_USE_ALLOCA */
229: return REG_NOMATCH; /* note: *match_end_ofs is left untouched on this path */
230: }
231: DPRINT(("skipped %d chars\n", str_byte - orig_str));
232: if (str_byte >= orig_str + 1) /* a previous character exists only if at least one was skipped */
233: prev_c = (unsigned char)*(str_byte - 1);
234: next_c = (unsigned char)*str_byte;
235: pos = str_byte - orig_str;
236: if (len < 0 || pos < len)
237: str_byte++;
238: }
239: else
240: {
241: GET_NEXT_WCHAR();
242: pos = 0;
243: }
244:
245: #if 0
246: /* Skip over characters that cannot possibly be the first character
247: of a match. */
248: if (tnfa->firstpos_chars != NULL)
249: {
250: char *chars = tnfa->firstpos_chars;
251:
252: if (len < 0)
253: {
254: const char *orig_str = str_byte;
255: /* XXX - use strpbrk() and wcspbrk() because they might be
256: optimized for the target architecture. Try also strcspn()
257: and wcscspn() and compare the speeds. */
258: while (next_c != L'\0' && !chars[next_c])
259: {
260: next_c = *str_byte++;
261: }
262: prev_c = *(str_byte - 2);
263: pos += str_byte - orig_str;
264: DPRINT(("skipped %d chars\n", str_byte - orig_str));
265: }
266: else
267: {
268: while (pos <= len && !chars[next_c])
269: {
270: prev_c = next_c;
271: next_c = (unsigned char)(*str_byte++);
272: pos++;
273: }
274: }
275: }
276: #endif
277:
278: DPRINT(("length: %d\n", len));
279: DPRINT(("pos:chr/code | states and tags\n"));
280: DPRINT(("-------------+------------------------------------------------\n"));
281:
282: reach_next_i = reach_next;
283: while (1)
284: {
285: /* If no match found yet, add the initial states to `reach_next'. */
286: if (match_eo < 0)
287: {
288: DPRINT((" init >"));
289: trans_i = tnfa->initial;
290: while (trans_i->state != NULL)
291: {
292: if (reach_pos[trans_i->state_id].pos < pos)
293: {
294: if (trans_i->assertions
295: && CHECK_ASSERTIONS(trans_i->assertions))
296: {
297: DPRINT(("assertion failed\n"));
298: trans_i++;
299: continue;
300: }
301:
302: DPRINT((" %p", (void *)trans_i->state));
303: reach_next_i->state = trans_i->state;
304: for (i = 0; i < num_tags; i++)
305: reach_next_i->tags[i] = -1;
306: tag_i = trans_i->tags;
307: if (tag_i)
308: while (*tag_i >= 0)
309: {
310: if (*tag_i < num_tags)
311: reach_next_i->tags[*tag_i] = pos;
312: tag_i++;
313: }
314: if (reach_next_i->state == tnfa->final)
315: {
316: DPRINT((" found empty match\n"));
317: match_eo = pos;
318: new_match = 1;
319: for (i = 0; i < num_tags; i++)
320: match_tags[i] = reach_next_i->tags[i];
321: }
322: reach_pos[trans_i->state_id].pos = pos;
323: reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
324: reach_next_i++;
325: }
326: trans_i++;
327: }
328: DPRINT(("\n"));
329: reach_next_i->state = NULL;
330: }
331: else
332: {
333: if (num_tags == 0 || reach_next_i == reach_next)
334: /* We have found a match: stop at once when no tags are tracked, or when `reach_next' is empty so no path is left to continue. */
335: break;
336: }
337:
338: /* Check for end of string. */
339: if (len < 0)
340: {
341: if (type == STR_USER)
342: {
343: if (str_user_end)
344: break;
345: }
346: else if (next_c == L'\0')
347: break;
348: }
349: else
350: {
351: if (pos >= len)
352: break;
353: }
354:
355: GET_NEXT_WCHAR(); /* NOTE(review): the STR_MBS form of this macro body contains a direct `return REG_NOMATCH', which would bypass the xfree(buf) at the end of this function — confirm */
356:
357: #ifdef TRE_DEBUG
358: DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
359: tre_print_reach(tnfa, reach_next, num_tags);
360: DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
361: tre_print_reach(tnfa, reach_next, num_tags);
362: #endif /* TRE_DEBUG */
363:
364: /* Swap `reach' and `reach_next'. */
365: reach_i = reach;
366: reach = reach_next;
367: reach_next = reach_i;
368:
369: /* For each state in `reach', weed out states that don't fulfill the
370: minimal matching conditions. */
371: if (tnfa->num_minimals && new_match)
372: {
373: new_match = 0;
374: reach_next_i = reach_next;
375: for (reach_i = reach; reach_i->state; reach_i++)
376: {
377: int i;
378: int skip = 0;
379: for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2) /* minimal_tags holds (end, start) pairs terminated by a negative value */
380: {
381: int end = tnfa->minimal_tags[i];
382: int start = tnfa->minimal_tags[i + 1];
383: DPRINT((" Minimal start %d, end %d\n", start, end));
384: if (end >= num_tags)
385: {
386: DPRINT((" Throwing %p out.\n", reach_i->state));
387: skip = 1;
388: break;
389: }
390: else if (reach_i->tags[start] == match_tags[start]
391: && reach_i->tags[end] < match_tags[end])
392: {
393: DPRINT((" Throwing %p out because t%d < %d\n",
394: reach_i->state, end, match_tags[end]));
395: skip = 1;
396: break;
397: }
398: }
399: if (!skip)
400: {
401: int *tmp_iptr;
402: reach_next_i->state = reach_i->state;
403: tmp_iptr = reach_next_i->tags; /* exchange the tag arrays rather than copying them */
404: reach_next_i->tags = reach_i->tags;
405: reach_i->tags = tmp_iptr;
406: reach_next_i++;
407: }
408: }
409: reach_next_i->state = NULL;
410:
411: /* Swap `reach' and `reach_next'. */
412: reach_i = reach;
413: reach = reach_next;
414: reach_next = reach_i;
415: }
416:
417: /* For each state in `reach' see if there is a transition leaving with
418: the current input symbol to a state not yet in `reach_next', and
419: add the destination states to `reach_next'. */
420: reach_next_i = reach_next;
421: for (reach_i = reach; reach_i->state; reach_i++)
422: {
423: for (trans_i = reach_i->state; trans_i->state; trans_i++)
424: {
425: /* Does this transition match the input symbol? */
426: if (trans_i->code_min <= prev_c &&
427: trans_i->code_max >= prev_c)
428: {
429: if (trans_i->assertions
430: && (CHECK_ASSERTIONS(trans_i->assertions)
431: /* Handle character klass transitions. */
432: || ((trans_i->assertions & ASSERT_CHAR_CLASS)
433: && !(tnfa->cflags & REG_ICASE)
434: && !tre_isctype((tre_cint_t)prev_c,
435: trans_i->u.klass))
436: || ((trans_i->assertions & ASSERT_CHAR_CLASS)
437: && (tnfa->cflags & REG_ICASE)
438: && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
439: trans_i->u.klass)
440: && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
441: trans_i->u.klass)))
442: || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
443: && tre_neg_char_klasses_match(trans_i->neg_klasses,
444: (tre_cint_t)prev_c,
445: tnfa->cflags & REG_ICASE))))
446: {
447: DPRINT(("assertion failed\n"));
448: continue;
449: }
450:
451: /* Compute the tags after this transition. */
452: for (i = 0; i < num_tags; i++)
453: tmp_tags[i] = reach_i->tags[i];
454: tag_i = trans_i->tags;
455: if (tag_i != NULL)
456: while (*tag_i >= 0)
457: {
458: if (*tag_i < num_tags)
459: tmp_tags[*tag_i] = pos;
460: tag_i++;
461: }
462:
463: if (reach_pos[trans_i->state_id].pos < pos)
464: {
465: /* Found an unvisited node. */
466: reach_next_i->state = trans_i->state;
467: tmp_iptr = reach_next_i->tags; /* hand tmp_tags to the new entry and recycle its old array as the next scratch buffer */
468: reach_next_i->tags = tmp_tags;
469: tmp_tags = tmp_iptr;
470: reach_pos[trans_i->state_id].pos = pos;
471: reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
472:
473: if (reach_next_i->state == tnfa->final
474: && (match_eo == -1
475: || (num_tags > 0
476: && reach_next_i->tags[0] <= match_tags[0]))) /* NOTE(review): presumably tag 0 holds the match start offset — confirm */
477: {
478: DPRINT((" found match %p\n", trans_i->state));
479: match_eo = pos;
480: new_match = 1;
481: for (i = 0; i < num_tags; i++)
482: match_tags[i] = reach_next_i->tags[i];
483: }
484: reach_next_i++;
485:
486: }
487: else
488: {
489: assert(reach_pos[trans_i->state_id].pos == pos);
490: /* Another path has also reached this state. We choose
491: the winner by examining the tag values for both
492: paths. */
493: if (tre_tag_order(num_tags, tnfa->tag_directions,
494: tmp_tags,
495: *reach_pos[trans_i->state_id].tags))
496: {
497: /* The new path wins. */
498: tmp_iptr = *reach_pos[trans_i->state_id].tags;
499: *reach_pos[trans_i->state_id].tags = tmp_tags;
500: if (trans_i->state == tnfa->final)
501: {
502: DPRINT((" found better match\n"));
503: match_eo = pos;
504: new_match = 1;
505: for (i = 0; i < num_tags; i++)
506: match_tags[i] = tmp_tags[i];
507: }
508: tmp_tags = tmp_iptr;
509: }
510: }
511: }
512: }
513: }
514: reach_next_i->state = NULL;
515: }
516:
517: DPRINT(("match end offset = %d\n", match_eo));
518:
519: #ifndef TRE_USE_ALLOCA
520: if (buf)
521: xfree(buf);
522: #endif /* !TRE_USE_ALLOCA */
523:
524: *match_end_ofs = match_eo;
525: return match_eo >= 0 ? REG_OK : REG_NOMATCH;
526: }
527:
528: /* EOF */
Start cpp section to tre/tre_match-utils.hpp[1
/1
]
1: #line 6605 "./lpsrc/tre.pak"
2:
3: /*
4: tre-match-utils.h - TRE matcher helper definitions
5:
6: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
7:
8: This program is free software; you can redistribute it and/or modify
9: it under the terms of the GNU General Public License version 2 (June
10: 1991) as published by the Free Software Foundation.
11:
12: This program is distributed in the hope that it will be useful,
13: but WITHOUT ANY WARRANTY; without even the implied warranty of
14: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15: GNU General Public License for more details.
16:
17: You should have received a copy of the GNU General Public License
18: along with this program; if not, write to the Free Software
19: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20:
21: */
22:
23:
24:
25:
26:
27:
28:
29: /* Wide character and multibyte support. NOTE(review): the #define line is not present in this listing; presumably this is the body of GET_NEXT_WCHAR(). It moves next_c into prev_c, advances pos, and reads the next character according to `type'. */
30:
31:
32: do { \
33: prev_c = next_c; \
34: if (type == STR_BYTE) \
35: { \
36: pos++; \
37: if (len >= 0 && pos >= len) \
38: next_c = '\0'; \
39: else \
40: next_c = (unsigned char)(*str_byte++); \
41: } \
42: else if (type == STR_WIDE) \
43: { \
44: pos++; \
45: if (len >= 0 && pos >= len) \
46: next_c = L'\0'; \
47: else \
48: next_c = *str_wide++; \
49: } \
50: else if (type == STR_MBS) \
51: { \
52: pos += pos_add_next; \
53: if (str_byte == NULL) \
54: next_c = L'\0'; \
55: else \
56: { \
57: size_t w; \
58: int max; \
59: if (len >= 0) \
60: max = len - pos; \
61: else \
62: max = 32; \
63: if (max <= 0) \
64: { \
65: next_c = L'\0'; \
66: pos_add_next = 1; \
67: } \
68: else \
69: { \
70: w = tre_mbrtowc(&next_c, str_byte, max, &mbstate); \
71: if (w == (size_t)-1 || w == (size_t)-2) \
72: return REG_NOMATCH; /* NOTE(review): returns from the enclosing function, skipping any cleanup there (tre_tnfa_run_parallel frees `buf' only at its own exits) — confirm */ \
73: if (w == 0 && len >= 0) \
74: { \
75: pos_add_next = 1; \
76: next_c = 0; \
77: str_byte++; \
78: } \
79: else \
80: { \
81: pos_add_next = w; \
82: str_byte += w; \
83: } \
84: } \
85: } \
86: } \
87: else if (type == STR_USER) \
88: { \
89: pos += pos_add_next; \
90: str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
91: str_source->context); \
92: } \
93: } while(0)
94:
95:
96:
97: /* Wide character support, no multibyte support: the same reader as the variant above without the STR_MBS branch. NOTE(review): the #define line is not present in this listing; presumably GET_NEXT_WCHAR(). */
98:
99:
100: do { \
101: prev_c = next_c; \
102: if (type == STR_BYTE) \
103: { \
104: pos++; \
105: if (len >= 0 && pos >= len) \
106: next_c = '\0'; \
107: else \
108: next_c = (unsigned char)(*str_byte++); \
109: } \
110: else if (type == STR_WIDE) \
111: { \
112: pos++; \
113: if (len >= 0 && pos >= len) \
114: next_c = L'\0'; \
115: else \
116: next_c = *str_wide++; \
117: } \
118: else if (type == STR_USER) \
119: { \
120: pos += pos_add_next; \
121: str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
122: str_source->context); \
123: } \
124: } while(0)
125:
126:
127:
128:
129:
130: /* No wide character or multibyte support: only the STR_BYTE and STR_USER branches remain. NOTE(review): the #define line is not present in this listing; presumably GET_NEXT_WCHAR(). */
131:
132:
133: do { \
134: prev_c = next_c; \
135: if (type == STR_BYTE) \
136: { \
137: pos++; \
138: if (len >= 0 && pos >= len) \
139: next_c = '\0'; \
140: else \
141: next_c = (unsigned char)(*str_byte++); \
142: } \
143: else if (type == STR_USER) \
144: { \
145: pos += pos_add_next; \
146: str_user_end = str_source->get_next_char(&next_c, &pos_add_next, \
147: str_source->context); \
148: } \
149: } while(0)
150:
151:
152:
153:
154:
155:
156:
157:
158: (((assertions & ASSERT_AT_BOL) /* NOTE(review): #define line absent from this listing, presumably CHECK_ASSERTIONS(assertions); the expression is non-zero when a requested assertion does NOT hold (callers treat non-zero as failure) */ \
159: && (pos > 0 || reg_notbol) \
160: && (prev_c != L'\n' || !reg_newline)) \
161: || ((assertions & ASSERT_AT_EOL) \
162: && (next_c != L'\0' || reg_noteol) \
163: && (next_c != L'\n' || !reg_newline)) \
164: || ((assertions & ASSERT_AT_BOW) \
165: && (pos > 0 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))) \
166: || ((assertions & ASSERT_AT_EOW) \
167: && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
168: || ((assertions & ASSERT_AT_WB) \
169: && (pos != 0 && next_c != L'\0' \
170: && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
171: || ((assertions & ASSERT_AT_WB_NEG) \
172: && (pos == 0 || next_c == L'\0' \
173: || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
174:
175:
176:
177: /* Decides which of two tag value arrays takes priority: returns 1 if `t1' wins `t2', 0 otherwise. */
178: inline static int
179: tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
180:               int *t1, int *t2)
181: {
182:   /* The first tag whose values differ settles the comparison: for a
183:      TRE_TAG_MINIMIZE tag the smaller value wins, for any other direction
184:      the larger one does. */
185:   int idx;
186:   for (idx = 0; idx < num_tags; idx++)
187:     {
188:       int want_min = (tag_directions[idx] == TRE_TAG_MINIMIZE);
189:       int lhs = t1[idx];
190:       int rhs = t2[idx];
191:
192:       if (lhs == rhs)
193:         continue;
194:       if (want_min)
195:         return lhs < rhs;
196:       return lhs > rhs;
197:     }
198:
199:   /* All tags are equal, so `t1' does not win. */
200:   /* assert(0);*/
201:   return 0;
202: }
203:
204: inline static int
205: tre_neg_char_klasses_match(tre_ctype_t *klasses, tre_cint_t wc, int icase)
206: {
207:   tre_ctype_t *cur;
208:   DPRINT(("neg_char_klasses_test: %p, %d, %d\n", klasses, wc, icase));
209:   /* Scan the zero-terminated class list; with `icase' set, test both the upper and lower case forms of `wc'. */
210:   for (cur = klasses; *cur != (tre_ctype_t)0; cur++)
211:     if (icase
212:         ? (tre_isctype(tre_toupper(wc), *cur) || tre_isctype(tre_tolower(wc), *cur))
213:         : tre_isctype(wc, *cur))
214:       return 1; /* Match. */
215:   return 0; /* No match. */
216: }
Start cpp section to tre/tre_mem.cpp[1
/1
]
1: #line 6822 "./lpsrc/tre.pak"
2: /*
3: tre-mem.c - TRE memory allocator
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22: /*
23: This memory allocator is for allocating small memory blocks efficiently
24: in terms of memory overhead and execution speed. The allocated blocks
25: cannot be freed individually, only all at once. There can be multiple
26: allocators, though.
27: */
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38: /* Returns a new memory allocator or NULL if out of memory. With `provided' non-zero the allocator is built inside `provided_block' (NULL is returned if that block is NULL); otherwise it is obtained from xcalloc(). */
39: tre_mem_t
40: tre_mem_new_impl(int provided, void *provided_block)
41: {
42:   tre_mem_t mem;
43:   if (!provided)
44:     /* Heap case: xcalloc() already hands back zeroed storage (or NULL). */
45:     return (tre_mem_struct*)xcalloc(1, sizeof(*mem));
46:
47:   /* Caller-supplied storage: refuse a NULL block instead of passing it to
48:      memset(), matching the NULL check done in tre_mem_alloc_impl(). */
49:   mem = (tre_mem_struct*)provided_block;
50:   if (mem != NULL)
51:     memset(mem, 0, sizeof(*mem));
52:   return mem;
53: }
54:
55:
56: /* Frees the memory allocator and all memory allocated with it. */
57: void
58: tre_mem_destroy(tre_mem_t mem)
59: {
60:   /* Walk the block list, releasing each block's storage and then its
61:      list node, and finish by releasing the allocator itself. */
62:   tre_list_t *node, *following;
63:   for (node = mem->blocks; node != NULL; node = following)
64:     {
65:       xfree(node->data);
66:       following = node->next;
67:       xfree(node);
68:     }
69:   xfree(mem);
70: }
71:
72:
73: /* Allocates a block of `size' bytes from `mem'. Returns a pointer to the
74:    allocated block, or NULL if the allocator has already failed, an
75:    underlying malloc() failed, or the request cannot be served from the
76:    provided block. Every new failure sets `mem->failed'.
77:
78:    With `provided' non-zero, `provided_block' (TRE_MEM_BLOCK_SIZE bytes
79:    supplied by the caller) becomes the current block whenever fewer than
80:    `size' bytes are left; otherwise a new block comes from xmalloc().
81:    A non-zero `zero' clears the returned bytes. */
82: void *
83: tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
84:                    int zero, size_t size)
85: {
86:   void *ptr;
87:   size_t padded;
88:
89:   if (mem->failed)
90:     {
91:       DPRINT(("tre_mem_alloc: oops, called after failure?!\n"));
92:       return NULL;
93:     }
94:
95: #ifdef MALLOC_DEBUGGING
96:   if (!provided)
97:     {
98:       ptr = xmalloc(1);
99:       if (ptr == NULL)
100:         {
101:           DPRINT(("tre_mem_alloc: xmalloc forced failure\n"));
102:           mem->failed = 1;
103:           return NULL;
104:         }
105:       xfree(ptr);
106:     }
107: #endif /* MALLOC_DEBUGGING */
108:
109:   if (mem->n < size)
110:     {
111:       /* We need more memory than is available in the current block.
112:          Allocate a new block. */
113:       if (provided)
114:         {
115:           DPRINT(("tre_mem_alloc: using provided block\n"));
116:           if (provided_block == NULL)
117:             {
118:               DPRINT(("tre_mem_alloc: provided block was NULL\n"));
119:               mem->failed = 1;
120:               return NULL;
121:             }
122:           /* A provided block holds only TRE_MEM_BLOCK_SIZE bytes; a larger
123:              request would run past its end, so fail instead. */
124:           if (size > TRE_MEM_BLOCK_SIZE)
125:             {
126:               mem->failed = 1;
127:               return NULL;
128:             }
129:           mem->ptr = (char*)provided_block;
130:           mem->n = TRE_MEM_BLOCK_SIZE;
131:         }
132:       else
133:         {
134:           tre_list_t *l;
135:           size_t block_size = TRE_MEM_BLOCK_SIZE;
136:
137:           /* Refuse requests whose eight-fold block size would wrap around. */
138:           if (size > ((size_t)-1) / 8)
139:             {
140:               mem->failed = 1;
141:               return NULL;
142:             }
143:           if (size * 8 > block_size)
144:             block_size = size * 8;
145:           DPRINT(("tre_mem_alloc: allocating new %d byte block\n",
146:                   (int)block_size));
147:           l = (tre_list_t*)xmalloc(sizeof(*l));
148:           if (l == NULL)
149:             {
150:               mem->failed = 1;
151:               return NULL;
152:             }
153:           l->data = xmalloc(block_size);
154:           if (l->data == NULL)
155:             {
156:               xfree(l);
157:               mem->failed = 1;
158:               return NULL;
159:             }
160:           /* Append the new block at the tail of the list. */
161:           l->next = NULL;
162:           if (mem->current != NULL)
163:             mem->current->next = l;
164:           if (mem->blocks == NULL)
165:             mem->blocks = l;
166:           mem->current = l;
167:           mem->ptr = (char*)l->data;
168:           mem->n = block_size;
169:         }
170:     }
171:
172:   /* Make sure the next pointer will be aligned, but never consume more
173:      than the block still holds (mem->n >= size is guaranteed here). */
174:   padded = size + ALIGN(mem->ptr + size, long);
175:   if (padded > mem->n)
176:     padded = mem->n;
177:
178:   /* Allocate from current block. */
179:   ptr = mem->ptr;
180:   mem->ptr += padded;
181:   mem->n -= padded;
182:
183:   /* Set to zero if needed. */
184:   if (zero)
185:     memset(ptr, 0, padded);
186:
187:   return ptr;
188: }
189:
190: /* EOF */
Start cpp section to tre/tre_mem.hpp[1
/1
]
1: #line 6989 "./lpsrc/tre.pak"
2: /*
3: tre-mem.h - TRE memory allocator interface
4:
5: Copyright (C) 2001-2003 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27:
28:
29: typedef struct tre_list { /* Node of the singly linked list of blocks owned by an allocator. */
30: void *data; /* the block's storage; released with xfree() in tre_mem_destroy() */
31: struct tre_list *next; /* following node, NULL at the tail */
32: } tre_list_t;
33:
34: typedef struct tre_mem_struct { /* Allocator state; tre_mem_t is a pointer to this structure. */
35: tre_list_t *blocks; /* head of the block list (first block allocated) */
36: tre_list_t *current; /* most recently added block, i.e. the list tail */
37: char *ptr; /* next unallocated byte of the current block */
38: size_t n; /* bytes still available at `ptr' */
39: int failed; /* set once an allocation has failed; later allocation calls return NULL */
40: void **provided; /* NOTE(review): not referenced by the tre_mem.cpp code listed here — purpose unconfirmed */
41: } *tre_mem_t;
42:
43:
44: tre_mem_t tre_mem_new_impl(int provided, void *provided_block); /* non-zero `provided': build the allocator inside provided_block instead of calling xcalloc() */
45: void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block, /* with `provided' set, provided_block becomes the new current block when the old one is too small */
46: int zero, size_t size); /* non-zero `zero' clears the returned bytes */
47:
48: /* Returns a new memory allocator or NULL if out of memory. */
49:
50:
51: /* Allocates a block of `size' bytes from `mem'. Returns a pointer to the
52: allocated block or NULL if an underlying malloc() failed. */
53:
54:
55: /* Allocates a block of `size' bytes from `mem'. Returns a pointer to the
56: allocated block or NULL if an underlying malloc() failed. The memory
57: is set to zero. */
58:
59:
60:
61: /* alloca() versions. Like above, but memory is allocated with alloca()
62: instead of malloc(). */
63:
64:
65: tre_mem_new_impl(1, alloca(sizeof(struct tre_mem_struct)))
66:
67:
68: ((mem)->n >= (size) \
69: ? tre_mem_alloc_impl((mem), 1, NULL, 0, (size)) \
70: : tre_mem_alloc_impl((mem), 1, alloca(TRE_MEM_BLOCK_SIZE), 0, (size)))
71:
72:
73:
74: /* Frees the memory allocator and all memory allocated with it. */
75: void tre_mem_destroy(tre_mem_t mem);
76:
77:
78:
79: /* EOF */
Start cpp section to tre/tre_parse.cpp[1
/1
]
1: #line 7069 "./lpsrc/tre.pak"
2: /*
3: tre-parse.c - Regexp parser
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19: */
20:
21: /*
22: This parser is just a simple recursive descent parser for POSIX.2
23: regexps. The parser supports both the obsolete default syntax and
24: the "extended" syntax, and some nonstandard extensions.
25: */
26:
27:
28:
29:
30:
31:
32:
33:
34:
35:
36:
37:
38:
39:
40: /* Characters with special meanings in regexp syntax. */
41:
42:
43:
44:
45:
46:
47:
48:
49:
50:
51:
52:
53:
54:
55:
56:
57:
58:
59:
60:
61:
62:
63: /* Macro table for \w, \s, etc.: (name, expansion) string pairs, ended by NULL. */
64: static const char *tre_macros[] =
65: { "t", "\t", "n", "\n", "r", "\r",
66: "f", "\f", "a", "\a", "e", "\033",
67: "w", "[[:alnum:]_]", "W", "[^[:alnum:]_]", "s", "[[:space:]]",
68: "S", "[^[:space:]]", "d", "[[:digit:]]", "D", "[^[:digit:]]",
69: NULL };
70:
71:
72: /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
73: must have at least `buf_len' items; the expansion is truncated so that it
74: and its terminating zero fit. Sets buf[0] to zero if there is no match. */
75: static void
76: tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
77: tre_char_t *buf, size_t buf_len)
78: {
79: int i;
80: size_t len = regex_end - regex;
81:
82: buf[0] = 0;
83: for (i = 0; tre_macros[i] != NULL; i += 2) /* even slots: names, odd slots: expansions */
84: {
85: int match = 0;
86: if (strlen(tre_macros[i]) > len)
87: continue;
88: #ifdef TRE_WCHAR
89: {
90: tre_char_t tmp_wcs[64];
91: unsigned int j;
92: for (j = 0; j < strlen(tre_macros[i]) && j + 1 < elementsof(tmp_wcs); j++)
93: tmp_wcs[j] = btowc(tre_macros[i][j]);
94: tmp_wcs[j] = 0; /* j <= 63 here, so the terminator stays inside tmp_wcs */
95: match = wcsncmp(tmp_wcs, regex, strlen(tre_macros[i]));
96: }
97: #else /* !TRE_WCHAR */
98: match = strncmp(tre_macros[i], (const char*)regex, strlen(tre_macros[i]));
99: #endif /* !TRE_WCHAR */
100: if (match == 0)
101: {
102: unsigned int j;
103: DPRINT(("Expanding macro '%s' => '%s'\n",
104: tre_macros[i], tre_macros[i + 1]));
105: for (j = 0; tre_macros[i + 1][j] != 0 && j + 1 < buf_len; j++)
106: {
107: #ifdef TRE_WCHAR
108: buf[j] = btowc(tre_macros[i + 1][j]);
109: #else /* !TRE_WCHAR */
110: buf[j] = tre_macros[i + 1][j];
111: #endif /* !TRE_WCHAR */
112: }
113: buf[j] = 0; /* the loop leaves one slot free, so this never writes buf[buf_len] */
114: break;
115: }
116: }
117: }
118: /* Stores a new literal node for [min, max] at (*items)[*i] and advances *i, growing the array when full. */
119: static reg_errcode_t
120: tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
121: tre_ast_node_t ***items)
122: {
123: reg_errcode_t status;
124: tre_ast_node_t **array = *items;
125: /* Allocate more space if necessary. */
126: if (*i >= *max_i)
127: {
128: tre_ast_node_t **new_items;
129: DPRINT(("out of array space, i = %d\n", *i));
130: /* If the array is already 1024 items large, give up -- there's
131: probably an error in the regexp (e.g. not a '\0' terminated
132: string and missing ']') */
133: if (*max_i > 1024)
134: return REG_ESPACE;
135: *max_i *= 2;
136: new_items = (tre_ast_node_t**)xrealloc(array, sizeof(*new_items) * *max_i); /* size of one element, not of the outer pointer */
137: if (new_items == NULL)
138: return REG_ESPACE;
139: *items = array = new_items;
140: }
141: array[*i] = tre_ast_new_literal(mem, min, max, -1);
142: status = array[*i] == NULL ? REG_ESPACE : REG_OK;
143: (*i)++; /* advanced even when the literal could not be created; the status reports the failure */
144: return status;
145: }
146:
147:
148: /* Turns a character klass into literal items, one per run of member codes. */
149: static reg_errcode_t
150: tre_expand_ctype(tre_mem_t mem, tre_ctype_t klass, tre_ast_node_t ***items,
151: int *i, int *max_i, int cflags)
152: {
153: reg_errcode_t result = REG_OK;
154: tre_cint_t ch;
155: int code, lo = -1, hi = 0;
156: assert(TRE_MB_CUR_MAX == 1);
157:
158: DPRINT((" expanding klass to character ranges\n"));
159: for (code = 0; (result == REG_OK) && (code < 256); code++)
160: {
161: ch = code;
162: if (tre_isctype(ch, klass)
163: || ((cflags & REG_ICASE)
164: && (tre_isctype(tre_tolower(ch), klass)
165: || tre_isctype(tre_toupper(ch), klass))))
166: {
167: /* Member: open a run if none is pending (lo < 0), then extend it. */
168: lo = (lo < 0) ? (int)ch : lo;
169: hi = ch;
170: }
171: else if (lo >= 0)
172: {
173: DPRINT((" range %c (%d) to %c (%d)\n", lo, lo, hi, hi));
174: result = tre_new_item(mem, lo, hi, i, max_i, items);
175: lo = -1;
176: }
177: }
178: if (result == REG_OK && lo >= 0)
179: result = tre_new_item(mem, lo, hi, i, max_i, items);
180: return result;
181: }
182:
183:
184: static int
185: tre_compare_items(const void *a, const void *b)
186: {
187: /* qsort() callback: each argument points at an array slot holding a
188: tre_ast_node_t*; the nodes are ordered by the code_min of their literal. */
189: const tre_ast_node_t *lhs_node = *(tre_ast_node_t **)a;
190: const tre_ast_node_t *rhs_node = *(tre_ast_node_t **)b;
191: const tre_literal_t *lhs = (tre_literal_t*)lhs_node->obj;
192: const tre_literal_t *rhs = (tre_literal_t*)rhs_node->obj;
193: int lhs_min = lhs->code_min;
194: int rhs_min = rhs->code_min;
195:
196: /* Evaluates to -1, 0 or 1 for less than, equal and greater than. */
197: return (lhs_min > rhs_min) - (lhs_min < rhs_min);
198: }
199:
200:
201:
202: /* isalnum() and the rest may be macros, so wrap them to functions. */
203: int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
204: int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
205:
206:
207: int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
208:
209: int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }
210:
211:
212:
213: int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
214:
215: int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }
216:
217:
218: int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
219: int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
220: int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
221: int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
222: int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
223: int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
224: int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
225: int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
226: int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
227: /* Table of klass names and their predicate functions, ended by a { NULL, NULL } entry. */
228: struct {
229: const char *name; /* const: the initializers are string literals */
230: int (*func)(tre_cint_t);
231: } tre_ctype_map[] = {
232: { "alnum", &tre_isalnum_func },
233: { "alpha", &tre_isalpha_func },
234: #ifdef tre_isascii
235: { "ascii", &tre_isascii_func },
236: #endif /* tre_isascii */
237: #ifdef tre_isblank
238: { "blank", &tre_isblank_func },
239: #endif /* tre_isblank */
240: { "cntrl", &tre_iscntrl_func },
241: { "digit", &tre_isdigit_func },
242: { "graph", &tre_isgraph_func },
243: { "lower", &tre_islower_func },
244: { "print", &tre_isprint_func },
245: { "punct", &tre_ispunct_func },
246: { "space", &tre_isspace_func },
247: { "upper", &tre_isupper_func },
248: { "xdigit", &tre_isxdigit_func },
249: { NULL, NULL}
250: };
251: /* Maps a klass name such as "alpha" to its entry's func; yields a null klass if unknown. */
252: tre_ctype_t tre_ctype(const char *name)
253: {
254: int idx = 0;
255:
256: /* Advance until the name matches or the NULL-named end entry is reached. */
257: while (tre_ctype_map[idx].name != NULL
258: && strcmp(name, tre_ctype_map[idx].name) != 0)
259: idx++;
260: return tre_ctype_map[idx].name != NULL ? tre_ctype_map[idx].func : (tre_ctype_t)0;
261: }
262:
263:
264: /* Maximum number of character klasses that can occur in a negated bracket
265: expression. */
266:
267:
268: /* Maximum length of character klass names. */
269:
270: /* Parses the items of a bracket expression up to its closing ']', appending literal nodes to `*items'. */
271: static reg_errcode_t
272: tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
273: tre_ctype_t neg_klasses[], int *num_neg_klasses,
274: tre_ast_node_t ***items, int *num_items,
275: int *items_size)
276: {
277: const tre_char_t *re = ctx->re;
278: reg_errcode_t status = REG_OK;
279: tre_ctype_t klass = (tre_ctype_t)0;
280: int i = *num_items;
281: int max_i = *items_size;
282: int skip;
283:
284: /* Build an array of the items in the bracket expression. */
285: while (status == REG_OK)
286: {
287: skip = 0;
288: if (re == ctx->re_end)
289: {
290: status = REG_EBRACK;
291: }
292: else if (*re == CHAR_RBRACKET && re > ctx->re)
293: {
294: DPRINT(("tre_parse_bracket: done: '%.*" STRF "'\n",
295: ctx->re_end - re, re));
296: re++;
297: break;
298: }
299: else
300: {
301: tre_cint_t min = 0, max = 0;
302:
303: klass = (tre_ctype_t)0;
304: if (re + 2 < ctx->re_end
305: && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
306: {
307: DPRINT(("tre_parse_bracket: range: '%.*" STRF "'\n",
308: ctx->re_end - re, re));
309: min = *re;
310: max = *(re + 2);
311: re += 3;
312: /* XXX - Should use collation order instead of encoding values
313: in character ranges. */
314: if (min > max)
315: status = REG_ERANGE;
316: }
317: else if (re + 1 < ctx->re_end
318: && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
319: status = REG_ECOLLATE;
320: else if (re + 1 < ctx->re_end
321: && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
322: status = REG_ECOLLATE;
323: else if (re + 1 < ctx->re_end
324: && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
325: {
326: char tmp_str[64];
327: const tre_char_t *endptr = re + 2;
328: int len;
329: DPRINT(("tre_parse_bracket: klass: '%.*" STRF "'\n",
330: ctx->re_end - re, re));
331: while (endptr < ctx->re_end && *endptr != CHAR_COLON)
332: endptr++;
333: if (endptr + 1 < ctx->re_end) /* ':' found with a character after it, so endptr + 2 stays in range */
334: {
335: len = MIN(endptr - re - 2, 63);
336: #ifdef TRE_WCHAR
337: {
338: tre_char_t tmp_wcs[64];
339: wcsncpy(tmp_wcs, re + 2, len);
340: tmp_wcs[len] = L'\0';
341: #if defined HAVE_WCSRTOMBS
342: {
343: mbstate_t state;
344: const tre_char_t *src = tmp_wcs;
345: memset(&state, '\0', sizeof(state));
346: len = wcsrtombs(tmp_str, &src, sizeof(tmp_str) - 1, &state); /* leave room for the terminator */
347: }
348: #elif defined HAVE_WCSTOMBS
349: len = wcstombs(tmp_str, tmp_wcs, 63);
350: #endif /* defined HAVE_WCSTOMBS */
351: }
352: #else /* !TRE_WCHAR */
353: strncpy(tmp_str, (const char*)re + 2, len);
354: #endif /* !TRE_WCHAR */
355: tmp_str[len < 0 ? 0 : len] = '\0'; /* a failed conversion (-1) gives an empty name, hence REG_ECTYPE */
356: DPRINT((" klass name: %s\n", tmp_str));
357: klass = tre_ctype(tmp_str);
358: if (!klass)
359: status = REG_ECTYPE;
360: /* Optimize character klasses for 8 bit character sets. */
361: if (status == REG_OK && TRE_MB_CUR_MAX == 1)
362: {
363: status = tre_expand_ctype(ctx->mem, klass, items,
364: &i, &max_i, ctx->cflags);
365: klass = (tre_ctype_t)0;
366: skip = 1;
367: }
368: re = endptr + 2;
369: }
370: else
371: status = REG_ECTYPE;
372: min = 0;
373: max = TRE_CHAR_MAX;
374: }
375: else
376: {
377: DPRINT(("tre_parse_bracket: char: '%.*" STRF "'\n",
378: ctx->re_end - re, re));
379: if (*re == CHAR_MINUS && ctx->re != re
380: && (re + 1 >= ctx->re_end || *(re + 1) != CHAR_RBRACKET))
381: /* Two ranges are not allowed to share an endpoint. */
382: status = REG_ERANGE;
383: min = max = *re++;
384: }
385:
386: if (status != REG_OK)
387: break;
388:
389: if (klass && negate)
390: if (*num_neg_klasses >= MAX_NEG_CLASSES)
391: status = REG_ESPACE;
392: else
393: neg_klasses[(*num_neg_klasses)++] = klass;
394: else if (!skip) /* this else pairs with `if (klass && negate)' above */
395: {
396: status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
397: if (status != REG_OK)
398: break;
399: ((tre_literal_t*)((*items)[i-1])->obj)->u.klass = klass;
400: }
401:
402: /* Add opposite-case counterpoints if REG_ICASE is present.
403: This is broken if there are more than two "same" characters. */
404: if (ctx->cflags & REG_ICASE && !klass && status == REG_OK && !skip)
405: {
406: int cmin, ccurr;
407:
408: DPRINT(("adding opposite-case counterpoints\n"));
409: while (min <= max)
410: {
411: if (tre_islower(min))
412: {
413: cmin = ccurr = tre_toupper(min++);
414: while (tre_islower(min) && tre_toupper(min) == ccurr + 1
415: && min <= max)
416: ccurr = tre_toupper(min++);
417: status = tre_new_item(ctx->mem, cmin, ccurr,
418: &i, &max_i, items);
419: }
420: else if (tre_isupper(min))
421: {
422: cmin = ccurr = tre_tolower(min++);
423: while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
424: && min <= max)
425: ccurr = tre_tolower(min++);
426: status = tre_new_item(ctx->mem, cmin, ccurr,
427: &i, &max_i, items);
428: }
429: else min++;
430: if (status != REG_OK)
431: break;
432: }
433: if (status != REG_OK)
434: break;
435: }
436: }
437: }
438: *num_items = i;
439: *items_size = max_i;
440: ctx->re = re;
441: return status;
442: }
443: /* Parses a bracket expression (ctx->re just past '[') into a union of literal nodes stored in *result. */
444: static reg_errcode_t
445: tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
446: {
447: tre_ast_node_t *node = NULL;
448: int negate = 0;
449: reg_errcode_t status = REG_OK;
450: tre_ast_node_t **items, *u, *n;
451: int i = 0, j, max_i = 32, curr_max, curr_min;
452: tre_ctype_t neg_klasses[MAX_NEG_CLASSES];
453: int num_neg_klasses = 0;
454:
455: /* Start off with an array of `max_i' elements. */
456: items = (tre_ast_node_t**)xmalloc(sizeof(*items) * max_i);
457: if (items == NULL)
458: return REG_ESPACE;
459:
460: if (ctx->re < ctx->re_end && *ctx->re == CHAR_CARET) /* do not read past the end of the regexp */
461: {
462: DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n",
463: ctx->re_end - ctx->re, ctx->re));
464: negate = 1;
465: ctx->re++;
466: }
467:
468: status = tre_parse_bracket_items(ctx, negate, neg_klasses, &num_neg_klasses,
469: &items, &i, &max_i);
470:
471: if (status != REG_OK)
472: goto parse_bracket_done;
473:
474: /* Sort the array if we need to negate it. */
475: if (negate)
476: qsort(items, i, sizeof(*items), tre_compare_items);
477:
478: curr_max = curr_min = 0;
479: /* Build a union of the items in the array, negated if necessary. */
480: for (j = 0; j < i && status == REG_OK; j++)
481: {
482: int min, max;
483: tre_literal_t *l = (tre_literal_t*)items[j]->obj;
484: min = l->code_min;
485: max = l->code_max;
486:
487: DPRINT(("item: %d - %d, klass %ld, curr_max = %d\n",
488: (int)l->code_min, (int)l->code_max, (long)l->u.klass, curr_max));
489:
490: if (negate)
491: {
492: if (min < curr_max)
493: {
494: /* Overlap. */
495: curr_max = MAX(max + 1, curr_max);
496: DPRINT(("overlap, curr_max = %d\n", curr_max));
497: l = NULL;
498: }
499: else
500: {
501: /* No overlap. */
502: curr_max = min - 1;
503: if (curr_max >= curr_min)
504: {
505: DPRINT(("no overlap\n"));
506: l->code_min = curr_min;
507: l->code_max = curr_max;
508: }
509: else
510: {
511: DPRINT(("no overlap, zero room\n"));
512: l = NULL;
513: }
514: curr_min = curr_max = max + 1;
515: }
516: }
517:
518: if (l != NULL)
519: {
520: int k;
521: DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
522: l->position = ctx->position;
523: if (num_neg_klasses > 0)
524: {
525: l->neg_klasses = (tre_ctype_t*)tre_mem_alloc(ctx->mem,
526: (sizeof(*l->neg_klasses)
527: * (num_neg_klasses + 1)));
528: if (l->neg_klasses == NULL)
529: {
530: status = REG_ESPACE;
531: break;
532: }
533: for (k = 0; k < num_neg_klasses; k++)
534: l->neg_klasses[k] = neg_klasses[k];
535: l->neg_klasses[k] = (tre_ctype_t)0; /* zero entry terminates the list */
536: }
537: else
538: l->neg_klasses = NULL;
539: if (node == NULL)
540: node = items[j];
541: else
542: {
543: u = tre_ast_new_union(ctx->mem, node, items[j]);
544: if (u == NULL)
545: status = REG_ESPACE;
546: node = u;
547: }
548: }
549: }
550:
551: if (status != REG_OK)
552: goto parse_bracket_done;
553:
554: if (negate)
555: {
556: int k;
557: DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
558: n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
559: if (n == NULL)
560: status = REG_ESPACE;
561: else
562: {
563: tre_literal_t *l = (tre_literal_t*)n->obj;
564: if (num_neg_klasses > 0)
565: {
566: l->neg_klasses = (tre_ctype_t*)tre_mem_alloc(ctx->mem,
567: (sizeof(*l->neg_klasses)
568: * (num_neg_klasses + 1)));
569: if (l->neg_klasses == NULL)
570: {
571: status = REG_ESPACE;
572: goto parse_bracket_done;
573: }
574: for (k = 0; k < num_neg_klasses; k++)
575: l->neg_klasses[k] = neg_klasses[k];
576: l->neg_klasses[k] = (tre_ctype_t)0; /* zero entry terminates the list */
577: }
578: else
579: l->neg_klasses = NULL;
580: if (node == NULL)
581: node = n;
582: else
583: {
584: u = tre_ast_new_union(ctx->mem, node, n);
585: if (u == NULL)
586: status = REG_ESPACE;
587: node = u;
588: }
589: }
590: }
591:
592: if (status != REG_OK)
593: goto parse_bracket_done;
594:
595: #ifdef TRE_DEBUG
596: tre_ast_print(node);
597: #endif /* TRE_DEBUG */
598:
599: parse_bracket_done:
600: xfree(items); /* only the pointer array is freed here, not the nodes it points to */
601: ctx->position++;
602: *result = node;
603: return status;
604: }
605:
606:
607: /* Parses a positive decimal integer and advances *regex past its digits. Returns
608: -1 if there is no digit at *regex; values too large for int saturate at INT_MAX. */
609: static int
610: tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
611: {
612: int num = -1;
613: const tre_char_t *r = *regex;
614: while (r < regex_end && *r >= L'0' && *r <= L'9')
615: {
616: int digit = (int)(*r++ - L'0');
617: if (num < 0)
618: num = 0;
619: num = (num > (INT_MAX - digit) / 10) ? INT_MAX : num * 10 + digit; /* avoid signed overflow */
620: }
621: *regex = r;
622: return num;
623: }
624:
625: /* Parses the contents of a "{...}" bound at ctx->re and wraps *result in an iteration node (an EMPTY literal for {0,0}). */
626: static reg_errcode_t
627: tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
628: {
629: int min, max, i;
630: int cost_ins, cost_del, cost_subst, cost_max;
631: int limit_ins, limit_del, limit_subst, limit_err;
632: const tre_char_t *r = ctx->re;
633: const tre_char_t *start;
634: int minimal = 0;
635: int approx = 0;
636: int costs_set = 0;
637: int counts_set = 0;
638:
639: cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
640: limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
641:
642: /* Parse number (minimum repetition count). */
643: min = -1;
644: if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
645: DPRINT(("tre_parse: min count: '%.*" STRF "'\n", ctx->re_end - r, r));
646: min = tre_parse_int(&r, ctx->re_end);
647: }
648:
649: /* Parse comma and second number (maximum repetition count). */
650: max = min;
651: if (r < ctx->re_end && *r == CHAR_COMMA)
652: {
653: r++;
654: DPRINT(("tre_parse: max count: '%.*" STRF "'\n", ctx->re_end - r, r));
655: max = tre_parse_int(&r, ctx->re_end);
656: }
657:
658: /* Check that the repeat counts are sane. */
659: if ((max >= 0 && min > max) || max > RE_DUP_MAX)
660: return REG_BADBR;
661:
662:
663: /*
664: '{'
665: optionally followed immediately by a number == minimum repcount
666: optionally followed by , then a number == maximum repcount
667: + then a number == maximum insertion count
668: - then a number == maximum deletion count
669: # then a number == maximum substitution count
670: ~ then a number == maximum number of errors
671: Any of +, -, # or ~ without followed by a number means that
672: the maximum count/number of errors is infinite.
673:
674: An equation of the form
675: Xi + Yd + Zs < C
676: can be specified to set costs and the cost limit to a value
677: different from the default value:
678: - X is the cost of an insertion
679: - Y is the cost of a deletion
680: - Z is the cost of a substitution
681: - C is the maximum cost
682:
683: If no count limit or cost is set for an operation, the operation
684: is not allowed at all.
685: */
686:
687:
688: do {
689: int done;
690: start = r;
691:
692: /* Parse count limit settings */
693: done = 0;
694: if (!counts_set)
695: while (r + 1 < ctx->re_end && !done)
696: {
697: switch (*r)
698: {
699: case CHAR_PLUS: /* Insert limit */
700: DPRINT(("tre_parse: ins limit: '%.*" STRF "'\n", ctx->re_end - r, r));
701: r++;
702: limit_ins = tre_parse_int(&r, ctx->re_end);
703: if (limit_ins < 0)
704: limit_ins = INT_MAX;
705: counts_set = 1;
706: break;
707: case CHAR_MINUS: /* Delete limit */
708: DPRINT(("tre_parse: del limit: '%.*" STRF "'\n", ctx->re_end - r, r));
709: r++;
710: limit_del = tre_parse_int(&r, ctx->re_end);
711: if (limit_del < 0)
712: limit_del = INT_MAX;
713: counts_set = 1;
714: break;
715: case CHAR_HASH: /* Substitute limit */
716: DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", ctx->re_end - r, r));
717: r++;
718: limit_subst = tre_parse_int(&r, ctx->re_end);
719: if (limit_subst < 0)
720: limit_subst = INT_MAX;
721: counts_set = 1;
722: break;
723: case CHAR_TILDE: /* Maximum number of changes */
724: DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", ctx->re_end - r, r));
725: r++;
726: limit_err = tre_parse_int(&r, ctx->re_end);
727: if (limit_err < 0)
728: limit_err = INT_MAX;
729: approx = 1;
730: break;
731: case CHAR_COMMA:
732: r++;
733: break;
734: case L' ':
735: r++;
736: break;
737: case L'}':
738: done = 1;
739: break;
740: default:
741: done = 1;
742: break;
743: }
744: }
745:
746: /* Parse cost restriction equation. */
747: done = 0;
748: if (!costs_set)
749: while (r + 1 < ctx->re_end && !done)
750: {
751: switch (*r)
752: {
753: case CHAR_PLUS:
754: case L' ':
755: r++;
756: break;
757: case L'<':
758: DPRINT(("tre_parse: max cost: '%.*" STRF "'\n", ctx->re_end - r, r));
759: r++;
760: while (r < ctx->re_end && *r == L' ') /* never skip blanks past the end of the regexp */
761: r++;
762: cost_max = tre_parse_int(&r, ctx->re_end);
763: if (cost_max < 0)
764: cost_max = INT_MAX;
765: else
766: cost_max--;
767: approx = 1;
768: break;
769: case CHAR_COMMA:
770: r++;
771: done = 1;
772: break;
773: default:
774: if (*r >= L'0' && *r <= L'9')
775: {
776: #ifdef TRE_DEBUG
777: const tre_char_t *sr = r;
778: #endif /* TRE_DEBUG */
779: int cost = tre_parse_int(&r, ctx->re_end);
780: /* A number that runs to the very end has no cost letter: REG_BADBR. */
781: switch (r < ctx->re_end ? *r : (tre_char_t)0)
782: {
783: case L'i': /* Insert cost */
784: DPRINT(("tre_parse: ins cost: '%.*" STRF "'\n",
785: ctx->re_end - sr, sr));
786: r++;
787: cost_ins = cost;
788: costs_set = 1;
789: break;
790: case L'd': /* Delete cost */
791: DPRINT(("tre_parse: del cost: '%.*" STRF "'\n",
792: ctx->re_end - sr, sr));
793: r++;
794: cost_del = cost;
795: costs_set = 1;
796: break;
797: case L's': /* Substitute cost */
798: DPRINT(("tre_parse: subst cost: '%.*" STRF "'\n",
799: ctx->re_end - sr, sr));
800: r++;
801: cost_subst = cost;
802: costs_set = 1;
803: break;
804: default:
805: return REG_BADBR;
806: }
807: }
808: else
809: {
810: done = 1;
811: break;
812: }
813: }
814: }
815: } while (start != r);
816:
817: /* Missing }. */
818: if (r >= ctx->re_end)
819: return REG_EBRACE;
820:
821: /* Empty contents of {}. */
822: if (r == ctx->re)
823: return REG_BADBR;
824:
825: /* Parse the ending '}' or '\}'.*/
826: if (ctx->cflags & REG_EXTENDED)
827: {
828: if (r >= ctx->re_end || *r != CHAR_RBRACE)
829: return REG_BADBR;
830: r++;
831: }
832: else
833: {
834: if (r + 1 >= ctx->re_end
835: || *r != CHAR_BACKSLASH
836: || *(r + 1) != CHAR_RBRACE)
837: return REG_BADBR;
838: r += 2;
839: }
840:
841:
842: /* Parse trailing '?' marking minimal repetition. */
843: if (r < ctx->re_end && *r == CHAR_QUESTIONMARK)
844: {
845: minimal = 1;
846: r++;
847: }
848:
849: /* Create the AST node(s). */
850: if (min == 0 && max == 0)
851: {
852: *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
853: if (*result == NULL)
854: return REG_ESPACE;
855: }
856: else
857: {
858: if (min < 0 && max < 0)
859: /* Only approximate parameters set, no repetitions. */
860: min = max = 1;
861:
862: *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
863: if (!*result)
864: return REG_ESPACE;
865:
866: /* If approximate matching parameters are set, add them to the
867: iteration node. */
868: if (approx || costs_set || counts_set)
869: {
870: unsigned int *params;
871: tre_iteration_t *iter = (tre_iteration_t*)(*result)->obj;
872:
873: if (costs_set || counts_set)
874: {
875: if (limit_ins == TRE_PARAM_UNSET)
876: {
877: if (cost_ins == TRE_PARAM_UNSET)
878: limit_ins = 0;
879: else
880: limit_ins = INT_MAX;
881: }
882:
883: if (limit_del == TRE_PARAM_UNSET)
884: {
885: if (cost_del == TRE_PARAM_UNSET)
886: limit_del = 0;
887: else
888: limit_del = INT_MAX;
889: }
890:
891: if (limit_subst == TRE_PARAM_UNSET)
892: {
893: if (cost_subst == TRE_PARAM_UNSET)
894: limit_subst = 0;
895: else
896: limit_subst = INT_MAX;
897: }
898: }
899:
900: if (cost_max == TRE_PARAM_UNSET)
901: cost_max = INT_MAX;
902: if (limit_err == TRE_PARAM_UNSET)
903: limit_err = INT_MAX;
904:
905: ctx->have_approx = 1;
906: params = (unsigned int*)tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
907: if (!params)
908: return REG_ESPACE;
909: for (i = 0; i < TRE_PARAM_LAST; i++)
910: params[i] = TRE_PARAM_UNSET;
911: params[TRE_PARAM_COST_INS] = cost_ins;
912: params[TRE_PARAM_COST_DEL] = cost_del;
913: params[TRE_PARAM_COST_SUBST] = cost_subst;
914: params[TRE_PARAM_COST_MAX] = cost_max;
915: params[TRE_PARAM_MAX_INS] = limit_ins;
916: params[TRE_PARAM_MAX_DEL] = limit_del;
917: params[TRE_PARAM_MAX_SUBST] = limit_subst;
918: params[TRE_PARAM_MAX_ERR] = limit_err;
919: iter->params = params;
920: }
921: }
922:
923: DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
924: "limits [%d,%d,%d, total %d]\n",
925: min, max, cost_ins, cost_del, cost_subst, cost_max,
926: limit_ins, limit_del, limit_subst, limit_err));
927:
928:
929: ctx->re = r; /* the read position is committed only on success */
930: return REG_OK;
931: }
932: /* Symbols that tre_parse() pushes on its explicit stack; each popped symbol selects a case of its main loop. */
933: typedef enum {
934: PARSE_RE = 0, /* a full regexp: one or more branches separated by `|' */
935: PARSE_ATOM, /* a single atom */
936: PARSE_MARK_FOR_SUBMATCH, /* pushed after the regexp pointer and a submatch id */
937: PARSE_BRANCH, /* a branch: one or more concatenated pieces */
938: PARSE_PIECE, /* an atom possibly followed by postfix operators */
939: PARSE_CATENATION, /* parse another piece unless the expression has ended */
940: PARSE_POST_CATENATION, /* join the saved tree and the current result by catenation */
941: PARSE_UNION, /* look for `|' and, if present, parse another branch */
942: PARSE_POST_UNION, /* join the saved tree and the current result by union */
943: PARSE_POSTFIX, /* postfix operators: `*', `+', `?' and bounds */
944: PARSE_RESTORE_CFLAGS /* NOTE(review): handler not visible here; presumably restores ctx->cflags */
945: } tre_parse_re_stack_symbol_t;
946:
947:
948: reg_errcode_t
949: tre_parse(tre_parse_ctx_t *ctx)
950: {
951: tre_ast_node_t *result = NULL;
952: tre_parse_re_stack_symbol_t symbol;
953: reg_errcode_t status = REG_OK;
954: tre_stack_t *stack = ctx->stack;
955: int bottom = tre_stack_num_objects(stack);
956: int depth = 0;
957: int temporary_cflags = 0;
958:
959: DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
960: ctx->len, ctx->re, ctx->len));
961:
962: if (!ctx->nofirstsub)
963: {
964: STACK_PUSH(stack, ctx->re);
965: STACK_PUSH(stack, ctx->submatch_id);
966: STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH);
967: ctx->submatch_id++;
968: }
969: STACK_PUSH(stack, PARSE_RE);
970: ctx->re_start = ctx->re;
971: ctx->re_end = ctx->re + ctx->len;
972:
973:
974: /* The following is basically just a recursive descent parser. I use
975: an explicit stack instead of recursive functions mostly because of
976: two reasons: compatibility with systems which have an overflowable
977: call stack, and efficiency (both in lines of code and speed). */
978: while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
979: {
980: if (status != REG_OK)
981: break;
982: symbol = (tre_parse_re_stack_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
983: switch (symbol)
984: {
985: case PARSE_RE:
986: /* Parse a full regexp. A regexp is one or more branches,
987: separated by the union operator `|'. */
988: #ifdef REG_LITERAL
989: if (!(ctx->cflags & REG_LITERAL)
990: && ctx->cflags & REG_EXTENDED)
991: #endif /* REG_LITERAL */
992: STACK_PUSHX(stack, PARSE_UNION);
993: STACK_PUSHX(stack, PARSE_BRANCH);
994: break;
995:
996: case PARSE_BRANCH:
997: /* Parse a branch. A branch is one or more pieces, concatenated.
998: A piece is an atom possibly followed by a postfix operator. */
999: STACK_PUSHX(stack, PARSE_CATENATION);
1000: STACK_PUSHX(stack, PARSE_PIECE);
1001: break;
1002:
1003: case PARSE_PIECE:
1004: /* Parse a piece. A piece is an atom possibly followed by one
1005: or more postfix operators. */
1006: #ifdef REG_LITERAL
1007: if (!(ctx->cflags & REG_LITERAL))
1008: #endif /* REG_LITERAL */
1009: STACK_PUSHX(stack, PARSE_POSTFIX);
1010: STACK_PUSHX(stack, PARSE_ATOM);
1011: break;
1012:
1013: case PARSE_CATENATION:
1014: /* If the expression has not ended, parse another piece. */
1015: {
1016: tre_char_t c;
1017: if (ctx->re >= ctx->re_end)
1018: break;
1019: c = *ctx->re;
1020: #ifdef REG_LITERAL
1021: if (!(ctx->cflags & REG_LITERAL))
1022: {
1023: #endif /* REG_LITERAL */
1024: if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
1025: break;
1026: if ((ctx->cflags & REG_EXTENDED
1027: && c == CHAR_RPAREN && depth > 0)
1028: || (!(ctx->cflags & REG_EXTENDED)
1029: && (c == CHAR_BACKSLASH
1030: && *(ctx->re + 1) == CHAR_RPAREN)))
1031: {
1032: if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
1033: status = REG_EPAREN;
1034: DPRINT(("tre_parse: group end: '%.*" STRF "'\n",
1035: ctx->re_end - ctx->re, ctx->re));
1036: depth--;
1037: if (!(ctx->cflags & REG_EXTENDED))
1038: ctx->re += 2;
1039: break;
1040: }
1041: #ifdef REG_LITERAL
1042: }
1043: #endif /* REG_LITERAL */
1044:
1045: #ifdef REG_RIGHT_ASSOC
1046: if (ctx->cflags & REG_RIGHT_ASSOC)
1047: {
1048: /* Right associative concatenation. */
1049: STACK_PUSHX(stack, result);
1050: STACK_PUSHX(stack, PARSE_POST_CATENATION);
1051: STACK_PUSHX(stack, PARSE_CATENATION);
1052: STACK_PUSHX(stack, PARSE_PIECE);
1053: }
1054: else
1055: #endif /* REG_RIGHT_ASSOC */
1056: {
1057: /* Default case, left associative concatenation. */
1058: STACK_PUSHX(stack, PARSE_CATENATION);
1059: STACK_PUSHX(stack, result);
1060: STACK_PUSHX(stack, PARSE_POST_CATENATION);
1061: STACK_PUSHX(stack, PARSE_PIECE);
1062: }
1063: break;
1064: }
1065:
1066: case PARSE_POST_CATENATION:
1067: {
1068: tre_ast_node_t *tree = (tre_ast_node_t*)tre_stack_pop(stack);
1069: tre_ast_node_t *tmp_node;
1070: tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
1071: if (!tmp_node)
1072: return REG_ESPACE;
1073: result = tmp_node;
1074: break;
1075: }
1076:
1077: case PARSE_UNION:
1078: if (ctx->re >= ctx->re_end)
1079: break;
1080: #ifdef REG_LITERAL
1081: if (ctx->cflags & REG_LITERAL)
1082: break;
1083: #endif /* REG_LITERAL */
1084: switch (*ctx->re)
1085: {
1086: case CHAR_PIPE:
1087: DPRINT(("tre_parse: union: '%.*" STRF "'\n",
1088: ctx->re_end - ctx->re, ctx->re));
1089: STACK_PUSHX(stack, PARSE_UNION);
1090: STACK_PUSHX(stack, result);
1091: STACK_PUSHX(stack, PARSE_POST_UNION);
1092: STACK_PUSHX(stack, PARSE_BRANCH);
1093: ctx->re++;
1094: break;
1095:
1096: case CHAR_RPAREN:
1097: ctx->re++;
1098: break;
1099:
1100: default:
1101: break;
1102: }
1103: break;
1104:
1105: case PARSE_POST_UNION:
1106: {
1107: tre_ast_node_t *tmp_node;
1108: tre_ast_node_t *tree = (tre_ast_node_t*)tre_stack_pop(stack);
1109: tmp_node = tre_ast_new_union(ctx->mem, tree, result);
1110: if (!tmp_node)
1111: return REG_ESPACE;
1112: result = tmp_node;
1113: break;
1114: }
1115:
1116: case PARSE_POSTFIX:
1117: /* Parse postfix operators. */
1118: if (ctx->re >= ctx->re_end)
1119: break;
1120: #ifdef REG_LITERAL
1121: if (ctx->cflags & REG_LITERAL)
1122: break;
1123: #endif /* REG_LITERAL */
1124: switch (*ctx->re)
1125: {
1126: case CHAR_STAR:
1127: case CHAR_PLUS:
1128: case CHAR_QUESTIONMARK:
1129: {
1130: tre_ast_node_t *tmp_node;
1131: int minimal = 0;
1132: int rep_min = 0;
1133: int rep_max = -1;
1134: if (*ctx->re == CHAR_PLUS)
1135: rep_min = 1;
1136: if (*ctx->re == CHAR_QUESTIONMARK)
1137: rep_max = 1;
1138:
1139: if (ctx->re + 1 < ctx->re_end
1140: && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1141: minimal = 1;
1142: DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
1143: minimal ? " minimal" : "greedy",
1144: ctx->re_end - ctx->re, ctx->re));
1145: ctx->re += minimal + 1;
1146: tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
1147: minimal);
1148: if (tmp_node == NULL)
1149: return REG_ESPACE;
1150: result = tmp_node;
1151: STACK_PUSHX(stack, PARSE_POSTFIX);
1152: break;
1153: }
1154:
1155: case CHAR_BACKSLASH:
1156: /* "\{" is special without REG_EXTENDED */
1157: if (!(ctx->cflags & REG_EXTENDED)
1158: && ctx->re + 1 < ctx->re_end
1159: && *(ctx->re + 1) == CHAR_LBRACE)
1160: {
1161: ctx->re++;
1162: goto parse_brace;
1163: }
1164: else
1165: break;
1166:
1167: case CHAR_LBRACE:
1168: /* "{" is literal without REG_EXTENDED */
1169: if (!(ctx->cflags & REG_EXTENDED))
1170: break;
1171:
1172: parse_brace:
1173: DPRINT(("tre_parse: bound: '%.*" STRF "'\n",
1174: ctx->re_end - ctx->re, ctx->re));
1175: ctx->re++;
1176:
1177: status = tre_parse_bound(ctx, &result);
1178: if (status != REG_OK)
1179: return status;
1180: STACK_PUSHX(stack, PARSE_POSTFIX);
1181: break;
1182: }
1183: break;
1184:
1185: case PARSE_ATOM:
1186: /* Parse an atom. An atom is a regular expression enclosed in `()',
1187: an empty set of `()', a bracket expression, `.', `^', `$',
1188: a `\' followed by a character, or a single character. */
1189:
1190: /* End of regexp? (empty string). */
1191: if (ctx->re >= ctx->re_end)
1192: goto parse_literal;
1193:
1194: #ifdef REG_LITERAL
1195: if (ctx->cflags & REG_LITERAL)
1196: goto parse_literal;
1197: #endif /* REG_LITERAL */
1198:
1199: switch (*ctx->re)
1200: {
1201: case CHAR_LPAREN: /* parenthesized subexpression */
1202:
1203: /* Handle "(?...)" extensions. They work in a way similar
1204: to Perls corresponding extensions. */
1205: if (ctx->cflags & REG_EXTENDED
1206: && *(ctx->re + 1) == CHAR_QUESTIONMARK)
1207: {
1208: int new_cflags = ctx->cflags;
1209: int bit = 1;
1210: DPRINT(("tre_parse: extension: '%.*" STRF "\n",
1211: ctx->re_end - ctx->re, ctx->re));
1212: ctx->re += 2;
1213: while (1)
1214: {
1215: if (*ctx->re == L'i')
1216: {
1217: DPRINT(("tre_parse: icase: '%.*" STRF "\n",
1218: ctx->re_end - ctx->re, ctx->re));
1219: if (bit)
1220: new_cflags |= REG_ICASE;
1221: else
1222: new_cflags &= ~REG_ICASE;
1223: ctx->re++;
1224: }
1225: else if (*ctx->re == L'n')
1226: {
1227: DPRINT(("tre_parse: newline: '%.*" STRF "\n",
1228: ctx->re_end - ctx->re, ctx->re));
1229: if (bit)
1230: new_cflags |= REG_NEWLINE;
1231: else
1232: new_cflags &= ~REG_NEWLINE;
1233: ctx->re++;
1234: }
1235: #ifdef REG_RIGHT_ASSOC
1236: else if (*ctx->re == L'r')
1237: {
1238: DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
1239: ctx->re_end - ctx->re, ctx->re));
1240: if (bit)
1241: new_cflags |= REG_RIGHT_ASSOC;
1242: else
1243: new_cflags &= ~REG_RIGHT_ASSOC;
1244: ctx->re++;
1245: }
1246: #endif /* REG_RIGHT_ASSOC */
1247: else if (*ctx->re == CHAR_MINUS)
1248: {
1249: DPRINT(("tre_parse: turn off: '%.*" STRF "\n",
1250: ctx->re_end - ctx->re, ctx->re));
1251: ctx->re++;
1252: bit = 0;
1253: }
1254: else if (*ctx->re == CHAR_COLON)
1255: {
1256: DPRINT(("tre_parse: no group: '%.*" STRF "\n",
1257: ctx->re_end - ctx->re, ctx->re));
1258: ctx->re++;
1259: depth++;
1260: break;
1261: }
1262: else if (*ctx->re == CHAR_RPAREN)
1263: {
1264: ctx->re++;
1265: break;
1266: }
1267: else
1268: return REG_BADPAT;
1269: }
1270:
1271: /* Turn on the cflags changes for the rest of the
1272: enclosing group. */
1273: STACK_PUSHX(stack, ctx->cflags);
1274: STACK_PUSHX(stack, PARSE_RESTORE_CFLAGS);
1275: STACK_PUSHX(stack, PARSE_RE);
1276: ctx->cflags = new_cflags;
1277: break;
1278: }
1279:
1280: if (ctx->cflags & REG_EXTENDED
1281: || (ctx->re > ctx->re_start
1282: && *(ctx->re - 1) == CHAR_BACKSLASH))
1283: {
1284: depth++;
1285: if (ctx->re + 2 < ctx->re_end
1286: && *(ctx->re + 1) == CHAR_QUESTIONMARK
1287: && *(ctx->re + 2) == CHAR_COLON)
1288: {
1289: DPRINT(("tre_parse: group begin: '%.*" STRF
1290: "', no submatch\n",
1291: ctx->re_end - ctx->re, ctx->re));
1292: /* Don't mark for submatching. */
1293: ctx->re += 3;
1294: STACK_PUSHX(stack, PARSE_RE);
1295: }
1296: else
1297: {
1298: DPRINT(("tre_parse: group begin: '%.*" STRF
1299: "', submatch %d\n",
1300: ctx->re_end - ctx->re, ctx->re,
1301: ctx->submatch_id));
1302: ctx->re++;
1303: /* First parse a whole RE, then mark the resulting tree
1304: for submatching. */
1305: STACK_PUSHX(stack, ctx->submatch_id);
1306: STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH);
1307: STACK_PUSHX(stack, PARSE_RE);
1308: ctx->submatch_id++;
1309: }
1310: }
1311: else
1312: goto parse_literal;
1313: break;
1314:
1315: case CHAR_RPAREN: /* end of current subexpression */
1316: if ((ctx->cflags & REG_EXTENDED && depth > 0)
1317: || (ctx->re > ctx->re_start
1318: && *(ctx->re - 1) == CHAR_BACKSLASH))
1319: {
1320: DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1321: ctx->re_end - ctx->re, ctx->re));
1322: /* We were expecting an atom, but instead the current
1323: subexpression was closed. POSIX leaves the meaning of
1324: this to be implementation-defined. We interpret this as
1325: an empty expression (which matches an empty string). */
1326: result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1327: if (result == NULL)
1328: return REG_ESPACE;
1329: if (!(ctx->cflags & REG_EXTENDED))
1330: ctx->re--;
1331: }
1332: else
1333: goto parse_literal;
1334: break;
1335:
1336: case CHAR_LBRACKET: /* bracket expression */
1337: DPRINT(("tre_parse: bracket: '%.*" STRF "'\n",
1338: ctx->re_end - ctx->re, ctx->re));
1339: ctx->re++;
1340: status = tre_parse_bracket(ctx, &result);
1341: if (status != REG_OK)
1342: return status;
1343: break;
1344:
1345: case CHAR_BACKSLASH:
1346: /* If this is "\(" or "\)" chew off the backslash and
1347: try again. */
1348: if (!(ctx->cflags & REG_EXTENDED)
1349: && ctx->re + 1 < ctx->re_end
1350: && (*(ctx->re + 1) == CHAR_LPAREN
1351: || *(ctx->re + 1) == CHAR_RPAREN))
1352: {
1353: ctx->re++;
1354: STACK_PUSHX(stack, PARSE_ATOM);
1355: break;
1356: }
1357:
1358: /* If a macro is used, parse the expanded macro recursively. */
1359: {
1360: tre_char_t buf[64];
1361: tre_expand_macro(ctx->re + 1, ctx->re_end,
1362: buf, elementsof(buf));
1363: if (buf[0] != 0)
1364: {
1365: tre_parse_ctx_t subctx;
1366: memcpy(&subctx, ctx, sizeof(subctx));
1367: subctx.re = buf;
1368: subctx.len = tre_strlen((const char*)buf);
1369: subctx.nofirstsub = 1;
1370: status = tre_parse(&subctx);
1371: if (status != REG_OK)
1372: return status;
1373: ctx->re += 2;
1374: ctx->position = subctx.position;
1375: result = subctx.result;
1376: break;
1377: }
1378: }
1379:
1380: if (ctx->re + 1 >= ctx->re_end)
1381: /* Trailing backslash. */
1382: return REG_EESCAPE;
1383:
1384: #ifdef REG_LITERAL
1385: if (*(ctx->re + 1) == L'Q')
1386: {
1387: DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
1388: ctx->re_end - ctx->re, ctx->re));
1389: ctx->cflags |= REG_LITERAL;
1390: temporary_cflags |= REG_LITERAL;
1391: ctx->re += 2;
1392: STACK_PUSHX(stack, PARSE_ATOM);
1393: break;
1394: }
1395: #endif /* REG_LITERAL */
1396:
1397: DPRINT(("tre_parse: bleep: '%.*" STRF "'\n",
1398: ctx->re_end - ctx->re, ctx->re));
1399: ctx->re++;
1400: switch (*ctx->re)
1401: {
1402: case L'b':
1403: result = tre_ast_new_literal(ctx->mem, ASSERTION,
1404: ASSERT_AT_WB, -1);
1405: ctx->re++;
1406: break;
1407: case L'B':
1408: result = tre_ast_new_literal(ctx->mem, ASSERTION,
1409: ASSERT_AT_WB_NEG, -1);
1410: ctx->re++;
1411: break;
1412: case L'<':
1413: result = tre_ast_new_literal(ctx->mem, ASSERTION,
1414: ASSERT_AT_BOW, -1);
1415: ctx->re++;
1416: break;
1417: case L'>':
1418: result = tre_ast_new_literal(ctx->mem, ASSERTION,
1419: ASSERT_AT_EOW, -1);
1420: ctx->re++;
1421: break;
1422: case L'x':
1423: ctx->re++;
1424: if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
1425: {
1426: /* 8 bit hex char. */
1427: char tmp[3] = {0, 0, 0};
1428: long val;
1429: DPRINT(("tre_parse: 8 bit hex: '%.*" STRF "'\n",
1430: ctx->re_end - ctx->re + 2, ctx->re - 2));
1431:
1432: if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1433: {
1434: tmp[0] = (char)ctx->re[0];
1435: ctx->re++;
1436: }
1437: if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
1438: {
1439: tmp[1] = (char)ctx->re[0];
1440: ctx->re++;
1441: }
1442: val = strtol(tmp, NULL, 16);
1443: result = tre_ast_new_literal(ctx->mem, val, val,
1444: ctx->position);
1445: ctx->position++;
1446: break;
1447: }
1448: else if (ctx->re < ctx->re_end)
1449: {
1450: /* Wide char. */
1451: char tmp[32];
1452: long val;
1453: int i = 0;
1454: ctx->re++;
1455: while (ctx->re_end - ctx->re >= 0)
1456: {
1457: if (ctx->re[0] == CHAR_RBRACE)
1458: break;
1459: if (tre_isxdigit(ctx->re[0]))
1460: {
1461: tmp[i] = (char)ctx->re[0];
1462: i++;
1463: ctx->re++;
1464: continue;
1465: }
1466: return REG_EBRACE;
1467: }
1468: ctx->re++;
1469: tmp[i] = 0;
1470: val = strtol(tmp, NULL, 16);
1471: result = tre_ast_new_literal(ctx->mem, val, val,
1472: ctx->position);
1473: ctx->position++;
1474: break;
1475: }
1476:
1477: default:
1478: if (tre_isdigit(*ctx->re))
1479: {
1480: /* Back reference. */
1481: int val = *ctx->re - L'0';
1482: DPRINT(("tre_parse: backref: '%.*" STRF "'\n",
1483: ctx->re_end - ctx->re + 1, ctx->re - 1));
1484: result = tre_ast_new_literal(ctx->mem, BACKREF, val,
1485: ctx->position);
1486: if (result == NULL)
1487: return REG_ESPACE;
1488: ctx->position++;
1489: ctx->max_backref = MAX(val, ctx->max_backref);
1490: ctx->re++;
1491: }
1492: else
1493: {
1494: /* Escaped character. */
1495: DPRINT(("tre_parse: escaped: '%.*" STRF "'\n",
1496: ctx->re_end - ctx->re + 1, ctx->re - 1));
1497: result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1498: ctx->position);
1499: ctx->position++;
1500: ctx->re++;
1501: }
1502: break;
1503: }
1504: if (result == NULL)
1505: return REG_ESPACE;
1506: break;
1507:
1508: case CHAR_PERIOD: /* the any-symbol */
1509: DPRINT(("tre_parse: any: '%.*" STRF "'\n",
1510: ctx->re_end - ctx->re, ctx->re));
1511: if (ctx->cflags & REG_NEWLINE)
1512: {
1513: tre_ast_node_t *tmp1;
1514: tre_ast_node_t *tmp2;
1515: tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
1516: ctx->position);
1517: if (!tmp1)
1518: return REG_ESPACE;
1519: tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
1520: ctx->position + 1);
1521: if (!tmp2)
1522: return REG_ESPACE;
1523: result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1524: if (!result)
1525: return REG_ESPACE;
1526: ctx->position += 2;
1527: }
1528: else
1529: {
1530: result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
1531: ctx->position);
1532: if (!result)
1533: return REG_ESPACE;
1534: ctx->position++;
1535: }
1536: ctx->re++;
1537: break;
1538:
1539: case CHAR_CARET: /* beginning of line assertion */
1540: /* '^' has a special meaning everywhere in EREs, and in the
1541: beginning of the RE and after \( is BREs. */
1542: if (ctx->cflags & REG_EXTENDED
1543: || (ctx->re - 2 >= ctx->re_start
1544: && *(ctx->re - 2) == CHAR_BACKSLASH
1545: && *(ctx->re - 1) == CHAR_LPAREN)
1546: || ctx->re == ctx->re_start)
1547: {
1548: DPRINT(("tre_parse: BOL: '%.*" STRF "'\n",
1549: ctx->re_end - ctx->re, ctx->re));
1550: result = tre_ast_new_literal(ctx->mem, ASSERTION,
1551: ASSERT_AT_BOL, -1);
1552: if (result == NULL)
1553: return REG_ESPACE;
1554: ctx->re++;
1555: }
1556: else
1557: goto parse_literal;
1558: break;
1559:
1560: case CHAR_DOLLAR: /* end of line assertion. */
1561: /* '$' is special everywhere in EREs, and in the end of the
1562: string and before \) is BREs. */
1563: if (ctx->cflags & REG_EXTENDED
1564: || (ctx->re + 2 < ctx->re_end
1565: && *(ctx->re + 1) == CHAR_BACKSLASH
1566: && *(ctx->re + 2) == CHAR_RPAREN)
1567: || ctx->re + 1 == ctx->re_end)
1568: {
1569: DPRINT(("tre_parse: EOL: '%.*" STRF "'\n",
1570: ctx->re_end - ctx->re, ctx->re));
1571: result = tre_ast_new_literal(ctx->mem, ASSERTION,
1572: ASSERT_AT_EOL, -1);
1573: if (result == NULL)
1574: return REG_ESPACE;
1575: ctx->re++;
1576: }
1577: else
1578: goto parse_literal;
1579: break;
1580:
1581: default:
1582: parse_literal:
1583:
1584: if (temporary_cflags && ctx->re + 1 < ctx->re_end
1585: && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
1586: {
1587: DPRINT(("tre_parse: end tmps: '%.*" STRF "'\n",
1588: ctx->re_end - ctx->re, ctx->re));
1589: ctx->cflags &= ~temporary_cflags;
1590: temporary_cflags = 0;
1591: ctx->re += 2;
1592: STACK_PUSHX(stack, PARSE_ATOM);
1593: break;
1594: }
1595:
1596:
1597: /* We are expecting an atom. If the subexpression (or the whole
1598: regexp ends here, we interpret it as an empty expression
1599: (which matches an empty string). */
1600: if (
1601: #ifdef REG_LITERAL
1602: !(ctx->cflags & REG_LITERAL) &&
1603: #endif /* REG_LITERAL */
1604: (ctx->re >= ctx->re_end
1605: || *ctx->re == CHAR_STAR
1606: || (ctx->cflags & REG_EXTENDED
1607: && (*ctx->re == CHAR_PIPE
1608: || *ctx->re == CHAR_LBRACE
1609: || *ctx->re == CHAR_PLUS
1610: || *ctx->re == CHAR_QUESTIONMARK))
1611: /* Test for "\)" in BRE mode. */
1612: || (!(ctx->cflags & REG_EXTENDED)
1613: && ctx->re + 1 < ctx->re_end
1614: && *ctx->re == CHAR_BACKSLASH
1615: && *(ctx->re + 1) == CHAR_LBRACE)))
1616: {
1617: DPRINT(("tre_parse: empty: '%.*" STRF "'\n",
1618: ctx->re_end - ctx->re, ctx->re));
1619: result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1620: if (!result)
1621: return REG_ESPACE;
1622: break;
1623: }
1624:
1625: DPRINT(("tre_parse: literal: '%.*" STRF "'\n",
1626: ctx->re_end - ctx->re, ctx->re));
1627: /* Note that we can't use an tre_isalpha() test here, since there
1628: may be characters which are alphabetic but neither upper or
1629: lower case. */
1630: if (ctx->cflags & REG_ICASE
1631: && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
1632: {
1633: tre_ast_node_t *tmp1;
1634: tre_ast_node_t *tmp2;
1635:
1636: /* XXX - Can there be more than one opposite-case
1637: counterpoints for some character in some locale? Or
1638: more than two characters which all should be regarded
1639: the same character if case is ignored? If yes, there
1640: does not seem to be a portable way to detect it. I guess
1641: that at least for multi-character collating elements there
1642: could be several opposite-case counterpoints, but they
1643: cannot be supported portably anyway. */
1644: tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
1645: tre_toupper(*ctx->re),
1646: ctx->position);
1647: if (!tmp1)
1648: return REG_ESPACE;
1649: tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
1650: tre_tolower(*ctx->re),
1651: ctx->position);
1652: if (!tmp2)
1653: return REG_ESPACE;
1654: result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
1655: if (!result)
1656: return REG_ESPACE;
1657: }
1658: else
1659: {
1660: result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
1661: ctx->position);
1662: if (!result)
1663: return REG_ESPACE;
1664: }
1665: ctx->position++;
1666: ctx->re++;
1667: break;
1668: }
1669: break;
1670:
1671: case PARSE_MARK_FOR_SUBMATCH:
1672: {
1673: int submatch_id = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
1674:
1675: if (result->submatch_id >= 0)
1676: {
1677: tre_ast_node_t *n, *tmp_node;
1678: n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1679: if (n == NULL)
1680: return REG_ESPACE;
1681: tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
1682: if (tmp_node == NULL)
1683: return REG_ESPACE;
1684: tmp_node->num_submatches = result->num_submatches;
1685: result = tmp_node;
1686: }
1687: result->submatch_id = submatch_id;
1688: result->num_submatches++;
1689: break;
1690: }
1691:
1692: case PARSE_RESTORE_CFLAGS:
1693: ctx->cflags = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
1694: break;
1695: }
1696: }
1697:
1698: /* Check for missing closing parentheses. */
1699: if (depth > 0)
1700: return REG_EPAREN;
1701:
1702: if (status == REG_OK)
1703: ctx->result = result;
1704:
1705: return status;
1706: }
1707:
1708: /* EOF */
Start cpp section to tre/tre_parse.hpp[1
/1
]
1: #line 8778 "./lpsrc/tre.pak"
2: /*
3: tre-parse.c - Regexp parser definitions
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19: */
20:
21:
22:
23:
24: /* Parse context handed to tre_parse(). */
25: typedef struct {
26: /* Memory allocator. The AST is allocated using this. */
27: tre_mem_t mem;
28: /* Stack used for keeping track of regexp syntax. */
29: tre_stack_t *stack;
30: /* The parse result; tre_parse() stores the tree here on REG_OK. */
31: tre_ast_node_t *result;
32: /* The regexp to parse; tre_parse() advances this as input is consumed. */
33: const tre_char_t *re;
34: /* The first character of the entire regexp. */
35: const tre_char_t *re_start;
36: /* The first character after the end of the regexp. */
37: const tre_char_t *re_end;
38: int len; /* Length of the regexp that `re' points to. */
39: /* Current submatch ID. */
40: int submatch_id;
41: /* Current position (number of literal). */
42: int position;
43: /* The highest back reference or -1 if none seen so far. */
44: int max_backref;
45: /* This flag is set if the regexp uses approximate matching. */
46: int have_approx;
47: /* Compilation flags (REG_EXTENDED, ...); tre_parse() may alter them while parsing. */
48: int cflags;
49: /* If this flag is set the top-level submatch is not captured. */
50: int nofirstsub;
51: /* The currently set approximate matching parameters. */
52: int params[TRE_PARAM_LAST];
53: } tre_parse_ctx_t;
54:
55: /* Parses the wide character regexp in `ctx' (BRE or ERE syntax, including
56: the TRE extensions); on REG_OK the syntax tree is left in `ctx->result'. */
57: reg_errcode_t
58: tre_parse(tre_parse_ctx_t *ctx);
59:
60:
61:
62: /* EOF */
Start cpp section to tre/tre_stack.cpp[1
/1
]
1: #line 8841 "./lpsrc/tre.pak"
2: /*
3: tre-stack.c - Simple stack implementation
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19: */
20:
21:
22:
23:
24:
25:
26:
27:
28:
29: struct tre_stack_rec {
30: int size; /* Number of slots currently allocated in `stack'. */
31: int max_size; /* Upper bound for `size'; a push that would exceed it fails. */
32: int increment; /* Number of slots added each time the array grows. */
33: int ptr; /* Index of the next free slot, i.e. the number of stored objects. */
34: void **stack; /* The slot array itself. */
35: };
36:
37:
38: tre_stack_t *
39: tre_stack_new(int size, int max_size, int increment)
40: {
41:   /* Allocate the header first; give up at once if that fails. */
42:   tre_stack_t *stk = (tre_stack_t*)xmalloc(sizeof(*stk));
43:   if (stk == NULL)
44:     return NULL;
45:
46:   /* Room for `size' pointer slots; release the header on failure. */
47:   stk->stack = (void**)xmalloc(sizeof(*stk->stack) * size);
48:   if (stk->stack == NULL)
49:     {
50:       xfree(stk);
51:       return NULL;
52:     }
53:   stk->ptr = 0;
54:   stk->size = size;
55:   stk->max_size = max_size;
56:   stk->increment = increment;
57:   return stk;
58: }
59: void
60: tre_stack_destroy(tre_stack_t *s)
61: {
62:   void **slots = s->stack;   /* the slot array is released before its owner */
63:   xfree(slots);
64:   xfree(s);
65: }
66: int
67: tre_stack_num_objects(tre_stack_t *s)
68: {
69:   const int stored = s->ptr;   /* `ptr' doubles as the element count */
70:   return stored;
71: }
72:
73: reg_errcode_t
74: tre_stack_push(tre_stack_t *s, void *value)
75: {
76:   /* Grow the slot array first if it is already full. */
77:   if (s->ptr >= s->size)
78:     {
79:       void **new_buffer;
80:       int new_size;
81:
82:       if (s->size >= s->max_size)
83:         {
84:           DPRINT(("tre_stack_push: stack full\n"));
85:           return REG_ESPACE;
86:         }
87:       DPRINT(("tre_stack_push: trying to realloc more space\n"));
88:       new_size = s->size + s->increment;
89:       if (new_size > s->max_size)
90:         new_size = s->max_size;
91:       /* A non-positive increment cannot make room.  Fail cleanly here
92:          instead of retrying forever once assert() is compiled out. */
93:       if (new_size <= s->size)
94:         return REG_ESPACE;
95:       new_buffer = (void**)xrealloc(s->stack, sizeof(*new_buffer) * new_size);
96:       if (new_buffer == NULL)
97:         {
98:           DPRINT(("tre_stack_push: realloc failed.\n"));
99:           return REG_ESPACE;
100:         }
101:       DPRINT(("tre_stack_push: realloc succeeded.\n"));
102:       s->size = new_size;
103:       s->stack = new_buffer;
104:     }
105:
106:   /* There is room now: store the value in the next free slot. */
107:   s->stack[s->ptr] = value;
108:   s->ptr++;
109:   return REG_OK;
110: }
111: void *
112: tre_stack_pop(tre_stack_t *s)
113: {
114:   assert(s->ptr > 0);   /* popping an empty stack would read stack[-1] */
115:   return s->stack[--s->ptr];
116: }
117:
118: /* EOF */
Start cpp section to tre/tre_stack.hpp[1
/1
]
1: #line 8960 "./lpsrc/tre.pak"
2: /*
3: tre-stack.h: Stack definitions
4:
5: Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19: */
20:
21:
22:
23:
24:
25:
26:
27: typedef struct tre_stack_rec tre_stack_t;
28:
29: /* Creates a new stack object. `size' is the initial capacity, `max_size'
30: the maximum capacity and `increment' the growth step used with realloc()
31: when all space gets used up; all three count elements (pointer-sized
32: slots), not bytes. Returns the stack object or NULL if out of memory. */
33: tre_stack_t *
34: tre_stack_new(int size, int max_size, int increment);
35:
36: /* Frees the stack object and its slot array. `s' must not be NULL. */
37: void
38: tre_stack_destroy(tre_stack_t *s);
39:
40: /* Returns the current number of objects in the stack. */
41: int
42: tre_stack_num_objects(tre_stack_t *s);
43:
44: /* Pushes `value' on top of stack `s'. Returns REG_ESPACE if out of memory
45: (tries to realloc() more space before failing if maximum size not yet
46: reached). Returns REG_OK if successful. */
47: reg_errcode_t
48: tre_stack_push(tre_stack_t *s, void *value);
49:
50: /* Pops the topmost element off of stack `s' and returns it. The stack must
51: not be empty. */
52: void *
53: tre_stack_pop(tre_stack_t *s);
54:
55:
56: /* Just to save some typing. */
57:
58: do \
59: { \
60: status = tre_stack_push(s, (void *)(value)); \
61: } \
62: while (0)
63:
64:
65: { \
66: status = tre_stack_push(s, (void *)(value)); \
67: if (status != REG_OK) \
68: break; \
69: }
70:
71:
72: { \
73: reg_errcode_t status; \
74: status = tre_stack_push(s, (void *)(value)); \
75: if (status != REG_OK) \
76: return status; \
77: }
78:
79:
80:
81: /* EOF */
Start cpp section to tre/tre_xmalloc.hpp[1
/1
]
1: #line 9042 "./lpsrc/tre.pak"
2: /*
3: xmalloc.h - Simple malloc debugger library API
4:
5: Copyright (C) 2001-2003 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22:
23:
24:
25:
26:
27: /* Version 2.4 and later of GCC define a magical variable `__PRETTY_FUNCTION__'
28: which contains the name of the function currently being defined.
29: This is broken in G++ before version 2.6.
30: C9x has a similar variable called __func__, but prefer the GCC one since
31: it demangles C++ function names.  When neither is available,
32: __XMALLOC_FUNCTION expands to a null `const char *'. */
33: # ifdef __GNUC__
34: #  if (__GNUC__ > 2 || (__GNUC__ == 2 \
35:        && __GNUC_MINOR__ >= (defined __cplusplus ? 6 : 4))
36: #   define __XMALLOC_FUNCTION __PRETTY_FUNCTION__
37: #  else
38: #   define __XMALLOC_FUNCTION ((const char *) 0)
39: #  endif
40: # else
41: #  if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
42: #   define __XMALLOC_FUNCTION __func__
43: #  else
44: #   define __XMALLOC_FUNCTION ((const char *) 0)
45: #  endif
46: # endif
47: /* Allocation wrappers that hand the call site (file, line, function) to the *_impl functions. */
48: #define xmalloc(size) xmalloc_impl(size, __FILE__, __LINE__, \
49: __XMALLOC_FUNCTION)
50: #define xcalloc(nmemb, size) xcalloc_impl(nmemb, size, __FILE__, __LINE__, \
51: __XMALLOC_FUNCTION)
52: #define xfree(ptr) xfree_impl(ptr, __FILE__, __LINE__, __XMALLOC_FUNCTION)
53: #define xrealloc(ptr, new_size) xrealloc_impl(ptr, new_size, __FILE__, \
54: __LINE__, __XMALLOC_FUNCTION)
55: /* Implementations behind the macros above, plus leak reporting and failure injection. */
56: void *xmalloc_impl(size_t size, const char *file, int line, const char *func);
57: void *xcalloc_impl(size_t nmemb, size_t size, const char *file, int line,
58: const char *func);
59: void xfree_impl(void *ptr, const char *file, int line, const char *func);
60: void *xrealloc_impl(void *ptr, size_t new_size, const char *file, int line,
61: const char *func);
62: int xmalloc_dump_leaks(void);
63: void xmalloc_configure(int fail_after);
64:
65: #undef malloc
66: #undef calloc
67: #undef free
68: #undef realloc
69: /* Any direct use of the plain names now expands to a self-explanatory, undeclared identifier. */
70: #define malloc USE_XMALLOC_INSTEAD_OF_MALLOC
71: #define calloc USE_XCALLOC_INSTEAD_OF_CALLOC
72: #define free USE_XFREE_INSTEAD_OF_FREE
73: #define realloc USE_XREALLOC_INSTEAD_OF_REALLOC
74:
75: #else /* !MALLOC_DEBUGGING */
76:
77: #include <stdlib.h>
78: /* Without MALLOC_DEBUGGING the wrappers map straight onto the C library allocator. */
79: #define xmalloc(size) malloc(size)
80: #define xcalloc(nmemb, size) calloc(nmemb, size)
81: #define xfree(ptr) free(ptr)
82: #define xrealloc(ptr, new_size) realloc(ptr, new_size)
83:
84: #endif /* !MALLOC_DEBUGGING */
85:
86: #endif /* _XMALLOC_H */
87:
88: /* EOF */
Start cpp section to tre/tre_xmalloc.cpp[1
/1
]
1: #line 9131 "./lpsrc/tre.pak"
2: /*
3: xmalloc.c - Simple malloc debugger library implementation
4:
5: Copyright (C) 2001-2003 Ville Laurikari <vl@iki.fi>.
6:
7: This program is free software; you can redistribute it and/or modify
8: it under the terms of the GNU General Public License version 2 (June
9: 1991) as published by the Free Software Foundation.
10:
11: This program is distributed in the hope that it will be useful,
12: but WITHOUT ANY WARRANTY; without even the implied warranty of
13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: GNU General Public License for more details.
15:
16: You should have received a copy of the GNU General Public License
17: along with this program; if not, write to the Free Software
18: Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19:
20: */
21:
22: /*
23: TODO:
24: - red zones
25: - group dumps by source location
26: */
27:
28:
29:
30:
31:
32:
33:
34: /*
35: Internal stuff.
36: */
37:
38: typedef struct hashTableItemRec {
39: void *ptr; /* Address of the tracked allocation. */
40: int bytes; /* Size recorded for the allocation. */
41: const char *file; /* Call site: source file, ... */
42: int line; /* ... line number ... */
43: const char *func; /* ... and function name (may be a null pointer). */
44: struct hashTableItemRec *next; /* Next record in the same bucket chain. */
45: } hashTableItem;
46:
47: typedef struct {
48: hashTableItem **table; /* Array of bucket heads, indexed by hash_void_ptr(). */
49: } hashTable;
50:
51: static int xmalloc_peak; /* Highest value xmalloc_current has reached. */
52: int xmalloc_current; /* Sum of the sizes recorded for all live blocks. */
53: static int xmalloc_peak_blocks; /* Highest value xmalloc_current_blocks has reached. */
54: int xmalloc_current_blocks; /* Number of live tracked blocks. */
55: static int xmalloc_fail_after; /* Failure injection countdown: -1 off, 0 fail next, -2 already failed. */
56:
57:
58:
59:
60:
61: /* Allocates an empty table; returns NULL when either allocation fails. */
62: static hashTable *
63: hash_table_new(void)
64: {
65:   hashTable *created;
66:
67:   /* Header first; without it there is nothing to set up. */
68:   created = (hashTable*)malloc(sizeof(*created));
69:   if (created == NULL)
70:     return NULL;
71:
72:   /* TABLE_SIZE zero-filled bucket heads. */
73:   created->table = (hashTableItem**)calloc(TABLE_SIZE, sizeof(*created->table));
74:   if (created->table != NULL)
75:     return created;
76:
77:   /* No bucket array: give the header back and report failure. */
78:   free(created);
79:   return NULL;
80: }
81:
82: /* Folds the address `ptr' into a bucket index between 0 and TABLE_MASK.
83:    Ad-hoc mixing function; its quality is unknown. */
84: static int
85: hash_void_ptr(void *ptr)
86: {
87:   const int rounds = (int)(sizeof(ptr) * 8 / TABLE_BITS);  /* slices per pointer */
88:   int hash = 0;
89:   int i;
90:   for (i = 0; i < rounds; i++)
91:     {
92:       /* Shift by the slice width rather than a hard-coded 8. */
93:       hash ^= (FLX_RAWADDRESS)ptr >> (i * TABLE_BITS);
94:       hash += i * 17;
95:       hash &= TABLE_MASK;
96:     }
97:   return hash;
98: }
99:
100: /* Appends a record for `ptr' to the end of its bucket chain and updates
101:    the allocation statistics. */
102: static void
103: hash_table_add(hashTable *tbl, void *ptr, int bytes,
104:                const char *file, int line, const char *func)
105: {
106:   hashTableItem **link;
107:   hashTableItem *entry;
108:
109:   /* Build the record describing this allocation and its call site. */
110:   entry = (hashTableItem*)malloc(sizeof(*entry));
111:   assert(entry != NULL);
112:   entry->ptr = ptr;
113:   entry->bytes = bytes;
114:   entry->file = file;
115:   entry->line = line;
116:   entry->func = func;
117:   entry->next = NULL;
118:
119:   /* Find the null link that terminates the bucket chain (the bucket head
120:      itself when the chain is empty) and hang the record there. */
121:   link = &tbl->table[hash_void_ptr(ptr)];
122:   while (*link != NULL)
123:     link = &(*link)->next;
124:   *link = entry;
125:
126:   /* Update the running totals and their high-water marks. */
127:   xmalloc_current += bytes;
128:   xmalloc_current_blocks++;
129:   if (xmalloc_peak < xmalloc_current)
130:     xmalloc_peak = xmalloc_current;
131:   if (xmalloc_peak_blocks < xmalloc_current_blocks)
132:     xmalloc_peak_blocks = xmalloc_current_blocks;
133: }
134:
135: /* Removes the record for `ptr' from its bucket chain and takes the block
136:    out of the running totals.  Prints a diagnostic and aborts when `ptr'
137:    is not being tracked. */
138: static void
139: hash_table_del(hashTable *tbl, void *ptr)
140: {
141:   int i;
142:   hashTableItem *item;
143:   hashTableItem *prev;
144:
145:   i = hash_void_ptr(ptr);
146:
147:   /* Search the chain, remembering the predecessor for unlinking.  The
148:      end-of-chain test is essential: without it an untracked pointer that
149:      hashes to a non-empty bucket walks off the chain and dereferences
150:      NULL before the "invalid ptr" report can be made. */
151:   prev = NULL;
152:   item = tbl->table[i];
153:   while (item != NULL && item->ptr != ptr)
154:     {
155:       prev = item;
156:       item = item->next;
157:     }
158:   if (item == NULL)
159:     {
160:       printf("xfree: invalid ptr %p\n", ptr);
161:       abort();
162:     }
163:
164:   /* The block is going away: subtract it from the current totals. */
165:   xmalloc_current -= item->bytes;
166:   xmalloc_current_blocks--;
167:
168:   /* Unlink from the predecessor, or from the bucket head when first. */
169:   if (prev != NULL)
170:     prev->next = item->next;
171:   else
172:     tbl->table[i] = item->next;
173:   free(item);
174: }
175:
176: static hashTable *xmalloc_table = NULL;
177: /* Creates the tracking table and resets all counters while no table exists. */
178: static void
179: xmalloc_init(void)
180: {
181:   if (xmalloc_table == NULL)
182:     {
183:       /* Start every statistic from zero and switch failure injection off. */
184:       xmalloc_peak = xmalloc_current = 0;
185:       xmalloc_peak_blocks = xmalloc_current_blocks = 0;
186:       xmalloc_fail_after = -1;
187:       xmalloc_table = hash_table_new();
188:     }
189:   /* From here on the table and its bucket array must both exist. */
190:   assert(xmalloc_table != NULL);
191:   assert(xmalloc_table->table != NULL);
192: }
193:
194:
195: /*
196: Public API.
197: */
198: /* Arms failure injection; the per-line notes below give the meaning of `fail_after'. */
199: void
200: xmalloc_configure(int fail_after)
201: {
202: xmalloc_init(); /* must come first: the first-ever init resets the counter to -1 */
203: xmalloc_fail_after = fail_after; /* n > 0: n more calls succeed, then one fails; 0: next call fails; -1: never */
204: }
205:
206: /* Prints every allocation that is still tracked, followed by peak usage
207:    statistics, and returns the number of leaked blocks. */
208: int
209: xmalloc_dump_leaks(void)
210: {
211:   int i;
212:   int num_leaks = 0;
213:   int leaked_bytes = 0;
214:   hashTableItem *item;
215:
216:   xmalloc_init();
217:
218:   for (i = 0; i < TABLE_SIZE; i++)
219:     for (item = xmalloc_table->table[i]; item != NULL; item = item->next)
220:       {
221:         /* `func' is a null pointer when the compiler offers no function
222:            name macro; passing that to %s would be undefined. */
223:         printf("%s:%d: %s: %d bytes at %p not freed\n",
224:                item->file, item->line, item->func ? item->func : "(unknown)",
225:                item->bytes, item->ptr);
226:         num_leaks++;
227:         leaked_bytes += item->bytes;
228:       }
229:   if (num_leaks == 0)
230:     printf("No memory leaks.\n");
231:   else
232:     printf("%d unfreed memory chunks, total %d unfreed bytes.\n",
233:            num_leaks, leaked_bytes);
234:   printf("Peak memory consumption %d bytes (%.1f kB, %.1f MB) in %d blocks ",
235:          xmalloc_peak, (double)xmalloc_peak / 1024,
236:          (double)xmalloc_peak / (1024*1024), xmalloc_peak_blocks);
237:   printf("(average ");
238:   if (xmalloc_peak_blocks)
239:     printf("%d", ((xmalloc_peak + xmalloc_peak_blocks / 2)
240:                   / xmalloc_peak_blocks));
241:   else
242:     printf("N/A");
243:   printf(" bytes per block).\n");
244:   return num_leaks;
245: }
246:
247: /* Tracked malloc() with optional injected failures (xmalloc_configure). */
248: void *
249: xmalloc_impl(size_t size, const char *file, int line, const char *func)
250: {
251:   void *ptr;
252:
253:   xmalloc_init();
254:   assert(size > 0);
255:
256:   if (xmalloc_fail_after == 0)
257:     {
258:       /* Injected failure; -2 records that it has happened. */
259:       xmalloc_fail_after = -2;
260:       return NULL;
261:     }
262:   else if (xmalloc_fail_after == -2)
263:     {
264:       /* `func' may be a null pointer, which %s must never receive. */
265:       printf("xmalloc: called after failure from %s:%d: %s\n",
266:              file, line, func ? func : "(unknown)");
267:       assert(0);
268:     }
269:   else if (xmalloc_fail_after > 0)
270:     xmalloc_fail_after--;
271:
272:   ptr = malloc(size);
273:   if (ptr != NULL)
274:     hash_table_add(xmalloc_table, ptr, size, file, line, func);
275:   return ptr;
276: }
277:
278: /* Tracked calloc() with optional injected failures (xmalloc_configure). */
279: void *
280: xcalloc_impl(size_t nmemb, size_t size, const char *file, int line,
281:              const char *func)
282: {
283:   void *ptr;
284:
285:   xmalloc_init();
286:   assert(size > 0);
287:
288:   if (xmalloc_fail_after == 0)
289:     {
290:       /* Injected failure; -2 records that it has happened. */
291:       xmalloc_fail_after = -2;
292:       return NULL;
293:     }
294:   else if (xmalloc_fail_after == -2)
295:     {
296:       /* `func' may be a null pointer, which %s must never receive. */
297:       printf("xcalloc: called after failure from %s:%d: %s\n",
298:              file, line, func ? func : "(unknown)");
299:       assert(0);
300:     }
301:   else if (xmalloc_fail_after > 0)
302:     xmalloc_fail_after--;
303:
304:   ptr = calloc(nmemb, size);
305:   if (ptr != NULL)
306:     hash_table_add(xmalloc_table, ptr, nmemb * size, file, line, func);
307:   return ptr;
308: }
309:
310: void
311: xfree_impl(void *ptr, const char *file, int line, const char *func)
312: {
313:   xmalloc_init();
314:   if (ptr == NULL)
315:     return;                            /* nothing tracked, nothing to free */
316:   hash_table_del(xmalloc_table, ptr);  /* forget the block, then release it */
317:   free(ptr);
318: }
319:
320: /* Tracked realloc() with optional injected failures (xmalloc_configure). */
321: void *
322: xrealloc_impl(void *ptr, size_t new_size, const char *file, int line,
323:               const char *func)
324: {
325:   void *new_ptr;
326:   xmalloc_init();
327:   assert(ptr != NULL);
328:   assert(new_size > 0);
329:   if (xmalloc_fail_after == 0)
330:     {
331:       xmalloc_fail_after = -2;
332:       return NULL;
333:     }
334:   else if (xmalloc_fail_after == -2)
335:     {
336:       /* `func' may be a null pointer, which %s must never receive. */
337:       printf("xrealloc: called after failure from %s:%d: %s\n",
338:              file, line, func ? func : "(unknown)");
339:       assert(0);
340:     }
341:   else if (xmalloc_fail_after > 0)
342:     xmalloc_fail_after--;
343:
344:   new_ptr = realloc(ptr, new_size);
345:   if (new_ptr != NULL)
346:     {
347:       /* The old address is gone: re-register the block where it lives now. */
348:       hash_table_del(xmalloc_table, ptr);
349:       hash_table_add(xmalloc_table, new_ptr, new_size, file, line, func);
350:     }
351:   return new_ptr;
352: }
352:
353:
354:
355: /* EOF */