1. tre

Start python section to spkgs/tre.py[1 /1 ]
     1: #line 6 "./lpsrc/tre.pak"
     2: # Package description for 'tre': headers, sources, docs directory and tests.
     3: rtl_interfaces = [
     4:   'tre/tre_gettext.hpp',
     5:   'tre/tre_regex.hpp',
     6:   'tre/tre_ast.hpp',
     7:   'tre/tre_compile.hpp',
     8:   'tre/tre_config.hpp',
     9:   'tre/tre_filter.hpp',
    10:   'tre/tre_match-utils.hpp',
    11:   'tre/tre_mem.hpp',
    12:   'tre/tre_parse.hpp',
    13:   ]
    14: # Source files under tre/, named without a file extension.
    15: TRE_CPPS = [
    16:   'tre/tre_regcomp',
    17:   'tre/tre_regerror',
    18:   'tre/tre_regexec',
    19:   'tre/tre_ast',
    20:   'tre/tre_compile',
    21:   'tre/tre_filter',
    22:   'tre/tre_match-approx',
    23:   'tre/tre_match-backtrack',
    24:   'tre/tre_match-parallel',
    25:   'tre/tre_mem',
    26:   'tre/tre_parse',
    27:   'tre/tre_stack'
    28:   ]
    29: # cpp_cpps is bound to the same list object as TRE_CPPS (no copy is made).
    30: cpp_cpps = TRE_CPPS
    31: iscr_source = ['lpsrc/tre.pak']
    32: weaver_directory = 'doc/rtl/tre/'
    33: build_macro = "TRE"
    34: unit_tests = glob.glob("test" + os.sep + "tre*.flx")
    35: unit_tests.sort()
    36: # NOTE(review): glob and os are used above but not imported in this section -- presumably supplied by whatever executes this file; confirm.
End python section to spkgs/tre.py[1]
Start data section to config/tre.fpc[1 /1 ]
     1: 
     2: provides_dlib: -ltre_dynamic
     3: provides_slib: -ltre_static
End data section to config/tre.fpc[1]
Start felix section to lib/tre.flx[1 /1 ]
     1: #line 54 "./lpsrc/tre.pak"
     2: #import <flx.flxh>
     3: 
     4: module Tre
     5: {
     6:   open C_hack;
     7:   requires package 'tre';
     8:   header '#include "tre_regex.hpp"';
     9:   type tre_regex_t = "regex_t";
    10:   // Version string, taken from the C call tre_version().
    11:   const tre_version: string = "tre_version()";
    12:   // Raw binding: compile a length-counted pattern with REG_EXTENDED; yields regncomp's status code.
    13:   private fun _tre_regcomp: ptr[tre_regex_t] * string  -> int =
    14:     "regncomp($1,$2.data(),$2.size(),REG_EXTENDED)"
    15:   ;
    16:   // Compile x: Some regex when the status is 0, None for any other status.
    17:   fun tre_regcomp (x:string): opt[tre_regex_t] = {
    18:     var cr: tre_regex_t;
    19:     var res = _tre_regcomp(addr cr, x);
    20:     return
    21:       if res == 0 then Some cr else None[tre_regex_t] endif
    22:     ;
    23:   }
    24:   // Number of match slots tre_regexec asks for: re_nsub plus one.
    25:   fun nsub: ptr[tre_regex_t] -> int = "$1->re_nsub+1";
    26:   // C-side offset type and the start/end offset pair stored per match slot.
    27:   ctypes regoff_t;
    28:   cstruct regmatch_t {
    29:     rm_so: regoff_t;
    30:     rm_eo: regoff_t;
    31:   }
    32:   // Small helpers for regoff_t: pointer offsetting, difference, conversion to int.
    33:   fun add: ptr[char] * regoff_t -> ptr[char] = "$1+$2";
    34:   fun sub: regoff_t * regoff_t -> int = "(int)($1-$2)";
    35:   fun _ctor_int : regoff_t -> int = "int($1)";
    36:   // Raw binding to regnexec with eflags fixed at 0; a fun binds a C expression, so the text carries no trailing ';' (same form as _tre_regcomp).
    37:   private fun _tre_regexec:
    38:     ptr[tre_regex_t] *     // the compiled regex
    39:     string *               // string to search
    40:     int *                  // number of matches to return
    41:     ptr[regmatch_t]        // array to hold matches
    42:     -> int
    43:   =
    44:     "regnexec($1,$2.data(),$2.size(),$3,$4,0)"
    45:   ;
    46:   // Match re_in against x; returns (regnexec status, slot count, match array). The array comes from Carray::array_alloc and is not freed here.
    47:   fun tre_regexec (re_in: tre_regex_t) (x:string): int * int * ptr[regmatch_t] =
    48:   {
    49:     var re = re_in;
    50:     val nmatches = nsub$ addr re;
    51:     var matches = Carray::array_alloc[regmatch_t] nmatches;
    52:     var res = _tre_regexec(addr re, x, nmatches, matches);
    53:     return res,nmatches,matches;
    54:   }
    55: }
    56: 
End felix section to lib/tre.flx[1]
Start felix section to test/tre_01.flx[1 /1 ]
     1: #line 111 "./lpsrc/tre.pak"
     2: #import <flx.flxh>
     3: include "tre.flx";
     4: open Tre;
     5: open C_hack;
     6: open Carray;
     7: // Report the TRE version string.
     8: print$ "Using tre " tre_version; endl;
     9: // Compile a pattern that contains one parenthesised group.
    10: var r = tre_regcomp("(a|b)*abb");
    11: print "Done tre compile"; endl;
    12: // Report whether compilation produced a regex.
    13: print
    14:   match r with
    15:   | Some _ => "Compiled"
    16:   | None => "failed"
    17:   endmatch
    18: ;
    19: endl;
    20: 
    21: // Unwrap the compiled regex for the matching step below.
    22: var re : tre_regex_t =
    23:   match r with
    24:   | Some ?re => re
    25:   | None => re // HACK! when compilation failed this just yields the variable being initialised
    26:   endmatch
    27: ;
    28: // Run the match and print the status code and the number of slots.
    29: var s = "aabbabababb";
    30: res,n,a := tre_regexec re s;
    31: print "Result = "; print res; endl;
    32: print "nmatches = "; print n; endl;
    33: // Print every slot; a start offset of -1 is reported as "nomatch".
    34: var i : int;
    35: for_each { i=0; } { i<n } { ++i; }
    36:   {
    37:     if int(a.[i].rm_so) == -1 do
    38:       print i; print " -> nomatch\n";
    39:     else
    40:       print i; print "-> match '";
    41:       start := int(a.[i].rm_so);
    42:       finish := int(a.[i].rm_eo);
    43:       print s.[start to finish];
    44:       print "'"; endl;
    45:     done;
    46:   }
    47: ;
    48: 
    49: print "Finished"; endl;
    50: 
End felix section to test/tre_01.flx[1]
Start data section to test/tre_01.expect[1 /1 ]
     1: Using tre TRE 0.7.2 (GPL)
     2: Done tre compile
     3: Compiled
     4: Result = 0
     5: nmatches = 2
     6: 0-> match 'aabbabababb'
     7: 1-> match 'b'
     8: Finished
End data section to test/tre_01.expect[1]
Start cpp section to rtl/flx_target_tre_config.hpp[1 /1 ]
     1: #line 172 "./lpsrc/tre.pak"
     2: #ifndef __FLX_TARGET_TRE_CONFIG_GUARD__
     3: #define __FLX_TARGET_TRE_CONFIG_GUARD__
     4: #include "flx_rtl_config.hpp"
     5: #ifdef BUILD_TRE
     6: #define TRE_EXTERN FLX_EXPORT
     7: #else
     8: #define TRE_EXTERN FLX_IMPORT
     9: #endif /* BUILD_TRE */
    10: #endif /* __FLX_TARGET_TRE_CONFIG_GUARD__ */
    11: /* NOTE: the include guard ends on the line above, so the configure-generated settings that follow are outside it. */
    12: /* config.h.  Generated by configure.  */
    13: /* config.h.in.  Generated from configure.ac by autoheader.  */
    14: 
    15: /* Define to one of `_getb67', `GETB67', `getb67' for Cray-2 and Cray-YMP
    16:    systems. This function is required for `alloca.c' support on those systems.
    17:    */
    18: /* #undef CRAY_STACKSEG_END */
    19: 
    20: /* Define to 1 if using `alloca.c'. */
    21: /* #undef C_ALLOCA */
    22: 
    23: /* Define to 1 if translation of program messages to the user's native
    24:    language is requested. */
    25: #define ENABLE_NLS 0
    26: 
    27: /* Define to 1 if you have `alloca', as a function or macro. */
    28: //#define HAVE_ALLOCA 1
    29: 
    30: /* Define to 1 if you have <alloca.h> and it should be used (not on Ultrix).
    31:    */
    32: //#define HAVE_ALLOCA_H 1
    33: 
    34: /* Define if the GNU dcgettext() function is already present or preinstalled.
    35:    */
    36: //#define HAVE_DCGETTEXT 1
    37: 
    38: /* Define to 1 if you have the <dlfcn.h> header file. */
    39: //#define HAVE_DLFCN_H 1
    40: 
    41: /* Define to 1 if you have the <getopt.h> header file. */
    42: //#define HAVE_GETOPT_H 1
    43: 
    44: /* Define to 1 if you have the `getopt_long' function. */
    45: //#define HAVE_GETOPT_LONG 1
    46: 
    47: /* Define if the GNU gettext() function is already present or preinstalled. */
    48: //#define HAVE_GETTEXT 1
    49: 
    50: /* Define if you have the iconv() function. */
    51: /* #undef HAVE_ICONV */
    52: 
    53: /* Define to 1 if you have the <inttypes.h> header file. */
    54: //#define HAVE_INTTYPES_H 1
    55: 
    56: /* Define to 1 if you have the `isascii' function. */
    57: #define HAVE_ISASCII 1
    58: 
    59: /* Define to 1 if you have the `isblank' function. */
    60: /* RF: had to comment this out to get cl.exe version working */
    61: /* #define HAVE_ISBLANK 1 */
    62: 
    63: /* Define to 1 if you have the `iswascii' function or macro. */
    64: /* #undef HAVE_ISWASCII */
    65: 
    66: /* Define to 1 if you have the `iswblank' function or macro. */
    67: //#define HAVE_ISWBLANK 1
    68: 
    69: /* Define to 1 if you have the `iswctype' function or macro. */
    70: //#define HAVE_ISWCTYPE 1
    71: 
    72: /* Define to 1 if you have the `iswlower' function or macro. */
    73: //#define HAVE_ISWLOWER 1
    74: 
    75: /* Define to 1 if you have the `iswupper' function or macro. */
    76: //#define HAVE_ISWUPPER 1
    77: 
    78: /* Define to 1 if you have the <libutf8.h> header file. */
    79: /* #undef HAVE_LIBUTF8_H */
    80: 
    81: /* Define to 1 if you have the `mbrtowc' function or macro. */
    82: //#define HAVE_MBRTOWC 1
    83: 
    84: /* Define to 1 if the system has the type `mbstate_t'. */
    85: //#define HAVE_MBSTATE_T 1
    86: 
    87: /* Define to 1 if you have the `mbtowc' function or macro. */
    88: /* #undef HAVE_MBTOWC */
    89: 
    90: /* Define to 1 if you have the <memory.h> header file. */
    91: //#define HAVE_MEMORY_H 1
    92: 
    93: /* Define to 1 if you have the <regex.h> header file. */
    94: /* #undef HAVE_REGEX_H */
    95: 
    96: /* Define to 1 if the system has the type `reg_errcode_t'. */
    97: /* #undef HAVE_REG_ERRCODE_T */
    98: 
    99: /* Define to 1 if you have the <stdint.h> header file. */
   100: //#define HAVE_STDINT_H 1
   101: 
   102: /* Define to 1 if you have the <stdlib.h> header file. */
   103: #define HAVE_STDLIB_H 1
   104: 
   105: /* Define to 1 if you have the <strings.h> header file. */
   106: //#define HAVE_STRINGS_H 1
   107: 
   108: /* Define to 1 if you have the <string.h> header file. */
   109: #define HAVE_STRING_H 1
   110: 
   111: /* Define to 1 if you have the <sys/stat.h> header file. */
   112: //#define HAVE_SYS_STAT_H 1
   113: 
   114: /* Define to 1 if you have the <sys/types.h> header file. */
   115: //#define HAVE_SYS_TYPES_H 1
   116: 
   117: /* Define to 1 if you have the `towlower' function or macro. */
   118: //#define HAVE_TOWLOWER 1
   119: 
   120: /* Define to 1 if you have the `towupper' function or macro. */
   121: //#define HAVE_TOWUPPER 1
   122: 
   123: /* Define to 1 if you have the <unistd.h> header file. */
   124: //#define HAVE_UNISTD_H 1
   125: 
   126: /* Define to 1 if you have the <wchar.h> header file. */
   127: //#define HAVE_WCHAR_H 1
   128: 
   129: /* Define to 1 if the system has the type `wchar_t'. */
   130: //#define HAVE_WCHAR_T 1
   131: 
   132: /* Define to 1 if you have the `wcschr' function or macro. */
   133: //#define HAVE_WCSCHR 1
   134: 
   135: /* Define to 1 if you have the `wcscpy' function or macro. */
   136: //#define HAVE_WCSCPY 1
   137: 
   138: /* Define to 1 if you have the `wcslen' function or macro. */
   139: //#define HAVE_WCSLEN 1
   140: 
   141: /* Define to 1 if you have the `wcsncpy' function or macro. */
   142: //#define HAVE_WCSNCPY 1
   143: 
   144: /* Define to 1 if you have the `wcsrtombs' function or macro. */
   145: //#define HAVE_WCSRTOMBS 1
   146: 
   147: /* Define to 1 if you have the `wcstombs' function or macro. */
   148: /* #undef HAVE_WCSTOMBS */
   149: 
   150: /* Define to 1 if you have the `wctype' function or macro. */
   151: //#define HAVE_WCTYPE 1
   152: 
   153: /* Define to 1 if you have the <wctype.h> header file. */
   154: //#define HAVE_WCTYPE_H 1
   155: 
   156: /* Define to 1 if the system has the type `wint_t'. */
   157: //#define HAVE_WINT_T 1
   158: 
   159: /* Define if you want to disable debug assertions. */
   160: #define NDEBUG 1
   161: 
   162: /* Name of package */
   163: #define PACKAGE "tre"
   164: 
   165: /* Define to the address where bug reports for this package should be sent. */
   166: #define PACKAGE_BUGREPORT "Ville Laurikari <vl@iki.fi>"
   167: 
   168: /* Define to the full name of this package. */
   169: #define PACKAGE_NAME "TRE"
   170: 
   171: /* Define to the full name and version of this package. */
   172: #define PACKAGE_STRING "TRE 0.7.2"
   173: 
   174: /* Define to the one symbol short name of this package. */
   175: #define PACKAGE_TARNAME "tre"
   176: 
   177: /* Define to the version of this package. */
   178: #define PACKAGE_VERSION "0.7.2"
   179: 
   180: /* If using the C implementation of alloca, define if you know the
   181:    direction of stack growth for your system; otherwise it will be
   182:    automatically deduced at run-time.
   183:         STACK_DIRECTION > 0 => grows toward higher addresses
   184:         STACK_DIRECTION < 0 => grows toward lower addresses
   185:         STACK_DIRECTION = 0 => direction of growth unknown */
   186: /* #undef STACK_DIRECTION */
   187: 
   188: /* Define to 1 if you have the ANSI C header files. */
   189: #define STDC_HEADERS 1
   190: 
   191: /* Define if you want to enable approximate matching functionality. */
   192: #define TRE_APPROX 1
   193: 
   194: /* Define if you want TRE to print debug messages to stdout. */
   195: /* #undef TRE_DEBUG */
   196: 
   197: /* Set to 1 to enable multibyte character set support (it is 0 here, so test it with #if, not #ifdef). */
   198: #define TRE_MULTIBYTE 0
   199: 
   200: /* Define to a field in the regex_t struct where TRE should store a pointer to
   201:    the internal tre_tnfa_t structure */
   202: #define TRE_REGEX_T_FIELD value
   203: 
   204: /* Define to the absolute path to the system regex.h */
   205: /* #undef TRE_SYSTEM_REGEX_H_PATH */
   206: 
   207: /* Define if you want TRE to use alloca() instead of malloc() when allocating
   208:    memory needed for regexec operations. */
   209: //#define TRE_USE_ALLOCA 1
   210: 
   211: /* Define to include the system regex.h from TRE regex.h */
   212: /* #undef TRE_USE_SYSTEM_REGEX_H */
   213: 
   214: /* TRE version string. */
   215: #define TRE_VERSION "0.7.2"
   216: 
   217: /* TRE version level 1. */
   218: #define TRE_VERSION_1 0
   219: 
   220: /* TRE version level 2. */
   221: #define TRE_VERSION_2 7
   222: 
   223: /* TRE version level 3. */
   224: #define TRE_VERSION_3 2
   225: 
   226: /* Define to enable wide character (wchar_t) support. */
   227: //#define TRE_WCHAR 1
   228: 
   229: /* Version number of package */
   230: #define VERSION "0.7.2"
   231: 
   232: /* Define to the maximum value of wchar_t if not already defined elsewhere */
   233: /* #undef WCHAR_MAX */
   234: 
   235: /* Define if wchar_t is signed */
   236: /* #undef WCHAR_T_SIGNED */
   237: 
   238: /* Define if wchar_t is unsigned */
   239: /* #undef WCHAR_T_UNSIGNED */
   240: 
   241: /* Number of bits in a file offset, on hosts where this is settable. */
   242: /* #undef _FILE_OFFSET_BITS */
   243: 
   244: /* Define to enable GNU extensions in glibc */
   245: //#define _GNU_SOURCE 1
   246: 
   247: /* Define for large files, on AIX-style hosts. */
   248: /* #undef _LARGE_FILES */
   249: 
   250: /* Define on IRIX */
   251: /* #undef _REGCOMP_INTERNAL */
   252: 
   253: /* Define to empty if `const' does not conform to ANSI C. */
   254: /* #undef const */
   255: 
   256: /* Define to `__inline__' or `__inline' if that's what the C compiler
   257:    calls it, or to nothing if 'inline' is not supported under any name.  */
   258: #ifndef __cplusplus
   259: /* #undef inline */
   260: #endif
   261: 
End cpp section to rtl/flx_target_tre_config.hpp[1]
Start cpp section to tre/tre_gettext.hpp[1 /1 ]
     1: #line 434 "./lpsrc/tre.pak"
     2: /* Convenience header for conditional use of GNU <libintl.h>.
     3:    Copyright (C) 1995-1998, 2000-2002 Free Software Foundation, Inc.
     4: 
     5:    This program is free software; you can redistribute it and/or modify it
     6:    under the terms of the GNU Library General Public License as published
     7:    by the Free Software Foundation; either version 2, or (at your option)
     8:    any later version.
     9: 
    10:    This program is distributed in the hope that it will be useful,
    11:    but WITHOUT ANY WARRANTY; without even the implied warranty of
    12:    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    13:    Library General Public License for more details.
    14: 
    15:    You should have received a copy of the GNU Library General Public
    16:    License along with this program; if not, write to the Free Software
    17:    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
    18:    USA.  */
    19: 
    20: #ifndef _LIBGETTEXT_H
    21: #define _LIBGETTEXT_H 1
    22: 
    23: /* NLS can be disabled through the configure --disable-nls option.  */
    24: #if ENABLE_NLS
    25: 
    26: /* Get declarations of GNU message catalog functions.  */
    27: # include <libintl.h>
    28: 
    29: #else
    30: 
    31: /* Solaris /usr/include/locale.h includes /usr/include/libintl.h, which
    32:    chokes if dcgettext is defined as a macro.  So include it now, to make
    33:    later inclusions of <locale.h> a NOP.  We don't include <libintl.h>
    34:    as well because people using "gettext.h" will not include <libintl.h>,
    35:    and also including <libintl.h> would fail on SunOS 4, whereas <locale.h>
    36:    is OK.  */
    37: #if defined(__sun)
    38: # include <locale.h>
    39: #endif
    40: 
    41: /* Disabled NLS.
    42:    The casts to 'const char *' serve the purpose of producing warnings
    43:    for invalid uses of the value returned from these functions.
    44:    On pre-ANSI systems without 'const', the config.h file is supposed to
    45:    contain "#define const".  */
    46: # define gettext(Msgid) ((const char *) (Msgid))
    47: # define dgettext(Domainname, Msgid) ((const char *) (Msgid))
    48: # define dcgettext(Domainname, Msgid, Category) ((const char *) (Msgid))
    49: # define ngettext(Msgid1, Msgid2, N) \
    50:     ((N) == 1 ? (const char *) (Msgid1) : (const char *) (Msgid2))
    51: # define dngettext(Domainname, Msgid1, Msgid2, N) \
    52:     ((N) == 1 ? (const char *) (Msgid1) : (const char *) (Msgid2))
    53: # define dcngettext(Domainname, Msgid1, Msgid2, N, Category) \
    54:     ((N) == 1 ? (const char *) (Msgid1) : (const char *) (Msgid2))
    55: # define textdomain(Domainname) ((const char *) (Domainname))
    56: # define bindtextdomain(Domainname, Dirname) ((const char *) (Dirname))
    57: # define bind_textdomain_codeset(Domainname, Codeset) ((const char *) (Codeset))
    58: 
    59: #endif
    60: 
    61: /* A pseudo function call that serves as a marker for the automated
    62:    extraction of messages, but does not call gettext().  The run-time
    63:    translation is done at a different place in the code.
    64:    The argument, String, should be a literal string.  Concatenated strings
    65:    and other string expressions won't work.
    66:    The macro's expansion is not parenthesized, so that it is suitable as
    67:    initializer for static 'char[]' or 'const char[]' variables.  */
    68: #define gettext_noop(String) String
    69: 
    70: #endif /* _LIBGETTEXT_H */
End cpp section to tre/tre_gettext.hpp[1]
Start cpp section to tre/tre_regcomp.cpp[1 /1 ]
     1: #line 505 "./lpsrc/tre.pak"
     2: /*
     3:   regcomp.c - TRE POSIX compatible regex compilation functions.
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #include "flx_target_tre_config.hpp"
    23: 
    24: #include <string.h>
    25: #include <errno.h>
    26: #include <stdlib.h>
    27: 
    28: #include "tre_regex.hpp"
    29: #include "tre_internal.hpp"
    30: #include "tre_xmalloc.hpp"
    31: /* Compile the first n bytes of regex into preg with the given cflags. Returns tre_compile()'s result, or REG_ESPACE / REG_BADPAT from the wide-character conversion path. */
    32: int
    33: regncomp(regex_t *preg, const char *regex, size_t n, int cflags)
    34: {
    35:   int ret;
    36: #if TRE_WCHAR
    37:   tre_char_t *wregex;
    38:   int wlen;
    39:   /* Temporary wide copy of the pattern: n characters plus a terminator. */
    40:   wregex = (tre_char_t*)xmalloc(sizeof(tre_char_t) * (n + 1));
    41:   if (wregex == NULL)
    42:     return REG_ESPACE;
    43: 
    44:   /* If the current locale uses the standard single byte encoding of
    45:      characters, we don't do a multibyte string conversion.  If we did,
    46:      many applications which use the default locale would break since
    47:      the default "C" locale uses the 7-bit ASCII character set, and
    48:      all characters with the eighth bit set would be considered invalid. */
    49: #if TRE_MULTIBYTE
    50:   if (TRE_MB_CUR_MAX == 1)
    51: #endif /* TRE_MULTIBYTE */
    52:     {
    53:       unsigned int i;
    54:       const unsigned char *str = (unsigned char *)regex;
    55:       tre_char_t *wstr = wregex;
    56:       /* Widen byte by byte; reading through unsigned char keeps values non-negative. */
    57:       for (i = 0; i < n; i++)
    58:         *(wstr++) = *(str++);
    59:       wlen = n;
    60:     }
    61: #if TRE_MULTIBYTE
    62:   else
    63:     {
    64:       int consumed;
    65:       tre_char_t *wcptr = wregex;
    66: #ifdef HAVE_MBSTATE_T
    67:       mbstate_t state;
    68:       memset(&state, '\0', sizeof(state));
    69: #endif /* HAVE_MBSTATE_T */
    70:       while (n > 0)
    71:         {
    72:           consumed = tre_mbrtowc(wcptr, regex, n, &state);
    73:           /* 0: accepted only for an embedded NUL byte; -1: conversion error; -2: incomplete trailing character. */
    74:           switch (consumed)
    75:             {
    76:             case 0:
    77:               if (*regex == '\0')
    78:                 consumed = 1;
    79:               else
    80:                 {
    81:                   xfree(wregex);
    82:                   return REG_BADPAT;
    83:                 }
    84:               break;
    85:             case -1:
    86:               DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno)));
    87:               xfree(wregex);
    88:               return REG_BADPAT;
    89:             case -2:
    90:               /* The last character wasn't complete.  Let's not call it a
    91:                  fatal error. */
    92:               consumed = n;
    93:               break;
    94:             }
    95:           regex += consumed;
    96:           n -= consumed;
    97:           wcptr++;
    98:         }
    99:       wlen = wcptr - wregex;
   100:     }
   101: #endif /* TRE_MULTIBYTE */
   102:   /* Terminate the wide copy, compile it, then release the temporary buffer. */
   103:   wregex[wlen] = L'\0';
   104:   ret = tre_compile(preg, wregex, wlen, cflags);
   105:   xfree(wregex);
   106: #else /* !TRE_WCHAR */
   107:   ret = tre_compile(preg, (const tre_char_t*)regex, n, cflags);
   108: #endif /* !TRE_WCHAR */
   109: 
   110:   return ret;
   111: }
   112: /* NUL-terminated variant: takes the length from strlen (0 when regex is NULL) and defers to regncomp. */
   113: int
   114: regcomp(regex_t *preg, const char *regex, int cflags)
   115: {
   116:   return regncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
   117: }
   118: 
   119: /* Wide-character entry points, built only when TRE_WCHAR is defined. regwncomp passes an n-character pattern straight to tre_compile. */
   120: #ifdef TRE_WCHAR
   121: int
   122: regwncomp(regex_t *preg, const wchar_t *regex, size_t n, int cflags)
   123: {
   124:   return tre_compile(preg, regex, n, cflags);
   125: }
   126: /* As regwncomp, with the length taken from wcslen (0 when regex is NULL). */
   127: int
   128: regwcomp(regex_t *preg, const wchar_t *regex, int cflags)
   129: {
   130:   return tre_compile(preg, regex, regex ? wcslen(regex) : 0, cflags);
   131: }
   132: #endif /* TRE_WCHAR */
   133: /* Release a compiled regex by handing preg to tre_free(). */
   134: void
   135: regfree(regex_t *preg)
   136: {
   137:   tre_free(preg);
   138: }
   139: 
   140: /* EOF */
End cpp section to tre/tre_regcomp.cpp[1]
Start cpp section to tre/tre_regerror.cpp[1 /1 ]
     1: #line 646 "./lpsrc/tre.pak"
     2: /*
     3:   regerror.c - POSIX regerror() implementation for TRE.
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #include "flx_target_tre_config.hpp"
    23: 
    24: #include <string.h>
    25: #ifdef HAVE_WCHAR_H
    26: #include <wchar.h>
    27: #endif /* HAVE_WCHAR_H */
    28: #ifdef HAVE_WCTYPE_H
    29: #include <wctype.h>
    30: #endif /* HAVE_WCTYPE_H */
    31: 
    32: #include "tre_internal.hpp"
    33: #include "tre_regex.hpp"
    34: #include "tre_gettext.hpp"
    35: #define _(String) dgettext(PACKAGE, String)
    36: #define gettext_noop(String) String
    37: 
    38: /* Error message strings for error codes listed in `regex.h'.  This list
    39:    needs to be in sync with the codes listed there, naturally. */
    40: static const char *tre_error_messages[] =
    41:   { gettext_noop("No error"),                      /* REG_OK */
    42:     gettext_noop("No match"),                      /* REG_NOMATCH */
    43:     gettext_noop("Invalid regexp"),                /* REG_BADPAT */
    44:     gettext_noop("Unknown collating element"),     /* REG_ECOLLATE */
    45:     gettext_noop("Unknown character klass name"),  /* REG_ECTYPE */
    46:     gettext_noop("Trailing backslash"),            /* REG_EESCAPE */
    47:     gettext_noop("Invalid back reference"),        /* REG_ESUBREG */
    48:     gettext_noop("Missing ']'"),                   /* REG_EBRACK */
    49:     gettext_noop("Missing ')'"),                   /* REG_EPAREN */
    50:     gettext_noop("Missing '}'"),                   /* REG_EBRACE */
    51:     gettext_noop("Invalid contents of {}"),        /* REG_BADBR */
    52:     gettext_noop("Invalid character range"),       /* REG_ERANGE */
    53:     gettext_noop("Out of memory"),                 /* REG_ESPACE */
    54:     gettext_noop("XXX")                            /* REG_BADRPT */
    55:   };
    56: /* Copy the message for errcode into errbuf (at most errbuf_size bytes, always NUL-terminated) and return the size needed for the full message including its NUL. preg is not used. */
    57: size_t
    58: regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size)
    59: {
    60:   const char *err;
    61:   size_t err_len;
    62:   /* Codes outside the table fall back to a generic message. */
    63:   if (errcode >= 0
    64:       && errcode < (sizeof(tre_error_messages) / sizeof(*tre_error_messages)))
    65:     err = gettext(tre_error_messages[errcode]);
    66:   else
    67:     err = gettext("Unknown error");
    68:   /* err_len includes the terminating NUL; nothing is copied when errbuf is NULL or errbuf_size is 0. */
    69:   err_len = strlen(err) + 1;
    70:   if (errbuf_size > 0 && errbuf != NULL)
    71:     {
    72:       if (err_len > errbuf_size)
    73:         {
    74:           strncpy(errbuf, err, errbuf_size - 1);
    75:           errbuf[errbuf_size - 1] = '\0';
    76:         }
    77:       else
    78:         {
    79:           strcpy(errbuf, err);
    80:         }
    81:     }
    82:   return err_len;
    83: }
    84: 
    85: /* EOF */
End cpp section to tre/tre_regerror.cpp[1]
Start cpp section to tre/tre_regexec.cpp[1 /1 ]
     1: #line 732 "./lpsrc/tre.pak"
     2: /*
     3:   regexec.c - TRE POSIX compatible matching functions (and more).
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #include "flx_target_tre_config.hpp"
    23: 
    24: #ifdef TRE_USE_ALLOCA
    25: /* AIX requires this to be the first thing in the file.  */
    26: #ifndef __GNUC__
    27: # if HAVE_ALLOCA_H
    28: #  include <alloca.h>
    29: # else
    30: #  ifdef _AIX
    31:  #pragma alloca
    32: #  else
    33: #   ifndef alloca /* predefined by HP cc +Olibcalls */
    34: char *alloca ();
    35: #   endif
    36: #  endif
    37: # endif
    38: #endif
    39: #endif /* TRE_USE_ALLOCA */
    40: 
    41: #include <assert.h>
    42: #include <stdlib.h>
    43: #include <string.h>
    44: #ifdef HAVE_WCHAR_H
    45: #include <wchar.h>
    46: #endif /* HAVE_WCHAR_H */
    47: #ifdef HAVE_WCTYPE_H
    48: #include <wctype.h>
    49: #endif /* HAVE_WCTYPE_H */
    50: #ifndef TRE_WCHAR
    51: #include <ctype.h>
    52: #endif /* !TRE_WCHAR */
    53: #ifdef HAVE_MALLOC_H
    54: #include <malloc.h>
    55: #endif /* HAVE_MALLOC_H */
    56: #include <limits.h>
    57: 
    58: #include "tre_regex.hpp"
    59: #include "tre_internal.hpp"
    60: #include "tre_filter.hpp"
    61: #include "tre_xmalloc.hpp"
    62: 
    63: 
    64: /* Fills the first nmatch entries of the POSIX.2 regmatch_t array pmatch from the
    65:    TNFA tag values in tags and the match end offset match_eo. */
    66: void
    67: tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
    68:                 const tre_tnfa_t *tnfa, int *tags, int match_eo)
    69: {
    70:   tre_submatch_data_t *submatch_data;
    71:   unsigned int i, j;
    72:   int *parents;
    73:   /* i ends up as the number of slots filled from the tags; it stays 0 when match_eo is negative or REG_NOSUB is set. */
    74:   i = 0;
    75:   if (match_eo >= 0 && !(cflags & REG_NOSUB))
    76:     {
    77:       /* Construct submatch offsets from the tags. */
    78:       DPRINT(("end tag = t%d = %d\n", tnfa->end_tag, match_eo));
    79:       submatch_data = tnfa->submatch_data;
    80:       while (i < tnfa->num_submatches && i < nmatch)
    81:         {
    82:           if (submatch_data[i].so_tag == tnfa->end_tag)
    83:             pmatch[i].rm_so = match_eo;
    84:           else
    85:             pmatch[i].rm_so = tags[submatch_data[i].so_tag];
    86: 
    87:           if (submatch_data[i].eo_tag == tnfa->end_tag)
    88:             pmatch[i].rm_eo = match_eo;
    89:           else
    90:             pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
    91: 
    92:           /* If either of the endpoints were not used, this submatch
    93:              was not part of the match. */
    94:           if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
    95:             pmatch[i].rm_so = pmatch[i].rm_eo = -1;
    96: 
    97:           DPRINT(("pmatch[%d] = {t%d = %d, t%d = %d}\n", i,
    98:                   submatch_data[i].so_tag, pmatch[i].rm_so,
    99:                   submatch_data[i].eo_tag, pmatch[i].rm_eo));
   100:           i++;
   101:         }
   102:       /* Reset all submatches that are not within all of their parent
   103:          submatches. */
   104:       i = 0;
   105:       while (i < tnfa->num_submatches && i < nmatch)
   106:         {
   107:           if (pmatch[i].rm_eo == -1)
   108:             assert(pmatch[i].rm_so == -1);
   109:           assert(pmatch[i].rm_so <= pmatch[i].rm_eo); /* asserts compile to nothing when NDEBUG is defined, as flx_target_tre_config.hpp does */
   110:           /* Check this slot against each listed parent; the parents list ends at a negative entry. */
   111:           parents = submatch_data[i].parents;
   112:           if (parents != NULL)
   113:             for (j = 0; parents[j] >= 0; j++)
   114:               {
   115:                 DPRINT(("pmatch[%d] parent %d\n", i, parents[j]));
   116:                 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
   117:                     || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
   118:                   pmatch[i].rm_so = pmatch[i].rm_eo = -1;
   119:               }
   120:           i++;
   121:         }
   122:     }
   123:   /* Mark every remaining slot, from i up to nmatch, as unused. */
   124:   while (i < nmatch)
   125:     {
   126:       pmatch[i].rm_so = -1;
   127:       pmatch[i].rm_eo = -1;
   128:       i++;
   129:     }
   130: }
   131: 
   132: 
   133: /*
   134:   Wrapper functions for POSIX compatible regexp matching.
   135: */
   136: /* Returns the have_backrefs flag of the TNFA stored in preg's TRE_REGEX_T_FIELD member. */
   137: int
   138: tre_have_backrefs(const regex_t *preg)
   139: {
   140:   tre_tnfa_t *tnfa = (tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
   141:   return tnfa->have_backrefs;
   142: }
   143: 
   144: int
   145: tre_have_approx(const regex_t *preg)
   146: {
   147:   /* The compiled automaton is reached through the internal-use field. */
   148:   return ((const tre_tnfa_t*)preg->TRE_REGEX_T_FIELD)->have_approx;
   149: }
   150: /* Run the matcher selected by `tnfa' and `eflags', then fill `pmatch'. */
   151: static int
   152: tre_match(const tre_tnfa_t *tnfa, const void *string, size_t len,
   153:           tre_str_type_t type, size_t nmatch, regmatch_t pmatch[],
   154:           int eflags)
   155: {
   156:   reg_errcode_t status;
   157:   int *tags = NULL, eo;
   158:   if (tnfa->num_tags > 0 && nmatch > 0)
   159:     {
   160: #ifdef TRE_USE_ALLOCA
   161:       tags = (int*)alloca(sizeof(*tags) * tnfa->num_tags);
   162: #else /* !TRE_USE_ALLOCA */
   163:       tags = (int*)xmalloc(sizeof(*tags) * tnfa->num_tags);
   164: #endif /* !TRE_USE_ALLOCA */
   165:       if (tags == NULL)
   166:         return REG_ESPACE;
   167:     }
   168: 
   169:   /* Dispatch to the appropriate matcher. */
   170:   if (tnfa->have_backrefs || eflags & REG_BACKTRACKING_MATCHER)
   171:     {
   172:       /* The regex has back references, use the backtracking matcher.
   173:          It requires rewind and compare capabilities from a user input
   174:          stream.  Report their absence through `status' rather than an
   175:          early return so that `tags' is still released below. */
   176:       const tre_str_source *source = (const tre_str_source*)string;
   177:       if (type == STR_USER
   178:           && (source->rewind == NULL || source->compare == NULL))
   179:         status = REG_BADPAT;
   180:       else
   181:         status = tre_tnfa_run_backtrack(tnfa, string, len, type,
   182:                                         tags, eflags, &eo);
   183:     }
   184: #ifdef TRE_APPROX
   185:   else if (tnfa->have_approx || eflags & REG_APPROX_MATCHER)
   186:     {
   187:       /* The regex uses approximate matching, use the approximate matcher. */
   188:       regamatch_t match;
   189:       regaparams_t params;
   190:       regaparams_default(&params);
   191:       params.max_err = 0;
   192:       params.max_cost = 0;
   193:       status = tre_tnfa_run_approx(tnfa, string, len, type, tags,
   194:                                    &match, params, eflags, &eo);
   195:     }
   196: #endif /* TRE_APPROX */
   197:   else
   198:     {
   199:       /* Exact matching, no back references, use the parallel matcher. */
   200:       status = tre_tnfa_run_parallel(tnfa, string, len, type,
   201:                                      tags, eflags, &eo);
   202:     }
   203: 
   204:   if (status == REG_OK)
   205:     /* A match was found, so fill the submatch registers. */
   206:     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
   207: #ifndef TRE_USE_ALLOCA
   208:   if (tags)
   209:     xfree(tags);
   210: #endif /* !TRE_USE_ALLOCA */
   211:   return status;
   212: }
   213: 
   214: int
   215: regnexec(const regex_t *preg, const char *str, size_t len,
   216:          size_t nmatch, regmatch_t pmatch[], int eflags)
   217: {
   218:   /* STR_BYTE when TRE_MB_CUR_MAX is 1, otherwise STR_MBS. */
   219:   tre_str_type_t str_kind = (TRE_MB_CUR_MAX != 1) ? STR_MBS : STR_BYTE;
   220:   const tre_tnfa_t *nfa = (const tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
   221:   return tre_match(nfa, str, len, str_kind, nmatch, pmatch, eflags);
   222: }
   223: /* Same as regnexec() with the length argument given as (size_t)-1. */
   224: int
   225: regexec(const regex_t *preg, const char *str,
   226:         size_t nmatch, regmatch_t pmatch[], int eflags)
   227: {
   228:   return regnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
   229: }
   230: 
   231: 
   232: #ifdef TRE_WCHAR
   233: /* Length-bounded match of a wide-character string (type STR_WIDE). */
   234: int
   235: regwnexec(const regex_t *preg, const wchar_t *str, size_t len,
   236:           size_t nmatch, regmatch_t pmatch[], int eflags)
   237: {
   238:   const tre_tnfa_t *nfa = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
   239:   return tre_match(nfa, str, len, STR_WIDE, nmatch, pmatch, eflags);
   240: }
   241: /* Same as regwnexec() with the length argument given as (size_t)-1. */
   242: int
   243: regwexec(const regex_t *preg, const wchar_t *str,
   244:          size_t nmatch, regmatch_t pmatch[], int eflags)
   245: {
   246:   return regwnexec(preg, str, (size_t)-1, nmatch, pmatch, eflags);
   247: }
   248: 
   249: #endif /* TRE_WCHAR */
   250: /* Match input drawn from a tre_str_source; the length passed is (size_t)-1. */
   251: int
   252: reguexec(const regex_t *preg, const tre_str_source *str,
   253:          size_t nmatch, regmatch_t pmatch[], int eflags)
   254: {
   255:   const tre_tnfa_t *nfa = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
   256:   return tre_match(nfa, str, (size_t)-1, STR_USER, nmatch, pmatch, eflags);
   257: }
   258: 
   259: 
   260: #ifdef TRE_APPROX
   261: 
   262: /*
   263:   Wrapper functions for approximate regexp matching.
   264: */
   265: 
   266: static int
   267: tre_match_approx(const tre_tnfa_t *tnfa, const void *string, size_t len,
   268:                  tre_str_type_t type, regamatch_t *match, regaparams_t params,
   269:                  int eflags)
   270: {
   271:   reg_errcode_t status;
   272:   int *tags = NULL, eo;
   273:   /* Shared driver behind reganexec() and regawnexec().
   274:      If the regexp does not use approximate matching features, the
   275:      maximum cost is zero, and the approximate matcher isn't forced,
   276:      use the exact matcher instead. */
   277:   if (params.max_cost == 0 && !tnfa->have_approx
   278:       && !(eflags & REG_APPROX_MATCHER))
   279:     return tre_match(tnfa, string, len, type, match->nmatch, match->pmatch,
   280:                      eflags);
   281: 
   282:   /* Back references are not supported by the approximate matcher. */
   283:   if (tnfa->have_backrefs)
   284:     return REG_BADPAT;
   285: 
   286: #if 0
   287:   {
   288:     int pos;
   289:     tre_filter_profile_t profile[] =
   290:       { {'b', 2}, {'e', 3}, {'l', 1}, {'B', 1}, {'r', 1},
   291:         {'o', 1}, {'x', 1}, {0, 0} };
   292:     tre_filter_t filter = { 10, profile };
   293:     pos = tre_filter_find((unsigned char *)string, len, &filter);
   294:     if (pos < 0)
   295:       return REG_NOMATCH;
   296:   }
   297: #endif
   298: 
   299:   if (tnfa->num_tags > 0 && match->nmatch > 0)
   300:     {
   301: #ifdef TRE_USE_ALLOCA /* must mirror the #ifndef around xfree() below */
   302:       tags = (int*)alloca(sizeof(*tags) * tnfa->num_tags);
   303: #else /* !TRE_USE_ALLOCA */
   304:       tags = (int*)xmalloc(sizeof(*tags) * tnfa->num_tags);
   305: #endif /* !TRE_USE_ALLOCA */
   306:       if (tags == NULL)
   307:         return REG_ESPACE;
   308:     }
   309:   status = tre_tnfa_run_approx(tnfa, string, len, type, tags,
   310:                                match, params, eflags, &eo);
   311:   if (status == REG_OK)
   312:     tre_fill_pmatch(match->nmatch, match->pmatch, tnfa->cflags, tnfa, tags, eo);
   313: #ifndef TRE_USE_ALLOCA
   314:   if (tags)
   315:     xfree(tags);
   316: #endif /* !TRE_USE_ALLOCA */
   317:   return status;
   318: }
   319: 
   320: int
   321: reganexec(const regex_t *preg, const char *str, size_t len,
   322:           regamatch_t *match, regaparams_t params, int eflags)
   323: {
   324:   /* STR_BYTE when TRE_MB_CUR_MAX is 1, otherwise STR_MBS. */
   325:   tre_str_type_t str_kind = (TRE_MB_CUR_MAX != 1) ? STR_MBS : STR_BYTE;
   326:   const tre_tnfa_t *nfa = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
   327:   return tre_match_approx(nfa, str, len, str_kind, match, params, eflags);
   328: }
   329: /* Same as reganexec() with the length argument given as (size_t)-1. */
   330: int
   331: regaexec(const regex_t *preg, const char *str,
   332:          regamatch_t *match, regaparams_t params, int eflags)
   333: {
   334:   return reganexec(preg, str, (size_t)-1, match, params, eflags);
   335: }
   336: 
   337: #ifdef TRE_WCHAR
   338: 
   339: int
   340: regawnexec(const regex_t *preg, const wchar_t *str, size_t len,
   341:            regamatch_t *match, regaparams_t params, int eflags)
   342: {
   343:   /* Wide-character input is always passed on as type STR_WIDE. */
   344:   const tre_tnfa_t *nfa = (const tre_tnfa_t *)preg->TRE_REGEX_T_FIELD;
   345:   return tre_match_approx(nfa, str, len, STR_WIDE, match, params, eflags);
   346: }
   347: /* Same as regawnexec() with the length argument given as (size_t)-1. */
   348: int
   349: regawexec(const regex_t *preg, const wchar_t *str,
   350:           regamatch_t *match, regaparams_t params, int eflags)
   351: {
   352:   return regawnexec(preg, str, (size_t)-1, match, params, eflags);
   353: }
   354: 
   355: #endif /* TRE_WCHAR */
   356: 
   357: void
   358: regaparams_default(regaparams_t *params)
   359: {
   360:   /* Clear the whole struct before setting the individual fields. */
   361:   memset(params, 0, sizeof(*params));
   362: 
   363:   /* Each insertion, deletion and substitution costs one unit. */
   364:   params->cost_ins = params->cost_del = params->cost_subst = 1;
   365: 
   366:   /* Every limit is set to INT_MAX. */
   367:   params->max_cost = params->max_err = INT_MAX;
   368:   params->max_ins = params->max_del = params->max_subst = INT_MAX;
   369: }
   370: 
   371: #endif /* TRE_APPROX */
   372: 
   373: /* EOF */
End cpp section to tre/tre_regexec.cpp[1]
Start cpp section to tre/tre_regex.hpp[1 /1 ]
     1: #line 1106 "./lpsrc/tre.pak"
     2: /*
     3:   regex.h - POSIX.2 compatible regexp interface and TRE extensions
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #ifndef TRE_REGEX_H
    23: #define TRE_REGEX_H 1
    24: 
    25: #include "tre_config.hpp"
    26: 
    27: #ifdef HAVE_SYS_TYPES_H
    28: #include <sys/types.h>
    29: #endif /* HAVE_SYS_TYPES_H */
    30: 
    31: #ifdef HAVE_LIBUTF8_H
    32: #include <libutf8.h>
    33: #endif /* HAVE_LIBUTF8_H */
    34: 
    35: #ifdef TRE_USE_SYSTEM_REGEX_H
    36: /* Include the system regex.h to make TRE ABI compatible with the
    37:    system regex. */
    38: #include TRE_SYSTEM_REGEX_H_PATH
    39: #endif /* TRE_USE_SYSTEM_REGEX_H */
    40: 
    41: #ifdef __cplusplus
    42: extern "C" {
    43: #endif
    44: 
    45: #ifdef TRE_USE_SYSTEM_REGEX_H
    46: 
    47: #ifndef REG_OK
    48: #define REG_OK 0
    49: #endif /* !REG_OK */
    50: 
    51: #ifndef HAVE_REG_ERRCODE_T
    52: typedef int reg_errcode_t;
    53: #endif /* !HAVE_REG_ERRCODE_T */
    54: 
    55: #if !defined(REG_NOSPEC) && !defined(REG_LITERAL)
    56: #define REG_LITERAL 0x1000
    57: #endif
    58: 
    59: /* Extra regcomp() flags. */
    60: #define REG_RIGHT_ASSOC (REG_LITERAL << 1)
    61: 
    62: /* Extra regexec() flags. */
    63: #define REG_APPROX_MATCHER       0x1000
    64: #define REG_BACKTRACKING_MATCHER (REG_APPROX_MATCHER << 1)
    65: 
    66: #else /* !TRE_USE_SYSTEM_REGEX_H */
    67: 
    68: /* If we're not using the system regex.h, we need to define the
    69:    structs and enums ourselves. */
    70: 
    71: typedef int regoff_t;  /* Type of the offsets stored in regmatch_t. */
    72: typedef struct {
    73:   size_t re_nsub;  /* Number of parenthesized subexpressions. */
    74:   void *value;     /* For internal use only. */
    75: } regex_t;
    76: 
    77: typedef struct {
    78:   regoff_t rm_so;  /* Start offset; -1 if the submatch took no part in the match. */
    79:   regoff_t rm_eo;  /* End offset; -1 if the submatch took no part in the match. */
    80: } regmatch_t;
    81: 
    82: 
    83: typedef enum {
    84:   REG_OK = 0,           /* No error. */
    85:   /* POSIX regcomp() return error codes.  (In the order listed in the
    86:      standard.)  */
    87:   REG_NOMATCH,          /* No match. */
    88:   REG_BADPAT,           /* Invalid regexp. */
    89:   REG_ECOLLATE,         /* Unknown collating element. */
    90:   REG_ECTYPE,           /* Unknown character class name. */
    91:   REG_EESCAPE,          /* Trailing backslash. */
    92:   REG_ESUBREG,          /* Invalid back reference. */
    93:   REG_EBRACK,           /* "[]" imbalance */
    94:   REG_EPAREN,           /* "\(\)" or "()" imbalance */
    95:   REG_EBRACE,           /* "\{\}" or "{}" imbalance */
    96:   REG_BADBR,            /* Invalid content of {} */
    97:   REG_ERANGE,           /* Invalid use of range operator */
    98:   REG_ESPACE,           /* Out of memory.  */
    99:   REG_BADRPT            /* Invalid use of repetition operators. */
   100: } reg_errcode_t;
   101: 
   102: /* POSIX regcomp() flags. */
   103: #define REG_EXTENDED    1
   104: #define REG_ICASE       (REG_EXTENDED << 1)
   105: #define REG_NEWLINE     (REG_ICASE << 1)
   106: #define REG_NOSUB       (REG_NEWLINE << 1)
   107: 
   108: /* Extra regcomp() flags. */
   109: #define REG_BASIC       0
   110: #define REG_LITERAL     (REG_NOSUB << 1)
   111: #define REG_RIGHT_ASSOC (REG_LITERAL << 1)
   112: 
   113: /* POSIX regexec() flags. */
   114: #define REG_NOTBOL 1
   115: #define REG_NOTEOL (REG_NOTBOL << 1)
   116: 
   117: /* Extra regexec() flags. */
   118: #define REG_APPROX_MATCHER       (REG_NOTEOL << 1)
   119: #define REG_BACKTRACKING_MATCHER (REG_APPROX_MATCHER << 1)
   120: 
   121: #endif /* !TRE_USE_SYSTEM_REGEX_H */
   122: 
   123: /* REG_NOSPEC and REG_LITERAL mean the same thing. */
   124: #ifdef REG_LITERAL
   125: #define REG_NOSPEC      REG_LITERAL
   126: #elif defined(REG_NOSPEC)
   127: #define REG_LITERAL     REG_NOSPEC
   128: #endif /* defined(REG_NOSPEC) */
   129: 
   130: /* The maximum number of iterations in a bound expression. */
   131: #undef RE_DUP_MAX
   132: #define RE_DUP_MAX 255
   133: 
   134: /* The POSIX.2 regexp functions */
   135: TRE_EXTERN int regcomp(regex_t *preg, const char *regex, int cflags);
   136: TRE_EXTERN int regexec(const regex_t *preg, const char *string, size_t nmatch,
   137:             regmatch_t pmatch[], int eflags);
   138: TRE_EXTERN size_t regerror(int errcode, const regex_t *preg, char *errbuf,
   139:                 size_t errbuf_size);
   140: TRE_EXTERN void regfree(regex_t *preg);
   141: 
   142: #ifdef TRE_WCHAR
   143: #ifdef HAVE_WCHAR_H
   144: #include <wchar.h>
   145: #endif /* HAVE_WCHAR_H */
   146: 
   147: /* Wide character versions (not in POSIX.2). */
   148: int regwcomp(regex_t *preg, const wchar_t *regex, int cflags);
   149: int regwexec(const regex_t *preg, const wchar_t *string, size_t nmatch,
   150:              regmatch_t pmatch[], int eflags);
   151: #endif /* TRE_WCHAR */
   152: 
   153: /* Versions with a maximum length argument and therefore the capability to
   154:    handle null characters in the middle of the strings (not in POSIX.2). */
   155: TRE_EXTERN int regncomp(regex_t *preg, const char *regex, size_t len,
   156:   int cflags);
   157: TRE_EXTERN int regnexec(const regex_t *preg, const char *string, size_t len,
   158:              size_t nmatch, regmatch_t pmatch[], int eflags);
   159: #ifdef TRE_WCHAR
   160: TRE_EXTERN int regwncomp(regex_t *preg, const wchar_t *regex, size_t len,
   161:   int cflags);
   162: TRE_EXTERN int regwnexec(const regex_t *preg, const wchar_t *string, size_t len,
   163:               size_t nmatch, regmatch_t pmatch[], int eflags);
   164: #endif /* TRE_WCHAR */
   165: 
   166: #ifdef TRE_APPROX
   167: 
   168: /* Approximate matching parameter struct; regaparams_default() fills in defaults. */
   169: typedef struct {
   170:   int cost_ins;        /* Default cost of an inserted character. */
   171:   int cost_del;        /* Default cost of a deleted character. */
   172:   int cost_subst;      /* Default cost of a substituted character. */
   173:   int max_cost;        /* Maximum allowed cost of a match. */
   174: 
   175:   int max_ins;         /* Maximum allowed number of inserts. */
   176:   int max_del;         /* Maximum allowed number of deletes. */
   177:   int max_subst;       /* Maximum allowed number of substitutes. */
   178:   int max_err;         /* Maximum allowed number of errors total. */
   179: } regaparams_t;
   180: 
   181: /* Approximate matching result struct. */
   182: typedef struct {
   183:   size_t nmatch;       /* Length of pmatch[] array. */
   184:   regmatch_t *pmatch;  /* Submatch data. */
   185:   int cost;            /* Cost of the match. */
   186:   int num_ins;         /* Number of inserts in the match. */
   187:   int num_del;         /* Number of deletes in the match. */
   188:   int num_subst;       /* Number of substitutes in the match. */
   189: } regamatch_t;
   190: 
   191: 
   192: /* Approximate matching functions. */
   193: int regaexec(const regex_t *preg, const char *string,
   194:              regamatch_t *match, regaparams_t params, int eflags);
   195: int reganexec(const regex_t *preg, const char *string, size_t len,
   196:               regamatch_t *match, regaparams_t params, int eflags);
   197: #ifdef TRE_WCHAR
   198: /* Wide character approximate matching. */
   199: int regawexec(const regex_t *preg, const wchar_t *string,
   200:               regamatch_t *match, regaparams_t params, int eflags);
   201: int regawnexec(const regex_t *preg, const wchar_t *string, size_t len,
   202:                regamatch_t *match, regaparams_t params, int eflags);
   203: #endif /* TRE_WCHAR */
   204: 
   205: /* Sets the parameters to default values. */
   206: void regaparams_default(regaparams_t *params);
   207: #endif /* TRE_APPROX */
   208: 
   209: #ifdef TRE_WCHAR
   210: typedef wchar_t tre_char_t;
   211: #else /* !TRE_WCHAR */
   212: typedef unsigned char tre_char_t;
   213: #endif /* !TRE_WCHAR */
   214: 
   215: typedef struct {  /* Caller-supplied input stream, matched via reguexec(). */
   216:   int (*get_next_char)(tre_char_t *c, unsigned int *pos_add, void *context);
   217:   void (*rewind)(size_t pos, void *context);  /* Must be non-NULL for the backtracking matcher. */
   218:   int (*compare)(size_t pos1, size_t pos2, size_t len, void *context);  /* Likewise. */
   219:   void *context;  /* NOTE(review): presumably handed back to each callback; confirm. */
   220: } tre_str_source;
   221: 
   222: int reguexec(const regex_t *preg, const tre_str_source *string,
   223:              size_t nmatch, regmatch_t pmatch[], int eflags);
   224: 
   225: /* Returns the version string.  The returned string is static. */
   226: TRE_EXTERN char *tre_version(void);
   227: 
   228: /* Returns the value for a config parameter.  The type to which `result'
   229:    must point depends on the value of `query'; see the documentation for
   230:    more details. */
   231: TRE_EXTERN int tre_config(int query, void *result);
   232: 
   233: enum {  /* NOTE(review): presumably the `query' values for tre_config(); confirm. */
   234:   TRE_CONFIG_APPROX,
   235:   TRE_CONFIG_WCHAR,
   236:   TRE_CONFIG_MULTIBYTE,
   237:   TRE_CONFIG_SYSTEM_ABI,
   238:   TRE_CONFIG_VERSION
   239: };
   240: 
   241: /* Returns 1 if the compiled pattern has back references, 0 if not. */
   242: TRE_EXTERN int tre_have_backrefs(const regex_t *preg);
   243: 
   244: /* Returns 1 if the compiled pattern uses approximate matching features,
   245:    0 if not. */
   246: TRE_EXTERN int tre_have_approx(const regex_t *preg);
   247: 
   248: #ifdef __cplusplus
   249: }
   250: #endif
   251: #endif                          /* TRE_REGEX_H */
   252: 
   253: /* EOF */
End cpp section to tre/tre_regex.hpp[1]
Start cpp section to tre/tre_ast.cpp[1 /1 ]
     1: #line 1360 "./lpsrc/tre.pak"
     2: /*
     3:   tre-ast.c - Abstract syntax tree (AST) routines
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #include "flx_target_tre_config.hpp"
    23: #include <assert.h>
    24: 
    25: #include "tre_ast.hpp"
    26: #include "tre_mem.hpp"
    27: 
    28: tre_ast_node_t *
    29: tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size)
    30: {
    31:   /* Allocate the generic node header first, then the `size'-byte
    32:      payload that `obj' points to; both come from `mem'. */
    33:   tre_ast_node_t *res = (tre_ast_node_t*)tre_mem_calloc(mem, sizeof(*res));
    34:   if (res == NULL)
    35:     return NULL;
    36:   res->obj = tre_mem_calloc(mem, size);
    37:   if (res->obj == NULL)
    38:     return NULL;
    39: 
    40:   /* `nullable' and `submatch_id' both start out as -1. */
    41:   res->type = type;
    42:   res->nullable = res->submatch_id = -1;
    43:   return res;
    44: }
    45: 
    46: tre_ast_node_t *
    47: tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
    48: {
    49:   /* Build a LITERAL node whose payload records `code_min', `code_max'
    50:      and `position'.  Returns NULL if the node cannot be allocated. */
    51:   tre_literal_t *payload;
    52:   tre_ast_node_t *leaf = tre_ast_new_node(mem, LITERAL, sizeof(*payload));
    53:   if (leaf == NULL)
    54:     return NULL;
    55: 
    56:   payload = (tre_literal_t*)leaf->obj;
    57:   payload->code_min = code_min;
    58:   payload->code_max = code_max;
    59:   payload->position = position;
    60:   return leaf;
    61: }
    62: 
    63: tre_ast_node_t *
    64: tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
    65:                  int minimal)
    66: {
    67:   /* Wrap `arg' in an ITERATION node with the given bounds; the new node
    68:      takes over the submatch count of `arg'.  Returns NULL on failure. */
    69:   tre_iteration_t *rep;
    70:   tre_ast_node_t *wrapper = tre_ast_new_node(mem, ITERATION, sizeof(*rep));
    71:   if (wrapper == NULL)
    72:     return NULL;
    73: 
    74:   rep = (tre_iteration_t*)wrapper->obj;
    75:   rep->arg = arg;
    76:   rep->min = min;
    77:   rep->max = max;
    78:   rep->minimal = minimal;
    79:   wrapper->num_submatches = arg->num_submatches;
    80:   return wrapper;
    81: }
    82: 
    83: tre_ast_node_t *
    84: tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
    85: {
    86:   /* A UNION node counts the submatches of both of its alternatives. */
    87:   tre_union_t *alt;
    88:   tre_ast_node_t *parent = tre_ast_new_node(mem, UNION, sizeof(*alt));
    89:   if (!parent)
    90:     return NULL;
    91:   alt = (tre_union_t *)parent->obj;
    92:   alt->left = left;
    93:   alt->right = right;
    94:   parent->num_submatches = left->num_submatches + right->num_submatches;
    95:   return parent;
    96: }
    97: 
    98: tre_ast_node_t *
    99: tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
   100:                        tre_ast_node_t *right)
   101: {
   102:   /* A CATENATION node counts the submatches of both of its operands. */
   103:   tre_catenation_t *cat;
   104:   tre_ast_node_t *parent = tre_ast_new_node(mem, CATENATION, sizeof(*cat));
   105:   if (!parent)
   106:     return NULL;
   107:   cat = (tre_catenation_t *)parent->obj;
   108:   cat->left = left;
   109:   cat->right = right;
   110:   parent->num_submatches = left->num_submatches + right->num_submatches;
   111:   return parent;
   112: }
   113: 
   114: #ifdef TRE_DEBUG
   115: 
   116: static void
   117: tre_findent(FILE *stream, int i)
   118: {
   119:   for (; i > 0; i--)  /* write `i' spaces; nothing when `i' <= 0 */
   120:     fputc(' ', stream);
   121: }
   122: 
   123: void
   124: tre_print_params(int *params)
   125: {
   126:   int idx;
   127:   /* Prints "params [v0, v1, ...]"; prints nothing for a NULL array. */
   128:   if (params == NULL)
   129:     return;
   130:   DPRINT(("params ["));
   131:   for (idx = 0; idx < TRE_PARAM_LAST; idx++)
   132:     {
   133:       if (params[idx] == TRE_PARAM_UNSET)
   134:         DPRINT(("unset"));
   135:       else if (params[idx] == TRE_PARAM_DEFAULT)
   136:         DPRINT(("default"));
   137:       else
   138:         DPRINT(("%d", params[idx]));
   139:       if (idx < TRE_PARAM_LAST - 1)
   140:         DPRINT((", "));
   141:     }
   142:   DPRINT(("]"));
   143: }
   144: 
   145: static void
   146: tre_do_print(FILE *stream, tre_ast_node_t *ast, int indent)
   147: {
   148:   int code_min, code_max, pos;
   149:   int num_tags = ast->num_tags;
   150:   tre_literal_t *lit;
   151:   tre_iteration_t *iter;
   152:   /* Print `ast' indented by `indent' columns, then recurse into children. */
   153:   tre_findent(stream, indent);
   154:   switch (ast->type)
   155:     {
   156:     case LITERAL:
   157:       lit = (tre_literal_t *)ast->obj;  /* void* needs a cast in C++ */
   158:       code_min = lit->code_min;
   159:       code_max = lit->code_max;
   160:       pos = lit->position;
   161:       if (IS_EMPTY(lit))
   162:         {
   163:           fprintf(stream, "literal empty\n");
   164:         }
   165:       else if (IS_ASSERTION(lit))
   166:         {
   167:           int i;
   168:           const char *assertions[] = { "bol", "eol", "ctype", "!ctype",
   169:                                        "bow", "eow", "wb", "!wb" };
   170:           if (code_max >= ASSERT_LAST << 1)
   171:             assert(0);
   172:           fprintf(stream, "assertions: ");
   173:           for (i = 0; (1 << i) <= ASSERT_LAST; i++)
   174:             if (code_max & (1 << i))
   175:               fprintf(stream, "%s ", assertions[i]);
   176:           fprintf(stream, "\n");
   177:         }
   178:       else if (IS_TAG(lit))
   179:         {
   180:           fprintf(stream, "tag %d\n", code_max);
   181:         }
   182:       else if (IS_BACKREF(lit))
   183:         {
   184:           fprintf(stream, "backref %d, pos %d\n", code_max, pos);
   185:         }
   186:       else if (IS_PARAMETER(lit))
   187:         {
   188:           tre_print_params((int *)lit->u.params);  /* field is unsigned int* */
   189:           fprintf(stream, "\n");
   190:         }
   191:       else
   192:         {
   193:           fprintf(stream, "literal (%c, %c) (%d, %d), pos %d, sub %d, "
   194:                   "%d tags\n", code_min, code_max, code_min, code_max, pos,
   195:                   ast->submatch_id, num_tags);
   196:         }
   197:       break;
   198:     case ITERATION:
   199:       iter = (tre_iteration_t *)ast->obj;  /* void* needs a cast in C++ */
   200:       fprintf(stream, "iteration {%d, %d}, sub %d, %d tags, %s\n",
   201:               iter->min, iter->max, ast->submatch_id, num_tags,
   202:               iter->minimal ? "minimal" : "greedy");
   203:       tre_do_print(stream, iter->arg, indent + 2);
   204:       break;
   205:     case UNION:
   206:       fprintf(stream, "union, sub %d, %d tags\n", ast->submatch_id, num_tags);
   207:       tre_do_print(stream, ((tre_union_t *)ast->obj)->left, indent + 2);
   208:       tre_do_print(stream, ((tre_union_t *)ast->obj)->right, indent + 2);
   209:       break;
   210:     case CATENATION:
   211:       fprintf(stream, "catenation, sub %d, %d tags\n", ast->submatch_id,
   212:               num_tags);
   213:       tre_do_print(stream, ((tre_catenation_t *)ast->obj)->left, indent + 2);
   214:       tre_do_print(stream, ((tre_catenation_t *)ast->obj)->right, indent + 2);
   215:       break;
   216:     default:
   217:       assert(0);
   218:       break;
   219:     }
   220: }
   221: 
   222: static void
   223: tre_ast_fprint(FILE *stream, tre_ast_node_t *ast)
   224: {
   225:   tre_do_print(stream, ast, 0);  /* the root is printed with no indentation */
   226: }
   227: /* Write an "AST:" heading followed by a dump of `tree' to stdout. */
   228: void
   229: tre_ast_print(tre_ast_node_t *tree)
   230: {
   231:   fputs("AST:\n", stdout);
   232:   tre_ast_fprint(stdout, tree);
   233: }
   234: 
   235: #endif /* TRE_DEBUG */
   236: 
   237: /* EOF */
End cpp section to tre/tre_ast.cpp[1]
Start cpp section to tre/tre_ast.hpp[1 /1 ]
     1: #line 1598 "./lpsrc/tre.pak"
     2: /*
     3:   tre-ast.h - Abstract syntax tree (AST) definitions
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: */
    20: 
    21: 
    22: #ifndef TRE_AST_H
    23: #define TRE_AST_H 1
    24: 
    25: #include "tre_mem.hpp"
    26: #include "tre_internal.hpp"
    27: #include "tre_compile.hpp"
    28: 
    29: /* The different AST node types. */
    30: typedef enum {
    31:   LITERAL,     /* Leaf node; `obj' points to a tre_literal_t. */
    32:   CATENATION,  /* Two concatenated subexpressions; `obj' is a tre_catenation_t. */
    33:   ITERATION,   /* Repetition operator; `obj' is a tre_iteration_t. */
    34:   UNION        /* "|" operator; `obj' is a tre_union_t. */
    35: } tre_ast_type_t;
    36: 
    37: /* Special subtypes of TRE_LITERAL. */
    38: #define EMPTY     -1   /* Empty leaf (denotes empty string). */
    39: #define ASSERTION -2   /* Assertion leaf. */
    40: #define TAG       -3   /* Tag leaf. */
    41: #define BACKREF   -4   /* Back reference leaf. */
    42: #define PARAMETER -5   /* Parameter. */
    43: 
    44: #define IS_SPECIAL(x)   ((x)->code_min < 0)
    45: #define IS_EMPTY(x)     ((x)->code_min == EMPTY)
    46: #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
    47: #define IS_TAG(x)       ((x)->code_min == TAG)
    48: #define IS_BACKREF(x)   ((x)->code_min == BACKREF)
    49: #define IS_PARAMETER(x) ((x)->code_min == PARAMETER)
    50: 
    51: 
    52: /* A generic AST node.  All AST nodes consist of this node on the top
    53:    level with `obj' pointing to the actual content. */
    54: typedef struct {
    55:   tre_ast_type_t type;   /* Type of the node. */
    56:   void *obj;             /* Pointer to actual node. */
    57:   int nullable;          /* Set to -1 by tre_ast_new_node(). */
    58:   int submatch_id;       /* Set to -1 by tre_ast_new_node(). */
    59:   int num_submatches;    /* Iteration: copied from its argument; union/catenation: sum of both children. */
    60:   int num_tags;
    61:   tre_pos_and_tags_t *firstpos;
    62:   tre_pos_and_tags_t *lastpos;
    63: } tre_ast_node_t;
    64: 
    65: 
    66: /* A "literal" node.  These are created for assertions, back references,
    67:    tags, matching parameter settings, and all expressions that match one
    68:    character. */
    69: typedef struct {
    70:   long code_min;   /* Negative values select a special subtype (see IS_SPECIAL). */
    71:   long code_max;   /* For special subtypes: assertion bits, tag number or backref number. */
    72:   int position;
    73:   union {
    74:     tre_ctype_t klass;
    75:     unsigned int *params;  /* Read for PARAMETER literals. */
    76:   } u;
    77:   tre_ctype_t *neg_klasses;
    78: } tre_literal_t;
    79: 
    80: /* A "catenation" node.  These are created when two regexps are concatenated.
    81:    If there are more than one subexpressions in sequence, the `left' part
    82:    holds all but the last, and `right' part holds the last subexpression
    83:    (catenation is left associative). */
    84: typedef struct {
    85:   tre_ast_node_t *left;
    86:   tre_ast_node_t *right;
    87: } tre_catenation_t;
    88: 
    89: /* An "iteration" node.  These are created for the "*", "+", "?", and "{m,n}"
    90:    operators. */
    91: typedef struct {
    92:   /* Subexpression to match. */
    93:   tre_ast_node_t *arg;
    94:   /* Minimum number of consecutive matches. */
    95:   int min;
    96:   /* Maximum number of consecutive matches. */
    97:   int max;
    98:   /* If 0, match as many characters as possible, if 1 match as few as
    99:      possible.  Note that this does not always mean the same thing as
   100:      matching as many/few repetitions as possible. */
   101:   unsigned int minimal:1;
   102:   /* Approximate matching parameters (or NULL). */
   103:   unsigned int *params;
   104: } tre_iteration_t;
   105: 
   106: /* A "union" node.  These are created for the "|" operator. */
   107: typedef struct {
   108:   tre_ast_node_t *left;
   109:   tre_ast_node_t *right;
   110: } tre_union_t;
   111: 
   112: tre_ast_node_t *
   113: tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, size_t size);
   114: 
   115: tre_ast_node_t *
   116: tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position);
   117: 
   118: tre_ast_node_t *
   119: tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max,
   120:                  int minimal);
   121: 
   122: tre_ast_node_t *
   123: tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right);
   124: 
   125: tre_ast_node_t *
   126: tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left,
   127:                        tre_ast_node_t *right);
   128: 
   129: #ifdef TRE_DEBUG /* The printing helpers below are only declared in debug builds. */
   130: void /* NOTE(review): presumably dumps the AST rooted at `tree'; implementation not shown here. */
   131: tre_ast_print(tre_ast_node_t *tree);
   132: 
   133: /* XXX - rethink AST printing API */
   134: void /* NOTE(review): takes `int *' while tre_iteration_t.params is `unsigned int *' -- confirm the intended type. */
   135: tre_print_params(int *params);
   136: #endif /* TRE_DEBUG */
   137: 
   138: #endif /* TRE_AST_H */
   139: 
   140: /* EOF */
End cpp section to tre/tre_ast.hpp[1]
Start cpp section to tre/tre_compile.cpp[1 /1 ]
     1: #line 1739 "./lpsrc/tre.pak"
     2: /*
     3:   tre-compile.c - TRE regex compiler
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: /*
    23:   TODO:
    24:    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
    25:      function calls.
    26: */
    27: 
    28: 
    29: #include "flx_target_tre_config.hpp"
    30: #include <stdio.h>
    31: #include <assert.h>
    32: #include <string.h>
    33: 
    34: #include "tre_internal.hpp"
    35: #include "tre_mem.hpp"
    36: #include "tre_stack.hpp"
    37: #include "tre_ast.hpp"
    38: #include "tre_parse.hpp"
    39: #include "tre_compile.hpp"
    40: #include "tre_regex.hpp"
    41: #include "tre_xmalloc.hpp"
    42: 
    43: /*
    44:   Algorithms to set up tags so that submatch addressing can be done.
    45: */
    46: 
    47: 
    48: /* Turns `node' (in place) into a catenation: the left child is a new TAG
    49:    literal carrying `tag_id', the right child is a new node that takes over
    50:    the old root's `obj' and `type'.  Returns REG_ESPACE if allocation fails. */
    51: static reg_errcode_t
    52: tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
    53: {
    54:   tre_catenation_t *cat;
    55:   tre_ast_node_t *old_root;
    56: 
    57:   DPRINT(("add_tag_left: tag %d\n", tag_id));
    58:   cat = (tre_catenation_t*)tre_mem_alloc(mem, sizeof(*cat));
    59:   if (!cat)
    60:     return REG_ESPACE;
    61:   cat->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
    62:   if (!cat->left)
    63:     return REG_ESPACE;
    64:   old_root = (tre_ast_node_t*)tre_mem_alloc(mem, sizeof(*old_root));
    65:   cat->right = old_root;
    66:   if (!old_root)
    67:     return REG_ESPACE;
    68:   old_root->type = node->type;      /* the payload moves one level down... */
    69:   old_root->obj = node->obj;
    70:   old_root->nullable = -1;          /* ...and the other fields start out reset */
    71:   old_root->submatch_id = -1;
    72:   old_root->firstpos = NULL;
    73:   old_root->lastpos = NULL;
    74:   old_root->num_tags = 0;
    75:   node->type = CATENATION;          /* `node' itself now is the catenation */
    76:   node->obj = cat;
    77:   return REG_OK;
    78: }
    79: 
    80: /* Turns `node' (in place) into a catenation: the left child is a new node
    81:    that takes over the old root's `obj' and `type', the right child is a new
    82:    TAG literal carrying `tag_id'.  Returns REG_ESPACE if allocation fails. */
    83: static reg_errcode_t
    84: tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
    85: {
    86:   tre_catenation_t *cat;
    87:   tre_ast_node_t *old_root;
    88: 
    89:   DPRINT(("tre_add_tag_right: tag %d\n", tag_id));
    90:   cat = (tre_catenation_t*)tre_mem_alloc(mem, sizeof(*cat));
    91:   if (!cat)
    92:     return REG_ESPACE;
    93:   cat->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
    94:   if (!cat->right)
    95:     return REG_ESPACE;
    96:   old_root = (tre_ast_node_t*)tre_mem_alloc(mem, sizeof(*old_root));
    97:   cat->left = old_root;
    98:   if (!old_root)
    99:     return REG_ESPACE;
   100:   old_root->type = node->type;      /* the payload moves one level down... */
   101:   old_root->obj = node->obj;
   102:   old_root->nullable = -1;          /* ...and the other fields start out reset */
   103:   old_root->submatch_id = -1;
   104:   old_root->firstpos = NULL;
   105:   old_root->lastpos = NULL;
   106:   old_root->num_tags = 0;
   107:   node->type = CATENATION;          /* `node' itself now is the catenation */
   108:   node->obj = cat;
   109:   return REG_OK;
   110: }
   111: 
   112: typedef enum { /* Work items popped from the stack by the main loop of tre_add_tags(). */
   113:   ADDTAGS_RECURSE,            /* Pop a node and process it, pushing work for its children. */
   114:   ADDTAGS_AFTER_ITERATION,    /* Iteration operand done: set num_tags (pass 1) or minimal_tag/direction (pass 2). */
   115:   ADDTAGS_AFTER_UNION_LEFT,   /* Left operand done: hide the current regset entries from the right operand. */
   116:   ADDTAGS_AFTER_UNION_RIGHT,  /* Both operands done: restore regset, then count or add the two trailing tags. */
   117:   ADDTAGS_AFTER_CAT_LEFT,     /* Left child done: restore next_tag and switch to the reserved tag, if any. */
   118:   ADDTAGS_AFTER_CAT_RIGHT,    /* Both children done: pass 1 sums the children's num_tags. */
   119:   ADDTAGS_SET_SUBMATCH_END    /* Append the submatch end to regset and pop it from the parents list. */
   120: } tre_addtags_symbol_t;
   121: 
   122: 
   123: typedef struct { /* Element type of the `saved_states' array allocated in tre_add_tags(). */
   124:   int tag;       /* Initialised to -1 by tre_add_tags(); not read there afterwards. */
   125:   int next_tag;  /* Not written or read by tre_add_tags(). */
   126: } tre_tag_states_t;
   127: 
   128: /* Adds tags to the parse tree `tree' so that subexpressions marked for submatch addressing can be
   129:    traced.  With `mem' == NULL this is the counting first pass; otherwise TAG nodes are inserted. */
   130: static reg_errcode_t /* REG_OK, or the first allocation/stack-push error (e.g. REG_ESPACE) */
   131: tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree, /* `stack' is scratch */
   132:              tre_tnfa_t *tnfa) /* dereferenced in both passes, so it must be non-NULL */
   133: {
   134:   reg_errcode_t status = REG_OK;
   135:   tre_addtags_symbol_t symbol;
   136:   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
   137:   int bottom = tre_stack_num_objects(stack);
   138:   /* True for first pass (counting number of needed tags) */
   139:   int first_pass = (mem == NULL || tnfa == NULL);
   140:   int *regset, *orig_regset; /* -1 terminated; entry id*2 = start, id*2+1 = end of submatch id. */
   141:   int num_tags = 0; /* Total number of tags. */
   142:   int num_minimals = 0;  /* Number of special minimal tags. */
   143:   int tag = 0;      /* The tag that is to be added next. */
   144:   int next_tag = 1; /* Next tag to use after this one. */
   145:   int *parents;     /* Stack of submatches the current submatch is
   146:                        contained in. */
   147:   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
   148:   tre_tag_states_t *saved_states; /* Allocated and initialised below, not otherwise used here. */
   149:   /* Direction recorded for the next tag that gets added. */
   150:   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
   151:   if (!first_pass)
   152:     {
   153:       tnfa->end_tag = 0;
   154:       tnfa->minimal_tags[0] = -1;
   155:     }
   156: 
   157:   regset = (int*)xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
   158:   if (regset == NULL)
   159:     return REG_ESPACE;
   160:   regset[0] = -1;
   161:   orig_regset = regset; /* `regset' is advanced during unions; keep the base for xfree(). */
   162: 
   163:   parents = (int*)xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
   164:   if (parents == NULL)
   165:     {
   166:       xfree(regset);
   167:       return REG_ESPACE;
   168:     }
   169:   parents[0] = -1;
   170: 
   171:   saved_states = (tre_tag_states_t*)xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
   172:   if (saved_states == NULL)
   173:     {
   174:       xfree(regset);
   175:       xfree(parents);
   176:       return REG_ESPACE;
   177:     }
   178:   else
   179:     {
   180:       unsigned int i;
   181:       for (i = 0; i <= tnfa->num_submatches; i++)
   182:         saved_states[i].tag = -1;
   183:     }
   184: 
   185:   STACK_PUSH(stack, node);
   186:   STACK_PUSH(stack, ADDTAGS_RECURSE);
   187: 
   188:   while (tre_stack_num_objects(stack) > bottom)
   189:     {
   190:       if (status != REG_OK)
   191:         break;
   192: 
   193:       symbol = (tre_addtags_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
   194:       switch (symbol)
   195:         {
   196:         /* Each case pops its operands in the reverse of the order they were pushed. */
   197:         case ADDTAGS_SET_SUBMATCH_END:
   198:           {
   199:             int id = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   200:             int i;
   201: 
   202:             /* Add end of this submatch to regset. */
   203:             for (i = 0; regset[i] >= 0; i++);
   204:             regset[i] = id * 2 + 1;
   205:             regset[i + 1] = -1;
   206: 
   207:             /* Pop this submatch from the parents stack. */
   208:             for (i = 0; parents[i] >= 0; i++);
   209:             parents[i - 1] = -1;
   210:             break;
   211:           }
   212: 
   213:         case ADDTAGS_RECURSE:
   214:           node = (tre_ast_node_t*)tre_stack_pop(stack);
   215: 
   216:           if (node->submatch_id >= 0)
   217:             {
   218:               int id = node->submatch_id;
   219:               int i;
   220: 
   221: 
   222:               /* Add start of this submatch to regset. */
   223:               for (i = 0; regset[i] >= 0; i++);
   224:               regset[i] = id * 2;
   225:               regset[i + 1] = -1;
   226: 
   227:               if (!first_pass)
   228:                 {
   229:                   for (i = 0; parents[i] >= 0; i++);
   230:                   tnfa->submatch_data[id].parents = NULL;
   231:                   if (i > 0)
   232:                     {
   233:                       int *p = (int*)xmalloc(sizeof(*p) * (i + 1));
   234:                       if (p == NULL)
   235:                         {
   236:                           status = REG_ESPACE;
   237:                           break;
   238:                         }
   239:                       assert(tnfa->submatch_data[id].parents == NULL);
   240:                       tnfa->submatch_data[id].parents = p;
   241:                       for (i = 0; parents[i] >= 0; i++)
   242:                         p[i] = parents[i];
   243:                       p[i] = -1;
   244:                     }
   245:                 }
   246: 
   247:               /* Add end of this submatch to regset after processing this
   248:                  node. */
   249:               STACK_PUSHX(stack, node->submatch_id);
   250:               STACK_PUSHX(stack, ADDTAGS_SET_SUBMATCH_END);
   251:             }
   252: 
   253:           switch (node->type)
   254:             {
   255:             case LITERAL:
   256:               {
   257:                 tre_literal_t *lit = (tre_literal_t*)node->obj;
   258: 
   259:                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
   260:                   {
   261:                     int i;
   262:                     DPRINT(("Literal %d-%d\n",
   263:                             (int)lit->code_min, (int)lit->code_max));
   264:                     if (regset[0] >= 0)
   265:                       {
   266:                         /* Regset is not empty, so add a tag before the
   267:                            literal or backref. */
   268:                         if (!first_pass)
   269:                           {
   270:                             status = tre_add_tag_left(mem, node, tag);
   271:                             tnfa->tag_directions[tag] = direction;
   272:                             if (minimal_tag >= 0)
   273:                               {
   274:                                 DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
   275:                                 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
   276:                                 tnfa->minimal_tags[i] = tag;
   277:                                 tnfa->minimal_tags[i + 1] = minimal_tag;
   278:                                 tnfa->minimal_tags[i + 2] = -1;
   279:                                 minimal_tag = -1;
   280:                                 num_minimals++;
   281:                               }
   282:                             /* Go through the regset and set submatch data for
   283:                                submatches that are using this tag. */
   284:                             for (i = 0; regset[i] >= 0; i++)
   285:                               {
   286:                                 int id = regset[i] / 2;
   287:                                 int start = !(regset[i] % 2);
   288:                                 DPRINT(("  Using tag %d for %s offset of "
   289:                                         "submatch %d\n", tag,
   290:                                         start ? "start" : "end", id));
   291:                                 if (start)
   292:                                   tnfa->submatch_data[id].so_tag = tag;
   293:                                 else
   294:                                   tnfa->submatch_data[id].eo_tag = tag;
   295:                               }
   296:                           }
   297:                         else
   298:                           {
   299:                             DPRINT(("  num_tags = 1\n"));
   300:                             node->num_tags = 1;
   301:                           }
   302: 
   303:                         DPRINT(("  num_tags++\n"));
   304:                         regset[0] = -1;
   305:                         tag = next_tag;
   306:                         num_tags++;
   307:                         next_tag++;
   308:                       }
   309:                   }
   310:                 else
   311:                   {
   312:                     assert(!IS_TAG(lit));
   313:                   }
   314:                 break;
   315:               }
   316:             case CATENATION:
   317:               {
   318:                 tre_catenation_t *cat = (tre_catenation_t*)node->obj;
   319:                 tre_ast_node_t *left = (tre_ast_node_t*)cat->left;
   320:                 tre_ast_node_t *right = (tre_ast_node_t*)cat->right;
   321:                 int reserved_tag = -1;
   322:                 DPRINT(("Catenation, next_tag = %d\n", next_tag));
   323: 
   324:                 /* Work is pushed in reverse: the left child is processed first. */
   325:                 /* After processing right child. */
   326:                 STACK_PUSHX(stack, node);
   327:                 STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_RIGHT);
   328: 
   329:                 /* Process right child. */
   330:                 STACK_PUSHX(stack, right);
   331:                 STACK_PUSHX(stack, ADDTAGS_RECURSE);
   332: 
   333:                 /* After processing left child. */
   334:                 STACK_PUSHX(stack, next_tag + left->num_tags);
   335:                 DPRINT(("  Pushing %d for after left\n",
   336:                         next_tag + left->num_tags));
   337:                 if (left->num_tags > 0 && right->num_tags > 0)
   338:                   {
   339:                     /* Reserve the next tag to the right child. */
   340:                     DPRINT(("  Reserving next_tag %d to right child\n",
   341:                             next_tag));
   342:                     reserved_tag = next_tag;
   343:                     next_tag++;
   344:                   }
   345:                 STACK_PUSHX(stack, reserved_tag);
   346:                 STACK_PUSHX(stack, ADDTAGS_AFTER_CAT_LEFT);
   347: 
   348:                 /* Process left child. */
   349:                 STACK_PUSHX(stack, left);
   350:                 STACK_PUSHX(stack, ADDTAGS_RECURSE);
   351: 
   352:                 }
   353:               break;
   354:             case ITERATION:
   355:               {
   356:                 tre_iteration_t *iter = (tre_iteration_t*)node->obj;
   357:                 DPRINT(("Iteration\n"));
   358: 
   359:                 if (first_pass)
   360:                   {
   361:                     STACK_PUSHX(stack, regset[0] >= 0 || iter->minimal);
   362:                   }
   363:                 else
   364:                   {
   365:                     STACK_PUSHX(stack, tag);
   366:                     STACK_PUSHX(stack, iter->minimal);
   367:                   }
   368:                 STACK_PUSHX(stack, node);
   369:                 STACK_PUSHX(stack, ADDTAGS_AFTER_ITERATION);
   370: 
   371:                 STACK_PUSHX(stack, iter->arg);
   372:                 STACK_PUSHX(stack, ADDTAGS_RECURSE);
   373: 
   374:                 /* Regset is not empty (or the iteration is minimal), so add a tag here. */
   375:                 if (regset[0] >= 0 || iter->minimal)
   376:                   {
   377:                     if (!first_pass)
   378:                       {
   379:                         int i;
   380:                         status = tre_add_tag_left(mem, node, tag);
   381:                         if (iter->minimal)
   382:                           tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
   383:                         else
   384:                           tnfa->tag_directions[tag] = direction;
   385:                         if (minimal_tag >= 0)
   386:                           {
   387:                             DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
   388:                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
   389:                             tnfa->minimal_tags[i] = tag;
   390:                             tnfa->minimal_tags[i + 1] = minimal_tag;
   391:                             tnfa->minimal_tags[i + 2] = -1;
   392:                             minimal_tag = -1;
   393:                             num_minimals++;
   394:                           }
   395:                         /* Go through the regset and set submatch data for
   396:                            submatches that are using this tag. */
   397:                         for (i = 0; regset[i] >= 0; i++)
   398:                           {
   399:                             int id = regset[i] / 2;
   400:                             int start = !(regset[i] % 2);
   401:                             DPRINT(("  Using tag %d for %s offset of "
   402:                                     "submatch %d\n", tag,
   403:                                     start ? "start" : "end", id));
   404:                             if (start)
   405:                               tnfa->submatch_data[id].so_tag = tag;
   406:                             else
   407:                               tnfa->submatch_data[id].eo_tag = tag;
   408:                           }
   409:                       }
   410: 
   411:                     DPRINT(("  num_tags++\n"));
   412:                     regset[0] = -1;
   413:                     tag = next_tag;
   414:                     num_tags++;
   415:                     next_tag++;
   416:                   }
   417:                 direction = TRE_TAG_MINIMIZE;
   418:               }
   419:               break;
   420:             case UNION:
   421:               {
   422:                 tre_union_t *uni = (tre_union_t*)node->obj;
   423:                 tre_ast_node_t *left = (tre_ast_node_t*)uni->left;
   424:                 tre_ast_node_t *right = (tre_ast_node_t*)uni->right;
   425:                 int left_tag;
   426:                 int right_tag;
   427: 
   428:                 if (regset[0] >= 0)
   429:                   {
   430:                     left_tag = next_tag;
   431:                     right_tag = next_tag + 1;
   432:                   }
   433:                 else
   434:                   {
   435:                     left_tag = tag;
   436:                     right_tag = next_tag;
   437:                   }
   438: 
   439:                 DPRINT(("Union\n"));
   440: 
   441:                 /* After processing right child. */
   442:                 STACK_PUSHX(stack, right_tag);
   443:                 STACK_PUSHX(stack, left_tag);
   444:                 STACK_PUSHX(stack, regset);
   445:                 STACK_PUSHX(stack, regset[0] >= 0);
   446:                 STACK_PUSHX(stack, node);
   447:                 STACK_PUSHX(stack, right);
   448:                 STACK_PUSHX(stack, left);
   449:                 STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_RIGHT);
   450: 
   451:                 /* Process right child. */
   452:                 STACK_PUSHX(stack, right);
   453:                 STACK_PUSHX(stack, ADDTAGS_RECURSE);
   454: 
   455:                 /* After processing left child. */
   456:                 STACK_PUSHX(stack, ADDTAGS_AFTER_UNION_LEFT);
   457: 
   458:                 /* Process left child. */
   459:                 STACK_PUSHX(stack, left);
   460:                 STACK_PUSHX(stack, ADDTAGS_RECURSE);
   461: 
   462:                 /* Regset is not empty, so add a tag here. */
   463:                 if (regset[0] >= 0)
   464:                   {
   465:                     if (!first_pass)
   466:                       {
   467:                         int i;
   468:                         status = tre_add_tag_left(mem, node, tag);
   469:                         tnfa->tag_directions[tag] = direction;
   470:                         if (minimal_tag >= 0)
   471:                           {
   472:                             DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
   473:                             for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
   474:                             tnfa->minimal_tags[i] = tag;
   475:                             tnfa->minimal_tags[i + 1] = minimal_tag;
   476:                             tnfa->minimal_tags[i + 2] = -1;
   477:                             minimal_tag = -1;
   478:                             num_minimals++;
   479:                           }
   480:                         /* Go through the regset and set submatch data for
   481:                            submatches that are using this tag. */
   482:                         for (i = 0; regset[i] >= 0; i++)
   483:                           {
   484:                             int id = regset[i] / 2;
   485:                             int start = !(regset[i] % 2);
   486:                             DPRINT(("  Using tag %d for %s offset of "
   487:                                     "submatch %d\n", tag,
   488:                                     start ? "start" : "end", id));
   489:                             if (start)
   490:                               tnfa->submatch_data[id].so_tag = tag;
   491:                             else
   492:                               tnfa->submatch_data[id].eo_tag = tag;
   493:                           }
   494:                       }
   495: 
   496:                     DPRINT(("  num_tags++\n"));
   497:                     regset[0] = -1;
   498:                     tag = next_tag;
   499:                     num_tags++;
   500:                     next_tag++;
   501:                   }
   502: 
   503:                 if (node->num_submatches > 0)
   504:                   {
   505:                     /* The next two tags are reserved for markers. */
   506:                     next_tag++;
   507:                     tag = next_tag;
   508:                     next_tag++;
   509:                   }
   510: 
   511:                 break;
   512:               }
   513:             }
   514: 
   515:           if (node->submatch_id >= 0)
   516:             {
   517:               int i;
   518:               /* Push this submatch on the parents stack. */
   519:               for (i = 0; parents[i] >= 0; i++);
   520:               parents[i] = node->submatch_id;
   521:               parents[i + 1] = -1;
   522:             }
   523: 
   524:           break; /* end case: ADDTAGS_RECURSE */
   525: 
   526:         case ADDTAGS_AFTER_ITERATION:
   527:           {
   528:             int minimal = 0;
   529:             int enter_tag;
   530:             node = (tre_ast_node_t*)tre_stack_pop(stack);
   531:             if (first_pass)
   532:               {
   533:                 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
   534:                   + (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   535:                 minimal_tag = -1;
   536:               }
   537:             else
   538:               {
   539:                 minimal = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   540:                 enter_tag = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   541:                 if (minimal)
   542:                   minimal_tag = enter_tag;
   543:               }
   544: 
   545:             DPRINT(("After iteration\n"));
   546:             if (!first_pass)
   547:               {
   548:                 DPRINT(("  Setting direction to %s\n",
   549:                         minimal ? "minimize" : "maximize"));
   550:                 if (minimal)
   551:                   direction = TRE_TAG_MINIMIZE;
   552:                 else
   553:                   direction = TRE_TAG_MAXIMIZE;
   554:               }
   555:             break;
   556:           }
   557: 
   558:         case ADDTAGS_AFTER_CAT_LEFT:
   559:           {
   560:             int new_tag = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   561:             next_tag = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   562:             DPRINT(("After cat left, tag = %d, next_tag = %d\n",
   563:                     tag, next_tag));
   564:             if (new_tag >= 0)
   565:               {
   566:                 DPRINT(("  Setting tag to %d\n", new_tag));
   567:                 tag = new_tag;
   568:               }
   569:             break;
   570:           }
   571: 
   572:         case ADDTAGS_AFTER_CAT_RIGHT:
   573:           DPRINT(("After cat right\n"));
   574:           node = (tre_ast_node_t*)tre_stack_pop(stack);
   575:           if (first_pass)
   576:             node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
   577:               + ((tre_catenation_t *)node->obj)->right->num_tags;
   578:           break;
   579: 
   580:         case ADDTAGS_AFTER_UNION_LEFT:
   581:           DPRINT(("After union left\n"));
   582:           /* Lift the bottom of the `regset' array so that when processing
   583:              the right operand the items currently in the array are
   584:              invisible.  The original bottom was pushed by the UNION case of
   585:              ADDTAGS_RECURSE and is restored at ADDTAGS_AFTER_UNION_RIGHT. */
   586:           while (*regset >= 0)
   587:             regset++;
   588:           break;
   589: 
   590:         case ADDTAGS_AFTER_UNION_RIGHT:
   591:           {
   592:             int added_tags, tag_left, tag_right;
   593:             tre_ast_node_t *left = (tre_ast_node_t*)tre_stack_pop(stack);
   594:             tre_ast_node_t *right = (tre_ast_node_t*)tre_stack_pop(stack);
   595:             DPRINT(("After union right\n"));
   596:             node = (tre_ast_node_t*)tre_stack_pop(stack);
   597:             added_tags = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   598:             if (first_pass)
   599:               {
   600:                 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
   601:                   + ((tre_union_t *)node->obj)->right->num_tags + added_tags
   602:                   + ((node->num_submatches > 0) ? 2 : 0);
   603:               }
   604:             regset = (int*)tre_stack_pop(stack);
   605:             tag_left = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   606:             tag_right = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
   607: 
   608:             /* Add tags after both children, the left child gets a smaller
   609:                tag than the right child.  This guarantees that we prefer
   610:                the left child over the right child. */
   611:             /* XXX - This is not always necessary (if the children have
   612:                tags which must be seen for every match of that child). */
   613:             /* XXX - Check if this is the only place where tre_add_tag_right
   614:                is used.  If so, use tre_add_tag_left (putting the tag before
   615:                the child as opposed after the child) and throw away
   616:                tre_add_tag_right. */
   617:             if (node->num_submatches > 0)
   618:               {
   619:                 if (!first_pass)
   620:                   {
   621:                     status = tre_add_tag_right(mem, left, tag_left);
   622:                     tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE; /* the tag just added, not the running `tag' */
   623:                     if (status == REG_OK) status = tre_add_tag_right(mem, right, tag_right); /* keep the first error */
   624:                     tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
   625:                   }
   626:                 DPRINT(("  num_tags += 2\n"));
   627:                 num_tags += 2;
   628:               }
   629:             direction = TRE_TAG_MAXIMIZE;
   630:             break;
   631:           }
   632: 
   633:         default:
   634:           assert(0);
   635:           break;
   636: 
   637:         } /* end switch(symbol) */
   638:     } /* end while(tre_stack_num_objects(stack) > bottom) */
   639: 
   640:   if (!first_pass)
   641:     {
   642:       int i;
   643:       /* Whatever is still pending in the regset is bound to the final
   644:          tag, whose number equals num_tags (stored as end_tag below). */
   645:       for (i = 0; regset[i] >= 0; i++)
   646:         {
   647:           int id = regset[i] / 2;
   648:           int start = !(regset[i] % 2);
   649:           DPRINT(("  Using tag %d for %s offset of "
   650:                   "submatch %d\n", num_tags,
   651:                   start ? "start" : "end", id));
   652:           if (start)
   653:             tnfa->submatch_data[id].so_tag = num_tags;
   654:           else
   655:             tnfa->submatch_data[id].eo_tag = num_tags;
   656:         }
   657:     }
   658: 
   659:   if (!first_pass && minimal_tag >= 0)
   660:     {
   661:       int i;
   662:       DPRINT(("Minimal %d, %d\n", minimal_tag, tag));
   663:       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
   664:       tnfa->minimal_tags[i] = tag;
   665:       tnfa->minimal_tags[i + 1] = minimal_tag;
   666:       tnfa->minimal_tags[i + 2] = -1;
   667:       minimal_tag = -1;
   668:       num_minimals++;
   669:     }
   670: 
   671:   DPRINT(("tre_add_tags: %s complete.  Number of tags %d.\n",
   672:           first_pass? "First pass" : "Second pass", num_tags));
   673:   /* The counts only have to agree when the walk ran to completion. */
   674:   assert(status != REG_OK || tree->num_tags == num_tags);
   675:   tnfa->end_tag = num_tags;
   676:   tnfa->num_tags = num_tags;
   677:   tnfa->num_minimals = num_minimals;
   678:   xfree(orig_regset);
   679:   xfree(parents);
   680:   xfree(saved_states);
   681:   return status;
   682: }
   683: 
   684: 
   685: 
   686: /*
   687:   AST to TNFA compilation routines.
   688: */
   689: 
   690: typedef enum { /* Work items popped from the stack by the main loop of tre_copy_ast(). */
   691:   COPY_RECURSE,        /* Pop a node and copy it into the current result slot. */
   692:   COPY_SET_RESULT_PTR  /* Pop the address where the next copied node is to be stored. */
   693: } tre_copyast_symbol_t;
   694: 
   695: /* Flags for tre_copy_ast(); they are tested as bits of its `flags' argument. */
   696: #define COPY_REMOVE_TAGS         1  /* Copy TAG literals as EMPTY literals. */
   697: #define COPY_MAXIMIZE_FIRST_TAG  2  /* Set the direction of the first copied tag to TRE_TAG_MAXIMIZE. */
   698: /* Copies the tree `ast' into `*copy' iteratively, using `stack' for pending work instead of recursion. */
   699: static reg_errcode_t
   700: tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
   701:              int flags, int *pos_add, tre_tag_direction_t *tag_directions,
   702:              tre_ast_node_t **copy, int *max_pos)
   703: {
   704:   reg_errcode_t status = REG_OK;
   705:   int bottom = tre_stack_num_objects(stack);  /* stop once the stack is back at this depth */
   706:   int num_copied = 0;  /* literal positions consumed by this copy */
   707:   int first_tag = 1;
   708:   tre_ast_node_t **result = copy;  /* slot that receives the next node created */
   709:   tre_copyast_symbol_t symbol;
   710: 
   711:   STACK_PUSH(stack, ast);
   712:   STACK_PUSH(stack, COPY_RECURSE);
   713: 
   714:   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
   715:     {
   716:       tre_ast_node_t *node;
   717:       if (status != REG_OK)
   718:         break;
   719: 
   720:       symbol = (tre_copyast_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);  /* symbol first, then its payload */
   721:       switch (symbol)
   722:         {
   723:         case COPY_SET_RESULT_PTR:
   724:           result = (tre_ast_node_t**)tre_stack_pop(stack);
   725:           break;
   726:         case COPY_RECURSE:
   727:           node = (tre_ast_node_t*)tre_stack_pop(stack);
   728:           switch (node->type)
   729:             {
   730:             case LITERAL:
   731:               {
   732:                 tre_literal_t *lit = (tre_literal_t*)node->obj;
   733:                 int pos = lit->position;
   734:                 int min = lit->code_min;
   735:                 int max = lit->code_max;
   736:                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
   737:                   {
   738:                     /* XXX - e.g. [ab] has only one position but two
   739:                        nodes, so we are creating holes in the state space
   740:                        here.  Not fatal, just wastes memory. */
   741:                     pos += *pos_add;
   742:                     num_copied++;
   743:                   }
   744:                 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
   745:                   {
   746:                     /* Change this tag to empty. */
   747:                     min = EMPTY;
   748:                     max = pos = -1;
   749:                   }
   750:                 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
   751:                          && first_tag)
   752:                   {
   753:                     /* Maximize the first tag. */
   754:                     tag_directions[max] = TRE_TAG_MAXIMIZE;
   755:                     first_tag = 0;
   756:                   }
   757:                 *result = tre_ast_new_literal(mem, min, max, pos);
   758:                 if (*result == NULL)
   759:                   status = REG_ESPACE;
   760: 
   761:                 if (pos > *max_pos)
   762:                   *max_pos = pos;
   763:                 break;
   764:               }
   765:             case UNION:
   766:               {
   767:                 tre_union_t *uni = (tre_union_t*)node->obj;
   768:                 tre_union_t *copy;
   769:                 *result = tre_ast_new_union(mem, uni->left, uni->right);
   770:                 if (*result == NULL)
   771:                   {
   772:                     status = REG_ESPACE;
   773:                     break;
   774:                   }
   775:                 copy = (tre_union_t*)(*result)->obj;
   776:                 result = &copy->left;  /* the left child is copied next, into this slot */
   777:                 STACK_PUSHX(stack, uni->right);
   778:                 STACK_PUSHX(stack, COPY_RECURSE);
   779:                 STACK_PUSHX(stack, &copy->right);
   780:                 STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
   781:                 STACK_PUSHX(stack, uni->left);
   782:                 STACK_PUSHX(stack, COPY_RECURSE);
   783:                 break;
   784:               }
   785:             case CATENATION:
   786:               {
   787:                 tre_catenation_t *cat = (tre_catenation_t*)node->obj;
   788:                 tre_catenation_t *copy;
   789:                 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
   790:                 if (*result == NULL)
   791:                   {
   792:                     status = REG_ESPACE;
   793:                     break;
   794:                   }
   795:                 copy = (tre_catenation_t*)(*result)->obj;
   796:                 copy->left = NULL;
   797:                 copy->right = NULL;
   798:                 result = &copy->left;  /* the left child is copied next, into this slot */
   799: 
   800:                 STACK_PUSHX(stack, cat->right);
   801:                 STACK_PUSHX(stack, COPY_RECURSE);
   802:                 STACK_PUSHX(stack, &copy->right);
   803:                 STACK_PUSHX(stack, COPY_SET_RESULT_PTR);
   804:                 STACK_PUSHX(stack, cat->left);
   805:                 STACK_PUSHX(stack, COPY_RECURSE);
   806:                 break;
   807:               }
   808:             case ITERATION:
   809:               {
   810:                 tre_iteration_t *iter = (tre_iteration_t*)node->obj;
   811:                 STACK_PUSHX(stack, iter->arg);
   812:                 STACK_PUSHX(stack, COPY_RECURSE);
   813:                 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
   814:                                            iter->max, iter->minimal);
   815:                 if (*result == NULL)
   816:                   {
   817:                     status = REG_ESPACE;
   818:                     break;
   819:                   }
   820:                 iter = (tre_iteration_t*)(*result)->obj;  /* from here on `iter' is the new node */
   821:                 result = &iter->arg;
   822:                 break;
   823:               }
   824:             default:
   825:               assert(0);
   826:               break;
   827:             }
   828:           break;
   829:         }
   830:     }
   831:   *pos_add += num_copied;  /* report the positions used back to the caller */
   832:   return status;
   833: }
   834: 
   835: typedef enum {
   836:   EXPAND_RECURSE,     /* visit the node on the stack */
   837:   EXPAND_AFTER_ITER   /* the argument of an iteration node has been visited */
   838: } tre_expand_ast_symbol_t;
   839: 
   840: /* Expands each iteration node that has a finite nonzero minimum or maximum
   841:    iteration count to a catenated sequence of copies of the node.  Adds the positions created to `*position' and raises `*max_depth' when the parameter nesting count exceeds it. */
   842: static reg_errcode_t
   843: tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
   844:                int *position, tre_tag_direction_t *tag_directions,
   845:                int *max_depth)
   846: {
   847:   reg_errcode_t status = REG_OK;
   848:   int bottom = tre_stack_num_objects(stack);  /* stop once the stack is back at this depth */
   849:   int pos_add = 0;
   850:   int pos_add_total = 0;
   851:   int max_pos = 0;
   852:   /* Current approximate matching parameters. */
   853:   int params[TRE_PARAM_LAST];
   854:   /* Approximate parameter nesting level. */
   855:   int params_depth = 0;
   856:   int iter_depth = 0;
   857:   int i;
   858: 
   859:   for (i = 0; i < TRE_PARAM_LAST; i++)
   860:     params[i] = TRE_PARAM_DEFAULT;
   861: 
   862:   STACK_PUSHR(stack, ast);
   863:   STACK_PUSHR(stack, EXPAND_RECURSE);
   864:   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
   865:     {
   866:       tre_ast_node_t *node;
   867:       tre_expand_ast_symbol_t symbol;
   868: 
   869:       if (status != REG_OK)
   870:         break;
   871: 
   872:       DPRINT(("pos_add %d\n", pos_add));
   873: 
   874:       symbol = (tre_expand_ast_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
   875:       node = (tre_ast_node_t*)tre_stack_pop(stack);
   876:       switch (symbol)
   877:         {
   878:         case EXPAND_RECURSE:
   879:           switch (node->type)
   880:             {
   881:             case LITERAL:
   882:               {
   883:                 tre_literal_t *lit= (tre_literal_t*)node->obj;
   884:                 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
   885:                   {
   886:                     lit->position += pos_add;
   887:                     if (lit->position > max_pos)
   888:                       max_pos = lit->position;
   889:                   }
   890:                 break;
   891:               }
   892:             case UNION:
   893:               {
   894:                 tre_union_t *uni = (tre_union_t*)node->obj;
   895:                 STACK_PUSHX(stack, uni->right);
   896:                 STACK_PUSHX(stack, EXPAND_RECURSE);
   897:                 STACK_PUSHX(stack, uni->left);
   898:                 STACK_PUSHX(stack, EXPAND_RECURSE);
   899:                 break;
   900:               }
   901:             case CATENATION:
   902:               {
   903:                 tre_catenation_t *cat = (tre_catenation_t*)node->obj;
   904:                 STACK_PUSHX(stack, cat->right);
   905:                 STACK_PUSHX(stack, EXPAND_RECURSE);
   906:                 STACK_PUSHX(stack, cat->left);
   907:                 STACK_PUSHX(stack, EXPAND_RECURSE);
   908:                 break;
   909:               }
   910:             case ITERATION:
   911:               {
   912:                 tre_iteration_t *iter = (tre_iteration_t*)node->obj;
   913:                 STACK_PUSHX(stack, pos_add);  /* saved here, restored at EXPAND_AFTER_ITER */
   914:                 STACK_PUSHX(stack, node);
   915:                 STACK_PUSHX(stack, EXPAND_AFTER_ITER);
   916:                 STACK_PUSHX(stack, iter->arg);
   917:                 STACK_PUSHX(stack, EXPAND_RECURSE);
   918:                 /* If we are going to expand this node at EXPAND_AFTER_ITER
   919:                    then don't increase the `pos' fields of the nodes now, it
   920:                    will get done when expanding. */
   921:                 if (iter->min > 1 || iter->max > 1)
   922:                   pos_add = 0;
   923:                 iter_depth++;
   924:                 DPRINT(("iter\n"));
   925:                 break;
   926:               }
   927:             default:
   928:               assert(0);
   929:               break;
   930:             }
   931:           break;
   932:         case EXPAND_AFTER_ITER:
   933:           {
   934:             tre_iteration_t *iter = (tre_iteration_t*)node->obj;
   935:             int pos_add_last;
   936:             pos_add = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);  /* value saved when the iteration was entered */
   937:             pos_add_last = pos_add;
   938:             if (iter->min > 1 || iter->max > 1)
   939:               {
   940:                 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;  /* seq1: mandatory copies, seq2: optional tail */
   941:                 int i;
   942:                 int pos_add_save = pos_add;
   943: 
   944:                 /* Create a catenated sequence of copies of the node. */
   945:                 for (i = 0; i < iter->min; i++)
   946:                   {
   947:                     tre_ast_node_t *copy;
   948:                     /* Remove tags from all but the last copy. */
   949:                     int flags = ((i + 1 < iter->min)
   950:                                  ? COPY_REMOVE_TAGS
   951:                                  : COPY_MAXIMIZE_FIRST_TAG);
   952:                     DPRINT(("  pos_add %d\n", pos_add));
   953:                     pos_add_save = pos_add;
   954:                     status = tre_copy_ast(mem, stack, iter->arg, flags,
   955:                                           &pos_add, tag_directions, &copy,
   956:                                           &max_pos);
   957:                     if (status != REG_OK)
   958:                       return status;
   959:                     if (seq1 != NULL)
   960:                       seq1 = tre_ast_new_catenation(mem, seq1, copy);
   961:                     else
   962:                       seq1 = copy;
   963:                     if (seq1 == NULL)
   964:                       return REG_ESPACE;
   965:                   }
   966: 
   967:                 if (iter->max == -1)
   968:                   {
   969:                     /* No upper limit. */
   970:                     pos_add_save = pos_add;
   971:                     status = tre_copy_ast(mem, stack, iter->arg, 0,
   972:                                           &pos_add, NULL, &seq2, &max_pos);
   973:                     if (status != REG_OK)
   974:                       return status;
   975:                     seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
   976:                     if (seq2 == NULL)
   977:                       return REG_ESPACE;
   978:                   }
   979:                 else
   980:                   {
   981:                     for (i = iter->min; i < iter->max; i++)
   982:                       {
   983:                         tre_ast_node_t *tmp, *copy;
   984:                         pos_add_save = pos_add;
   985:                         status = tre_copy_ast(mem, stack, iter->arg, 0,
   986:                                               &pos_add, NULL, &copy, &max_pos);
   987:                         if (status != REG_OK)
   988:                           return status;
   989:                         if (seq2 != NULL)
   990:                           seq2 = tre_ast_new_catenation(mem, copy, seq2);
   991:                         else
   992:                           seq2 = copy;
   993:                         if (seq2 == NULL)
   994:                           return REG_ESPACE;
   995:                         tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
   996:                         if (tmp == NULL)
   997:                           return REG_ESPACE;
   998:                         seq2 = tre_ast_new_union(mem, tmp, seq2);  /* EMPTY | (copy seq2): each extra copy is optional */
   999:                         if (seq2 == NULL)
  1000:                           return REG_ESPACE;
  1001:                       }
  1002:                   }
  1003: 
  1004:                 pos_add = pos_add_save;
  1005:                 if (seq1 == NULL)
  1006:                   seq1 = seq2;
  1007:                 else if (seq2 != NULL)
  1008:                   seq1 = tre_ast_new_catenation(mem, seq1, seq2);
  1009:                 if (seq1 == NULL)
  1010:                   return REG_ESPACE;
  1011:                 node->obj = seq1->obj;  /* overwrite the iteration node in place with the expansion */
  1012:                 node->type = seq1->type;
  1013:               }
  1014: 
  1015:             iter_depth--;
  1016:             pos_add_total += pos_add - pos_add_last;
  1017:             if (iter_depth == 0)
  1018:               pos_add = pos_add_total;
  1019: 
  1020:             /* If approximate parameters are specified, surround the result
  1021:                with two parameter setting nodes.  The one on the left sets
  1022:                the specified parameters, and the one on the right restores
  1023:                the old parameters. */
  1024:             if (iter->params)
  1025:               {
  1026:                 tre_ast_node_t *tmp_l, *tmp_r, *tmp_node, *node_copy;
  1027:                 unsigned int *old_params;
  1028: 
  1029:                 tmp_l = tre_ast_new_literal(mem, PARAMETER, 0, -1);
  1030:                 if (!tmp_l)
  1031:                   return REG_ESPACE;
  1032:                 ((tre_literal_t *)tmp_l->obj)->u.params = iter->params;
  1033:                 iter->params[TRE_PARAM_DEPTH] = params_depth + 1;
  1034:                 tmp_r = tre_ast_new_literal(mem, PARAMETER, 0, -1);
  1035:                 if (!tmp_r)
  1036:                   return REG_ESPACE;
  1037:                 old_params = (unsigned int*)tre_mem_alloc(mem, sizeof(*old_params)
  1038:                                            * TRE_PARAM_LAST);
  1039:                 if (!old_params)
  1040:                   return REG_ESPACE;
  1041:                 for (i = 0; i < TRE_PARAM_LAST; i++)
  1042:                   old_params[i] = params[i];
  1043:                 ((tre_literal_t *)tmp_r->obj)->u.params = old_params;
  1044:                 old_params[TRE_PARAM_DEPTH] = params_depth;
  1045:                 /* XXX - this is the only place where ast_new_node is
  1046:                    needed -- should be moved inside AST module. */
  1047:                 node_copy = tre_ast_new_node(mem, ITERATION,
  1048:                                              sizeof(tre_iteration_t));
  1049:                 if (!node_copy)
  1050:                   return REG_ESPACE;
  1051:                 node_copy->obj = node->obj;
  1052:                 tmp_node = tre_ast_new_catenation(mem, tmp_l, node_copy);
  1053:                 if (!tmp_node)
  1054:                   return REG_ESPACE;
  1055:                 tmp_node = tre_ast_new_catenation(mem, tmp_node, tmp_r);
  1056:                 if (!tmp_node)
  1057:                   return REG_ESPACE;
  1058:                 /* Replace the contents of `node' with `tmp_node'. */
  1059:                 memcpy(node, tmp_node, sizeof(*node));
  1060:                 node->obj = tmp_node->obj;
  1061:                 node->type = tmp_node->type;
  1062:                 params_depth++;
  1063:                 if (params_depth > *max_depth)
  1064:                   *max_depth = params_depth;
  1065:               }
  1066:             break;
  1067:           }
  1068:         default:
  1069:           assert(0);
  1070:           break;
  1071:         }
  1072:     }
  1073: 
  1074:   *position += pos_add_total;
  1075: 
  1076:   /* `max_pos' should never be larger than `*position' if the above
  1077:      code works, but just an extra safeguard let's make sure
  1078:      `*position' is set large enough so enough memory will be
  1079:      allocated for the transition table. */
  1080:   if (max_pos > *position)
  1081:     *position = max_pos;
  1082: 
  1083: #ifdef TRE_DEBUG
  1084:   DPRINT(("Expanded AST:\n"));
  1085:   tre_ast_print(ast);
  1086:   DPRINT(("*position %d, max_pos %d\n", *position, max_pos));
  1087: #endif
  1088: 
  1089:   return status;
  1090: }
  1091: /* Returns a new empty position set allocated from `mem' (one terminator entry), or NULL if the allocation fails. */
  1092: static tre_pos_and_tags_t *
  1093: tre_set_empty(tre_mem_t mem)
  1094: {
  1095:   tre_pos_and_tags_t *new_set;
  1096: 
  1097:   new_set = (tre_pos_and_tags_t*)tre_mem_calloc(mem, sizeof(*new_set));
  1098:   if (new_set == NULL)
  1099:     return NULL;
  1100: 
  1101:   new_set[0].position = -1;  /* a negative position terminates a set */
  1102:   new_set[0].code_min = -1;
  1103:   new_set[0].code_max = -1;
  1104: 
  1105:   return new_set;
  1106: }
  1107: /* Returns a new position set allocated from `mem' holding one entry built from the arguments, followed by a terminator; NULL if the allocation fails. */
  1108: static tre_pos_and_tags_t *
  1109: tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
  1110:             tre_ctype_t klass, tre_ctype_t *neg_klasses, int backref)
  1111: {
  1112:   tre_pos_and_tags_t *new_set;
  1113: 
  1114:   new_set = (tre_pos_and_tags_t*)tre_mem_calloc(mem, sizeof(*new_set) * 2);  /* the entry plus its terminator */
  1115:   if (new_set == NULL)
  1116:     return NULL;
  1117: 
  1118:   new_set[0].position = position;
  1119:   new_set[0].code_min = code_min;
  1120:   new_set[0].code_max = code_max;
  1121:   new_set[0].klass = klass;
  1122:   new_set[0].neg_klasses = neg_klasses;
  1123:   new_set[0].backref = backref;
  1124:   new_set[1].position = -1;  /* terminator entry */
  1125:   new_set[1].code_min = -1;
  1126:   new_set[1].code_max = -1;
  1127: 
  1128:   return new_set;
  1129: }
  1130: /* Returns a new set with the entries of `set1' followed by those of `set2'.  `tags' (-1 terminated, may be NULL) and `assertions' are added to the `set1' entries only; `params', if non-NULL, is applied to all entries.  Returns NULL if an allocation fails. */
  1131: static tre_pos_and_tags_t *
  1132: tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
  1133:               int *tags, int assertions, int *params)
  1134: {
  1135:   int s1, s2, i, j;
  1136:   tre_pos_and_tags_t *new_set;
  1137:   int *new_tags;
  1138:   int num_tags;
  1139: 
  1140:   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);  /* count the entries of `tags' */
  1141:   for (s1 = 0; set1[s1].position >= 0; s1++);  /* count the entries of `set1' */
  1142:   for (s2 = 0; set2[s2].position >= 0; s2++);  /* count the entries of `set2' */
  1143:   new_set = (tre_pos_and_tags_t*)tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
  1144:   if (!new_set )
  1145:     return NULL;
  1146: 
  1147:   for (s1 = 0; set1[s1].position >= 0; s1++)
  1148:     {
  1149:       new_set[s1].position = set1[s1].position;
  1150:       new_set[s1].code_min = set1[s1].code_min;
  1151:       new_set[s1].code_max = set1[s1].code_max;
  1152:       new_set[s1].assertions = set1[s1].assertions | assertions;
  1153:       new_set[s1].klass = set1[s1].klass;
  1154:       new_set[s1].neg_klasses = set1[s1].neg_klasses;
  1155:       new_set[s1].backref = set1[s1].backref;
  1156:       if (set1[s1].tags == NULL && tags == NULL)
  1157:         new_set[s1].tags = NULL;
  1158:       else
  1159:         {
  1160:           for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);  /* i = number of tags already on the entry */
  1161:           new_tags = (int*)tre_mem_alloc(mem, (sizeof(*new_tags)
  1162:                                          * (i + num_tags + 1)));
  1163:           if (new_tags == NULL)
  1164:             return NULL;
  1165:           for (j = 0; j < i; j++)
  1166:             new_tags[j] = set1[s1].tags[j];
  1167:           for (i = 0; i < num_tags; i++)
  1168:             new_tags[j + i] = tags[i];
  1169:           new_tags[j + i] = -1;  /* terminate the merged tag list */
  1170:           new_set[s1].tags = new_tags;
  1171:         }
  1172:       if (set1[s1].params)
  1173:         new_set[s1].params = set1[s1].params;
  1174:       if (params)
  1175:         {
  1176:           if (!new_set[s1].params)
  1177:             new_set[s1].params = params;
  1178:           else
  1179:             {
  1180:               new_set[s1].params = (int*)tre_mem_alloc(mem, sizeof(*params) *
  1181:                                                  TRE_PARAM_LAST);
  1182:               if (!new_set[s1].params)
  1183:                 return NULL;
  1184:               for (i = 0; i < TRE_PARAM_LAST; i++)
  1185:                 if (params[i] != TRE_PARAM_UNSET)  /* NOTE(review): slots where params[i] is unset are not written here -- confirm what tre_mem_alloc() leaves in them */
  1186:                   new_set[s1].params[i] = params[i];
  1187:             }
  1188:         }
  1189:     }
  1190: 
  1191:   for (s2 = 0; set2[s2].position >= 0; s2++)
  1192:     {
  1193:       new_set[s1 + s2].position = set2[s2].position;
  1194:       new_set[s1 + s2].code_min = set2[s2].code_min;
  1195:       new_set[s1 + s2].code_max = set2[s2].code_max;
  1196:       /* XXX - why not | assertions here as well? */
  1197:       new_set[s1 + s2].assertions = set2[s2].assertions;
  1198:       new_set[s1 + s2].klass = set2[s2].klass;
  1199:       new_set[s1 + s2].neg_klasses = set2[s2].neg_klasses;
  1200:       new_set[s1 + s2].backref = set2[s2].backref;
  1201:       if (set2[s2].tags == NULL)
  1202:         new_set[s1 + s2].tags = NULL;
  1203:       else
  1204:         {
  1205:           for (i = 0; set2[s2].tags[i] >= 0; i++);
  1206:           new_tags = (int*)tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));  /* plain copy: `tags' is not appended for `set2' */
  1207:           if (new_tags == NULL)
  1208:             return NULL;
  1209:           for (j = 0; j < i; j++)
  1210:             new_tags[j] = set2[s2].tags[j];
  1211:           new_tags[j] = -1;
  1212:           new_set[s1 + s2].tags = new_tags;
  1213:         }
  1214:       if (set2[s2].params)
  1215:         new_set[s1 + s2].params = set2[s2].params;
  1216:       if (params)
  1217:         {
  1218:           if (!new_set[s1 + s2].params)
  1219:             new_set[s1 + s2].params = params;
  1220:           else
  1221:             {
  1222:               new_set[s1 + s2].params = (int*)tre_mem_alloc(mem, sizeof(*params) *
  1223:                                                       TRE_PARAM_LAST);
  1224:               if (!new_set[s1 + s2].params)
  1225:                 return NULL;
  1226:               for (i = 0; i < TRE_PARAM_LAST; i++)
  1227:                 if (params[i] != TRE_PARAM_UNSET)  /* NOTE(review): same partial write as in the `set1' loop -- confirm */
  1228:                   new_set[s1 + s2].params[i] = params[i];
  1229:             }
  1230:         }
  1231:     }
  1232:   new_set[s1 + s2].position = -1;  /* terminator entry */
  1233:   return new_set;
  1234: }
  1235: 
  1236: /* Finds the empty path through `node' which is the one that should be
  1237:    taken according to POSIX.2 rules, and adds the tags on that path to
  1238:    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
  1239:    set to the number of tags seen on the path.  `assertions', `params' and `params_seen' may also be NULL; when given they receive the assertion bits, the parameters, and a 0/1 flag for the path. */
  1240: static reg_errcode_t
  1241: tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
  1242:                 int *assertions, int *params, int *num_tags_seen,
  1243:                 int *params_seen)
  1244: {
  1245:   tre_literal_t *lit;
  1246:   tre_union_t *uni;
  1247:   tre_catenation_t *cat;
  1248:   tre_iteration_t *iter;
  1249:   int i;
  1250:   int bottom = tre_stack_num_objects(stack);  /* stop once the stack is back at this depth */
  1251:   reg_errcode_t status = REG_OK;
  1252:   if (num_tags_seen)
  1253:     *num_tags_seen = 0;
  1254:   if (params_seen)
  1255:     *params_seen = 0;
  1256: 
  1257:   status = tre_stack_push(stack, node);
  1258: 
  1259:   /* Walk through the tree recursively. */
  1260:   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
  1261:     {
  1262:       node = (tre_ast_node_t*)tre_stack_pop(stack);
  1263: 
  1264:       switch (node->type)
  1265:         {
  1266:         case LITERAL:
  1267:           lit = (tre_literal_t *)node->obj;
  1268:           switch (lit->code_min)
  1269:             {
  1270:             case TAG:
  1271:               if (lit->code_max >= 0)
  1272:                 {
  1273:                   if (tags != NULL)
  1274:                     {
  1275:                       /* Add the tag to `tags'. */
  1276:                       for (i = 0; tags[i] >= 0; i++)
  1277:                         if (tags[i] == lit->code_max)
  1278:                           break;
  1279:                       if (tags[i] < 0)
  1280:                         {
  1281:                           tags[i] = lit->code_max;
  1282:                           tags[i + 1] = -1;  /* keep the list -1 terminated; assumes the caller sized `tags' for this -- TODO confirm */
  1283:                         }
  1284:                     }
  1285:                   if (num_tags_seen)
  1286:                     (*num_tags_seen)++;
  1287:                 }
  1288:               break;
  1289:             case ASSERTION:
  1290:               assert(lit->code_max >= 1
  1291:                      || lit->code_max <= ASSERT_LAST);  /* NOTE(review): with `||' this holds for every value if ASSERT_LAST >= 0; `&&' may have been intended -- confirm */
  1292:               if (assertions != NULL)
  1293:                 *assertions |= lit->code_max;
  1294:               break;
  1295:             case PARAMETER:
  1296:               if (params != NULL)
  1297:                 for (i = 0; i < TRE_PARAM_LAST; i++)
  1298:                   params[i] = lit->u.params[i];
  1299:               if (params_seen != NULL)
  1300:                 *params_seen = 1;
  1301:               break;
  1302:             case EMPTY:
  1303:               break;
  1304:             default:
  1305:               assert(0);
  1306:               break;
  1307:             }
  1308:           break;
  1309: 
  1310:         case UNION:
  1311:           /* Subexpressions starting earlier take priority over ones
  1312:              starting later, so we prefer the left subexpression over the
  1313:              right subexpression. */
  1314:           uni = (tre_union_t *)node->obj;
  1315:           if (uni->left->nullable)
  1316:             STACK_PUSHX(stack, uni->left)
  1317:           else if (uni->right->nullable)
  1318:             STACK_PUSHX(stack, uni->right)
  1319:           else
  1320:             assert(0);
  1321:           break;
  1322: 
  1323:         case CATENATION:
  1324:           /* The path must go through both children. */
  1325:           cat = (tre_catenation_t *)node->obj;
  1326:           assert(cat->left->nullable);
  1327:           assert(cat->right->nullable);
  1328:           STACK_PUSHX(stack, cat->left);
  1329:           STACK_PUSHX(stack, cat->right);
  1330:           break;
  1331: 
  1332:         case ITERATION:
  1333:           /* A match with an empty string is preferred over no match at
  1334:              all, so we go through the argument if possible. */
  1335:           iter = (tre_iteration_t *)node->obj;
  1336:           if (iter->arg->nullable)
  1337:             STACK_PUSHX(stack, iter->arg);
  1338:           break;
  1339: 
  1340:         default:
  1341:           assert(0);
  1342:           break;
  1343:         }
  1344:     }
  1345: 
  1346:   return status;
  1347: }
  1348: 
  1349: 
  1350: typedef enum {
  1351:   NFL_RECURSE,            /* visit the node on the stack */
  1352:   NFL_POST_UNION,         /* both children of a union node have been visited */
  1353:   NFL_POST_CATENATION,    /* both children of a catenation node have been visited */
  1354:   NFL_POST_ITERATION      /* the argument of an iteration node has been visited */
  1355: } tre_nfl_stack_symbol_t;
  1356: 
  1357: 
  1358: /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
  1359:    the nodes of the AST `tree'. */
  1360: static reg_errcode_t
  1361: tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
  1362: {
  1363:   int bottom = tre_stack_num_objects(stack);
  1364: 
  1365:   STACK_PUSHR(stack, tree);
  1366:   STACK_PUSHR(stack, NFL_RECURSE);
  1367: 
  1368:   while (tre_stack_num_objects(stack) > bottom)
  1369:     {
  1370:       tre_nfl_stack_symbol_t symbol;
  1371:       tre_ast_node_t *node;
  1372: 
  1373:       symbol = (tre_nfl_stack_symbol_t) (FLX_RAWADDRESS)tre_stack_pop(stack);
  1374:       node = (tre_ast_node_t*)tre_stack_pop(stack);
  1375:       switch (symbol)
  1376:         {
  1377:         case NFL_RECURSE:
  1378:           switch (node->type)
  1379:             {
  1380:             case LITERAL:
  1381:               {
  1382:                 tre_literal_t *lit = (tre_literal_t *)node->obj;
  1383:                 if (IS_BACKREF(lit))
  1384:                   {
  1385:                     /* Back references: nullable = false, firstpos = {i},
  1386:                        lastpos = {i}. */
  1387:                     node->nullable = 0;
  1388:                     node->firstpos = tre_set_one(mem, lit->position, 0,
  1389:                                              TRE_CHAR_MAX, 0, NULL, -1);
  1390:                     if (!node->firstpos)
  1391:                       return REG_ESPACE;
  1392:                     node->lastpos = tre_set_one(mem, lit->position, 0,
  1393:                                                 TRE_CHAR_MAX, 0, NULL,
  1394:                                                 lit->code_max);
  1395:                     if (!node->lastpos)
  1396:                       return REG_ESPACE;
  1397:                   }
  1398:                 else if (lit->code_min < 0)
  1399:                   {
  1400:                     /* Tags, empty strings, params, and zero width assertions:
  1401:                        nullable = true, firstpos = {}, and lastpos = {}. */
  1402:                     node->nullable = 1;
  1403:                     node->firstpos = tre_set_empty(mem);
  1404:                     if (!node->firstpos)
  1405:                       return REG_ESPACE;
  1406:                     node->lastpos = tre_set_empty(mem);
  1407:                     if (!node->lastpos)
  1408:                       return REG_ESPACE;
  1409:                   }
  1410:                 else
  1411:                   {
  1412:                     /* Literal at position i: nullable = false, firstpos = {i},
  1413:                        lastpos = {i}. */
  1414:                     node->nullable = 0;
  1415:                     node->firstpos =
  1416:                       tre_set_one(mem, lit->position, lit->code_min,
  1417:                                   lit->code_max, 0, NULL, -1);
  1418:                     if (!node->firstpos)
  1419:                       return REG_ESPACE;
  1420:                     node->lastpos = tre_set_one(mem, lit->position,
  1421:                                                 lit->code_min, lit->code_max,
  1422:                                                 lit->u.klass, lit->neg_klasses,
  1423:                                                 -1);
  1424:                     if (!node->lastpos)
  1425:                       return REG_ESPACE;
  1426:                   }
  1427:                 break;
  1428:               }
  1429: 
  1430:             case UNION:
  1431:               /* Compute the attributes for the two subtrees, and after that
  1432:                  for this node. */
  1433:               STACK_PUSHR(stack, node);
  1434:               STACK_PUSHR(stack, NFL_POST_UNION);
  1435:               STACK_PUSHR(stack, ((tre_union_t *)node->obj)->right);
  1436:               STACK_PUSHR(stack, NFL_RECURSE);
  1437:               STACK_PUSHR(stack, ((tre_union_t *)node->obj)->left);
  1438:               STACK_PUSHR(stack, NFL_RECURSE);
  1439:               break;
  1440: 
  1441:             case CATENATION:
  1442:               /* Compute the attributes for the two subtrees, and after that
  1443:                  for this node. */
  1444:               STACK_PUSHR(stack, node);
  1445:               STACK_PUSHR(stack, NFL_POST_CATENATION);
  1446:               STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->right);
  1447:               STACK_PUSHR(stack, NFL_RECURSE);
  1448:               STACK_PUSHR(stack, ((tre_catenation_t *)node->obj)->left);
  1449:               STACK_PUSHR(stack, NFL_RECURSE);
  1450:               break;
  1451: 
  1452:             case ITERATION:
  1453:               /* Compute the attributes for the subtree, and after that for
  1454:                  this node. */
  1455:               STACK_PUSHR(stack, node);
  1456:               STACK_PUSHR(stack, NFL_POST_ITERATION);
  1457:               STACK_PUSHR(stack, ((tre_iteration_t *)node->obj)->arg);
  1458:               STACK_PUSHR(stack, NFL_RECURSE);
  1459:               break;
  1460:             }
  1461:           break; /* end case: NFL_RECURSE */
  1462: 
  1463:         case NFL_POST_UNION:
  1464:           {
  1465:             tre_union_t *uni = (tre_union_t *)node->obj;
  1466:             node->nullable = uni->left->nullable || uni->right->nullable;
  1467:             node->firstpos = tre_set_union(mem, uni->left->firstpos,
  1468:                                            uni->right->firstpos, NULL, 0, NULL);
  1469:             if (!node->firstpos)
  1470:               return REG_ESPACE;
  1471:             node->lastpos = tre_set_union(mem, uni->left->lastpos,
  1472:                                           uni->right->lastpos, NULL, 0, NULL);
  1473:             if (!node->lastpos)
  1474:               return REG_ESPACE;
  1475:             break;
  1476:           }
  1477: 
  1478:         case NFL_POST_ITERATION:
  1479:           {
  1480:             tre_iteration_t *iter = (tre_iteration_t *)node->obj;
  1481: 
  1482:             if (iter->min == 0 || iter->arg->nullable)
  1483:               node->nullable = 1;
  1484:             else
  1485:               node->nullable = 0;
  1486:             node->firstpos = iter->arg->firstpos;
  1487:             node->lastpos = iter->arg->lastpos;
  1488:             break;
  1489:           }
  1490: 
  1491:         case NFL_POST_CATENATION:
  1492:           {
  1493:             int num_tags, *tags, assertions, params_seen;
  1494:             int *params;
  1495:             reg_errcode_t status;
  1496:             tre_catenation_t *cat = (tre_catenation_t*)node->obj;
  1497:             node->nullable = cat->left->nullable && cat->right->nullable;
  1498: 
  1499:             /* Compute firstpos. */
  1500:             if (cat->left->nullable)
  1501:               {
  1502:                 /* The left side matches the empty string.  Make a first pass
  1503:                    with tre_match_empty() to get the number of tags and
  1504:                    parameters. */
  1505:                 status = tre_match_empty(stack, cat->left,
  1506:                                          NULL, NULL, NULL, &num_tags,
  1507:                                          &params_seen);
  1508:                 if (status != REG_OK)
  1509:                   return status;
  1510:                 /* Allocate arrays for the tags and parameters. */
  1511:                 tags = (int*)xmalloc(sizeof(*tags) * (num_tags + 1));
  1512:                 if (!tags)
  1513:                   return REG_ESPACE;
  1514:                 tags[0] = -1;
  1515:                 assertions = 0;
  1516:                 params = NULL;
  1517:                 if (params_seen)
  1518:                   {
  1519:                     params = (int*)tre_mem_alloc(mem, sizeof(*params)
  1520:                                            * TRE_PARAM_LAST);
  1521:                     if (!params)
  1522:                       {
  1523:                         xfree(tags);
  1524:                         return REG_ESPACE;
  1525:                       }
  1526:                   }
  1527:                 /* Second pass with tre_match_empty() to get the list of
  1528:                    tags and parameters. */
  1529:                 status = tre_match_empty(stack, cat->left, tags,
  1530:                                          &assertions, params, NULL, NULL);
  1531:                 if (status != REG_OK)
  1532:                   {
  1533:                     xfree(tags);
  1534:                     return status;
  1535:                   }
  1536:                 node->firstpos =
  1537:                   tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
  1538:                                 tags, assertions, params);
  1539:                 xfree(tags);
  1540:                 if (!node->firstpos)
  1541:                   return REG_ESPACE;
  1542:               }
  1543:             else
  1544:               {
  1545:                 node->firstpos = cat->left->firstpos;
  1546:               }
  1547: 
  1548:             /* Compute lastpos. */
  1549:             if (cat->right->nullable)
  1550:               {
  1551:                 /* The right side matches the empty string.  Make a first pass
  1552:                    with tre_match_empty() to get the number of tags and
  1553:                    parameters. */
  1554:                 status = tre_match_empty(stack, cat->right,
  1555:                                          NULL, NULL, NULL, &num_tags,
  1556:                                          &params_seen);
  1557:                 if (status != REG_OK)
  1558:                   return status;
  1559:                 /* Allocate arrays for the tags and parameters. */
  1560:                 tags = (int*)xmalloc(sizeof(int) * (num_tags + 1));
  1561:                 if (!tags)
  1562:                   return REG_ESPACE;
  1563:                 tags[0] = -1;
  1564:                 assertions = 0;
  1565:                 params = NULL;
  1566:                 if (params_seen)
  1567:                   {
  1568:                     params = (int*)tre_mem_alloc(mem, sizeof(*params)
  1569:                                            * TRE_PARAM_LAST);
  1570:                     if (!params)
  1571:                       {
  1572:                         xfree(tags);
  1573:                         return REG_ESPACE;
  1574:                       }
  1575:                   }
  1576:                 /* Second pass with tre_match_empty() to get the list of
  1577:                    tags and parameters. */
  1578:                 status = tre_match_empty(stack, cat->right, tags,
  1579:                                          &assertions, params, NULL, NULL);
  1580:                 if (status != REG_OK)
  1581:                   {
  1582:                     xfree(tags);
  1583:                     return status;
  1584:                   }
  1585:                 node->lastpos =
  1586:                   tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
  1587:                                 tags, assertions, params);
  1588:                 xfree(tags);
  1589:                 if (!node->lastpos)
  1590:                   return REG_ESPACE;
  1591:               }
  1592:             else
  1593:               {
  1594:                 node->lastpos = cat->right->lastpos;
  1595:               }
  1596:             break;
  1597:           }
  1598: 
  1599:         default:
  1600:           assert(0);
  1601:           break;
  1602:         }
  1603:     }
  1604: 
  1605:   return REG_OK;
  1606: }
  1607: /* Adds a transition from each position in `p1' to each position in `p2'. */
  1608: /* With `transitions' == NULL nothing is stored: `counts[p1->position]' is incremented once per `p2' entry so the caller can size the table. */
  1609: /* Both lists end at an entry whose `position' is negative.  Returns REG_ESPACE if an xmalloc() fails, REG_OK otherwise. */
  1610: static reg_errcode_t
  1611: tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
  1612:                tre_tnfa_transition_t *transitions,
  1613:                int *counts, int *offs)
  1614: {
  1615:   tre_pos_and_tags_t *orig_p2 = p2;
  1616:   tre_tnfa_transition_t *trans;
  1617:   int i, j, k, l, dup, prev_p2_pos;
  1618:   /* Fill mode: `offs[pos]' is the index in `transitions' of the first slot belonging to position `pos'. */
  1619:   if (transitions != NULL)
  1620:     while (p1->position >= 0)
  1621:       {
  1622:         p2 = orig_p2;
  1623:         prev_p2_pos = -1;
  1624:         while (p2->position >= 0)
  1625:           {
  1626:             /* Optimization: if this position was already handled, skip it. */
  1627:             if (p2->position == prev_p2_pos)
  1628:               {
  1629:                 p2++;
  1630:                 continue;
  1631:               }
  1632:             prev_p2_pos = p2->position;
  1633:             /* Set `trans' to point to the next unused transition from
  1634:                position `p1->position'. */
  1635:             trans = transitions + offs[p1->position];
  1636:             while (trans->state != NULL)
  1637:               {
  1638: #if 0
  1639:                 /* If we find a previous transition from `p1->position' to
  1640:                    `p2->position', it is overwritten.  This can happen only
  1641:                    if there are nested loops in the regexp, like in "((a)*)*".
  1642:                    In POSIX.2 repetition using the outer loop is always
  1643:                    preferred over using the inner loop.  Therefore the
  1644:                    transition for the inner loop is useless and can be thrown
  1645:                    away. */
  1646:                 /* XXX - The same position is used for all nodes in a bracket
  1647:                    expression, so this optimization cannot be used (it will
  1648:                    break bracket expressions) unless I figure out a way to
  1649:                    detect it here. */
  1650:                 if (trans->state_id == p2->position)
  1651:                   {
  1652:                     DPRINT(("*"));
  1653:                     break;
  1654:                   }
  1655: #endif
  1656:                 trans++;
  1657:               }
  1658:             /* `trans' is now the first slot with a NULL `state'; the slot after it becomes the new end-of-list marker. */
  1659:             if (trans->state == NULL)
  1660:               (trans + 1)->state = NULL;
  1661:             /* Use the character ranges, assertions, etc. from `p1' for
  1662:                the transition from `p1' to `p2'. */
  1663:             trans->code_min = p1->code_min;
  1664:             trans->code_max = p1->code_max;
  1665:             trans->state = transitions + offs[p2->position];
  1666:             trans->state_id = p2->position;
  1667:             trans->assertions = p1->assertions | p2->assertions
  1668:               | (p1->klass ? ASSERT_CHAR_CLASS : 0)
  1669:               | (p1->neg_klasses != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
  1670:             if (p1->backref >= 0)
  1671:               {
  1672:                 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
  1673:                 assert(p2->backref < 0);
  1674:                 trans->u.backref = p1->backref;
  1675:                 trans->assertions |= ASSERT_BACKREF;
  1676:               }
  1677:             else
  1678:               trans->u.klass = p1->klass;
  1679:             if (p1->neg_klasses != NULL)
  1680:               {
  1681:                 for (i = 0; p1->neg_klasses[i] != (tre_ctype_t)0; i++); /* count entries up to the zero terminator */
  1682:                 trans->neg_klasses =
  1683:                   (tre_ctype_t*)xmalloc(sizeof(*trans->neg_klasses) * (i + 1));
  1684:                 if (trans->neg_klasses == NULL)
  1685:                   return REG_ESPACE;
  1686:                 for (i = 0; p1->neg_klasses[i] != (tre_ctype_t)0; i++)
  1687:                   trans->neg_klasses[i] = p1->neg_klasses[i];
  1688:                 trans->neg_klasses[i] = (tre_ctype_t)0;
  1689:               }
  1690:             else
  1691:               trans->neg_klasses = NULL;
  1692: 
  1693:             /* Find out how many tags this transition has. */
  1694:             i = 0;
  1695:             if (p1->tags != NULL)
  1696:               while(p1->tags[i] >= 0)
  1697:                 i++;
  1698:             j = 0;
  1699:             if (p2->tags != NULL)
  1700:               while(p2->tags[j] >= 0)
  1701:                 j++;
  1702: 
  1703:             /* If we are overwriting a transition, free the old tag array. */
  1704:             if (trans->tags != NULL)
  1705:               xfree(trans->tags);
  1706:             trans->tags = NULL;
  1707: 
  1708:             /* If there were any tags, allocate an array and fill it. */
  1709:             if (i + j > 0)
  1710:               {
  1711:                 trans->tags = (int*)xmalloc(sizeof(*trans->tags) * (i + j + 1));
  1712:                 if (!trans->tags)
  1713:                   return REG_ESPACE;
  1714:                 i = 0;
  1715:                 if (p1->tags != NULL)
  1716:                   while(p1->tags[i] >= 0)
  1717:                     {
  1718:                       trans->tags[i] = p1->tags[i];
  1719:                       i++;
  1720:                     }
  1721:                 l = i; /* `l' is the write index for the tags taken from `p2' */
  1722:                 j = 0;
  1723:                 if (p2->tags != NULL)
  1724:                   while (p2->tags[j] >= 0)
  1725:                     {
  1726:                       /* Don't add duplicates. */
  1727:                       dup = 0;
  1728:                       for (k = 0; k < i; k++) /* compared against the `i' tags copied from `p1' only */
  1729:                         if (trans->tags[k] == p2->tags[j])
  1730:                           {
  1731:                             dup = 1;
  1732:                             break;
  1733:                           }
  1734:                       if (!dup)
  1735:                         trans->tags[l++] = p2->tags[j];
  1736:                       j++;
  1737:                     }
  1738:                 trans->tags[l] = -1;
  1739:               }
  1740: 
  1741:             /* Set the parameter array.  If both `p2' and `p1' have same
  1742:                parameters, the values in `p2' override those in `p1'. */
  1743:             if (p1->params || p2->params)
  1744:               {
  1745:                 if (!trans->params)
  1746:                   trans->params = (int*)xmalloc(sizeof(*trans->params)
  1747:                                           * TRE_PARAM_LAST);
  1748:                 if (!trans->params)
  1749:                   return REG_ESPACE;
  1750:                 for (i = 0; i < TRE_PARAM_LAST; i++)
  1751:                   {
  1752:                     trans->params[i] = TRE_PARAM_UNSET;
  1753:                     if (p1->params && p1->params[i] != TRE_PARAM_UNSET)
  1754:                       trans->params[i] = p1->params[i];
  1755:                     if (p2->params && p2->params[i] != TRE_PARAM_UNSET)
  1756:                       trans->params[i] = p2->params[i];
  1757:                   }
  1758:               }
  1759:             else
  1760:               {
  1761:                 if (trans->params)
  1762:                   xfree(trans->params);
  1763:                 trans->params = NULL;
  1764:               }
  1765: 
  1766: 
  1767: #ifdef TRE_DEBUG
  1768:             {
  1769:               int *tags;
  1770: 
  1771:               DPRINT(("  %2d -> %2d on %3d", p1->position, p2->position,
  1772:                       p1->code_min));
  1773:               if (p1->code_max != p1->code_min)
  1774:                 DPRINT(("-%3d", p1->code_max));
  1775:               tags = trans->tags;
  1776:               if (tags)
  1777:                 {
  1778:                   DPRINT((", tags ["));
  1779:                   while (*tags >= 0)
  1780:                     {
  1781:                       DPRINT(("%d", *tags));
  1782:                       tags++;
  1783:                       if (*tags >= 0)
  1784:                         DPRINT((","));
  1785:                     }
  1786:                   DPRINT(("]"));
  1787:                 }
  1788:               if (trans->assertions)
  1789:                 DPRINT((", assert %d", trans->assertions));
  1790:               if (trans->assertions & ASSERT_BACKREF)
  1791:                 DPRINT((", backref %d", trans->u.backref));
  1792:               else if (trans->u.klass)
  1793:                 DPRINT((", klass %ld", (long)trans->u.klass));
  1794:               if (trans->neg_klasses)
  1795:                 DPRINT((", neg_klasses %p", trans->neg_klasses));
  1796:               if (trans->params)
  1797:                 {
  1798:                   DPRINT((", "));
  1799:                   tre_print_params(trans->params);
  1800:                 }
  1801:               DPRINT(("\n"));
  1802:             }
  1803: #endif /* TRE_DEBUG */
  1804:             p2++;
  1805:           }
  1806:         p1++;
  1807:       }
  1808:   else
  1809:     /* Compute a maximum limit for the number of transitions leaving
  1810:        from each state. */
  1811:     while (p1->position >= 0)
  1812:       {
  1813:         p2 = orig_p2;
  1814:         while (p2->position >= 0)
  1815:           {
  1816:             counts[p1->position]++;
  1817:             p2++;
  1818:           }
  1819:         p1++;
  1820:       }
  1821:   return REG_OK;
  1822: }
  1823: 
  1824: /* Emits the TNFA transitions implied by the syntax tree rooted at `node'.
  1825:    All the transitions in the TNFA are labelled with one character range
  1826:    (there are no transitions on empty strings).  The TNFA takes O(n^2)
  1827:    space in the worst case, `n' is size of the regexp.
  1828: 
  1829:    `transitions', `counts' and `offs' are passed through unchanged to
  1830:    tre_make_trans().  The first error code reported by tre_make_trans()
  1831:    or by a recursive call is returned at once; otherwise REG_OK. */
  1832: static reg_errcode_t
  1833: tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
  1834:                 int *counts, int *offs)
  1835: {
  1836:   reg_errcode_t status = REG_OK;
  1837: 
  1838:   /* XXX - recurse using a stack!. */
  1839:   if (node->type == UNION)
  1840:     {
  1841:       /* Both alternatives are converted independently. */
  1842:       tre_union_t *alt = (tre_union_t *)node->obj;
  1843: 
  1844:       status = tre_ast_to_tnfa(alt->left, transitions, counts, offs);
  1845:       if (status == REG_OK)
  1846:         status = tre_ast_to_tnfa(alt->right, transitions, counts, offs);
  1847:     }
  1848:   else if (node->type == CATENATION)
  1849:     {
  1850:       /* Link every last position of the left operand to every first
  1851:          position of the right operand, then convert both operands. */
  1852:       tre_catenation_t *seq = (tre_catenation_t *)node->obj;
  1853: 
  1854:       status = tre_make_trans(seq->left->lastpos, seq->right->firstpos,
  1855:                               transitions, counts, offs);
  1856:       if (status == REG_OK)
  1857:         status = tre_ast_to_tnfa(seq->left, transitions, counts, offs);
  1858:       if (status == REG_OK)
  1859:         status = tre_ast_to_tnfa(seq->right, transitions, counts, offs);
  1860:     }
  1861:   else if (node->type == ITERATION)
  1862:     {
  1863:       tre_iteration_t *loop = (tre_iteration_t *)node->obj;
  1864: 
  1865:       /* Only iterations with `max' equal to -1 or to 1 are expected
  1866:          at this point; anything else trips the assertion. */
  1867:       assert(loop->max == -1 || loop->max == 1);
  1868:       if (loop->max == -1)
  1869:         {
  1870:           assert(loop->min == 0 || loop->min == 1);
  1871:           /* A loop with `max' == -1 gets an edge from each last position
  1872:              of its argument back to each first position. */
  1873:           status = tre_make_trans(loop->arg->lastpos, loop->arg->firstpos,
  1874:                                   transitions, counts, offs);
  1875:         }
  1876:       if (status == REG_OK)
  1877:         status = tre_ast_to_tnfa(loop->arg, transitions, counts, offs);
  1878:     }
  1879:   /* LITERAL nodes (and any other node type) add no transitions here. */
  1880: 
  1881:   return status;
  1882: }
  1883: /* Records `err' in the local variable `errcode' and jumps to the `error_exit' label; both must exist in the function that uses the macro. */
  1884: /* NOTE(review): the `if (1)' around the goto presumably silences unreachable-code warnings for the trailing `while (0)' -- confirm. */
  1885: #define ERROR_EXIT(err)           \
  1886:   do                              \
  1887:     {                             \
  1888:       errcode = err;              \
  1889:       if (1) goto error_exit;     \
  1890:     }                             \
  1891:  while (0)
  1892: /* Compiles the `n'-character regexp `regex' with flags `cflags' into a TNFA and stores it in preg->TRE_REGEX_T_FIELD. */
  1893: /* Returns REG_OK on success; on failure everything allocated here is released, the field is left NULL and the error code is returned. */
  1894: int
  1895: tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags)
  1896: {
  1897:   tre_stack_t *stack;
  1898:   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
  1899:   tre_pos_and_tags_t *p;
  1900:   int *counts = NULL, *offs = NULL;
  1901:   int i, add = 0;
  1902:   tre_tnfa_transition_t *transitions, *initial;
  1903:   tre_tnfa_t *tnfa = NULL;
  1904:   tre_submatch_data_t *submatch_data;
  1905:   tre_tag_direction_t *tag_directions = NULL;
  1906:   reg_errcode_t errcode;
  1907:   tre_mem_t mem;
  1908: 
  1909:   /* Parse context. */
  1910:   tre_parse_ctx_t parse_ctx;
  1911: 
  1912:   /* Allocate a stack used throughout the compilation process for various
  1913:      purposes. */
  1914:   stack = tre_stack_new(512, 10240, 128);
  1915:   if (!stack)
  1916:     return REG_ESPACE;
  1917:   /* Allocate a fast memory allocator. */
  1918:   mem = tre_mem_new();
  1919:   if (!mem)
  1920:     {
  1921:       tre_stack_destroy(stack);
  1922:       return REG_ESPACE;
  1923:     }
  1924: 
  1925:   /* Parse the regexp. */
  1926:   memset(&parse_ctx, 0, sizeof(parse_ctx));
  1927:   parse_ctx.mem = mem;
  1928:   parse_ctx.stack = stack;
  1929:   parse_ctx.re = regex;
  1930:   parse_ctx.len = n;
  1931:   parse_ctx.cflags = cflags;
  1932:   parse_ctx.max_backref = -1;
  1933:   DPRINT(("tre_compile: parsing '%.*" STRF "'\n", (int)n, regex)); /* `*' precision takes an int */
  1934:   errcode = tre_parse(&parse_ctx);
  1935:   if (errcode != REG_OK)
  1936:     ERROR_EXIT(errcode);
  1937:   preg->re_nsub = parse_ctx.submatch_id - 1;
  1938:   tree = parse_ctx.result;
  1939: 
  1940:   /* Back references and approximate matching cannot currently be used
  1941:      in the same regexp. */
  1942:   if (parse_ctx.max_backref >= 0 && parse_ctx.have_approx)
  1943:     ERROR_EXIT(REG_BADPAT);
  1944: 
  1945: #ifdef TRE_DEBUG
  1946:   tre_ast_print(tree);
  1947: #endif /* TRE_DEBUG */
  1948: 
  1949:   /* Referring to nonexistent subexpressions is illegal. */
  1950:   if (parse_ctx.max_backref > (int)preg->re_nsub)
  1951:     ERROR_EXIT(REG_ESUBREG);
  1952: 
  1953:   /* Allocate the TNFA struct. */
  1954:   tnfa = (tre_tnfa_t*)xcalloc(1, sizeof(tre_tnfa_t));
  1955:   if (tnfa == NULL)
  1956:     ERROR_EXIT(REG_ESPACE);
  1957:   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
  1958:   tnfa->have_approx = parse_ctx.have_approx;
  1959:   tnfa->num_submatches = parse_ctx.submatch_id;
  1960: 
  1961:   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
  1962:      regexp does not have back references, this can be skipped. */
  1963:   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
  1964:     {
  1965:       DPRINT(("tre_compile: setting up tags\n"));
  1966: 
  1967:       /* Figure out how many tags we will need. */
  1968:       errcode = tre_add_tags(NULL, stack, tree, tnfa);
  1969:       if (errcode != REG_OK)
  1970:         ERROR_EXIT(errcode);
  1971: #ifdef TRE_DEBUG
  1972:       tre_ast_print(tree);
  1973: #endif /* TRE_DEBUG */
  1974: 
  1975:       if (tnfa->num_tags > 0)
  1976:         {
  1977:           tag_directions = (tre_tag_direction_t*)xmalloc(sizeof(*tag_directions)
  1978:                                    * (tnfa->num_tags + 1));
  1979:           if (tag_directions == NULL)
  1980:             ERROR_EXIT(REG_ESPACE);
  1981:           tnfa->tag_directions = tag_directions;
  1982:           memset(tag_directions, -1,
  1983:                  sizeof(*tag_directions) * (tnfa->num_tags + 1));
  1984:         }
  1985:       tnfa->minimal_tags = (int*)xcalloc(tnfa->num_tags * 2 + 1,
  1986:                                    sizeof(*tnfa->minimal_tags)); /* size of one element, not of the pointer */
  1987:       if (tnfa->minimal_tags == NULL)
  1988:         ERROR_EXIT(REG_ESPACE);
  1989: 
  1990:       submatch_data = (tre_submatch_data_t*)xcalloc(parse_ctx.submatch_id, sizeof(*submatch_data));
  1991:       if (submatch_data == NULL)
  1992:         ERROR_EXIT(REG_ESPACE);
  1993:       tnfa->submatch_data = submatch_data;
  1994: 
  1995:       errcode = tre_add_tags(mem, stack, tree, tnfa);
  1996:       if (errcode != REG_OK)
  1997:         ERROR_EXIT(errcode);
  1998: 
  1999: #ifdef TRE_DEBUG
  2000:       for (i = 0; i < parse_ctx.submatch_id; i++)
  2001:         DPRINT(("pmatch[%d] = {t%d, t%d}\n",
  2002:                 i, submatch_data[i].so_tag, submatch_data[i].eo_tag));
  2003:       for (i = 0; i < tnfa->num_tags; i++)
  2004:         DPRINT(("t%d is %s\n", i,
  2005:                 tag_directions[i] == TRE_TAG_MINIMIZE ?
  2006:                 "minimized" : "maximized"));
  2007: #endif /* TRE_DEBUG */
  2008:     }
  2009: 
  2010:   /* Expand iteration nodes. */
  2011:   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
  2012:                            tag_directions, &tnfa->params_depth);
  2013:   if (errcode != REG_OK)
  2014:     ERROR_EXIT(errcode);
  2015: 
  2016:   /* Add a dummy node for the final state.
  2017:      XXX - For certain patterns this dummy node can be optimized away,
  2018:            for example "a*" or "ab*".   Figure out a simple way to detect
  2019:            this possibility. */
  2020:   tmp_ast_l = tree;
  2021:   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
  2022:   if (tmp_ast_r == NULL)
  2023:     ERROR_EXIT(REG_ESPACE);
  2024: 
  2025:   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
  2026:   if (tree == NULL)
  2027:     ERROR_EXIT(REG_ESPACE);
  2028: 
  2029: #ifdef TRE_DEBUG
  2030:   tre_ast_print(tree);
  2031:   DPRINT(("Number of states: %d\n", parse_ctx.position));
  2032: #endif /* TRE_DEBUG */
  2033: 
  2034:   errcode = tre_compute_nfl(mem, stack, tree);
  2035:   if (errcode != REG_OK)
  2036:     ERROR_EXIT(errcode);
  2037: 
  2038:   counts = (int*)xmalloc(sizeof(int) * parse_ctx.position);
  2039:   if (counts == NULL)
  2040:     ERROR_EXIT(REG_ESPACE);
  2041: 
  2042:   offs = (int*)xmalloc(sizeof(int) * parse_ctx.position);
  2043:   if (offs == NULL)
  2044:     ERROR_EXIT(REG_ESPACE);
  2045: 
  2046:   for (i = 0; i < parse_ctx.position; i++)
  2047:     counts[i] = 0;
  2048:   tre_ast_to_tnfa(tree, NULL, counts, NULL); /* counting pass: only counts[] is filled */
  2049: 
  2050:   add = 0;
  2051:   for (i = 0; i < parse_ctx.position; i++)
  2052:     {
  2053:       offs[i] = add;
  2054:       add += counts[i] + 1; /* one extra slot per state for the NULL terminator */
  2055:       counts[i] = 0;
  2056:     }
  2057:   transitions = (tre_tnfa_transition_t*)xcalloc(add + 1, sizeof(*transitions));
  2058:   if (transitions == NULL)
  2059:     ERROR_EXIT(REG_ESPACE);
  2060:   tnfa->transitions = transitions;
  2061:   tnfa->num_transitions = add;
  2062: 
  2063:   DPRINT(("Converting to TNFA:\n"));
  2064:   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
  2065:   if (errcode != REG_OK)
  2066:     ERROR_EXIT(errcode);
  2067: 
  2068:   /* If in eight bit mode, compute a table of characters that can be the
  2069:      first character of a match. */
  2070:   tnfa->first_char = -1;
  2071:   if (TRE_MB_CUR_MAX == 1 && !tmp_ast_l->nullable)
  2072:     {
  2073:       int count = 0;
  2074:       int k;
  2075:       DPRINT(("Characters that can start a match:"));
  2076:       tnfa->firstpos_chars = (char*)xcalloc(256, sizeof(char));
  2077:       if (tnfa->firstpos_chars == NULL)
  2078:         ERROR_EXIT(REG_ESPACE);
  2079:       for (p = tree->firstpos; p->position >= 0; p++)
  2080:         {
  2081:           tre_tnfa_transition_t *j = transitions + offs[p->position];
  2082:           while (j->state != NULL)
  2083:             {
  2084:               for (k = j->code_min; k <= j->code_max && k < 256; k++)
  2085:                 {
  2086:                   DPRINT((" %d", k));
  2087:                   tnfa->firstpos_chars[k] = 1;
  2088:                   count++;
  2089:                 }
  2090:               j++;
  2091:             }
  2092:         }
  2093:       DPRINT(("\n"));
  2094: #define TRE_OPTIMIZE_FIRST_CHAR 1
  2095: #if TRE_OPTIMIZE_FIRST_CHAR
  2096:       if (count == 1)
  2097:         {
  2098:           for (k = 0; k < 256; k++)
  2099:             if (tnfa->firstpos_chars[k])
  2100:               {
  2101:                 DPRINT(("first char must be %d\n", k));
  2102:                 tnfa->first_char = k;
  2103:                 xfree(tnfa->firstpos_chars);
  2104:                 tnfa->firstpos_chars = NULL;
  2105:                 break;
  2106:               }
  2107:         }
  2108: #endif
  2109: 
  2110:     }
  2111:   else
  2112:     tnfa->firstpos_chars = NULL;
  2113: 
  2114:   /* Count the initial positions so the `initial' array can be sized. */
  2115:   p = tree->firstpos;
  2116:   i = 0;
  2117:   while (p->position >= 0)
  2118:     {
  2119:       i++;
  2120: 
  2121: #ifdef TRE_DEBUG
  2122:       {
  2123:         int *tags;
  2124:         DPRINT(("initial: %d", p->position));
  2125:         tags = p->tags;
  2126:         if (tags != NULL)
  2127:           {
  2128:             if (*tags >= 0)
  2129:               DPRINT(("/"));
  2130:             while (*tags >= 0)
  2131:               {
  2132:                 DPRINT(("%d", *tags));
  2133:                 tags++;
  2134:                 if (*tags >= 0)
  2135:                   DPRINT((","));
  2136:               }
  2137:           }
  2138:         DPRINT((", assert %d", p->assertions));
  2139:         if (p->params)
  2140:           {
  2141:             DPRINT((", "));
  2142:             tre_print_params(p->params);
  2143:           }
  2144:         DPRINT(("\n"));
  2145:       }
  2146: #endif /* TRE_DEBUG */
  2147: 
  2148:       p++;
  2149:     }
  2150: 
  2151:   initial = (tre_tnfa_transition_t*)xcalloc(i + 1, sizeof(tre_tnfa_transition_t));
  2152:   if (initial == NULL)
  2153:     ERROR_EXIT(REG_ESPACE);
  2154:   tnfa->initial = initial;
  2155: 
  2156:   i = 0;
  2157:   for (p = tree->firstpos; p->position >= 0; p++)
  2158:     {
  2159:       initial[i].state = transitions + offs[p->position];
  2160:       initial[i].state_id = p->position;
  2161:       initial[i].tags = NULL;
  2162:       /* Copy the arrays p->tags, and p->params, they are allocated
  2163:          from a tre_mem object. */
  2164:       if (p->tags)
  2165:         {
  2166:           int j;
  2167:           for (j = 0; p->tags[j] >= 0; j++);
  2168:           initial[i].tags = (int*)xmalloc(sizeof(*p->tags) * (j + 1));
  2169:           if (!initial[i].tags)
  2170:             ERROR_EXIT(REG_ESPACE);
  2171:           memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
  2172:         }
  2173:       initial[i].params = NULL;
  2174:       if (p->params)
  2175:         {
  2176:           initial[i].params = (int*)xmalloc(sizeof(*p->params) * TRE_PARAM_LAST);
  2177:           if (!initial[i].params)
  2178:             ERROR_EXIT(REG_ESPACE);
  2179:           memcpy(initial[i].params, p->params,
  2180:                  sizeof(*p->params) * TRE_PARAM_LAST);
  2181:         }
  2182:       initial[i].assertions = p->assertions;
  2183:       i++;
  2184:     }
  2185:   initial[i].state = NULL;
  2186: 
  2187:   tnfa->num_transitions = add;
  2188:   tnfa->final = transitions + offs[tree->lastpos[0].position];
  2189:   tnfa->num_states = parse_ctx.position;
  2190:   tnfa->cflags = cflags;
  2191: 
  2192:   DPRINT(("final state %p\n", (void *)tnfa->final));
  2193: 
  2194:   tre_mem_destroy(mem);
  2195:   tre_stack_destroy(stack);
  2196:   xfree(counts);
  2197:   xfree(offs);
  2198: 
  2199:   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
  2200:   return REG_OK;
  2201: 
  2202:  error_exit: /* Free everything that was allocated and return the error code. */
  2203:   tre_mem_destroy(mem);
  2204:   if (stack != NULL)
  2205:     tre_stack_destroy(stack);
  2206:   if (counts != NULL)
  2207:     xfree(counts);
  2208:   if (offs != NULL)
  2209:     xfree(offs);
  2210:   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
  2211:   tre_free(preg);
  2212:   preg->TRE_REGEX_T_FIELD = NULL; /* no dangling pointer for a later tre_free() */
  2213:   return errcode;
  2214: }
  2215: 
  2216: 
  2217: /* Releases the TNFA stored in preg->TRE_REGEX_T_FIELD together with every array it owns. */
  2218: /* The field is reset to NULL afterwards, so a second tre_free() on the same `preg' returns immediately instead of freeing twice. */
  2219: void
  2220: tre_free(regex_t *preg)
  2221: {
  2222:   tre_tnfa_t *tnfa = (tre_tnfa_t*)preg->TRE_REGEX_T_FIELD;
  2223:   unsigned int i;
  2224:   tre_tnfa_transition_t *trans;
  2225: 
  2226:   if (!tnfa)
  2227:     return;
  2228: 
  2229:   for (i = 0; i < tnfa->num_transitions; i++)
  2230:     if (tnfa->transitions[i].state) /* slots with a NULL `state' are skipped */
  2231:       {
  2232:         if (tnfa->transitions[i].tags)
  2233:           xfree(tnfa->transitions[i].tags);
  2234:         if (tnfa->transitions[i].neg_klasses)
  2235:           xfree(tnfa->transitions[i].neg_klasses);
  2236:         if (tnfa->transitions[i].params)
  2237:           xfree(tnfa->transitions[i].params);
  2238:       }
  2239:   if (tnfa->transitions)
  2240:     xfree(tnfa->transitions);
  2241: 
  2242:   if (tnfa->initial)
  2243:     {
  2244:       for (trans = tnfa->initial; trans->state; trans++) /* list ends at a NULL `state' */
  2245:         {
  2246:           if (trans->tags)
  2247:             xfree(trans->tags);
  2248:           if (trans->params)
  2249:             xfree(trans->params);
  2250:         }
  2251:       xfree(tnfa->initial);
  2252:     }
  2253: 
  2254:   if (tnfa->submatch_data)
  2255:     {
  2256:       for (i = 0; i < tnfa->num_submatches; i++)
  2257:         if (tnfa->submatch_data[i].parents)
  2258:           xfree(tnfa->submatch_data[i].parents);
  2259:       xfree(tnfa->submatch_data);
  2260:     }
  2261: 
  2262:   if (tnfa->tag_directions)
  2263:     xfree(tnfa->tag_directions);
  2264:   if (tnfa->firstpos_chars)
  2265:     xfree(tnfa->firstpos_chars);
  2266:   if (tnfa->minimal_tags)
  2267:     xfree(tnfa->minimal_tags);
  2268:   xfree(tnfa);
  2269:   preg->TRE_REGEX_T_FIELD = NULL; /* the TNFA is gone; do not leave a dangling pointer */
  2270: }
  2271: /* Returns the banner "TRE <version> (GPL)"; it is formatted on the first call only. */
  2272: char *
  2273: tre_version(void)
  2274: {
  2275:   static char banner[256];
  2276:   char *ver;
  2277: 
  2278:   /* A non-empty buffer means an earlier call already built the text. */
  2279:   if (banner[0] != 0)
  2280:     return banner;
  2281:   tre_config(TRE_CONFIG_VERSION, &ver);
  2282:   sprintf(banner, "TRE %s (GPL)", ver);
  2283:   return banner;
  2284: }
  2285: /* Answers the build-configuration query `query' through `result'; returns REG_OK, or REG_NOMATCH for an unknown query. */
  2286: int
  2287: tre_config(int query, void *result)
  2288: {
  2289:   int *int_result = (int*)result;
  2290:   char **string_result = (char**)result;
  2291:   /* Feature queries store 0/1 through an int *; TRE_CONFIG_VERSION stores a char *. */
  2292:   switch (query)
  2293:     {
  2294:     case TRE_CONFIG_APPROX:
  2295: #ifdef TRE_APPROX
  2296:       *int_result = 1;
  2297: #else /* !TRE_APPROX */
  2298:       *int_result = 0;
  2299: #endif /* !TRE_APPROX */
  2300:       return REG_OK;
  2301: 
  2302:     case TRE_CONFIG_WCHAR:
  2303: #ifdef TRE_WCHAR
  2304:       *int_result = 1;
  2305: #else /* !TRE_WCHAR */
  2306:       *int_result = 0;
  2307: #endif /* !TRE_WCHAR */
  2308:       return REG_OK;
  2309: 
  2310:     case TRE_CONFIG_MULTIBYTE:
  2311: #ifdef TRE_MULTIBYTE
  2312:       *int_result = 1;
  2313: #else /* !TRE_MULTIBYTE */
  2314:       *int_result = 0;
  2315: #endif /* !TRE_MULTIBYTE */
  2316:       return REG_OK;
  2317:     /* TRE_CONFIG_SYSTEM_ABI is the query id, not a feature macro; the build option is TRE_USE_SYSTEM_REGEX_H. */
  2318:     case TRE_CONFIG_SYSTEM_ABI:
  2319: #ifdef TRE_USE_SYSTEM_REGEX_H
  2320:       *int_result = 1;
  2321: #else /* !TRE_USE_SYSTEM_REGEX_H */
  2322:       *int_result = 0;
  2323: #endif /* !TRE_USE_SYSTEM_REGEX_H */
  2324:       return REG_OK;
  2325: 
  2326:     case TRE_CONFIG_VERSION:
  2327:       *string_result = TRE_VERSION;
  2328:       return REG_OK;
  2329:     }
  2330: 
  2331:   return REG_NOMATCH;
  2332: }
  2333: 
  2334: 
  2335: /* EOF */
End cpp section to tre/tre_compile.cpp[1]
Start cpp section to tre/tre_compile.hpp[1 /1 ]
     1: #line 4075 "./lpsrc/tre.pak"
     2: /*
     3:   tre-compile.h: Regex compilation definitions
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: */
    20: 
    21: 
    22: #ifndef TRE_COMPILE_H
    23: #define TRE_COMPILE_H 1
    24: /* Per-position record.  NOTE(review): the field notes below are inferred from the like-named members of struct tnfa_transition -- confirm against tre_compile.cpp. */
    25: typedef struct {
    26:   int position;
    27:   int code_min;              /* presumably the low end of the accepted character range */
    28:   int code_max;              /* presumably the high end of the accepted character range */
    29:   int *tags;                 /* presumably a tag list, as in tnfa_transition.tags */
    30:   int assertions;            /* presumably an ASSERT_* bitmap */
    31:   tre_ctype_t klass;         /* tre_ctype_t must already be declared by the includer */
    32:   tre_ctype_t *neg_klasses;
    33:   int backref;
    34:   int *params;
    35: } tre_pos_and_tags_t;
    36: 
    37: #endif /* TRE_COMPILE_H */
    38: 
    39: /* EOF */
End cpp section to tre/tre_compile.hpp[1]
Start cpp section to tre/tre_config.hpp[1 /1 ]
     1: #line 4115 "./lpsrc/tre.pak"
     2: 
     3: /* RF: need this for TRE_EXTERN */
     4: #include "flx_target_tre_config.hpp"
     5: 
     6: /* lib/tre-config.h.  Generated by configure.  */
     7: /* tre-config.h.in.  This file has all definitions that are needed in
     8:    `regex.h'.  Note that this file must contain only the bare minimum
     9:    of definitions without the TRE_ prefix to avoid conflicts between
    10:    definitions here and definitions included from somewhere else. */
    11: 
    12: /* Define to 1 if you have the <libutf8.h> header file. */
    13: /* #undef HAVE_LIBUTF8_H */
    14: 
    15: /* Define to 1 if the system has the type `reg_errcode_t'. */
    16: /* #undef HAVE_REG_ERRCODE_T */
    17: 
    18: /* Define to 1 if you have the <sys/types.h> header file. */
    19: #define HAVE_SYS_TYPES_H 1
    20: 
    21: /* Define to 1 if you have the <wchar.h> header file. */
    22: //#define HAVE_WCHAR_H 0
    23: 
    24: /* Define if you want to enable approximate matching functionality. */
    25: #define TRE_APPROX 1
    26: /* Define to enable multibyte character set support.  Keep it undefined to
    27:    disable: it is tested with #ifdef, so a value of 0 would still enable it. */
    28: /* #undef TRE_MULTIBYTE */
    29: 
    30: /* Define to the absolute path to the system regex.h */
    31: /* #undef TRE_SYSTEM_REGEX_H_PATH */
    32: 
    33: /* Define to include the system regex.h from TRE regex.h */
    34: /* #undef TRE_USE_SYSTEM_REGEX_H */
    35: 
    36: /* Define to enable wide character (wchar_t) support. */
    37: //#define TRE_WCHAR 0
    38: 
    39: /* TRE version string. */
    40: #define TRE_VERSION "0.7.2"
    41: 
    42: /* TRE version level 1. */
    43: #define TRE_VERSION_1 0
    44: 
    45: /* TRE version level 2. */
    46: #define TRE_VERSION_2 7
    47: 
    48: /* TRE version level 3. */
    49: #define TRE_VERSION_3 2
End cpp section to tre/tre_config.hpp[1]
Start cpp section to tre/tre_filter.cpp[1 /1 ]
     1: #line 4165 "./lpsrc/tre.pak"
     2: /*
     3:   tre-filter.c: Histogram filter to quickly find regexp match candidates
     4: 
     5:   Copyright (C) 2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: /* The idea of this filter is quite simple.  First, let's assume the
    23:    search pattern is a simple string.  In order for a substring of a
    24:    longer string to match the search pattern, it must have the same
    25:    numbers of different characters as the pattern, and those
    26:    characters must occur in the same order as they occur in pattern. */
    27: 
    28: #include "flx_target_tre_config.hpp"
    29: #include <stdio.h>
    30: #include "tre_internal.hpp"
    31: #include "tre_filter.hpp"
    32: /* Slides a `filter->window_len' wide window over `str' (`len' bytes); returns the offset where the character counts first satisfy `filter->profile', or -1. */
    33: int
    34: tre_filter_find(const unsigned char *str, size_t len, tre_filter_t *filter)
    35: {
    36:   unsigned short counts[256];
    37:   unsigned int i;
    38:   unsigned int window_len = filter->window_len;
    39:   tre_filter_profile_t *profile = filter->profile;
    40:   const unsigned char *str_orig = str;
    41: 
    42:   DPRINT(("tre_filter_find: %.*s\n", (int)len, str));
    43: 
    44:   for (i = 0; i < elementsof(counts); i++)
    45:     counts[i] = 0;
    46:   /* Preload the window.  This may stop before it is full: at a NUL, or
    47:      because `i' grows while `len' shrinks. */
    48:   while (*str && i < window_len && i < len)
    49:     {
    50:       counts[*str]++;
    51:       i++;
    52:       str++;
    53:       len--;
    54:     }
    55:   /* Advance one character at a time, testing the profile after each step. */
    56:   while (len > 0)
    57:     {
    58:       tre_filter_profile_t *p;
    59:       counts[*str]++;
    60:       if ((size_t)(str - str_orig) >= window_len)  /* never read before `str_orig' */
    61:         counts[*(str - window_len)]--;
    62:       p = profile;
    63:       while (p->ch)
    64:         {
    65:           if (counts[p->ch] < p->count)
    66:             break;
    67:           p++;
    68:         }
    69:       if (!p->ch)
    70:         {
    71:           DPRINT(("Found possible match at %d\n",
    72:                   (int)(str - str_orig)));
    73:           return (int)(str - str_orig);
    74:         }
    75:       else
    76:         {
    77:           DPRINT(("No match so far...\n"));
    78:         }
    79:       len--;
    80:       str++;
    81:     }
    82:   DPRINT(("This string cannot match.\n"));
    83:   return -1;
    84: }
End cpp section to tre/tre_filter.cpp[1]
Start cpp section to tre/tre_filter.hpp[1 /1 ]
     1: #line 4250 "./lpsrc/tre.pak"
     2: 
     3: 
     4: 
     5: 
     6: typedef struct {
     7:   unsigned char ch;
     8:   unsigned char count;
     9: } tre_filter_profile_t;
    10: 
    11: typedef struct {
    12:   /* Length of the window where the character counts are kept. */
    13:   int window_len;
    14:   /* Required character counts table. */
    15:   tre_filter_profile_t *profile;
    16: } tre_filter_t;
    17: 
    18: 
    19: int
    20: tre_filter_find(const unsigned char *str, size_t len, tre_filter_t *filter);
End cpp section to tre/tre_filter.hpp[1]
Start cpp section to tre/tre_internal.hpp[1 /1 ]
     1: #line 4271 "./lpsrc/tre.pak"
     2: /*
     3:   tre-internal.h - TRE internal definitions
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #ifndef TRE_INTERNAL_H
    23: #define TRE_INTERNAL_H 1
    24: 
    25: #ifdef HAVE_WCHAR_H
    26: #include <wchar.h>
    27: #endif /* HAVE_WCHAR_H */
    28: 
    29: #ifdef HAVE_WCTYPE_H
    30: #include <wctype.h>
    31: #endif /* HAVE_WCTYPE_H */
    32: 
    33: #include <ctype.h>
    34: #include "tre_regex.hpp"
    35: /* DPRINT((fmt, ...)) -- note the doubled parentheses -- prints with printf and flushes stdout when TRE_DEBUG is defined; otherwise it expands to an empty statement. */
    36: #ifdef TRE_DEBUG
    37: #include <stdio.h>
    38: #define DPRINT(msg) do {printf msg; fflush(stdout);} while(0)
    39: #else /* !TRE_DEBUG */
    40: #define DPRINT(msg) do { } while(0)
    41: #endif /* !TRE_DEBUG */
    42: /* Number of elements in the array `x'; the argument is parenthesized so any array expression works. */
    43: #define elementsof(x)   ( sizeof(x) / sizeof((x)[0]) )
    44: /* tre_mbrtowc() maps to mbrtowc(), or to mbtowc() (dropping the state argument) when only that exists; it stays undefined if neither is available. */
    45: #ifdef HAVE_MBRTOWC
    46: #define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
    47: #else /* !HAVE_MBRTOWC */
    48: #ifdef HAVE_MBTOWC
    49: #define tre_mbrtowc(pwc, s, n, ps) (mbtowc((pwc), (s), (n)))
    50: #endif /* HAVE_MBTOWC */
    51: #endif /* !HAVE_MBRTOWC */
    52: /* TRE_MBSTATE is defined only when TRE_MULTIBYTE and HAVE_MBSTATE_T are both defined. */
    53: #ifdef TRE_MULTIBYTE
    54: #ifdef HAVE_MBSTATE_T
    55: #define TRE_MBSTATE
    56: #endif /* HAVE_MBSTATE_T */
    57: #endif /* TRE_MULTIBYTE */
    58: 
    59: /* Define the character types and functions. */
    60: #ifdef TRE_WCHAR
    61: /* TRE_WCHAR build: each tre_* name below aliases the wide-character routine. */
    62: /* Wide characters. */
    63: typedef wint_t tre_cint_t;
    64: #define TRE_CHAR_MAX WCHAR_MAX
    65: /* TRE_MB_CUR_MAX is MB_CUR_MAX with multibyte support, otherwise 1. */
    66: #ifdef TRE_MULTIBYTE
    67: #define TRE_MB_CUR_MAX MB_CUR_MAX
    68: #else /* !TRE_MULTIBYTE */
    69: #define TRE_MB_CUR_MAX 1
    70: #endif /* !TRE_MULTIBYTE */
    71: /* tre_isblank is only defined when HAVE_ISWBLANK is. */
    72: #define tre_isalnum iswalnum
    73: #define tre_isalpha iswalpha
    74: #ifdef HAVE_ISWBLANK
    75: #define tre_isblank iswblank
    76: #endif /* HAVE_ISWBLANK */
    77: #define tre_iscntrl iswcntrl
    78: #define tre_isdigit iswdigit
    79: #define tre_isgraph iswgraph
    80: #define tre_islower iswlower
    81: #define tre_isprint iswprint
    82: #define tre_ispunct iswpunct
    83: #define tre_isspace iswspace
    84: #define tre_isupper iswupper
    85: #define tre_isxdigit iswxdigit
    86: 
    87: #define tre_tolower towlower
    88: #define tre_toupper towupper
    89: #define tre_strlen  wcslen
    90: 
    91: #else /* !TRE_WCHAR */
    92: 
    93: /* 8 bit characters. */
    94: typedef short tre_cint_t;
    95: #define TRE_CHAR_MAX 255
    96: #define TRE_MB_CUR_MAX 1
    97: /* tre_isascii and tre_isblank exist only when HAVE_ISASCII / HAVE_ISBLANK are defined. */
    98: #define tre_isalnum isalnum
    99: #define tre_isalpha isalpha
   100: #ifdef HAVE_ISASCII
   101: #define tre_isascii isascii
   102: #endif /* HAVE_ISASCII */
   103: #ifdef HAVE_ISBLANK
   104: #define tre_isblank isblank
   105: #endif /* HAVE_ISBLANK */
   106: #define tre_iscntrl iscntrl
   107: #define tre_isdigit isdigit
   108: #define tre_isgraph isgraph
   109: #define tre_islower islower
   110: #define tre_isprint isprint
   111: #define tre_ispunct ispunct
   112: #define tre_isspace isspace
   113: #define tre_isupper isupper
   114: #define tre_isxdigit isxdigit
   115: /* Case conversion results are cast back to tre_cint_t (a short in this branch). */
   116: #define tre_tolower(c) (tre_cint_t)(tolower(c))
   117: #define tre_toupper(c) (tre_cint_t)(toupper(c))
   118: #define tre_strlen  strlen
   119: 
   120: #endif /* !TRE_WCHAR */
   121: /* Character classes are either the system wctype_t or, failing that, a plain predicate function pointer. */
   122: #if defined(TRE_WCHAR) && defined(HAVE_ISWCTYPE) && defined(HAVE_WCTYPE)
   123: #define TRE_USE_SYSTEM_WCTYPE 1
   124: #endif
   125: 
   126: #ifdef TRE_USE_SYSTEM_WCTYPE
   127: /* Use system provided iswctype() and wctype(). */
   128: typedef wctype_t tre_ctype_t;
   129: #define tre_isctype iswctype
   130: #define tre_ctype   wctype
   131: #else /* !TRE_USE_SYSTEM_WCTYPE */
   132: /* Define our own versions of iswctype() and wctype(). */
   133: typedef int (*tre_ctype_t)(tre_cint_t);   /* a class is the predicate itself */
   134: #define tre_isctype(c, type) ( (type)(c) )  /* so the test simply calls it */
   135: tre_ctype_t tre_ctype(const char *name);
   136: #endif /* !TRE_USE_SYSTEM_WCTYPE */
   137: /* Kind of input string handed to the tre_tnfa_run_* matchers through their `type' argument. */
   138: typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
   139: /* The address is converted to size_t rather than long, so it is not truncated where long is narrower than a pointer. */
   140: /* Returns number of bytes to add to (char *)ptr to make it
   141:    properly aligned for the type. */
   142: #define ALIGN(ptr, type) \
   143:   ((((size_t)(ptr)) % sizeof(type)) \
   144:    ? (sizeof(type) - (((size_t)(ptr)) % sizeof(type))) \
   145:    : 0)
   146: /* MAX/MIN replace any earlier definitions.  Both evaluate an argument twice, so avoid side effects. */
   147: #undef MAX
   148: #undef MIN
   149: #define MAX(a, b) (((a) >= (b)) ? (a) : (b))
   150: #define MIN(a, b) (((a) <= (b)) ? (a) : (b))
   151: 
   152: /* Define STRF to the correct printf formatter for strings. */
   153: #ifdef TRE_WCHAR
   154: #define STRF "ls"
   155: #else /* !TRE_WCHAR */
   156: #define STRF "s"
   157: #endif /* !TRE_WCHAR */
   158: 
   159: /* TNFA transition type. A TNFA state is an array of transitions,
   160:    the terminator is a transition with NULL `state'. */
   161: typedef struct tnfa_transition tre_tnfa_transition_t;
   162: 
   163: struct tnfa_transition {
   164:   /* Range of accepted characters. */
   165:   tre_cint_t code_min;
   166:   tre_cint_t code_max;
   167:   /* Pointer to the destination state. */
   168:   tre_tnfa_transition_t *state;
   169:   /* ID number of the destination state (indexes the matcher's reach tables). */
   170:   int state_id;
   171:   /* -1 terminated array of tags (or NULL).  Released by tre_free(). */
   172:   int *tags;
   173:   /* Matching parameters settings (or NULL).  Released by tre_free(). */
   174:   int *params;
   175:   /* Assertion bitmap. */
   176:   int assertions;
   177:   /* Assertion parameters. */
   178:   union {
   179:     /* Character klass assertion. */
   180:     tre_ctype_t klass;
   181:     /* Back reference assertion. */
   182:     int backref;
   183:   } u;
   184:   /* Negative character klass assertions.  Released by tre_free(). */
   185:   tre_ctype_t *neg_klasses;
   186: };
   187: 
   188: 
   189: /* Assertions.  Each value is a distinct bit, so they can be OR-ed together. */
   190: #define ASSERT_AT_BOL             1   /* Beginning of line. */
   191: #define ASSERT_AT_EOL             2   /* End of line. */
   192: #define ASSERT_CHAR_CLASS         4   /* Character klass in `klass'. */
   193: #define ASSERT_CHAR_CLASS_NEG     8   /* Character klasses in `neg_klasses'. */
   194: #define ASSERT_AT_BOW            16   /* Beginning of word. */
   195: #define ASSERT_AT_EOW            32   /* End of word. */
   196: #define ASSERT_AT_WB             64   /* Word boundary. */
   197: #define ASSERT_AT_WB_NEG        128   /* Not a word boundary. */
   198: #define ASSERT_BACKREF          256   /* A back reference in `backref'. */
   199: #define ASSERT_LAST             256   /* Highest assertion bit (same value as ASSERT_BACKREF). */
   200: 
   201: /* Tag directions. */
   202: typedef enum {
   203:   TRE_TAG_MINIMIZE = 0,
   204:   TRE_TAG_MAXIMIZE = 1
   205: } tre_tag_direction_t;
   206: 
   207: /* Parameters that can be changed dynamically while matching. */
   208: typedef enum {
   209:   TRE_PARAM_COST_INS        = 0,   /* slot applied to params.cost_ins */
   210:   TRE_PARAM_COST_DEL        = 1,   /* slot applied to params.cost_del */
   211:   TRE_PARAM_COST_SUBST      = 2,   /* slot applied to params.cost_subst */
   212:   TRE_PARAM_COST_MAX        = 3,   /* slot applied to params.max_cost */
   213:   TRE_PARAM_MAX_INS         = 4,   /* slot applied to params.max_ins */
   214:   TRE_PARAM_MAX_DEL         = 5,   /* slot applied to params.max_del */
   215:   TRE_PARAM_MAX_SUBST       = 6,   /* slot applied to params.max_subst */
   216:   TRE_PARAM_MAX_ERR         = 7,   /* slot applied to params.max_err */
   217:   TRE_PARAM_DEPTH           = 8,   /* slot holding the parameter nesting depth */
   218:   TRE_PARAM_LAST            = 9    /* one past the last slot index */
   219: } tre_param_t;
   220: 
   221: /* Unset matching parameter */
   222: #define TRE_PARAM_UNSET -1
   223: 
   224: /* Signifies the default matching parameter value. */
   225: #define TRE_PARAM_DEFAULT -2
   226: 
   227: /* Instructions to compute submatch register values from tag values
   228:    after a successful match.  */
   229: struct tre_submatch_data {
   230:   /* Tag that gives the value for rm_so (submatch start offset). */
   231:   int so_tag;
   232:   /* Tag that gives the value for rm_eo (submatch end offset). */
   233:   int eo_tag;
   234:   /* List of submatches this submatch is contained in (may be NULL; released by tre_free()). */
   235:   int *parents;
   236: };
   237: 
   238: typedef struct tre_submatch_data tre_submatch_data_t;
   239: 
   240: 
   241: /* TNFA definition. */
   242: typedef struct tnfa tre_tnfa_t;
   243: 
   244: struct tnfa {
   245:   tre_tnfa_transition_t *transitions;   /* transition table; released by tre_free() */
   246:   unsigned int num_transitions;         /* number of slots in `transitions' */
   247:   tre_tnfa_transition_t *initial;       /* initial transitions, ended by a NULL `state' entry */
   248:   tre_tnfa_transition_t *final;
   249:   tre_submatch_data_t *submatch_data;   /* `num_submatches' entries */
   250:   char *firstpos_chars;                 /* released by tre_free() */
   251:   int first_char;
   252:   unsigned int num_submatches;
   253:   tre_tag_direction_t *tag_directions;  /* released by tre_free() */
   254:   int *minimal_tags;                    /* released by tre_free() */
   255:   int num_tags;                         /* sizes the per-state tag arrays in the matchers */
   256:   int num_minimals;
   257:   int end_tag;
   258:   int num_states;                       /* sizes the matchers' reach tables */
   259:   int cflags;                           /* compilation flags, e.g. REG_NEWLINE */
   260:   int have_backrefs;
   261:   int have_approx;
   262:   int params_depth;
   263: };
   264: 
   265: int
   266: tre_compile(regex_t *preg, const tre_char_t *regex, size_t n, int cflags);
   267: 
   268: void
   269: tre_free(regex_t *preg);
   270: 
   271: void
   272: tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
   273:                 const tre_tnfa_t *tnfa, int *tags, int match_eo);
   274: 
   275: reg_errcode_t
   276: tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
   277:                       tre_str_type_t type, int *match_tags, int eflags,
   278:                       int *match_end_ofs);
   279: 
   280: reg_errcode_t
   281: tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
   282:                       tre_str_type_t type, int *match_tags, int eflags,
   283:                       int *match_end_ofs);
   284: 
   285: reg_errcode_t
   286: tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
   287:                        int len, tre_str_type_t type, int *match_tags,
   288:                        int eflags, int *match_end_ofs);
   289: 
   290: #ifdef TRE_APPROX
   291: reg_errcode_t
   292: tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
   293:                     tre_str_type_t type, int *match_tags,
   294:                     regamatch_t *match, regaparams_t params,
   295:                     int eflags, int *match_end_ofs);
   296: #endif /* TRE_APPROX */
   297: 
   298: #endif /* TRE_INTERNAL_H */
   299: 
   300: /* EOF */
End cpp section to tre/tre_internal.hpp[1]
Start cpp section to tre/tre_match-approx.cpp[1 /1 ]
     1: #line 4572 "./lpsrc/tre.pak"
     2: /*
     3:   tre-match-approx.c - TRE approximate regex matching engine
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #include "flx_target_tre_config.hpp"
    23: 
    24: /* AIX requires this to be the first thing in the file.  */
    25: #ifdef TRE_USE_ALLOCA
    26: #ifndef __GNUC__
    27: # if HAVE_ALLOCA_H
    28: #  include <alloca.h>
    29: # else
    30: #  ifdef _AIX
    31:  #pragma alloca
    32: #  else
    33: #   ifndef alloca /* predefined by HP cc +Olibcalls */
    34: char *alloca ();
    35: #   endif
    36: #  endif
    37: # endif
    38: #endif
    39: #endif /* TRE_USE_ALLOCA */
    40: 
    41: #define __USE_STRING_INLINES
    42: #undef __NO_INLINE__
    43: 
    44: #include <assert.h>
    45: #include <stdlib.h>
    46: #include <string.h>
    47: #include <limits.h>
    48: #ifdef HAVE_WCHAR_H
    49: #include <wchar.h>
    50: #endif /* HAVE_WCHAR_H */
    51: #ifdef HAVE_WCTYPE_H
    52: #include <wctype.h>
    53: #endif /* HAVE_WCTYPE_H */
    54: #ifndef TRE_WCHAR
    55: #include <ctype.h>
    56: #endif /* !TRE_WCHAR */
    57: #ifdef HAVE_MALLOC_H
    58: #include <malloc.h>
    59: #endif /* HAVE_MALLOC_H */
    60: 
    61: #include "tre_internal.hpp"
    62: #include "tre_match-utils.hpp"
    63: #include "tre_regex.hpp"
    64: #include "tre_xmalloc.hpp"
    65: /* Column indices into one row of costs (`costs[depth][TRE_M_*]' and `match_costs[TRE_M_*]'). */
    66: #define TRE_M_COST      0
    67: #define TRE_M_NUM_INS   1
    68: #define TRE_M_NUM_DEL   2
    69: #define TRE_M_NUM_SUBST 3
    70: #define TRE_M_NUM_ERR   4
    71: #define TRE_M_LAST      5   /* number of columns */
    72: /* Highest parameter nesting depth; a reach entry keeps TRE_M_MAX_DEPTH + 1 cost rows. */
    73: #define TRE_M_MAX_DEPTH 3
    74: 
    75: typedef struct {
    76:   /* State in the TNFA transition table. */
    77:   tre_tnfa_transition_t *state;
    78:   /* Position in input string. */
    79:   int pos;
    80:   /* Tag values. */
    81:   int *tags;
    82:   /* Matching parameters. */
    83:   regaparams_t params;
    84:   /* Nesting depth of parameters.  This is used as an index in
    85:      the `costs' array. */
    86:   int depth;
    87:   /* Costs and counter values for different parameter nesting depths. */
    88:   int costs[TRE_M_MAX_DEPTH + 1][TRE_M_LAST];
    89: } tre_tnfa_approx_reach_t;
    90: 
    91: 
    92: #ifdef TRE_DEBUG
    93: /* Debug aid: dumps through DPRINT every entry of `reach' whose `pos' is at
    94:    least `pos' -- its cost rows for depths 0..depth, then `num_tags' tags. */
    95: static void
    96: tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_approx_reach_t *reach,
    97:                 int pos, int num_tags)
    98: {
    99:   int state;
   100:   DPRINT(("  reach:\n"));
   101:   for (state = 0; state < tnfa->num_states; state++)
   102:     {
   103:       const tre_tnfa_approx_reach_t *entry = &reach[state];
   104:       int level, col, tag;
   105:       if (entry->pos < pos)
   106:         continue;  /* Not reached. */
   107:       DPRINT(("  %03d, costs ", state));
   108:       for (level = 0; level <= entry->depth; level++)
   109:         {
   110:           if (level > 0)  /* separator goes before every row but the first */
   111:             DPRINT((", "));
   112:           DPRINT(("["));
   113:           for (col = 0; col < TRE_M_LAST; col++)
   114:             {
   115:               if (col > 0)
   116:                 DPRINT((","));
   117:               DPRINT(("%2d", entry->costs[level][col]));
   118:             }
   119:           DPRINT(("]"));
   120:         }
   121:       DPRINT(("\n       tags "));
   122:       for (tag = 0; tag < num_tags; tag++)
   123:         {
   124:           if (tag > 0)
   125:             DPRINT((","));
   126:           DPRINT(("%02d", entry->tags[tag]));
   127:         }
   128:       DPRINT(("\n"));
   129:     }
   130:   DPRINT(("\n"));
   131: }
   132: #endif /* TRE_DEBUG */
   133: 
   134: 
   135: /* Applies the parameter array `pa' (indexed by tre_param_t) to `reach'.  For
   136:    each field TRE_PARAM_DEFAULT copies the value from `default_params',
   137:    TRE_PARAM_UNSET keeps the current value, and anything else is stored as is. */
   138: inline static void
   139: tre_set_params(tre_tnfa_approx_reach_t *reach,
   140:                int *pa, regaparams_t default_params)
   141: {
   142:   int value;
   143: 
   144:   /* If depth is increased reset costs and counters to zero for the
   145:      new levels. */
   146:   value = pa[TRE_PARAM_DEPTH];
   147:   assert(value <= TRE_M_MAX_DEPTH);
   148:   if (value > reach->depth)
   149:     {
   150:       int i, j;
   151:       for (i = reach->depth + 1; i <= value; i++)
   152:         for (j = 0; j < TRE_M_LAST; j++)
   153:           reach->costs[i][j] = 0;
   154:     }
   155:   reach->depth = value;
   156: 
   157:   /* Set insert cost. */
   158:   value = pa[TRE_PARAM_COST_INS];
   159:   if (value == TRE_PARAM_DEFAULT)
   160:     reach->params.cost_ins = default_params.cost_ins;
   161:   else if (value != TRE_PARAM_UNSET)
   162:     reach->params.cost_ins = value;
   163: 
   164:   /* Set delete cost. */
   165:   value = pa[TRE_PARAM_COST_DEL];
   166:   if (value == TRE_PARAM_DEFAULT)
   167:     reach->params.cost_del = default_params.cost_del;
   168:   else if (value != TRE_PARAM_UNSET)
   169:     reach->params.cost_del = value;
   170: 
   171:   /* Set substitute cost.  As for every other field, an unset value must not overwrite it. */
   172:   value = pa[TRE_PARAM_COST_SUBST];
   173:   if (value == TRE_PARAM_DEFAULT)
   174:     reach->params.cost_subst = default_params.cost_subst;
   175:   else if (value != TRE_PARAM_UNSET)
   176:     reach->params.cost_subst = value;
   177: 
   178:   /* Set maximum cost. */
   179:   value = pa[TRE_PARAM_COST_MAX];
   180:   if (value == TRE_PARAM_DEFAULT)
   181:     reach->params.max_cost = default_params.max_cost;
   182:   else if (value != TRE_PARAM_UNSET)
   183:     reach->params.max_cost = value;
   184: 
   185:   /* Set maximum inserts. */
   186:   value = pa[TRE_PARAM_MAX_INS];
   187:   if (value == TRE_PARAM_DEFAULT)
   188:     reach->params.max_ins = default_params.max_ins;
   189:   else if (value != TRE_PARAM_UNSET)
   190:     reach->params.max_ins = value;
   191: 
   192:   /* Set maximum deletes. */
   193:   value = pa[TRE_PARAM_MAX_DEL];
   194:   if (value == TRE_PARAM_DEFAULT)
   195:     reach->params.max_del = default_params.max_del;
   196:   else if (value != TRE_PARAM_UNSET)
   197:     reach->params.max_del = value;
   198: 
   199:   /* Set maximum substitutes. */
   200:   value = pa[TRE_PARAM_MAX_SUBST];
   201:   if (value == TRE_PARAM_DEFAULT)
   202:     reach->params.max_subst = default_params.max_subst;
   203:   else if (value != TRE_PARAM_UNSET)
   204:     reach->params.max_subst = value;
   205: 
   206:   /* Set maximum number of errors. */
   207:   value = pa[TRE_PARAM_MAX_ERR];
   208:   if (value == TRE_PARAM_DEFAULT)
   209:     reach->params.max_err = default_params.max_err;
   210:   else if (value != TRE_PARAM_UNSET)
   211:     reach->params.max_err = value;
   212: }
   213: 
   214: 
   215: reg_errcode_t
   216: tre_tnfa_run_approx(const tre_tnfa_t *tnfa, const void *string, int len,
   217:                     tre_str_type_t type, int *match_tags,
   218:                     regamatch_t *match, regaparams_t default_params,
   219:                     int eflags, int *match_end_ofs)
   220: {
   221:   /* State variables required by GET_NEXT_WCHAR. */
   222:   tre_char_t prev_c = 0, next_c = 0;
   223:   const char *str_byte = (const char*)string;
   224:   int pos = -1;
   225:   unsigned int pos_add_next = 1;
   226: #ifdef TRE_WCHAR
   227:   const wchar_t *str_wide = (const wchar_t*)string;
   228: #ifdef TRE_MBSTATE
   229:   mbstate_t mbstate;
   230: #endif /* !TRE_WCHAR */
   231: #endif /* TRE_WCHAR */
   232:   int reg_notbol = eflags & REG_NOTBOL;
   233:   int reg_noteol = eflags & REG_NOTEOL;
   234:   int reg_newline = tnfa->cflags & REG_NEWLINE;
   235:   int str_user_end = 0;
   236: 
   237:   int prev_pos;
   238: 
   239:   /* Compilation flags for this regexp. */
   240:   int cflags = tnfa->cflags;
   241: 
   242:   /* Number of tags. */
   243:   int num_tags;
   244:   /* The reach tables. */
   245:   tre_tnfa_approx_reach_t *reach, *reach_next;
   246:   /* Tag array for temporary use. */
   247:   int *tmp_tags;
   248: 
   249:   /* End offset of best match so far, or -1 if no match found yet. */
   250:   int match_eo = -1;
   251:   /* Costs of the match. */
   252:   int match_costs[TRE_M_LAST];
   253: 
   254:   /* Space for temporary data required for matching. */
   255:   unsigned char *buf;
   256: 
   257:   int i, id;
   258: 
   259:   if (!match_tags)
   260:     num_tags = 0;
   261:   else
   262:     num_tags = tnfa->num_tags;
   263: 
   264: #ifdef TRE_MBSTATE
   265:   memset(&mbstate, '\0', sizeof(mbstate));
   266: #endif /* TRE_MBSTATE */
   267: 
   268:   DPRINT(("tre_tnfa_run_approx, input type %d, len %d, eflags %d, "
   269:           "match_tags %p\n",
   270:           type, len, eflags,
   271:           match_tags));
   272:   DPRINT(("max cost %d, ins %d, del %d, subst %d\n",
   273:           default_params.max_cost,
   274:           default_params.cost_ins,
   275:           default_params.cost_del,
   276:           default_params.cost_subst));
   277: 
   278:   /* Allocate memory for temporary data required for matching.  This needs to
   279:      be done for every matching operation to be thread safe.  This allocates
   280:      everything in a single large block from the stack frame using alloca()
   281:      or with malloc() if alloca is unavailable. */
   282:   {
   283:     unsigned char *buf_cursor;
   284:     /* Space needed for one array of tags. */
   285:     int tag_bytes = sizeof(*tmp_tags) * num_tags;
   286:     /* Space needed for one reach table. */
   287:     int reach_bytes = sizeof(*reach_next) * tnfa->num_states;
   288:     /* Total space needed. */
   289:     int total_bytes = reach_bytes * 2 + (tnfa->num_states * 2 + 1 ) * tag_bytes;
   290:     /* Add some extra to make sure we can align the pointers.  The multiplier
   291:        used here must be equal to the number of ALIGN calls below. */
   292:     total_bytes += (sizeof(long) - 1) * 3;
   293: 
   294:     /* Allocate the memory. */
   295: #ifdef TRE_USE_ALLOCA
   296:     buf = (unsigned char*)alloca(total_bytes);
   297: #else /* !TRE_USE_ALLOCA */
   298:     buf = (unsigned char*)xmalloc(total_bytes);
   299: #endif /* !TRE_USE_ALLOCA */
   300:     if (!buf)
   301:       return REG_ESPACE;
   302:     memset(buf, 0, total_bytes);
   303: 
   304:     /* Allocate `tmp_tags' from `buf'. */
   305:     tmp_tags = (int*)(void *)buf;
   306:     buf_cursor = buf + tag_bytes;
   307:     buf_cursor += ALIGN(buf_cursor, long);
   308: 
   309:     /* Allocate `reach' from `buf'. */
   310:     reach = (tre_tnfa_approx_reach_t*)(void *)buf_cursor;
   311:     buf_cursor += reach_bytes;
   312:     buf_cursor += ALIGN(buf_cursor, long);
   313: 
   314:     /* Allocate `reach_next' from `buf'. */
   315:     reach_next = (tre_tnfa_approx_reach_t*)(void *)buf_cursor;
   316:     buf_cursor += reach_bytes;
   317:     buf_cursor += ALIGN(buf_cursor, long);
   318: 
   319:     /* Allocate tag arrays for `reach' and `reach_next' from `buf'. */
   320:     for (i = 0; i < tnfa->num_states; i++)
   321:       {
   322:         reach[i].tags = (int*)(void *)buf_cursor;
   323:         buf_cursor += tag_bytes;
   324:         reach_next[i].tags = (int*)(void *)buf_cursor;
   325:         buf_cursor += tag_bytes;
   326:       }
   327:     assert(buf_cursor <= buf + total_bytes);
   328:   }
   329: 
   330:   for (i = 0; i < TRE_M_LAST; i++)
   331:     match_costs[i] = INT_MAX;
   332: 
   333:   /* Mark the reach arrays empty. */
   334:   for (i = 0; i < tnfa->num_states; i++)
   335:     reach[i].pos = reach_next[i].pos = -2;
   336: 
   337:   prev_pos = pos;
   338:   GET_NEXT_WCHAR();
   339:   pos = 0;
   340: 
   341:   while (1)
   342:     {
   343:       DPRINT(("%03d:%2lc/%05d\n", pos, (tre_cint_t)next_c, (int)next_c));
   344: 
   345:       /* Add initial states to `reach_next' if an exact match has not yet
   346:          been found. */
   347:       if (match_costs[TRE_M_COST] > 0)
   348:         {
   349:           tre_tnfa_transition_t *trans;
   350:           DPRINT(("  init"));
   351:           for (trans = tnfa->initial; trans->state; trans++)
   352:             {
   353:               int id = trans->state_id;
   354: 
   355:               /* If this state is not currently in `reach_next', add it
   356:                  there. */
   357:               if (reach_next[id].pos < pos)
   358:                 {
   359:                   if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
   360:                     {
   361:                       /* Assertions failed, don't add this state. */
   362:                       DPRINT((" !%d (assert)", id));
   363:                       continue;
   364:                     }
   365:                   DPRINT((" %d", id));
   366:                   reach_next[id].state = trans->state;
   367:                   reach_next[id].pos = pos;
   368: 
   369:                   /* Compute tag values after this transition. */
   370:                   for (i = 0; i < num_tags; i++)
   371:                     reach_next[id].tags[i] = -1;
   372: 
   373:                   if (trans->tags)
   374:                     for (i = 0; trans->tags[i] >= 0; i++)
   375:                       if (trans->tags[i] < num_tags)
   376:                         reach_next[id].tags[trans->tags[i]] = pos;
   377: 
   378:                   /* Set the parameters, depth, and costs. */
   379:                   reach_next[id].params = default_params;
   380:                   reach_next[id].depth = 0;
   381:                   for (i = 0; i < TRE_M_LAST; i++)
   382:                     reach_next[id].costs[0][i] = 0;
   383:                   if (trans->params)
   384:                     tre_set_params(&reach_next[id], trans->params,
   385:                                    default_params);
   386: 
   387:                   /* If this is the final state, mark the exact match. */
   388:                   if (trans->state == tnfa->final)
   389:                     {
   390:                       match_eo = pos;
   391:                       for (i = 0; i < num_tags; i++)
   392:                         match_tags[i] = reach_next[id].tags[i];
   393:                       for (i = 0; i < TRE_M_LAST; i++)
   394:                         match_costs[i] = 0;
   395:                     }
   396:                 }
   397:             }
   398:             DPRINT(("\n"));
   399:         }
   400: 
   401: 
   402:       /* Handle inserts.  This is done by pretending there's an epsilon
   403:          transition from each state in `reach' back to the same state.
   404:          We don't need to worry about the final state here; this will never
   405:          give a better match than what we already have. */
   406:       for (id = 0; id < tnfa->num_states; id++)
   407:         {
   408:           int depth;
   409:           int cost, cost0;
   410: 
   411:           if (reach[id].pos != prev_pos)
   412:             {
   413:               DPRINT(("  insert: %d not reached\n", id));
   414:               continue;  /* Not reached. */
   415:             }
   416: 
   417:           depth = reach[id].depth;
   418: 
   419:           /* Compute and check cost at current depth. */
   420:           cost = reach[id].costs[depth][TRE_M_COST];
   421:           if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
   422:             cost += reach[id].params.cost_ins;
   423:           if (cost > reach[id].params.max_cost)
   424:             continue;  /* Cost too large. */
   425: 
   426:           /* Check number of inserts at current depth. */
   427:           if (reach[id].costs[depth][TRE_M_NUM_INS] + 1
   428:               > reach[id].params.max_ins)
   429:             continue;  /* Too many inserts. */
   430: 
   431:           /* Check total number of errors at current depth. */
   432:           if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
   433:               > reach[id].params.max_err)
   434:             continue;  /* Too many errors. */
   435: 
   436:           /* Compute overall cost. */
   437:           cost0 = cost;
   438:           if (depth > 0)
   439:             {
   440:               cost0 = reach[id].costs[0][TRE_M_COST];
   441:               if (reach[id].params.cost_ins != TRE_PARAM_UNSET)
   442:                 cost0 += reach[id].params.cost_ins;
   443:               else
   444:                 cost0 += default_params.cost_ins;
   445:             }
   446: 
   447:           DPRINT(("  insert: from %d to %d, cost %d: ", id, id,
   448:                   reach[id].costs[depth][TRE_M_COST]));
   449:           if (reach_next[id].pos == pos
   450:               && (cost0 >= reach_next[id].costs[0][TRE_M_COST]))
   451:             {
   452:               DPRINT(("lose\n"));
   453:               continue;
   454:             }
   455:           DPRINT(("win\n"));
   456: 
   457:           /* Copy state, position, tags, parameters, and depth. */
   458:           reach_next[id].state = reach[id].state;
   459:           reach_next[id].pos = pos;
   460:           for (i = 0; i < num_tags; i++)
   461:             reach_next[id].tags[i] = reach[id].tags[i];
   462:           reach_next[id].params = reach[id].params;
   463:           reach_next[id].depth = reach[id].depth;
   464: 
   465:           /* Set the costs after this transition. */
   466:           memcpy(reach_next[id].costs, reach[id].costs,
   467:                  sizeof(reach_next[id].costs[0][0])
   468:                  * TRE_M_LAST * (depth + 1));
   469:           reach_next[id].costs[depth][TRE_M_COST] = cost;
   470:           reach_next[id].costs[depth][TRE_M_NUM_INS]++;
   471:           reach_next[id].costs[depth][TRE_M_NUM_ERR]++;
   472:           if (depth > 0)
   473:             {
   474:               reach_next[id].costs[0][TRE_M_COST] = cost0;
   475:               reach_next[id].costs[0][TRE_M_NUM_INS]++;
   476:               reach_next[id].costs[0][TRE_M_NUM_ERR]++;
   477:             }
   478: 
   479:         }
   480: 
   481: 
   482:       /* Handle deletes.  This is done by traversing through the whole TNFA
   483:          pretending that all transitions are epsilon transitions, until
   484:          no more states can be reached with better costs. */
   485:       {
   486:         /* XXX - dynamic ringbuffer size */
   487:         tre_tnfa_approx_reach_t *ringbuffer[512];
   488:         tre_tnfa_approx_reach_t **deque_start, **deque_end;
   489: 
   490:         deque_start = deque_end = ringbuffer;
   491: 
   492:         /* Add all states in `reach_next' to the deque. */
   493:         for (id = 0; id < tnfa->num_states; id++)
   494:           {
   495:             if (reach_next[id].pos != pos)
   496:               continue;
   497:             *deque_end = &reach_next[id];
   498:             deque_end++;
   499:             assert(deque_end != deque_start);
   500:           }
   501: 
   502:         /* Repeat until the deque is empty. */
   503:         while (deque_end != deque_start)
   504:           {
   505:             tre_tnfa_approx_reach_t *reach_p;
   506:             int id;
   507:             int depth;
   508:             int cost, cost0;
   509:             tre_tnfa_transition_t *trans;
   510: 
   511:             /* Pop the first item off the deque. */
   512:             reach_p = *deque_start;
   513:             id = reach_p - reach_next;
   514:             depth = reach_p->depth;
   515: 
   516:             /* Compute cost at current depth. */
   517:             cost = reach_p->costs[depth][TRE_M_COST];
   518:             if (reach_p->params.cost_del != TRE_PARAM_UNSET)
   519:               cost += reach_p->params.cost_del;
   520: 
   521:             /* Check cost, number of deletes, and total number of errors
   522:                at current depth. */
   523:             if (cost > reach_p->params.max_cost
   524:                 || (reach_p->costs[depth][TRE_M_NUM_DEL] + 1
   525:                     > reach_p->params.max_del)
   526:                 || (reach_p->costs[depth][TRE_M_NUM_ERR] + 1
   527:                     > reach_p->params.max_err))
   528:               {
   529:                 /* Too many errors or cost too large. */
   530:                 DPRINT(("  delete: from %03d: cost too large\n", id));
   531:                 deque_start++;
   532:                 if (deque_start >= (ringbuffer + 512))
   533:                   deque_start = ringbuffer;
   534:                 continue;
   535:               }
   536: 
   537:             /* Compute overall cost. */
   538:             cost0 = cost;
   539:             if (depth > 0)
   540:               {
   541:                 cost0 = reach_p->costs[0][TRE_M_COST];
   542:                 if (reach_p->params.cost_del != TRE_PARAM_UNSET)
   543:                   cost0 += reach_p->params.cost_del;
   544:                 else
   545:                   cost0 += default_params.cost_del;
   546:               }
   547: 
   548:             for (trans = reach_p->state; trans->state; trans++)
   549:               {
   550:                 int dest_id = trans->state_id;
   551:                 DPRINT(("  delete: from %03d to %03d, cost %d (%d): ",
   552:                         id, dest_id, cost0, reach_p->params.max_cost));
   553: 
   554:                 if (trans->assertions && CHECK_ASSERTIONS(trans->assertions))
   555:                   {
   556:                     DPRINT(("assertion failed\n"));
   557:                     continue;
   558:                   }
   559: 
   560:                 /* Compute tag values after this transition. */
   561:                 for (i = 0; i < num_tags; i++)
   562:                   tmp_tags[i] = reach_p->tags[i];
   563:                 if (trans->tags)
   564:                   for (i = 0; trans->tags[i] >= 0; i++)
   565:                     if (trans->tags[i] < num_tags)
   566:                       tmp_tags[trans->tags[i]] = pos;
   567: 
   568:                 /* If another path has also reached this state, choose the one
   569:                    with the smallest cost or best tags if costs are equal. */
   570:                 if (reach_next[dest_id].pos == pos
   571:                     && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
   572:                         || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
   573:                             && (!match_tags
   574:                                 || !tre_tag_order(num_tags,
   575:                                                   tnfa->tag_directions,
   576:                                                   tmp_tags,
   577:                                                   reach_next[dest_id].tags)))))
   578:                   {
   579:                     DPRINT(("lose, cost0 %d, have %d\n",
   580:                             cost0, reach_next[dest_id].costs[0][TRE_M_COST]));
   581:                     continue;
   582:                   }
   583:                 DPRINT(("win\n"));
   584: 
   585:                 /* Set state, position, tags, parameters, depth, and costs. */
   586:                 reach_next[dest_id].state = trans->state;
   587:                 reach_next[dest_id].pos = pos;
   588:                 for (i = 0; i < num_tags; i++)
   589:                   reach_next[dest_id].tags[i] = tmp_tags[i];
   590: 
   591:                 reach_next[dest_id].params = reach_p->params;
   592:                 if (trans->params)
   593:                   tre_set_params(&reach_next[dest_id], trans->params,
   594:                                  default_params);
   595: 
   596:                 reach_next[dest_id].depth = reach_p->depth;
   597:                 memcpy(&reach_next[dest_id].costs,
   598:                        reach_p->costs,
   599:                        sizeof(reach_p->costs[0][0])
   600:                        * TRE_M_LAST * (depth + 1));
   601:                 reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
   602:                 reach_next[dest_id].costs[depth][TRE_M_NUM_DEL]++;
   603:                 reach_next[dest_id].costs[depth][TRE_M_NUM_ERR]++;
   604:                 if (depth > 0)
   605:                   {
   606:                     reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
   607:                     reach_next[dest_id].costs[0][TRE_M_NUM_DEL]++;
   608:                     reach_next[dest_id].costs[0][TRE_M_NUM_ERR]++;
   609:                   }
   610: 
   611:                 if (trans->state == tnfa->final
   612:                     && (match_eo < 0
   613:                         || match_costs[TRE_M_COST] > cost0
   614:                         || (match_costs[TRE_M_COST] == cost0
   615:                             && (num_tags > 0
   616:                                 && tmp_tags[0] <= match_tags[0]))))
   617:                   {
   618:                     DPRINT(("    setting new match at %d, cost %d\n",
   619:                             pos, cost0));
   620:                     match_eo = pos;
   621:                     memcpy(match_costs, reach_next[dest_id].costs[0],
   622:                            sizeof(match_costs[0]) * TRE_M_LAST);
   623:                     for (i = 0; i < num_tags; i++)
   624:                       match_tags[i] = tmp_tags[i];
   625:                   }
   626: 
   627:                 /* Add to the end of the deque. */
   628:                 *deque_end = &reach_next[dest_id];
   629:                 deque_end++;
   630:                 if (deque_end >= (ringbuffer + 512))
   631:                   deque_end = ringbuffer;
   632:                 assert(deque_end != deque_start);
   633:               }
   634:             deque_start++;
   635:             if (deque_start >= (ringbuffer + 512))
   636:               deque_start = ringbuffer;
   637:           }
   638: 
   639:       }
   640: 
   641: #ifdef TRE_DEBUG
   642:       tre_print_reach(tnfa, reach_next, pos, num_tags);
   643: #endif /* TRE_DEBUG */
   644: 
   645:       /* Check for end of string. */
   646:       if (len < 0)
   647:         {
   648:           if (next_c == L'\0')
   649:             break;
   650:         }
   651:       else
   652:         {
   653:           if (pos >= len)
   654:             break;
   655:         }
   656: 
   657:       prev_pos = pos;
   658:       GET_NEXT_WCHAR();
   659: 
   660:       /* Swap `reach' and `reach_next'. */
   661:       {
   662:         tre_tnfa_approx_reach_t *tmp;
   663:         tmp = reach;
   664:         reach = reach_next;
   665:         reach_next = tmp;
   666:       }
   667: 
   668:       /* Handle exact matches and substitutions. */
   669:       for (id = 0; id < tnfa->num_states; id++)
   670:         {
   671:           tre_tnfa_transition_t *trans;
   672: 
   673:           if (reach[id].pos < prev_pos)
   674:             continue;  /* Not reached. */
   675:           for (trans = reach[id].state; trans->state; trans++)
   676:             {
   677:               int dest_id;
   678:               int depth;
   679:               int cost, cost0, err;
   680: 
   681:               if (trans->assertions
   682:                   && (CHECK_ASSERTIONS(trans->assertions)
   683:                       /* Handle character klass transitions. */
   684:                       || ((trans->assertions & ASSERT_CHAR_CLASS)
   685:                           && !(cflags & REG_ICASE)
   686:                           && !tre_isctype((tre_cint_t)prev_c, trans->u.klass))
   687:                       || ((trans->assertions & ASSERT_CHAR_CLASS)
   688:                           && (cflags & REG_ICASE)
   689:                           && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
   690:                                            trans->u.klass)
   691:                               && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
   692:                                               trans->u.klass)))
   693:                       || ((trans->assertions & ASSERT_CHAR_CLASS_NEG)
   694:                           && tre_neg_char_klasses_match(trans->neg_klasses,
   695:                                                         (tre_cint_t)prev_c,
   696:                                                         cflags & REG_ICASE))))
   697:                 {
   698:                   DPRINT(("  exact,  from %d: assert failed\n", id));
   699:                   continue;
   700:                 }
   701: 
   702:               depth = reach[id].depth;
   703:               dest_id = trans->state_id;
   704: 
   705:               cost = reach[id].costs[depth][TRE_M_COST];
   706:               cost0 = reach[id].costs[0][TRE_M_COST];
   707:               err = 0;
   708: 
   709:               if (trans->code_min > prev_c ||
   710:                   trans->code_max < prev_c)
   711:                 {
   712:                   /* Handle substitutes.  The required character was not in
   713:                      the string, so match it in place of whatever was supposed
   714:                      to be there and increase costs accordingly. */
   715:                   err = 1;
   716: 
   717:                   /* Compute and check cost at current depth. */
   718:                   cost = reach[id].costs[depth][TRE_M_COST];
   719:                   if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
   720:                     cost += reach[id].params.cost_subst;
   721:                   if (cost > reach[id].params.max_cost)
   722:                     continue; /* Cost too large. */
   723: 
   724:                   /* Check number of substitutes at current depth. */
   725:                   if (reach[id].costs[depth][TRE_M_NUM_SUBST] + 1
   726:                       > reach[id].params.max_subst)
   727:                     continue; /* Too many substitutes. */
   728: 
   729:                   /* Check total number of errors at current depth. */
   730:                   if (reach[id].costs[depth][TRE_M_NUM_ERR] + 1
   731:                       > reach[id].params.max_err)
   732:                     continue; /* Too many errors. */
   733: 
   734:                   /* Compute overall cost. */
   735:                   cost0 = cost;
   736:                   if (depth > 0)
   737:                     {
   738:                       cost0 = reach[id].costs[0][TRE_M_COST];
   739:                       if (reach[id].params.cost_subst != TRE_PARAM_UNSET)
   740:                         cost0 += reach[id].params.cost_subst;
   741:                       else
   742:                         cost0 += default_params.cost_subst;
   743:                     }
   744:                   DPRINT(("  subst,  from %03d to %03d, cost %d: ",
   745:                           id, dest_id, cost0));
   746:                 }
   747:               else
   748:                 DPRINT(("  exact,  from %03d to %03d, cost %d: ",
   749:                         id, dest_id, cost0));
   750: 
   751:               /* Compute tag values after this transition. */
   752:               for (i = 0; i < num_tags; i++)
   753:                 tmp_tags[i] = reach[id].tags[i];
   754:               if (trans->tags)
   755:                 for (i = 0; trans->tags[i] >= 0; i++)
   756:                   if (trans->tags[i] < num_tags)
   757:                     tmp_tags[trans->tags[i]] = pos;
   758: 
   759:               /* If another path has also reached this state, choose the
   760:                  one with the smallest cost or best tags if costs are equal. */
   761:               if (reach_next[dest_id].pos == pos
   762:                   && (cost0 > reach_next[dest_id].costs[0][TRE_M_COST]
   763:                       || (cost0 == reach_next[dest_id].costs[0][TRE_M_COST]
   764:                           && !tre_tag_order(num_tags, tnfa->tag_directions,
   765:                                             tmp_tags,
   766:                                             reach_next[dest_id].tags))))
   767:                 {
   768:                   DPRINT(("lose\n"));
   769:                   continue;
   770:                 }
   771:               DPRINT(("win %d %d\n",
   772:                       reach_next[dest_id].pos,
   773:                       reach_next[dest_id].costs[0][TRE_M_COST]));
   774: 
   775:               /* Set state, position, tags, and depth. */
   776:               reach_next[dest_id].state = trans->state;
   777:               reach_next[dest_id].pos = pos;
   778:               for (i = 0; i < num_tags; i++)
   779:                 reach_next[dest_id].tags[i] = tmp_tags[i];
   780:               reach_next[dest_id].depth = reach[id].depth;
   781: 
   782:               /* Set parameters. */
   783:               reach_next[dest_id].params = reach[id].params;
   784:               if (trans->params)
   785:                 tre_set_params(&reach_next[dest_id], trans->params,
   786:                                default_params);
   787: 
   788:               /* Set the costs after this transition. */
   789:                 memcpy(&reach_next[dest_id].costs,
   790:                        reach[id].costs,
   791:                        sizeof(reach[id].costs[0][0])
   792:                        * TRE_M_LAST * (depth + 1));
   793:               reach_next[dest_id].costs[depth][TRE_M_COST] = cost;
   794:               reach_next[dest_id].costs[depth][TRE_M_NUM_SUBST] += err;
   795:               reach_next[dest_id].costs[depth][TRE_M_NUM_ERR] += err;
   796:               if (depth > 0)
   797:                 {
   798:                   reach_next[dest_id].costs[0][TRE_M_COST] = cost0;
   799:                   reach_next[dest_id].costs[0][TRE_M_NUM_SUBST] += err;
   800:                   reach_next[dest_id].costs[0][TRE_M_NUM_ERR] += err;
   801:                 }
   802: 
   803:               if (trans->state == tnfa->final
   804:                   && (match_eo < 0
   805:                       || cost0 < match_costs[TRE_M_COST]
   806:                       || (cost0 == match_costs[TRE_M_COST]
   807:                           && num_tags > 0 && tmp_tags[0] <= match_tags[0])))
   808:                 {
   809:                   DPRINT(("    setting new match at %d, cost %d\n",
   810:                           pos, cost0));
   811:                   match_eo = pos;
   812:                   for (i = 0; i < TRE_M_LAST; i++)
   813:                     match_costs[i] = reach_next[dest_id].costs[0][i];
   814:                   for (i = 0; i < num_tags; i++)
   815:                     match_tags[i] = tmp_tags[i];
   816:                 }
   817:             }
   818:         }
   819:     }
   820: 
   821:   DPRINT(("match end offset = %d, match cost = %d\n", match_eo,
   822:           match_costs[TRE_M_COST]));
   823: 
   824: #ifndef TRE_USE_ALLOCA
   825:   if (buf)
   826:     xfree(buf);
   827: #endif /* !TRE_USE_ALLOCA */
   828: 
   829:   match->cost = match_costs[TRE_M_COST];
   830:   match->num_ins = match_costs[TRE_M_NUM_INS];
   831:   match->num_del = match_costs[TRE_M_NUM_DEL];
   832:   match->num_subst = match_costs[TRE_M_NUM_SUBST];
   833:   *match_end_ofs = match_eo;
   834: 
   835:   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
   836: }
End cpp section to tre/tre_match-approx.cpp[1]
Start cpp section to tre/tre_match-backtrack.cpp[1 /1 ]
     1: #line 5409 "./lpsrc/tre.pak"
     2: /*
     3:   tre-match-backtrack.c - TRE backtracking regex matching engine
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: /*
    23:   This matcher is for regexps that use back referencing.  Regexp matching
    24:   with back referencing is an NP-complete problem on the number of back
    25:   references.  The easiest way to match them is to use a backtracking
    26:   routine which basically goes through all possible paths in the TNFA
    27:   and chooses the one which results in the best (leftmost and longest)
    28:   match.  This can be spectacularly expensive and may run out of stack
    29:   space, but there really is no better known generic algorithm.  Quoting
    30:   Henry Spencer from comp.compilers:
    31:   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
    32: 
    33:     POSIX.2 REs require longest match, which is really exciting to
    34:     implement since the obsolete ("basic") variant also includes
    35:     \<digit>.  I haven't found a better way of tackling this than doing
    36:     a preliminary match using a DFA (or simulation) on a modified RE
    37:     that just replicates subREs for \<digit>, and then doing a
    38:     backtracking match to determine whether the subRE matches were
    39:     right.  This can be rather slow, but I console myself with the
    40:     thought that people who use \<digit> deserve very slow execution.
    41:     (Pun unintentional but very appropriate.)
    42: 
    43: */
    44: 
    45: 
    46: #include "flx_target_tre_config.hpp"
    47: 
    48: #ifdef TRE_USE_ALLOCA
    49: /* AIX requires this to be the first thing in the file.  */
    50: #ifndef __GNUC__
    51: # if HAVE_ALLOCA_H
    52: #  include <alloca.h>
    53: # else
    54: #  ifdef _AIX
    55:  #pragma alloca
    56: #  else
    57: #   ifndef alloca /* predefined by HP cc +Olibcalls */
    58: char *alloca ();
    59: #   endif
    60: #  endif
    61: # endif
    62: #endif
    63: #endif /* TRE_USE_ALLOCA */
    64: 
    65: #include <assert.h>
    66: #include <stdlib.h>
    67: #include <string.h>
    68: #ifdef HAVE_WCHAR_H
    69: #include <wchar.h>
    70: #endif /* HAVE_WCHAR_H */
    71: #ifdef HAVE_WCTYPE_H
    72: #include <wctype.h>
    73: #endif /* HAVE_WCTYPE_H */
    74: #ifndef TRE_WCHAR
    75: #include <ctype.h>
    76: #endif /* !TRE_WCHAR */
    77: #ifdef HAVE_MALLOC_H
    78: #include <malloc.h>
    79: #endif /* HAVE_MALLOC_H */
    80: 
    81: #include "tre_internal.hpp"
    82: #include "tre_mem.hpp"
    83: #include "tre_match-utils.hpp"
    84: #include "tre_regex.hpp"
    85: #include "tre_xmalloc.hpp"
    86: 
    87: typedef struct {  /* One saved matcher snapshot stored on the backtracking stack. */
    88:   int pos;  /* Receives the `_pos' argument of BT_STACK_PUSH. */
    89:   const char *str_byte;  /* Receives the `_str_byte' argument of BT_STACK_PUSH. */
    90: #ifdef TRE_WCHAR
    91:   const wchar_t *str_wide;  /* Wide-string cursor; intended to go through BT_STACK_WIDE_IN/OUT. */
    92: #endif /* TRE_WCHAR */
    93:   tre_tnfa_transition_t *state;  /* Receives the `_state' argument of BT_STACK_PUSH. */
    94:   int state_id;  /* Receives the `_state_id' argument of BT_STACK_PUSH. */
    95:   int next_c;  /* Receives the `_next_c' argument of BT_STACK_PUSH. */
    96:   int *tags;  /* Per-node array of tnfa->num_tags ints; BT_STACK_PUSH allocates it and copies `_tags' into it. */
    97: #ifdef TRE_MBSTATE
    98:   mbstate_t mbstate;  /* Multibyte conversion state; copied via BT_STACK_MBSTATE_IN/OUT. */
    99: #endif /* TRE_MBSTATE */
   100: } tre_backtrack_item_t;
   101: 
   102: typedef struct tre_backtrack_struct {  /* Doubly linked stack node; note that tre_backtrack_t is a POINTER to it. */
   103:   tre_backtrack_item_t item;  /* Snapshot held by this node. */
   104:   struct tre_backtrack_struct *prev;  /* Node below this one; BT_STACK_PUSH sets it to the old top for a new node. */
   105:   struct tre_backtrack_struct *next;  /* Node above, or NULL; BT_STACK_PUSH reuses an existing `next' instead of allocating. */
   106: } *tre_backtrack_t;
   107: 
   108: #ifdef TRE_WHAR  /* NOTE(review): `TRE_WHAR' looks like a typo for `TRE_WCHAR' (see the #else/#endif comments below); as written this branch is used only if TRE_WHAR is defined.  Confirm before renaming: the macros below name `_str_wide' literally, so enabling them needs that identifier at every expansion site. */
   109: #define BT_STACK_WIDE_IN     stack->item.str_wide = (_str_wide)  /* Save the wide cursor into the top node; needs `stack' and `_str_wide' visible where it expands. */
   110: #define BT_STACK_WIDE_OUT    (_str_wide) = stack->item.str_wide  /* Restore the wide cursor from the top node. */
   111: #else /* !TRE_WCHAR */
   112: #define BT_STACK_WIDE_IN  /* Expands to nothing: the wide cursor is not saved. */
   113: #define BT_STACK_WIDE_OUT  /* Expands to nothing: the wide cursor is not restored. */
   114: #endif /* !TRE_WCHAR */
   115: 
   116: #ifdef TRE_MBSTATE  /* Save/restore of the multibyte state exists only in TRE_MBSTATE builds. */
   117: #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)  /* Copy `mbstate' into the top node; uses the names `stack' and `mbstate' from the expansion site. */
   118: #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate  /* Copy the top node's state back into `mbstate'. */
   119: #else /* !TRE_MBSTATE */
   120: #define BT_STACK_MBSTATE_IN  /* Expands to nothing. */
   121: #define BT_STACK_MBSTATE_OUT  /* Expands to nothing. */
   122: #endif /* !TRE_MBSTATE */
   123: 
   124: 
   125: #ifdef TRE_USE_ALLOCA  /* Choose which tre_mem_* routines back the tre_bt_mem_* names used by the backtrack stack. */
   126: #define tre_bt_mem_new            tre_mem_newa  /* Alias for tre_mem_newa (declared elsewhere; presumably alloca-backed -- verify). */
   127: #define tre_bt_mem_alloc          tre_mem_alloca  /* Alias for tre_mem_alloca (declared elsewhere). */
   128: #define tre_bt_mem_destroy(obj)   do { } while (0)  /* Empty statement: nothing is released explicitly in this configuration. */
   129: #else /* !TRE_USE_ALLOCA */
   130: #define tre_bt_mem_new            tre_mem_new  /* Alias for tre_mem_new. */
   131: #define tre_bt_mem_alloc          tre_mem_alloc  /* Alias for tre_mem_alloc. */
   132: #define tre_bt_mem_destroy        tre_mem_destroy  /* Alias for tre_mem_destroy; BT_STACK_PUSH calls it on allocation failure. */
   133: #endif /* !TRE_USE_ALLOCA */
   134: 
   135: 
   136: #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
   137:   do                                                                          \
   138:     {                                                                         \
   139:       int i;                                                                  \
   140:       if (!stack->next)                                                       \
   141:         {                                                                     \
   142:           tre_backtrack_t s;                                                  \
   143:           s = (tre_backtrack_struct*)tre_bt_mem_alloc(mem, sizeof(*s));                              \
   144:           if (!s)                                                             \
   145:             {                                                                 \
   146:               tre_bt_mem_destroy(mem);                                        \
   147:               if (tags)                                                       \
   148:                 xfree(tags);                                                  \
   149:               if (pmatch)                                                     \
   150:                 xfree(pmatch);                                                \
   151:               if (states_seen)                                                \
   152:                 xfree(states_seen);                                           \
   153:               return REG_ESPACE;                                              \
   154:             }                                                                 \
   155:           s->prev = stack;                                                    \
   156:           s->next = NULL;                                                     \
   157:           s->item.tags = (int*)tre_bt_mem_alloc(mem,                                \
   158:                                           sizeof(*tags) * tnfa->num_tags);    \
   159:           if (!s->item.tags)                                                  \
   160:             {                                                                 \
   161:               tre_bt_mem_destroy(mem);                                        \
   162:               if (tags)                                                       \
   163:                 xfree(tags);                                                  \
   164:               if (pmatch)                                                     \
   165:                 xfree(pmatch);                                                \
   166:               if (states_seen)                                                \
   167:                 xfree(states_seen);                                           \
   168:               return REG_ESPACE;                                              \
   169:             }                                                                 \
   170:           stack->next = s;                                                    \
   171:           stack = s;                                                          \
   172:         }                                                                     \
   173:       else                                                                    \
   174:         stack = stack->next;                                                  \
   175:       stack->item.pos = (_pos);                                               \
   176:       stack->item.str_byte = (_str_byte);                                     \
   177:       BT_STACK_WIDE_IN;                                                       \
   178:       stack->item.state = (_state);                                           \
   179:       stack->item.state_id = (_state_id);                                     \
   180:       stack->item.next_c = (_next_c);                                         \
   181:       for (i = 0; i < tnfa->num_tags; i++)                                    \
   182:         stack->item.tags[i] = (_tags)[i];                                     \
   183:       BT_STACK_MBSTATE_IN;                                                    \
   184:     }                                                                         \
   185:   while (0)
   186: 
   187: #define BT_STACK_POP()                                                        \
   188:   do                                                                          \
   189:     {                                                                         \
   190:       int bt_k = 0;              /* index into the saved tag array */         \
   191:       assert(stack->prev);       /* the bottom entry is never popped */       \
   192:       str_byte = stack->item.str_byte;                                        \
   193:       pos = stack->item.pos;                                                  \
   194:       BT_STACK_WIDE_OUT;                                                      \
   195:       next_c = stack->item.next_c;                                            \
   196:       state = stack->item.state;                                              \
   197:       while (bt_k < tnfa->num_tags)                                           \
   198:         { tags[bt_k] = stack->item.tags[bt_k]; bt_k++; }                      \
   199:       BT_STACK_MBSTATE_OUT;                                                   \
   200:       stack = stack->prev;       /* step back to the previous entry */        \
   201:     }                                                                         \
   202:   while (0)
   203: 
   204: #undef MIN /* drop any previous definition of MIN before redefining it */
   205: #define MIN(a, b) ((a) <= (b) ? (a) : (b)) /* NB: one argument is evaluated twice */
   206: 
   207: /* Backtracking matcher: run `tnfa' against `string' and record the best
   208:    match.  `len' is the input length, or negative when the input ends at a
   209:    zero character; `type' selects how `string' is read (STR_BYTE, STR_WIDE
   210:    or STR_USER); `eflags' supplies REG_NOTBOL / REG_NOTEOL.  A match ending
   211:    later wins; ties are decided by tre_tag_order().  Later start positions
   212:    are tried only while no match has been found.  If `match_tags' is
   213:    non-NULL it receives the tnfa->num_tags tag values of the winning match.
   214:    On normal completion `*match_end_ofs' is set to the end offset of the
   215:    match (-1 if none) and REG_OK or REG_NOMATCH is returned; REG_ESPACE is
   216:    returned when an allocation fails.  Back references are handled here. */
   217: reg_errcode_t
   218: tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
   219:                        int len, tre_str_type_t type, int *match_tags,
   220:                        int eflags, int *match_end_ofs)
   221: {
   222:   /* State variables required by GET_NEXT_WCHAR. */
   223:   tre_char_t prev_c = 0, next_c = 0;
   224:   const char *str_byte = (const char*)string;
   225:   int pos = 0;
   226:   unsigned int pos_add_next = 1;
   227: #ifdef TRE_WCHAR
   228:   const wchar_t *str_wide = (const wchar_t*)string;
   229: #ifdef TRE_MBSTATE
   230:   mbstate_t mbstate;
   231: #endif /* TRE_MBSTATE */
   232: #endif /* TRE_WCHAR */
   233:   int reg_notbol = eflags & REG_NOTBOL;
   234:   int reg_noteol = eflags & REG_NOTEOL;
   235:   int reg_newline = tnfa->cflags & REG_NEWLINE;
   236:   int str_user_end = 0;
   237: 
   238:   /* These are used to remember the necessary values of the above
   239:      variables to return to the position where the current search
   240:      started from. */
   241:   int next_c_start;
   242:   const char *str_byte_start;
   243:   int pos_start = -1;
   244: #ifdef TRE_WCHAR
   245:   const wchar_t *str_wide_start;
   246: #endif /* TRE_WCHAR */
   247: #ifdef TRE_MBSTATE
   248:   mbstate_t mbstate_start;
   249: #endif /* TRE_MBSTATE */
   250: 
   251:   /* Compilation flags for this regexp. */
   252:   int cflags = tnfa->cflags;
   253: 
   254:   /* End offset of best match so far, or -1 if no match found yet. */
   255:   int match_eo = -1;
   256:   /* Tag arrays. */
   257:   int *next_tags, *tags = NULL;
   258:   /* Current TNFA state. */
   259:   tre_tnfa_transition_t *state;
   260:   int *states_seen = NULL;
   261: 
   262:   /* Memory allocator used for the backtracking stack. */
   263:   tre_mem_t mem = tre_bt_mem_new();
   264: 
   265:   /* The backtracking stack. */
   266:   tre_backtrack_t stack;
   267: 
   268:   tre_tnfa_transition_t *trans_i;
   269:   regmatch_t *pmatch = NULL;
   270:   int ret;
   271: 
   272: #ifdef TRE_MBSTATE
   273:   memset(&mbstate, '\0', sizeof(mbstate));
   274: #endif /* TRE_MBSTATE */
   275: 
   276:   if (!mem)
   277:     return REG_ESPACE;
   278:   stack = (tre_backtrack_struct*)tre_bt_mem_alloc(mem, sizeof(*stack));
   279:   if (!stack)
   280:     {
   281:       ret = REG_ESPACE;
   282:       goto error_exit;
   283:     }
   284:   stack->prev = NULL;
   285:   stack->next = NULL;
   286: 
   287:   DPRINT(("tnfa_execute_backtrack, input type %d\n", type));
   288:   DPRINT(("len = %d\n", len));
   289: 
   290: #ifdef TRE_USE_ALLOCA
   291:   tags = (int*)alloca(sizeof(*tags) * tnfa->num_tags);
   292:   pmatch = (regmatch_t*)alloca(sizeof(*pmatch) * tnfa->num_submatches);
   293:   states_seen = (int*)alloca(sizeof(*states_seen) * tnfa->num_states);
   294: #else /* !TRE_USE_ALLOCA */
   295:   tags = (int*)xmalloc(sizeof(*tags) * tnfa->num_tags);
   296:   if (!tags)
   297:     {
   298:       ret = REG_ESPACE;
   299:       goto error_exit;
   300:     }
   301:   pmatch = (regmatch_t*)xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
   302:   if (!pmatch)
   303:     {
   304:       ret = REG_ESPACE;
   305:       goto error_exit;
   306:     }
   307:   states_seen = (int*)xmalloc(sizeof(*states_seen) * tnfa->num_states);
   308:   if (!states_seen)
   309:     {
   310:       ret = REG_ESPACE;
   311:       goto error_exit;
   312:     }
   313: #endif /* !TRE_USE_ALLOCA */
   314: 
   315:  retry:
   316:   {
   317:     int i;
   318:     for (i = 0; i < tnfa->num_tags; i++)
   319:       {
   320:         tags[i] = -1;
   321:         if (match_tags)
   322:           match_tags[i] = -1;
   323:       }
   324:     for (i = 0; i < tnfa->num_states; i++)
   325:       states_seen[i] = 0;
   326:   }
   327: 
   328:   state = NULL;
   329:   pos = pos_start;
   330:   if (type == STR_USER)
   331:     str_source->rewind(pos + pos_add_next, str_source->context);
   332:   GET_NEXT_WCHAR();
   333:   pos_start = pos;
   334:   next_c_start = next_c;
   335:   str_byte_start = str_byte;
   336: #ifdef TRE_WCHAR
   337:   str_wide_start = str_wide;
   338: #endif /* TRE_WCHAR */
   339: #ifdef TRE_MBSTATE
   340:   mbstate_start = mbstate;
   341: #endif /* TRE_MBSTATE */
   342: 
   343:   /* Handle initial states. */
   344:   next_tags = NULL;
   345:   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
   346:     {
   347:       DPRINT(("> init %p, prev_c %lc\n", trans_i->state, (tre_cint_t)prev_c));
   348:       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
   349:         {
   350:           DPRINT(("assert failed\n"));
   351:           continue;
   352:         }
   353:       if (state == NULL)
   354:         {
   355:           /* Start from this state. */
   356:           state = trans_i->state;
   357:           next_tags = trans_i->tags;
   358:         }
   359:       else
   360:         {
   361:           /* Backtrack to this state. */
   362:           DPRINT(("saving state %d for backtracking\n", trans_i->state_id));
   363:           BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
   364:                         trans_i->state_id, next_c, tags, mbstate);
   365:           {
   366:             int *tmp = trans_i->tags;
   367:             if (tmp)
   368:               while (*tmp >= 0)
   369:                 stack->item.tags[*tmp++] = pos;
   370:           }
   371:         }
   372:     }
   373: 
   374:   if (next_tags)
   375:     for (; *next_tags >= 0; next_tags++)
   376:       tags[*next_tags] = pos;
   377: 
   378: 
   379:   DPRINT(("entering match loop, pos %d, str_byte %p\n", pos, str_byte));
   380:   DPRINT(("pos:chr/code | state and tags\n"));
   381:   DPRINT(("-------------+------------------------------------------------\n"));
   382: 
   383:   if (state == NULL)
   384:     goto backtrack;
   385: 
   386:   while (1)
   387:     {
   388:       tre_tnfa_transition_t *next_state;
   389:       int empty_br_match;
   390: 
   391:       DPRINT(("start loop\n"));
   392:       if (state == tnfa->final)
   393:         {
   394:           DPRINT(("  match found, %d %d\n", match_eo, pos));
   395:           if (match_eo < pos
   396:               || (match_eo == pos
   397:                   && match_tags
   398:                   && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
   399:                                    tags, match_tags)))
   400:             {
   401:               int i;
   402:               /* This match beats the previous best match. */
   403:               DPRINT(("  win previous\n"));
   404:               match_eo = pos;
   405:               if (match_tags)
   406:                 for (i = 0; i < tnfa->num_tags; i++)
   407:                   match_tags[i] = tags[i];
   408:             }
   409:           /* Our TNFAs never have transitions leaving from the final state,
   410:              so we jump right to backtracking. */
   411:           goto backtrack;
   412:         }
   413: 
   414: #ifdef TRE_DEBUG
   415:       DPRINT(("%3d:%2lc/%05d | %p ", pos, (tre_cint_t)next_c, (int)next_c,
   416:               state));
   417:       {
   418:         int i;
   419:         for (i = 0; i < tnfa->num_tags; i++)
   420:           DPRINT(("%d%s", tags[i], i < tnfa->num_tags - 1 ? ", " : ""));
   421:         DPRINT(("\n"));
   422:       }
   423: #endif /* TRE_DEBUG */
   424: 
   425:       /* Go to the next character in the input string. */
   426:       empty_br_match = 0;
   427:       trans_i = state;
   428:       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
   429:         {
   430:           /* This is a back reference state.  All transitions leaving from
   431:              this state have the same back reference "assertion".  Instead
   432:              of reading the next character, we match the back reference. */
   433:           int so, eo, bt = trans_i->u.backref;
   434:           int bt_len;
   435:           int result;
   436: 
   437:           DPRINT(("  should match back reference %d\n", bt));
   438:           /* Get the substring we need to match against.  Only the
   439:              REG_NOSUB bit is masked out; the other flags are kept. */
   440:           tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
   441:                           tnfa, tags, pos);
   442:           so = pmatch[bt].rm_so;
   443:           eo = pmatch[bt].rm_eo;
   444:           bt_len = eo - so;
   445: 
   446: #ifdef TRE_DEBUG
   447:           {
   448:             int slen;
   449:             if (len < 0)
   450:               slen = bt_len;
   451:             else
   452:               slen = MIN(bt_len, len - pos);
   453: 
   454:             if (type == STR_BYTE)
   455:               {
   456:                 DPRINT(("  substring (len %d) is [%d, %d[: '%.*s'\n",
   457:                         bt_len, so, eo, bt_len, (char*)string + so));
   458:                 DPRINT(("  current string is '%.*s'\n", slen, str_byte - 1));
   459:               }
   460: #ifdef TRE_WCHAR
   461:             else if (type == STR_WIDE)
   462:               {
   463:                 DPRINT(("  substring (len %d) is [%d, %d[: '%.*" STRF "'\n",
   464:                         bt_len, so, eo, bt_len, (wchar_t*)string + so));
   465:                 DPRINT(("  current string is '%.*" STRF "'\n",
   466:                         slen, str_wide - 1));
   467:               }
   468: #endif /* TRE_WCHAR */
   469:           }
   470: #endif /* TRE_DEBUG */
   471: 
   472:           if (len < 0)
   473:             {
   474:               if (type == STR_USER)
   475:                 result = str_source->compare(so, pos, bt_len,
   476:                                              str_source->context);
   477: #ifdef TRE_WCHAR
   478:               else if (type == STR_WIDE)
   479:                 result = wcsncmp((wchar_t*)string + so, str_wide - 1, bt_len);
   480: #endif /* TRE_WCHAR */
   481:               else
   482:                 result = strncmp((char*)string + so, str_byte - 1, bt_len);
   483:             }
   484:           else if (len - pos < bt_len)
   485:             result = 1;
   486:           else
   487:             result = memcmp((char*)string + so, str_byte - 1, bt_len);
   488:           /* NOTE(review): memcmp is bytewise for every input type; confirm. */
   489:           /* We can ignore multibyte characters here because the backref
   490:              string is already aligned at character boundaries. */
   491:           if (result == 0)
   492:             {
   493:               /* Back reference matched.  Check for infinite loop. */
   494:               if (bt_len == 0)
   495:                 empty_br_match = 1;
   496:               if (empty_br_match && states_seen[trans_i->state_id])
   497:                 {
   498:                   DPRINT(("  avoid loop\n"));
   499:                   goto backtrack;
   500:                 }
   501: 
   502:               states_seen[trans_i->state_id] = empty_br_match;
   503: 
   504:               /* Advance in input string and resync `prev_c', `next_c'
   505:                  and pos. */
   506:               DPRINT(("  back reference matched\n"));
   507:               str_byte += bt_len - 1;
   508:               pos += bt_len - 1;
   509:               GET_NEXT_WCHAR();
   510:               DPRINT(("  pos now %d\n", pos));
   511:             }
   512:           else
   513:             {
   514:               DPRINT(("  back reference did not match\n"));
   515:               goto backtrack;
   516:             }
   517:         }
   518:       else
   519:         {
   520:           /* Check for end of string. */
   521:           if (len < 0)
   522:             {
   523:               if (next_c == L'\0')
   524:                 goto backtrack;
   525:             }
   526:           else
   527:             {
   528:               if (pos >= len)
   529:                 goto backtrack;
   530:             }
   531: 
   532:           /* Read the next character. */
   533:           GET_NEXT_WCHAR();
   534:         }
   535: 
   536:       next_state = NULL;
   537:       for (trans_i = state; trans_i->state; trans_i++)
   538:         {
   539:           DPRINT(("  transition %d-%d (%c-%c) %d to %d\n",
   540:                   trans_i->code_min, trans_i->code_max,
   541:                   trans_i->code_min, trans_i->code_max,
   542:                   trans_i->assertions, trans_i->state_id));
   543:           if (trans_i->code_min <= prev_c && trans_i->code_max >= prev_c)
   544:             {
   545:               if (trans_i->assertions
   546:                   && (CHECK_ASSERTIONS(trans_i->assertions)
   547:                       /* Handle character klass transitions. */
   548:                       || ((trans_i->assertions & ASSERT_CHAR_CLASS)
   549:                           && !(cflags & REG_ICASE)
   550:                           && !tre_isctype((tre_cint_t)prev_c, trans_i->u.klass))
   551:                       || ((trans_i->assertions & ASSERT_CHAR_CLASS)
   552:                           && (cflags & REG_ICASE)
   553:                           && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
   554:                                            trans_i->u.klass)
   555:                               && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
   556:                                               trans_i->u.klass)))
   557:                       || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
   558:                           && tre_neg_char_klasses_match(trans_i->neg_klasses,
   559:                                                         (tre_cint_t)prev_c,
   560:                                                         cflags & REG_ICASE))))
   561:                 {
   562:                   DPRINT(("  assertion failed\n"));
   563:                   continue;
   564:                 }
   565: 
   566:               if (next_state == NULL)
   567:                 {
   568:                   /* First matching transition. */
   569:                   DPRINT(("  Next state is %d\n", trans_i->state_id));
   570:                   next_state = trans_i->state;
   571:                   next_tags = trans_i->tags;
   572:                 }
   573:               else
   574:                 {
   575:                   /* Second matching transition.  We may need to backtrack here
   576:                      to take this transition instead of the first one, so we
   577:                      push this transition in the backtracking stack so we can
   578:                      jump back here if needed. */
   579:                   DPRINT(("  saving state %d for backtracking\n",
   580:                           trans_i->state_id));
   581:                   BT_STACK_PUSH(pos, str_byte, str_wide, trans_i->state,
   582:                                 trans_i->state_id, next_c, tags, mbstate);
   583:                   {
   584:                     int *tmp;
   585:                     for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
   586:                       stack->item.tags[*tmp] = pos;
   587:                   }
   588: #if 0 /* XXX - it's important not to look at all transitions here to keep
   589:          the stack small! */
   590:                   break;
   591: #endif
   592:                 }
   593:             }
   594:         }
   595: 
   596:       if (next_state != NULL)
   597:         {
   598:           /* Matching transitions were found.  Take the first one. */
   599:           state = next_state;
   600: 
   601:           /* Update the tag values. */
   602:           if (next_tags)
   603:             while (*next_tags >= 0)
   604:               tags[*next_tags++] = pos;
   605:         }
   606:       else
   607:         {
   608:         backtrack:
   609:           /* A matching transition was not found.  Try to backtrack. */
   610:           if (stack->prev)
   611:             {
   612:               DPRINT(("  backtracking\n"));
   613:               if (stack->item.state->assertions & ASSERT_BACKREF)
   614:                 {
   615:                   DPRINT(("  states_seen[%d] = 0\n",
   616:                           stack->item.state_id));
   617:                   states_seen[stack->item.state_id] = 0;
   618:                 }
   619: 
   620:               BT_STACK_POP();
   621:             }
   622:           else if (match_eo < 0)
   623:             {
   624:               /* Try starting from a later position in the input string. */
   625:               /* Check for end of string. */
   626:               if (len < 0)
   627:                 {
   628:                   if (next_c == L'\0')
   629:                     {
   630:                       DPRINT(("end of string.\n"));
   631:                       break;
   632:                     }
   633:                 }
   634:               else
   635:                 {
   636:                   if (pos >= len)
   637:                     {
   638:                       DPRINT(("end of string.\n"));
   639:                       break;
   640:                     }
   641:                 }
   642:               DPRINT(("restarting from next start position\n"));
   643:               next_c = next_c_start;
   644: #ifdef TRE_MBSTATE
   645:               mbstate = mbstate_start;
   646: #endif /* TRE_MBSTATE */
   647:               str_byte = str_byte_start;
   648: #ifdef TRE_WCHAR
   649:               str_wide = str_wide_start;
   650: #endif /* TRE_WCHAR */
   651:               goto retry;
   652:             }
   653:           else
   654:             {
   655:               DPRINT(("finished\n"));
   656:               break;
   657:             }
   658:         }
   659:     }
   660: 
   661:   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
   662:   *match_end_ofs = match_eo;
   663: 
   664:  error_exit:
   665:   tre_bt_mem_destroy(mem);
   666: #ifndef TRE_USE_ALLOCA
   667:   if (tags)
   668:     xfree(tags);
   669:   if (pmatch)
   670:     xfree(pmatch);
   671:   if (states_seen)
   672:     xfree(states_seen);
   673: #endif /* !TRE_USE_ALLOCA */
   674: 
   675:   return (reg_errcode_t)ret;
   676: }
End cpp section to tre/tre_match-backtrack.cpp[1]
Start cpp section to tre/tre_match-parallel.cpp[1 /1 ]
     1: #line 6076 "./lpsrc/tre.pak"
     2: /*
     3:   tre-match-parallel.c - TRE parallel regex matching engine
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: /*
    23:   This algorithm searches for matches basically by reading characters
    24:   in the searched string one by one, starting at the beginning.  All
    25:   matching paths in the TNFA are traversed in parallel.  When two or
    26:   more paths reach the same state, exactly one is chosen according to
    27:   tag ordering rules; if returning submatches is not required it does
    28:   not matter which path is chosen.
    29: 
    30:   The worst case time required for finding the leftmost and longest
    31:   match, or determining that there is no match, is always linearly
    32:   dependent on the length of the text being searched.
    33: 
    34:   This algorithm cannot handle TNFAs with back referencing nodes.
    35:   See `tre-match-backtrack.c'.
    36: */
    37: 
    38: 
    39: #include "flx_target_tre_config.hpp"
    40: 
    41: #ifdef TRE_USE_ALLOCA
    42: /* AIX requires this to be the first thing in the file.  */
    43: #ifndef __GNUC__
    44: # if HAVE_ALLOCA_H
    45: #  include <alloca.h>
    46: # else
    47: #  ifdef _AIX
    48:  #pragma alloca
    49: #  else
    50: #   ifndef alloca /* predefined by HP cc +Olibcalls */
    51: char *alloca ();
    52: #   endif
    53: #  endif
    54: # endif
    55: #endif
    56: #endif /* TRE_USE_ALLOCA */
    57: 
    58: #include <assert.h>
    59: #include <stdlib.h>
    60: #include <string.h>
    61: #ifdef HAVE_WCHAR_H
    62: #include <wchar.h>
    63: #endif /* HAVE_WCHAR_H */
    64: #ifdef HAVE_WCTYPE_H
    65: #include <wctype.h>
    66: #endif /* HAVE_WCTYPE_H */
    67: #ifndef TRE_WCHAR
    68: #include <ctype.h>
    69: #endif /* !TRE_WCHAR */
    70: #ifdef HAVE_MALLOC_H
    71: #include <malloc.h>
    72: #endif /* HAVE_MALLOC_H */
    73: 
    74: #include "tre_internal.hpp"
    75: #include "tre_match-utils.hpp"
    76: #include "tre_regex.hpp"
    77: #include "tre_xmalloc.hpp"
    78: 
    79: 
    80: 
    81: typedef struct {
    82:   tre_tnfa_transition_t *state;  /* reached state; NULL ends a reach array */
    83:   int *tags;                     /* tag values recorded for this entry */
    84: } tre_tnfa_reach_t;
    85: 
    86: typedef struct {
    87:   int pos;     /* input position stored for a state id; initialised to -1 */
    88:   int **tags;  /* address of the `tags' member of that state's reach entry */
    89: } tre_reach_pos_t;
    90: 
    91: 
    92: #ifdef TRE_DEBUG
    93: /* Debug aid: print every entry of `reach' (the array ends at the first
    94:    NULL state) as " <state>", followed by "/index:value,..." listing
    95:    `num_tags' tag values when num_tags > 0, then a final newline. */
    96: static void
    97: tre_print_reach(const tre_tnfa_t *tnfa, tre_tnfa_reach_t *reach, int num_tags)
    98: {
    99:   tre_tnfa_reach_t *entry;
   100:   int t;
   101: 
   102:   for (entry = reach; entry->state != NULL; entry++)
   103:     {
   104:       DPRINT((" %p", (void *)entry->state));
   105:       if (num_tags <= 0)
   106:         continue;
   107:       DPRINT(("/"));
   108:       for (t = 0; t < num_tags; t++)
   109:         {
   110:           DPRINT(("%d:%d", t, entry->tags[t]));
   111:           if (t < (num_tags-1))
   112:             DPRINT((","));
   113:         }
   114:     }
   115:   DPRINT(("\n"));
   116: #endif /* TRE_DEBUG */
   117: 
   118: reg_errcode_t
   119: tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string, int len,
   120:                       tre_str_type_t type, int *match_tags, int eflags,
   121:                       int *match_end_ofs)
   122: {
   123:   /* State variables required by GET_NEXT_WCHAR. */
   124:   tre_char_t prev_c = 0, next_c = 0;
   125:   const char *str_byte = (const char*)string;
   126:   int pos = -1;
   127:   unsigned int pos_add_next = 1;
   128: #ifdef TRE_WCHAR
   129:   const wchar_t *str_wide = (const wchar_t*)string;
   130: #ifdef TRE_MBSTATE
   131:   mbstate_t mbstate;
   132: #endif /* TRE_MBSTATE */
   133: #endif /* TRE_WCHAR */
   134:   int reg_notbol = eflags & REG_NOTBOL;
   135:   int reg_noteol = eflags & REG_NOTEOL;
   136:   int reg_newline = tnfa->cflags & REG_NEWLINE;
   137:   int str_user_end = 0;
   138: 
   139:   char *buf;
   140:   tre_tnfa_transition_t *trans_i;
   141:   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
   142:   tre_reach_pos_t *reach_pos;
   143:   int *tag_i;
   144:   int num_tags, i;
   145: 
   146:   int match_eo = -1;       /* end offset of match (-1 if no match found yet) */
   147:   int new_match = 0;
   148:   int *tmp_tags = NULL;
   149:   int *tmp_iptr;
   150: 
   151: #ifdef TRE_MBSTATE
   152:   memset(&mbstate, '\0', sizeof(mbstate));
   153: #endif /* TRE_MBSTATE */
   154: 
   155:   DPRINT(("tre_tnfa_run_parallel, input type %d\n", type));
   156: 
   157:   if (!match_tags)
   158:     num_tags = 0;
   159:   else
   160:     num_tags = tnfa->num_tags;
   161: 
   162:   /* Allocate memory for temporary data required for matching.  This needs to
   163:      be done for every matching operation to be thread safe.  This allocates
   164:      everything in a single large block from the stack frame using alloca()
   165:      or with malloc() if alloca is unavailable. */
   166:   {
   167:     int tbytes, rbytes, pbytes, xbytes, total_bytes;
   168:     char *tmp_buf;
   169:     /* Compute the length of the block we need. */
   170:     tbytes = sizeof(*tmp_tags) * num_tags;
   171:     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
   172:     pbytes = sizeof(*reach_pos) * tnfa->num_states;
   173:     xbytes = sizeof(int) * num_tags;
   174:     total_bytes =
   175:       (sizeof(long) - 1) * 4 /* for alignment paddings */
   176:       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
   177: 
   178:     /* Allocate the memory. */
   179: #ifdef TRE_USE_ALLOCA
   180:     buf = (char*)alloca(total_bytes);
   181: #else /* !TRE_USE_ALLOCA */
   182:     buf = (char*)xmalloc(total_bytes);
   183: #endif /* !TRE_USE_ALLOCA */
   184:     if (buf == NULL)
   185:       return REG_ESPACE;
   186:     memset(buf, 0, total_bytes);
   187: 
   188:     /* Get the various pointers within tmp_buf (properly aligned). */
   189:     tmp_tags = (int*)(void *)buf;
   190:     tmp_buf = buf + tbytes;
   191:     tmp_buf += ALIGN(tmp_buf, long);
   192:     reach_next = (tre_tnfa_reach_t*)(void *)tmp_buf;
   193:     tmp_buf += rbytes;
   194:     tmp_buf += ALIGN(tmp_buf, long);
   195:     reach = (tre_tnfa_reach_t*)(void *)tmp_buf;
   196:     tmp_buf += rbytes;
   197:     tmp_buf += ALIGN(tmp_buf, long);
   198:     reach_pos = (tre_reach_pos_t*)(void *)tmp_buf;
   199:     tmp_buf += pbytes;
   200:     tmp_buf += ALIGN(tmp_buf, long);
   201:     for (i = 0; i < tnfa->num_states; i++)
   202:       {
   203:         reach[i].tags = (int*)(void *)tmp_buf;
   204:         tmp_buf += xbytes;
   205:         reach_next[i].tags = (int*)(void *)tmp_buf;
   206:         tmp_buf += xbytes;
   207:       }
   208:   }
   209: 
   210:   for (i = 0; i < tnfa->num_states; i++)
   211:     reach_pos[i].pos = -1;
   212: 
   213:   /* If only one character can start a match, find it first. */
   214:   if (tnfa->first_char >= 0 && type == STR_BYTE && str_byte)
   215:     {
   216:       const char *orig_str = str_byte;
   217:       int first = tnfa->first_char;
   218: 
   219:       if (len >= 0)
   220:         str_byte = (const char*)memchr(orig_str, first, len);
   221:       else
   222:         str_byte = strchr(orig_str, first);
   223:       if (str_byte == NULL)
   224:         {
   225: #ifndef TRE_USE_ALLOCA
   226:           if (buf)
   227:             xfree(buf);
   228: #endif /* !TRE_USE_ALLOCA */
   229:           return REG_NOMATCH;
   230:         }
   231:       DPRINT(("skipped %d chars\n", str_byte - orig_str));
   232:       if (str_byte >= orig_str + 1)
   233:         prev_c = (unsigned char)*(str_byte - 1);
   234:       next_c = (unsigned char)*str_byte;
   235:       pos = str_byte - orig_str;
   236:       if (len < 0 || pos < len)
   237:         str_byte++;
   238:     }
   239:   else
   240:     {
   241:       GET_NEXT_WCHAR();
   242:       pos = 0;
   243:     }
   244: 
   245: #if 0
   246:   /* Skip over characters that cannot possibly be the first character
   247:      of a match. */
   248:   if (tnfa->firstpos_chars != NULL)
   249:     {
   250:       char *chars = tnfa->firstpos_chars;
   251: 
   252:       if (len < 0)
   253:         {
   254:           const char *orig_str = str_byte;
   255:           /* XXX - use strpbrk() and wcspbrk() because they might be
   256:              optimized for the target architecture.  Try also strcspn()
   257:              and wcscspn() and compare the speeds. */
   258:           while (next_c != L'\0' && !chars[next_c])
   259:             {
   260:               next_c = *str_byte++;
   261:             }
   262:           prev_c = *(str_byte - 2);
   263:           pos += str_byte - orig_str;
   264:           DPRINT(("skipped %d chars\n", str_byte - orig_str));
   265:         }
   266:       else
   267:         {
   268:           while (pos <= len && !chars[next_c])
   269:             {
   270:               prev_c = next_c;
   271:               next_c = (unsigned char)(*str_byte++);
   272:               pos++;
   273:             }
   274:         }
   275:     }
   276: #endif
   277: 
   278:   DPRINT(("length: %d\n", len));
   279:   DPRINT(("pos:chr/code | states and tags\n"));
   280:   DPRINT(("-------------+------------------------------------------------\n"));
   281: 
   282:   reach_next_i = reach_next;
   283:   while (1)
   284:     {
   285:       /* If no match found yet, add the initial states to `reach_next'. */
   286:       if (match_eo < 0)
   287:         {
   288:           DPRINT((" init >"));
   289:           trans_i = tnfa->initial;
   290:           while (trans_i->state != NULL)
   291:             {
   292:               if (reach_pos[trans_i->state_id].pos < pos)
   293:                 {
   294:                   if (trans_i->assertions
   295:                       && CHECK_ASSERTIONS(trans_i->assertions))
   296:                     {
   297:                       DPRINT(("assertion failed\n"));
   298:                       trans_i++;
   299:                       continue;
   300:                     }
   301: 
   302:                   DPRINT((" %p", (void *)trans_i->state));
   303:                   reach_next_i->state = trans_i->state;
   304:                   for (i = 0; i < num_tags; i++)
   305:                     reach_next_i->tags[i] = -1;
   306:                   tag_i = trans_i->tags;
   307:                   if (tag_i)
   308:                     while (*tag_i >= 0)
   309:                       {
   310:                         if (*tag_i < num_tags)
   311:                           reach_next_i->tags[*tag_i] = pos;
   312:                         tag_i++;
   313:                       }
   314:                   if (reach_next_i->state == tnfa->final)
   315:                     {
   316:                       DPRINT(("  found empty match\n"));
   317:                       match_eo = pos;
   318:                       new_match = 1;
   319:                       for (i = 0; i < num_tags; i++)
   320:                         match_tags[i] = reach_next_i->tags[i];
   321:                     }
   322:                   reach_pos[trans_i->state_id].pos = pos;
   323:                   reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
   324:                   reach_next_i++;
   325:                 }
   326:               trans_i++;
   327:             }
   328:           DPRINT(("\n"));
   329:           reach_next_i->state = NULL;
   330:         }
   331:       else
   332:         {
   333:           if (num_tags == 0 || reach_next_i == reach_next)
   334:             /* We have found a match. */
   335:             break;
   336:         }
   337: 
   338:       /* Check for end of string. */
   339:       if (len < 0)
   340:         {
   341:           if (type == STR_USER)
   342:             {
   343:               if (str_user_end)
   344:                 break;
   345:             }
   346:           else if (next_c == L'\0')
   347:             break;
   348:         }
   349:       else
   350:         {
   351:           if (pos >= len)
   352:             break;
   353:         }
   354: 
   355:       GET_NEXT_WCHAR();
   356: 
   357: #ifdef TRE_DEBUG
   358:       DPRINT(("%3d:%2lc/%05d |", pos - 1, (tre_cint_t)prev_c, (int)prev_c));
   359:       tre_print_reach(tnfa, reach_next, num_tags);
   360:       DPRINT(("%3d:%2lc/%05d |", pos, (tre_cint_t)next_c, (int)next_c));
   361:       tre_print_reach(tnfa, reach_next, num_tags);
   362: #endif /* TRE_DEBUG */
   363: 
   364:       /* Swap `reach' and `reach_next'. */
   365:       reach_i = reach;
   366:       reach = reach_next;
   367:       reach_next = reach_i;
   368: 
   369:       /* For each state in `reach', weed out states that don't fulfill the
   370:          minimal matching conditions. */
   371:       if (tnfa->num_minimals && new_match)
   372:         {
   373:           new_match = 0;
   374:           reach_next_i = reach_next;
   375:           for (reach_i = reach; reach_i->state; reach_i++)
   376:             {
   377:               int i;
   378:               int skip = 0;
   379:               for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
   380:                 {
   381:                   int end = tnfa->minimal_tags[i];
   382:                   int start = tnfa->minimal_tags[i + 1];
   383:                   DPRINT(("  Minimal start %d, end %d\n", start, end));
   384:                   if (end >= num_tags)
   385:                     {
   386:                       DPRINT(("  Throwing %p out.\n", reach_i->state));
   387:                       skip = 1;
   388:                       break;
   389:                     }
   390:                   else if (reach_i->tags[start] == match_tags[start]
   391:                            && reach_i->tags[end] < match_tags[end])
   392:                     {
   393:                       DPRINT(("  Throwing %p out because t%d < %d\n",
   394:                               reach_i->state, end, match_tags[end]));
   395:                       skip = 1;
   396:                       break;
   397:                     }
   398:                 }
   399:               if (!skip)
   400:                 {
   401:                   int *tmp_iptr;
   402:                   reach_next_i->state = reach_i->state;
   403:                   tmp_iptr = reach_next_i->tags;
   404:                   reach_next_i->tags = reach_i->tags;
   405:                   reach_i->tags = tmp_iptr;
   406:                   reach_next_i++;
   407:                 }
   408:             }
   409:           reach_next_i->state = NULL;
   410: 
   411:           /* Swap `reach' and `reach_next'. */
   412:           reach_i = reach;
   413:           reach = reach_next;
   414:           reach_next = reach_i;
   415:         }
   416: 
   417:       /* For each state in `reach' see if there is a transition leaving with
   418:          the current input symbol to a state not yet in `reach_next', and
   419:          add the destination states to `reach_next'. */
   420:       reach_next_i = reach_next;
   421:       for (reach_i = reach; reach_i->state; reach_i++)
   422:         {
   423:           for (trans_i = reach_i->state; trans_i->state; trans_i++)
   424:             {
   425:               /* Does this transition match the input symbol? */
   426:               if (trans_i->code_min <= prev_c &&
   427:                   trans_i->code_max >= prev_c)
   428:                 {
   429:                   if (trans_i->assertions
   430:                       && (CHECK_ASSERTIONS(trans_i->assertions)
   431:                           /* Handle character klass transitions. */
   432:                           || ((trans_i->assertions & ASSERT_CHAR_CLASS)
   433:                               && !(tnfa->cflags & REG_ICASE)
   434:                               && !tre_isctype((tre_cint_t)prev_c,
   435:                                               trans_i->u.klass))
   436:                           || ((trans_i->assertions & ASSERT_CHAR_CLASS)
   437:                               && (tnfa->cflags & REG_ICASE)
   438:                               && (!tre_isctype(tre_tolower((tre_cint_t)prev_c),
   439:                                                trans_i->u.klass)
   440:                                   && !tre_isctype(tre_toupper((tre_cint_t)prev_c),
   441:                                                   trans_i->u.klass)))
   442:                           || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)
   443:                               && tre_neg_char_klasses_match(trans_i->neg_klasses,
   444:                                                             (tre_cint_t)prev_c,
   445:                                                             tnfa->cflags & REG_ICASE))))
   446:                     {
   447:                       DPRINT(("assertion failed\n"));
   448:                       continue;
   449:                     }
   450: 
   451:                   /* Compute the tags after this transition. */
   452:                   for (i = 0; i < num_tags; i++)
   453:                     tmp_tags[i] = reach_i->tags[i];
   454:                   tag_i = trans_i->tags;
   455:                   if (tag_i != NULL)
   456:                     while (*tag_i >= 0)
   457:                       {
   458:                         if (*tag_i < num_tags)
   459:                           tmp_tags[*tag_i] = pos;
   460:                         tag_i++;
   461:                       }
   462: 
   463:                   if (reach_pos[trans_i->state_id].pos < pos)
   464:                     {
   465:                       /* Found an unvisited node. */
   466:                       reach_next_i->state = trans_i->state;
   467:                       tmp_iptr = reach_next_i->tags;
   468:                       reach_next_i->tags = tmp_tags;
   469:                       tmp_tags = tmp_iptr;
   470:                       reach_pos[trans_i->state_id].pos = pos;
   471:                       reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
   472: 
   473:                       if (reach_next_i->state == tnfa->final
   474:                           && (match_eo == -1
   475:                               || (num_tags > 0
   476:                                   && reach_next_i->tags[0] <= match_tags[0])))
   477:                         {
   478:                           DPRINT(("  found match %p\n", trans_i->state));
   479:                           match_eo = pos;
   480:                           new_match = 1;
   481:                           for (i = 0; i < num_tags; i++)
   482:                             match_tags[i] = reach_next_i->tags[i];
   483:                         }
   484:                       reach_next_i++;
   485: 
   486:                     }
   487:                   else
   488:                     {
   489:                       assert(reach_pos[trans_i->state_id].pos == pos);
   490:                       /* Another path has also reached this state.  We choose
   491:                          the winner by examining the tag values for both
   492:                          paths. */
   493:                       if (tre_tag_order(num_tags, tnfa->tag_directions,
   494:                                         tmp_tags,
   495:                                         *reach_pos[trans_i->state_id].tags))
   496:                         {
   497:                           /* The new path wins. */
   498:                           tmp_iptr = *reach_pos[trans_i->state_id].tags;
   499:                           *reach_pos[trans_i->state_id].tags = tmp_tags;
   500:                           if (trans_i->state == tnfa->final)
   501:                             {
   502:                               DPRINT(("  found better match\n"));
   503:                               match_eo = pos;
   504:                               new_match = 1;
   505:                               for (i = 0; i < num_tags; i++)
   506:                                 match_tags[i] = tmp_tags[i];
   507:                             }
   508:                           tmp_tags = tmp_iptr;
   509:                         }
   510:                     }
   511:                 }
   512:             }
   513:         }
   514:       reach_next_i->state = NULL;
   515:     }
   516: 
   517:   DPRINT(("match end offset = %d\n", match_eo));
   518: 
   519: #ifndef TRE_USE_ALLOCA
   520:   if (buf)
   521:     xfree(buf);
   522: #endif /* !TRE_USE_ALLOCA */
   523: 
   524:   *match_end_ofs = match_eo;
   525:   return match_eo >= 0 ? REG_OK : REG_NOMATCH;
   526: }
   527: 
   528: /* EOF */
End cpp section to tre/tre_match-parallel.cpp[1]
Start cpp section to tre/tre_match-utils.hpp[1 /1 ]
     1: #line 6605 "./lpsrc/tre.pak"
     2: 
     3: /*
     4:   tre-match-utils.h - TRE matcher helper definitions
     5: 
     6:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>.
     7: 
     8:   This program is free software; you can redistribute it and/or modify
     9:   it under the terms of the GNU General Public License version 2 (June
    10:   1991) as published by the Free Software Foundation.
    11: 
    12:   This program is distributed in the hope that it will be useful,
    13:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    14:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    15:   GNU General Public License for more details.
    16: 
    17:   You should have received a copy of the GNU General Public License
    18:   along with this program; if not, write to the Free Software
    19:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    20: 
    21: */
    22: /* Reinterprets the name `string' from the expanding scope as a tre_str_source pointer; used by the STR_USER branches of GET_NEXT_WCHAR(). */
    23: #define str_source ((tre_str_source*)string)
    24: 
    25: #ifdef TRE_WCHAR
    26: 
    27: #ifdef TRE_MULTIBYTE
    28: /* GET_NEXT_WCHAR(): saves `next_c' in `prev_c', advances `pos' (by 1 for STR_BYTE/STR_WIDE, by `pos_add_next' for STR_MBS/STR_USER) and loads the next character into `next_c' according to `type'.  With a known length (`len >= 0') the byte and wide branches yield NUL once `pos >= len'. */
    29: /* Wide character and multibyte support. */
    30: /* STR_MBS decodes with tre_mbrtowc() over at most `len - pos' bytes (32 when `len < 0'); a (size_t)-1 or (size_t)-2 result executes `return REG_NOMATCH', which returns from the function that expanded the macro.  STR_USER stores the get_next_char() result in `str_user_end'.  All variables used here come from the expanding scope. */
    31: #define GET_NEXT_WCHAR()                                                      \
    32:   do {                                                                        \
    33:     prev_c = next_c;                                                          \
    34:     if (type == STR_BYTE)                                                     \
    35:       {                                                                       \
    36:         pos++;                                                                \
    37:         if (len >= 0 && pos >= len)                                           \
    38:           next_c = '\0';                                                      \
    39:         else                                                                  \
    40:           next_c = (unsigned char)(*str_byte++);                              \
    41:       }                                                                       \
    42:     else if (type == STR_WIDE)                                                \
    43:       {                                                                       \
    44:         pos++;                                                                \
    45:         if (len >= 0 && pos >= len)                                           \
    46:           next_c = L'\0';                                                     \
    47:         else                                                                  \
    48:           next_c = *str_wide++;                                               \
    49:       }                                                                       \
    50:     else if (type == STR_MBS)                                                 \
    51:       {                                                                       \
    52:         pos += pos_add_next;                                                  \
    53:         if (str_byte == NULL)                                                 \
    54:           next_c = L'\0';                                                     \
    55:         else                                                                  \
    56:           {                                                                   \
    57:             size_t w;                                                         \
    58:             int max;                                                          \
    59:             if (len >= 0)                                                     \
    60:               max = len - pos;                                                \
    61:             else                                                              \
    62:               max = 32;                                                       \
    63:             if (max <= 0)                                                     \
    64:               {                                                               \
    65:                 next_c = L'\0';                                               \
    66:                 pos_add_next = 1;                                             \
    67:               }                                                               \
    68:             else                                                              \
    69:               {                                                               \
    70:                 w = tre_mbrtowc(&next_c, str_byte, max, &mbstate);            \
    71:                 if (w == (size_t)-1 || w == (size_t)-2)                       \
    72:                   return REG_NOMATCH;                                         \
    73:                 if (w == 0 && len >= 0)                                       \
    74:                   {                                                           \
    75:                     pos_add_next = 1;                                         \
    76:                     next_c = 0;                                               \
    77:                     str_byte++;                                               \
    78:                   }                                                           \
    79:                 else                                                          \
    80:                   {                                                           \
    81:                     pos_add_next = w;                                         \
    82:                     str_byte += w;                                            \
    83:                   }                                                           \
    84:               }                                                               \
    85:           }                                                                   \
    86:       }                                                                       \
    87:     else if (type == STR_USER)                                                \
    88:       {                                                                       \
    89:         pos += pos_add_next;                                                  \
    90:         str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
    91:                                                  str_source->context);        \
    92:       }                                                                       \
    93:   } while(0)
    94: 
    95: #else /* !TRE_MULTIBYTE */
    96: /* GET_NEXT_WCHAR(): saves `next_c' in `prev_c', advances `pos' and loads the next character into `next_c' according to `type'. */
    97: /* Wide character support, no multibyte support. */
    98: /* STR_BYTE/STR_WIDE step `pos' by 1 and yield NUL once `len >= 0 && pos >= len'; STR_USER steps `pos' by `pos_add_next' and stores the get_next_char() result in `str_user_end'.  All variables used here come from the expanding scope. */
    99: #define GET_NEXT_WCHAR()                                                      \
   100:   do {                                                                        \
   101:     prev_c = next_c;                                                          \
   102:     if (type == STR_BYTE)                                                     \
   103:       {                                                                       \
   104:         pos++;                                                                \
   105:         if (len >= 0 && pos >= len)                                           \
   106:           next_c = '\0';                                                      \
   107:         else                                                                  \
   108:           next_c = (unsigned char)(*str_byte++);                              \
   109:       }                                                                       \
   110:     else if (type == STR_WIDE)                                                \
   111:       {                                                                       \
   112:         pos++;                                                                \
   113:         if (len >= 0 && pos >= len)                                           \
   114:           next_c = L'\0';                                                     \
   115:         else                                                                  \
   116:           next_c = *str_wide++;                                               \
   117:       }                                                                       \
   118:     else if (type == STR_USER)                                                \
   119:       {                                                                       \
   120:         pos += pos_add_next;                                                  \
   121:         str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
   122:                                                  str_source->context);        \
   123:       }                                                                       \
   124:   } while(0)
   125: 
   126: #endif /* !TRE_MULTIBYTE */
   127: 
   128: #else /* !TRE_WCHAR */
   129: /* GET_NEXT_WCHAR(): saves `next_c' in `prev_c', advances `pos' and loads the next character into `next_c' according to `type'. */
   130: /* No wide character or multibyte support. */
   131: /* STR_BYTE steps `pos' by 1 and yields NUL once `len >= 0 && pos >= len'; STR_USER steps `pos' by `pos_add_next' and stores the get_next_char() result in `str_user_end'.  All variables used here come from the expanding scope. */
   132: #define GET_NEXT_WCHAR()                                                      \
   133:   do {                                                                        \
   134:     prev_c = next_c;                                                          \
   135:     if (type == STR_BYTE)                                                     \
   136:       {                                                                       \
   137:         pos++;                                                                \
   138:         if (len >= 0 && pos >= len)                                           \
   139:           next_c = '\0';                                                      \
   140:         else                                                                  \
   141:           next_c = (unsigned char)(*str_byte++);                              \
   142:       }                                                                       \
   143:     else if (type == STR_USER)                                                \
   144:       {                                                                       \
   145:         pos += pos_add_next;                                                  \
   146:         str_user_end = str_source->get_next_char(&next_c, &pos_add_next,      \
   147:                                                  str_source->context);        \
   148:       }                                                                       \
   149:   } while(0)
   150: 
   151: #endif /* !TRE_WCHAR */
   152: 
   153: /* Nonzero if `c' counts as a word character: an underscore or anything tre_isalnum() accepts. */
   154: /* `c' appears twice in the expansion, so pass an expression without side effects. */
   155: #define IS_WORD_CHAR(c)  ((c) == L'_' || tre_isalnum(c))
   156: /* Evaluates to nonzero when an assertion bit set in `assertions' is NOT satisfied at the current position (callers treat a true result as "assertion failed").  Reads pos, prev_c, next_c, reg_notbol, reg_noteol and reg_newline from the expanding scope.  The parameter is parenthesized so that compound argument expressions bind correctly. */
   157: #define CHECK_ASSERTIONS(assertions)                                          \
   158:   ((((assertions) & ASSERT_AT_BOL)                                            \
   159:     && (pos > 0 || reg_notbol)                                                \
   160:     && (prev_c != L'\n' || !reg_newline))                                     \
   161:    || (((assertions) & ASSERT_AT_EOL)                                         \
   162:        && (next_c != L'\0' || reg_noteol)                                     \
   163:        && (next_c != L'\n' || !reg_newline))                                  \
   164:    || (((assertions) & ASSERT_AT_BOW)                                         \
   165:        && (pos > 0 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))))       \
   166:    || (((assertions) & ASSERT_AT_EOW)                                         \
   167:        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))                    \
   168:    || (((assertions) & ASSERT_AT_WB)                                          \
   169:        && (pos != 0 && next_c != L'\0'                                        \
   170:            && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))                  \
   171:    || (((assertions) & ASSERT_AT_WB_NEG)                                      \
   172:        && (pos == 0 || next_c == L'\0'                                        \
   173:            || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
   174: 
   175: 
   176: 
   177: /* Decides which of two tag vectors is preferred.  The vectors are compared
   178:    index by index and the first differing tag decides.  Returns 1 if `t1'
   179:    wins `t2', 0 otherwise (also 0 when every tag is equal). */
   180: inline static int
   181: tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
   182:               int *t1, int *t2)
   183: {
   184:   int idx = 0;
   185:   while (idx < num_tags)
   186:     {
   187:       const int lhs = t1[idx];
   188:       const int rhs = t2[idx];
   189:       if (lhs != rhs)
   190:         {
   191:           /* A TRE_TAG_MINIMIZE tag favours the smaller value; any other
   192:              direction favours the larger one. */
   193:           if (tag_directions[idx] == TRE_TAG_MINIMIZE)
   194:             return lhs < rhs ? 1 : 0;
   195:           return lhs > rhs ? 1 : 0;
   196:         }
   197:       idx++;
   198:     }
   199: 
   200:   /* No tag differed, so `t1' does not win. */
   201:   return 0;
   202: }
   203: /* Returns 1 if `wc' belongs to any class in the zero-terminated `klasses' list (both its upper and lower case forms are tried when `icase' is nonzero), 0 otherwise. */
   204: inline static int
   205: tre_neg_char_klasses_match(tre_ctype_t *klasses, tre_cint_t wc, int icase)
   206: {
   207:   tre_ctype_t *k;
   208:   DPRINT(("neg_char_klasses_test: %p, %d, %d\n", klasses, wc, icase));
   209:   for (k = klasses; *k != (tre_ctype_t)0; k++)
   210:     if (icase
   211:         ? (tre_isctype(tre_toupper(wc), *k)
   212:            || tre_isctype(tre_tolower(wc), *k))
   213:         : tre_isctype(wc, *k))
   214:       return 1; /* Match. */
   215:   return 0; /* No match. */
   216: }
End cpp section to tre/tre_match-utils.hpp[1]
Start cpp section to tre/tre_mem.cpp[1 /1 ]
     1: #line 6822 "./lpsrc/tre.pak"
     2: /*
     3:   tre-mem.c - TRE memory allocator
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: /*
    23:   This memory allocator is for allocating small memory blocks efficiently
    24:   in terms of memory overhead and execution speed.  The allocated blocks
    25:   cannot be freed individually, only all at once.  There can be multiple
    26:   allocators, though.
    27: */
    28: 
    29: #include "flx_target_tre_config.hpp"
    30: #include <stdlib.h>
    31: #include <string.h>
    32: 
    33: #include "tre_internal.hpp"
    34: #include "tre_mem.hpp"
    35: #include "tre_xmalloc.hpp"
    36: 
    37: 
    38: /* Returns a new memory allocator or NULL if out of memory.  With `provided'
    39:    nonzero the allocator is placed in `provided_block' and zero-filled; a
    40:    NULL `provided_block' is returned as NULL instead of being passed to
    41:    memset().  Otherwise the allocator is obtained from xcalloc(). */
    42: tre_mem_t
    43: tre_mem_new_impl(int provided, void *provided_block)
    44: {
    45:   tre_mem_t mem;
    46:   if (!provided)
    47:     mem = (tre_mem_struct*)xcalloc(1, sizeof(*mem));
    48:   else
    49:     mem = (tre_mem_struct*)provided_block;
    50:   if (provided && mem != NULL)
    51:     memset(mem, 0, sizeof(*mem));
    52:   return mem;
    53: }
    54: 
    55: 
    56: /* Frees the memory allocator and all memory allocated with it: every
    57:    block's data, every list node, and finally `mem' itself. */
    58: void
    59: tre_mem_destroy(tre_mem_t mem)
    60: {
    61:   tre_list_t *node = mem->blocks;
    62:   while (node != NULL)
    63:     {
    64:       tre_list_t *following = node->next;
    65:       xfree(node->data);
    66:       xfree(node);
    67:       node = following;
    68:     }
    69:   xfree(mem);
    70: }
    71: /* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the
    72:    allocated block, or NULL if `mem' already failed, an underlying malloc()
    73:    failed, a needed `provided_block' was NULL, or the padded request does
    74:    not fit in the current block (all but the first also set `mem->failed'). */
    75: void *
    76: tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
    77:                    int zero, size_t size)
    78: {
    79:   void *ptr;
    80:   if (mem->failed)
    81:     {
    82:       DPRINT(("tre_mem_alloc: oops, called after failure?!\n"));
    83:       return NULL;
    84:     }
    85: 
    86: #ifdef MALLOC_DEBUGGING
    87:   if (!provided)
    88:     {
    89:       ptr = xmalloc(1);
    90:       if (ptr == NULL)
    91:         {
    92:           DPRINT(("tre_mem_alloc: xmalloc forced failure\n"));
    93:           mem->failed = 1;
    94:           return NULL;
    95:         }
    96:       xfree(ptr);
    97:     }
    98: #endif /* MALLOC_DEBUGGING */
    99: 
   100:   if (mem->n < size)
   101:     {
   102:       /* We need more memory than is available in the current block.
   103:          Allocate a new block. */
   104:       if (provided)
   105:         {
   106:           DPRINT(("tre_mem_alloc: using provided block\n"));
   107:           if (provided_block == NULL)
   108:             {
   109:               DPRINT(("tre_mem_alloc: provided block was NULL\n"));
   110:               mem->failed = 1;
   111:               return NULL;
   112:             }
   113:           mem->ptr = (char*)provided_block;
   114:           mem->n = TRE_MEM_BLOCK_SIZE;
   115:         }
   116:       else
   117:         {
   118:           tre_list_t *l;
   119:           /* size_t, not int, so that a large `size' is not truncated. */
   120:           size_t block_size = (size * 8 > TRE_MEM_BLOCK_SIZE
   121:                                ? size * 8 : TRE_MEM_BLOCK_SIZE);
   122:           DPRINT(("tre_mem_alloc: allocating new %d byte block\n",
   123:                   (int)block_size));
   124:           l = (tre_list_t*)xmalloc(sizeof(*l));
   125:           if (l == NULL)
   126:             {
   127:               mem->failed = 1;
   128:               return NULL;
   129:             }
   130:           l->data = xmalloc(block_size);
   131:           if (l->data == NULL)
   132:             {
   133:               xfree(l);
   134:               mem->failed = 1;
   135:               return NULL;
   136:             }
   137:           l->next = NULL;
   138:           if (mem->current != NULL)
   139:             mem->current->next = l;
   140:           if (mem->blocks == NULL)
   141:             mem->blocks = l;
   142:           mem->current = l;
   143:           mem->ptr = (char*)l->data;
   144:           mem->n = block_size;
   145:         }
   146:     }
   147: 
   148:   /* Make sure the next pointer will be aligned. */
   149:   size += ALIGN(mem->ptr + size, long);
   150:   /* `mem->n' is unsigned: refuse a request larger than the block (e.g. a
   151:      provided block smaller than `size') instead of letting it wrap. */
   152:   if (size > mem->n)
   153:     {
   154:       mem->failed = 1;
   155:       return NULL;
   156:     }
   157:   /* Allocate from current block, zeroing it when `zero' is set. */
   158:   ptr = mem->ptr;
   159:   mem->ptr += size;
   160:   mem->n -= size;
   161:   if (zero)
   162:     memset(ptr, 0, size);
   163:   return ptr;
   164: }
   165: 
   166: /* EOF */
End cpp section to tre/tre_mem.cpp[1]
Start cpp section to tre/tre_mem.hpp[1 /1 ]
     1: #line 6989 "./lpsrc/tre.pak"
     2: /*
     3:   tre-mem.h - TRE memory allocator interface
     4: 
     5:   Copyright (C) 2001-2003 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #ifndef TRE_MEM_H
    23: #define TRE_MEM_H 1
    24: 
    25: #include <stdlib.h>
    26: /* Size in bytes of a standard allocator block; also the size recorded for a caller-provided block and requested from alloca() by tre_mem_alloca(). */
    27: #define TRE_MEM_BLOCK_SIZE 1024
    28: /* Singly linked list node; the allocator in tre_mem.cpp keeps one node per xmalloc()ed block. */
    29: typedef struct tre_list {
    30:   void *data;               /* the block's memory; freed by tre_mem_destroy() */
    31:   struct tre_list *next;    /* following node, or NULL for the last one */
    32: } tre_list_t;
    33: 
    34: typedef struct tre_mem_struct {  /* allocator state; tre_mem_t is a pointer to this struct */
    35:   tre_list_t *blocks;  /* first block of the list */
    36:   tre_list_t *current;  /* most recently appended block (the list tail) */
    37:   char *ptr;  /* next free byte inside the current block */
    38:   size_t n;  /* bytes still available in the current block */
    39:   int failed;  /* NOTE(review): presumably set once an allocation has failed -- confirm in tre_mem.cpp */
    40:   void **provided;  /* NOTE(review): presumably tied to caller-provided (alloca) blocks -- confirm */
    41: } *tre_mem_t;
    42: /* Implementation entry points behind the macros below.  `provided' and `provided_block' carry a
    43:    caller-supplied (alloca) block; a non-zero `zero' makes tre_mem_alloc_impl clear the returned bytes. */
    44: tre_mem_t tre_mem_new_impl(int provided, void *provided_block);
    45: void *tre_mem_alloc_impl(tre_mem_t mem, int provided, void *provided_block,
    46:                          int zero, size_t size);
    47: 
    48: /* Returns a new memory allocator or NULL if out of memory. */
    49: #define tre_mem_new()  tre_mem_new_impl(0, NULL)
    50: 
    51: /* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the
    52:    allocated block or NULL if an underlying malloc() failed. */
    53: #define tre_mem_alloc(mem, size) tre_mem_alloc_impl(mem, 0, NULL, 0, size)
    54: 
    55: /* Allocates a block of `size' bytes from `mem'.  Returns a pointer to the
    56:    allocated block or NULL if an underlying malloc() failed.  The memory
    57:    is set to zero. */
    58: #define tre_mem_calloc(mem, size) tre_mem_alloc_impl(mem, 0, NULL, 1, size)
    59: 
    60: #ifdef TRE_USE_ALLOCA
    61: /* alloca() versions.  Like above, but memory is allocated with alloca()
    62:    instead of malloc().  The macro arguments are evaluated more than once, and
    63:    tre_mem_alloca only alloca()s a new block when fewer than `size' bytes remain. */
    64: #define tre_mem_newa() \
    65:   tre_mem_new_impl(1, alloca(sizeof(struct tre_mem_struct)))
    66: 
    67: #define tre_mem_alloca(mem, size)                                             \
    68:   ((mem)->n >= (size)                                                         \
    69:    ? tre_mem_alloc_impl((mem), 1, NULL, 0, (size))                            \
    70:    : tre_mem_alloc_impl((mem), 1, alloca(TRE_MEM_BLOCK_SIZE), 0, (size)))
    71: #endif /* TRE_USE_ALLOCA */
    72: 
    73: 
    74: /* Frees the memory allocator and all memory allocated with it. */
    75: void tre_mem_destroy(tre_mem_t mem);
    76: 
    77: #endif /* TRE_MEM_H */
    78: 
    79: /* EOF */
End cpp section to tre/tre_mem.hpp[1]
Start cpp section to tre/tre_parse.cpp[1 /1 ]
     1: #line 7069 "./lpsrc/tre.pak"
     2: /*
     3:   tre-parse.c - Regexp parser
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: */
    20: 
    21: /*
    22:   This parser is just a simple recursive descent parser for POSIX.2
    23:   regexps.  The parser supports both the obsolete default syntax and
    24:   the "extended" syntax, and some nonstandard extensions.
    25: */
    26: 
    27: 
    28: #include "flx_target_tre_config.hpp"
    29: #include <string.h>
    30: #include <assert.h>
    31: #include <limits.h>
    32: 
    33: #include "tre_xmalloc.hpp"
    34: #include "tre_mem.hpp"
    35: #include "tre_ast.hpp"
    36: #include "tre_stack.hpp"
    37: #include "tre_parse.hpp"
    38: 
    39: 
    40: /* Characters with special meanings in regexp syntax, spelled as wide-character (L'x') literals. */
    41: #define CHAR_PIPE          L'|'
    42: #define CHAR_LPAREN        L'('
    43: #define CHAR_RPAREN        L')'
    44: #define CHAR_LBRACE        L'{'
    45: #define CHAR_RBRACE        L'}'
    46: #define CHAR_LBRACKET      L'['
    47: #define CHAR_RBRACKET      L']'
    48: #define CHAR_MINUS         L'-'
    49: #define CHAR_STAR          L'*'
    50: #define CHAR_QUESTIONMARK  L'?'
    51: #define CHAR_PLUS          L'+'
    52: #define CHAR_PERIOD        L'.'
    53: #define CHAR_COLON         L':'
    54: #define CHAR_EQUAL         L'='
    55: #define CHAR_COMMA         L','
    56: #define CHAR_CARET         L'^'
    57: #define CHAR_DOLLAR        L'$'
    58: #define CHAR_BACKSLASH     L'\\'
    59: #define CHAR_HASH          L'#'
    60: #define CHAR_TILDE         L'~'
    61: 
    62: 
    63: /* Macro table for expanding \w, \s, etc.: a flat list of (name, expansion) string pairs ended by one NULL. */
    64: static const char *tre_macros[] =
    65:   { "t", "\t",             "n", "\n",             "r", "\r",
    66:     "f", "\f",             "a", "\a",             "e", "\033",
    67:     "w", "[[:alnum:]_]",   "W", "[^[:alnum:]_]",  "s", "[[:space:]]",
    68:     "S", "[^[:space:]]",   "d", "[[:digit:]]",    "D", "[^[:digit:]]",
    69:    NULL };
    70: 
    71: 
    72: /* Expands the macro whose name starts at `regex' (text ends at `regex_end')
    73:    into `buf', which must have room for `buf_len' items (at least one).  The
    74:    result is always zero-terminated; buf[0] is zero if nothing in `tre_macros' matches. */
    75: static void
    76: tre_expand_macro(const tre_char_t *regex, const tre_char_t *regex_end,
    77:                  tre_char_t *buf, size_t buf_len)
    78: {
    79:   int i;
    80:   size_t len = regex_end - regex;
    81: 
    82:   buf[0] = 0;
    83:   for (i = 0; tre_macros[i] != NULL; i += 2)  /* step over (name, expansion) pairs */
    84:     {
    85:       int match = 0;
    86:       if (strlen(tre_macros[i]) > len)  /* name longer than the remaining input */
    87:         continue;
    88: #ifdef TRE_WCHAR
    89:       {
    90:         tre_char_t tmp_wcs[64];
    91:         unsigned int j;
    92:         for (j = 0; j < strlen(tre_macros[i]) && j + 1 < elementsof(tmp_wcs); j++)
    93:           tmp_wcs[j] = btowc(tre_macros[i][j]);
    94:         tmp_wcs[j] = 0;  /* the loop keeps one slot free for this terminator */
    95:         match = wcsncmp(tmp_wcs, regex, strlen(tre_macros[i]));
    96:       }
    97: #else /* !TRE_WCHAR */
    98:       match = strncmp(tre_macros[i], (const char*)regex, strlen(tre_macros[i]));
    99: #endif /* !TRE_WCHAR */
   100:       if (match == 0)
   101:         {
   102:           unsigned int j;
   103:           DPRINT(("Expanding macro '%s' => '%s'\n",
   104:                   tre_macros[i], tre_macros[i + 1]));
   105:           for (j = 0; tre_macros[i + 1][j] != 0 && j + 1 < buf_len; j++)
   106:             {
   107: #ifdef TRE_WCHAR
   108:               buf[j] = btowc(tre_macros[i + 1][j]);
   109: #else /* !TRE_WCHAR */
   110:               buf[j] = tre_macros[i + 1][j];
   111: #endif /* !TRE_WCHAR */
   112:             }
   113:           buf[j] = 0;  /* in bounds: copying stops one item short of buf_len */
   114:           break;
   115:         }
   116:     }
   117: }
   118: /* Appends a literal node for [min, max] at (*items)[*i], doubling the array (capacity *max_i) when it is full. */
   119: static reg_errcode_t
   120: tre_new_item(tre_mem_t mem, int min, int max, int *i, int *max_i,
   121:          tre_ast_node_t ***items)
   122: {
   123:   reg_errcode_t status;
   124:   tre_ast_node_t **array = *items;
   125:   /* Allocate more space if necessary. */
   126:   if (*i >= *max_i)
   127:     {
   128:       tre_ast_node_t **new_items;
   129:       DPRINT(("out of array space, i = %d\n", *i));
   130:       /* If the array already has more than 1024 slots, give up -- there's
   131:          probably an error in the regexp (e.g. not a '\0' terminated
   132:          string and missing ']') */
   133:       if (*max_i > 1024)
   134:         return REG_ESPACE;
   135:       new_items = (tre_ast_node_t**)xrealloc(array, sizeof(**items) * *max_i * 2);
   136:       if (new_items == NULL)
   137:         return REG_ESPACE;  /* capacity not yet changed, so array and size still agree */
   138:       *max_i *= 2;  /* commit the new capacity only after the realloc succeeded */
   139:       *items = array = new_items;
   140:     }
   141:   array[*i] = tre_ast_new_literal(mem, min, max, -1);  /* position -1: no position assigned here */
   142:   status = array[*i] == NULL ? REG_ESPACE : REG_OK;
   143:   (*i)++;  /* the slot is counted even when the literal could not be created */
   144:   return status;
   145: }
   146: 
   147: /* Expands character klass `klass' into literal range items appended to *items via tre_new_item, scanning
   148:    codes 0..255.  With REG_ICASE a code also counts when its lower- or upper-case form is in the klass. */
   149: static reg_errcode_t
   150: tre_expand_ctype(tre_mem_t mem, tre_ctype_t klass, tre_ast_node_t ***items,
   151:                  int *i, int *max_i, int cflags)
   152: {
   153:   reg_errcode_t status = REG_OK;
   154:   tre_cint_t c;
   155:   int j, min = -1, max = 0;  /* min < 0 means no run of matching codes is open */
   156:   assert(TRE_MB_CUR_MAX == 1);
   157: 
   158:   DPRINT(("  expanding klass to character ranges\n"));
   159:   for (j = 0; (j < 256) && (status == REG_OK); j++)
   160:     {
   161:       c = j;
   162:       if (tre_isctype(c, klass)
   163:           || ((cflags & REG_ICASE)
   164:               && (tre_isctype(tre_tolower(c), klass)
   165:                   || tre_isctype(tre_toupper(c), klass))))
   166: {
   167:           if (min < 0)
   168:             min = c;  /* open a new run */
   169:           max = c;
   170:         }
   171:       else if (min >= 0)
   172:         {
   173:           DPRINT(("  range %c (%d) to %c (%d)\n", min, min, max, max));
   174:           status = tre_new_item(mem, min, max, i, max_i, items);  /* run ended: emit it */
   175:           min = -1;
   176:         }
   177:     }
   178:   if (min >= 0 && status == REG_OK)  /* flush a run that reaches code 255 */
   179:     status = tre_new_item(mem, min, max, i, max_i, items);
   180:   return status;
   181: }
   182: 
   183: /* qsort() comparator over tre_ast_node_t* elements: orders nodes by ascending code_min of their literal. */
   184: static int
   185: tre_compare_items(const void *a, const void *b)
   186: {
   187:   tre_ast_node_t *node_a = *(tre_ast_node_t **)a;
   188:   tre_ast_node_t *node_b = *(tre_ast_node_t **)b;
   189:   tre_literal_t *l_a = (tre_literal_t*)node_a->obj, *l_b = (tre_literal_t*)node_b->obj;  /* obj is treated as a literal */
   190:   int a_min = l_a->code_min, b_min = l_b->code_min;
   191: 
   192:   if (a_min < b_min)
   193:     return -1;
   194:   else if (a_min > b_min)
   195:     return 1;
   196:   else
   197:     return 0;
   198: }
   199: 
   200: #ifndef TRE_USE_SYSTEM_WCTYPE
   201: 
   202: /* isalnum() and the rest may be macros, so wrap them in real functions whose addresses tre_ctype_map can store. */
   203: int tre_isalnum_func(tre_cint_t c) { return tre_isalnum(c); }
   204: int tre_isalpha_func(tre_cint_t c) { return tre_isalpha(c); }
   205: /* The #else fallbacks below are always defined, but tre_ctype_map lists "ascii"/"blank" only when the macro exists. */
   206: #ifdef tre_isascii
   207: int tre_isascii_func(tre_cint_t c) { return tre_isascii(c); }
   208: #else /* !tre_isascii */
   209: int tre_isascii_func(tre_cint_t c) { return !(c >> 7); }  /* fallback: true when c >> 7 is zero */
   210: #endif /* !tre_isascii */
   211: 
   212: #ifdef tre_isblank
   213: int tre_isblank_func(tre_cint_t c) { return tre_isblank(c); }
   214: #else /* !tre_isblank */
   215: int tre_isblank_func(tre_cint_t c) { return ((c == ' ') || (c == '\t')); }  /* fallback: space or tab */
   216: #endif /* !tre_isblank */
   217: 
   218: int tre_iscntrl_func(tre_cint_t c) { return tre_iscntrl(c); }
   219: int tre_isdigit_func(tre_cint_t c) { return tre_isdigit(c); }
   220: int tre_isgraph_func(tre_cint_t c) { return tre_isgraph(c); }
   221: int tre_islower_func(tre_cint_t c) { return tre_islower(c); }
   222: int tre_isprint_func(tre_cint_t c) { return tre_isprint(c); }
   223: int tre_ispunct_func(tre_cint_t c) { return tre_ispunct(c); }
   224: int tre_isspace_func(tre_cint_t c) { return tre_isspace(c); }
   225: int tre_isupper_func(tre_cint_t c) { return tre_isupper(c); }
   226: int tre_isxdigit_func(tre_cint_t c) { return tre_isxdigit(c); }
   227: /* Maps character klass names to their predicate functions; the table ends with a { NULL, NULL } entry. */
   228: struct {
   229:   const char *name;  /* klass name; const because every initializer is a string literal */
   230:   int (*func)(tre_cint_t);  /* predicate returning non-zero for members of the klass */
   231: } tre_ctype_map[] = {
   232:   { "alnum", &tre_isalnum_func },
   233:   { "alpha", &tre_isalpha_func },
   234: #ifdef tre_isascii
   235:   { "ascii", &tre_isascii_func },
   236: #endif /* tre_isascii */
   237: #ifdef tre_isblank
   238:   { "blank", &tre_isblank_func },
   239: #endif /* tre_isblank */
   240:   { "cntrl", &tre_iscntrl_func },
   241:   { "digit", &tre_isdigit_func },
   242:   { "graph", &tre_isgraph_func },
   243:   { "lower", &tre_islower_func },
   244:   { "print", &tre_isprint_func },
   245:   { "punct", &tre_ispunct_func },
   246:   { "space", &tre_isspace_func },
   247:   { "upper", &tre_isupper_func },
   248:   { "xdigit", &tre_isxdigit_func },
   249:   { NULL, NULL}
   250: };
   251: /* Looks up klass `name' (exact strcmp match) in tre_ctype_map; returns its predicate, or (tre_ctype_t)0 if unknown. */
   252: tre_ctype_t tre_ctype(const char *name)
   253: {
   254:   int i;
   255:   for (i = 0; tre_ctype_map[i].name != NULL; i++)  /* the table is NULL-terminated */
   256:     {
   257:       if (strcmp(name, tre_ctype_map[i].name) == 0)
   258:         return tre_ctype_map[i].func;
   259:     }
   260:   return (tre_ctype_t)0;
   261: }
   262: #endif /* !TRE_USE_SYSTEM_WCTYPE */
   263: 
   264: /* Maximum number of character klasses that can occur in a negated bracket
   265:    expression.  */
   266: #define MAX_NEG_CLASSES 64
   267: 
   268: /* Maximum length of character klass names.  NOTE(review): defined with no value and not referenced by the code visible here, which hard-codes a 64-char name buffer -- confirm the intended value. */
   269: #define MAX_CLASS_NAME
   270: 
   271: static reg_errcode_t
   272: tre_parse_bracket_items(tre_parse_ctx_t *ctx, int negate,
   273:                         tre_ctype_t neg_klasses[], int *num_neg_klasses,
   274:                         tre_ast_node_t ***items, int *num_items,
   275:                         int *items_size)
   276: {
   277:   const tre_char_t *re = ctx->re;
   278:   reg_errcode_t status = REG_OK;
   279:   tre_ctype_t klass = (tre_ctype_t)0;
   280:   int i = *num_items;
   281:   int max_i = *items_size;
   282:   int skip;
   283: 
   284:   /* Build an array of the items in the bracket expression. */
   285:   while (status == REG_OK)
   286:     {
   287:       skip = 0;
   288:       if (re == ctx->re_end)
   289:         {
   290:           status = REG_EBRACK;
   291:         }
   292:       else if (*re == CHAR_RBRACKET && re > ctx->re)
   293:         {
   294:           DPRINT(("tre_parse_bracket:   done: '%.*" STRF "'\n",
   295:                   ctx->re_end - re, re));
   296:           re++;
   297:           break;
   298:         }
   299:       else
   300:         {
   301:           tre_cint_t min = 0, max = 0;
   302: 
   303:           klass = (tre_ctype_t)0;
   304:           if (re + 2 < ctx->re_end
   305:               && *(re + 1) == CHAR_MINUS && *(re + 2) != CHAR_RBRACKET)
   306:             {
   307:               DPRINT(("tre_parse_bracket:  range: '%.*" STRF "'\n",
   308:                       ctx->re_end - re, re));
   309:               min = *re;
   310:               max = *(re + 2);
   311:               re += 3;
   312:               /* XXX - Should use collation order instead of encoding values
   313:                  in character ranges. */
   314:               if (min > max)
   315:                 status = REG_ERANGE;
   316:             }
   317:           else if (re + 1 < ctx->re_end
   318:                    && *re == CHAR_LBRACKET && *(re + 1) == CHAR_PERIOD)
   319:             status = REG_ECOLLATE;
   320:           else if (re + 1 < ctx->re_end
   321:                    && *re == CHAR_LBRACKET && *(re + 1) == CHAR_EQUAL)
   322:             status = REG_ECOLLATE;
   323:           else if (re + 1 < ctx->re_end
   324:                    && *re == CHAR_LBRACKET && *(re + 1) == CHAR_COLON)
   325:             {
   326:               char tmp_str[64];
   327:               const tre_char_t *endptr = re + 2;
   328:               int len;
   329:               DPRINT(("tre_parse_bracket:  klass: '%.*" STRF "'\n",
   330:                       ctx->re_end - re, re));
   331:               while (endptr < ctx->re_end && *endptr != CHAR_COLON)
   332:                 endptr++;
   333:               if (endptr != ctx->re_end)
   334:                 {
   335:                   len = MIN(endptr - re - 2, 63);
   336: #ifdef TRE_WCHAR
   337:                   {
   338:                     tre_char_t tmp_wcs[64];
   339:                     wcsncpy(tmp_wcs, re + 2, len);
   340:                     tmp_wcs[len] = L'\0';
   341: #if defined HAVE_WCSRTOMBS
   342:                     {
   343:                       mbstate_t state;
   344:                       const tre_char_t *src = tmp_wcs;
   345:                       memset(&state, '\0', sizeof(state));
   346:                       len = wcsrtombs(tmp_str, &src, sizeof(tmp_str), &state);
   347:                     }
   348: #elif defined HAVE_WCSTOMBS
   349:                     len = wcstombs(tmp_str, tmp_wcs, 63);
   350: #endif /* defined HAVE_WCSTOMBS */
   351:                   }
   352: #else /* !TRE_WCHAR */
   353:                   strncpy(tmp_str, (const char*)re + 2, len);
   354: #endif /* !TRE_WCHAR */
   355:                   tmp_str[len] = '\0';
   356:                   DPRINT(("  klass name: %s\n", tmp_str));
   357:                   klass = tre_ctype(tmp_str);
   358:                   if (!klass)
   359:                     status = REG_ECTYPE;
   360:                   /* Optimize character klasses for 8 bit character sets. */
   361:                   if (status == REG_OK && TRE_MB_CUR_MAX == 1)
   362:                     {
   363:                       status = tre_expand_ctype(ctx->mem, klass, items,
   364:                                                 &i, &max_i, ctx->cflags);
   365:                       klass = (tre_ctype_t)0;
   366:                       skip = 1;
   367:                     }
   368:                   re = endptr + 2;
   369:                 }
   370:               else
   371:                 status = REG_ECTYPE;
   372:               min = 0;
   373:               max = TRE_CHAR_MAX;
   374:             }
   375:           else
   376:             {
   377:               DPRINT(("tre_parse_bracket:   char: '%.*" STRF "'\n",
   378:                       ctx->re_end - re, re));
   379:               if (*re == CHAR_MINUS && *(re + 1) != CHAR_RBRACKET
   380:                   && ctx->re != re)
   381:                 /* Two ranges are not allowed to share and endpoint. */
   382:                 status = REG_ERANGE;
   383:               min = max = *re++;
   384:             }
   385: 
   386:           if (status != REG_OK)
   387:             break;
   388: 
   389:           if (klass && negate)
   390:             if (*num_neg_klasses >= MAX_NEG_CLASSES)
   391:               status = REG_ESPACE;
   392:             else
   393:               neg_klasses[(*num_neg_klasses)++] = klass;
   394:           else if (!skip)
   395:             {
   396:               status = tre_new_item(ctx->mem, min, max, &i, &max_i, items);
   397:               if (status != REG_OK)
   398:                 break;
   399:               ((tre_literal_t*)((*items)[i-1])->obj)->u.klass = klass;
   400:             }
   401: 
   402:           /* Add opposite-case counterpoints if REG_ICASE is present.
   403:              This is broken if there are more than two "same" characters. */
   404:           if (ctx->cflags & REG_ICASE && !klass && status == REG_OK && !skip)
   405:             {
   406:               int cmin, ccurr;
   407: 
   408:               DPRINT(("adding opposite-case counterpoints\n"));
   409:               while (min <= max)
   410:                 {
   411:                   if (tre_islower(min))
   412:                     {
   413:                       cmin = ccurr = tre_toupper(min++);
   414:                       while (tre_islower(min) && tre_toupper(min) == ccurr + 1
   415:                              && min <= max)
   416:                         ccurr = tre_toupper(min++);
   417:                       status = tre_new_item(ctx->mem, cmin, ccurr,
   418:                                             &i, &max_i, items);
   419:                     }
   420:                   else if (tre_isupper(min))
   421:                     {
   422:                       cmin = ccurr = tre_tolower(min++);
   423:                       while (tre_isupper(min) && tre_tolower(min) == ccurr + 1
   424:                              && min <= max)
   425:                         ccurr = tre_tolower(min++);
   426:                       status = tre_new_item(ctx->mem, cmin, ccurr,
   427:                                             &i, &max_i, items);
   428:                     }
   429:                   else min++;
   430:                   if (status != REG_OK)
   431:                     break;
   432:                 }
   433:               if (status != REG_OK)
   434:                 break;
   435:             }
   436:         }
   437:     }
   438:   *num_items = i;
   439:   *items_size = max_i;
   440:   ctx->re = re;
   441:   return status;
   442: }
   443: 
   444: static reg_errcode_t
   445: tre_parse_bracket(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
   446: {
   447:   tre_ast_node_t *node = NULL;
   448:   int negate = 0;
   449:   reg_errcode_t status = REG_OK;
   450:   tre_ast_node_t **items, *u, *n;
   451:   int i = 0, j, max_i = 32, curr_max, curr_min;
   452:   tre_ctype_t neg_klasses[MAX_NEG_CLASSES];
   453:   int num_neg_klasses = 0;
   454: 
   455:   /* Start off with an array of `max_i' elements. */
   456:   items = (tre_ast_node_t**)xmalloc(sizeof(*items) * max_i);
   457:   if (items == NULL)
   458:     return REG_ESPACE;
   459: 
   460:   if (*ctx->re == CHAR_CARET)
   461:     {
   462:       DPRINT(("tre_parse_bracket: negate: '%.*" STRF "'\n",
   463:               ctx->re_end - ctx->re, ctx->re));
   464:       negate = 1;
   465:       ctx->re++;
   466:     }
   467: 
   468:   status = tre_parse_bracket_items(ctx, negate, neg_klasses, &num_neg_klasses,
   469:                                    &items, &i, &max_i);
   470: 
   471:   if (status != REG_OK)
   472:     goto parse_bracket_done;
   473: 
   474:   /* Sort the array if we need to negate it. */
   475:   if (negate)
   476:     qsort(items, i, sizeof(*items), tre_compare_items);
   477: 
   478:   curr_max = curr_min = 0;
   479:   /* Build a union of the items in the array, negated if necessary. */
   480:   for (j = 0; j < i && status == REG_OK; j++)
   481:     {
   482:       int min, max;
   483:       tre_literal_t *l = (tre_literal_t*)items[j]->obj;
   484:       min = l->code_min;
   485:       max = l->code_max;
   486: 
   487:       DPRINT(("item: %d - %d, klass %ld, curr_max = %d\n",
   488:               (int)l->code_min, (int)l->code_max, (long)l->u.klass, curr_max));
   489: 
   490:       if (negate)
   491:         {
   492:           if (min < curr_max)
   493:             {
   494:               /* Overlap. */
   495:               curr_max = MAX(max + 1, curr_max);
   496:               DPRINT(("overlap, curr_max = %d\n", curr_max));
   497:               l = NULL;
   498:             }
   499:           else
   500:             {
   501:               /* No overlap. */
   502:               curr_max = min - 1;
   503:               if (curr_max >= curr_min)
   504:                 {
   505:                   DPRINT(("no overlap\n"));
   506:                   l->code_min = curr_min;
   507:                   l->code_max = curr_max;
   508:                 }
   509:               else
   510:                 {
   511:                   DPRINT(("no overlap, zero room\n"));
   512:                   l = NULL;
   513:                 }
   514:               curr_min = curr_max = max + 1;
   515:             }
   516:         }
   517: 
   518:       if (l != NULL)
   519:         {
   520:           int k;
   521:           DPRINT(("creating %d - %d\n", (int)l->code_min, (int)l->code_max));
   522:           l->position = ctx->position;
   523:           if (num_neg_klasses > 0)
   524:             {
   525:               l->neg_klasses = (tre_ctype_t*)tre_mem_alloc(ctx->mem,
   526:                                              (sizeof(l->neg_klasses)
   527:                                               * (num_neg_klasses + 1)));
   528:               if (l->neg_klasses == NULL)
   529:                 {
   530:                   status = REG_ESPACE;
   531:                   break;
   532:                 }
   533:               for (k = 0; k < num_neg_klasses; k++)
   534:                 l->neg_klasses[k] = neg_klasses[k];
   535:               l->neg_klasses[k] = (tre_ctype_t)0;
   536:             }
   537:           else
   538:             l->neg_klasses = NULL;
   539:           if (node == NULL)
   540:             node = items[j];
   541:           else
   542:             {
   543:               u = tre_ast_new_union(ctx->mem, node, items[j]);
   544:               if (u == NULL)
   545:                 status = REG_ESPACE;
   546:               node = u;
   547:             }
   548:         }
   549:     }
   550: 
   551:   if (status != REG_OK)
   552:     goto parse_bracket_done;
   553: 
   554:   if (negate)
   555:     {
   556:       int k;
   557:       DPRINT(("final: creating %d - %d\n", curr_min, (int)TRE_CHAR_MAX));
   558:       n = tre_ast_new_literal(ctx->mem, curr_min, TRE_CHAR_MAX, ctx->position);
   559:       if (n == NULL)
   560:         status = REG_ESPACE;
   561:       else
   562:         {
   563:           tre_literal_t *l = (tre_literal_t*)n->obj;
   564:           if (num_neg_klasses > 0)
   565:             {
   566:               l->neg_klasses = (tre_ctype_t*)tre_mem_alloc(ctx->mem,
   567:                                              (sizeof(l->neg_klasses)
   568:                                               * (num_neg_klasses + 1)));
   569:               if (l->neg_klasses == NULL)
   570:                 {
   571:                   status = REG_ESPACE;
   572:                   goto parse_bracket_done;
   573:                 }
   574:               for (k = 0; k < num_neg_klasses; k++)
   575:                 l->neg_klasses[k] = neg_klasses[k];
   576:               l->neg_klasses[k] = (tre_ctype_t)0;
   577:             }
   578:           else
   579:             l->neg_klasses = NULL;
   580:           if (node == NULL)
   581:             node = n;
   582:           else
   583:             {
   584:               u = tre_ast_new_union(ctx->mem, node, n);
   585:               if (u == NULL)
   586:                 status = REG_ESPACE;
   587:               node = u;
   588:             }
   589:         }
   590:     }
   591: 
   592:   if (status != REG_OK)
   593:     goto parse_bracket_done;
   594: 
   595: #ifdef TRE_DEBUG
   596:   tre_ast_print(node);
   597: #endif /* TRE_DEBUG */
   598: 
   599:  parse_bracket_done:
   600:   xfree(items);
   601:   ctx->position++;
   602:   *result = node;
   603:   return status;
   604: }
   605: 
   606: 
   607: /* Parses a non-negative decimal integer at *regex and advances *regex past it.  Returns -1 if
   608:    there is no digit; values too large for an int saturate at INT_MAX instead of overflowing. */
   609: static int
   610: tre_parse_int(const tre_char_t **regex, const tre_char_t *regex_end)
   611: {
   612:   int num = -1;
   613:   const tre_char_t *r = *regex;
   614:   while (r < regex_end && *r >= L'0' && *r <= L'9')
   615:     {
   616:       int digit = (int)(*r - L'0');
   617:       if (num < 0) num = 0;  /* first digit seen: switch from "no number" to 0 */
   618:       num = (num > (INT_MAX - digit) / 10) ? INT_MAX : num * 10 + digit;
   619:       r++;
   620:     }
   621:   *regex = r;
   622:   return num;
   623: }
   624: 
   625: 
   626: static reg_errcode_t
   627: tre_parse_bound(tre_parse_ctx_t *ctx, tre_ast_node_t **result)
   628: {
   629:   int min, max, i;
   630:   int cost_ins, cost_del, cost_subst, cost_max;
   631:   int limit_ins, limit_del, limit_subst, limit_err;
   632:   const tre_char_t *r = ctx->re;
   633:   const tre_char_t *start;
   634:   int minimal = 0;
   635:   int approx = 0;
   636:   int costs_set = 0;
   637:   int counts_set = 0;
   638: 
   639:   cost_ins = cost_del = cost_subst = cost_max = TRE_PARAM_UNSET;
   640:   limit_ins = limit_del = limit_subst = limit_err = TRE_PARAM_UNSET;
   641: 
   642:   /* Parse number (minimum repetition count). */
   643:   min = -1;
   644:   if (r < ctx->re_end && *r >= L'0' && *r <= L'9') {
   645:     DPRINT(("tre_parse:   min count: '%.*" STRF "'\n", ctx->re_end - r, r));
   646:     min = tre_parse_int(&r, ctx->re_end);
   647:   }
   648: 
   649:   /* Parse comma and second number (maximum repetition count). */
   650:   max = min;
   651:   if (r < ctx->re_end && *r == CHAR_COMMA)
   652:     {
   653:       r++;
   654:       DPRINT(("tre_parse:   max count: '%.*" STRF "'\n", ctx->re_end - r, r));
   655:       max = tre_parse_int(&r, ctx->re_end);
   656:     }
   657: 
   658:   /* Check that the repeat counts are sane. */
   659:   if ((max >= 0 && min > max) || max > RE_DUP_MAX)
   660:     return REG_BADBR;
   661: 
   662: 
   663:   /*
   664:    '{'
   665:      optionally followed immediately by a number == minimum repcount
   666:      optionally followed by , then a number == maximum repcount
   667:       + then a number == maximum insertion count
   668:       - then a number == maximum deletion count
   669:       # then a number == maximum substitution count
   670:       ~ then a number == maximum number of errors
   671:       Any of +, -, # or ~ that is not followed by a number means that
   672:       the maximum count/number of errors is infinite.
   673: 
   674:       An equation of the form
   675:         Xi + Yd + Zs < C
   676:       can be specified to set costs and the cost limit to a value
   677:       different from the default value:
   678:         - X is the cost of an insertion
   679:         - Y is the cost of a deletion
   680:         - Z is the cost of a substitution
   681:         - C is the maximum cost
   682: 
   683:       If no count limit or cost is set for an operation, the operation
   684:       is not allowed at all.
   685:   */
   686: 
   687: 
   688:   do {
   689:     int done;
   690:     start = r;
   691: 
   692:     /* Parse count limit settings */
   693:     done = 0;
   694:     if (!counts_set)
   695:       while (r + 1 < ctx->re_end && !done)
   696:         {
   697:           switch (*r)
   698:             {
   699:             case CHAR_PLUS:  /* Insert limit */
   700:               DPRINT(("tre_parse:   ins limit: '%.*" STRF "'\n", ctx->re_end - r, r));
   701:               r++;
   702:               limit_ins = tre_parse_int(&r, ctx->re_end);
   703:               if (limit_ins < 0)
   704:                 limit_ins = INT_MAX;
   705:               counts_set = 1;
   706:               break;
   707:             case CHAR_MINUS: /* Delete limit */
   708:               DPRINT(("tre_parse:   del limit: '%.*" STRF "'\n", ctx->re_end - r, r));
   709:               r++;
   710:               limit_del = tre_parse_int(&r, ctx->re_end);
   711:               if (limit_del < 0)
   712:                 limit_del = INT_MAX;
   713:               counts_set = 1;
   714:               break;
   715:             case CHAR_HASH:  /* Substitute limit */
   716:               DPRINT(("tre_parse: subst limit: '%.*" STRF "'\n", ctx->re_end - r, r));
   717:               r++;
   718:               limit_subst = tre_parse_int(&r, ctx->re_end);
   719:               if (limit_subst < 0)
   720:                 limit_subst = INT_MAX;
   721:               counts_set = 1;
   722:               break;
   723:             case CHAR_TILDE: /* Maximum number of changes */
   724:               DPRINT(("tre_parse: count limit: '%.*" STRF "'\n", ctx->re_end - r, r));
   725:               r++;
   726:               limit_err = tre_parse_int(&r, ctx->re_end);
   727:               if (limit_err < 0)
   728:                 limit_err = INT_MAX;
   729:               approx = 1;
   730:               break;
   731:             case CHAR_COMMA:
   732:               r++;
   733:               break;
   734:             case L' ':
   735:               r++;
   736:               break;
   737:             case L'}':
   738:               done = 1;
   739:               break;
   740:             default:
   741:               done = 1;
   742:               break;
   743:             }
   744:         }
   745: 
   746:     /* Parse cost restriction equation. */
   747:     done = 0;
   748:     if (!costs_set)
   749:       while (r + 1 < ctx->re_end && !done)
   750:         {
   751:           switch (*r)
   752:             {
   753:             case CHAR_PLUS:
   754:             case L' ':
   755:               r++;
   756:               break;
   757:             case L'<':
   758:               DPRINT(("tre_parse:    max cost: '%.*" STRF "'\n", ctx->re_end - r, r));
   759:               r++;
   760:               while (*r == L' ')
   761:                 r++;
   762:               cost_max = tre_parse_int(&r, ctx->re_end);
   763:               if (cost_max < 0)
   764:                 cost_max = INT_MAX;
   765:               else
   766:                 cost_max--;
   767:               approx = 1;
   768:               break;
   769:             case CHAR_COMMA:
   770:               r++;
   771:               done = 1;
   772:               break;
   773:             default:
   774:               if (*r >= L'0' && *r <= L'9')
   775:                 {
   776: #ifdef TRE_DEBUG
   777:                   const tre_char_t *sr = r;
   778: #endif /* TRE_DEBUG */
   779:                   int cost = tre_parse_int(&r, ctx->re_end);
   780:                   /* XXX - make sure r is not past end. */
   781:                   switch (*r)
   782:                     {
   783:                     case L'i':  /* Insert cost */
   784:                       DPRINT(("tre_parse:    ins cost: '%.*" STRF "'\n",
   785:                               ctx->re_end - sr, sr));
   786:                       r++;
   787:                       cost_ins = cost;
   788:                       costs_set = 1;
   789:                       break;
   790:                     case L'd':  /* Delete cost */
   791:                       DPRINT(("tre_parse:    del cost: '%.*" STRF "'\n",
   792:                               ctx->re_end - sr, sr));
   793:                       r++;
   794:                       cost_del = cost;
   795:                       costs_set = 1;
   796:                       break;
   797:                     case L's':  /* Substitute cost */
   798:                       DPRINT(("tre_parse:  subst cost: '%.*" STRF "'\n",
   799:                               ctx->re_end - sr, sr));
   800:                       r++;
   801:                       cost_subst = cost;
   802:                       costs_set = 1;
   803:                       break;
   804:                     default:
   805:                       return REG_BADBR;
   806:                     }
   807:                 }
   808:               else
   809:                 {
   810:                   done = 1;
   811:                   break;
   812:                 }
   813:             }
   814:         }
   815:   } while (start != r);
   816: 
   817:   /* Missing }. */
   818:   if (r >= ctx->re_end)
   819:     return REG_EBRACE;
   820: 
   821:   /* Empty contents of {}. */
   822:   if (r == ctx->re)
   823:     return REG_BADBR;
   824: 
   825:   /* Parse the ending '}' or '\}'.*/
   826:   if (ctx->cflags & REG_EXTENDED)
   827:     {
   828:       if (r >= ctx->re_end || *r != CHAR_RBRACE)
   829:         return REG_BADBR;
   830:       r++;
   831:     }
   832:   else
   833:     {
   834:       if (r + 1 >= ctx->re_end
   835:           || *r != CHAR_BACKSLASH
   836:           || *(r + 1) != CHAR_RBRACE)
   837:         return REG_BADBR;
   838:       r += 2;
   839:     }
   840: 
   841: 
   842:   /* Parse trailing '?' marking minimal repetition. */
   843:   if (r < ctx->re_end && *r == CHAR_QUESTIONMARK)
   844:     {
   845:       minimal = 1;
   846:       r++;
   847:     }
   848: 
   849:   /* Create the AST node(s). */
   850:   if (min == 0 && max == 0)
   851:     {
   852:       *result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
   853:       if (*result == NULL)
   854:         return REG_ESPACE;
   855:     }
   856:   else
   857:     {
   858:       if (min < 0 && max < 0)
   859:         /* Only approximate parameters set, no repetitions. */
   860:         min = max = 1;
   861: 
   862:       *result = tre_ast_new_iter(ctx->mem, *result, min, max, minimal);
   863:       if (!*result)
   864:         return REG_ESPACE;
   865: 
   866:       /* If approximate matching parameters are set, add them to the
   867:          iteration node. */
   868:       if (approx || costs_set || counts_set)
   869:         {
   870:           unsigned int *params;
   871:           tre_iteration_t *iter = (tre_iteration_t*)(*result)->obj;
   872: 
   873:           if (costs_set || counts_set)
   874:             {
   875:               if (limit_ins == TRE_PARAM_UNSET)
   876:                 {
   877:                   if (cost_ins == TRE_PARAM_UNSET)
   878:                     limit_ins = 0;
   879:                   else
   880:                     limit_ins = INT_MAX;
   881:                 }
   882: 
   883:               if (limit_del == TRE_PARAM_UNSET)
   884:                 {
   885:                   if (cost_del == TRE_PARAM_UNSET)
   886:                     limit_del = 0;
   887:                   else
   888:                     limit_del = INT_MAX;
   889:                 }
   890: 
   891:               if (limit_subst == TRE_PARAM_UNSET)
   892:                 {
   893:                   if (cost_subst == TRE_PARAM_UNSET)
   894:                     limit_subst = 0;
   895:                   else
   896:                     limit_subst = INT_MAX;
   897:                 }
   898:             }
   899: 
   900:           if (cost_max == TRE_PARAM_UNSET)
   901:             cost_max = INT_MAX;
   902:           if (limit_err == TRE_PARAM_UNSET)
   903:             limit_err = INT_MAX;
   904: 
   905:           ctx->have_approx = 1;
   906:           params = (unsigned int*)tre_mem_alloc(ctx->mem, sizeof(*params) * TRE_PARAM_LAST);
   907:           if (!params)
   908:             return REG_ESPACE;
   909:           for (i = 0; i < TRE_PARAM_LAST; i++)
   910:             params[i] = TRE_PARAM_UNSET;
   911:           params[TRE_PARAM_COST_INS] = cost_ins;
   912:           params[TRE_PARAM_COST_DEL] = cost_del;
   913:           params[TRE_PARAM_COST_SUBST] = cost_subst;
   914:           params[TRE_PARAM_COST_MAX] = cost_max;
   915:           params[TRE_PARAM_MAX_INS] = limit_ins;
   916:           params[TRE_PARAM_MAX_DEL] = limit_del;
   917:           params[TRE_PARAM_MAX_SUBST] = limit_subst;
   918:           params[TRE_PARAM_MAX_ERR] = limit_err;
   919:           iter->params = params;
   920:         }
   921:     }
   922: 
   923:   DPRINT(("tre_parse_bound: min %d, max %d, costs [%d,%d,%d, total %d], "
   924:           "limits [%d,%d,%d, total %d]\n",
   925:           min, max, cost_ins, cost_del, cost_subst, cost_max,
   926:           limit_ins, limit_del, limit_subst, limit_err));
   927: 
   928: 
   929:   ctx->re = r;
   930:   return REG_OK;
   931: }
   932: 
   933: typedef enum {  /* Work-item tags that tre_parse() pushes on and pops off its explicit parse stack. */
   934:   PARSE_RE = 0,  /* Parse a full regexp: one or more branches separated by `|'. */
   935:   PARSE_ATOM,  /* Parse a single atom (group, bracket expression, escape, `.', literal, ...). */
   936:   PARSE_MARK_FOR_SUBMATCH,  /* Pushed on top of a submatch id; handler not shown here -- presumably tags the parsed tree with that id (TODO confirm). */
   937:   PARSE_BRANCH,  /* Parse a branch: schedules a piece followed by catenation handling. */
   938:   PARSE_PIECE,  /* Parse a piece: an atom, then (unless REG_LITERAL) its postfix operators. */
   939:   PARSE_CATENATION,  /* If the expression or group has not ended, schedule another piece to catenate. */
   940:   PARSE_POST_CATENATION,  /* Pop the saved left tree and build a catenation node with the current result. */
   941:   PARSE_UNION,  /* On `|' schedule another branch plus PARSE_POST_UNION; on `)' just consume it. */
   942:   PARSE_POST_UNION,  /* Pop the saved left tree and build a union node with the current result. */
   943:   PARSE_POSTFIX,  /* Parse postfix operators: `*', `+', `?' and brace bounds. */
   944:   PARSE_RESTORE_CFLAGS  /* Pushed on top of the saved ctx->cflags; handler not shown here -- presumably restores them (TODO confirm). */
   945: } tre_parse_re_stack_symbol_t;
   946: 
   947: 
   948: reg_errcode_t
   949: tre_parse(tre_parse_ctx_t *ctx)
   950: {
   951:   tre_ast_node_t *result = NULL;
   952:   tre_parse_re_stack_symbol_t symbol;
   953:   reg_errcode_t status = REG_OK;
   954:   tre_stack_t *stack = ctx->stack;
   955:   int bottom = tre_stack_num_objects(stack);
   956:   int depth = 0;
   957:   int temporary_cflags = 0;
   958: 
   959:   DPRINT(("tre_parse: parsing '%.*" STRF "', len = %d\n",
   960:           ctx->len, ctx->re, ctx->len));
   961: 
   962:   if (!ctx->nofirstsub)
   963:     {
   964:       STACK_PUSH(stack, ctx->re);
   965:       STACK_PUSH(stack, ctx->submatch_id);
   966:       STACK_PUSH(stack, PARSE_MARK_FOR_SUBMATCH);
   967:       ctx->submatch_id++;
   968:     }
   969:   STACK_PUSH(stack, PARSE_RE);
   970:   ctx->re_start = ctx->re;
   971:   ctx->re_end = ctx->re + ctx->len;
   972: 
   973: 
   974:   /* The following is basically just a recursive descent parser.  I use
   975:      an explicit stack instead of recursive functions mostly because of
   976:      two reasons: compatibility with systems which have an overflowable
   977:      call stack, and efficiency (both in lines of code and speed).  */
   978:   while (tre_stack_num_objects(stack) > bottom && status == REG_OK)
   979:     {
   980:       if (status != REG_OK)
   981:         break;
   982:       symbol = (tre_parse_re_stack_symbol_t)(FLX_RAWADDRESS)tre_stack_pop(stack);
   983:       switch (symbol)
   984:         {
   985:         case PARSE_RE:
   986:           /* Parse a full regexp.  A regexp is one or more branches,
   987:              separated by the union operator `|'. */
   988: #ifdef REG_LITERAL
   989:           if (!(ctx->cflags & REG_LITERAL)
   990:               && ctx->cflags & REG_EXTENDED)
   991: #endif /* REG_LITERAL */
   992:             STACK_PUSHX(stack, PARSE_UNION);
   993:           STACK_PUSHX(stack, PARSE_BRANCH);
   994:           break;
   995: 
   996:         case PARSE_BRANCH:
   997:           /* Parse a branch.  A branch is one or more pieces, concatenated.
   998:              A piece is an atom possibly followed by a postfix operator. */
   999:           STACK_PUSHX(stack, PARSE_CATENATION);
  1000:           STACK_PUSHX(stack, PARSE_PIECE);
  1001:           break;
  1002: 
  1003:         case PARSE_PIECE:
  1004:           /* Parse a piece.  A piece is an atom possibly followed by one
  1005:              or more postfix operators. */
  1006: #ifdef REG_LITERAL
  1007:           if (!(ctx->cflags & REG_LITERAL))
  1008: #endif /* REG_LITERAL */
  1009:             STACK_PUSHX(stack, PARSE_POSTFIX);
  1010:           STACK_PUSHX(stack, PARSE_ATOM);
  1011:           break;
  1012: 
  1013:         case PARSE_CATENATION:
  1014:           /* If the expression has not ended, parse another piece. */
  1015:           {
  1016:             tre_char_t c;
  1017:             if (ctx->re >= ctx->re_end)
  1018:               break;
  1019:             c = *ctx->re;
  1020: #ifdef REG_LITERAL
  1021:             if (!(ctx->cflags & REG_LITERAL))
  1022:               {
  1023: #endif /* REG_LITERAL */
  1024:                 if (ctx->cflags & REG_EXTENDED && c == CHAR_PIPE)
  1025:                   break;
  1026:                 if ((ctx->cflags & REG_EXTENDED
  1027:                      && c == CHAR_RPAREN && depth > 0)
  1028:                     || (!(ctx->cflags & REG_EXTENDED)
  1029:                         && (c == CHAR_BACKSLASH
  1030:                             && *(ctx->re + 1) == CHAR_RPAREN)))
  1031:                   {
  1032:                     if (!(ctx->cflags & REG_EXTENDED) && depth == 0)
  1033:                       status = REG_EPAREN;
  1034:                     DPRINT(("tre_parse:   group end: '%.*" STRF "'\n",
  1035:                             ctx->re_end - ctx->re, ctx->re));
  1036:                     depth--;
  1037:                     if (!(ctx->cflags & REG_EXTENDED))
  1038:                       ctx->re += 2;
  1039:                     break;
  1040:                   }
  1041: #ifdef REG_LITERAL
  1042:               }
  1043: #endif /* REG_LITERAL */
  1044: 
  1045: #ifdef REG_RIGHT_ASSOC
  1046:             if (ctx->cflags & REG_RIGHT_ASSOC)
  1047:               {
  1048:                 /* Right associative concatenation. */
  1049:                 STACK_PUSHX(stack, result);
  1050:                 STACK_PUSHX(stack, PARSE_POST_CATENATION);
  1051:                 STACK_PUSHX(stack, PARSE_CATENATION);
  1052:                 STACK_PUSHX(stack, PARSE_PIECE);
  1053:               }
  1054:             else
  1055: #endif /* REG_RIGHT_ASSOC */
  1056:               {
  1057:                 /* Default case, left associative concatenation. */
  1058:                 STACK_PUSHX(stack, PARSE_CATENATION);
  1059:                 STACK_PUSHX(stack, result);
  1060:                 STACK_PUSHX(stack, PARSE_POST_CATENATION);
  1061:                 STACK_PUSHX(stack, PARSE_PIECE);
  1062:               }
  1063:             break;
  1064:           }
  1065: 
  1066:         case PARSE_POST_CATENATION:
  1067:           {
  1068:             tre_ast_node_t *tree = (tre_ast_node_t*)tre_stack_pop(stack);
  1069:             tre_ast_node_t *tmp_node;
  1070:             tmp_node = tre_ast_new_catenation(ctx->mem, tree, result);
  1071:             if (!tmp_node)
  1072:               return REG_ESPACE;
  1073:             result = tmp_node;
  1074:             break;
  1075:           }
  1076: 
  1077:         case PARSE_UNION:
  1078:           if (ctx->re >= ctx->re_end)
  1079:             break;
  1080: #ifdef REG_LITERAL
  1081:           if (ctx->cflags & REG_LITERAL)
  1082:             break;
  1083: #endif /* REG_LITERAL */
  1084:           switch (*ctx->re)
  1085:             {
  1086:             case CHAR_PIPE:
  1087:               DPRINT(("tre_parse:       union: '%.*" STRF "'\n",
  1088:                       ctx->re_end - ctx->re, ctx->re));
  1089:               STACK_PUSHX(stack, PARSE_UNION);
  1090:               STACK_PUSHX(stack, result);
  1091:               STACK_PUSHX(stack, PARSE_POST_UNION);
  1092:               STACK_PUSHX(stack, PARSE_BRANCH);
  1093:               ctx->re++;
  1094:               break;
  1095: 
  1096:             case CHAR_RPAREN:
  1097:               ctx->re++;
  1098:               break;
  1099: 
  1100:             default:
  1101:               break;
  1102:             }
  1103:           break;
  1104: 
  1105:         case PARSE_POST_UNION:
  1106:           {
  1107:             tre_ast_node_t *tmp_node;
  1108:             tre_ast_node_t *tree = (tre_ast_node_t*)tre_stack_pop(stack);
  1109:             tmp_node = tre_ast_new_union(ctx->mem, tree, result);
  1110:             if (!tmp_node)
  1111:               return REG_ESPACE;
  1112:             result = tmp_node;
  1113:             break;
  1114:           }
  1115: 
  1116:         case PARSE_POSTFIX:
  1117:           /* Parse postfix operators. */
  1118:           if (ctx->re >= ctx->re_end)
  1119:             break;
  1120: #ifdef REG_LITERAL
  1121:           if (ctx->cflags & REG_LITERAL)
  1122:             break;
  1123: #endif /* REG_LITERAL */
  1124:           switch (*ctx->re)
  1125:             {
  1126:             case CHAR_STAR:
  1127:             case CHAR_PLUS:
  1128:             case CHAR_QUESTIONMARK:
  1129:               {
  1130:                 tre_ast_node_t *tmp_node;
  1131:                 int minimal = 0;
  1132:                 int rep_min = 0;
  1133:                 int rep_max = -1;
  1134:                 if (*ctx->re == CHAR_PLUS)
  1135:                   rep_min = 1;
  1136:                 if (*ctx->re == CHAR_QUESTIONMARK)
  1137:                   rep_max = 1;
  1138: 
  1139:                 if (ctx->re + 1 < ctx->re_end
  1140:                     && *(ctx->re + 1) == CHAR_QUESTIONMARK)
  1141:                   minimal = 1;
  1142:                 DPRINT(("tre_parse: %s star: '%.*" STRF "'\n",
  1143:                         minimal ? "  minimal" : "greedy",
  1144:                         ctx->re_end - ctx->re, ctx->re));
  1145:                 ctx->re += minimal + 1;
  1146:                 tmp_node = tre_ast_new_iter(ctx->mem, result, rep_min, rep_max,
  1147:                                             minimal);
  1148:                 if (tmp_node == NULL)
  1149:                   return REG_ESPACE;
  1150:                 result = tmp_node;
  1151:                 STACK_PUSHX(stack, PARSE_POSTFIX);
  1152:                 break;
  1153:               }
  1154: 
  1155:             case CHAR_BACKSLASH:
  1156:               /* "\{" is special without REG_EXTENDED */
  1157:               if (!(ctx->cflags & REG_EXTENDED)
  1158:                   && ctx->re + 1 < ctx->re_end
  1159:                   && *(ctx->re + 1) == CHAR_LBRACE)
  1160:                 {
  1161:                   ctx->re++;
  1162:                   goto parse_brace;
  1163:                 }
  1164:               else
  1165:                 break;
  1166: 
  1167:             case CHAR_LBRACE:
  1168:               /* "{" is literal without REG_EXTENDED */
  1169:               if (!(ctx->cflags & REG_EXTENDED))
  1170:                 break;
  1171: 
  1172:             parse_brace:
  1173:               DPRINT(("tre_parse:       bound: '%.*" STRF "'\n",
  1174:                       ctx->re_end - ctx->re, ctx->re));
  1175:               ctx->re++;
  1176: 
  1177:               status = tre_parse_bound(ctx, &result);
  1178:               if (status != REG_OK)
  1179:                 return status;
  1180:               STACK_PUSHX(stack, PARSE_POSTFIX);
  1181:               break;
  1182:             }
  1183:           break;
  1184: 
  1185:         case PARSE_ATOM:
  1186:           /* Parse an atom.  An atom is a regular expression enclosed in `()',
  1187:              an empty set of `()', a bracket expression, `.', `^', `$',
  1188:              a `\' followed by a character, or a single character. */
  1189: 
  1190:           /* End of regexp? (empty string). */
  1191:           if (ctx->re >= ctx->re_end)
  1192:             goto parse_literal;
  1193: 
  1194: #ifdef REG_LITERAL
  1195:           if (ctx->cflags & REG_LITERAL)
  1196:             goto parse_literal;
  1197: #endif /* REG_LITERAL */
  1198: 
  1199:           switch (*ctx->re)
  1200:             {
  1201:             case CHAR_LPAREN:  /* parenthesized subexpression */
  1202: 
  1203:               /* Handle "(?...)" extensions.  They work in a way similar
  1204:                  to Perls corresponding extensions. */
  1205:               if (ctx->cflags & REG_EXTENDED
  1206:                   && *(ctx->re + 1) == CHAR_QUESTIONMARK)
  1207:                 {
  1208:                   int new_cflags = ctx->cflags;
  1209:                   int bit = 1;
  1210:                   DPRINT(("tre_parse:   extension: '%.*" STRF "\n",
  1211:                           ctx->re_end - ctx->re, ctx->re));
  1212:                   ctx->re += 2;
  1213:                   while (1)
  1214:                     {
  1215:                       if (*ctx->re == L'i')
  1216:                         {
  1217:                           DPRINT(("tre_parse:       icase: '%.*" STRF "\n",
  1218:                                   ctx->re_end - ctx->re, ctx->re));
  1219:                           if (bit)
  1220:                             new_cflags |= REG_ICASE;
  1221:                           else
  1222:                             new_cflags &= ~REG_ICASE;
  1223:                           ctx->re++;
  1224:                         }
  1225:                       else if (*ctx->re == L'n')
  1226:                         {
  1227:                           DPRINT(("tre_parse:     newline: '%.*" STRF "\n",
  1228:                                   ctx->re_end - ctx->re, ctx->re));
  1229:                           if (bit)
  1230:                             new_cflags |= REG_NEWLINE;
  1231:                           else
  1232:                             new_cflags &= ~REG_NEWLINE;
  1233:                           ctx->re++;
  1234:                         }
  1235: #ifdef REG_RIGHT_ASSOC
  1236:                       else if (*ctx->re == L'r')
  1237:                         {
  1238:                           DPRINT(("tre_parse: right assoc: '%.*" STRF "\n",
  1239:                                   ctx->re_end - ctx->re, ctx->re));
  1240:                           if (bit)
  1241:                             new_cflags |= REG_RIGHT_ASSOC;
  1242:                           else
  1243:                             new_cflags &= ~REG_RIGHT_ASSOC;
  1244:                           ctx->re++;
  1245:                         }
  1246: #endif /* REG_RIGHT_ASSOC */
  1247:                       else if (*ctx->re == CHAR_MINUS)
  1248:                         {
  1249:                           DPRINT(("tre_parse:    turn off: '%.*" STRF "\n",
  1250:                                   ctx->re_end - ctx->re, ctx->re));
  1251:                           ctx->re++;
  1252:                           bit = 0;
  1253:                         }
  1254:                       else if (*ctx->re == CHAR_COLON)
  1255:                         {
  1256:                           DPRINT(("tre_parse:    no group: '%.*" STRF "\n",
  1257:                                   ctx->re_end - ctx->re, ctx->re));
  1258:                           ctx->re++;
  1259:                           depth++;
  1260:                           break;
  1261:                         }
  1262:                       else if (*ctx->re == CHAR_RPAREN)
  1263:                         {
  1264:                           ctx->re++;
  1265:                           break;
  1266:                         }
  1267:                       else
  1268:                         return REG_BADPAT;
  1269:                     }
  1270: 
  1271:                   /* Turn on the cflags changes for the rest of the
  1272:                      enclosing group. */
  1273:                   STACK_PUSHX(stack, ctx->cflags);
  1274:                   STACK_PUSHX(stack, PARSE_RESTORE_CFLAGS);
  1275:                   STACK_PUSHX(stack, PARSE_RE);
  1276:                   ctx->cflags = new_cflags;
  1277:                   break;
  1278:                 }
  1279: 
  1280:               if (ctx->cflags & REG_EXTENDED
  1281:                   || (ctx->re > ctx->re_start
  1282:                       && *(ctx->re - 1) == CHAR_BACKSLASH))
  1283:                 {
  1284:                   depth++;
  1285:                   if (ctx->re + 2 < ctx->re_end
  1286:                       && *(ctx->re + 1) == CHAR_QUESTIONMARK
  1287:                       && *(ctx->re + 2) == CHAR_COLON)
  1288:                     {
  1289:                       DPRINT(("tre_parse: group begin: '%.*" STRF
  1290:                               "', no submatch\n",
  1291:                               ctx->re_end - ctx->re, ctx->re));
  1292:                       /* Don't mark for submatching. */
  1293:                       ctx->re += 3;
  1294:                       STACK_PUSHX(stack, PARSE_RE);
  1295:                     }
  1296:                   else
  1297:                     {
  1298:                       DPRINT(("tre_parse: group begin: '%.*" STRF
  1299:                               "', submatch %d\n",
  1300:                               ctx->re_end - ctx->re, ctx->re,
  1301:                               ctx->submatch_id));
  1302:                       ctx->re++;
  1303:                       /* First parse a whole RE, then mark the resulting tree
  1304:                          for submatching. */
  1305:                       STACK_PUSHX(stack, ctx->submatch_id);
  1306:                       STACK_PUSHX(stack, PARSE_MARK_FOR_SUBMATCH);
  1307:                       STACK_PUSHX(stack, PARSE_RE);
  1308:                       ctx->submatch_id++;
  1309:                     }
  1310:                 }
  1311:               else
  1312:                 goto parse_literal;
  1313:               break;
  1314: 
  1315:             case CHAR_RPAREN:  /* end of current subexpression */
  1316:               if ((ctx->cflags & REG_EXTENDED && depth > 0)
  1317:                   || (ctx->re > ctx->re_start
  1318:                       && *(ctx->re - 1) == CHAR_BACKSLASH))
  1319:                 {
  1320:                   DPRINT(("tre_parse:       empty: '%.*" STRF "'\n",
  1321:                           ctx->re_end - ctx->re, ctx->re));
  1322:                   /* We were expecting an atom, but instead the current
  1323:                      subexpression was closed.  POSIX leaves the meaning of
  1324:                      this to be implementation-defined.  We interpret this as
  1325:                      an empty expression (which matches an empty string).  */
  1326:                   result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
  1327:                   if (result == NULL)
  1328:                     return REG_ESPACE;
  1329:                   if (!(ctx->cflags & REG_EXTENDED))
  1330:                     ctx->re--;
  1331:                 }
  1332:               else
  1333:                 goto parse_literal;
  1334:               break;
  1335: 
  1336:             case CHAR_LBRACKET: /* bracket expression */
  1337:               DPRINT(("tre_parse:     bracket: '%.*" STRF "'\n",
  1338:                       ctx->re_end - ctx->re, ctx->re));
  1339:               ctx->re++;
  1340:               status = tre_parse_bracket(ctx, &result);
  1341:               if (status != REG_OK)
  1342:                 return status;
  1343:               break;
  1344: 
  1345:             case CHAR_BACKSLASH:
  1346:               /* If this is "\(" or "\)" chew off the backslash and
  1347:                  try again. */
  1348:               if (!(ctx->cflags & REG_EXTENDED)
  1349:                   && ctx->re + 1 < ctx->re_end
  1350:                   && (*(ctx->re + 1) == CHAR_LPAREN
  1351:                       || *(ctx->re + 1) == CHAR_RPAREN))
  1352:                 {
  1353:                   ctx->re++;
  1354:                   STACK_PUSHX(stack, PARSE_ATOM);
  1355:                   break;
  1356:                 }
  1357: 
  1358:               /* If a macro is used, parse the expanded macro recursively. */
  1359:               {
  1360:                 tre_char_t buf[64];
  1361:                 tre_expand_macro(ctx->re + 1, ctx->re_end,
  1362:                                  buf, elementsof(buf));
  1363:                 if (buf[0] != 0)
  1364:                   {
  1365:                     tre_parse_ctx_t subctx;
  1366:                     memcpy(&subctx, ctx, sizeof(subctx));
  1367:                     subctx.re = buf;
  1368:                     subctx.len = tre_strlen((const char*)buf);
  1369:                     subctx.nofirstsub = 1;
  1370:                     status = tre_parse(&subctx);
  1371:                     if (status != REG_OK)
  1372:                       return status;
  1373:                     ctx->re += 2;
  1374:                     ctx->position = subctx.position;
  1375:                     result = subctx.result;
  1376:                     break;
  1377:                   }
  1378:               }
  1379: 
  1380:               if (ctx->re + 1 >= ctx->re_end)
  1381:                 /* Trailing backslash. */
  1382:                 return REG_EESCAPE;
  1383: 
  1384: #ifdef REG_LITERAL
  1385:               if (*(ctx->re + 1) == L'Q')
  1386:                 {
  1387:                   DPRINT(("tre_parse: tmp literal: '%.*" STRF "'\n",
  1388:                           ctx->re_end - ctx->re, ctx->re));
  1389:                   ctx->cflags |= REG_LITERAL;
  1390:                   temporary_cflags |= REG_LITERAL;
  1391:                   ctx->re += 2;
  1392:                   STACK_PUSHX(stack, PARSE_ATOM);
  1393:                   break;
  1394:                 }
  1395: #endif /* REG_LITERAL */
  1396: 
  1397:               DPRINT(("tre_parse:  bleep: '%.*" STRF "'\n",
  1398:                       ctx->re_end - ctx->re, ctx->re));
  1399:               ctx->re++;
  1400:               switch (*ctx->re)
  1401:                 {
  1402:                 case L'b':
  1403:                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
  1404:                                                ASSERT_AT_WB, -1);
  1405:                   ctx->re++;
  1406:                   break;
  1407:                 case L'B':
  1408:                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
  1409:                                                ASSERT_AT_WB_NEG, -1);
  1410:                   ctx->re++;
  1411:                   break;
  1412:                 case L'<':
  1413:                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
  1414:                                                ASSERT_AT_BOW, -1);
  1415:                   ctx->re++;
  1416:                   break;
  1417:                 case L'>':
  1418:                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
  1419:                                                ASSERT_AT_EOW, -1);
  1420:                   ctx->re++;
  1421:                   break;
  1422:                 case L'x':
  1423:                   ctx->re++;
  1424:                   if (ctx->re[0] != CHAR_LBRACE && ctx->re < ctx->re_end)
  1425:                     {
  1426:                       /* 8 bit hex char. */
  1427:                       char tmp[3] = {0, 0, 0};
  1428:                       long val;
  1429:                       DPRINT(("tre_parse:  8 bit hex: '%.*" STRF "'\n",
  1430:                               ctx->re_end - ctx->re + 2, ctx->re - 2));
  1431: 
  1432:                       if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
  1433:                         {
  1434:                           tmp[0] = (char)ctx->re[0];
  1435:                           ctx->re++;
  1436:                         }
  1437:                       if (tre_isxdigit(ctx->re[0]) && ctx->re < ctx->re_end)
  1438:                         {
  1439:                           tmp[1] = (char)ctx->re[0];
  1440:                           ctx->re++;
  1441:                         }
  1442:                       val = strtol(tmp, NULL, 16);
  1443:                       result = tre_ast_new_literal(ctx->mem, val, val,
  1444:                                                    ctx->position);
  1445:                       ctx->position++;
  1446:                       break;
  1447:                     }
  1448:                   else if (ctx->re < ctx->re_end)
  1449:                     {
  1450:                       /* Wide char. */
  1451:                       char tmp[32];
  1452:                       long val;
  1453:                       int i = 0;
  1454:                       ctx->re++;
  1455:                       while (ctx->re_end - ctx->re >= 0)
  1456:                         {
  1457:                           if (ctx->re[0] == CHAR_RBRACE)
  1458:                             break;
  1459:                           if (tre_isxdigit(ctx->re[0]))
  1460:                             {
  1461:                               tmp[i] = (char)ctx->re[0];
  1462:                               i++;
  1463:                               ctx->re++;
  1464:                               continue;
  1465:                             }
  1466:                           return REG_EBRACE;
  1467:                         }
  1468:                       ctx->re++;
  1469:                       tmp[i] = 0;
  1470:                       val = strtol(tmp, NULL, 16);
  1471:                       result = tre_ast_new_literal(ctx->mem, val, val,
  1472:                                                    ctx->position);
  1473:                       ctx->position++;
  1474:                       break;
  1475:                     }
  1476: 
  1477:                 default:
  1478:                   if (tre_isdigit(*ctx->re))
  1479:                     {
  1480:                       /* Back reference. */
  1481:                       int val = *ctx->re - L'0';
  1482:                       DPRINT(("tre_parse:     backref: '%.*" STRF "'\n",
  1483:                               ctx->re_end - ctx->re + 1, ctx->re - 1));
  1484:                       result = tre_ast_new_literal(ctx->mem, BACKREF, val,
  1485:                                                    ctx->position);
  1486:                       if (result == NULL)
  1487:                         return REG_ESPACE;
  1488:                       ctx->position++;
  1489:                       ctx->max_backref = MAX(val, ctx->max_backref);
  1490:                       ctx->re++;
  1491:                     }
  1492:                   else
  1493:                     {
  1494:                       /* Escaped character. */
  1495:                       DPRINT(("tre_parse:     escaped: '%.*" STRF "'\n",
  1496:                               ctx->re_end - ctx->re + 1, ctx->re - 1));
  1497:                       result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
  1498:                                                    ctx->position);
  1499:                       ctx->position++;
  1500:                       ctx->re++;
  1501:                     }
  1502:                   break;
  1503:                 }
  1504:               if (result == NULL)
  1505:                 return REG_ESPACE;
  1506:               break;
  1507: 
  1508:             case CHAR_PERIOD:    /* the any-symbol */
  1509:               DPRINT(("tre_parse:         any: '%.*" STRF "'\n",
  1510:                       ctx->re_end - ctx->re, ctx->re));
  1511:               if (ctx->cflags & REG_NEWLINE)
  1512:                 {
  1513:                   tre_ast_node_t *tmp1;
  1514:                   tre_ast_node_t *tmp2;
  1515:                   tmp1 = tre_ast_new_literal(ctx->mem, 0, L'\n' - 1,
  1516:                                              ctx->position);
  1517:                   if (!tmp1)
  1518:                     return REG_ESPACE;
  1519:                   tmp2 = tre_ast_new_literal(ctx->mem, L'\n' + 1, TRE_CHAR_MAX,
  1520:                                              ctx->position + 1);
  1521:                   if (!tmp2)
  1522:                     return REG_ESPACE;
  1523:                   result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
  1524:                   if (!result)
  1525:                     return REG_ESPACE;
  1526:                   ctx->position += 2;
  1527:                 }
  1528:               else
  1529:                 {
  1530:                   result = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX,
  1531:                                                ctx->position);
  1532:                   if (!result)
  1533:                     return REG_ESPACE;
  1534:                   ctx->position++;
  1535:                 }
  1536:               ctx->re++;
  1537:               break;
  1538: 
  1539:             case CHAR_CARET:     /* beginning of line assertion */
  1540:               /* '^' has a special meaning everywhere in EREs, and at the
  1541:                  beginning of the RE and after \( in BREs. */
  1542:               if (ctx->cflags & REG_EXTENDED
  1543:                   || (ctx->re - 2 >= ctx->re_start
  1544:                       && *(ctx->re - 2) == CHAR_BACKSLASH
  1545:                       && *(ctx->re - 1) == CHAR_LPAREN)
  1546:                   || ctx->re == ctx->re_start)
  1547:                 {
  1548:                   DPRINT(("tre_parse:         BOL: '%.*" STRF "'\n",
  1549:                           ctx->re_end - ctx->re, ctx->re));
  1550:                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
  1551:                                                ASSERT_AT_BOL, -1);
  1552:                   if (result == NULL)
  1553:                     return REG_ESPACE;
  1554:                   ctx->re++;
  1555:                 }
  1556:               else
  1557:                 goto parse_literal;
  1558:               break;
  1559: 
  1560:             case CHAR_DOLLAR:    /* end of line assertion. */
  1561:               /* '$' is special everywhere in EREs, and at the end of the
  1562:                  string and before \) in BREs. */
  1563:               if (ctx->cflags & REG_EXTENDED
  1564:                   || (ctx->re + 2 < ctx->re_end
  1565:                       && *(ctx->re + 1) == CHAR_BACKSLASH
  1566:                       && *(ctx->re + 2) == CHAR_RPAREN)
  1567:                   || ctx->re + 1 == ctx->re_end)
  1568:                 {
  1569:                   DPRINT(("tre_parse:         EOL: '%.*" STRF "'\n",
  1570:                           ctx->re_end - ctx->re, ctx->re));
  1571:                   result = tre_ast_new_literal(ctx->mem, ASSERTION,
  1572:                                                ASSERT_AT_EOL, -1);
  1573:                   if (result == NULL)
  1574:                     return REG_ESPACE;
  1575:                   ctx->re++;
  1576:                 }
  1577:               else
  1578:                 goto parse_literal;
  1579:               break;
  1580: 
  1581:             default:
  1582:             parse_literal:
  1583: 
  1584:               if (temporary_cflags && ctx->re + 1 < ctx->re_end
  1585:                   && *ctx->re == CHAR_BACKSLASH && *(ctx->re + 1) == L'E')
  1586:                 {
  1587:                   DPRINT(("tre_parse:    end tmps: '%.*" STRF "'\n",
  1588:                           ctx->re_end - ctx->re, ctx->re));
  1589:                   ctx->cflags &= ~temporary_cflags;
  1590:                   temporary_cflags = 0;
  1591:                   ctx->re += 2;
  1592:                   STACK_PUSHX(stack, PARSE_ATOM);
  1593:                   break;
  1594:                 }
  1595: 
  1596: 
  1597:               /* We are expecting an atom.  If the subexpression (or the whole
  1598:                  regexp) ends here, we interpret it as an empty expression
  1599:                  (which matches an empty string).  */
  1600:               if (
  1601: #ifdef REG_LITERAL
  1602:                   !(ctx->cflags & REG_LITERAL) &&
  1603: #endif /* REG_LITERAL */
  1604:                   (ctx->re >= ctx->re_end
  1605:                    || *ctx->re == CHAR_STAR
  1606:                    || (ctx->cflags & REG_EXTENDED
  1607:                        && (*ctx->re == CHAR_PIPE
  1608:                            || *ctx->re == CHAR_LBRACE
  1609:                            || *ctx->re == CHAR_PLUS
  1610:                            || *ctx->re == CHAR_QUESTIONMARK))
  1611:                    /* Test for "\{" (backslash followed by '{') in BRE mode. */
  1612:                    || (!(ctx->cflags & REG_EXTENDED)
  1613:                        && ctx->re + 1 < ctx->re_end
  1614:                        && *ctx->re == CHAR_BACKSLASH
  1615:                        && *(ctx->re + 1) == CHAR_LBRACE)))
  1616:                 {
  1617:                   DPRINT(("tre_parse:       empty: '%.*" STRF "'\n",
  1618:                           ctx->re_end - ctx->re, ctx->re));
  1619:                   result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
  1620:                   if (!result)
  1621:                     return REG_ESPACE;
  1622:                   break;
  1623:                 }
  1624: 
  1625:               DPRINT(("tre_parse:     literal: '%.*" STRF "'\n",
  1626:                       ctx->re_end - ctx->re, ctx->re));
  1627:               /* Note that we can't use an tre_isalpha() test here, since there
  1628:                  may be characters which are alphabetic but neither upper or
  1629:                  lower case. */
  1630:               if (ctx->cflags & REG_ICASE
  1631:                   && (tre_isupper(*ctx->re) || tre_islower(*ctx->re)))
  1632:                 {
  1633:                   tre_ast_node_t *tmp1;
  1634:                   tre_ast_node_t *tmp2;
  1635: 
  1636:                   /* XXX - Can there be more than one opposite-case
  1637:                      counterpoints for some character in some locale?  Or
  1638:                      more than two characters which all should be regarded
  1639:                      the same character if case is ignored?  If yes, there
  1640:                      does not seem to be a portable way to detect it.  I guess
  1641:                      that at least for multi-character collating elements there
  1642:                      could be several opposite-case counterpoints, but they
  1643:                      cannot be supported portably anyway. */
  1644:                   tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(*ctx->re),
  1645:                                              tre_toupper(*ctx->re),
  1646:                                              ctx->position);
  1647:                   if (!tmp1)
  1648:                     return REG_ESPACE;
  1649:                   tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(*ctx->re),
  1650:                                              tre_tolower(*ctx->re),
  1651:                                              ctx->position);
  1652:                   if (!tmp2)
  1653:                     return REG_ESPACE;
  1654:                   result = tre_ast_new_union(ctx->mem, tmp1, tmp2);
  1655:                   if (!result)
  1656:                     return REG_ESPACE;
  1657:                 }
  1658:               else
  1659:                 {
  1660:                   result = tre_ast_new_literal(ctx->mem, *ctx->re, *ctx->re,
  1661:                                                ctx->position);
  1662:                   if (!result)
  1663:                     return REG_ESPACE;
  1664:                 }
  1665:               ctx->position++;
  1666:               ctx->re++;
  1667:               break;
  1668:             }
  1669:           break;
  1670: 
  1671:         case PARSE_MARK_FOR_SUBMATCH:
  1672:           {
  1673:             int submatch_id = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
  1674: 
  1675:             if (result->submatch_id >= 0)
  1676:               {
  1677:                 tre_ast_node_t *n, *tmp_node;
  1678:                 n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
  1679:                 if (n == NULL)
  1680:                   return REG_ESPACE;
  1681:                 tmp_node = tre_ast_new_catenation(ctx->mem, n, result);
  1682:                 if (tmp_node == NULL)
  1683:                   return REG_ESPACE;
  1684:                 tmp_node->num_submatches = result->num_submatches;
  1685:                 result = tmp_node;
  1686:               }
  1687:             result->submatch_id = submatch_id;
  1688:             result->num_submatches++;
  1689:             break;
  1690:           }
  1691: 
  1692:         case PARSE_RESTORE_CFLAGS:
  1693:           ctx->cflags = (int)(FLX_RAWADDRESS)tre_stack_pop(stack);
  1694:           break;
  1695:         }
  1696:     }
  1697: 
  1698:   /* Check for missing closing parentheses. */
  1699:   if (depth > 0)
  1700:     return REG_EPAREN;
  1701: 
  1702:   if (status == REG_OK)
  1703:     ctx->result = result;
  1704: 
  1705:   return status;
  1706: }
  1707: 
  1708: /* EOF */
End cpp section to tre/tre_parse.cpp[1]
Start cpp section to tre/tre_parse.hpp[1 /1 ]
     1: #line 8778 "./lpsrc/tre.pak"
     2: /*
     3:   tre-parse.c - Regexp parser definitions
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: */
    20: 
    21: #ifndef TRE_PARSE_H
    22: #define TRE_PARSE_H 1
    23: 
    24: /* Parse context: the state handed to tre_parse() through its `ctx' argument. */
    25: typedef struct {
    26:   /* Memory allocator.  The AST is allocated using this. */
    27:   tre_mem_t mem;
    28:   /* Stack used for keeping track of regexp syntax. */
    29:   tre_stack_t *stack;
    30:   /* The parse result; stored by the parser when it finishes with REG_OK. */
    31:   tre_ast_node_t *result;
    32:   /* Current read position in the regexp; advanced as input is consumed. */
    33:   const tre_char_t *re;
    34:   /* The first character of the entire regexp. */
    35:   const tre_char_t *re_start;
    36:   /* The first character after the end of the regexp. */
    37:   const tre_char_t *re_end;
    38:   int len; /* NOTE(review): presumably the length of the regexp -- confirm against the caller. */
    39:   /* Current submatch ID. */
    40:   int submatch_id;
    41:   /* Current position (number of literal). */
    42:   int position;
    43:   /* The highest back reference or -1 if none seen so far. */
    44:   int max_backref;
    45:   /* This flag is set if the regexp uses approximate matching. */
    46:   int have_approx;
    47:   /* Compilation flags. */
    48:   int cflags;
    49:   /* If this flag is set the top-level submatch is not captured. */
    50:   int nofirstsub;
    51:   /* The currently set approximate matching parameters. */
    52:   int params[TRE_PARAM_LAST];
    53: } tre_parse_ctx_t;
    54: /* Entry point of the regexp parser; input and parser state are passed in `ctx'.
    55:    Parses a wide character regexp pattern into a syntax tree.  This parser
    56:    handles both syntaxes (BRE and ERE), including the TRE extensions. */
    57: reg_errcode_t
    58: tre_parse(tre_parse_ctx_t *ctx);
    59: 
    60: #endif /* TRE_PARSE_H */
    61: 
    62: /* EOF */
End cpp section to tre/tre_parse.hpp[1]
Start cpp section to tre/tre_stack.cpp[1 /1 ]
     1: #line 8841 "./lpsrc/tre.pak"
     2: /*
     3:   tre-stack.c - Simple stack implementation
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: */
    20: 
    21: #include "flx_target_tre_config.hpp"
    22: #include <stdlib.h>
    23: #include <assert.h>
    24: 
    25: #include "tre_stack.hpp"
    26: #include "tre_internal.hpp"
    27: #include "tre_xmalloc.hpp"
    28: /* Growable LIFO stack of untyped (void *) values. */
    29: struct tre_stack_rec {
    30:   int size;       /* Number of slots currently allocated in `stack'. */
    31:   int max_size;   /* Upper limit for `size'; the buffer never grows past this. */
    32:   int increment;  /* Number of slots added each time the buffer is grown. */
    33:   int ptr;        /* Number of stored values, i.e. index of the next free slot. */
    34:   void **stack;   /* The slot buffer itself. */
    35: };
    36: 
    37: /* Allocates a stack with room for `size' values; returns NULL if out of memory. */
    38: tre_stack_t *
    39: tre_stack_new(int size, int max_size, int increment)
    40: {
    41:   tre_stack_t *stk;
    42: 
    43:   stk = (tre_stack_t*)xmalloc(sizeof(*stk));
    44:   if (stk == NULL)
    45:     return NULL;
    46:   stk->stack = (void**)xmalloc(sizeof(*stk->stack) * size);
    47:   if (stk->stack == NULL)
    48:     {
    49:       xfree(stk);
    50:       return NULL;
    51:     }
    52:   /* Both allocations succeeded: start out empty and record the limits. */
    53:   stk->ptr = 0;
    54:   stk->size = size;
    55:   stk->max_size = max_size;
    56:   stk->increment = increment;
    57:   return stk;
    58: }
    59: /* Frees the slot buffer, then the stack object.  `s' is dereferenced, so it must not be NULL. */
    60: void
    61: tre_stack_destroy(tre_stack_t *s)
    62: {
    63:   xfree(s->stack);
    64:   xfree(s);
    65: }
    66: /* Returns how many values are currently stored on `s'. */
    67: int
    68: tre_stack_num_objects(tre_stack_t *s)
    69: {
    70:   return s->ptr;
    71: }
    72: /* Pushes `value' on `s', growing the buffer when it is full.  Returns REG_OK, or REG_ESPACE if no room can be made. */
    73: reg_errcode_t
    74: tre_stack_push(tre_stack_t *s, void *value)
    75: {
    76:   if (s->ptr >= s->size)
    77:     {
    78:       /* Every allocated slot is in use: try to enlarge the buffer first. */
    79:       void **new_buffer;
    80:       int new_size;
    81: 
    82:       if (s->size >= s->max_size)
    83:         {
    84:           DPRINT(("tre_stack_push: stack full\n"));
    85:           return REG_ESPACE;
    86:         }
    87:       DPRINT(("tre_stack_push: trying to realloc more space\n"));
    88:       new_size = s->size + s->increment;
    89:       if (new_size > s->max_size)
    90:         new_size = s->max_size;
    91:       /* A non-positive increment cannot make room.  Fail cleanly instead
    92:          of relying on an assert that disappears when NDEBUG is defined. */
    93:       if (new_size <= s->size)
    94:         return REG_ESPACE;
    95: 
    96:       new_buffer = (void**)xrealloc(s->stack, sizeof(*new_buffer) * new_size);
    97:       if (new_buffer == NULL)
    98:         {
    99:           DPRINT(("tre_stack_push: realloc failed.\n"));
   100:           return REG_ESPACE;
   101:         }
   102:       DPRINT(("tre_stack_push: realloc succeeded.\n"));
   103:       s->size = new_size;
   104:       s->stack = new_buffer;
   105:     }
   106:   /* Here ptr < size holds, so the slot at index `ptr' is free. */
   107:   s->stack[s->ptr] = value;
   108:   s->ptr++;
   109:   return REG_OK;
   110: }
   111: /* Pops and returns the top value.  No emptiness check is made: the caller must ensure `s' is not empty. */
   112: void *
   113: tre_stack_pop(tre_stack_t *s)
   114: {
   115:   return s->stack[--s->ptr];
   116: }
   117: 
   118: /* EOF */
End cpp section to tre/tre_stack.cpp[1]
Start cpp section to tre/tre_stack.hpp[1 /1 ]
     1: #line 8960 "./lpsrc/tre.pak"
     2: /*
     3:   tre-stack.h: Stack definitions
     4: 
     5:   Copyright (C) 2001-2004 Ville Laurikari <vl@iki.fi>
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: */
    20: 
    21: 
    22: #ifndef TRE_STACK_H
    23: #define TRE_STACK_H 1
    24: 
    25: #include "tre_regex.hpp"
    26: 
    27: typedef struct tre_stack_rec tre_stack_t;
    28: 
    29: /* Creates a new stack object.  `size' is the initial capacity in elements
    30:    (void * slots), `max_size' is the maximum capacity, and `increment' is how
    31:    many slots are added with realloc() when all space gets used up.  Returns
    32:    the stack object or NULL if out of memory. */
    33: tre_stack_t *
    34: tre_stack_new(int size, int max_size, int increment);
    35: 
    36: /* Frees the stack object. */
    37: void
    38: tre_stack_destroy(tre_stack_t *s);
    39: 
    40: /* Returns the current number of objects in the stack. */
    41: int
    42: tre_stack_num_objects(tre_stack_t *s);
    43: 
    44: /* Pushes `value' on top of stack `s'.  Returns REG_ESPACE if out of memory
    45:    (tries to realloc() more space before failing if maximum size not yet
    46:    reached).  Returns REG_OK if successful. */
    47: reg_errcode_t
    48: tre_stack_push(tre_stack_t *s, void *value);
    49: 
    50: /* Pops the topmost element off of stack `s' and returns it.  The stack must
    51:    not be empty. */
    52: void *
    53: tre_stack_pop(tre_stack_t *s);
    54: /* Just to save some typing.  Each macro pushes `value', cast to (void *),
    55:    onto stack `s' with tre_stack_push().
    56:    STACK_PUSH: stores the result in a variable `status' of the enclosing scope. */
    57: #define STACK_PUSH(s, value)                                                  \
    58:   do                                                                          \
    59:     {                                                                         \
    60:       status = tre_stack_push(s, (void *)(value));                            \
    61:     }                                                                         \
    62:   while (0)
    63: /* STACK_PUSHX: as STACK_PUSH, but executes `break' on failure.  Not wrapped in do/while(0) because the break must act on the caller's enclosing loop or switch. */
    64: #define STACK_PUSHX(s, value)                                                 \
    65:   {                                                                           \
    66:     status = tre_stack_push(s, (void *)(value));                              \
    67:     if (status != REG_OK)                                                     \
    68:       break;                                                                  \
    69:   }
    70: /* STACK_PUSHR: declares its own local `status' and, on failure, returns it from the calling function. */
    71: #define STACK_PUSHR(s, value)                                                 \
    72:   {                                                                           \
    73:     reg_errcode_t status;                                                     \
    74:     status = tre_stack_push(s, (void *)(value));                              \
    75:     if (status != REG_OK)                                                     \
    76:       return status;                                                          \
    77:   }
    78: 
    79: #endif /* TRE_STACK_H */
    80: 
    81: /* EOF */
End cpp section to tre/tre_stack.hpp[1]
Start cpp section to tre/tre_xmalloc.hpp[1 /1 ]
     1: #line 9042 "./lpsrc/tre.pak"
     2: /*
     3:   xmalloc.h - Simple malloc debugger library API
     4: 
     5:   Copyright (C) 2001-2003 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: #ifndef _XMALLOC_H
    23: #define _XMALLOC_H 1
    24: 
    25: #ifdef MALLOC_DEBUGGING
    26: 
    27: /* Version 2.4 and later of GCC define a magical variable `__PRETTY_FUNCTION__'
    28:    which contains the name of the function currently being defined, so
    29:    __XMALLOC_FUNCTION is defined as __PRETTY_FUNCTION__ where it is usable.
    30:    This is broken in G++ before version 2.6.
    31:    C9x has a similar variable called __func__, but prefer the GCC one since
    32:    it demangles C++ function names.  */
    33: # ifdef __GNUC__
    34: #  if __GNUC__ > 2 || (__GNUC__ == 2 \
    35:                        && __GNUC_MINOR__ >= (defined __cplusplus ? 6 : 4))
    36: #   define __XMALLOC_FUNCTION    __PRETTY_FUNCTION__
    37: #  else
    38: #   define __XMALLOC_FUNCTION    ((const char *) 0)
    39: #  endif
    40: # else
    41: #  if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
    42: #   define __XMALLOC_FUNCTION    __func__
    43: #  else
    44: #   define __XMALLOC_FUNCTION    ((const char *) 0)
    45: #  endif
    46: # endif
    47: 
    48: #define xmalloc(size) xmalloc_impl(size, __FILE__, __LINE__, \
    49:                                    __XMALLOC_FUNCTION)
    50: #define xcalloc(nmemb, size) xcalloc_impl(nmemb, size, __FILE__, __LINE__, \
    51:                                           __XMALLOC_FUNCTION)
    52: #define xfree(ptr) xfree_impl(ptr, __FILE__, __LINE__, __XMALLOC_FUNCTION)
    53: #define xrealloc(ptr, new_size) xrealloc_impl(ptr, new_size, __FILE__, \
    54:                                               __LINE__, __XMALLOC_FUNCTION)
    55: 
    56: void *xmalloc_impl(size_t size, const char *file, int line, const char *func);
    57: void *xcalloc_impl(size_t nmemb, size_t size, const char *file, int line,
    58:                    const char *func);
    59: void xfree_impl(void *ptr, const char *file, int line, const char *func);
    60: void *xrealloc_impl(void *ptr, size_t new_size, const char *file, int line,
    61:                     const char *func);
    62: int xmalloc_dump_leaks(void);
    63: void xmalloc_configure(int fail_after);
    64: /* Remove any macro definitions of the standard allocator names... */
    65: #undef malloc
    66: #undef calloc
    67: #undef free
    68: #undef realloc
    69: /* ...and redefine them to reminder identifiers; presumably these are never declared, so direct use fails to compile -- confirm. */
    70: #define malloc  USE_XMALLOC_INSTEAD_OF_MALLOC
    71: #define calloc  USE_XCALLOC_INSTEAD_OF_CALLOC
    72: #define free    USE_XFREE_INSTEAD_OF_FREE
    73: #define realloc USE_XREALLOC_INSTEAD_OF_REALLOC
    74: 
    75: #else /* !MALLOC_DEBUGGING */
    76: 
    77: #include <stdlib.h>
    78: 
    79: #define xmalloc(size) malloc(size)
    80: #define xcalloc(nmemb, size) calloc(nmemb, size)
    81: #define xfree(ptr) free(ptr)
    82: #define xrealloc(ptr, new_size) realloc(ptr, new_size)
    83: 
    84: #endif /* !MALLOC_DEBUGGING */
    85: 
    86: #endif /* _XMALLOC_H */
    87: 
    88: /* EOF */
End cpp section to tre/tre_xmalloc.hpp[1]
Start cpp section to tre/tre_xmalloc.cpp[1 /1 ]
     1: #line 9131 "./lpsrc/tre.pak"
     2: /*
     3:   xmalloc.c - Simple malloc debugger library implementation
     4: 
     5:   Copyright (C) 2001-2003 Ville Laurikari <vl@iki.fi>.
     6: 
     7:   This program is free software; you can redistribute it and/or modify
     8:   it under the terms of the GNU General Public License version 2 (June
     9:   1991) as published by the Free Software Foundation.
    10: 
    11:   This program is distributed in the hope that it will be useful,
    12:   but WITHOUT ANY WARRANTY; without even the implied warranty of
    13:   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    14:   GNU General Public License for more details.
    15: 
    16:   You should have received a copy of the GNU General Public License
    17:   along with this program; if not, write to the Free Software
    18:   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
    19: 
    20: */
    21: 
    22: /*
    23:   TODO:
    24:    - red zones
    25:    - group dumps by source location
    26: */
    27: 
    28: #include "flx_target_tre_config.hpp"
    29: 
    30: #include <stdlib.h>
    31: #include <assert.h>
    32: #include <stdio.h>
    33: 
    34: /*
    35:   Internal stuff.
    36: */
    37: /* One record per tracked allocation; records in the same bucket form a singly linked chain. */
    38: typedef struct hashTableItemRec {
    39:   void *ptr;          /* Address of the allocated block (the lookup key). */
    40:   int bytes;          /* Size recorded for the block. */
    41:   const char *file;   /* Source file of the allocating call. */
    42:   int line;           /* Source line of the allocating call. */
    43:   const char *func;   /* Function name of the allocating call. */
    44:   struct hashTableItemRec *next;  /* Next record in the same bucket, or NULL. */
    45: } hashTableItem;
    46: /* The hash table: an array of TABLE_SIZE bucket chains. */
    47: typedef struct {
    48:   hashTableItem **table;
    49: } hashTable;
    50: /* Bookkeeping counters, updated by hash_table_add() and hash_table_del(). */
    51: static int xmalloc_peak;          /* Highest value xmalloc_current has reached. */
    52: int xmalloc_current;              /* Bytes currently recorded as allocated. */
    53: static int xmalloc_peak_blocks;   /* Highest value xmalloc_current_blocks has reached. */
    54: int xmalloc_current_blocks;       /* Number of blocks currently recorded. */
    55: static int xmalloc_fail_after;    /* Forced-failure countdown, set by xmalloc_configure(). */
    56: /* The table has 2^TABLE_BITS buckets; TABLE_MASK reduces a hash value to a bucket index. */
    57: #define TABLE_BITS 8
    58: #define TABLE_MASK ((1 << TABLE_BITS) - 1)
    59: #define TABLE_SIZE (1 << TABLE_BITS)
    60: /* Allocates a hash table with TABLE_SIZE zeroed buckets; returns NULL if out of memory. */
    61: static hashTable *
    62: hash_table_new(void)
    63: {
    64:   hashTable *fresh;
    65: 
    66:   /* Table header first, bucket array second. */
    67:   fresh = (hashTable*)malloc(sizeof(*fresh));
    68:   if (fresh == NULL)
    69:     return NULL;
    70: 
    71:   /* calloc() zero-fills the bucket array. */
    72:   fresh->table = (hashTableItem**)calloc(TABLE_SIZE, sizeof(*fresh->table));
    73:   if (fresh->table != NULL)
    74:     return fresh;
    75: 
    76:   /* The bucket array could not be allocated: release the table header
    77:      so that nothing leaks, and report failure. */
    78:   free(fresh);
    79:   return NULL;
    80: }
    81: /* Maps a pointer to a bucket index in the range 0..TABLE_MASK by folding its bytes together. */
    82: static int
    83: hash_void_ptr(void *ptr)
    84: {
    85:   FLX_RAWADDRESS addr = (FLX_RAWADDRESS)ptr;
    86:   unsigned int hash = 0;
    87:   unsigned int i;
    88:   /* I took this hash function just off the top of my head, I have
    89:      no idea whether it is bad or very bad. */
    90:   /* Unsigned arithmetic on one masked byte per step: same buckets as before, but no signed overflow or narrowing. */
    91:   for (i = 0; i < sizeof(ptr)*8 / TABLE_BITS; i++)
    92:     {
    93:       hash ^= (unsigned int)((addr >> i*8) & TABLE_MASK);
    94:       hash += i * 17;
    95:       hash &= TABLE_MASK;
    96:     }
    97:   return (int)hash;
    98: }
    99: /* Records the block at `ptr' (size `bytes', allocated at file:line in func) and updates the usage counters. */
   100: static void
   101: hash_table_add(hashTable *tbl, void *ptr, int bytes,
   102:                const char *file, int line, const char *func)
   103: {
   104:   hashTableItem **link;
   105:   hashTableItem *entry;
   106: 
   107:   /* Walk to the NULL link that ends this pointer's bucket chain, so the
   108:      new record is appended at the tail. */
   109:   link = &tbl->table[hash_void_ptr(ptr)];
   110:   while (*link != NULL)
   111:     link = &(*link)->next;
   112: 
   113:   /* Build the record and hook it onto the link found above.  Running out
   114:      of memory for the bookkeeping itself is only caught by the assert. */
   115:   entry = (hashTableItem*)malloc(sizeof(*entry));
   116:   assert(entry != NULL);
   117:   entry->ptr = ptr;
   118:   entry->bytes = bytes;
   119:   entry->file = file;
   120:   entry->line = line;
   121:   entry->func = func;
   122:   entry->next = NULL;
   123:   *link = entry;
   124: 
   125:   /* Update the running totals first, then the high-water marks. */
   126:   xmalloc_current += bytes;
   127:   xmalloc_current_blocks++;
   128: 
   129:   if (xmalloc_peak < xmalloc_current)
   130:     xmalloc_peak = xmalloc_current;
   131:   if (xmalloc_peak_blocks < xmalloc_current_blocks)
   132:     xmalloc_peak_blocks = xmalloc_current_blocks;
   133: }
   134: /* Removes the record for `ptr' and updates the usage counters; prints a message and aborts if `ptr' is not recorded. */
   135: static void
   136: hash_table_del(hashTable *tbl, void *ptr)
   137: {
   138:   int i;
   139:   hashTableItem *item, *prev;
   140: 
   141:   i = hash_void_ptr(ptr);
   142: 
   143:   /* Search the bucket chain for the record, remembering its predecessor
   144:      so that it can be unlinked.  The loop has to stop at the end of the
   145:      chain: a pointer that was never recorded can still hash to a
   146:      non-empty bucket, and stepping past the last item would dereference
   147:      NULL instead of reporting the invalid pointer. */
   148:   prev = NULL;
   149:   item = tbl->table[i];
   150:   while (item != NULL && item->ptr != ptr)
   151:     {
   152:       prev = item;
   153:       item = item->next;
   154:     }
   155: 
   156:   /* Covers both an empty bucket and a chain without a matching record. */
   157:   if (item == NULL)
   158:     {
   159:       printf("xfree: invalid ptr %p\n", ptr);
   160:       abort();
   161:     }
   162: 
   163:   xmalloc_current -= item->bytes;
   164:   xmalloc_current_blocks--;
   165: 
   166:   /* Unlink the record from its predecessor, or from the bucket head when
   167:      it is the first item of the chain, then release it. */
   168:   if (prev != NULL)
   169:     prev->next = item->next;
   170:   else
   171:     tbl->table[i] = item->next;
   172: 
   173:   free(item);
   174: }
   175: /* The table of recorded allocations; created on first use by xmalloc_init(). */
   176: static hashTable *xmalloc_table = NULL;
   177: /* Creates the table and resets all counters on the first call; every call asserts that the table exists. */
   178: static void
   179: xmalloc_init(void)
   180: {
   181:   if (!xmalloc_table)
   182:     {
   183:       /* -1 is neither 0, -2 nor positive, so no forced failure is armed. */
   184:       xmalloc_fail_after = -1;
   185:       xmalloc_current = xmalloc_current_blocks = 0;
   186:       xmalloc_peak = xmalloc_peak_blocks = 0;
   187:       xmalloc_table = hash_table_new();
   188:     }
   189: 
   190:   assert(xmalloc_table != NULL);
   191:   assert(xmalloc_table->table != NULL);
   192: }
   193: 
   194: 
   195: /*
   196:   Public API.
   197: */
   198: /* Sets the forced-failure countdown: xmalloc_impl() and xcalloc_impl() decrement a positive value on each call and return NULL once it is 0. */
   199: void
   200: xmalloc_configure(int fail_after)
   201: {
   202:   xmalloc_init();
   203:   xmalloc_fail_after = fail_after;
   204: }
   205: /* Prints every allocation still recorded, followed by peak usage statistics; returns the number of leaked blocks. */
   206: int
   207: xmalloc_dump_leaks(void)
   208: {
   209:   int bucket;
   210:   int num_leaks = 0;
   211:   int leaked_bytes = 0;
   212:   hashTableItem *rec;
   213: 
   214:   xmalloc_init();
   215: 
   216:   /* Anything still in the table was allocated but never freed. */
   217:   for (bucket = 0; bucket < TABLE_SIZE; bucket++)
   218:     for (rec = xmalloc_table->table[bucket]; rec != NULL; rec = rec->next)
   219:       {
   220:         printf("%s:%d: %s: %d bytes at %p not freed\n",
   221:                rec->file, rec->line, rec->func, rec->bytes, rec->ptr);
   222:         num_leaks++;
   223:         leaked_bytes += rec->bytes;
   224:       }
   225: 
   226:   if (num_leaks != 0)
   227:     printf("%d unfreed memory chuncks, total %d unfreed bytes.\n",
   228:            num_leaks, leaked_bytes);
   229:   else
   230:     printf("No memory leaks.\n");
   231: 
   232:   printf("Peak memory consumption %d bytes (%.1f kB, %.1f MB) in %d blocks ",
   233:          xmalloc_peak, (double)xmalloc_peak / 1024,
   234:          (double)xmalloc_peak / (1024*1024), xmalloc_peak_blocks);
   235:   printf("(average ");
   236:   /* Rounded average block size; skipped when no block was ever recorded. */
   237:   if (xmalloc_peak_blocks == 0)
   238:     printf("N/A");
   239:   else
   240:     printf("%d", ((xmalloc_peak + xmalloc_peak_blocks / 2)
   241:                   / xmalloc_peak_blocks));
   242:   printf(" bytes per block).\n");
   243: 
   244:   return num_leaks;
   245: }
   246: /* Debugging malloc(): allocates `size' bytes and records the block with its call site (file, line, func).  Returns NULL on real or forced failure. */
   247: void *
   248: xmalloc_impl(size_t size, const char *file, int line, const char *func)
   249: {
   250:   void *ptr;
   251: 
   252:   xmalloc_init();
   253:   assert(size > 0);
   254:   /* xmalloc_fail_after states: 0 = fail this call and switch to -2; -2 = a forced failure was already delivered; > 0 = calls left before failing. */
   255:   if (xmalloc_fail_after == 0)
   256:     {
   257:       xmalloc_fail_after = -2;
   258: #if 0
   259:       printf("xmalloc: forced failure %s:%d: %s\n", file, line, func);
   260: #endif
   261:       return NULL;
   262:     }
   263:   else if (xmalloc_fail_after == -2)
   264:     {
   265:       printf("xmalloc: called after failure from %s:%d: %s\n",
   266:              file, line, func);
   267:       assert(0);
   268:     }
   269:   else if (xmalloc_fail_after > 0)
   270:     xmalloc_fail_after--;
   271:   /* Only successful allocations are entered in the table. */
   272:   ptr = malloc(size);
   273:   if (ptr != NULL)
   274:     hash_table_add(xmalloc_table, ptr, size, file, line, func);
   275:   return ptr;
   276: }
   277: 
   278: void *
   279: xcalloc_impl(size_t nmemb, size_t size, const char *file, int line,
   280:              const char *func)
   281: {
   282:   void *ptr;
   283: 
   284:   xmalloc_init();
   285:   assert(size > 0);
   286: 
   287:   if (xmalloc_fail_after == 0)
   288:     {
   289:       xmalloc_fail_after = -2;
   290: #if 0
   291:       printf("xcalloc: forced failure %s:%d: %s\n", file, line, func);
   292: #endif
   293:       return NULL;
   294:     }
   295:   else if (xmalloc_fail_after == -2)
   296:     {
   297:       printf("xcalloc: called after failure from %s:%d: %s\n",
   298:              file, line, func);
   299:       assert(0);
   300:     }
   301:   else if (xmalloc_fail_after > 0)
   302:     xmalloc_fail_after--;
   303: 
   304:   ptr = calloc(nmemb, size);
   305:   if (ptr != NULL)
   306:     hash_table_add(xmalloc_table, ptr, nmemb * size, file, line, func);
   307:   return ptr;
   308: }
   309: 
   310: void
   311: xfree_impl(void *ptr, const char *file, int line, const char *func)
   312: {
   313:   xmalloc_init();
   314: 
   315:   if (ptr != NULL)
   316:     hash_table_del(xmalloc_table, ptr);
   317:   free(ptr);
   318: }
   319: 
   320: void *
   321: xrealloc_impl(void *ptr, size_t new_size, const char *file, int line,
   322:               const char *func)
   323: {
   324:   void *new_ptr;
   325: 
   326:   xmalloc_init();
   327:   assert(ptr != NULL);
   328:   assert(new_size > 0);
   329: 
   330:   if (xmalloc_fail_after == 0)
   331:     {
   332:       xmalloc_fail_after = -2;
   333:       return NULL;
   334:     }
   335:   else if (xmalloc_fail_after == -2)
   336:     {
   337:       printf("xrealloc: called after failure from %s:%d: %s\n",
   338:              file, line, func);
   339:       assert(0);
   340:     }
   341:   else if (xmalloc_fail_after > 0)
   342:     xmalloc_fail_after--;
   343: 
   344:   new_ptr = realloc(ptr, new_size);
   345:   if (new_ptr != NULL)
   346:     {
   347:       hash_table_del(xmalloc_table, ptr);
   348:       hash_table_add(xmalloc_table, new_ptr, new_size, file, line, func);
   349:     }
   350:   return new_ptr;
   351: }
   352: 
   353: 
   354: 
   355: /* EOF */
End cpp section to tre/tre_xmalloc.cpp[1]