pcre.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:150k
- /*
- * ===========================================================================
- * PRODUCTION $Log: pcre.c,v $
- * PRODUCTION Revision 1000.0 2003/10/29 15:55:43 gouriano
- * PRODUCTION PRODUCTION: IMPORTED [ORIGINAL] Dev-tree R1.2
- * PRODUCTION
- * ===========================================================================
- */
- /*************************************************
- * Perl-Compatible Regular Expressions *
- *************************************************/
- /*
- This is a library of functions to support regular expressions whose syntax
- and semantics are as close as possible to those of the Perl 5 language. See
- the file Tech.Notes for some information on the internals.
- Written by: Philip Hazel <ph10@cam.ac.uk>
- Copyright (c) 1997-2001 University of Cambridge
- -----------------------------------------------------------------------------
- Permission is granted to anyone to use this software for any purpose on any
- computer system, and to redistribute it freely, subject to the following
- restrictions:
- 1. This software is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- 2. The origin of this software must not be misrepresented, either by
- explicit claim or by omission.
- 3. Altered versions must be plainly marked as such, and must not be
- misrepresented as being the original software.
- 4. If PCRE is embedded in any software that is released under the GNU
- General Purpose Licence (GPL), then the terms of that licence shall
- supersede any condition above with which it is incompatible.
- -----------------------------------------------------------------------------
- */
/* Use a macro for debugging printing. The argument must be a complete,
parenthesised printf() argument list, e.g. DPRINTF(("count=%d\n", n)); it is
pasted directly after "printf" when PCRE_DEBUG is defined, and the whole call
expands to nothing otherwise. */

#if defined(PCRE_DEBUG)
#  define DPRINTF(p) printf p
#else
#  define DPRINTF(p) /*nothing*/
#endif
- /* Include the internals header, which itself includes Standard C headers plus
- the external pcre header. */
- #include "pcre_internal.h"
- /* Allow compilation as C++ source code, should anybody want to do that. */
- #ifdef __cplusplus
- #define class pcre_class
- #endif
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. This number can be made bigger if
necessary - it is used to dimension one int and one unsigned char vector at
compile time. */

#define BRASTACK_SIZE 200
/* The number of bytes in a literal character string above which we can't add
any more is different when UTF-8 characters may be encountered: the limit is
lower (250 instead of 255) when SUPPORT_UTF8 is defined. */

#ifdef SUPPORT_UTF8
#define MAXLIT 250
#else
#define MAXLIT 255
#endif
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
Both tables have six entries and are indexed in parallel.
NOTE(review): the entries presumably correspond to  *  *?  +  +?  ?  ??  in
that order - confirm against the opcode definitions. */

static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
/* Text forms of OP_ values and things, for debugging (not all used). Only
compiled when DEBUG is defined. The backslashes are doubled so that each name
prints as a literal backslash followed by the letter (a single backslash would
turn "\b" into a backspace and make "\A" an invalid escape). */

#ifdef DEBUG
static const char *OP_names[] = {
  "End", "\\A", "\\B", "\\b", "\\D", "\\d",
  "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
  "Opt", "^", "$", "Any", "chars", "not",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
  "*", "*?", "+", "+?", "?", "??", "{", "{",
  "class", "Ref", "Recurse",
  "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
  "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
  "Brazero", "Braminzero", "Branumber", "Bra"
};
#endif
- /* Table for handling escaped characters in the range '0'-'z'. Positive returns
- are simple data values; negative values are for special things like d and so
- on. Zero means further processing is needed (for things like x), or the escape
- is invalid. */
- static const short int escapes[] = {
- 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
- 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
- '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
- 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
- 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
- 0, 0, -ESC_Z, '[', '\', ']', '^', '_', /* X - _ */
- '`', 7, -ESC_b, 0, -ESC_d, ESC_E, ESC_F, 0, /* ` - g */
- 0, 0, 0, 0, 0, 0, ESC_N, 0, /* h - o */
- 0, 0, ESC_R, -ESC_s, ESC_T, 0, 0, -ESC_w, /* p - w */
- 0, 0, -ESC_z /* x - z */
- };
- /* Tables of names of POSIX character classes and their lengths. The list is
- terminated by a zero length entry. The first three must be alpha, upper, lower,
- as this is assumed for handling case independence. */
- static const char *posix_names[] = {
- "alpha", "lower", "upper",
- "alnum", "ascii", "cntrl", "digit", "graph",
- "print", "punct", "space", "word", "xdigit" };
- static const uschar posix_name_lengths[] = {
- 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
- /* Table of class bit maps for each POSIX class; up to three may be combined
- to form the class. */
- static const int posix_class_maps[] = {
- cbit_lower, cbit_upper, -1, /* alpha */
- cbit_lower, -1, -1, /* lower */
- cbit_upper, -1, -1, /* upper */
- cbit_digit, cbit_lower, cbit_upper, /* alnum */
- cbit_print, cbit_cntrl, -1, /* ascii */
- cbit_cntrl, -1, -1, /* cntrl */
- cbit_digit, -1, -1, /* digit */
- cbit_graph, -1, -1, /* graph */
- cbit_print, -1, -1, /* print */
- cbit_punct, -1, -1, /* punct */
- cbit_space, -1, -1, /* space */
- cbit_word, -1, -1, /* word */
- cbit_xdigit,-1, -1 /* xdigit */
- };
- /* Definition to allow mutual recursion */
- static BOOL
- compile_regex(int, int, int *, uschar **, const uschar **, const char **,
- BOOL, int, int *, int *, compile_data *);
- /* Structure for building a chain of data that actually lives on the
- stack, for holding the values of the subject pointer at the start of each
- subpattern, so as to detect when an empty string has been matched by a
- subpattern - to break infinite loops. */
- typedef struct eptrblock {
- struct eptrblock *prev;
- const uschar *saved_eptr;
- } eptrblock;
/* Flag bits for the match() function. The values are distinct single bits so
that they can be OR-ed together. */

#define match_condassert   0x01    /* Called to check a condition assertion */
#define match_isgroup      0x02    /* Set if start of bracketed group */
/*************************************************
*      Global variables                          *
*************************************************/

/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They default to the standard malloc() and free(). */

void *(*pcre_malloc)(size_t) = malloc;
void  (*pcre_free)(void *) = free;
/*************************************************
*    Macros and tables for character handling    *
*************************************************/

/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Notes for users of these macros:
  . GETCHARINC and GETCHARLEN expand to complete statements (they supply their
    own terminating semicolon or block), and evaluate their arguments more
    than once, so pass only simple lvalues.
  . The UTF-8 versions refer to a variable named "md" (tested via md->utf8)
    that must be in scope at the point of use, and to utf8_table3 and
    utf8_table4.
  . Each multi-line definition relies on a backslash at the end of every line
    but the last; losing one silently truncates the macro. */

#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, advancing the pointer */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
- /*************************************************
- * Default character tables *
- *************************************************/
- /* A default set of character tables is included in the PCRE binary. Its source
- is built by the maketables auxiliary program, which uses the default C ctypes
- functions, and put in the file chartables.c. These tables are used by PCRE
- whenever the caller of pcre_compile() does not provide an alternate set of
- tables. */
- #include "chartables.c"
#ifdef SUPPORT_UTF8
/*************************************************
*           Tables for UTF-8 support             *
*************************************************/

/* These are the breakpoints for different numbers of bytes in a UTF-8
character: utf8_table1[n] is the largest value that fits in n+1 bytes. */

static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. */

static int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};

/* Table of the number of extra bytes, indexed by the first byte masked with
0x3f. The highest number for a valid UTF-8 character is in fact 0x3d. */

static uschar utf8_table4[] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };


/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
*/

static int
ord2utf8(int cvalue, uschar *buffer)
{
  int extra;    /* number of continuation bytes that follow the first byte */
  int j;

  /* Find the smallest encoding whose range covers cvalue. The bound is cast
  to int so that the comparison is not done in unsigned arithmetic. */

  for (extra = 0;
       extra < (int)(sizeof(utf8_table1)/sizeof(utf8_table1[0]));
       extra++)
    {
    if (cvalue <= utf8_table1[extra]) break;
    }

  /* Write the continuation bytes from last to first, taking six bits of the
  value at a time; buffer ends up pointing back at the first byte. */

  buffer += extra;
  for (j = extra; j > 0; j--)
    {
    *buffer-- = 0x80 | (cvalue & 0x3f);
    cvalue >>= 6;
    }

  /* The first byte carries the length indicator bits plus what is left. */

  *buffer = utf8_table2[extra] | cvalue;
  return extra + 1;
}
#endif
/*************************************************
*          Return version string                 *
*************************************************/

/* STRING() turns its argument into a string literal as written; XSTRING()
adds a level of indirection so that a macro argument is expanded first and
its value is what gets stringified. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a static string of the form "<major>.<minor> <date>",
built at compile time from PCRE_MAJOR, PCRE_MINOR and PCRE_DATE. */

const char *
pcre_version(void)
{
  return XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
}
- /*************************************************
- * (Obsolete) Return info about compiled pattern *
- *************************************************/
- /* This is the original "info" function. It picks potentially useful data out
- of the private structure, but its interface was too rigid. It remains for
- backwards compatibility. The public options are passed back in an int - though
- the re->options field has been expanded to a long int, all the public options
- at the low end of it, and so even on 16-bit systems this will still be OK.
- Therefore, I haven't changed the API for pcre_info().
- Arguments:
- external_re points to compiled code
- optptr where to pass back the options
- first_char where to pass back the first character,
- or -1 if multiline and all branches start ^,
- or -2 otherwise
- Returns: number of capturing subpatterns
- or negative values on error
- */
- int
- pcre_info(const pcre *external_re, int *optptr, int *first_char)
- {
- const real_pcre *re = (const real_pcre *)external_re;
- if (re == NULL) return PCRE_ERROR_NULL;
- if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
- if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
- if (first_char != NULL)
- *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
- ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
- return re->top_bracket;
- }
- /*************************************************
- * Return info about compiled pattern *
- *************************************************/
- /* This is a newer "info" function which has an extensible interface so
- that additional items can be added compatibly.
- Arguments:
- external_re points to compiled code
- external_study points to study data, or NULL
- what what information is required
- where where to put the information
- Returns: 0 if data returned, negative on error
- */
- int
- pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
- void *where)
- {
- const real_pcre *re = (const real_pcre *)external_re;
- const real_pcre_extra *study = (const real_pcre_extra *)study_data;
- if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
- if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
- switch (what)
- {
- case PCRE_INFO_OPTIONS:
- *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
- break;
- case PCRE_INFO_SIZE:
- *((size_t *)where) = re->size;
- break;
- case PCRE_INFO_CAPTURECOUNT:
- *((int *)where) = re->top_bracket;
- break;
- case PCRE_INFO_BACKREFMAX:
- *((int *)where) = re->top_backref;
- break;
- case PCRE_INFO_FIRSTCHAR:
- *((int *)where) =
- ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
- ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
- break;
- case PCRE_INFO_FIRSTTABLE:
- *((const uschar **)where) =
- (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
- study->start_bits : NULL;
- break;
- case PCRE_INFO_LASTLITERAL:
- *((int *)where) =
- ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
- break;
- default: return PCRE_ERROR_BADOPTION;
- }
- return 0;
- }
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format, stopping at the end of the
subject if is_subject is TRUE. Printable characters are output as themselves;
anything else is output as a backslash, "x", and two hex digits.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
  int c;

  /* Never run past the end of the subject string. */

  if (is_subject && length > md->end_subject - p)
    length = (int)(md->end_subject - p);

  while (length-- > 0)
    {
    c = *p++;
    if (isprint(c)) printf("%c", c); else printf("\\x%02x", c);
    }
}
#endif
/*************************************************
*            Handle escapes                      *
*************************************************/

/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \n, or a negative value which
encodes one of the more complicated things such as \d. When UTF-8 is enabled,
a positive value greater than 255 may be returned. On entry, ptr is pointing at
the \. On exit, it is on the final character of the escape sequence.

Arguments:
  ptrptr     points to the pattern position pointer
  errorptr   points to the pointer to the error message
  bracount   number of previous extracting brackets
  options    the options bits
  isclass    TRUE if inside a character class
  cd         pointer to char tables block

Returns:     zero or positive => a data character
             negative => a special escape sequence
             on error, errorptr is set
*/
- static int
- check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
- int options, BOOL isclass, compile_data *cd)
- {
- const uschar *ptr = *ptrptr;
- int c, i;
- /* If backslash is at the end of the pattern, it's an error. */
- c = *(++ptr);
- if (c == 0) *errorptr = ERR1;
- /* Digits or letters may have special meaning; all others are literals. */
- else if (c < '0' || c > 'z') {}
- /* Do an initial lookup in a table. A non-zero result is something that can be
- returned immediately. Otherwise further processing may be required. */
- else if ((i = escapes[c - '0']) != 0) c = i;
- /* Escapes that need further processing, or are illegal. */
- else
- {
- const uschar *oldptr;
- switch (c)
- {
- /* The handling of escape sequences consisting of a string of digits
- starting with one that is not zero is not straightforward. By experiment,
- the way Perl works seems to be as follows:
- Outside a character class, the digits are read as a decimal number. If the
- number is less than 10, or if there are that many previous extracting
- left brackets, then it is a back reference. Otherwise, up to three octal
- digits are read to form an escaped byte. Thus 123 is likely to be octal
- 123 (cf