pcre.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:150k
- }
- }
- length++;
- /* A back reference needs an additional 2 bytes, plus either one or 5
- bytes for a repeat. We also need to keep the value of the highest
- back reference. */
- if (c <= -ESC_REF)
- {
- int refnum = -c - ESC_REF;
- if (refnum > top_backref) top_backref = refnum;
- length += 2; /* For single back reference */
- if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else length += 5;
- if (ptr[1] == '?') ptr++;
- }
- }
- continue;
- case '^':
- case '.':
- case '$':
- case '*': /* These repeats won't be after brackets; */
- case '+': /* those are handled separately */
- case '?':
- length++;
- continue;
- /* This covers the cases of repeats after a single char, metachar, class,
- or back reference. */
- case '{':
- if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
- ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else
- {
- length--; /* Uncount the original char or metachar */
- if (min == 1) length++; else if (min > 0) length += 4;
- if (max > 0) length += 4; else length += 2;
- }
- if (ptr[1] == '?') ptr++;
- continue;
- /* An alternation contains an offset to the next branch or ket. If any ims
- options changed in the previous branch(es), and/or if we are in a
- lookbehind assertion, extra space will be needed at the start of the
- branch. This is handled by branch_extra. */
- case '|':
- length += 3 + branch_extra;
- continue;
- /* A character class uses 33 characters. Don't worry about character types
- that aren't allowed in classes - they'll get picked up during the compile.
- A character class that contains only one character uses 2 or 3 bytes,
- depending on whether it is negated or not. Notice this where we can. */
- case '[':
- class_charcount = 0;
- if (*(++ptr) == '^') ptr++;
- do
- {
- if (*ptr == '\')
- {
- int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
- &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
- }
- else class_charcount++;
- ptr++;
- }
- while (*ptr != 0 && *ptr != ']');
- /* Repeats for negated single chars are handled by the general code */
- if (class_charcount == 1) length += 3; else
- {
- length += 33;
- /* A repeat needs either 1 or 5 bytes. */
- if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
- {
- ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if ((min == 0 && (max == 1 || max == -1)) ||
- (min == 1 && max == -1))
- length++;
- else length += 5;
- if (ptr[1] == '?') ptr++;
- }
- }
- continue;
- /* Brackets may be genuine groups or special things */
- case '(':
- branch_newextra = 0;
- bracket_length = 3;
- /* Handle special forms of bracket, which all start (? */
- if (ptr[1] == '?')
- {
- int set, unset;
- int *optset;
- switch (c = ptr[2])
- {
- /* Skip over comments entirely */
- case '#':
- ptr += 3;
- while (*ptr != 0 && *ptr != ')') ptr++;
- if (*ptr == 0)
- {
- *errorptr = ERR18;
- goto PCRE_ERROR_RETURN;
- }
- continue;
- /* Non-referencing groups and lookaheads just move the pointer on, and
- then behave like a non-special bracket, except that they don't increment
- the count of extracting brackets. Ditto for the "once only" bracket,
- which is in Perl from version 5.005. */
- case ':':
- case '=':
- case '!':
- case '>':
- ptr += 2;
- break;
- /* A recursive call to the regex is an extension, to provide the
- facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
- case 'R':
- if (ptr[3] != ')')
- {
- *errorptr = ERR29;
- goto PCRE_ERROR_RETURN;
- }
- ptr += 3;
- length += 1;
- break;
- /* Lookbehinds are in Perl from version 5.005 */
- case '<':
- if (ptr[3] == '=' || ptr[3] == '!')
- {
- ptr += 3;
- branch_newextra = 3;
- length += 3; /* For the first branch */
- break;
- }
- *errorptr = ERR24;
- goto PCRE_ERROR_RETURN;
- /* Conditionals are in Perl from version 5.005. The bracket must either
- be followed by a number (for bracket reference) or by an assertion
- group. */
- case '(':
- if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
- {
- ptr += 4;
- length += 3;
- while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
- if (*ptr != ')')
- {
- *errorptr = ERR26;
- goto PCRE_ERROR_RETURN;
- }
- }
- else /* An assertion must follow */
- {
- ptr++; /* Can treat like ':' as far as spacing is concerned */
- if (ptr[2] != '?' ||
- (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
- {
- ptr += 2; /* To get right offset in message */
- *errorptr = ERR28;
- goto PCRE_ERROR_RETURN;
- }
- }
- break;
- /* Else loop checking valid options until ) is met. Anything else is an
- error. If we are without any brackets, i.e. at top level, the settings
- act as if specified in the options, so massage the options immediately.
- This is for backward compatibility with Perl 5.004. */
- default:
- set = unset = 0;
- optset = &set;
- ptr += 2;
- for (;; ptr++)
- {
- c = *ptr;
- switch (c)
- {
- case 'i':
- *optset |= PCRE_CASELESS;
- continue;
- case 'm':
- *optset |= PCRE_MULTILINE;
- continue;
- case 's':
- *optset |= PCRE_DOTALL;
- continue;
- case 'x':
- *optset |= PCRE_EXTENDED;
- continue;
- case 'X':
- *optset |= PCRE_EXTRA;
- continue;
- case 'U':
- *optset |= PCRE_UNGREEDY;
- continue;
- case '-':
- optset = &unset;
- continue;
- /* A termination by ')' indicates an options-setting-only item;
- this is global at top level; otherwise nothing is done here and
- it is handled during the compiling process on a per-bracket-group
- basis. */
- case ')':
- if (brastackptr == 0)
- {
- options = (options | set) & (~unset);
- set = unset = 0; /* To save length */
- }
- /* Fall through */
- /* A termination by ':' indicates the start of a nested group with
- the given options set. This is again handled at compile time, but
- we must allow for compiled space if any of the ims options are
- set. We also have to allow for resetting space at the end of
- the group, which is why 4 is added to the length and not just 2.
- If there are several changes of options within the same group, this
- will lead to an over-estimate on the length, but this shouldn't
- matter very much. We also have to allow for resetting options at
- the start of any alternations, which we do by setting
- branch_newextra to 2. Finally, we record whether the case-dependent
- flag ever changes within the regex. This is used by the "required
- character" code. */
- case ':':
- if (((set|unset) & PCRE_IMS) != 0)
- {
- length += 4;
- branch_newextra = 2;
- if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
- }
- goto END_OPTIONS;
- /* Unrecognized option character */
- default:
- *errorptr = ERR12;
- goto PCRE_ERROR_RETURN;
- }
- }
- /* If we hit a closing bracket, that's it - this is a freestanding
- option-setting. We need to ensure that branch_extra is updated if
- necessary. The only values branch_newextra can have here are 0 or 2.
- If the value is 2, then branch_extra must either be 2 or 5, depending
- on whether this is a lookbehind group or not. */
- END_OPTIONS:
- if (c == ')')
- {
- if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
- branch_extra += branch_newextra;
- continue;
- }
- /* If options were terminated by ':' control comes here. Fall through
- to handle the group below. */
- }
- }
- /* Extracting brackets must be counted so we can process escapes in a
- Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
- need an additional 3 bytes of store per extracting bracket. */
- else
- {
- bracount++;
- if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
- }
- /* Save length for computing whole length at end if there's a repeat that
- requires duplication of the group. Also save the current value of
- branch_extra, and start the new group with the new value. If non-zero, this
- will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
- if (brastackptr >= sizeof(brastack)/sizeof(int))
- {
- *errorptr = ERR19;
- goto PCRE_ERROR_RETURN;
- }
- bralenstack[brastackptr] = branch_extra;
- branch_extra = branch_newextra;
- brastack[brastackptr++] = length;
- length += bracket_length;
- continue;
- /* Handle ket. Look for subsequent max/min; for certain sets of values we
- have to replicate this bracket up to that many times. If brastackptr is
- 0 this is an unmatched bracket which will generate an error, but take care
- not to try to access brastack[-1] when computing the length and restoring
- the branch_extra value. */
- case ')':
- length += 3;
- {
- int minval = 1;
- int maxval = 1;
- int duplength;
- if (brastackptr > 0)
- {
- duplength = length - brastack[--brastackptr];
- branch_extra = bralenstack[brastackptr];
- }
- else duplength = 0;
- /* Leave ptr at the final char; for read_repeat_counts this happens
- automatically; for the others we need an increment. */
- if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
- {
- ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
- &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- }
- else if (c == '*') { minval = 0; maxval = -1; ptr++; }
- else if (c == '+') { maxval = -1; ptr++; }
- else if (c == '?') { minval = 0; ptr++; }
- /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
- group, and if the maximum is greater than zero, we have to replicate
- maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
- bracket set - hence the 7. */
- if (minval == 0)
- {
- length++;
- if (maxval > 0) length += (maxval - 1) * (duplength + 7);
- }
- /* When the minimum is greater than zero, 1 we have to replicate up to
- minval-1 times, with no additions required in the copies. Then, if
- there is a limited maximum we have to replicate up to maxval-1 times
- allowing for a BRAZERO item before each optional copy and nesting
- brackets for all but one of the optional copies. */
- else
- {
- length += (minval - 1) * duplength;
- if (maxval > minval) /* Need this test as maxval=-1 means no limit */
- length += (maxval - minval) * (duplength + 7) - 6;
- }
- }
- continue;
- /* Non-special character. For a run of such characters the length required
- is the number of characters + 2, except that the maximum run length is 255.
- We won't get a skipped space or a non-data escape or the start of a #
- comment as the first character, so the length can't be zero. */
- NORMAL_CHAR:
- default:
- length += 2;
- runlength = 0;
- do
- {
- if ((options & PCRE_EXTENDED) != 0)
- {
- if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
- if (c == '#')
- {
- /* The space before the ; is to avoid a warning on a silly compiler
- on the Macintosh. */
- while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
- continue;
- }
- }
- /* Backslash may introduce a data char or a metacharacter; stop the
- string before the latter. */
- if (c == '\')
- {
- const uschar *saveptr = ptr;
- c = check_escape(&ptr, errorptr, bracount, options, FALSE,
- &compile_block);
- if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
- if (c < 0) { ptr = saveptr; break; }
- #ifdef SUPPORT_UTF8
- if (c > 127 && (options & PCRE_UTF8) != 0)
- {
- int i;
- for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
- if (c <= utf8_table1[i]) break;
- runlength += i;
- }
- #endif
- }
- /* Ordinary character or single-char escape */
- runlength++;
- }
- /* This "while" is the end of the "do" above. */
- while (runlength < MAXLIT &&
- (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
- ptr--;
- length += runlength;
- continue;
- }
- }
- length += 4; /* For final KET and END */
- if (length > 65539)
- {
- *errorptr = ERR20;
- return NULL;
- }
- /* Compute the size of data block needed and get it, either from malloc or
- externally provided function. We specify "code[0]" in the offsetof() expression
- rather than just "code", because it has been reported that one broken compiler
- fails on "code" because it is also an independent variable. It should make no
- difference to the value of the offsetof(). */
- size = length + offsetof(real_pcre, code[0]);
- re = (real_pcre *)(pcre_malloc)(size);
- if (re == NULL)
- {
- *errorptr = ERR21;
- return NULL;
- }
- /* Put in the magic number, and save the size, options, and table pointer */
- re->magic_number = MAGIC_NUMBER;
- re->size = size;
- re->options = options;
- re->tables = tables;
- /* Set up a starting, non-extracting bracket, then compile the expression. On
- error, *errorptr will be set non-NULL, so we don't need to look at the result
- of the function here. */
- ptr = (const uschar *)pattern;
- code = re->code;
- *code = OP_BRA;
- bracount = 0;
- (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
- &reqchar, &countlits, &compile_block);
- re->top_bracket = bracount;
- re->top_backref = top_backref;
- /* If not reached end of pattern on success, there's an excess bracket. */
- if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
- /* Fill in the terminating state and check for disastrous overflow, but
- if debugging, leave the test till after things are printed out. */
- *code++ = OP_END;
- #ifndef DEBUG
- if (code - re->code > length) *errorptr = ERR23;
- #endif
- /* Give an error if there's back reference to a non-existent capturing
- subpattern. */
- if (top_backref > re->top_bracket) *errorptr = ERR15;
- /* Failed to compile */
- if (*errorptr != NULL)
- {
- (pcre_free)(re);
- PCRE_ERROR_RETURN:
- *erroroffset = ptr - (const uschar *)pattern;
- return NULL;
- }
- /* If the anchored option was not passed, set flag if we can determine that the
- pattern is anchored by virtue of ^ characters or A or anything else (such as
- starting with .* when DOTALL is set).
- Otherwise, see if we can determine what the first character has to be, because
- that speeds up unanchored matches no end. If not, see if we can set the
- PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
- start with ^. and also when all branches start with .* for non-DOTALL matches.
- */
- if ((options & PCRE_ANCHORED) == 0)
- {
- int temp_options = options;
- if (is_anchored(re->code, &temp_options))
- re->options |= PCRE_ANCHORED;
- else
- {
- int ch = find_firstchar(re->code, &temp_options);
- if (ch >= 0)
- {
- re->first_char = ch;
- re->options |= PCRE_FIRSTSET;
- }
- else if (is_startline(re->code))
- re->options |= PCRE_STARTLINE;
- }
- }
- /* Save the last required character if there are at least two literal
- characters on all paths, or if there is no first character setting. */
- if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
- {
- re->req_char = reqchar;
- re->options |= PCRE_REQCHSET;
- }
- /* Print out the compiled data for debugging */
- #ifdef DEBUG
- printf("Length = %d top_bracket = %d top_backref = %dn",
- length, re->top_bracket, re->top_backref);
- if (re->options != 0)
- {
- printf("%s%s%s%s%s%s%s%s%sn",
- ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
- ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
- ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
- ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
- ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
- ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
- ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
- ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
- ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
- }
- if ((re->options & PCRE_FIRSTSET) != 0)
- {
- if (isprint(re->first_char)) printf("First char = %cn", re->first_char);
- else printf("First char = \x%02xn", re->first_char);
- }
- if ((re->options & PCRE_REQCHSET) != 0)
- {
- if (isprint(re->req_char)) printf("Req char = %cn", re->req_char);
- else printf("Req char = \x%02xn", re->req_char);
- }
- code_end = code;
- code_base = code = re->code;
- while (code < code_end)
- {
- int charlength;
- printf("%3d ", code - code_base);
- if (*code >= OP_BRA)
- {
- if (*code - OP_BRA > EXTRACT_BASIC_MAX)
- printf("%3d Bra extra", (code[1] << 8) + code[2]);
- else
- printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
- code += 2;
- }
- else switch(*code)
- {
- case OP_OPT:
- printf(" %.2x %s", code[1], OP_names[*code]);
- code++;
- break;
- case OP_CHARS:
- charlength = *(++code);
- printf("%3d ", charlength);
- while (charlength-- > 0)
- if (isprint(c = *(++code))) printf("%c", c); else printf("\x%02x", c);
- break;
- case OP_KETRMAX:
- case OP_KETRMIN:
- case OP_ALT:
- case OP_KET:
- case OP_ASSERT:
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK:
- case OP_ASSERTBACK_NOT:
- case OP_ONCE:
- case OP_REVERSE:
- case OP_BRANUMBER:
- case OP_COND:
- case OP_CREF:
- printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
- code += 2;
- break;
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- if (*code >= OP_TYPESTAR)
- printf(" %s", OP_names[code[1]]);
- else if (isprint(c = code[1])) printf(" %c", c);
- else printf(" \x%02x", c);
- printf("%s", OP_names[*code++]);
- break;
- case OP_EXACT:
- case OP_UPTO:
- case OP_MINUPTO:
- if (isprint(c = code[3])) printf(" %c{", c);
- else printf(" \x%02x{", c);
- if (*code != OP_EXACT) printf("0,");
- printf("%d}", (code[1] << 8) + code[2]);
- if (*code == OP_MINUPTO) printf("?");
- code += 3;
- break;
- case OP_TYPEEXACT:
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- printf(" %s{", OP_names[code[3]]);
- if (*code != OP_TYPEEXACT) printf(",");
- printf("%d}", (code[1] << 8) + code[2]);
- if (*code == OP_TYPEMINUPTO) printf("?");
- code += 3;
- break;
- case OP_NOT:
- if (isprint(c = *(++code))) printf(" [^%c]", c);
- else printf(" [^\x%02x]", c);
- break;
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- if (isprint(c = code[1])) printf(" [^%c]", c);
- else printf(" [^\x%02x]", c);
- printf("%s", OP_names[*code++]);
- break;
- case OP_NOTEXACT:
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- if (isprint(c = code[3])) printf(" [^%c]{", c);
- else printf(" [^\x%02x]{", c);
- if (*code != OP_NOTEXACT) printf(",");
- printf("%d}", (code[1] << 8) + code[2]);
- if (*code == OP_NOTMINUPTO) printf("?");
- code += 3;
- break;
- case OP_REF:
- printf(" \%d", (code[1] << 8) | code[2]);
- code += 3;
- goto CLASS_REF_REPEAT;
- case OP_CLASS:
- {
- int i, min, max;
- code++;
- printf(" [");
- for (i = 0; i < 256; i++)
- {
- if ((code[i/8] & (1 << (i&7))) != 0)
- {
- int j;
- for (j = i+1; j < 256; j++)
- if ((code[j/8] & (1 << (j&7))) == 0) break;
- if (i == '-' || i == ']') printf("\");
- if (isprint(i)) printf("%c", i); else printf("\x%02x", i);
- if (--j > i)
- {
- printf("-");
- if (j == '-' || j == ']') printf("\");
- if (isprint(j)) printf("%c", j); else printf("\x%02x", j);
- }
- i = j;
- }
- }
- printf("]");
- code += 32;
- CLASS_REF_REPEAT:
- switch(*code)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- printf("%s", OP_names[*code]);
- break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- min = (code[1] << 8) + code[2];
- max = (code[3] << 8) + code[4];
- if (max == 0) printf("{%d,}", min);
- else printf("{%d,%d}", min, max);
- if (*code == OP_CRMINRANGE) printf("?");
- code += 4;
- break;
- default:
- code--;
- }
- }
- break;
- /* Anything else is just a one-node item */
- default:
- printf(" %s", OP_names[*code]);
- break;
- }
- code++;
- printf("n");
- }
- printf("------------------------------------------------------------------n");
- /* This check is done here in the debugging case so that the code that
- was compiled can be seen. */
- if (code - re->code > length)
- {
- *errorptr = ERR23;
- (pcre_free)(re);
- *erroroffset = ptr - (uschar *)pattern;
- return NULL;
- }
- #endif
- return (pcre *)re;
- }
- /*************************************************
- * Match a back-reference *
- *************************************************/
- /* If a back reference hasn't been set, the length that is passed is greater
- than the number of characters left in the string, so the match fails.
- Arguments:
- offset index into the offset vector
- eptr points into the subject
- length length to be matched
- md points to match data block
- ims the ims flags
- Returns: TRUE if matched
- */
- static BOOL
- match_ref(int offset, register const uschar *eptr, int length, match_data *md,
- unsigned long int ims)
- {
- const uschar *p = md->start_subject + md->offset_vector[offset];
- #ifdef DEBUG
- if (eptr >= md->end_subject)
- printf("matching subject <null>");
- else
- {
- printf("matching subject ");
- pchars(eptr, length, TRUE, md);
- }
- printf(" against backref ");
- pchars(p, length, FALSE, md);
- printf("n");
- #endif
- /* Always fail if not enough characters left */
- if (length > md->end_subject - eptr) return FALSE;
- /* Separate the caselesss case for speed */
- if ((ims & PCRE_CASELESS) != 0)
- {
- while (length-- > 0)
- if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
- }
- else
- { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
- return TRUE;
- }
- /*************************************************
- * Match from current position *
- *************************************************/
- /* On entry ecode points to the first opcode, and eptr to the first character
- in the subject string, while eptrb holds the value of eptr at the start of the
- last bracketed group - used for breaking infinite loops matching zero-length
- strings.
- Arguments:
- eptr pointer in subject
- ecode position in code
- offset_top current top pointer
- md pointer to "static" info for the match
- ims current /i, /m, and /s options
- eptrb pointer to chain of blocks containing eptr at start of
- brackets - for testing for empty matches
- flags can contain
- match_condassert - this is an assertion condition
- match_isgroup - this is the start of a bracketed group
- Returns: TRUE if matched
- */
- static BOOL
- match(register const uschar *eptr, register const uschar *ecode,
- int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
- int flags)
- {
- unsigned long int original_ims = ims; /* Save for resetting on ')' */
- eptrblock newptrb;
- /* At the start of a bracketed group, add the current subject pointer to the
- stack of such pointers, to be re-instated at the end of the group when we hit
- the closing ket. When match() is called in other circumstances, we don't add to
- the stack. */
- if ((flags & match_isgroup) != 0)
- {
- newptrb.prev = eptrb;
- newptrb.saved_eptr = eptr;
- eptrb = &newptrb;
- }
- /* Now start processing the operations. */
- for (;;)
- {
- int op = (int)*ecode;
- int min, max, ctype;
- register int i;
- register int c;
- BOOL minimize = FALSE;
- /* Opening capturing bracket. If there is space in the offset vector, save
- the current subject position in the working slot at the top of the vector. We
- mustn't change the current values of the data slot, because they may be set
- from a previous iteration of this group, and be referred to by a reference
- inside the group.
- If the bracket fails to match, we need to restore this value and also the
- values of the final offsets, in case they were set by a previous iteration of
- the same bracket.
- If there isn't enough space in the offset vector, treat this as if it were a
- non-capturing bracket. Don't worry about setting the flag for the error case
- here; that is handled in the code for KET. */
- if (op > OP_BRA)
- {
- int offset;
- int number = op - OP_BRA;
- /* For extended extraction brackets (large number), we have to fish out the
- number from a dummy opcode at the start. */
- if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
- offset = number << 1;
- #ifdef DEBUG
- printf("start bracket %d subject=", number);
- pchars(eptr, 16, TRUE, md);
- printf("n");
- #endif
- if (offset < md->offset_max)
- {
- int save_offset1 = md->offset_vector[offset];
- int save_offset2 = md->offset_vector[offset+1];
- int save_offset3 = md->offset_vector[md->offset_end - number];
- DPRINTF(("saving %d %d %dn", save_offset1, save_offset2, save_offset3));
- md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
- do
- {
- if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- ecode += (ecode[1] << 8) + ecode[2];
- }
- while (*ecode == OP_ALT);
- DPRINTF(("bracket %d failedn", number));
- md->offset_vector[offset] = save_offset1;
- md->offset_vector[offset+1] = save_offset2;
- md->offset_vector[md->offset_end - number] = save_offset3;
- return FALSE;
- }
- /* Insufficient room for saving captured contents */
- else op = OP_BRA;
- }
- /* Other types of node can be handled by a switch */
- switch(op)
- {
- case OP_BRA: /* Non-capturing bracket: optimized */
- DPRINTF(("start bracket 0n"));
- do
- {
- if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- ecode += (ecode[1] << 8) + ecode[2];
- }
- while (*ecode == OP_ALT);
- DPRINTF(("bracket 0 failedn"));
- return FALSE;
- /* Conditional group: compilation checked that there are no more than
- two branches. If the condition is false, skipping the first branch takes us
- past the end if there is only one branch, but that's OK because that is
- exactly what going to the ket would do. */
- case OP_COND:
- if (ecode[3] == OP_CREF) /* Condition is extraction test */
- {
- int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
- return match(eptr,
- ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
- 6 : 3 + (ecode[1] << 8) + ecode[2]),
- offset_top, md, ims, eptrb, match_isgroup);
- }
- /* The condition is an assertion. Call match() to evaluate it - setting
- the final argument TRUE causes it to stop at the end of an assertion. */
- else
- {
- if (match(eptr, ecode+3, offset_top, md, ims, NULL,
- match_condassert | match_isgroup))
- {
- ecode += 3 + (ecode[4] << 8) + ecode[5];
- while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
- }
- else ecode += (ecode[1] << 8) + ecode[2];
- return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
- }
- /* Control never reaches here */
- /* Skip over conditional reference or large extraction number data if
- encountered. */
- case OP_CREF:
- case OP_BRANUMBER:
- ecode += 3;
- break;
- /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
- an empty string - recursion will then try other alternatives, if any. */
- case OP_END:
- if (md->notempty && eptr == md->start_match) return FALSE;
- md->end_match_ptr = eptr; /* Record where we ended */
- md->end_offset_top = offset_top; /* and how many extracts were taken */
- return TRUE;
- /* Change option settings */
- case OP_OPT:
- ims = ecode[1];
- ecode += 2;
- DPRINTF(("ims set to %02lxn", ims));
- break;
- /* Assertion brackets. Check the alternative branches in turn - the
- matching won't pass the KET for an assertion. If any one branch matches,
- the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
- start of each branch to move the current point backwards, so the code at
- this level is identical to the lookahead case. */
- case OP_ASSERT:
- case OP_ASSERTBACK:
- do
- {
- if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
- ecode += (ecode[1] << 8) + ecode[2];
- }
- while (*ecode == OP_ALT);
- if (*ecode == OP_KET) return FALSE;
- /* If checking an assertion for a condition, return TRUE. */
- if ((flags & match_condassert) != 0) return TRUE;
- /* Continue from after the assertion, updating the offsets high water
- mark, since extracts may have been taken during the assertion. */
- do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
- ecode += 3;
- offset_top = md->end_offset_top;
- continue;
- /* Negative assertion: all branches must fail to match */
- case OP_ASSERT_NOT:
- case OP_ASSERTBACK_NOT:
- do
- {
- if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
- return FALSE;
- ecode += (ecode[1] << 8) + ecode[2];
- }
- while (*ecode == OP_ALT);
- if ((flags & match_condassert) != 0) return TRUE;
- ecode += 3;
- continue;
- /* Move the subject pointer back. This occurs only at the start of
- each branch of a lookbehind assertion. If we are too close to the start to
- move back, this match function fails. When working with UTF-8 we move
- back a number of characters, not bytes. */
- case OP_REVERSE:
- #ifdef SUPPORT_UTF8
- c = (ecode[1] << 8) + ecode[2];
- for (i = 0; i < c; i++)
- {
- eptr--;
- BACKCHAR(eptr)
- }
- #else
- eptr -= (ecode[1] << 8) + ecode[2];
- #endif
- if (eptr < md->start_subject) return FALSE;
- ecode += 3;
- break;
- /* Recursion matches the current regex, nested. If there are any capturing
- brackets started but not finished, we have to save their starting points
- and reinstate them after the recursion. However, we don't know how many
- such there are (offset_top records the completed total) so we just have
- to save all the potential data. There may be up to 99 such values, which
- is a bit large to put on the stack, but using malloc for small numbers
- seems expensive. As a compromise, the stack is used when there are fewer
- than 16 values to store; otherwise malloc is used. A problem is what to do
- if the malloc fails ... there is no way of returning to the top level with
- an error. Save the top 15 values on the stack, and accept that the rest
- may be wrong. */
- case OP_RECURSE:
- {
- BOOL rc;
- int *save;
- int stacksave[15];
- c = md->offset_max;
- if (c < 16) save = stacksave; else
- {
- save = (int *)(pcre_malloc)((c+1) * sizeof(int));
- if (save == NULL)
- {
- save = stacksave;
- c = 15;
- }
- }
- for (i = 1; i <= c; i++)
- save[i] = md->offset_vector[md->offset_end - i];
- rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
- match_isgroup);
- for (i = 1; i <= c; i++)
- md->offset_vector[md->offset_end - i] = save[i];
- if (save != stacksave) (pcre_free)(save);
- if (!rc) return FALSE;
- /* In case the recursion has set more capturing values, save the final
- number, then move along the subject till after the recursive match,
- and advance one byte in the pattern code. */
- offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
- ecode++;
- }
- break;
- /* "Once" brackets are like assertion brackets except that after a match,
- the point in the subject string is not moved back. Thus there can never be
- a move back into the brackets. Check the alternative branches in turn - the
- matching won't pass the KET for this kind of subpattern. If any one branch
- matches, we carry on as at the end of a normal bracket, leaving the subject
- pointer. */
- case OP_ONCE:
- {
- const uschar *prev = ecode;
- const uschar *saved_eptr = eptr;
- do
- {
- if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
- break;
- ecode += (ecode[1] << 8) + ecode[2];
- }
- while (*ecode == OP_ALT);
- /* If hit the end of the group (which could be repeated), fail */
- if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
- /* Continue as from after the assertion, updating the offsets high water
- mark, since extracts may have been taken. */
- do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
- offset_top = md->end_offset_top;
- eptr = md->end_match_ptr;
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 3;
- break;
- }
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. We need to reset any options
- that changed within the bracket before re-running it, so check the next
- opcode. */
- if (ecode[3] == OP_OPT)
- {
- ims = (ims & ~PCRE_IMS) | ecode[4];
- DPRINTF(("ims set to %02lx at group repeatn", ims));
- }
- if (*ecode == OP_KETRMIN)
- {
- if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
- match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- }
- else /* OP_KETRMAX */
- {
- if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
- match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
- }
- }
- return FALSE;
- /* An alternation is the end of a branch; scan along to find the end of the
- bracketed group and go to there. */
- case OP_ALT:
- do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
- break;
- /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
- that it may occur zero times. It may repeat infinitely, or not at all -
- i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
- repeat limits are compiled as a number of copies, with the optional ones
- preceded by BRAZERO or BRAMINZERO. */
- case OP_BRAZERO:
- {
- const uschar *next = ecode+1;
- if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
- ecode = next + 3;
- }
- break;
- case OP_BRAMINZERO:
- {
- const uschar *next = ecode+1;
- do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
- if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- ecode++;
- }
- break;
- /* End of a group, repeated or non-repeating. If we are at the end of
- an assertion "group", stop matching and return TRUE, but record the
- current high water mark for use by positive assertions. Do this also
- for the "once" (not-backup up) groups. */
- case OP_KET:
- case OP_KETRMIN:
- case OP_KETRMAX:
- {
- const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
- const uschar *saved_eptr = eptrb->saved_eptr;
- eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
- if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
- *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
- *prev == OP_ONCE)
- {
- md->end_match_ptr = eptr; /* For ONCE */
- md->end_offset_top = offset_top;
- return TRUE;
- }
- /* In all other cases except a conditional group we have to check the
- group number back at the start and if necessary complete handling an
- extraction by setting the offsets and bumping the high water mark. */
- if (*prev != OP_COND)
- {
- int offset;
- int number = *prev - OP_BRA;
- /* For extended extraction brackets (large number), we have to fish out
- the number from a dummy opcode at the start. */
- if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
- offset = number << 1;
- #ifdef DEBUG
- printf("end bracket %d", number);
- printf("n");
- #endif
- if (number > 0)
- {
- if (offset >= md->offset_max) md->offset_overflow = TRUE; else
- {
- md->offset_vector[offset] =
- md->offset_vector[md->offset_end - number];
- md->offset_vector[offset+1] = eptr - md->start_subject;
- if (offset_top <= offset) offset_top = offset + 2;
- }
- }
- }
- /* Reset the value of the ims flags, in case they got changed during
- the group. */
- ims = original_ims;
- DPRINTF(("ims reset to %02lxn", ims));
- /* For a non-repeating ket, just continue at this level. This also
- happens for a repeating ket if no characters were matched in the group.
- This is the forcible breaking of infinite loops as implemented in Perl
- 5.005. If there is an options reset, it will get obeyed in the normal
- course of events. */
- if (*ecode == OP_KET || eptr == saved_eptr)
- {
- ecode += 3;
- break;
- }
- /* The repeating kets try the rest of the pattern or restart from the
- preceding bracket, in the appropriate order. */
- if (*ecode == OP_KETRMIN)
- {
- if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
- match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
- return TRUE;
- }
- else /* OP_KETRMAX */
- {
- if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
- match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
- }
- }
- return FALSE;
- /* Start of subject unless notbol, or after internal newline if multiline */
- case OP_CIRC:
- if (md->notbol && eptr == md->start_subject) return FALSE;
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
- ecode++;
- break;
- }
- /* ... else fall through */
- /* Start of subject assertion */
- case OP_SOD:
- if (eptr != md->start_subject) return FALSE;
- ecode++;
- break;
- /* Assert before internal newline if multiline, or before a terminating
- newline unless endonly is set, else end of subject unless noteol is set. */
- case OP_DOLL:
- if ((ims & PCRE_MULTILINE) != 0)
- {
- if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
- else { if (md->noteol) return FALSE; }
- ecode++;
- break;
- }
- else
- {
- if (md->noteol) return FALSE;
- if (!md->endonly)
- {
- if (eptr < md->end_subject - 1 ||
- (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
- ecode++;
- break;
- }
- }
- /* ... else fall through */
- /* End of subject assertion (z) */
- case OP_EOD:
- if (eptr < md->end_subject) return FALSE;
- ecode++;
- break;
- /* End of subject or ending n assertion (Z) */
- case OP_EODN:
- if (eptr < md->end_subject - 1 ||
- (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
- ecode++;
- break;
- /* Word boundary assertions */
- case OP_NOT_WORD_BOUNDARY:
- case OP_WORD_BOUNDARY:
- {
- BOOL prev_is_word = (eptr != md->start_subject) &&
- ((md->ctypes[eptr[-1]] & ctype_word) != 0);
- BOOL cur_is_word = (eptr < md->end_subject) &&
- ((md->ctypes[*eptr] & ctype_word) != 0);
- if ((*ecode++ == OP_WORD_BOUNDARY)?
- cur_is_word == prev_is_word : cur_is_word != prev_is_word)
- return FALSE;
- }
- break;
- /* Match a single character type; inline for speed */
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
- return FALSE;
- if (eptr++ >= md->end_subject) return FALSE;
- #ifdef SUPPORT_UTF8
- if (md->utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- #endif
- ecode++;
- break;
- case OP_NOT_DIGIT:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_digit) != 0)
- return FALSE;
- ecode++;
- break;
- case OP_DIGIT:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_digit) == 0)
- return FALSE;
- ecode++;
- break;
- case OP_NOT_WHITESPACE:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_space) != 0)
- return FALSE;
- ecode++;
- break;
- case OP_WHITESPACE:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_space) == 0)
- return FALSE;
- ecode++;
- break;
- case OP_NOT_WORDCHAR:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_word) != 0)
- return FALSE;
- ecode++;
- break;
- case OP_WORDCHAR:
- if (eptr >= md->end_subject ||
- (md->ctypes[*eptr++] & ctype_word) == 0)
- return FALSE;
- ecode++;
- break;
- /* Match a back reference, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. The code is similar
- to that for character classes, but repeated for efficiency. Then obey
- similar code to character type repeats - written out again for speed.
- However, if the referenced string is the empty string, always treat
- it as matched, any number of times (otherwise there could be infinite
- loops). */
- case OP_REF:
- {
- int length;
- int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
- ecode += 3; /* Advance past item */
- /* If the reference is unset, set the length to be longer than the amount
- of subject left; this ensures that every attempt at a match fails. We
- can't just fail here, because of the possibility of quantifiers with zero
- minima. */
- length = (offset >= offset_top || md->offset_vector[offset] < 0)?
- md->end_subject - eptr + 1 :
- md->offset_vector[offset+1] - md->offset_vector[offset];
- /* Set up for repetition, or handle the non-repeated case */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = (ecode[1] << 8) + ecode[2];
- max = (ecode[3] << 8) + ecode[4];
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
- default: /* No repeat follows */
- if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
- eptr += length;
- continue; /* With the main loop */
- }
- /* If the length of the reference is zero, just continue with the
- main loop. */
- if (length == 0) continue;
- /* First, ensure the minimum number of matches are present. We get back
- the length of the reference string explicitly rather than passing the
- address of eptr, so that eptr can be a register variable. */
- for (i = 1; i <= min; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
- eptr += length;
- }
- /* If min = max, continue at the same level without recursion.
- They are not both allowed to be zero. */
- if (min == max) continue;
- /* If minimizing, keep trying and advancing the pointer */
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || !match_ref(offset, eptr, length, md, ims))
- return FALSE;
- eptr += length;
- }
- /* Control never gets here */
- }
- /* If maximizing, find the longest string and work backwards */
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (!match_ref(offset, eptr, length, md, ims)) break;
- eptr += length;
- }
- while (eptr >= pp)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- eptr -= length;
- }
- return FALSE;
- }
- }
- /* Control never gets here */
- /* Match a character class, possibly repeatedly. Look past the end of the
- item to see if there is repeat information following. Then obey similar
- code to character type repeats - written out again for speed. */
- case OP_CLASS:
- {
- const uschar *data = ecode + 1; /* Save for matching */
- ecode += 33; /* Advance past the item */
- switch (*ecode)
- {
- case OP_CRSTAR:
- case OP_CRMINSTAR:
- case OP_CRPLUS:
- case OP_CRMINPLUS:
- case OP_CRQUERY:
- case OP_CRMINQUERY:
- c = *ecode++ - OP_CRSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- break;
- case OP_CRRANGE:
- case OP_CRMINRANGE:
- minimize = (*ecode == OP_CRMINRANGE);
- min = (ecode[1] << 8) + ecode[2];
- max = (ecode[3] << 8) + ecode[4];
- if (max == 0) max = INT_MAX;
- ecode += 5;
- break;
- default: /* No repeat follows */
- min = max = 1;
- break;
- }
- /* First, ensure the minimum number of matches are present. */
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject) return FALSE;
- GETCHARINC(c, eptr) /* Get character; increment eptr */
- #ifdef SUPPORT_UTF8
- /* We do not yet support class members > 255 */
- if (c > 255) return FALSE;
- #endif
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- return FALSE;
- }
- /* If max == min we can continue with the main loop without the
- need to recurse. */
- if (min == max) continue;
- /* If minimizing, keep testing the rest of the expression and advancing
- the pointer while it matches the class. */
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject) return FALSE;
- GETCHARINC(c, eptr) /* Get character; increment eptr */
- #ifdef SUPPORT_UTF8
- /* We do not yet support class members > 255 */
- if (c > 255) return FALSE;
- #endif
- if ((data[c/8] & (1 << (c&7))) != 0) continue;
- return FALSE;
- }
- /* Control never gets here */
- }
- /* If maximizing, find the longest possible run, then work backwards. */
- else
- {
- const uschar *pp = eptr;
- int len = 1;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject) break;
- GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
- #ifdef SUPPORT_UTF8
- /* We do not yet support class members > 255 */
- if (c > 255) break;
- #endif
- if ((data[c/8] & (1 << (c&7))) == 0) break;
- eptr += len;
- }
- while (eptr >= pp)
- {
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- #ifdef SUPPORT_UTF8
- BACKCHAR(eptr)
- #endif
- }
- return FALSE;
- }
- }
- /* Control never gets here */
- /* Match a run of characters */
- case OP_CHARS:
- {
- register int length = ecode[1];
- ecode += 2;
- #ifdef DEBUG /* Sigh. Some compilers never learn. */
- if (eptr >= md->end_subject)
- printf("matching subject <null> against pattern ");
- else
- {
- printf("matching subject ");
- pchars(eptr, length, TRUE, md);
- printf(" against pattern ");
- }
- pchars(ecode, length, FALSE, md);
- printf("n");
- #endif
- if (length > md->end_subject - eptr) return FALSE;
- if ((ims & PCRE_CASELESS) != 0)
- {
- while (length-- > 0)
- if (md->lcc[*ecode++] != md->lcc[*eptr++])
- return FALSE;
- }
- else
- {
- while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
- }
- }
- break;
- /* Match a single character repeatedly; different opcodes share code. */
- case OP_EXACT:
- min = max = (ecode[1] << 8) + ecode[2];
- ecode += 3;
- goto REPEATCHAR;
- case OP_UPTO:
- case OP_MINUPTO:
- min = 0;
- max = (ecode[1] << 8) + ecode[2];
- minimize = *ecode == OP_MINUPTO;
- ecode += 3;
- goto REPEATCHAR;
- case OP_STAR:
- case OP_MINSTAR:
- case OP_PLUS:
- case OP_MINPLUS:
- case OP_QUERY:
- case OP_MINQUERY:
- c = *ecode++ - OP_STAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. We can give
- up quickly if there are fewer than the minimum number of characters left in
- the subject. */
- REPEATCHAR:
- if (min > md->end_subject - eptr) return FALSE;
- c = *ecode++;
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
- DPRINTF(("matching %c{%d,%d} against subject %.*sn", c, min, max,
- max, eptr));
- if ((ims & PCRE_CASELESS) != 0)
- {
- c = md->lcc[c];
- for (i = 1; i <= min; i++)
- if (c != md->lcc[*eptr++]) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject ||
- c != md->lcc[*eptr++])
- return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- /* Control never gets here */
- }
- /* Caseful comparisons */
- else
- {
- for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c != *eptr) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- }
- /* Control never gets here */
- /* Match a negated single character */
- case OP_NOT:
- if (eptr >= md->end_subject) return FALSE;
- ecode++;
- if ((ims & PCRE_CASELESS) != 0)
- {
- if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
- }
- else
- {
- if (*ecode++ == *eptr++) return FALSE;
- }
- break;
- /* Match a negated single character repeatedly. This is almost a repeat of
- the code for a repeated single character, but I haven't found a nice way of
- commoning these up that doesn't require a test of the positive/negative
- option for each character match. Maybe that wouldn't add very much to the
- time taken, but character matching *is* what this is all about... */
- case OP_NOTEXACT:
- min = max = (ecode[1] << 8) + ecode[2];
- ecode += 3;
- goto REPEATNOTCHAR;
- case OP_NOTUPTO:
- case OP_NOTMINUPTO:
- min = 0;
- max = (ecode[1] << 8) + ecode[2];
- minimize = *ecode == OP_NOTMINUPTO;
- ecode += 3;
- goto REPEATNOTCHAR;
- case OP_NOTSTAR:
- case OP_NOTMINSTAR:
- case OP_NOTPLUS:
- case OP_NOTMINPLUS:
- case OP_NOTQUERY:
- case OP_NOTMINQUERY:
- c = *ecode++ - OP_NOTSTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- /* Common code for all repeated single-character matches. We can give
- up quickly if there are fewer than the minimum number of characters left in
- the subject. */
- REPEATNOTCHAR:
- if (min > md->end_subject - eptr) return FALSE;
- c = *ecode++;
- /* The code is duplicated for the caseless and caseful cases, for speed,
- since matching characters is likely to be quite common. First, ensure the
- minimum number of matches are present. If min = max, continue at the same
- level without recursing. Otherwise, if minimizing, keep trying the rest of
- the expression and advancing one matching character if failing, up to the
- maximum. Alternatively, if maximizing, find the maximum number of
- characters and work backwards. */
- DPRINTF(("negative matching %c{%d,%d} against subject %.*sn", c, min, max,
- max, eptr));
- if ((ims & PCRE_CASELESS) != 0)
- {
- c = md->lcc[c];
- for (i = 1; i <= min; i++)
- if (c == md->lcc[*eptr++]) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject ||
- c == md->lcc[*eptr++])
- return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- /* Control never gets here */
- }
- /* Caseful comparisons */
- else
- {
- for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
- if (min == max) continue;
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
- }
- /* Control never gets here */
- }
- else
- {
- const uschar *pp = eptr;
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || c == *eptr) break;
- eptr++;
- }
- while (eptr >= pp)
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- return FALSE;
- }
- }
- /* Control never gets here */
- /* Match a single character type repeatedly; several different opcodes
- share code. This is very similar to the code for single characters, but we
- repeat it in the interests of efficiency. */
- case OP_TYPEEXACT:
- min = max = (ecode[1] << 8) + ecode[2];
- minimize = TRUE;
- ecode += 3;
- goto REPEATTYPE;
- case OP_TYPEUPTO:
- case OP_TYPEMINUPTO:
- min = 0;
- max = (ecode[1] << 8) + ecode[2];
- minimize = *ecode == OP_TYPEMINUPTO;
- ecode += 3;
- goto REPEATTYPE;
- case OP_TYPESTAR:
- case OP_TYPEMINSTAR:
- case OP_TYPEPLUS:
- case OP_TYPEMINPLUS:
- case OP_TYPEQUERY:
- case OP_TYPEMINQUERY:
- c = *ecode++ - OP_TYPESTAR;
- minimize = (c & 1) != 0;
- min = rep_min[c]; /* Pick up values from tables; */
- max = rep_max[c]; /* zero for max => infinity */
- if (max == 0) max = INT_MAX;
- /* Common code for all repeated single character type matches */
- REPEATTYPE:
- ctype = *ecode++; /* Code for the character type */
- /* First, ensure the minimum number of matches are present. Use inline
- code for maximizing the speed, and do the type test once at the start
- (i.e. keep it out of the loop). Also we can test that there are at least
- the minimum number of bytes before we start, except when doing '.' in
- UTF8 mode. Leave the test in in all cases; in the special case we have
- to test after each character. */
- if (min > md->end_subject - eptr) return FALSE;
- if (min > 0) switch(ctype)
- {
- case OP_ANY:
- #ifdef SUPPORT_UTF8
- if (md->utf8)
- {
- for (i = 1; i <= min; i++)
- {
- if (eptr >= md->end_subject ||
- (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
- return FALSE;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- break;
- }
- #endif
- /* Non-UTF8 can be faster */
- if ((ims & PCRE_DOTALL) == 0)
- { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
- else eptr += min;
- break;
- case OP_NOT_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
- break;
- case OP_DIGIT:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
- break;
- case OP_NOT_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
- break;
- case OP_WHITESPACE:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
- break;
- case OP_NOT_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) != 0)
- return FALSE;
- break;
- case OP_WORDCHAR:
- for (i = 1; i <= min; i++)
- if ((md->ctypes[*eptr++] & ctype_word) == 0)
- return FALSE;
- break;
- }
- /* If min = max, continue at the same level without recursing */
- if (min == max) continue;
- /* If minimizing, we have to test the rest of the pattern before each
- subsequent match. */
- if (minimize)
- {
- for (i = min;; i++)
- {
- if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
- if (i >= max || eptr >= md->end_subject) return FALSE;
- c = *eptr++;
- switch(ctype)
- {
- case OP_ANY:
- if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
- #ifdef SUPPORT_UTF8
- if (md->utf8)
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- #endif
- break;
- case OP_NOT_DIGIT:
- if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
- break;
- case OP_DIGIT:
- if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
- break;
- case OP_NOT_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
- break;
- case OP_WHITESPACE:
- if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
- break;
- case OP_NOT_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
- break;
- case OP_WORDCHAR:
- if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
- break;
- }
- }
- /* Control never gets here */
- }
- /* If maximizing it is worth using inline code for speed, doing the type
- test once at the start (i.e. keep it out of the loop). */
- else
- {
- const uschar *pp = eptr;
- switch(ctype)
- {
- case OP_ANY:
- /* Special code is required for UTF8, but when the maximum is unlimited
- we don't need it. */
- #ifdef SUPPORT_UTF8
- if (md->utf8 && max < INT_MAX)
- {
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- else
- {
- for (i = min; i < max; i++)
- {
- eptr++;
- while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
- }
- }
- break;
- }
- #endif
- /* Non-UTF8 can be faster */
- if ((ims & PCRE_DOTALL) == 0)
- {
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || *eptr == NEWLINE) break;
- eptr++;
- }
- }
- else
- {
- c = max - min;
- if (c > md->end_subject - eptr) c = md->end_subject - eptr;
- eptr += c;
- }
- break;
- case OP_NOT_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
- break;
- eptr++;
- }
- break;
- case OP_DIGIT:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
- break;
- eptr++;
- }
- break;
- case OP_NOT_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
- break;
- eptr++;
- }
- break;
- case OP_WHITESPACE:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
- break;
- eptr++;
- }
- break;
- case OP_NOT_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
- break;
- eptr++;
- }
- break;
- case OP_WORDCHAR:
- for (i = min; i < max; i++)
- {
- if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
- break;
- eptr++;
- }
- break;
- }
- while (eptr >= pp)
- {
- if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
- return TRUE;
- #ifdef SUPPORT_UTF8
- if (md->utf8)
- while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
- #endif
- }
- return FALSE;
- }
- /* Control never gets here */
- /* There's been some horrible disaster. */
- default:
- DPRINTF(("Unknown opcode %dn", *ecode));
- md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
- return FALSE;
- }
- /* Do not stick any code in here without much thought; it is assumed
- that "continue" in the code above comes out to here to repeat the main
- loop. */
- } /* End of main loop */
- /* Control never reaches here */
- }
- /*************************************************
- * Execute a Regular Expression *
- *************************************************/
- /* This function applies a compiled re to a subject string and picks out
- portions of the string if it matches. Two elements in the vector are set for
- each substring: the offsets to the start and end of the substring.
- Arguments:
- external_re points to the compiled expression
- external_extra points to "hints" from pcre_study() or is NULL
- subject points to the subject string
- length length of subject string (may contain binary zeros)
- start_offset where to start in the subject string
- options option bits
- offsets points to a vector of ints to be filled in with offsets
- offsetcount the number of elements in the vector
- Returns: > 0 => success; value is the number of elements filled in
- = 0 => success, but offsets is not big enough
- -1 => failed to match
- < -1 => some kind of unexpected problem
- */
- int
- pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
- const char *subject, int length, int start_offset, int options, int *offsets,
- int offsetcount)
- {
- int resetcount, ocount;
- int first_char = -1;
- int req_char = -1;
- int req_char2 = -1;
- unsigned long int ims = 0;
- match_data match_block;
- const uschar *start_bits = NULL;
- const uschar *start_match = (const uschar *)subject + start_offset;
- const uschar *end_subject;
- const uschar *req_char_ptr = start_match - 1;
- const real_pcre *re = (const real_pcre *)external_re;
- const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
- BOOL using_temporary_offsets = FALSE;
- BOOL anchored;
- BOOL startline;
- if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
- if (re == NULL || subject == NULL ||
- (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
- if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
- anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
- startline = (re->options & PCRE_STARTLINE) != 0;
- match_block.start_pattern = re->code;
- match_block.start_subject = (const uschar *)subject;
- match_block.end_subject = match_block.start_subject + length;
- end_subject = match_block.end_subject;
- match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
- match_block.utf8 = (re->options & PCRE_UTF8) != 0;
- match_block.notbol = (options & PCRE_NOTBOL) != 0;
- match_block.noteol = (options & PCRE_NOTEOL) != 0;
- match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
- match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
- match_block.lcc = re->tables + lcc_offset;
- match_block.ctypes = re->tables + ctypes_offset;
- /* The ims options can vary during the matching as a result of the presence
- of (?ims) items in the pattern. They are kept in a local variable so that
- restoring at the exit of a group is easy. */
- ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
- /* If the expression has got more back references than the offsets supplied can
- hold, we get a temporary bit of working store to use during the matching.
- Otherwise, we can use the vector supplied, rounding down its size to a multiple
- of 3. */
- ocount = offsetcount - (offsetcount % 3);
- if (re->top_backref > 0 && re->top_backref >= ocount/3)
- {
- ocount = re->top_backref * 3 + 3;
- match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
- if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
- using_temporary_offsets = TRUE;
- DPRINTF(("Got memory to hold back referencesn"));
- }
- else match_block.offset_vector = offsets;
- match_block.offset_end = ocount;
- match_block.offset_max = (2*ocount)/3;
- match_block.offset_overflow = FALSE;
- /* Compute the minimum number of offsets that we need to reset each time. Doing
- this makes a huge difference to execution time when there aren't many brackets
- in the pattern. */
- resetcount = 2 + re->top_bracket * 2;
- if (resetcount > offsetcount) resetcount = ocount;
- /* Reset the working variable associated with each extraction. These should
- never be used unless previously set, but they get saved and restored, and so we
- initialize them to avoid reading uninitialized locations. */
- if (match_block.offset_vector != NULL)
- {
- register int *iptr = match_block.offset_vector + ocount;
- register int *iend = iptr - resetcount/2 + 1;
- while (--iptr >= iend) *iptr = -1;
- }
- /* Set up the first character to match, if available. The first_char value is
- never set for an anchored regular expression, but the anchoring may be forced
- at run time, so we have to test for anchoring. The first char may be unset for
- an unanchored pattern, of course. If there's no first char and the pattern was
- studied, there may be a bitmap of possible first characters. */
- if (!anchored)
- {
- if ((re->options & PCRE_FIRSTSET) != 0)
- {
- first_char = re->first_char;
- if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
- }
- else
- if (!startline && extra != NULL &&
- (extra->options & PCRE_STUDY_MAPPED) != 0)
- start_bits = extra->start_bits;
- }
- /* For anchored or unanchored matches, there may be a "last known required
- character" set. If the PCRE_CASELESS is set, implying that the match starts
- caselessly, or if there are any changes of this flag within the regex, set up
- both cases of the character. Otherwise set the two values the same, which will
- avoid duplicate testing (which takes significant time). This covers the vast
- majority of cases. It will be suboptimal when the case flag changes in a regex
- and the required character in fact is caseful. */
- if ((re->options & PCRE_REQCHSET) != 0)
- {
- req_char = re->req_char;
- req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
- (re->tables + fcc_offset)[req_char] : req_char;
- }
- /* Loop for handling unanchored repeated matching attempts; for anchored regexs
- the loop runs just once. */
- do
- {
- int rc;
- register int *iptr = match_block.offset_vector;
- register int *iend = iptr + resetcount;
- /* Reset the maximum number of extractions we might see. */
- while (iptr < iend) *iptr++ = -1;
- /* Advance to a unique first char if possible */
- if (first_char >= 0)
- {
- if ((ims & PCRE_CASELESS) != 0)
- while (start_match < end_subject &&
- match_block.lcc[*start_match] != first_char)
- start_match++;
- else
- while (start_match < end_subject && *start_match != first_char)
- start_match++;
- }
- /* Or to just after n for a multiline match if possible */
- else if (startline)
- {
- if (start_match > match_block.start_subject + start_offset)
- {
- while (start_match < end_subject && start_match[-1] != NEWLINE)
- start_match++;
- }
- }
- /* Or to a non-unique first char after study */
- else if (start_bits != NULL)
- {
- while (start_match < end_subject)
- {
- register int c = *start_match;
- if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
- }
- }
- #ifdef DEBUG /* Sigh. Some compilers never learn. */
- printf(">>>> Match against: ");
- pchars(start_match, end_subject - start_match, TRUE, &match_block);
- printf("n");
- #endif
- /* If req_char is set, we know that that character must appear in the subject
- for the match to succeed. If the first character is set, req_char must be
- later in the subject; otherwise the test starts at the match point. This
- optimization can save a huge amount of backtracking in patterns with nested
- unlimited repeats that aren't going to match. We don't know what the state of
- case matching may be when this character is hit, so test for it in both its
- cases if necessary. However, the different cased versions will not be set up
- unless PCRE_CASELESS was given or the casing state changes within the regex.
- Writing separate code makes it go faster, as does using an autoincrement and
- backing off on a match. */
- if (req_char >= 0)
- {
- register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
- /* We don't need to repeat the search if we haven't yet reached the
- place we found it at last time. */
- if (p > req_char_ptr)
- {
- /* Do a single test if no case difference is set up */
- if (req_char == req_char2)
- {
- while (p < end_subject)
- {
- if (*p++ == req_char) { p--; break; }
- }
- }
- /* Otherwise test for either case */
- else
- {
- while (p < end_subject)
- {
- register int pp = *p++;
- if (pp == req_char || pp == req_char2) { p--; break; }
- }
- }
- /* If we can't find the required character, break the matching loop */
- if (p >= end_subject) break;
- /* If we have found the required character, save the point where we
- found it, so that we don't search again next time round the loop if
- the start hasn't passed this character yet. */
- req_char_ptr = p;
- }
- }
- /* When a match occurs, substrings will be set for all internal extractions;
- we just need to set up the whole thing as substring 0 before returning. If
- there were too many extractions, set the return code to zero. In the case
- where we had to get some local store to hold offsets for backreferences, copy
- those back references that we can. In this case there need not be overflow
- if certain parts of the pattern were not used. */
- match_block.start_match = start_match;
- if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
- continue;
- /* Copy the offset information from temporary store if necessary */
- if (using_temporary_offsets)
- {
- if (offsetcount >= 4)
- {
- memcpy(offsets + 2, match_block.offset_vector + 2,
- (offsetcount - 2) * sizeof(int));
- DPRINTF(("Copied offsets from temporary memoryn"));
- }
- if (match_block.end_offset_top > offsetcount)
- match_block.offset_overflow = TRUE;
- DPRINTF(("Freeing temporary memoryn"));
- (pcre_free)(match_block.offset_vector);
- }
- rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
- if (offsetcount < 2) rc = 0; else
- {
- offsets[0] = start_match - match_block.start_subject;
- offsets[1] = match_block.end_match_ptr - match_block.start_subject;
- }
- DPRINTF((">>>> returning %dn", rc));
- return rc;
- }
- /* This "while" is the end of the "do" above */
- while (!anchored &&
- match_block.errorcode == PCRE_ERROR_NOMATCH &&
- start_match++ < end_subject);
- if (using_temporary_offsets)
- {
- DPRINTF(("Freeing temporary memoryn"));
- (pcre_free)(match_block.offset_vector);
- }
- DPRINTF((">>>> returning %dn", match_block.errorcode));
- return match_block.errorcode;
- }
- /* End of pcre.c */