RegExpr.pas
上传用户:yjb1804
上传日期:2021-01-30
资源大小:3105k
文件大小:145k
- unit RegExpr;
- {
- TRegExpr class library
- Delphi Regular Expressions
- Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
- You may use this software in any kind of development,
- including comercial, redistribute, and modify it freely,
- under the following restrictions :
- 1. This software is provided as it is, without any kind of
- warranty given. Use it at Your own risk.The author is not
- responsible for any consequences of use of this software.
- 2. The origin of this software may not be mispresented, You
- must not claim that You wrote the original software. If
- You use this software in any kind of product, it would be
- appreciated that there in a information box, or in the
- documentation would be an acknowledgement like
- Partial Copyright (c) 2004 Andrey V. Sorokin
- http://RegExpStudio.com
- mailto:anso@mail.ru
- 3. You may not have any income from distributing this source
- (or altered version of it) to other developers. When You
- use this product in a comercial package, the source may
- not be charged seperatly.
- 4. Altered versions must be plainly marked as such, and must
- not be misrepresented as being the original software.
- 5. RegExp Studio application and all the visual components as
- well as documentation is not part of the TRegExpr library
- and is not free for usage.
- mailto:anso@mail.ru
- http://RegExpStudio.com
- http://anso.da.ru/
- }
- interface
- // ======== Determine compiler
- {$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
- {$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
- {$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
- {$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
- {$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
- {$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
- {$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
- {$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
- {$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
- // ======== Define base compiler options
- {$BOOLEVAL OFF}
- {$EXTENDEDSYNTAX ON}
- {$LONGSTRINGS ON}
- {$OPTIMIZATION ON}
- {$IFDEF D6}
- {$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
- {$ENDIF}
- {$IFDEF D7}
- {$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
- {$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
- {$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
- {$ENDIF}
- {$IFDEF FPC}
- {$MODE DELPHI} // Delphi-compatible mode in FreePascal
- {$ENDIF}
- // ======== Define options for TRegExpr engine
- {.$DEFINE UniCode} // Unicode support
- {$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
- {$IFNDEF FPC} // the option is not supported in FreePascal
- {$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
- {$ENDIF}
- {$DEFINE ComplexBraces} // support braces in complex cases
- {$IFNDEF UniCode} // the option applicable only for non-UniCode mode
- {$DEFINE UseSetOfChar} // Significant optimization by using set of char
- {$ENDIF}
- {$IFDEF UseSetOfChar}
- {$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
- {$ENDIF}
- // ======== Define Pascal-language options
- // Define 'UseAsserts' option (do not edit this definitions).
- // Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
- // completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
- {$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
- {$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
- // Define 'use subroutine parameters default values' option (do not edit this definition).
- {$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
- // Define 'OverMeth' options, to use method overloading (do not edit this definitions).
- {$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
- {$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
- uses
- Classes, // TStrings in Split method
- SysUtils; // Exception
- type
- {$IFDEF UniCode}
- PRegExprChar = PWideChar;
- RegExprString = WideString;
- REChar = WideChar;
- {$ELSE}
- PRegExprChar = PChar;
- RegExprString = AnsiString; //###0.952 was string
- REChar = Char;
- {$ENDIF}
- TREOp = REChar; // internal p-code type //###0.933
- PREOp = ^TREOp;
- TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
- PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
- TREBracesArg = integer; // type of {m,n} arguments
- PREBracesArg = ^TREBracesArg;
- const
- REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
- RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
- REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
- type
- TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
- of object;
- const
- EscChar = ''; // 'Escape'-char ('' in common r.e.) used for escaping metachars (w, d etc).
- RegExprModifierI : boolean = False; // default value for ModifierI
- RegExprModifierR : boolean = True; // default value for ModifierR
- RegExprModifierS : boolean = True; // default value for ModifierS
- RegExprModifierG : boolean = True; // default value for ModifierG
- RegExprModifierM : boolean = False; // default value for ModifierM
- RegExprModifierX : boolean = False; // default value for ModifierX
- RegExprSpaceChars : RegExprString = // default value for SpaceChars
- ' '#$9#$A#$D#$C;
- RegExprWordChars : RegExprString = // default value for WordChars
- '0123456789' //###0.940
- + 'abcdefghijklmnopqrstuvwxyz'
- + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
- RegExprLineSeparators : RegExprString =// default value for LineSeparators
- #$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
- RegExprLinePairedSeparator : RegExprString =// default value for LinePairedSeparator
- #$d#$a;
- { if You need Unix-styled line separators (only n), then use:
- RegExprLineSeparators = #$a;
- RegExprLinePairedSeparator = '';
- }
- const
- NSUBEXP = 15; // max number of subexpression //###0.929
- // Cannot be more than NSUBEXPMAX
- // Be carefull - don't use values which overflow CLOSE opcode
- // (in this case you'll get compiler erorr).
- // Big NSUBEXP will cause more slow work and more stack required
- NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
- // Don't change it! It's defined by internal TRegExpr design.
- MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
- {$IFDEF ComplexBraces}
- LoopStackMax = 10; // max depth of loops stack //###0.925
- {$ENDIF}
- TinySetLen = 3;
- // if range includes more then TinySetLen chars, //###0.934
- // then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
- // !!! Attension ! If you change TinySetLen, you must
- // change code marked as "//!!!TinySet"
- type
- {$IFDEF UseSetOfChar}
- PSetOfREChar = ^TSetOfREChar;
- TSetOfREChar = set of REChar;
- {$ENDIF}
- TRegExpr = class;
- TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
- of object;
- TRegExpr = class
- private
- startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
- endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
- {$IFDEF ComplexBraces}
- LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
- LoopStackIdx : integer; // 0 - out of all loops
- {$ENDIF}
- // The "internal use only" fields to pass info from compile
- // to execute that permits the execute phase to run lots faster on
- // simple cases.
- regstart : REChar; // char that must begin a match; ' ' if none obvious
- reganch : REChar; // is the match anchored (at beginning-of-line only)?
- regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
- regmlen : integer; // length of regmust string
- // Regstart and reganch permit very fast decisions on suitable starting points
- // for a match, cutting down the work a lot. Regmust permits fast rejection
- // of lines that cannot possibly match. The regmust tests are costly enough
- // that regcomp() supplies a regmust only if the r.e. contains something
- // potentially expensive (at present, the only such thing detected is * or +
- // at the start of the r.e., which can involve a lot of backup). Regmlen is
- // supplied because the test in regexec() needs it and regcomp() is computing
- // it anyway.
- {$IFDEF UseFirstCharSet} //###0.929
- FirstCharSet : TSetOfREChar;
- {$ENDIF}
- // work variables for Exec's routins - save stack in recursion}
- reginput : PRegExprChar; // String-input pointer.
- fInputStart : PRegExprChar; // Pointer to first char of input string.
- fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
- // work variables for compiler's routines
- regparse : PRegExprChar; // Input-scan pointer.
- regnpar : integer; // count.
- regdummy : char;
- regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
- regsize : integer; // Code size.
- regexpbeg : PRegExprChar; // only for error handling. Contains
- // pointer to beginning of r.e. while compiling
- fExprIsCompiled : boolean; // true if r.e. successfully compiled
- // programm is essentially a linear encoding
- // of a nondeterministic finite-state machine (aka syntax charts or
- // "railroad normal form" in parsing technology). Each node is an opcode
- // plus a "next" pointer, possibly plus an operand. "Next" pointers of
- // all nodes except BRANCH implement concatenation; a "next" pointer with
- // a BRANCH on both ends of it is connecting two alternatives. (Here we
- // have one of the subtle syntax dependencies: an individual BRANCH (as
- // opposed to a collection of them) is never concatenated with anything
- // because of operator precedence.) The operand of some types of node is
- // a literal string; for others, it is a node leading into a sub-FSM. In
- // particular, the operand of a BRANCH node is the first node of the branch.
- // (NB this is *not* a tree structure: the tail of the branch connects
- // to the thing following the set of BRANCHes.) The opcodes are:
- programm : PRegExprChar; // Unwarranted chumminess with compiler.
- fExpression : PRegExprChar; // source of compiled r.e.
- fInputString : PRegExprChar; // input string
- fLastError : integer; // see Error, LastError
- fModifiers : integer; // modifiers
- fCompModifiers : integer; // compiler's copy of modifiers
- fProgModifiers : integer; // modifiers values from last programm compilation
- fSpaceChars : RegExprString; //###0.927
- fWordChars : RegExprString; //###0.929
- fInvertCase : TRegExprInvertCaseFunction; //###0.927
- fLineSeparators : RegExprString; //###0.941
- fLinePairedSeparatorAssigned : boolean;
- fLinePairedSeparatorHead,
- fLinePairedSeparatorTail : REChar;
- {$IFNDEF UniCode}
- fLineSeparatorsSet : set of REChar;
- {$ENDIF}
- procedure InvalidateProgramm;
- // Mark programm as have to be [re]compiled
- function IsProgrammOk : boolean; //###0.941
- // Check if we can use precompiled r.e. or
- // [re]compile it if something changed
- function GetExpression : RegExprString;
- procedure SetExpression (const s : RegExprString);
- function GetModifierStr : RegExprString;
- class function ParseModifiersStr (const AModifiers : RegExprString;
- var AModifiersInt : integer) : boolean; //###0.941 class function now
- // Parse AModifiers string and return true and set AModifiersInt
- // if it's in format 'ismxrg-ismxrg'.
- procedure SetModifierStr (const AModifiers : RegExprString);
- function GetModifier (AIndex : integer) : boolean;
- procedure SetModifier (AIndex : integer; ASet : boolean);
- procedure Error (AErrorID : integer); virtual; // error handler.
- // Default handler raise exception ERegExpr with
- // Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
- // and CompilerErrorPos = value of property CompilerErrorPos.
- {==================== Compiler section ===================}
- function CompileRegExpr (exp : PRegExprChar) : boolean;
- // compile a regular expression into internal code
- procedure Tail (p : PRegExprChar; val : PRegExprChar);
- // set the next-pointer at the end of a node chain
- procedure OpTail (p : PRegExprChar; val : PRegExprChar);
- // regoptail - regtail on operand of first argument; nop if operandless
- function EmitNode (op : TREOp) : PRegExprChar;
- // regnode - emit a node, return location
- procedure EmitC (b : REChar);
- // emit (if appropriate) a byte of code
- procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
- // insert an operator in front of already-emitted operand
- // Means relocating the operand.
- function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
- // regular expression, i.e. main body or parenthesized thing
- function ParseBranch (var flagp : integer) : PRegExprChar;
- // one alternative of an | operator
- function ParsePiece (var flagp : integer) : PRegExprChar;
- // something followed by possible [*+?]
- function ParseAtom (var flagp : integer) : PRegExprChar;
- // the lowest level
- function GetCompilerErrorPos : integer;
- // current pos in r.e. - for error hanling
- {$IFDEF UseFirstCharSet} //###0.929
- procedure FillFirstCharSet (prog : PRegExprChar);
- {$ENDIF}
- {===================== Mathing section ===================}
- function regrepeat (p : PRegExprChar; AMax : integer) : integer;
- // repeatedly match something simple, report how many
- function regnext (p : PRegExprChar) : PRegExprChar;
- // dig the "next" pointer out of a node
- function MatchPrim (prog : PRegExprChar) : boolean;
- // recursively matching routine
- function ExecPrim (AOffset: integer) : boolean;
- // Exec for stored InputString
- {$IFDEF RegExpPCodeDump}
- function DumpOp (op : REChar) : RegExprString;
- {$ENDIF}
- function GetSubExprMatchCount : integer;
- function GetMatchPos (Idx : integer) : integer;
- function GetMatchLen (Idx : integer) : integer;
- function GetMatch (Idx : integer) : RegExprString;
- function GetInputString : RegExprString;
- procedure SetInputString (const AInputString : RegExprString);
- {$IFNDEF UseSetOfChar}
- function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
- {$ENDIF}
- procedure SetLineSeparators (const AStr : RegExprString);
- procedure SetLinePairedSeparator (const AStr : RegExprString);
- function GetLinePairedSeparator : RegExprString;
- public
- constructor Create;
- destructor Destroy; override;
- class function VersionMajor : integer; //###0.944
- class function VersionMinor : integer; //###0.944
- property Expression : RegExprString read GetExpression write SetExpression;
- // Regular expression.
- // For optimization, TRegExpr will automatically compiles it into 'P-code'
- // (You can see it with help of Dump method) and stores in internal
- // structures. Real [re]compilation occures only when it really needed -
- // while calling Exec[Next], Substitute, Dump, etc
- // and only if Expression or other P-code affected properties was changed
- // after last [re]compilation.
- // If any errors while [re]compilation occures, Error method is called
- // (by default Error raises exception - see below)
- property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
- // Set/get default values of r.e.syntax modifiers. Modifiers in
- // r.e. (?ismx-ismx) will replace this default values.
- // If you try to set unsupported modifier, Error will be called
- // (by defaul Error raises exception ERegExpr).
- property ModifierI : boolean index 1 read GetModifier write SetModifier;
- // Modifier /i - caseinsensitive, initialized from RegExprModifierI
- property ModifierR : boolean index 2 read GetModifier write SetModifier;
- // Modifier /r - use r.e.syntax extended for russian,
- // (was property ExtSyntaxEnabled in previous versions)
- // If true, then