ICU 4.6 4.6
regex.h
Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2010, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053 
00054 #include "unicode/uregex.h"
00055 
00056 U_NAMESPACE_BEGIN
00057 
00058 
00059 // Forward declarations.  Within this header these types are only named as pointer or
00060 
00061 class RegexMatcher;            // Defined later in this header.
00062 class RegexPattern;            // Defined later in this header.
00063 class UVector;
00064 class UVector32;
00065 class UVector64;
00066 class UnicodeSet;
00067 struct REStackFrame;
00068 struct Regex8BitSet;
00069 class  RuleBasedBreakIterator;
00070 class  RegexCImpl;             // Named as a friend of both RegexPattern and RegexMatcher.
00071 
00072 
00073 
00074 
00079 #ifdef REGEX_DEBUG   // Debug builds only: declare the real pattern-dump function.
00080 U_INTERNAL void U_EXPORT2
00081     RegexPatternDump(const RegexPattern *pat);
00082 #else   // REGEX_DEBUG not defined: RegexPatternDump(pat) becomes a macro that expands to nothing.
00083     #undef RegexPatternDump
00084     #define RegexPatternDump(pat)
00085 #endif
00086 
00087 
00088 
00100 class U_I18N_API RegexPattern: public UObject {   // One compiled regular expression: original pattern text, compile flags and compiled p-code (see data members below).
00101 public:
00102 
00110     RegexPattern();   // Default constructor.
00111 
00118     RegexPattern(const RegexPattern &source);   // Copy constructor.
00119 
00125     virtual ~RegexPattern();
00126 
00135     UBool           operator==(const RegexPattern& that) const;
00136 
00145     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};   // Exact negation of operator==.
00146 
00152     RegexPattern  &operator =(const RegexPattern &source);
00153 
00161     virtual RegexPattern  *clone() const;   // NOTE(review): presumably returns a caller-owned copy -- confirm against the implementation.
00162 
00163 
00188     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,   // compile(): UnicodeString pattern, no flags argument.
00189         UParseError          &pe,
00190         UErrorCode           &status);
00191 
00192 
00219     static RegexPattern * U_EXPORT2 compile( UText *regex,   // compile(): UText pattern, no flags argument.
00220         UParseError          &pe,
00221         UErrorCode           &status);
00222 
00247     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,   // compile(): UnicodeString pattern, with flags and a UParseError.
00248         uint32_t             flags,
00249         UParseError          &pe,
00250         UErrorCode           &status);
00251         
00252         
00279     static RegexPattern * U_EXPORT2 compile( UText *regex,   // compile(): UText pattern, with flags and a UParseError.
00280         uint32_t             flags,
00281         UParseError          &pe,
00282         UErrorCode           &status);
00283     
00284 
00307     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,   // compile(): UnicodeString pattern, with flags, no UParseError argument.
00308         uint32_t             flags,
00309         UErrorCode           &status);
00310 
00311 
00336     static RegexPattern * U_EXPORT2 compile( UText *regex,   // compile(): UText pattern, with flags, no UParseError argument.
00337         uint32_t             flags,
00338         UErrorCode           &status);
00339     
00340 
00346     virtual uint32_t flags() const;   // NOTE(review): presumably returns fFlags (the flags used when compiling) -- confirm.
00347 
00365     virtual RegexMatcher *matcher(const UnicodeString &input,   // matcher(): UnicodeString input.
00366         UErrorCode          &status) const;
00367         
00368         
00373     enum PatternIsUTextFlag { PATTERN_IS_UTEXT };   // Single-value tag enum; it is the extra parameter of the UText matcher() overload below.
00374 
00394     virtual RegexMatcher *matcher(UText *input,   // matcher(): UText input; takes the PatternIsUTextFlag tag as an additional argument.
00395         PatternIsUTextFlag      flag, 
00396         UErrorCode          &status) const;
00397 
00398 private:
00412     RegexMatcher *matcher(const UChar *input,   // Private, non-virtual overload.  NOTE(review): presumably private so that a raw UChar* cannot be passed where a UnicodeString is expected -- confirm.
00413         UErrorCode          &status) const;
00414 public:
00415 
00416 
00428     virtual RegexMatcher *matcher(UErrorCode  &status) const;   // matcher(): overload with no input text argument.
00429 
00430 
00445     static UBool U_EXPORT2 matches(const UnicodeString   &regex,   // Static overload: both the pattern and the input are UnicodeStrings.
00446         const UnicodeString   &input,
00447               UParseError     &pe,
00448               UErrorCode      &status);
00449 
00450 
00465     static UBool U_EXPORT2 matches(UText *regex,   // Static overload: both the pattern and the input are UTexts.
00466         UText           *input,
00467         UParseError     &pe,
00468         UErrorCode      &status);
00469 
00470 
00479     virtual UnicodeString pattern() const;   // Returns a UnicodeString by value.  NOTE(review): presumably the original pattern string -- confirm.
00480     
00481     
00492     virtual UText *patternText(UErrorCode      &status) const;   // UText counterpart of pattern().
00493 
00494 
00520     virtual int32_t  split(const UnicodeString &input,   // split(): UnicodeString input; results go into the caller-supplied dest[] array.
00521         UnicodeString    dest[],
00522         int32_t          destCapacity,   // NOTE(review): presumably the number of elements available in dest[] -- confirm.
00523         UErrorCode       &status) const;
00524 
00525 
00551     virtual int32_t  split(UText *input,   // split(): UText input; dest[] is an array of UText pointers.
00552         UText            *dest[],
00553         int32_t          destCapacity,
00554         UErrorCode       &status) const;
00555 
00556 
00562     virtual UClassID getDynamicClassID() const;
00563 
00569     static UClassID U_EXPORT2 getStaticClassID();
00570 
00571 private:
00572     //
00573     //  Implementation Data
00574     //
00575     UText          *fPattern;      // The original pattern string.
00576     UnicodeString  *fPatternString; // The original pattern UnicodeString, if relevant.
00577     uint32_t        fFlags;        // The flags used when compiling the pattern.
00578                                    //
00579     UVector64       *fCompiledPat; // The compiled pattern p-code.
00580     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00581                                    //   after un-escaping, for use during the match.
00582 
00583     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00584     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00585 
00586 
00587     UErrorCode      fDeferredStatus; // status if some prior error has left this
00588                                    //  RegexPattern in an unusable state.
00589 
00590     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00591                                    //   >= this value.  For some patterns, this calculated
00592                                    //   value may be less than the true shortest
00593                                    //   possible match.
00594     
00595     int32_t         fFrameSize;    // Size of a state stack frame in the
00596                                    //   execution engine.
00597 
00598     int32_t         fDataSize;     // The size of the data needed by the pattern that
00599                                    //   does not go on the state stack, but has just
00600                                    //   a single copy per matcher.
00601 
00602     UVector32       *fGroupMap;    // Map from capture group number to position of
00603                                    //   the group's variables in the matcher stack frame.
00604 
00605     int32_t         fMaxCaptureDigits;
00606 
00607     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00608                                    //   regex character classes, e.g. Word.
00609 
00610     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00611                                    //  sets for predefined regex classes.
00612 
00613     int32_t         fStartType;    // Info on how a match must start.
00614     int32_t         fInitialStringIdx;     // NOTE(review): presumably, with fInitialStringLen, locates a required initial literal string -- confirm.
00615     int32_t         fInitialStringLen;
00616     UnicodeSet     *fInitialChars;
00617     UChar32         fInitialChar;
00618     Regex8BitSet   *fInitialChars8;
00619     UBool           fNeedsAltInput;
00620 
00621     friend class RegexCompile;
00622     friend class RegexMatcher;
00623     friend class RegexCImpl;
00624 
00625     //
00626     //  Implementation Methods
00627     //
00628     void        init();            // Common initialization, for use by constructors.
00629     void        zap();             // Common cleanup
00630 #ifdef REGEX_DEBUG
00631     void        dumpOp(int32_t index) const;   // Only declared when REGEX_DEBUG is defined.
00632     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00633 #endif
00634 
00635 };
00636 
00637 
00638 
00648 class U_I18N_API RegexMatcher: public UObject {   // Match engine state for one RegexPattern applied to one input text (see fPattern / fInputText below).
00649 public:
00650 
00665     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);   // Pattern given as UnicodeString; no input text argument.
00666 
00682     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);   // Pattern given as UText; no input text argument.
00683     
00705     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,   // Pattern and input both given as UnicodeString.
00706         uint32_t flags, UErrorCode &status);
00707 
00729     RegexMatcher(UText *regexp, UText *input,   // Pattern and input both given as UText.
00730         uint32_t flags, UErrorCode &status);
00731 
00732 private:
00746     RegexMatcher(const UnicodeString &regexp, const UChar *input,   // Private overload.  NOTE(review): presumably private so that a raw UChar* input is rejected at compile time -- confirm.
00747         uint32_t flags, UErrorCode &status);
00748 public:
00749 
00750 
00756     virtual ~RegexMatcher();
00757 
00758 
00765     virtual UBool matches(UErrorCode &status);
00766 
00767 
00778     virtual UBool matches(int64_t startIndex, UErrorCode &status);
00779 
00780 
00794     virtual UBool lookingAt(UErrorCode &status);
00795 
00796 
00810     virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
00811 
00812 
00825     virtual UBool find();   // Unlike find(start, status) below, this overload takes no UErrorCode.
00826 
00827 
00837     virtual UBool find(int64_t start, UErrorCode &status);
00838 
00839 
00849     virtual UnicodeString group(UErrorCode &status) const;
00850 
00851 
00864     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00865 
00866 
00872     virtual int32_t groupCount() const;
00873 
00874 
00889     virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
00890 
00894     virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
00895 
00911     virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00912 
00913 
00921     virtual int32_t start(UErrorCode &status) const;
00922 
00926     virtual int64_t start64(UErrorCode &status) const;   // Same parameters as start(), but returns int64_t.
00927 
00928 
00942     virtual int32_t start(int32_t group, UErrorCode &status) const;
00943 
00947     virtual int64_t start64(int32_t group, UErrorCode &status) const;   // Same parameters as start(group), but returns int64_t.
00948 
00949 
00962     virtual int32_t end(UErrorCode &status) const;
00963 
00967     virtual int64_t end64(UErrorCode &status) const;   // Same parameters as end(), but returns int64_t.
00968 
00969 
00986     virtual int32_t end(int32_t group, UErrorCode &status) const;
00987 
00991     virtual int64_t end64(int32_t group, UErrorCode &status) const;   // Same parameters as end(group), but returns int64_t.
00992 
00993 
01002     virtual RegexMatcher &reset();
01003 
01004 
01020     virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
01021 
01022 
01040     virtual RegexMatcher &reset(const UnicodeString &input);
01041 
01042 
01056     virtual RegexMatcher &reset(UText *input);
01057 
01058 private:
01072     RegexMatcher &reset(const UChar *input);   // Private overload.  NOTE(review): presumably private for the same reason as the UChar* constructor -- confirm.
01073 public:
01074 
01082     virtual const UnicodeString &input() const;   // See fInput below, which is documented as only used for input().
01083     
01092     virtual UText *inputText() const;
01093     
01103     virtual UText *getInput(UText *dest, UErrorCode &status) const;
01104     
01105 
01124      virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
01125 
01135      virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
01136 
01145      virtual int32_t regionStart() const;
01146 
01150      virtual int64_t regionStart64() const;   // Same as regionStart(), but returns int64_t.
01151 
01152 
01161       virtual int32_t regionEnd() const;
01162 
01166       virtual int64_t regionEnd64() const;   // Same as regionEnd(), but returns int64_t.
01167 
01176       virtual UBool hasTransparentBounds() const;
01177 
01196       virtual RegexMatcher &useTransparentBounds(UBool b);
01197 
01198      
01206       virtual UBool hasAnchoringBounds() const;
01207 
01208 
01221       virtual RegexMatcher &useAnchoringBounds(UBool b);
01222 
01223 
01236       virtual UBool hitEnd() const;
01237 
01247       virtual UBool requireEnd() const;
01248 
01249 
01255     virtual const RegexPattern &pattern() const;
01256 
01257 
01274     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01275 
01276 
01297     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01298     
01299 
01320     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01321     
01322 
01347     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01348     
01349     
01377     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01378         const UnicodeString &replacement, UErrorCode &status);
01379     
01380     
01408     virtual RegexMatcher &appendReplacement(UText *dest,
01409         UText *replacement, UErrorCode &status);
01410 
01411 
01422     virtual UnicodeString &appendTail(UnicodeString &dest);
01423 
01424 
01437     virtual UText *appendTail(UText *dest, UErrorCode &status);
01438 
01439 
01463     virtual int32_t  split(const UnicodeString &input,   // Non-const here, unlike RegexPattern::split().
01464         UnicodeString    dest[],
01465         int32_t          destCapacity,
01466         UErrorCode       &status);
01467 
01468 
01492     virtual int32_t  split(UText *input,
01493         UText           *dest[],
01494         int32_t          destCapacity,
01495         UErrorCode       &status);
01496     
01518     virtual void setTimeLimit(int32_t limit, UErrorCode &status);   // See fTimeLimit below: arbitrary steps, zero for unlimited.
01519 
01526     virtual int32_t getTimeLimit() const;
01527 
01549     virtual void setStackLimit(int32_t  limit, UErrorCode &status);   // See fStackLimit below: bytes, zero for unlimited.
01550     
01558     virtual int32_t  getStackLimit() const;
01559 
01560 
01574     virtual void setMatchCallback(URegexMatchCallback     *callback,
01575                                   const void              *context,
01576                                   UErrorCode              &status);
01577 
01578 
01589     virtual void getMatchCallback(URegexMatchCallback     *&callback,   // callback and context are references to pointers, i.e. filled in for the caller.
01590                                   const void              *&context,
01591                                   UErrorCode              &status);
01592 
01593 
01607     virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
01608                                               const void                              *context,
01609                                               UErrorCode                              &status);
01610 
01611 
01622     virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
01623                                               const void                      *&context,
01624                                               UErrorCode                      &status);
01625 
01626 
01632     void setTrace(UBool state);   // Non-virtual.  NOTE(review): presumably sets fTraceDebug -- confirm.
01633 
01634 
01640     static UClassID U_EXPORT2 getStaticClassID();
01641 
01647     virtual UClassID getDynamicClassID() const;
01648 
01649 private:
01650     // Constructors and other object boilerplate are private.
01651     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
01652     RegexMatcher();                  // default constructor not implemented
01653     RegexMatcher(const RegexPattern *pat);
01654     RegexMatcher(const RegexMatcher &other);
01655     RegexMatcher &operator =(const RegexMatcher &rhs);
01656     void init(UErrorCode &status);                      // Common initialization
01657     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
01658 
01659     friend class RegexPattern;
01660     friend class RegexCImpl;
01661 public:
01663     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
01664 private:
01665 
01666     //
01667     //  MatchAt   This is the internal interface to the match engine itself.
01668     //            Match status comes back in matcher member variables.
01669     //
01670     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01671     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
01672     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
01673     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
01674     REStackFrame        *resetStack();
01675     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01676     void                 IncrementTime(UErrorCode &status);
01677     UBool                ReportFindProgress(int64_t matchIndex, UErrorCode &status);
01678     
01679     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01680     
01681     UBool                findUsingChunk();   // The *Chunk* variants below take 32-bit indexes where MatchAt()/isWordBoundary() take int64_t.
01682     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01683     UBool                isChunkWordBoundary(int32_t pos);
01684 
01685     const RegexPattern  *fPattern;
01686     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
01687                                            //   should delete it when through.
01688 
01689     const UnicodeString *fInput;           // The string being matched. Only used for input()
01690     UText               *fInputText;       // The text being matched. Is never NULL.
01691     UText               *fAltInputText;    // A shallow copy of the text being matched.
01692                                            //   Only created if the pattern contains backreferences.
01693     int64_t              fInputLength;     // Full length of the input text.
01694     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
01695     
01696     int64_t              fRegionStart;     // Start of the input region, default = 0.
01697     int64_t              fRegionLimit;     // End of input region, default to input.length.
01698     
01699     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
01700     int64_t              fAnchorLimit;     //   See useAnchoringBounds
01701     
01702     int64_t              fLookStart;       // Region bounds for look-ahead/behind
01703     int64_t              fLookLimit;       //   and other boundary tests.  See
01704                                            //   useTransparentBounds
01705 
01706     int64_t              fActiveStart;     // Currently active bounds for matching.
01707     int64_t              fActiveLimit;     //   Usually is the same as region, but
01708                                            //   is changed to fLookStart/Limit when
01709                                            //   entering look around regions.
01710 
01711     UBool                fTransparentBounds;  // True if using transparent bounds.
01712     UBool                fAnchoringBounds; // True if using anchoring bounds.
01713 
01714     UBool                fMatch;           // True if the last attempted match was successful.
01715     int64_t              fMatchStart;      // Position of the start of the most recent match
01716     int64_t              fMatchEnd;        // First position after the end of the most recent match
01717                                            //   Zero if no previous match, even when a region
01718                                            //   is active.
01719     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
01720                                            //   or -1 if there was no previous match.
01721     int64_t              fAppendPosition;  // First position after the end of the previous
01722                                            //   appendReplacement().  As described by the
01723                                            //   JavaDoc for Java Matcher, where it is called 
01724                                            //   "append position"
01725     UBool                fHitEnd;          // True if the last match touched the end of input.
01726     UBool                fRequireEnd;      // True if the last match required end-of-input
01727                                            //    (matched $ or Z)
01728 
01729     UVector64           *fStack;           // NOTE(review): presumably the backtrack stack that fFrameSize and fStackLimit refer to -- confirm.
01730     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
01731                                            //   which will contain the capture group results.
01732                                            //   NOT valid while match engine is running.
01733 
01734     int64_t             *fData;            // Data area for use by the compiled pattern.
01735     int64_t             fSmallData[8];     //   Use this for data if it's enough.
01736 
01737     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
01738                                            //   match engine run.  Zero for unlimited.
01739     
01740     int32_t             fTime;             // Match time, accumulates while matching.
01741     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
01742                                            //   Kept separately from fTime to keep as much
01743                                            //   code as possible out of the inline
01744                                            //   StateSave function.
01745 
01746     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
01747                                            //   stack, in bytes.  Zero for unlimited.
01748 
01749     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback function.
01750                                            //   NULL if there is no callback.
01751     const void         *fCallbackContext;  // User Context ptr for callback function.
01752 
01753     URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback function.
01754                                                            //   NULL if there is no callback.
01755     const void         *fFindProgressCallbackContext;      // User Context ptr for find progress callback function.
01756 
01757 
01758     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
01759 
01760     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
01761 
01762     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
01763                                            //   reported, or that permanently disables this matcher.
01764 
01765     RuleBasedBreakIterator  *fWordBreakItr;   // NOTE(review): presumably backs isUWordBoundary(), the RBBI based word-boundary test -- confirm.
01766 
01767 
01768 };
01769 
01770 U_NAMESPACE_END
01771 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
01772 #endif
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Defines