1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
/*
* Copyright (C) 2005-2018 Team Kodi
* This file is part of Kodi - https://kodi.tv
*
* SPDX-License-Identifier: GPL-2.0-or-later
* See LICENSES/README.md for more information.
*/
#pragma once
//! @todo - move to std::regex (after switching to gcc 4.9 or higher) and get rid of CRegExp
#include <string>
#include <vector>
/* make sure stdlib.h is included before including pcre.h inside the
namespace; this works around stdlib.h definitions also living in
the PCRE namespace */
#include <stdlib.h>
/* The PCRE C declarations are pulled into their own namespace, so the rest of
   this header refers to them as PCRE::pcre, PCRE::pcre_extra and
   PCRE::pcre_jit_stack. */
namespace PCRE {
// Declared here so that PCRE::pcre_jit_stack can be named even when the
// included pcre.h does not provide the JIT declarations itself.
struct real_pcre_jit_stack; // forward declaration for PCRE without JIT
typedef struct real_pcre_jit_stack pcre_jit_stack;
#include <pcre.h>
}
/**
* Regular expression object built on the PCRE library types declared in the
* PCRE namespace above. An instance keeps the pattern text, the PCRE handles
* (m_re, m_sd, m_jitStack), and the state of the most recent search (subject
* string, offset vector, match count), which the Get* accessors read.
*
* Only the small inline members are defined in this header; everything else
* is defined out of line.
*/
class CRegExp
{
public:
/** How much up-front analysis is requested when a pattern is compiled. */
enum studyMode
{
NoStudy = 0, // do not study expression
StudyRegExp = 1, // study expression (slower compilation, faster find)
StudyWithJitComp // study expression and JIT-compile it, if possible (heavyweight optimization)
};
/** Selects how UTF-8 processing is chosen for patterns and subject strings. */
enum utf8Mode
{
autoUtf8 = -1, // analyze regexp for UTF-8 multi-byte chars, for Unicode codes > 0xFF
// or explicit Unicode properties (\p, \P and \X), enable UTF-8 mode if any of them are found
asciiOnly = 0, // process regexp and strings as single-byte encoded strings
forceUtf8 = 1 // enable UTF-8 mode (with Unicode properties)
};
// Used below to size the offset vector: OVECCOUNT = (this + 1) * 3.
static const int m_MaxNumOfBackrefrences = 20;
/**
* Create a CRegExp object without compiling any expression
* @param caseless (optional) Matching will be case insensitive if set to true
* or case sensitive if set to false
* @param utf8 (optional) Control UTF-8 processing
*/
CRegExp(bool caseless = false, utf8Mode utf8 = asciiOnly);
/**
* Create new CRegExp object and compile regexp expression in one step
* @warning Use only with hardcoded regexp when you're sure that regexp is compiled without errors
* @param caseless Matching will be case insensitive if set to true
* or case sensitive if set to false
* @param utf8 Control UTF-8 processing
* @param re The regular expression
* @param study (optional) Controls study of expression, useful if expression will be used
* several times
*/
CRegExp(bool caseless, utf8Mode utf8, const char *re, studyMode study = NoStudy);
// Copy construction and destruction are user-provided (defined out of line);
// the matching copy assignment operator is declared further down.
CRegExp(const CRegExp& re);
~CRegExp();
/**
* Compile (prepare) regular expression
* @param re The regular expression
* @param study (optional) Controls study of expression, useful if expression will be used
* several times
* @return true on success, false on any error
*/
bool RegComp(const char *re, studyMode study = NoStudy);
/**
* Compile (prepare) regular expression.
* Convenience overload: forwards re.c_str() to the const char* overload.
* @param re The regular expression
* @param study (optional) Controls study of expression, useful if expression will be used
* several times
* @return true on success, false on any error
*/
bool RegComp(const std::string& re, studyMode study = NoStudy)
{ return RegComp(re.c_str(), study); }
/**
* Find first match of regular expression in given string
* @param str The string to match against regular expression
* @param startoffset (optional) The string offset to start matching
* @param maxNumberOfCharsToTest (optional) The maximum number of characters to test (match) in
* string. If set to -1 string checked up to the end.
* @return starting position of match in string, negative value in case of error or no match
*/
int RegFind(const char* str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1);
/**
* Find first match of regular expression in given string.
* Forwards to PrivateRegFind, passing str.length() as the buffer length.
* @param str The string to match against regular expression
* @param startoffset (optional) The string offset to start matching
* @param maxNumberOfCharsToTest (optional) The maximum number of characters to test (match) in
* string. If set to -1 string checked up to the end.
* @return starting position of match in string, negative value in case of error or no match
*/
int RegFind(const std::string& str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1)
{ return PrivateRegFind(str.length(), str.c_str(), startoffset, maxNumberOfCharsToTest); }
// NOTE(review): presumably expands references to captured groups inside
// sReplaceExp using the last match - confirm against the implementation.
std::string GetReplaceString(const std::string& sReplaceExp) const;
/**
* Get the length of the last found match
* @return 0 if m_re is null or m_bMatched is false, otherwise the difference
* between the second and the first entry of the offset vector
*/
int GetFindLen() const
{
if (!m_re || !m_bMatched)
return 0;
// NOTE(review): assumes m_iOvector follows the PCRE convention where
// entries 0 and 1 are the start and end offsets of the whole match.
return (m_iOvector[1] - m_iOvector[0]);
};
int GetSubCount() const { return m_iMatchCount - 1; } // PCRE returns the number of sub-patterns + 1
// Sub-pattern accessors, each available by number (iSub) or by name
// (subName). Defined out of line.
int GetSubStart(int iSub) const;
int GetSubStart(const std::string& subName) const;
int GetSubLength(int iSub) const;
int GetSubLength(const std::string& subName) const;
int GetCaptureTotal() const;
std::string GetMatch(int iSub = 0) const;
std::string GetMatch(const std::string& subName) const;
// Returns the stored pattern string (m_pattern) by const reference.
const std::string& GetPattern() const { return m_pattern; }
bool GetNamedSubPattern(const char* strName, std::string& strMatch) const;
int GetNamedSubPatternNumber(const char* strName) const;
void DumpOvector(int iLog);
/**
* Check if the RegExp object is ready for matching.
* Implemented as a test that the stored pattern string is not empty.
* @return true if RegExp object is ready for matching, false otherwise
*/
inline bool IsCompiled(void) const
{ return !m_pattern.empty(); }
CRegExp& operator= (const CRegExp& re);
// Static capability queries; defined out of line.
// NOTE(review): presumably backed by the static m_*Supported members
// declared at the end of the class - confirm.
static bool IsUtf8Supported(void);
static bool AreUnicodePropertiesSupported(void);
static bool LogCheckUtf8Support(void);
static bool IsJitSupported(void);
private:
// Shared search routine; the std::string overload of RegFind calls it with
// the subject's length and data pointer.
int PrivateRegFind(size_t bufferLen, const char *str, unsigned int startoffset = 0, int maxNumberOfCharsToTest = -1);
void InitValues(bool caseless = false, CRegExp::utf8Mode utf8 = asciiOnly);
// Static helpers that inspect a pattern string; going by the autoUtf8
// description above they presumably implement that analysis - confirm.
static bool requireUtf8(const std::string& regexp);
static int readCharXCode(const std::string& regexp, size_t& pos);
static bool isCharClassWithUnicode(const std::string& regexp, size_t& pos);
void Cleanup();
inline bool IsValidSubNumber(int iSub) const;
// PCRE handles; m_re is null-checked in GetFindLen() before match data is used.
PCRE::pcre* m_re;
PCRE::pcre_extra* m_sd;
// Three ints for each of (m_MaxNumOfBackrefrences + 1) entries, i.e. 63.
// NOTE(review): matches PCRE's requirement that the offset vector size be a
// multiple of three, with the extra entry for the whole match - confirm.
static const int OVECCOUNT=(m_MaxNumOfBackrefrences + 1) * 3;
unsigned int m_offset;
int m_iOvector[OVECCOUNT];
utf8Mode m_utf8Mode;
int m_iMatchCount; // GetSubCount() reports this value minus one
int m_iOptions;
bool m_jitCompiled;
bool m_bMatched; // GetFindLen() returns 0 while this is false
PCRE::pcre_jit_stack* m_jitStack;
std::string m_subject;
std::string m_pattern; // non-empty is what IsCompiled() reports as "compiled"
// Class-wide values shared by all instances.
// NOTE(review): stored as int rather than bool, so possibly tri-state
// (e.g. "not checked yet") - confirm in the implementation.
static int m_Utf8Supported;
static int m_UcpSupported;
static int m_JitSupported;
};
// Convenience alias for a sequence of regular expression objects.
using VECCREGEXP = std::vector<CRegExp>;
|