1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
|
//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a POSIX regular expression matcher.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/Regex.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include <string>
// Important: this must come last because it defines "_REGEX_H_". At least on
// Darwin, if it is included before any header that (transitively) includes
// xlocale.h, it causes trouble because of missing regex-related types.
#include "regex_impl.h"
using namespace llvm;
/// Default-construct a Regex that holds no compiled pattern: there is no
/// underlying llvm_regex object and the stored error code is REG_BADPAT.
Regex::Regex() {
  preg = nullptr;
  error = REG_BADPAT;
}
/// Compile the pattern \p regex into a freshly allocated llvm_regex object.
///
/// \p Flags is a bitmask of the Regex flag enumerators: IgnoreCase selects
/// REG_ICASE, Newline selects REG_NEWLINE, and REG_EXTENDED is used unless
/// BasicRegex is set. The result code of llvm_regcomp is stored in `error`.
Regex::Regex(StringRef regex, unsigned Flags) {
  // Translate the public flag bits into their llvm_regcomp counterparts.
  // REG_PEND is always passed so that the end of the pattern is taken from
  // re_endp (set below) rather than from a terminating NUL.
  unsigned CompileFlags = REG_PEND;
  CompileFlags |= (Flags & IgnoreCase) ? REG_ICASE : 0;
  CompileFlags |= (Flags & Newline) ? REG_NEWLINE : 0;
  CompileFlags |= (Flags & BasicRegex) ? 0 : REG_EXTENDED;

  preg = new llvm_regex();
  preg->re_endp = regex.end();

  error = llvm_regcomp(preg, regex.data(), CompileFlags);
}
/// Move constructor: take over the compiled pattern and error code of
/// \p regex.
///
/// The source is left with a null `preg` and error code REG_BADPAT, so its
/// destructor will not free the llvm_regex object that this instance now
/// owns.
Regex::Regex(Regex &&regex) {
  preg = regex.preg;
  error = regex.error;
  // Detach the source from the state we just adopted.
  regex.preg = nullptr;
  regex.error = REG_BADPAT;
}
/// Release the compiled pattern, if this object holds one.
Regex::~Regex() {
  // Nothing to release when no llvm_regex object is attached.
  if (!preg)
    return;

  llvm_regfree(preg);
  delete preg;
}
bool Regex::isValid(std::string &Error) const {
if (!error)
return true;
size_t len = llvm_regerror(error, preg, nullptr, 0);
Error.resize(len - 1);
llvm_regerror(error, preg, &Error[0], len);
return false;
}
/// getNumMatches - Return the number of parenthesized matches the regex
/// contains, as recorded in the compiled pattern's re_nsub field.
///
/// Only meaningful for a valid regex: `preg` is dereferenced unconditionally.
unsigned Regex::getNumMatches() const {
  const unsigned NumGroups = preg->re_nsub;
  return NumGroups;
}
/// Search \p String for the first match of this regex.
///
/// \param String  the text to search; it does not need to be NUL-terminated
///        because the range is passed to llvm_regexec via REG_STARTEND.
/// \param Matches if non-null, cleared and refilled on a successful match with
///        one entry per group: index 0 is the whole match, followed by one
///        entry for each parenthesized subexpression. Groups that did not
///        participate are represented by an empty StringRef. It is not
///        modified when there is no match.
/// \returns true if a match was found. Returns false if there was no match,
///          if the regex is in an error state, or if llvm_regexec failed; in
///          the last case the failure code is stored in `error`, so later
///          calls also return false.
bool Regex::match(StringRef String, SmallVectorImpl<StringRef> *Matches){
  if (error)
    return false;

  // A default-constructed StringRef has a null data pointer. Replace it with
  // an empty, non-null string so that neither llvm_regexec nor the pointer
  // arithmetic used to build the match results ever sees a null pointer.
  if (String.data() == nullptr)
    String = StringRef("");

  unsigned nmatch = Matches ? preg->re_nsub+1 : 0;

  // pmatch needs to have at least one element.
  SmallVector<llvm_regmatch_t, 8> pm;
  pm.resize(nmatch > 0 ? nmatch : 1);
  // With REG_STARTEND, the first element carries the [start, end) offsets of
  // the text to search on input.
  pm[0].rm_so = 0;
  pm[0].rm_eo = String.size();

  int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);

  if (rc == REG_NOMATCH)
    return false;
  if (rc != 0) {
    // regexec can fail due to invalid pattern or running out of memory.
    error = rc;
    return false;
  }

  // There was a match.

  if (Matches) { // match position requested
    Matches->clear();

    for (unsigned i = 0; i != nmatch; ++i) {
      if (pm[i].rm_so == -1) {
        // this group didn't match
        Matches->push_back(StringRef());
        continue;
      }
      assert(pm[i].rm_eo >= pm[i].rm_so);
      Matches->push_back(StringRef(String.data()+pm[i].rm_so,
                                   pm[i].rm_eo-pm[i].rm_so));
    }
  }

  return true;
}
/// Replace the first match of this regex in \p String with \p Repl and return
/// the resulting string.
///
/// \p Repl may contain backslash escapes: "\t" and "\n" produce a tab and a
/// newline, a backslash followed by decimal digits inserts the text of that
/// match group (0 is the whole match), and any other escaped character stands
/// for itself. If match() reports no match, \p String is returned as is.
///
/// \param Error if non-null, it is emptied on entry and then set to a
///        description of the first problem found in \p Repl (a trailing
///        backslash or an invalid backreference).
std::string Regex::sub(StringRef Repl, StringRef String,
                       std::string *Error) {
  // Start from a clean error string, if the caller supplied one.
  if (Error && !Error->empty())
    Error->clear();

  SmallVector<StringRef, 8> Groups;
  if (!match(String, &Groups))
    return String;

  const StringRef Whole = Groups[0];

  // Everything in front of the match is copied verbatim.
  std::string Out(String.begin(), Whole.begin());

  // Expand the replacement string one backslash escape at a time.
  while (!Repl.empty()) {
    std::pair<StringRef, StringRef> Parts = Repl.split('\\');
    const StringRef Literal = Parts.first;
    const StringRef AfterSlash = Parts.second;

    // Text up to the next backslash needs no interpretation.
    Out += Literal;

    if (AfterSlash.empty()) {
      // Either the replacement is exhausted or it ended in a lone backslash;
      // only in the latter case is the literal shorter than what was left.
      const bool TrailingBackslash = Repl.size() != Literal.size();
      if (TrailingBackslash && Error && Error->empty())
        *Error = "replacement string contained trailing backslash";
      break;
    }

    // Continue right after the backslash and interpret the escape.
    Repl = AfterSlash;
    const char Esc = Repl[0];

    // FIXME: We should have a StringExtras function for mapping C99 escapes.
    if (Esc < '0' || Esc > '9') {
      // Single-character escapes; anything unrecognized quotes itself.
      if (Esc == 't')
        Out += '\t';
      else if (Esc == 'n')
        Out += '\n';
      else
        Out += Esc;
      Repl = Repl.substr(1);
      continue;
    }

    // A run of decimal digits is a backreference to a match group.
    const StringRef Digits =
        Repl.slice(0, Repl.find_first_not_of("0123456789"));
    Repl = Repl.substr(Digits.size());

    unsigned GroupIdx;
    const bool Parsed = !Digits.getAsInteger(10, GroupIdx);
    if (Parsed && GroupIdx < Groups.size())
      Out += Groups[GroupIdx];
    else if (Error && Error->empty())
      *Error = ("invalid backreference string '" + Twine(Digits) + "'").str();
  }

  // Finally append whatever follows the match in the input.
  Out += StringRef(Whole.end(), String.end() - Whole.end());
  return Out;
}
// These are the special characters matched in functions like "p_ere_exp".
// The list is shared by Regex::isLiteralERE() and Regex::escape(), which use
// it to detect and to backslash-escape metacharacters respectively.
static const char RegexMetachars[] = "()^$|*+?.[]\\{}";
/// Return true if \p Str contains none of the characters listed in
/// RegexMetachars.
bool Regex::isLiteralERE(StringRef Str) {
  // The metacharacter list was derived from our regex implementation in
  // regcomp.c and double checked against the POSIX extended regular
  // expression specification.
  const auto FirstMeta = Str.find_first_of(RegexMetachars);
  const bool HasNoMetachars = (FirstMeta == StringRef::npos);
  return HasNoMetachars;
}
/// Return a copy of \p String in which every character listed in
/// RegexMetachars is preceded by a backslash.
///
/// Note: strchr also matches the terminating NUL of RegexMetachars, so an
/// embedded '\0' in \p String is prefixed with a backslash as well.
std::string Regex::escape(StringRef String) {
  std::string Escaped;
  for (const char C : String) {
    if (strchr(RegexMetachars, C) != nullptr)
      Escaped += '\\';
    Escaped += C;
  }
  return Escaped;
}
|