1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424
|
// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.
#include "third_party/liburlpattern/parse.h"
#include <string_view>
#include <unordered_set>
#include "third_party/abseil-cpp/absl/base/macros.h"
#include "third_party/abseil-cpp/absl/status/statusor.h"
#include "third_party/abseil-cpp/absl/strings/str_format.h"
#include "third_party/liburlpattern/pattern.h"
#include "third_party/liburlpattern/tokenize.h"
#include "third_party/liburlpattern/utils.h"
// The following code is a translation from the path-to-regexp typescript at:
//
// https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L126-L232
namespace liburlpattern {
namespace {
// Helper class that tracks the parser state.
class State {
public:
State(std::vector<Token> token_list,
EncodeCallback encode_callback,
Options options)
: token_list_(std::move(token_list)),
encode_callback_(std::move(encode_callback)),
options_(std::move(options)),
segment_wildcard_regex_(
absl::StrFormat("[^%s]+?",
EscapeRegexpString(options_.delimiter_list))) {}
// Return true if there are more tokens to process.
bool HasMoreTokens() const { return index_ < token_list_.size(); }
// Attempt to consume the next Token, but only if it matches the given
// |type|. Returns a pointer to the Token on success or nullptr on failure.
const Token* TryConsume(TokenType type) {
ABSL_ASSERT(index_ < token_list_.size());
TokenType next_type = token_list_[index_].type;
if (next_type != type)
return nullptr;
// The last token should always be kEnd.
if ((index_ + 1) == token_list_.size())
ABSL_ASSERT(token_list_[index_].type == TokenType::kEnd);
return &(token_list_[index_++]);
}
// Consume the next Token requiring it to be the given |type|. If this
// is not possible then return an error.
absl::StatusOr<const Token*> MustConsume(TokenType type) {
ABSL_ASSERT(index_ < token_list_.size());
if (const Token* token = TryConsume(type))
return token;
return absl::InvalidArgumentError(absl::StrFormat(
"Unexpected %s '%s' at index %d, expected %s.",
TokenTypeToString(token_list_[index_].type), token_list_[index_].value,
token_list_[index_].index, TokenTypeToString(type)));
}
const Token* TryConsumeModifier() {
const Token* result = TryConsume(TokenType::kOtherModifier);
if (!result)
result = TryConsume(TokenType::kAsterisk);
return result;
}
// Consume as many sequential kChar and kEscapedChar Tokens as possible
// appending them together into a single string value.
std::string ConsumeText() {
// Unfortunately we cannot use a view here and must copy into a new
// string. This is necessary to flatten escape sequences into
// a single value with other characters.
std::string result;
const Token* token = nullptr;
do {
token = TryConsume(TokenType::kChar);
if (!token)
token = TryConsume(TokenType::kEscapedChar);
if (token)
result.append(token->value.data(), token->value.size());
} while (token);
return result;
}
// Append the given Token value to the pending fixed value. This will
// be converted to a kFixed Part when we reach the end of a run of
// kChar and kEscapedChar tokens.
void AppendToPendingFixedValue(std::string_view token_value) {
pending_fixed_value_.append(token_value.data(), token_value.size());
}
// Convert the pending fixed value, if any, to a kFixed Part. Has no effect
// if there is no pending value.
absl::Status MaybeAddPartFromPendingFixedValue() {
if (pending_fixed_value_.empty())
return absl::OkStatus();
auto encoded_result = encode_callback_(std::move(pending_fixed_value_));
if (!encoded_result.has_value()) {
return encoded_result.error();
}
part_list_.emplace_back(PartType::kFixed, std::move(encoded_result.value()),
Modifier::kNone);
pending_fixed_value_ = "";
return absl::OkStatus();
}
// Add a Part for the given set of tokens.
absl::Status AddPart(std::string prefix,
const Token* name_token,
const Token* regex_or_wildcard_token,
std::string suffix,
const Token* modifier_token) {
// Convert the modifier Token into a Modifier enum value.
Modifier modifier = Modifier::kNone;
if (modifier_token) {
ABSL_ASSERT(!modifier_token->value.empty());
switch (modifier_token->value[0]) {
case '?':
modifier = Modifier::kOptional;
break;
case '*':
modifier = Modifier::kZeroOrMore;
break;
case '+':
modifier = Modifier::kOneOrMore;
break;
default:
ABSL_ASSERT(false);
break;
}
}
// If this is a `{ ... }` grouping containing only fixed text, then
// just add it to our pending value for now. We want to collect as
// much fixed text as possible in the buffer before commiting it to
// a kFixed Part.
if (!name_token && !regex_or_wildcard_token &&
modifier == Modifier::kNone) {
AppendToPendingFixedValue(prefix);
return absl::OkStatus();
}
// We are about to add some kind of matching group Part to the list.
// Before doing that make sure to flush any pending fixed test to a
// kFixed Part.
absl::Status status = MaybeAddPartFromPendingFixedValue();
if (!status.ok())
return status;
// If there is no name, regex, or wildcard tokens then this is just a fixed
// string grouping; e.g. "{foo}?". The fixed string ends up in the prefix
// value since it consumed the entire text of the grouping. If the prefix
// value is empty then its an empty "{}" group and we return without adding
// any Part.
if (!name_token && !regex_or_wildcard_token) {
ABSL_ASSERT(suffix.empty());
if (prefix.empty())
return absl::OkStatus();
auto result = encode_callback_(std::move(prefix));
if (!result.has_value()) {
return result.error();
}
part_list_.emplace_back(PartType::kFixed, *result, modifier);
return absl::OkStatus();
}
// Determine the regex value. If there is a |kRegex| Token, then this is
// explicitly set by that Token. If there is a wildcard token, then this
// is set to the |kFullWildcardRegex| constant. Otherwise a kName Token by
// itself gets an implicit regex value that matches through to the end of
// the segment. This is represented by the |segment_wildcard_regex_| value.
std::string regex_value;
if (!regex_or_wildcard_token)
regex_value = segment_wildcard_regex_;
else if (regex_or_wildcard_token->type == TokenType::kAsterisk)
regex_value = kFullWildcardRegex;
else
regex_value = std::string(regex_or_wildcard_token->value);
// Next determine the type of the Part. This depends on the regex value
// since we give certain values special treatment with their own type.
// A |segment_wildcard_regex_| is mapped to the kSegmentWildcard type. A
// |kFullWildcardRegex| is mapped to the kFullWildcard type. Otherwise
// the Part gets the kRegex type.
PartType type = PartType::kRegex;
if (regex_value == segment_wildcard_regex_) {
type = PartType::kSegmentWildcard;
regex_value = "";
} else if (regex_value == kFullWildcardRegex) {
type = PartType::kFullWildcard;
regex_value = "";
}
// Every kRegex, kSegmentWildcard, and kFullWildcard Part must have a
// group name. If there was a kName Token, then use the explicitly
// set name. Otherwise we generate a numeric based key for the name.
std::string name;
if (name_token)
name = std::string(name_token->value);
else if (regex_or_wildcard_token)
name = GenerateKey();
auto name_set_result = name_set_.insert(name);
if (!name_set_result.second) {
return absl::InvalidArgumentError(
absl::StrFormat("Duplicate group name '%s' at index %d.", name,
token_list_[index_].index));
}
auto prefix_result = encode_callback_(std::move(prefix));
if (!prefix_result.has_value()) {
return prefix_result.error();
}
auto suffix_result = encode_callback_(std::move(suffix));
if (!suffix_result.has_value()) {
return suffix_result.error();
}
// Finally add the part to the list. We encode the prefix and suffix, but
// must be careful not to encode the regex value since it can change the
// meaning of the regular expression.
part_list_.emplace_back(type, std::move(name), *prefix_result,
std::move(regex_value), *suffix_result, modifier);
return absl::OkStatus();
}
Pattern TakeAsPattern() {
return Pattern(std::move(part_list_), std::move(options_),
std::move(segment_wildcard_regex_));
}
private:
// Generate a numeric key string to be used for groups that do not
// have an explicit kName Token.
std::string GenerateKey() { return absl::StrFormat("%d", next_key_++); }
// The input list of Token objects to process.
const std::vector<Token> token_list_;
EncodeCallback encode_callback_;
// The set of options used to parse and construct this Pattern. This
// controls the behavior of things like kSegmentWildcard parts, etc.
Options options_;
// The special regex value corresponding to the default regex value
// given to a lone kName Token. This is a variable since its value
// is dependent on the |delimiter_list| passed to the constructor.
const std::string segment_wildcard_regex_;
// The output list of Pattern Part objects.
std::vector<Part> part_list_;
// Tracks which names have been seen before so we can error on duplicates.
std::unordered_set<std::string> name_set_;
// A buffer of kChar and kEscapedChar values that are pending the creation
// of a kFixed Part.
std::string pending_fixed_value_;
// The index of the next Token in |token_list_|.
size_t index_ = 0;
// The next value to use when generating a numeric based name for Parts
// without explicit kName Tokens.
int next_key_ = 0;
};
} // namespace
base::expected<Pattern, absl::Status> Parse(std::string_view pattern,
EncodeCallback encode_callback,
const Options& options) {
auto result = Tokenize(pattern);
if (!result.has_value()) {
return base::unexpected(result.error());
}
State state(std::move(result.value()), std::move(encode_callback), options);
while (state.HasMoreTokens()) {
// Look for the sequence: <prefix char><name><regex><modifier>
// There could be from zero to all through of these tokens. For
// example:
// * "/:foo(bar)?" - all four tokens
// * "/" - just a char token
// * ":foo" - just a name token
// * "(bar)" - just a regex token
// * "/:foo" - char and name tokens
// * "/(bar)" - char and regex tokens
// * "/:foo?" - char, name, and modifier tokens
// * "/(bar)?" - char, regex, and modifier tokens
const Token* char_token = state.TryConsume(TokenType::kChar);
const Token* name_token = state.TryConsume(TokenType::kName);
const Token* regex_or_wildcard_token = state.TryConsume(TokenType::kRegex);
// If there is no name or regex token, then we may have a wildcard `*`
// token in place of an unnamed regex token. Each wildcard will be
// treated as being equivalent to a "(.*)" regex token. For example:
// * "/*" - equivalent to "/(.*)"
// * "/*?" - equivalent to "/(.*)?"
if (!name_token && !regex_or_wildcard_token)
regex_or_wildcard_token = state.TryConsume(TokenType::kAsterisk);
// If there is a name, regex, or wildcard token then we need to add a
// Pattern Part immediately.
if (name_token || regex_or_wildcard_token) {
// Determine if the char token is a valid prefix. Only characters in the
// configured prefix_list are automatically treated as prefixes. A
// kEscapedChar Token is never treated as a prefix.
std::string_view prefix = char_token ? char_token->value : "";
if (options.prefix_list.find(prefix.data(), /*pos=*/0, prefix.size()) ==
std::string::npos) {
// This is not a prefix character. Add it to the buffered characters
// to be added as a kFixed Part later.
state.AppendToPendingFixedValue(prefix);
prefix = std::string_view();
}
// If we have any buffered characters in a pending fixed value, then
// convert them into a kFixed Part now.
absl::Status status = state.MaybeAddPartFromPendingFixedValue();
if (!status.ok())
return base::unexpected(status);
// kName and kRegex tokens can optionally be followed by a modifier.
const Token* modifier_token = state.TryConsumeModifier();
// Add the Part for the name and regex/wildcard tokens.
status = state.AddPart(std::string(prefix), name_token,
regex_or_wildcard_token,
/*suffix=*/"", modifier_token);
if (!status.ok())
return base::unexpected(status);
continue;
}
// There was neither a kRegex or kName token, so consider if we just have a
// fixed string part. A fixed string can consist of kChar or kEscapedChar
// tokens. These just get added to the buffered pending fixed value for
// now. It will get converted to a kFixed Part later.
const Token* fixed_token = char_token;
if (!fixed_token)
fixed_token = state.TryConsume(TokenType::kEscapedChar);
if (fixed_token) {
state.AppendToPendingFixedValue(fixed_token->value);
continue;
}
// There was not a kChar or kEscapedChar token, so we no we are at the end
// of any fixed string. Do not yet convert the pending fixed value into
// a kFixedPart, though. Its possible there will be further fixed text in
// a `{ ... }` group, etc.
// Look for the sequence:
//
// <open><char prefix><name><regex><char suffix><close><modifier>
//
// The open and close are required, but the other tokens are optional.
// For example:
// * "{a:foo(.*)b}?" - all tokens present
// * "{:foo}?" - just name and modifier tokens
// * "{(.*)}?" - just regex and modifier tokens
// * "{ab}?" - just char and modifier tokens
const Token* open_token = state.TryConsume(TokenType::kOpen);
if (open_token) {
std::string prefix = state.ConsumeText();
const Token* name_token = state.TryConsume(TokenType::kName);
const Token* regex_or_wildcard_token =
state.TryConsume(TokenType::kRegex);
// If there is no name or regex token, then we may have a wildcard `*`
// token in place of an unnamed regex token. Each wildcard will be
// treated as being equivalent to a "(.*)" regex token. For example,
// "{a*b}" is equivalent to "{a(.*)b}".
if (!name_token && !regex_or_wildcard_token)
regex_or_wildcard_token = state.TryConsume(TokenType::kAsterisk);
std::string suffix = state.ConsumeText();
auto result = state.MustConsume(TokenType::kClose);
if (!result.ok())
return base::unexpected(result.status());
const Token* modifier_token = state.TryConsumeModifier();
absl::Status status =
state.AddPart(std::move(prefix), name_token, regex_or_wildcard_token,
std::move(suffix), modifier_token);
if (!status.ok())
return base::unexpected(status);
continue;
}
// We are about to end the pattern string, so flush any pending text to
// a kFixed Part.
absl::Status status = state.MaybeAddPartFromPendingFixedValue();
if (!status.ok())
return base::unexpected(status);
// We didn't find any tokens allowed by the syntax, so we should be
// at the end of the token list. If there is a syntax error, this
// is where it will typically be caught.
auto result = state.MustConsume(TokenType::kEnd);
if (!result.ok())
return base::unexpected(result.status());
}
return state.TakeAsPattern();
}
} // namespace liburlpattern
|