File: parse.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (424 lines) | stat: -rw-r--r-- 16,677 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
// Copyright 2020 The Chromium Authors
// Copyright 2014 Blake Embrey (hello@blakeembrey.com)
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.

#include "third_party/liburlpattern/parse.h"

#include <string_view>
#include <unordered_set>

#include "third_party/abseil-cpp/absl/base/macros.h"
#include "third_party/abseil-cpp/absl/status/statusor.h"
#include "third_party/abseil-cpp/absl/strings/str_format.h"
#include "third_party/liburlpattern/pattern.h"
#include "third_party/liburlpattern/tokenize.h"
#include "third_party/liburlpattern/utils.h"

// The following code is a translation from the path-to-regexp typescript at:
//
//  https://github.com/pillarjs/path-to-regexp/blob/125c43e6481f68cc771a5af22b914acdb8c5ba1f/src/index.ts#L126-L232

namespace liburlpattern {

namespace {

// Helper class that tracks the parser state.
class State {
 public:
  State(std::vector<Token> token_list,
        EncodeCallback encode_callback,
        Options options)
      : token_list_(std::move(token_list)),
        encode_callback_(std::move(encode_callback)),
        options_(std::move(options)),
        segment_wildcard_regex_(
            absl::StrFormat("[^%s]+?",
                            EscapeRegexpString(options_.delimiter_list))) {}

  // Return true if there are more tokens to process.
  bool HasMoreTokens() const { return index_ < token_list_.size(); }

  // Attempt to consume the next Token, but only if it matches the given
  // |type|.  Returns a pointer to the Token on success or nullptr on failure.
  const Token* TryConsume(TokenType type) {
    ABSL_ASSERT(index_ < token_list_.size());
    TokenType next_type = token_list_[index_].type;
    if (next_type != type)
      return nullptr;

    // The last token should always be kEnd.
    if ((index_ + 1) == token_list_.size())
      ABSL_ASSERT(token_list_[index_].type == TokenType::kEnd);

    return &(token_list_[index_++]);
  }

  // Consume the next Token requiring it to be the given |type|.  If this
  // is not possible then return an error.
  absl::StatusOr<const Token*> MustConsume(TokenType type) {
    ABSL_ASSERT(index_ < token_list_.size());
    if (const Token* token = TryConsume(type))
      return token;
    return absl::InvalidArgumentError(absl::StrFormat(
        "Unexpected %s '%s' at index %d, expected %s.",
        TokenTypeToString(token_list_[index_].type), token_list_[index_].value,
        token_list_[index_].index, TokenTypeToString(type)));
  }

  const Token* TryConsumeModifier() {
    const Token* result = TryConsume(TokenType::kOtherModifier);
    if (!result)
      result = TryConsume(TokenType::kAsterisk);
    return result;
  }

  // Consume as many sequential kChar and kEscapedChar Tokens as possible
  // appending them together into a single string value.
  std::string ConsumeText() {
    // Unfortunately we cannot use a view here and must copy into a new
    // string.  This is necessary to flatten escape sequences into
    // a single value with other characters.
    std::string result;
    const Token* token = nullptr;
    do {
      token = TryConsume(TokenType::kChar);
      if (!token)
        token = TryConsume(TokenType::kEscapedChar);
      if (token)
        result.append(token->value.data(), token->value.size());
    } while (token);
    return result;
  }

  // Append the given Token value to the pending fixed value.  This will
  // be converted to a kFixed Part when we reach the end of a run of
  // kChar and kEscapedChar tokens.
  void AppendToPendingFixedValue(std::string_view token_value) {
    pending_fixed_value_.append(token_value.data(), token_value.size());
  }

  // Convert the pending fixed value, if any, to a kFixed Part.  Has no effect
  // if there is no pending value.
  absl::Status MaybeAddPartFromPendingFixedValue() {
    if (pending_fixed_value_.empty())
      return absl::OkStatus();

    auto encoded_result = encode_callback_(std::move(pending_fixed_value_));
    if (!encoded_result.has_value()) {
      return encoded_result.error();
    }

    part_list_.emplace_back(PartType::kFixed, std::move(encoded_result.value()),
                            Modifier::kNone);
    pending_fixed_value_ = "";
    return absl::OkStatus();
  }

  // Add a Part for the given set of tokens.
  absl::Status AddPart(std::string prefix,
                       const Token* name_token,
                       const Token* regex_or_wildcard_token,
                       std::string suffix,
                       const Token* modifier_token) {
    // Convert the modifier Token into a Modifier enum value.
    Modifier modifier = Modifier::kNone;
    if (modifier_token) {
      ABSL_ASSERT(!modifier_token->value.empty());
      switch (modifier_token->value[0]) {
        case '?':
          modifier = Modifier::kOptional;
          break;
        case '*':
          modifier = Modifier::kZeroOrMore;
          break;
        case '+':
          modifier = Modifier::kOneOrMore;
          break;
        default:
          ABSL_ASSERT(false);
          break;
      }
    }

    // If this is a `{ ... }` grouping containing only fixed text, then
    // just add it to our pending value for now.  We want to collect as
    // much fixed text as possible in the buffer before commiting it to
    // a kFixed Part.
    if (!name_token && !regex_or_wildcard_token &&
        modifier == Modifier::kNone) {
      AppendToPendingFixedValue(prefix);
      return absl::OkStatus();
    }

    // We are about to add some kind of matching group Part to the list.
    // Before doing that make sure to flush any pending fixed test to a
    // kFixed Part.
    absl::Status status = MaybeAddPartFromPendingFixedValue();
    if (!status.ok())
      return status;

    // If there is no name, regex, or wildcard tokens then this is just a fixed
    // string grouping; e.g. "{foo}?".  The fixed string ends up in the prefix
    // value since it consumed the entire text of the grouping.  If the prefix
    // value is empty then its an empty "{}" group and we return without adding
    // any Part.
    if (!name_token && !regex_or_wildcard_token) {
      ABSL_ASSERT(suffix.empty());
      if (prefix.empty())
        return absl::OkStatus();
      auto result = encode_callback_(std::move(prefix));
      if (!result.has_value()) {
        return result.error();
      }
      part_list_.emplace_back(PartType::kFixed, *result, modifier);
      return absl::OkStatus();
    }

    // Determine the regex value.  If there is a |kRegex| Token, then this is
    // explicitly set by that Token.  If there is a wildcard token, then this
    // is set to the |kFullWildcardRegex| constant.  Otherwise a kName Token by
    // itself gets an implicit regex value that matches through to the end of
    // the segment. This is represented by the |segment_wildcard_regex_| value.
    std::string regex_value;
    if (!regex_or_wildcard_token)
      regex_value = segment_wildcard_regex_;
    else if (regex_or_wildcard_token->type == TokenType::kAsterisk)
      regex_value = kFullWildcardRegex;
    else
      regex_value = std::string(regex_or_wildcard_token->value);

    // Next determine the type of the Part.  This depends on the regex value
    // since we give certain values special treatment with their own type.
    // A |segment_wildcard_regex_| is mapped to the kSegmentWildcard type.  A
    // |kFullWildcardRegex| is mapped to the kFullWildcard type.  Otherwise
    // the Part gets the kRegex type.
    PartType type = PartType::kRegex;
    if (regex_value == segment_wildcard_regex_) {
      type = PartType::kSegmentWildcard;
      regex_value = "";
    } else if (regex_value == kFullWildcardRegex) {
      type = PartType::kFullWildcard;
      regex_value = "";
    }

    // Every kRegex, kSegmentWildcard, and kFullWildcard Part must have a
    // group name.  If there was a kName Token, then use the explicitly
    // set name.  Otherwise we generate a numeric based key for the name.
    std::string name;
    if (name_token)
      name = std::string(name_token->value);
    else if (regex_or_wildcard_token)
      name = GenerateKey();

    auto name_set_result = name_set_.insert(name);
    if (!name_set_result.second) {
      return absl::InvalidArgumentError(
          absl::StrFormat("Duplicate group name '%s' at index %d.", name,
                          token_list_[index_].index));
    }

    auto prefix_result = encode_callback_(std::move(prefix));
    if (!prefix_result.has_value()) {
      return prefix_result.error();
    }

    auto suffix_result = encode_callback_(std::move(suffix));
    if (!suffix_result.has_value()) {
      return suffix_result.error();
    }

    // Finally add the part to the list.  We encode the prefix and suffix, but
    // must be careful not to encode the regex value since it can change the
    // meaning of the regular expression.
    part_list_.emplace_back(type, std::move(name), *prefix_result,
                            std::move(regex_value), *suffix_result, modifier);
    return absl::OkStatus();
  }

  Pattern TakeAsPattern() {
    return Pattern(std::move(part_list_), std::move(options_),
                   std::move(segment_wildcard_regex_));
  }

 private:
  // Generate a numeric key string to be used for groups that do not
  // have an explicit kName Token.
  std::string GenerateKey() { return absl::StrFormat("%d", next_key_++); }

  // The input list of Token objects to process.
  const std::vector<Token> token_list_;

  EncodeCallback encode_callback_;

  // The set of options used to parse and construct this Pattern.  This
  // controls the behavior of things like kSegmentWildcard parts, etc.
  Options options_;

  // The special regex value corresponding to the default regex value
  // given to a lone kName Token.  This is a variable since its value
  // is dependent on the |delimiter_list| passed to the constructor.
  const std::string segment_wildcard_regex_;

  // The output list of Pattern Part objects.
  std::vector<Part> part_list_;

  // Tracks which names have been seen before so we can error on duplicates.
  std::unordered_set<std::string> name_set_;

  // A buffer of kChar and kEscapedChar values that are pending the creation
  // of a kFixed Part.
  std::string pending_fixed_value_;

  // The index of the next Token in |token_list_|.
  size_t index_ = 0;

  // The next value to use when generating a numeric based name for Parts
  // without explicit kName Tokens.
  int next_key_ = 0;
};

}  // namespace

base::expected<Pattern, absl::Status> Parse(std::string_view pattern,
                                            EncodeCallback encode_callback,
                                            const Options& options) {
  auto result = Tokenize(pattern);
  if (!result.has_value()) {
    return base::unexpected(result.error());
  }

  State state(std::move(result.value()), std::move(encode_callback), options);

  while (state.HasMoreTokens()) {
    // Look for the sequence: <prefix char><name><regex><modifier>
    // There could be from zero to all through of these tokens.  For
    // example:
    //  * "/:foo(bar)?" - all four tokens
    //  * "/" - just a char token
    //  * ":foo" - just a name token
    //  * "(bar)" - just a regex token
    //  * "/:foo" - char and name tokens
    //  * "/(bar)" - char and regex tokens
    //  * "/:foo?" - char, name, and modifier tokens
    //  * "/(bar)?" - char, regex, and modifier tokens
    const Token* char_token = state.TryConsume(TokenType::kChar);
    const Token* name_token = state.TryConsume(TokenType::kName);
    const Token* regex_or_wildcard_token = state.TryConsume(TokenType::kRegex);

    // If there is no name or regex token, then we may have a wildcard `*`
    // token in place of an unnamed regex token.  Each wildcard will be
    // treated as being equivalent to a "(.*)" regex token.  For example:
    //  * "/*" - equivalent to "/(.*)"
    //  * "/*?" - equivalent to "/(.*)?"
    if (!name_token && !regex_or_wildcard_token)
      regex_or_wildcard_token = state.TryConsume(TokenType::kAsterisk);

    // If there is a name, regex, or wildcard token then we need to add a
    // Pattern Part immediately.
    if (name_token || regex_or_wildcard_token) {
      // Determine if the char token is a valid prefix.  Only characters in the
      // configured prefix_list are automatically treated as prefixes.  A
      // kEscapedChar Token is never treated as a prefix.
      std::string_view prefix = char_token ? char_token->value : "";
      if (options.prefix_list.find(prefix.data(), /*pos=*/0, prefix.size()) ==
          std::string::npos) {
        // This is not a prefix character.  Add it to the buffered characters
        // to be added as a kFixed Part later.
        state.AppendToPendingFixedValue(prefix);
        prefix = std::string_view();
      }

      // If we have any buffered characters in a pending fixed value, then
      // convert them into a kFixed Part now.
      absl::Status status = state.MaybeAddPartFromPendingFixedValue();
      if (!status.ok())
        return base::unexpected(status);

      // kName and kRegex tokens can optionally be followed by a modifier.
      const Token* modifier_token = state.TryConsumeModifier();

      // Add the Part for the name and regex/wildcard tokens.
      status = state.AddPart(std::string(prefix), name_token,
                             regex_or_wildcard_token,
                             /*suffix=*/"", modifier_token);
      if (!status.ok())
        return base::unexpected(status);
      continue;
    }

    // There was neither a kRegex or kName token, so consider if we just have a
    // fixed string part.  A fixed string can consist of kChar or kEscapedChar
    // tokens.  These just get added to the buffered pending fixed value for
    // now. It will get converted to a kFixed Part later.
    const Token* fixed_token = char_token;
    if (!fixed_token)
      fixed_token = state.TryConsume(TokenType::kEscapedChar);
    if (fixed_token) {
      state.AppendToPendingFixedValue(fixed_token->value);
      continue;
    }

    // There was not a kChar or kEscapedChar token, so we no we are at the end
    // of any fixed string.  Do not yet convert the pending fixed value into
    // a kFixedPart, though.  Its possible there will be further fixed text in
    // a `{ ... }` group, etc.

    // Look for the sequence:
    //
    //  <open><char prefix><name><regex><char suffix><close><modifier>
    //
    // The open and close are required, but the other tokens are optional.
    // For example:
    //  * "{a:foo(.*)b}?" - all tokens present
    //  * "{:foo}?" - just name and modifier tokens
    //  * "{(.*)}?" - just regex and modifier tokens
    //  * "{ab}?" - just char and modifier tokens
    const Token* open_token = state.TryConsume(TokenType::kOpen);
    if (open_token) {
      std::string prefix = state.ConsumeText();
      const Token* name_token = state.TryConsume(TokenType::kName);
      const Token* regex_or_wildcard_token =
          state.TryConsume(TokenType::kRegex);

      // If there is no name or regex token, then we may have a wildcard `*`
      // token in place of an unnamed regex token.  Each wildcard will be
      // treated as being equivalent to a "(.*)" regex token.  For example,
      // "{a*b}" is equivalent to "{a(.*)b}".
      if (!name_token && !regex_or_wildcard_token)
        regex_or_wildcard_token = state.TryConsume(TokenType::kAsterisk);

      std::string suffix = state.ConsumeText();

      auto result = state.MustConsume(TokenType::kClose);
      if (!result.ok())
        return base::unexpected(result.status());

      const Token* modifier_token = state.TryConsumeModifier();

      absl::Status status =
          state.AddPart(std::move(prefix), name_token, regex_or_wildcard_token,
                        std::move(suffix), modifier_token);
      if (!status.ok())
        return base::unexpected(status);
      continue;
    }

    // We are about to end the pattern string, so flush any pending text to
    // a kFixed Part.
    absl::Status status = state.MaybeAddPartFromPendingFixedValue();
    if (!status.ok())
      return base::unexpected(status);

    // We didn't find any tokens allowed by the syntax, so we should be
    // at the end of the token list.  If there is a syntax error, this
    // is where it will typically be caught.
    auto result = state.MustConsume(TokenType::kEnd);
    if (!result.ok())
      return base::unexpected(result.status());
  }

  return state.TakeAsPattern();
}

}  // namespace liburlpattern