File: url_pattern.h

package info (click to toggle)
ada-url 3.4.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,320 kB
  • sloc: cpp: 24,281; ansic: 4,553; python: 573; sh: 193; makefile: 17
file content (418 lines) | stat: -rw-r--r-- 16,039 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
/**
 * @file url_pattern.h
 * @brief URLPattern API implementation.
 *
 * This header provides the URLPattern API as specified by the WHATWG URL
 * Pattern Standard. URLPattern allows matching URLs against patterns with
 * wildcards and named groups, similar to how regular expressions match strings.
 *
 * @see https://urlpattern.spec.whatwg.org/
 * @see https://developer.mozilla.org/en-US/docs/Web/API/URL_Pattern_API
 */
#ifndef ADA_URL_PATTERN_H
#define ADA_URL_PATTERN_H

#include "ada/implementation.h"
#include "ada/expected.h"
#include "ada/parser.h"
#include "ada/url_pattern_init.h"

#include <ostream>
#include <string>
#include <string_view>
#include <unordered_map>
#include <variant>
#include <vector>

#if ADA_TESTING
#include <iostream>
#endif  // ADA_TESTING

#if ADA_INCLUDE_URL_PATTERN
namespace ada {

// Classifies a url_pattern_part.
enum class url_pattern_part_type : uint8_t {
  // Plain literal text.
  FIXED_TEXT = 0,
  // A matching group whose contents are described by a custom regular
  // expression.
  REGEXP = 1,
  // A matching group that consumes code points up to the next separator code
  // point. A named group such as ":foo" without a custom regular expression
  // is the typical case.
  SEGMENT_WILDCARD = 2,
  // A matching group that greedily consumes every code point. The "*"
  // wildcard matching group is the typical case.
  FULL_WILDCARD = 3,
};

// Classification of a compiled component, cached so that common simple
// patterns can be matched on a fast path without running the (expensive)
// regular expression.
enum class url_pattern_component_type : uint8_t {
  // "^$": only the empty string matches.
  EMPTY = 0,
  // "^<literal>$": a plain string comparison is enough, no regex needed.
  EXACT_MATCH = 1,
  // "^(.*)$": the full wildcard, every input matches.
  FULL_WILDCARD = 2,
  // Anything else: the regular expression really has to be evaluated.
  REGEXP = 3,
};

// Repetition modifier attached to a url_pattern_part.
enum class url_pattern_part_modifier : uint8_t {
  // No modifier is present.
  none = 0,
  // Optional modifier, written with the U+003F (?) code point.
  optional = 1,
  // "Zero or more" modifier, written with the U+002A (*) code point.
  zero_or_more = 2,
  // "One or more" modifier, written with the U+002B (+) code point.
  one_or_more = 3,
};

// @see https://urlpattern.spec.whatwg.org/#part
class url_pattern_part {
 public:
  url_pattern_part(url_pattern_part_type _type, std::string&& _value,
                   url_pattern_part_modifier _modifier)
      : type(_type), value(std::move(_value)), modifier(_modifier) {}

  url_pattern_part(url_pattern_part_type _type, std::string&& _value,
                   url_pattern_part_modifier _modifier, std::string&& _name,
                   std::string&& _prefix, std::string&& _suffix)
      : type(_type),
        value(std::move(_value)),
        modifier(_modifier),
        name(std::move(_name)),
        prefix(std::move(_prefix)),
        suffix(std::move(_suffix)) {}
  // A part has an associated type, a string, which must be set upon creation.
  url_pattern_part_type type;
  // A part has an associated value, a string, which must be set upon creation.
  std::string value;
  // A part has an associated modifier a string, which must be set upon
  // creation.
  url_pattern_part_modifier modifier;
  // A part has an associated name, a string, initially the empty string.
  std::string name{};
  // A part has an associated prefix, a string, initially the empty string.
  std::string prefix{};
  // A part has an associated suffix, a string, initially the empty string.
  std::string suffix{};

  inline bool is_regexp() const noexcept;
};

// Options used when compiling a single pattern component.
// @see https://urlpattern.spec.whatwg.org/#options-header
struct url_pattern_compile_component_options {
  // Leaves the delimiter and the prefix code point unset and ignore_case at
  // false.
  url_pattern_compile_component_options() = default;

  // Sets the delimiter code point and, optionally, the prefix code point.
  //
  // Only the prefix keeps a default argument. If the delimiter were defaulted
  // as well, this overload would also be callable with no arguments, which
  // would make every default construction ambiguous with the constructor
  // above.
  explicit url_pattern_compile_component_options(
      std::optional<char> new_delimiter,
      std::optional<char> new_prefix = std::nullopt)
      : delimiter(new_delimiter), prefix(new_prefix) {}

  // Accessors for the two private code points, exposed as string views.
  // Defined out of line; presumably an unset code point yields an empty view
  // -- confirm against the definitions.
  inline std::string_view get_delimiter() const ada_warn_unused;
  inline std::string_view get_prefix() const ada_warn_unused;

  // @see https://urlpattern.spec.whatwg.org/#options-ignore-case
  bool ignore_case = false;

  // Preset instances; their definitions live outside the class body.
  static url_pattern_compile_component_options DEFAULT;
  static url_pattern_compile_component_options HOSTNAME;
  static url_pattern_compile_component_options PATHNAME;

 private:
  // @see https://urlpattern.spec.whatwg.org/#options-delimiter-code-point
  std::optional<char> delimiter{};
  // @see https://urlpattern.spec.whatwg.org/#options-prefix-code-point
  std::optional<char> prefix{};
};

// Default options: neither a delimiter code point nor a prefix code point is
// set (what the specification calls the empty string for both).
inline url_pattern_compile_component_options
    url_pattern_compile_component_options::DEFAULT{std::nullopt, std::nullopt};

// Hostname options: the delimiter code point is "." and no prefix code point
// is set.
inline url_pattern_compile_component_options
    url_pattern_compile_component_options::HOSTNAME{'.', std::nullopt};

// Pathname options: both the delimiter code point and the prefix code point
// are "/".
inline url_pattern_compile_component_options
    url_pattern_compile_component_options::PATHNAME{'/', '/'};

// Matching result for one URL component. Per the URLPattern specification
// (URLPatternComponentResult) it only ever appears as a member of a
// URLPatternResult.
struct url_pattern_component_result {
  // Component input associated with this result.
  std::string input;
  // Group name -> captured value; std::nullopt stands for a group without a
  // value (printed as "undefined" by PrintTo).
  std::unordered_map<std::string, std::optional<std::string>> groups;

  // Equality comparison; defined out of line.
  bool operator==(const url_pattern_component_result&) const;

#if ADA_TESTING
  // Test-only printer: writes the input followed by every (name, value) pair.
  friend void PrintTo(const url_pattern_component_result& result,
                      std::ostream* os) {
    std::ostream& out = *os;
    out << "input: '" << result.input << "', group: ";
    for (const auto& [name, captured] : result.groups) {
      out << "(" << name << ", " << captured.value_or("undefined") << ") ";
    }
  }
#endif  // ADA_TESTING
};

// One compiled component of a URL pattern. It stores the pattern string, the
// compiled regular expression, the list of group names and a cached
// classification (`type`) that lets simple patterns skip regex evaluation.
//
// @tparam regex_provider Regex implementation; must satisfy
//         url_pattern_regex::regex_concept.
template <url_pattern_regex::regex_concept regex_provider>
class url_pattern_component {
 public:
  url_pattern_component() = default;

  // This function explicitly takes a std::string because it is moved.
  // To avoid unnecessary copy, move each value while calling the constructor.
  // `new_exact_match_value` is only meaningful for EXACT_MATCH components and
  // defaults to an empty string.
  url_pattern_component(std::string&& new_pattern,
                        typename regex_provider::regex_type&& new_regexp,
                        std::vector<std::string>&& new_group_name_list,
                        bool new_has_regexp_groups,
                        url_pattern_component_type new_type,
                        std::string&& new_exact_match_value = {})
      : regexp(std::move(new_regexp)),
        pattern(std::move(new_pattern)),
        group_name_list(std::move(new_group_name_list)),
        exact_match_value(std::move(new_exact_match_value)),
        has_regexp_groups(new_has_regexp_groups),
        type(new_type) {}

  // Compiles `input` into a component, or returns an error.
  // @see https://urlpattern.spec.whatwg.org/#compile-a-component
  template <url_pattern_encoding_callback F>
  static tl::expected<url_pattern_component, errors> compile(
      std::string_view input, F& encoding_callback,
      url_pattern_compile_component_options& options);

  // Builds the component result from the input and the regex execution result.
  // @see https://urlpattern.spec.whatwg.org/#create-a-component-match-result
  url_pattern_component_result create_component_match_result(
      std::string&& input,
      std::vector<std::optional<std::string>>&& exec_result);

  // Fast path test that returns true/false without constructing result groups.
  // Uses cached pattern type to skip regex evaluation for simple patterns.
  bool fast_test(std::string_view input) const noexcept;

  // Fast path match that returns capture groups without regex for simple
  // patterns. Returns nullopt if pattern doesn't match, otherwise returns
  // capture groups.
  std::optional<std::vector<std::optional<std::string>>> fast_match(
      std::string_view input) const;

#if ADA_TESTING
  // Test-only printer. The fields are separated by ", " so that the value of
  // has_regexp_groups does not run into the following label.
  friend void PrintTo(const url_pattern_component& component,
                      std::ostream* os) {
    *os << "pattern: '" << component.pattern
        << "', has_regexp_groups: " << component.has_regexp_groups
        << ", group_name_list: ";
    for (const auto& name : component.group_name_list) {
      *os << name << ", ";
    }
  }
#endif  // ADA_TESTING

  // Compiled regular expression of the component.
  typename regex_provider::regex_type regexp{};
  // Pattern string of the component.
  std::string pattern{};
  // Names of the groups of the component.
  std::vector<std::string> group_name_list{};
  // For EXACT_MATCH type: the literal string to compare against
  std::string exact_match_value{};
  // Whether the component has regexp groups.
  bool has_regexp_groups = false;
  // Cached pattern type for fast-path optimization
  url_pattern_component_type type = url_pattern_component_type::REGEXP;
};

// A URLPattern input can be either a string or a URLPatternInit object.
// If it is a string, it must be a valid UTF-8 string.
// The string alternative is a std::string_view, i.e. a non-owning view: the
// characters it refers to have to outlive every use of the variant.
using url_pattern_input = std::variant<std::string_view, url_pattern_init>;

// Matching results for a whole URL: the list of inputs plus one
// url_pattern_component_result per URL component. The URLPatternResult API is
// defined as part of the URLPattern specification.
struct url_pattern_result {
  // Inputs associated with the match.
  std::vector<url_pattern_input> inputs{};

  // One result per URL component.
  url_pattern_component_result protocol{};
  url_pattern_component_result username{};
  url_pattern_component_result password{};
  url_pattern_component_result hostname{};
  url_pattern_component_result port{};
  url_pattern_component_result pathname{};
  url_pattern_component_result search{};
  url_pattern_component_result hash{};
};

// Options for a url_pattern.
struct url_pattern_options {
  // Case-insensitivity switch; false unless set by the caller.
  bool ignore_case = false;

#if ADA_TESTING
  // Test-only printer. The value is wrapped in a matching pair of single
  // quotes; it is streamed as a plain bool, so it shows up as 0 or 1 unless
  // the stream has boolalpha set.
  friend void PrintTo(const url_pattern_options& options, std::ostream* os) {
    *os << "ignore_case: '" << options.ignore_case << "'";
  }
#endif  // ADA_TESTING
};

/**
 * @brief URL pattern matching class implementing the URLPattern API.
 *
 * URLPattern provides a way to match URLs against patterns with wildcards
 * and named capture groups. It's useful for routing, URL-based dispatching,
 * and URL validation.
 *
 * Pattern syntax supports:
 * - Literal text matching
 * - Named groups: `:name` (matches up to the next separator)
 * - Wildcards: `*` (matches everything)
 * - Custom regex: `(pattern)`
 * - Optional segments: `:name?`
 * - Repeated segments: `:name+`, `:name*`
 *
 * @tparam regex_provider The regex implementation to use for pattern matching.
 *         Must satisfy the url_pattern_regex::regex_concept.
 *
 * @note All string inputs must be valid UTF-8.
 *
 * @see https://urlpattern.spec.whatwg.org/
 */
template <url_pattern_regex::regex_concept regex_provider>
class url_pattern {
 public:
  url_pattern() = default;

  /**
   * If non-null, base_url must point to a valid UTF-8 string.
   * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-exec
   */
  result<std::optional<url_pattern_result>> exec(
      const url_pattern_input& input,
      const std::string_view* base_url = nullptr);

  /**
   * If non-null, base_url must point to a valid UTF-8 string.
   * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-test
   */
  result<bool> test(const url_pattern_input& input,
                    const std::string_view* base_url = nullptr);

  /**
   * @see https://urlpattern.spec.whatwg.org/#url-pattern-match
   * This function expects a valid UTF-8 string if input is a string.
   * If non-null, base_url_string must point to a valid UTF-8 string.
   */
  result<std::optional<url_pattern_result>> match(
      const url_pattern_input& input,
      const std::string_view* base_url_string = nullptr);

  // The getters below return views bound to the lifetime of this object
  // (ada_lifetime_bound); do not keep them past the pattern's lifetime.

  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-protocol
  [[nodiscard]] std::string_view get_protocol() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-username
  [[nodiscard]] std::string_view get_username() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-password
  [[nodiscard]] std::string_view get_password() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hostname
  [[nodiscard]] std::string_view get_hostname() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-port
  [[nodiscard]] std::string_view get_port() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-pathname
  [[nodiscard]] std::string_view get_pathname() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-search
  [[nodiscard]] std::string_view get_search() const ada_lifetime_bound;
  // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hash
  [[nodiscard]] std::string_view get_hash() const ada_lifetime_bound;

  // If ignoreCase is true, the JavaScript regular expression created for each
  // pattern must use the `vi` flag. Otherwise, they must use the `v` flag.
  [[nodiscard]] bool ignore_case() const;

  // @see https://urlpattern.spec.whatwg.org/#url-pattern-has-regexp-groups
  [[nodiscard]] bool has_regexp_groups() const;

  // Helper to test all components at once. Returns true if all match.
  [[nodiscard]] bool test_components(
      std::string_view protocol, std::string_view username,
      std::string_view password, std::string_view hostname,
      std::string_view port, std::string_view pathname, std::string_view search,
      std::string_view hash) const;

#if ADA_TESTING
  // Test-only printer. Every component value is wrapped in a matching pair of
  // single quotes and the entries are separated by ", ".
  friend void PrintTo(const url_pattern& c, std::ostream* os) {
    *os << "protocol_component: '" << c.get_protocol() << "', ";
    *os << "username_component: '" << c.get_username() << "', ";
    *os << "password_component: '" << c.get_password() << "', ";
    *os << "hostname_component: '" << c.get_hostname() << "', ";
    *os << "port_component: '" << c.get_port() << "', ";
    *os << "pathname_component: '" << c.get_pathname() << "', ";
    *os << "search_component: '" << c.get_search() << "', ";
    *os << "hash_component: '" << c.get_hash() << "'";
  }
#endif  // ADA_TESTING

  // The pattern parser is a friend of every url_pattern specialization.
  template <url_pattern_regex::regex_concept P>
  friend tl::expected<url_pattern<P>, errors> parser::parse_url_pattern_impl(
      std::variant<std::string_view, url_pattern_init>&& input,
      const std::string_view* base_url, const url_pattern_options* options);

  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> protocol_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> username_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> password_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> hostname_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> port_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> pathname_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> search_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  url_pattern_component<regex_provider> hash_component{};
  /**
   * @private
   * We can not make this private due to a LLVM bug.
   * Ref: https://github.com/ada-url/ada/pull/859
   */
  bool ignore_case_ = false;
};
}  // namespace ada
#endif  // ADA_INCLUDE_URL_PATTERN
#endif