File: url_pattern.h

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (307 lines) | stat: -rw-r--r-- 11,976 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef EXTENSIONS_COMMON_URL_PATTERN_H_
#define EXTENSIONS_COMMON_URL_PATTERN_H_

#include <functional>
#include <iosfwd>
#include <optional>
#include <string>
#include <string_view>
#include <vector>

#include "net/base/registry_controlled_domains/registry_controlled_domain.h"

class GURL;

// A pattern that can be used to match URLs. A URLPattern is a very restricted
// subset of URL syntax:
//
// <url-pattern> := <scheme>://<host><port><path> | '<all_urls>'
// <scheme> := '*' | 'http' | 'https' | 'file' | 'ftp' | 'chrome' |
//             'chrome-extension' | 'filesystem'
// <host> := '*' | <IPv4 address> | [<IPv6 address>] |
//           '*.' <anychar except '/' and '*'>+
// <port> := [':' ('*' | <port number between 0 and 65535>)]
// <path> := '/' <any chars>
//
// * Host is not used when the scheme is 'file'.
// * The path can have embedded '*' characters which act as glob wildcards.
// * '<all_urls>' is a special pattern that matches any valid URL that contains
//   a valid scheme (as specified by valid_schemes_).
// * The '*' scheme pattern excludes file URLs.
//
// Examples of valid patterns:
// - http://*/*
// - http://*/foo*
// - https://*.google.com/foo*bar
// - file://monkey*
// - http://127.0.0.1/*
// - http://[2607:f8b0:4005:805::200e]/*
//
// Examples of invalid patterns:
// - http://* -- path not specified
// - http://*foo/bar -- * not allowed as substring of host component
// - http://foo.*.bar/baz -- * must be first component
// - http:/bar -- scheme separator not found
// - foo://* -- invalid scheme
// - chrome:// -- we don't support chrome internal URLs
class URLPattern {
 public:
  // A collection of scheme bitmasks for use with valid_schemes.
  enum SchemeMasks {
    SCHEME_NONE = 0,
    SCHEME_HTTP = 1 << 0,
    SCHEME_HTTPS = 1 << 1,
    SCHEME_FILE = 1 << 2,
    SCHEME_FTP = 1 << 3,
    SCHEME_CHROMEUI = 1 << 4,
    SCHEME_EXTENSION = 1 << 5,
    SCHEME_FILESYSTEM = 1 << 6,
    SCHEME_WS = 1 << 7,
    SCHEME_WSS = 1 << 8,
    SCHEME_DATA = 1 << 9,
    SCHEME_UUID_IN_PACKAGE = 1 << 10,

    // IMPORTANT!
    // SCHEME_ALL will match every scheme, including chrome://, chrome-
    // extension://, about:, etc. Because this has lots of security
    // implications, third-party extensions should usually not be able to get
    // access to URL patterns initialized this way. If there is a reason
    // for violating this general rule, document why this it safe.
    SCHEME_ALL = -1,
  };

  // Error codes returned from Parse().
  enum class ParseResult {
    kSuccess = 0,
    kMissingSchemeSeparator,
    kInvalidScheme,
    kWrongSchemeSeparator,
    kEmptyHost,
    kInvalidHostWildcard,
    kEmptyPath,
    kInvalidPort,
    kInvalidHost,
    kNumParseResults,
  };

  // The <all_urls> string pattern.
  static const char kAllUrlsPattern[];

  // Returns true if the given `scheme` is considered valid for extensions.
  static bool IsValidSchemeForExtensions(std::string_view scheme);

  // Returns the mask for all schemes considered valid for extensions.
  static int GetValidSchemeMaskForExtensions();

  explicit URLPattern(int valid_schemes);

  // Convenience to construct a URLPattern from a string. If the string is not
  // known ahead of time, use Parse() instead, which returns success or failure.
  // This method will DCHECK if parsing fails.
  URLPattern(int valid_schemes, std::string_view pattern);

  URLPattern();
  URLPattern(const URLPattern& other);
  URLPattern(URLPattern&& other);
  ~URLPattern();

  URLPattern& operator=(const URLPattern& other);
  URLPattern& operator=(URLPattern&& other);

  friend auto operator<=>(const URLPattern& a, const URLPattern& b) {
    return a.GetAsString() <=> b.GetAsString();
  }
  friend bool operator==(const URLPattern& a, const URLPattern& b) {
    return a.GetAsString() == b.GetAsString();
  }

  // Initializes this instance by parsing the provided string. Returns
  // URLPattern::ParseResult::kSuccess on success, or an error code otherwise.
  // On failure, this instance will have some intermediate values and is in an
  // invalid state.
  ParseResult Parse(std::string_view pattern_str);

  // Gets the bitmask of valid schemes.
  int valid_schemes() const { return valid_schemes_; }
  void SetValidSchemes(int valid_schemes);

  // Gets the host the pattern matches. This can be an empty string if the
  // pattern matches all hosts (the input was <scheme>://*/<whatever>).
  const std::string& host() const { return host_; }
  void SetHost(std::string_view host);

  // Gets whether to match subdomains of host().
  bool match_subdomains() const { return match_subdomains_; }
  void SetMatchSubdomains(bool val);

  // Gets the path the pattern matches with the leading slash. This can have
  // embedded asterisks which are interpreted using glob rules.
  const std::string& path() const { return path_; }
  void SetPath(std::string_view path);

  // Returns true if this pattern matches all (valid) urls.
  bool match_all_urls() const { return match_all_urls_; }
  void SetMatchAllURLs(bool val);

  // Sets the scheme for pattern matches. This can be a single '*' if the
  // pattern matches all valid schemes (as defined by the valid_schemes_
  // property). Returns false on failure (if the scheme is not valid).
  bool SetScheme(std::string_view scheme);
  // Note: You should use MatchesScheme() instead of this getter unless you
  // absolutely need the exact scheme. This is exposed for testing.
  const std::string& scheme() const { return scheme_; }

  // Returns true if the specified scheme can be used in this URL pattern, and
  // false otherwise. Uses valid_schemes_ to determine validity.
  bool IsValidScheme(std::string_view scheme) const;

  // Returns true if this instance matches the specified URL. Always returns
  // false for invalid URLs.
  bool MatchesURL(const GURL& test) const;

  // Returns true if this instance matches the specified security origin.
  bool MatchesSecurityOrigin(const GURL& test) const;

  // Returns true if `test` matches our scheme.
  // Note that if test is "filesystem", this may fail whereas MatchesURL
  // may succeed.  MatchesURL is smart enough to look at the inner_url instead
  // of the outer "filesystem:" part.
  bool MatchesScheme(std::string_view test) const;

  // Returns true if `test` matches our host.
  bool MatchesHost(std::string_view test) const;
  bool MatchesHost(const GURL& test) const;

  // Returns true if `test` matches our path.
  bool MatchesPath(std::string_view test) const;

  // Returns true if the pattern matches all patterns in an (e)TLD. This
  // includes patterns like *://*.com/*, *://*.co.uk/*, etc. A pattern that
  // matches all domains (e.g., *://*/*) will return true.
  // `private_filter` specifies whether private registries (like appspot.com)
  // should be considered; if included, patterns like *://*.appspot.com/* will
  // return true. By default, we exclude private registries (so *.appspot.com
  // returns false).
  // Note: This is an expensive method, and should be used sparingly!
  // You should probably use URLPatternSet::ShouldWarnAllHosts(), which is
  // cached.
  bool MatchesEffectiveTld(
      net::registry_controlled_domains::PrivateRegistryFilter private_filter =
          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES,
      net::registry_controlled_domains::UnknownRegistryFilter unknown_filter =
          net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES) const;

  // Returns true if the pattern only matches a single origin. The pattern may
  // include a path.
  bool MatchesSingleOrigin() const;

  // Sets the port. Returns false if the port is invalid.
  bool SetPort(std::string_view port);
  const std::string& port() const { return port_; }

  // Returns a string representing this instance.
  const std::string& GetAsString() const;

  // Determines whether there is a URL that would match this instance and
  // another instance. This method is symmetrical: Calling
  // other.OverlapsWith(this) would result in the same answer.
  bool OverlapsWith(const URLPattern& other) const;

  // Returns true if this pattern matches all possible URLs that `other` can
  // match. For example, http://*.google.com encompasses http://www.google.com.
  bool Contains(const URLPattern& other) const;

  // Creates a new URLPattern that represents the intersection of this
  // URLPattern with the `other`, or std::nullopt if no intersection exists.
  // For instance, given the patterns http://*.google.com/* and
  // *://maps.google.com/*, the intersection is http://maps.google.com/*.
  // NOTES:
  // - Though scheme intersections are supported, the serialization of
  //   URLPatternSet does not record them. Be sure that this is safe for your
  //   use cases.
  // - Path intersection is done on a best-effort basis. If one path clearly
  //   contains another, it will be handled correctly, but this method does not
  //   deal with cases like /*a* and /*b* (where technically the intersection
  //   is /*a*b*|/*b*a*); the intersection returned for that case will be empty.
  std::optional<URLPattern> CreateIntersection(const URLPattern& other) const;

  // Converts this URLPattern into an equivalent set of URLPatterns that don't
  // use a wildcard in the scheme component. If this URLPattern doesn't use a
  // wildcard scheme, then the returned set will contain one element that is
  // equivalent to this instance.
  std::vector<URLPattern> ConvertToExplicitSchemes() const;

  static bool EffectiveHostCompare(const URLPattern& a, const URLPattern& b) {
    if (a.match_all_urls_ && b.match_all_urls_)
      return false;
    return a.host_.compare(b.host_) < 0;
  }

  // Used for origin comparisons in a std::set.
  class EffectiveHostCompareFunctor {
   public:
    bool operator()(const URLPattern& a, const URLPattern& b) const {
      return EffectiveHostCompare(a, b);
    }
  };

  // Get an error string for a ParseResult.
  static const char* GetParseResultString(URLPattern::ParseResult parse_result);

 private:
  // Returns true if any of the `schemes` items matches our scheme.
  bool MatchesAnyScheme(const std::vector<std::string>& schemes) const;

  // Returns true if all of the `schemes` items matches our scheme.
  bool MatchesAllSchemes(const std::vector<std::string>& schemes) const;

  bool MatchesSecurityOriginHelper(const GURL& test) const;

  // Returns true if our port matches the `port` pattern (it may be "*").
  bool MatchesPortPattern(std::string_view port) const;

  // If the URLPattern contains a wildcard scheme, returns a list of
  // equivalent literal schemes, otherwise returns the current scheme.
  std::vector<std::string> GetExplicitSchemes() const;

  // A bitmask containing the schemes which are considered valid for this
  // pattern. Parse() uses this to decide whether a pattern contains a valid
  // scheme.
  int valid_schemes_;

  // True if this is a special-case "<all_urls>" pattern.
  bool match_all_urls_;

  // The scheme for the pattern.
  std::string scheme_;

  // The host without any leading "*" components.
  std::string host_;

  // Whether we should match subdomains of the host. This is true if the first
  // component of the pattern's host was "*".
  bool match_subdomains_;

  // The port.
  std::string port_;

  // The path to match. This is everything after the host of the URL, or
  // everything after the scheme in the case of file:// URLs.
  std::string path_;

  // The path with "?" and "\" characters escaped for use with the
  // MatchPattern() function.
  std::string path_escaped_;

  // A string representing this URLPattern.
  mutable std::string spec_;
};

std::ostream& operator<<(std::ostream& out, const URLPattern& url_pattern);

using URLPatternList = std::vector<URLPattern>;

#endif  // EXTENSIONS_COMMON_URL_PATTERN_H_