File: regex_set_matcher.h

package info (click to toggle)
chromium 139.0.7258.127-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,122,156 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (79 lines) | stat: -rw-r--r-- 2,589 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_
#define COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_

#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>

#include "base/substring_set_matcher/matcher_string_pattern.h"
#include "base/substring_set_matcher/substring_set_matcher.h"
#include "components/url_matcher/url_matcher_export.h"

// Forward declaration only: this header refers to re2::FilteredRE2 solely
// through a std::unique_ptr member, so no RE2 header is included here.
namespace re2 {
class FilteredRE2;
}  // namespace re2

namespace url_matcher {

// Efficiently matches URLs against a collection of regular expressions,
// using FilteredRE2 to reduce the number of regexes that must be matched
// by pre-filtering with substring matching. See:
// http://swtch.com/~rsc/regexp/regexp3.html#analysis
class URL_MATCHER_EXPORT RegexSetMatcher {
 public:
  RegexSetMatcher();
  virtual ~RegexSetMatcher();

  // Adds the regex patterns in |regex_list| to the matcher. Also rebuilds
  // the FilteredRE2 matcher; thus, for efficiency, prefer adding multiple
  // patterns at once.
  // Ownership of the patterns remains with the caller.
  void AddPatterns(
      const std::vector<const base::MatcherStringPattern*>& regex_list);

  // Removes all regex patterns.
  void ClearPatterns();

  // Appends the IDs of regular expressions in our set that match the |text|
  // to |matches|.
  bool Match(const std::string& text,
             std::set<base::MatcherStringPattern::ID>* matches) const;

  bool IsEmpty() const;

 private:
  typedef int RE2ID;
  typedef std::map<base::MatcherStringPattern::ID,
                   const base::MatcherStringPattern*>
      RegexMap;
  typedef std::vector<base::MatcherStringPattern::ID> RE2IDMap;

  // Use Aho-Corasick SubstringSetMatcher to find which literal patterns
  // match the |text|.
  std::vector<RE2ID> FindSubstringMatches(const std::string& text) const;

  // Rebuild FilteredRE2 from scratch. Needs to be called whenever
  // our set of regexes changes.
  // TODO(yoz): investigate if it could be done incrementally;
  // apparently not supported by FilteredRE2.
  void RebuildMatcher();

  // Mapping of regex MatcherStringPattern::IDs to regexes.
  RegexMap regexes_;
  // Mapping of RE2IDs from FilteredRE2 (which are assigned in order)
  // to regex MatcherStringPattern::IDs.
  RE2IDMap re2_id_map_;

  std::unique_ptr<re2::FilteredRE2> filtered_re2_;
  std::unique_ptr<base::SubstringSetMatcher> substring_matcher_;
};

}  // namespace url_matcher

#endif  // COMPONENTS_URL_MATCHER_REGEX_SET_MATCHER_H_