1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
|
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
#define COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
#include <stdint.h>
#include <limits>
#include <map>
#include <set>
#include <string>
#include <vector>
#include "base/macros.h"
#include "components/url_matcher/string_pattern.h"
#include "components/url_matcher/url_matcher_export.h"
namespace url_matcher {
// Class that store a set of string patterns and can find for a string S,
// which string patterns occur in S.
class URL_MATCHER_EXPORT SubstringSetMatcher {
public:
SubstringSetMatcher();
~SubstringSetMatcher();
// Registers all |patterns|. The ownership remains with the caller.
// The same pattern cannot be registered twice and each pattern needs to have
// a unique ID.
// Ownership of the patterns remains with the caller.
void RegisterPatterns(const std::vector<const StringPattern*>& patterns);
// Unregisters the passed |patterns|.
void UnregisterPatterns(const std::vector<const StringPattern*>& patterns);
// Analogous to RegisterPatterns and UnregisterPatterns but executes both
// operations in one step, which is cheaper in the execution.
void RegisterAndUnregisterPatterns(
const std::vector<const StringPattern*>& to_register,
const std::vector<const StringPattern*>& to_unregister);
// Matches |text| against all registered StringPatterns. Stores the IDs
// of matching patterns in |matches|. |matches| is not cleared before adding
// to it.
bool Match(const std::string& text,
std::set<StringPattern::ID>* matches) const;
// Returns true if this object retains no allocated data. Only for debugging.
bool IsEmpty() const;
private:
// A node of an Aho Corasick Tree. This is implemented according to
// http://www.cs.uku.fi/~kilpelai/BSA05/lectures/slides04.pdf
//
// The algorithm is based on the idea of building a trie of all registered
// patterns. Each node of the tree is annotated with a set of pattern
// IDs that are used to report matches.
//
// The root of the trie represents an empty match. If we were looking whether
// any registered pattern matches a text at the beginning of the text (i.e.
// whether any pattern is a prefix of the text), we could just follow
// nodes in the trie according to the matching characters in the text.
// E.g., if text == "foobar", we would follow the trie from the root node
// to its child labeled 'f', from there to child 'o', etc. In this process we
// would report all pattern IDs associated with the trie nodes as matches.
//
// As we are not looking for all prefix matches but all substring matches,
// this algorithm would need to compare text.substr(0), text.substr(1), ...
// against the trie, which is in O(|text|^2).
//
// The Aho Corasick algorithm improves this runtime by using failure edges.
// In case we have found a partial match of length k in the text
// (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
// a node at depth k, but cannot find a match in the trie for character
// text[i + k] at depth k + 1, we follow a failure edge. This edge
// corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
// is a prefix of any registered pattern.
//
// If your brain thinks "Forget it, let's go shopping.", don't worry.
// Take a nap and read an introductory text on the Aho Corasick algorithm.
// It will make sense. Eventually.
class AhoCorasickNode {
public:
// Key: label of the edge, value: node index in |tree_| of parent class.
typedef std::map<char, uint32_t> Edges;
typedef std::set<StringPattern::ID> Matches;
static const uint32_t kNoSuchEdge; // Represents an invalid node index.
AhoCorasickNode();
~AhoCorasickNode();
AhoCorasickNode(const AhoCorasickNode& other);
AhoCorasickNode& operator=(const AhoCorasickNode& other);
uint32_t GetEdge(char c) const;
void SetEdge(char c, uint32_t node);
const Edges& edges() const { return edges_; }
uint32_t failure() const { return failure_; }
void set_failure(uint32_t failure) { failure_ = failure; }
void AddMatch(StringPattern::ID id);
void AddMatches(const Matches& matches);
const Matches& matches() const { return matches_; }
private:
// Outgoing edges of current node.
Edges edges_;
// Node index that failure edge leads to.
uint32_t failure_;
// Identifiers of matches.
Matches matches_;
};
typedef std::map<StringPattern::ID, const StringPattern*> SubstringPatternMap;
typedef std::vector<const StringPattern*> SubstringPatternVector;
// |sorted_patterns| is a copy of |patterns_| sorted by the pattern string.
void RebuildAhoCorasickTree(const SubstringPatternVector& sorted_patterns);
// Inserts a path for |pattern->pattern()| into the tree and adds
// |pattern->id()| to the set of matches. Ownership of |pattern| remains with
// the caller.
void InsertPatternIntoAhoCorasickTree(const StringPattern* pattern);
void CreateFailureEdges();
// Set of all registered StringPatterns. Used to regenerate the
// Aho-Corasick tree in case patterns are registered or unregistered.
SubstringPatternMap patterns_;
// The nodes of a Aho-Corasick tree.
std::vector<AhoCorasickNode> tree_;
DISALLOW_COPY_AND_ASSIGN(SubstringSetMatcher);
};
} // namespace url_matcher
#endif // COMPONENTS_URL_MATCHER_SUBSTRING_SET_MATCHER_H_
|