File: regex_rules_matcher.cc

package info (click to toggle)
chromium 139.0.7258.127-1
links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (414 lines) | stat: -rw-r--r-- 15,919 bytes
parent folder | download | duplicates (3)
// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "extensions/browser/api/declarative_net_request/regex_rules_matcher.h"

#include <algorithm>
#include <optional>

#include "base/containers/contains.h"
#include "base/logging.h"
#include "base/notreached.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "components/url_pattern_index/url_pattern_index.h"
#include "extensions/browser/api/declarative_net_request/request_action.h"
#include "extensions/browser/api/declarative_net_request/request_params.h"
#include "extensions/browser/api/declarative_net_request/utils.h"

namespace extensions::declarative_net_request {
namespace flat_rule = url_pattern_index::flat;

namespace {

// Returns true if `regex_list` holds at least one rule whose action type is
// `modify_headers`. `regex_list` must not be null.
bool IsExtraHeadersMatcherInternal(
    const RegexRulesMatcher::RegexRulesList* regex_list) {
  DCHECK(regex_list);

  // Only `modify_headers` is looked for below. The assert fires when the
  // number of action types changes so that this check gets revisited.
  static_assert(flat::ActionType_count == 6,
                "Modify this method to ensure IsExtraHeadersMatcherInternal is "
                "updated as new actions are added.");

  for (const auto* regex_rule : *regex_list) {
    if (regex_rule->action_type() == flat::ActionType_modify_headers) {
      return true;
    }
  }
  return false;
}

// Returns true when everything about `rule` other than its regex is compatible
// with the request described by `params`. Three checks run in order and the
// first failing one short-circuits the rest:
//   1. rule flags (`element_type`, `method`, `is_third_party` and
//      `embedder_conditions_matcher`),
//   2. included/excluded request domains,
//   3. included/excluded initiator domains.
bool DoesRuleMetadataMatchRequest(const flat_rule::UrlRule& rule,
                                  const RequestParams& params) {
  const bool flags_match = url_pattern_index::DoesRuleFlagsMatch(
      rule, params.element_type, flat_rule::ActivationType_NONE, params.method,
      params.is_third_party, params.embedder_conditions_matcher);

  return flags_match &&
         url_pattern_index::DoesURLMatchRequestDomainList(*params.url, rule) &&
         url_pattern_index::DoesOriginMatchInitiatorDomainList(
             params.first_party_origin, rule);
}

// Tells whether several actions of `action_type` may be matched for a single
// request:
// - true: other actions of this type can still match the same request.
// - false: once an action of this type is matched, no other action is matched
//   for that request.
bool ActionTypeAllowsMultipleActions(flat::ActionType action_type) {
  switch (action_type) {
    case flat::ActionType_modify_headers:
      return true;

    case flat::ActionType_block:
    case flat::ActionType_allow:
    case flat::ActionType_redirect:
    case flat::ActionType_upgrade_scheme:
    case flat::ActionType_allow_all_requests:
      return false;

    case flat::ActionType_count:
      // `ActionType_count` is a sentinel, not a real action.
      NOTREACHED();
  }

  // Fallback for a value that matches none of the enumerators above.
  return true;
}

}  // namespace

// Pairs an indexed regex rule with its compiled regular expression. Both
// pointers are stored as-is (no copy of the pointees is made here) and are
// expected to be non-null.
RegexRuleInfo::RegexRuleInfo(const flat::RegexRule* regex_rule,
                             const re2::RE2* regex)
    : regex_rule(regex_rule), regex(regex) {
  // The parameter names shadow the members; these checks look at the
  // arguments that were just stored.
  DCHECK(regex_rule);
  DCHECK(regex);
}
// Copying uses the compiler-generated member-wise copy operations.
RegexRuleInfo::RegexRuleInfo(const RegexRuleInfo& info) = default;
RegexRuleInfo& RegexRuleInfo::operator=(const RegexRuleInfo& info) = default;

// Builds a matcher over two regex rule lists, one per matching stage.
//
// `before_request_regex_list` backs the kOnBeforeRequest helper and
// `headers_received_regex_list` backs the kOnHeadersReceived helper. Neither
// list may be null: the former is DCHECKed in IsExtraHeadersMatcherInternal()
// and the latter is dereferenced unconditionally below. `metadata_list` is
// stored and later used to build redirect and modify-headers actions.
RegexRulesMatcher::RegexRulesMatcher(
    const ExtensionId& extension_id,
    RulesetID ruleset_id,
    const RegexRulesList* before_request_regex_list,
    const RegexRulesList* headers_received_regex_list,
    const ExtensionMetadataList* metadata_list)
    : RulesetMatcherBase(extension_id, ruleset_id),
      before_request_matcher_(before_request_regex_list,
                              this,
                              RulesetMatchingStage::kOnBeforeRequest),
      headers_received_matcher_(headers_received_regex_list,
                                this,
                                RulesetMatchingStage::kOnHeadersReceived),
      metadata_list_(metadata_list),
      // True if the before-request list contains a `modify_headers` rule, or
      // if the headers-received list contains any rule at all.
      // See comments for this field in extension_url_pattern_index_matcher.cc
      // for why different checks are used for `before_request_regex_list` and
      // `headers_received_regex_list`.
      is_extra_headers_matcher_(
          IsExtraHeadersMatcherInternal(before_request_regex_list) ||
          headers_received_regex_list->size() > 0) {}

// Defaulted destructor, defined out of line in this file.
RegexRulesMatcher::~RegexRulesMatcher() = default;

// Returns the flag computed once in the constructor: true if the
// before-request rules include a `modify_headers` rule or if there is at least
// one headers-received rule.
bool RegexRulesMatcher::IsExtraHeadersMatcher() const {
  return is_extra_headers_matcher_;
}

// Returns the total number of regex rules across both matching stages.
size_t RegexRulesMatcher::GetRulesCount() const {
  const size_t before_request_rules = GetBeforeRequestRulesCount();
  const size_t headers_received_rules = GetHeadersReceivedRulesCount();
  return before_request_rules + headers_received_rules;
}

// Returns the number of rules in the list backing the kOnBeforeRequest helper.
size_t RegexRulesMatcher::GetBeforeRequestRulesCount() const {
  return before_request_matcher_.GetRulesCount();
}

// Returns the number of rules in the list backing the kOnHeadersReceived
// helper.
size_t RegexRulesMatcher::GetHeadersReceivedRulesCount() const {
  return headers_received_matcher_.GetRulesCount();
}

// Collects every `modify_headers` rule for `stage` whose regex matches the
// request URL and turns the collected rules into actions using
// `metadata_list_`. When `min_priority` is set, only rules whose priority is
// strictly greater than it are considered. Rules are visited in the order
// returned by GetPotentialMatches().
std::vector<RequestAction> RegexRulesMatcher::GetModifyHeadersActions(
    const RequestParams& params,
    RulesetMatchingStage stage,
    std::optional<uint64_t> min_priority) const {
  std::vector<const flat_rule::UrlRule*> matched_rules;

  for (const RegexRuleInfo& candidate :
       GetMatcherForStage(stage).GetPotentialMatches(params)) {
    const flat_rule::UrlRule* url_rule = candidate.regex_rule->url_rule();

    // The priority filter only applies when a minimum was supplied.
    if (min_priority && url_rule->priority() <= *min_priority) {
      continue;
    }

    if (candidate.regex_rule->action_type() !=
        flat::ActionType_modify_headers) {
      continue;
    }

    // The regex evaluation is left for last, after the cheap field checks.
    if (!re2::RE2::PartialMatch(params.url->spec(), *candidate.regex)) {
      continue;
    }

    matched_rules.push_back(url_rule);
  }

  return GetModifyHeadersActionsFromMetadata(params, matched_rules,
                                             *metadata_list_);
}

std::optional<RequestAction> RegexRulesMatcher::GetAllowAllRequestsAction(
    const RequestParams& params,
    RulesetMatchingStage stage) const {
  const std::vector<RegexRuleInfo>& potential_matches =
      GetMatcherForStage(stage).GetPotentialMatches(params);
  auto info = std::ranges::find_if(
      potential_matches, [&params](const RegexRuleInfo& info) {
        return info.regex_rule->action_type() ==
                   flat::ActionType_allow_all_requests &&
               re2::RE2::PartialMatch(params.url->spec(), *info.regex);
      });
  if (info == potential_matches.end()) {
    return std::nullopt;
  }

  return CreateAllowAllRequestsAction(params, *info->regex_rule->url_rule());
}

std::optional<RequestAction> RegexRulesMatcher::GetActionIgnoringAncestors(
    const RequestParams& params,
    RulesetMatchingStage stage) const {
  const std::vector<RegexRuleInfo>& potential_matches =
      GetMatcherForStage(stage).GetPotentialMatches(params);
  auto info = std::ranges::find_if(
      potential_matches, [&params](const RegexRuleInfo& info) {
        return !ActionTypeAllowsMultipleActions(
                   info.regex_rule->action_type()) &&
               re2::RE2::PartialMatch(params.url->spec(), *info.regex);
      });

  return info == potential_matches.end() ? std::nullopt
                                         : CreateActionFromInfo(params, *info);
}

// Stores `regex_list`, builds the key (`parent_matcher`, `stage`) under which
// this helper caches its results in RequestParams, and eagerly sets up the
// regex and substring matchers via InitializeMatcher().
RegexRulesMatcher::MatchHelper::MatchHelper(
    const raw_ptr<const RegexRulesList> regex_list,
    const RegexRulesMatcher* parent_matcher,
    RulesetMatchingStage stage)
    : regex_list_(regex_list), regex_match_key_(parent_matcher, stage) {
  InitializeMatcher();
}

// Defaulted destructor, defined out of line in this file.
RegexRulesMatcher::MatchHelper::~MatchHelper() = default;

// Returns the number of entries in `regex_list_`. This counts every indexed
// rule, including any whose regex was skipped in InitializeMatcher() because
// it failed to compile.
size_t RegexRulesMatcher::MatchHelper::GetRulesCount() const {
  return regex_list_->size();
}

const std::vector<RegexRuleInfo>&
RegexRulesMatcher::MatchHelper::GetPotentialMatches(
    const RequestParams& params) const {
  auto iter = params.potential_regex_matches.find(regex_match_key_);
  if (iter != params.potential_regex_matches.end()) {
    return iter->second;
  }

  // Early out if this is an empty matcher.
  if (IsEmpty()) {
    auto result = params.potential_regex_matches.insert(
        std::make_pair(regex_match_key_, std::vector<RegexRuleInfo>()));
    return result.first->second;
  }

  // Compute the potential matches. FilteredRE2 requires the text to be lower
  // cased first.
  if (!params.lower_cased_url_spec) {
    params.lower_cased_url_spec = base::ToLowerASCII(params.url->spec());
  }

  // To pre-filter the set of regexes to match against `params`, we first need
  // to compute the set of candidate strings tracked by `substring_matcher_`
  // within `params.lower_cased_url_spec`.
  std::set<base::MatcherStringPattern::ID> candidate_ids_set;
  DCHECK(substring_matcher_);
  substring_matcher_->Match(*params.lower_cased_url_spec, &candidate_ids_set);
  std::vector<int> candidate_ids_list(candidate_ids_set.begin(),
                                      candidate_ids_set.end());

  // FilteredRE2 then yields the set of potential regex matches.
  std::vector<int> potential_re2_ids;
  filtered_re2_.AllPotentials(candidate_ids_list, &potential_re2_ids);

  // We prune the set of potential matches even further by matching request
  // metadata.
  std::vector<RegexRuleInfo> potential_matches;
  for (int re2_id : potential_re2_ids) {
    auto it = re2_id_to_rules_map_.find(re2_id);
    CHECK(it != re2_id_to_rules_map_.end());

    const flat::RegexRule* rule = it->second;
    if (!DoesRuleMetadataMatchRequest(*rule->url_rule(), params)) {
      continue;
    }

    const RE2& regex = filtered_re2_.GetRE2(re2_id);
    potential_matches.emplace_back(rule, &regex);
  }

  // Sort potential matches in descending order of priority.
  std::sort(potential_matches.begin(), potential_matches.end(),
            [](const RegexRuleInfo& lhs, const RegexRuleInfo& rhs) {
              return lhs.regex_rule->url_rule()->priority() >
                     rhs.regex_rule->url_rule()->priority();
            });

  // Cache `potential_matches`.
  auto result = params.potential_regex_matches.insert(
      std::make_pair(regex_match_key_, std::move(potential_matches)));
  return result.first->second;
}

bool RegexRulesMatcher::MatchHelper::IsEmpty() const {
  return regex_list_->size() == 0;
}

void RegexRulesMatcher::MatchHelper::InitializeMatcher() {
  if (IsEmpty()) {
    return;
  }

  for (const auto* regex_rule : *regex_list_) {
    const flat_rule::UrlRule* rule = regex_rule->url_rule();

    const bool is_case_sensitive =
        rule->options() & flat_rule::OptionFlag_IS_MATCH_CASE;

    const bool require_capturing = !!regex_rule->regex_substitution();

    // TODO(karandeepb): Regex compilation can be expensive and sometimes we are
    // compiling the same regex twice, once during rule indexing and now during
    // ruleset loading. We should try maintaining a global cache of compiled
    // regexes and modify FilteredRE2 to take a regex object directly.
    int re2_id;
    re2::RE2::ErrorCode error_code = filtered_re2_.Add(
        rule->url_pattern()->string_view(),
        CreateRE2Options(is_case_sensitive, require_capturing), &re2_id);

    // Ideally there shouldn't be any error, since we had already validated the
    // regular expression while indexing the ruleset. That said, there are cases
    // possible where this may happen, for example, the library's implementation
    // may change etc.
    // TODO(crbug.com/40118204): Notify the extension about the same.
    if (error_code != re2::RE2::NoError) {
      continue;
    }

    const bool did_insert =
        re2_id_to_rules_map_.insert({re2_id, regex_rule}).second;
    DCHECK(did_insert) << "Duplicate |re2_id| seen.";
  }

  // FilteredRE2 on compilation yields a set of candidate strings. These aid in
  // pre-filtering and obtaining the set of potential matches for a request.
  std::vector<std::string> strings_to_match;
  filtered_re2_.Compile(&strings_to_match);

  // FilteredRE2 guarantees that the returned set of candidate strings is
  // lower-cased.
  DCHECK(std::ranges::all_of(strings_to_match, [](const std::string& s) {
    return std::ranges::all_of(
        s, [](const char c) { return !base::IsAsciiUpper(c); });
  }));

  // Convert `strings_to_match` to MatcherStringPatterns. This is necessary to
  // use url_matcher::SubstringSetMatcher.
  std::vector<base::MatcherStringPattern> patterns;
  patterns.reserve(strings_to_match.size());

  for (size_t i = 0; i < strings_to_match.size(); ++i) {
    patterns.emplace_back(std::move(strings_to_match[i]), i);
  }

  substring_matcher_ = std::make_unique<base::SubstringSetMatcher>();

  // This is only used for regex rules, which are limited to 1000,
  // so hitting the 8MB limit should be all but impossible.
  bool success = substring_matcher_->Build(patterns);
  CHECK(success);
}

// Builds the RequestAction that corresponds to the action type of the rule in
// `info`. Must not be called for `modify_headers` rules. The result can be
// std::nullopt, e.g. when a regex substitution redirect cannot be created.
std::optional<RequestAction> RegexRulesMatcher::CreateActionFromInfo(
    const RequestParams& params,
    const RegexRuleInfo& info) const {
  const flat_rule::UrlRule& rule = *info.regex_rule->url_rule();

  switch (info.regex_rule->action_type()) {
    case flat::ActionType_block:
      return CreateBlockOrCollapseRequestAction(params, rule);

    case flat::ActionType_allow:
      return CreateAllowAction(params, rule);

    case flat::ActionType_redirect:
      // A rule with a regex substitution computes its target from the request
      // URL; any other redirect rule takes it from `metadata_list_`.
      if (info.regex_rule->regex_substitution()) {
        return CreateRegexSubstitutionRedirectAction(params, info);
      }
      return CreateRedirectActionFromMetadata(params, rule, *metadata_list_);

    case flat::ActionType_upgrade_scheme:
      return CreateUpgradeAction(params, rule);

    case flat::ActionType_allow_all_requests:
      return CreateAllowAllRequestsAction(params, rule);

    case flat::ActionType_modify_headers:
    case flat::ActionType_count:
      NOTREACHED();
  }

  // Fallback for a value that matches none of the enumerators above.
  return std::nullopt;
}

std::optional<RequestAction>
RegexRulesMatcher::CreateRegexSubstitutionRedirectAction(
    const RequestParams& params,
    const RegexRuleInfo& info) const {
  // We could have extracted the captured strings during the matching stage
  // and directly used RE2::Rewrite here (which doesn't need to match the
  // regex again). However we prefer to capture the strings only when
  // necessary. Not capturing the strings should allow re2 to perform
  // additional optimizations during the matching stage.
  std::string redirect_str = params.url->spec();
  bool success =
      RE2::Replace(&redirect_str, *info.regex,
                   info.regex_rule->regex_substitution()->string_view());
  if (!success) {
    // This should generally not happen since we had already checked for a
    // match and during indexing, had verified that the substitution pattern
    // is not ill-formed. However, the re2 library implementation might have
    // changed since indexing, causing this.
    LOG(ERROR) << base::StringPrintf(
        "Rewrite failed. Regex:%s Substitution:%s URL:%s\n",
        info.regex->pattern().c_str(),
        info.regex_rule->regex_substitution()->c_str(),
        params.url->spec().c_str());
    return std::nullopt;
  }

  GURL redirect_url(redirect_str);

  // Redirects to JavaScript urls are not allowed.
  // TODO(crbug.com/40111509): this results in counterintuitive behavior.
  if (redirect_url.SchemeIs(url::kJavaScriptScheme)) {
    return std::nullopt;
  }

  return CreateRedirectAction(params, *info.regex_rule->url_rule(),
                              std::move(redirect_url));
}

// Maps `stage` to the MatchHelper that holds the regex rules for that stage.
const RegexRulesMatcher::MatchHelper& RegexRulesMatcher::GetMatcherForStage(
    RulesetMatchingStage stage) const {
  switch (stage) {
    case RulesetMatchingStage::kOnBeforeRequest:
      return before_request_matcher_;
    case RulesetMatchingStage::kOnHeadersReceived:
      return headers_received_matcher_;
  }

  // Both cases above return; getting here means `stage` held some other value.
  NOTREACHED();
}

}  // namespace extensions::declarative_net_request