File: tld_cleanup_util.cc

package info (click to toggle)
chromium 120.0.6099.224-1~deb11u1
links: PTS, VCS
area: main
in suites: bullseye
size: 6,112,112 kB
sloc: cpp: 32,907,025; ansic: 8,148,123; javascript: 3,679,536; python: 2,031,248; asm: 959,718; java: 804,675; xml: 617,256; sh: 111,417; objc: 100,835; perl: 88,443; cs: 53,032; makefile: 29,579; fortran: 24,137; php: 21,162; tcl: 21,147; sql: 20,809; ruby: 17,735; pascal: 12,864; yacc: 8,045; lisp: 3,388; lex: 1,323; ada: 727; awk: 329; jsp: 267; csh: 117; exp: 43; sed: 37
file content (217 lines) | stat: -rw-r--r-- 7,544 bytes
// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "net/tools/tld_cleanup/tld_cleanup_util.h"

#include <sstream>
#include <string>

#include "base/containers/contains.h"
#include "base/files/file_util.h"
#include "base/logging.h"
#include "base/ranges/algorithm.h"
#include "base/strings/strcat.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "url/gurl.h"
#include "url/third_party/mozilla/url_parse.h"

namespace {

// Marker lines in the input data that open and close the section whose rules
// are recorded as private.
constexpr char kBeginPrivateDomainsComment[] =
    "// ===BEGIN PRIVATE DOMAINS===";
constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Flag values written to the `type` field of each generated gperf entry
// (1: exception, 2: wildcard, 4: private).
constexpr int kExceptionRule = 1 << 0;
constexpr int kWildcardRule = 1 << 1;
constexpr int kPrivateRule = 1 << 2;

}  // namespace

namespace net::tld_cleanup {

// Serializes `rules` as gperf input text: a fixed preamble declaring the
// DomainRule struct, then one "<domain>, <type>" line per rule, then a closing
// "%%" line. `type` is kExceptionRule or kWildcardRule (exception takes
// precedence, 0 if neither is set) combined with kPrivateRule for private
// rules.
std::string RulesToGperf(const RuleMap& rules) {
  static constexpr char kGperfPreamble[] =
      "%{\n"
      "// Copyright 2012 The Chromium Authors\n"
      "// Use of this source code is governed by a BSD-style license "
      "that can be\n"
      "// found in the LICENSE file.\n\n"
      "// This file is generated by net/tools/tld_cleanup/.\n"
      "// DO NOT MANUALLY EDIT!\n"
      "%}\n"
      "struct DomainRule {\n"
      "  int name_offset;\n"
      "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
      "};\n"
      "%%\n";

  std::string output(kGperfPreamble);

  for (const auto& entry : rules) {
    const auto& rule = entry.second;

    // Exception and wildcard are mutually exclusive in the output; exception
    // wins when both happen to be set.
    int flags = rule.exception ? kExceptionRule
                               : (rule.wildcard ? kWildcardRule : 0);
    if (rule.is_private) {
      flags |= kPrivateRule;
    }

    output += entry.first;
    output += ", ";
    output += base::NumberToString(flags);
    output += "\n";
  }

  output += "%%\n";
  return output;
}

// Rewrites `domain` into a standard form and records any leading rule marker
// in `rule`:
//   * one leading and one trailing '.' are dropped;
//   * a single leading '!' sets rule.exception, a single leading "*." sets
//     rule.wildcard, and the marker is removed from `domain`;
//   * the remainder is canonicalized by parsing "http://<domain>" with GURL
//     and taking the host back out.
// Returns kSuccess if the rule is interpreted as valid; logs a warning and
// returns kWarning if it is empty or probably invalid; logs an error and
// returns kError if no host could be extracted at all.
NormalizeResult NormalizeRule(std::string& domain, Rule& rule) {
  // Drop at most one dot from each end.
  if (!domain.empty() && domain.front() == '.') {
    domain.erase(domain.begin());
  }
  if (!domain.empty() && domain.back() == '.') {
    domain.pop_back();
  }

  // Peel off the rule marker before canonicalization so GURL never sees it.
  if (!domain.empty() && domain.front() == '!') {
    rule.exception = true;
    domain.erase(domain.begin());
  } else if (base::StartsWith(domain, "*.")) {
    rule.wildcard = true;
    domain.erase(0, 2);
  }

  if (domain.empty()) {
    LOG(WARNING) << "Ignoring empty rule";
    return NormalizeResult::kWarning;
  }

  NormalizeResult outcome = NormalizeResult::kSuccess;

  // Any marker still present after the first one was removed is suspicious,
  // but the rule is kept.
  const bool has_stray_marker = domain.find("*.") != std::string::npos ||
                                domain.find('!') != std::string::npos;
  if (has_stray_marker) {
    LOG(WARNING) << "Keeping probably invalid rule: " << domain;
    outcome = NormalizeResult::kWarning;
  }

  // Let GURL canonicalize the rule as the host of an http URL.
  const GURL canonical_url(base::StrCat({"http://", domain}));
  const url::Component host_part =
      canonical_url.parsed_for_possibly_invalid_spec().host;
  if (!host_part.is_valid()) {
    LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain;
    return NormalizeResult::kError;
  }
  if (!canonical_url.is_valid()) {
    LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain;
    outcome = NormalizeResult::kWarning;
  }

  // Replace the rule text with the canonical host taken from the spec.
  domain = canonical_url.possibly_invalid_spec().substr(host_part.begin,
                                                        host_part.len);
  return outcome;
}

// Parses `data` line by line and adds one entry to `rules` for every rule
// that could be normalized to a non-empty domain.
//
// Line handling:
//   * The begin/end private-domains marker comments toggle whether following
//     rules are recorded as private.
//   * Other lines starting with "//" and empty lines are skipped.
//   * Each remaining line is truncated at its first whitespace character and
//     passed to NormalizeRule().
//
// For every stored rule that contains a '.', the label after the last dot is
// also added as a plain rule (no exception, no wildcard) unless `rules` ends
// up with an explicit entry for it.
//
// Returns the maximum NormalizeResult (as ordered by std::max) over all
// processed lines. CHECK-fails if two lines normalize to the same domain.
NormalizeResult NormalizeDataToRuleMap(const std::string& data,
                                       RuleMap& rules) {
  // We do a lot of string assignment during parsing, but simplicity is more
  // important than performance here.
  NormalizeResult result = NormalizeResult::kSuccess;
  std::istringstream data_stream(data);

  bool in_private_section = false;
  RuleMap extra_rules;

  for (std::string line; std::getline(data_stream, line, '\n');) {
    if (base::StartsWith(line, kBeginPrivateDomainsComment)) {
      in_private_section = true;
      continue;
    }
    if (base::StartsWith(line, kEndPrivateDomainsComment)) {
      in_private_section = false;
      continue;
    }
    if (base::StartsWith(line, "//")) {
      // Skip comments.
      continue;
    }
    if (line.empty()) {
      continue;
    }

    // Truncate at first whitespace.
    if (size_t first_whitespace = line.find_first_of("\r\n \t");
        first_whitespace != std::string::npos) {
      line.erase(first_whitespace);
    }
    std::string domain = line;

    Rule rule{/*exception=*/false, /*wildcard=*/false,
              /*is_private=*/in_private_section};
    NormalizeResult new_result = NormalizeRule(domain, rule);
    result = std::max(result, new_result);

    // Skip rules that failed outright, and also rules that normalized to an
    // empty domain. The latter happens for whitespace-only lines (for example
    // the "\r" left behind by a blank line in CRLF data) and for inputs such
    // as "." or "!", which NormalizeRule() reports as ignored with kWarning.
    // Storing them would create a bogus rule keyed by "" and make the
    // duplicate CHECK below fire on the second such line.
    if (new_result == NormalizeResult::kError || domain.empty()) {
      continue;
    }

    // Check the existing rules to make sure we don't have an exception and
    // wildcard for the same rule, or that the same domain is listed as both
    // private and not private. If we did, we'd have to update our
    // parsing code to handle this case.
    CHECK(!base::Contains(rules, domain))
        << "Duplicate rule found for " << domain;

    rules[domain] = rule;
    // Add true TLD for multi-level rules.  We don't add them right now, in
    // case there's an exception or wild card that either exists or might be
    // added in a later iteration.  In those cases, there's no need to add
    // it and it would just slow down parsing the data.
    size_t tld_start = domain.find_last_of('.');
    if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
      std::string extra_rule_domain = domain.substr(tld_start + 1);
      RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
      // If a rule already exists, we ensure that if any of the entries is not
      // private the result should be that the entry is not private.  An example
      // is .au which is not listed as a real TLD, but only lists second-level
      // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are
      // also listed in the private section, which is processed later, so this
      // ensures that the real TLD (eg. .au) is listed as public.
      bool is_private = in_private_section &&
                        (iter == extra_rules.end() || iter->second.is_private);
      extra_rules[extra_rule_domain] =
          Rule{/*exception=*/false, /*wildcard=*/false, is_private};
    }
  }

  // Now that every explicit rule is known, add the implied TLD rules that
  // were not listed explicitly.
  base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()),
                        [&](const auto& extra_rule) {
                          return !base::Contains(rules, extra_rule.first);
                        });

  return result;
}

// Reads the rule data in `in_filename`, normalizes it into a rule map and
// writes the gperf rendering of that map to `out_filename`.
//
// Returns kSuccess when the input cannot be read (the failure is only
// logged), kError when the output cannot be written, and otherwise the result
// of normalizing the data.
NormalizeResult NormalizeFile(const base::FilePath& in_filename,
                              const base::FilePath& out_filename) {
  std::string contents;
  if (!base::ReadFileToString(in_filename, &contents)) {
    LOG(ERROR) << "Unable to read file";
    // We return success since we've already reported the error.
    return NormalizeResult::kSuccess;
  }

  RuleMap parsed_rules;
  const NormalizeResult parse_result =
      NormalizeDataToRuleMap(contents, parsed_rules);

  const bool written =
      base::WriteFile(out_filename, RulesToGperf(parsed_rules));
  if (written) {
    return parse_result;
  }

  // A write failure overrides whatever the parse step reported.
  LOG(ERROR) << "Error(s) writing output file";
  return NormalizeResult::kError;
}

}  // namespace net::tld_cleanup