File: format_url.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (146 lines) | stat: -rw-r--r-- 5,541 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifdef UNSAFE_BUFFERS_BUILD
// TODO(crbug.com/40285824): Remove this and convert code to safer constructs.
#pragma allow_unsafe_buffers
#endif

// This binary takes a list of domain names in ASCII or unicode, passes them
// through the IDN decoding algorithm and prints out the result. The list can be
// passed as a text file or via stdin. In both cases, the output is printed as
// (input_domain, output_domain, spoof_check_result) tuples on separate lines.
// spoof_check_result is the string representation of IDNSpoofChecker::Result
// enum with an additional kTopDomainLookalike value.

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>

#include "base/command_line.h"
#include "base/i18n/icu_util.h"
#include "base/logging.h"
#include "base/notreached.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/url_formatter/spoof_checks/idn_spoof_checker.h"
#include "components/url_formatter/url_formatter.h"
#include "url/gurl.h"

using url_formatter::IDNConversionResult;
using url_formatter::IDNSpoofChecker;

// Prints command-line usage information for this tool to stdout.
// |process_name| is the executable name shown in the usage line (main()
// passes argv[0]).
void PrintUsage(const char* process_name) {
  std::cout << "Usage:" << std::endl;
  // The file argument is optional: without it, main() reads from stdin.
  std::cout << process_name << " [<file>]" << std::endl;
  std::cout << std::endl;
  std::cout << "<file> is a text file with one hostname per line. If <file> "
               "is omitted, hostnames are read from stdin."
            << std::endl;
  std::cout << "Hostnames can be ASCII or unicode. Internationalized domain "
               "names (IDN) can be encoded in unicode or punycode."
            << std::endl;
  std::cout << "Each hostname is converted to unicode, if safe. Otherwise, "
            << "ASCII hostnames are printed unchanged and unicode hostnames "
            << "are printed in punycode." << std::endl;
}

// Maps |result| to the spelling of the matching IDNSpoofChecker::Result
// enumerator (for example, Result::kSafe becomes "kSafe"). Any value without
// a case below hits NOTREACHED().
std::string SpoofCheckResultToString(IDNSpoofChecker::Result result) {
  using Result = IDNSpoofChecker::Result;

  const char* name = nullptr;
  switch (result) {
    case Result::kNone:
      name = "kNone";
      break;
    case Result::kSafe:
      name = "kSafe";
      break;
    case Result::kICUSpoofChecks:
      name = "kICUSpoofChecks";
      break;
    case Result::kDeviationCharacters:
      name = "kDeviationCharacters";
      break;
    case Result::kTLDSpecificCharacters:
      name = "kTLDSpecificCharacters";
      break;
    case Result::kUnsafeMiddleDot:
      name = "kUnsafeMiddleDot";
      break;
    case Result::kWholeScriptConfusable:
      name = "kWholeScriptConfusable";
      break;
    case Result::kDigitLookalikes:
      name = "kDigitLookalikes";
      break;
    case Result::kNonAsciiLatinCharMixedWithNonLatin:
      name = "kNonAsciiLatinCharMixedWithNonLatin";
      break;
    case Result::kDangerousPattern:
      name = "kDangerousPattern";
      break;
    default:
      NOTREACHED();
  }
  return name;
}

// Returns the spoof check result as a string. |ascii_domain| must contain
// ASCII characters only. |unicode_domain| is the IDN conversion result
// according to url_formatter. It can be either punycode or unicode.
std::string GetSpoofCheckResult(const std::string& ascii_domain,
                                const std::u16string& unicode_domain) {
  IDNConversionResult result =
      url_formatter::UnsafeIDNToUnicodeWithDetails(ascii_domain);
  std::string spoof_check_result =
      SpoofCheckResultToString(result.spoof_check_result);
  if (result.spoof_check_result == IDNSpoofChecker::Result::kNone) {
    // Input was not punycode.
    return spoof_check_result;
  }
  if (result.spoof_check_result != IDNSpoofChecker::Result::kSafe) {
    return spoof_check_result;
  }
  // If the domain passed all spoof checks but |unicode_domain| is still in
  // punycode, the domain must be a lookalike of a top domain.
  if (base::ASCIIToUTF16(ascii_domain) == unicode_domain) {
    return "kTopDomainLookalike";
  }
  return spoof_check_result;
}

// Reads hostnames from |input|, one per line, and prints one
// "hostname, converted_hostname, spoof_check_result" line to stdout for each.
//
// Lines that start with "http:" or "https:" (case-insensitively) are rejected
// with a CHECK failure, since this tool takes bare hostnames rather than URLs.
// ICU is initialized on entry because the IDN conversion below depends on it.
void Convert(std::istream& input) {
  base::i18n::InitializeICU();
  for (std::string line; std::getline(input, line);) {
    CHECK(
        !base::StartsWith(line,
                          "http:", base::CompareCase::INSENSITIVE_ASCII) &&
        !base::StartsWith(line, "https:", base::CompareCase::INSENSITIVE_ASCII))
        << "This binary only accepts hostnames: " << line;

    // ASCII input is used as-is. Non-ASCII input is wrapped in a URL so that
    // GURL can produce the host component for it.
    const std::string ascii_hostname =
        base::IsStringASCII(line) ? line : GURL("https://" + line).host();

    // Convert twice, first with spoof checks on, then with spoof checks
    // ignored inside GetSpoofCheckResult(). This is because only the call to
    // UnsafeIDNToUnicodeWithDetails returns information about spoof check
    // results (a quirk of the url_formatter interface).
    const std::u16string converted_hostname =
        url_formatter::IDNToUnicode(ascii_hostname);
    const std::string spoof_check_result =
        GetSpoofCheckResult(ascii_hostname, converted_hostname);
    std::cout << ascii_hostname << ", " << converted_hostname << ", "
              << spoof_check_result << std::endl;
  }
}

int main(int argc, char* argv[]) {
  base::CommandLine::Init(argc, argv);
  base::CommandLine* cmd = base::CommandLine::ForCurrentProcess();

  if (cmd->HasSwitch("help")) {
    PrintUsage(argv[0]);
    return 0;
  }

  if (argc > 1) {
    const std::string filename = argv[1];
    std::ifstream input(filename);
    if (!input.good()) {
      LOG(ERROR) << "Could not open file " << filename;
      return -1;
    }
    Convert(input);
  } else {
    Convert(std::cin);
  }

  return EXIT_SUCCESS;
}