File: url_formatter.h

package info (click to toggle)
chromium 138.0.7204.92-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,576 kB
  • sloc: cpp: 34,933,512; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,956; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (226 lines) | stat: -rw-r--r-- 10,152 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// url_formatter contains routines for formatting URLs in a way that can be
// safely and securely displayed to users. For example, it is responsible
// for determining when to convert an IDN A-Label (e.g. "xn--[something]")
// into the IDN U-Label.
//
// Note that this formatting is only intended for display purposes; it would
// be insecure and insufficient to make comparisons solely on formatted URLs
// (that is, it should not be used for normalizing URLs for comparison for
// security decisions).

#ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_
#define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_

#include <stddef.h>
#include <stdint.h>

#include <string>
#include <string_view>
#include <vector>

#include "base/containers/flat_set.h"
#include "base/strings/escape.h"
#include "base/strings/utf_offset_string_conversions.h"
#include "components/url_formatter/spoof_checks/idn_spoof_checker.h"

// Forward declarations: this header only refers to these types through
// references and pointers, so their full definitions are not needed here.
class GURL;

namespace url {
struct Component;
struct Parsed;
}  // namespace url

namespace url_formatter {

// A set of skeleton strings, as returned by GetSkeletons() below.
using Skeletons = base::flat_set<std::string>;

// Integer types used by FormatUrl to specify handling of certain parts of the
// url. FormatUrlType is the type of each individual kFormatUrl* option
// constant; FormatUrlTypes is the type of the |format_types| bitmask that
// FormatUrl() and its variants accept.
using FormatUrlType = uint32_t;
using FormatUrlTypes = uint32_t;

// The result of an IDN to Unicode conversion.
struct IDNConversionResult {
  // The result of the conversion. If the input is a safe-to-display IDN encoded
  // as punycode, this will be its unicode representation. Otherwise, it'll be
  // the same as input.
  std::u16string result;
  // True if the hostname of the input has an IDN component, even if the result
  // wasn't converted.
  bool has_idn_component = false;
  // The top domain that the hostname of the input is visually similar to.
  // Empty if the input didn't match any top domain.
  // E.g. for the lookalike host "googlé.com" (punycode "xn--googl-fsa.com"),
  // a spoof-checked conversion leaves |result| as "xn--googl-fsa.com" and sets
  // |matching_top_domain.domain| to "google.com".
  // NOTE(review): this example previously cited IDNToUnicodeWithDetails(),
  // which is not declared in this header; confirm which entry point it
  // describes.
  TopDomainEntry matching_top_domain;
  // Result of the spoof check. If the domain was converted to unicode, this
  // must be kSafe. Otherwise, this will be the failure reason
  // for the domain component (i.e. label) that failed the spoof checks. If
  // multiple labels fail the checks, this will be the result of the first
  // component that failed, counting from the left in the punycode form.
  IDNSpoofChecker::Result spoof_check_result = IDNSpoofChecker::Result::kNone;
};

// Option flags for the |format_types| argument of FormatUrl() and its
// variants. They are only declared here; the values are defined out of line.

// Nothing is omitted.
extern const FormatUrlType kFormatUrlOmitNothing;

// If set, any username and password are removed.
extern const FormatUrlType kFormatUrlOmitUsernamePassword;

// If the scheme is 'http://', it's removed.
extern const FormatUrlType kFormatUrlOmitHTTP;

// Omits the path if it is just a slash and there is no query or ref.  This is
// meaningful for non-file "standard" URLs.
extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname;

// If the scheme is 'https://', it's removed. Not in kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitHTTPS;

// Omits some trivially informative subdomains such as "www". Not in
// kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitTrivialSubdomains;

// Omits the path, query and ref that follow the host, as well as the username
// and password.
extern const FormatUrlType kFormatUrlTrimAfterHost;

// If the scheme is 'file://', it's removed. Not in kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitFileScheme;

// If the scheme is 'mailto:', it's removed. Not in kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitMailToScheme;

// Omits the mobile prefix "m". Not in kFormatUrlOmitDefaults.
extern const FormatUrlType kFormatUrlOmitMobilePrefix;

// Convenience for omitting all unnecessary types. Does not include HTTPS scheme
// removal, or experimental flags.
extern const FormatUrlType kFormatUrlOmitDefaults;

// Creates a string representation of |url|. The IDN host name is turned to
// Unicode if the Unicode representation is deemed safe. |format_types| is a
// bitmask of the kFormatUrl* flags declared above; see them for details.
// |unescape_rules| defines how to clean the URL for human readability. You
// will generally want |UnescapeRule::SPACES| for display to the user if you
// can handle spaces, or |UnescapeRule::NORMAL| if not. If the path part and
// the query part seem to be encoded in %-encoded UTF-8, decodes %-encoding and
// UTF-8.
//
// The last three parameters may be null.
//
// |new_parsed| will be set to the parsing parameters of the resultant URL.
//
// |prefix_end| will be the length before the hostname of the resultant URL.
//
// |offset[s]_for_adjustment| specifies one or more offsets into the original
// URL, representing insertion or selection points between characters: if the
// input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is
// between the scheme and the host, and offset 15 is after the end of the URL.
// Valid input offsets range from 0 to the length of the input URL string.  On
// exit, each offset will have been modified to reflect any changes made to the
// output string.  For example, if |url| is "http://a:b@c.com/",
// kFormatUrlOmitUsernamePassword is set in |format_types|, and an offset is 12
// (pointing between 'c' and '.'), then on return the output string will be
// "http://c.com/" and the offset will be 8.  If an offset cannot be
// successfully adjusted (e.g. because it points into the middle of a component
// that was entirely removed or into the middle of an encoding sequence), it
// will be set to std::u16string::npos.
// For consistency, if an input offset points between the scheme and the
// username/password, and both are removed, on output this offset will be 0
// rather than npos; this means that offsets at the starts and ends of removed
// components are always transformed the same way regardless of what other
// components are adjacent.
std::u16string FormatUrl(const GURL& url,
                         FormatUrlTypes format_types,
                         base::UnescapeRule::Type unescape_rules,
                         url::Parsed* new_parsed,
                         size_t* prefix_end,
                         size_t* offset_for_adjustment);

// Same as FormatUrl() above, except that it takes a vector of offsets in
// |offsets_for_adjustment| and adjusts each of them as described there.
std::u16string FormatUrlWithOffsets(
    const GURL& url,
    FormatUrlTypes format_types,
    base::UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    std::vector<size_t>* offsets_for_adjustment);

// Like FormatUrl() and FormatUrlWithOffsets() above, except that it takes
// |adjustments| rather than |offset[s]_for_adjustment|.  |adjustments| will be
// set to reflect all the transformations that happened to |url| to convert it
// into the returned value.
std::u16string FormatUrlWithAdjustments(
    const GURL& url,
    FormatUrlTypes format_types,
    base::UnescapeRule::Type unescape_rules,
    url::Parsed* new_parsed,
    size_t* prefix_end,
    base::OffsetAdjuster::Adjustments* adjustments);

// Convenience overload of FormatUrl() for the typical "URLs to display to the
// user" case: it formats with kFormatUrlOmitDefaults, unescapes with
// base::UnescapeRule::SPACES, and passes null for every optional
// out-parameter.  Be cautious about using the result for URLs which will be
// parsed or sent to other applications.
inline std::u16string FormatUrl(const GURL& url) {
  const auto display_format = kFormatUrlOmitDefaults;
  const auto display_unescape = base::UnescapeRule::SPACES;
  return FormatUrl(url, display_format, display_unescape,
                   /*new_parsed=*/nullptr, /*prefix_end=*/nullptr,
                   /*offset_for_adjustment=*/nullptr);
}

// Returns whether FormatUrl() would strip a trailing slash from |url| when its
// |format_types| include kFormatUrlOmitTrailingSlashOnBareHostname.
bool CanStripTrailingSlash(const GURL& url);

// Formats the host in |url| and appends it to |output|.
// NOTE(review): |output| is presumably required to be non-null; confirm
// against the implementation.
void AppendFormattedHost(const GURL& url, std::u16string* output);

// Converts the given host name to unicode characters. This can be called for
// any host name; if the input is not IDN or is invalid in some way, we'll just
// return the ASCII source so it is still usable.
//
// The input should be the canonicalized ASCII host name from GURL. This
// function does NOT accept UTF-8!
std::u16string IDNToUnicode(std::string_view host);

// Same as IDNToUnicode, but disables spoof checks and returns more details
// (see IDNConversionResult).
// In particular, it doesn't fall back to punycode if |host| fails spoof checks
// in IDN spoof checker or is a lookalike of a top domain.
// DO NOT use this for displaying URLs.
IDNConversionResult UnsafeIDNToUnicodeWithDetails(std::string_view host);

// Strips a "www." prefix from |host| if present and if |host| is eligible, and
// returns the resulting string.
// |host| is only eligible for www-stripping if it is not a private or intranet
// hostname, and if "www." is part of the subdomain (not the eTLD+1).
std::string StripWWW(const std::string& host);

// Strips a "m." prefix from |text| if present, and returns the resulting
// string.
std::string StripMobilePrefix(const std::string& text);

// If the |host| component of |url| begins with a "www." prefix (and meets the
// conditions described for StripWWW), then updates |host| in place to strip
// the "www." prefix. |url| itself is not modified.
void StripWWWFromHostComponent(const std::string& url, url::Component* host);

// Returns the set of skeleton strings computed from |host| for spoof checking.
Skeletons GetSkeletons(const std::u16string& host);

// Returns a domain from the top 10K list matching the given skeleton. Used for
// spoof checking. Different types of skeletons are saved in the skeleton trie.
// Providing |type| makes sure the right type of skeletons are looked up. For
// example if |skeleton| is "googlecorn" and |type| is SkeletonType::kFull, no
// match would be found even though the skeleton is saved in the trie, because
// the type of this skeleton in the trie is kSeparatorsRemoved.
//
// |type| defaults to SkeletonType::kFull. It is passed by value, so it carries
// no top-level const in this declaration.
TopDomainEntry LookupSkeletonInTopDomains(
    const std::string& skeleton,
    SkeletonType type = SkeletonType::kFull);

// Removes diacritics from |host| and returns the new string if the input
// only contains Latin-Greek-Cyrillic characters. Otherwise, returns the
// input string.
std::u16string MaybeRemoveDiacritics(const std::u16string& host);

}  // namespace url_formatter

#endif  // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_