File: google_url_util.cc

package info (click to toggle)
chromium 139.0.7258.127-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,122,156 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (131 lines) | stat: -rw-r--r-- 4,199 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/page_load_metrics/google/browser/google_url_util.h"

#include <algorithm>
#include <string_view>

#include "base/strings/string_util.h"
#include "components/page_load_metrics/browser/page_load_metrics_util.h"
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"

namespace page_load_metrics {

std::optional<std::string> GetGoogleHostnamePrefix(const GURL& url) {
  const size_t registry_length =
      net::registry_controlled_domains::GetRegistryLength(
          url,

          // Do not include unknown registries (registries that don't have any
          // matches in effective TLD names).
          net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,

          // Do not include private registries, such as appspot.com. We don't
          // want to match URLs like www.google.appspot.com.
          net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);

  const std::string_view hostname = url.host_piece();
  if (registry_length == 0 || registry_length == std::string::npos ||
      registry_length >= hostname.length()) {
    return std::nullopt;
  }

  // Removes the tld and the preceding dot.
  const std::string_view hostname_minus_registry =
      hostname.substr(0, hostname.length() - (registry_length + 1));

  if (hostname_minus_registry == "google") {
    return std::string("");
  }

  if (!base::EndsWith(hostname_minus_registry, ".google",
                      base::CompareCase::INSENSITIVE_ASCII)) {
    return std::nullopt;
  }

  return std::string(hostname_minus_registry.substr(
      0, hostname_minus_registry.length() - strlen(".google")));
}

bool IsGoogleHostname(const GURL& url) {
  return GetGoogleHostnamePrefix(url).has_value();
}

bool IsGoogleSearchHostname(const GURL& url) {
  std::optional<std::string> result =
      page_load_metrics::GetGoogleHostnamePrefix(url);
  return result && result.value() == "www";
}

bool IsProbablyGoogleSearchUrl(const GURL& url) {
  if (!page_load_metrics::IsGoogleSearchHostname(url)) {
    return false;
  }

  const std::string_view path = url.path_piece();
  if (path == "/maps" || path.find("/maps/") != std::string_view::npos) {
    return false;
  }

  return true;
}

// Determine if the given url has query associated with it.
bool HasGoogleSearchQuery(const GURL& url) {
  // NOTE: we do not require 'q=' in the query, as AJAXy search may instead
  // store the query in the URL fragment.
  return QueryContainsComponentPrefix(url.query_piece(), "q=") ||
         QueryContainsComponentPrefix(url.ref_piece(), "q=");
}

bool IsGoogleSearchResultUrl(const GURL& url) {
  if (!IsGoogleSearchHostname(url)) {
    return false;
  }

  if (!HasGoogleSearchQuery(url)) {
    return false;
  }

  const std::string_view path = url.path_piece();
  return path == "/search" || path == "/webhp" || path == "/custom" ||
         path == "/";
}

bool IsGoogleSearchHomepageUrl(const GURL& url) {
  if (!IsGoogleSearchHostname(url)) {
    return false;
  }

  const std::string_view path = url.path_piece();
  if (path == "/webhp" || path == "/") {
    return true;
  }

  return (path == "/custom" || path == "/search") && !HasGoogleSearchQuery(url);
}

bool IsGoogleSearchRedirectorUrl(const GURL& url) {
  if (!IsGoogleSearchHostname(url)) {
    return false;
  }

  // The primary search redirector.  Google search result redirects are
  // differentiated from other general google redirects by 'source=web' in the
  // query string.
  if (url.path_piece() == "/url" && url.has_query() &&
      QueryContainsComponent(url.query_piece(), "source=web")) {
    return true;
  }

  // Intent-based navigations from search are redirected through a second
  // redirector, which receives its redirect URL in the fragment/hash/ref
  // portion of the URL (the portion after '#'). We don't check for the presence
  // of certain params in the ref since this redirector is only used for
  // redirects from search.
  return url.path_piece() == "/searchurl/r.html" && url.has_ref();
}

}  // namespace page_load_metrics