File: docs_url_strip_handler.cc

Package: chromium 138.0.7204.157-1
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_deduplication/docs_url_strip_handler.h"

#include <string>
#include <vector>

#include "base/containers/fixed_flat_set.h"
#include "base/containers/lru_cache.h"
#include "base/no_destructor.h"
#include "base/strings/escape.h"
#include "base/strings/string_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/url_util.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"

// TODO(crbug.com/353966074) There is a plan to avoid/consolidate any
//  duplicated code as this borrows from:
//  components/omnibox/browser/document_provider.cc
namespace {
// Check whether the host could possibly belong to a valid doc URL. This is a
// more lightweight check than `ExtractDocIdFromUrl()`. It can be done before
// unescaping the URL, as valid hosts don't contain escapable chars; unescaping
// is relatively expensive. E.g., 'docs.google.com' isn't a valid doc URL, but
// its host looks like it could be, so return true. On the other hand,
// 'google.com' is definitely not a doc URL, so return false.
bool ValidHostPrefix(const std::string& host) {
  // There are 66 (6*11) valid hosts, e.g. 'docs5.google.com', so rather than
  // check all 66, we just check the 6 prefixes. Keep these prefixes consistent
  // with those in `ExtractDocIdFromUrl()`.
  constexpr auto kValidHostPrefixes = base::MakeFixedFlatSet<std::string_view>({
      "spreadsheets",
      "docs",
      "drive",
      "script",
      "sites",
      "jamboard",
  });
  for (const auto& valid_host_prefix : kValidHostPrefixes) {
    if (base::StartsWith(host, valid_host_prefix,
                         base::CompareCase::INSENSITIVE_ASCII)) {
      return true;
    }
  }
  return false;
}
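
// Illustrative results for the prefix check above (example hosts only):
//   ValidHostPrefix("docs.google.com")     -> true   (plausible doc host)
//   ValidHostPrefix("jamboard.google.com") -> true
//   ValidHostPrefix("mail.google.com")     -> false  (never a doc URL)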

// Derived from google3/apps/share/util/docs_url_extractor.cc.
std::string ExtractDocIdFromUrl(const std::string& url) {
  static const base::NoDestructor<RE2> docs_url_pattern(
      "\\b("  // The first groups matches the whole URL.
      // Domain.
      "(?:https?://)?(?:"
      // Keep the hosts consistent with `ValidHostPrefix()`.
      "spreadsheets|docs|drive|script|sites|jamboard"
      ")[0-9]?\\.google\\.com"
      "(?::[0-9]+)?\\/"  // Port.
      "(?:\\S*)"         // Non-whitespace chars.
      "(?:"
      // Doc url prefix to match /d/{id}. (?:e/)? deviates from google3.
      "(?:/d/(?:e/)?(?P<path_docid>[0-9a-zA-Z\\-\\_]+))"
      "|"
      // Docs id expr to match a valid id parameter.
      "(?:(?:\\?|&|&amp;)"
      "(?:id|docid|key|docID|DocId)=(?P<query_docid>[0-9a-zA-Z\\-\\_]+))"
      "|"
      // Folder url prefix to match /folders/{folder_id}.
      "(?:/folders/(?P<folder_docid>[0-9a-zA-Z\\-\\_]+))"
      "|"
      // Sites url prefix.
      "(?:/?s/)(?P<sites_docid>[0-9a-zA-Z\\-\\_]+)"
      "(?:/p/[0-9a-zA-Z\\-\\_]+)?/edit"
      "|"
      // Jam url.
      "(?:d/)(?P<jam_docid>[0-9a-zA-Z\\-\\_]+)/(?:edit|viewer)"
      ")"
      // Other valid chars.
      "(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)"
      // Summarization details.
      "(?:summarizationDetails=[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/"
      "\\?(?:%5B)(?:%5D)]*)?"
      // Other valid chars.
      "(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)"
      "(?:(#[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]+)?)"  // Fragment
      ")");

  std::vector<std::string_view> matched_doc_ids(
      docs_url_pattern->NumberOfCapturingGroups() + 1);
  // ANCHOR_START deviates from google3, which uses UNANCHORED. Using
  // ANCHOR_START prevents incorrectly matching non-drive URLs that merely
  // contain a drive URL; e.g.,
  // url-parser.com/?url=https://docs.google.com/document/d/(id)/edit.
  if (!docs_url_pattern->Match(url, 0, url.size(), RE2::ANCHOR_START,
                               matched_doc_ids.data(),
                               matched_doc_ids.size())) {
    return std::string();
  }
  for (const auto& doc_id_group : docs_url_pattern->NamedCapturingGroups()) {
    std::string_view identified_doc_id = matched_doc_ids[doc_id_group.second];
    if (!identified_doc_id.empty()) {
      return std::string(identified_doc_id);
    }
  }
  return std::string();
}
}  // namespace

namespace url_deduplication {

GURL DocsURLStripHandler::StripExtraParams(GURL url) {
  if (!url.is_valid()) {
    return GURL();
  }

  // A memoization cache. Only updated if `ExtractDocIdFromUrl()` was attempted.
  // That's the most expensive part of this algorithm, and memoizing the earlier
  // trivial checks would worsen performance by pushing out more useful cache
  // entries.
  static base::NoDestructor<base::LRUCache<GURL, GURL>> cache(10);
  const auto& cached = cache->Get(url);
  if (cached != cache->end()) {
    return cached->second;
  }

  // Early exit to avoid unnecessary and more involved checks. Don't update the
  // cache for trivial cases to avoid pushing out a more useful entry.
  if (!url.DomainIs("google.com")) {
    return GURL();
  }

  // We aim to prevent duplicate Drive URLs from appearing between the Drive
  // document search provider and history/bookmark entries.
  // All URLs are canonicalized to a GURL form only used for deduplication and
  // not guaranteed to be usable for navigation.

  // Drive redirects are already handled by the regex in |ExtractDocIdFromUrl|.
  // The logic below handles google.com redirects; e.g., google.com/url?q=<url>.
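  // E.g., "https://www.google.com/url?q=https%3A%2F%2Fdocs.google.com%2F..."
  // yields a decoded |url_str| of "https://docs.google.com/..." and an
  // |url_str_host| of "docs.google.com".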
  std::string url_str;
  std::string url_str_host;
  if (url.host() == "www.google.com" && url.path() == "/url") {
    if ((!net::GetValueForKeyInQuery(url, "q", &url_str) || url_str.empty()) &&
        (!net::GetValueForKeyInQuery(url, "url", &url_str) ||
         url_str.empty())) {
      return GURL();
    }
    url_str_host = GURL(url_str).host();
  } else {
    url_str = url.spec();
    url_str_host = url.host();
  }

  // Recheck the domain, since a google URL could redirect to a non-Google URL.
  if (!base::EndsWith(url_str_host, "google.com",
                      base::CompareCase::INSENSITIVE_ASCII)) {
    return GURL();
  }

  // Filter out non-doc hosts. Do this before unescaping the URL below, as
  // unescaping can be expensive and valid hosts don't contain escapable chars.
  // Do this after simplifying the google.com redirect above, as that changes
  // the host.
  if (!ValidHostPrefix(url_str_host)) {
    return GURL();
  }

  // Unescape |url_str| before extracting the doc id.
  url_str = base::UnescapeURLComponent(
      url_str,
      base::UnescapeRule::PATH_SEPARATORS |
          base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

  const std::string id = ExtractDocIdFromUrl(url_str);

  // Canonicalize to the /open form without any extra args.
  // This is similar to what we expect from the server.
  GURL deduping_url =
      id.empty() ? GURL() : GURL("https://drive.google.com/open?id=" + id);
  cache->Put(url, deduping_url);
  return deduping_url;
}

}  // namespace url_deduplication
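
// A minimal usage sketch (handler construction is assumed from the header,
// which is not shown here; only StripExtraParams() is defined in this file):
//
//   url_deduplication::DocsURLStripHandler handler;
//   GURL key = handler.StripExtraParams(
//       GURL("https://docs.google.com/document/d/abc123/edit?usp=sharing"));
//   // key == "https://drive.google.com/open?id=abc123" (placeholder id).
//   GURL none = handler.StripExtraParams(GURL("https://example.com/"));
//   // none == GURL(): no deduplication key for non-Drive URLs.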