// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/url_deduplication/docs_url_strip_handler.h"

#include <string>
#include <vector>

#include "base/containers/fixed_flat_set.h"
#include "base/containers/lru_cache.h"
#include "base/no_destructor.h"
#include "base/strings/escape.h"
#include "base/strings/string_util.h"
#include "components/url_formatter/url_formatter.h"
#include "net/base/url_util.h"
#include "third_party/re2/src/re2/re2.h"
#include "url/gurl.h"

// TODO(crbug.com/353966074): There is a plan to avoid/consolidate any
// duplicated code, as this borrows from:
// components/omnibox/browser/document_provider.cc

namespace {

// Verify whether the host could possibly belong to a valid doc URL. This is a
// more lightweight check than `ExtractDocIdFromUrl()`. It can be done before
// unescaping the URL, as valid hosts don't contain escapable chars and
// unescaping is relatively expensive. E.g., 'docs.google.com' isn't itself a
// valid doc URL, but its host looks like it could be, so return true. On the
// other hand, 'google.com' is definitely not a doc URL, so return false.
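// Illustrative examples (hypothetical inputs, derived from the prefix set
// below):
//   ValidHostPrefix("docs6.google.com") returns true.
//   ValidHostPrefix("mail.google.com") returns false.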
bool ValidHostPrefix(const std::string& host) {
  // There are 66 (6*11) valid hosts, e.g. 'docs5.google.com', so rather than
  // check all 66, we just check the 6 prefixes. Keep these prefixes consistent
  // with those in `ExtractDocIdFromUrl()`.
  constexpr auto kValidHostPrefixes =
      base::MakeFixedFlatSet<std::string_view>({
          "spreadsheets",
          "docs",
          "drive",
          "script",
          "sites",
          "jamboard",
      });
  for (const auto& valid_host_prefix : kValidHostPrefixes) {
    if (base::StartsWith(host, valid_host_prefix,
                         base::CompareCase::INSENSITIVE_ASCII)) {
      return true;
    }
  }
  return false;
}

// Derived from google3/apps/share/util/docs_url_extractor.cc.
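// Illustrative examples (hypothetical inputs; assuming the input URL has
// already been unescaped, as `StripExtraParams()` below guarantees):
//   ExtractDocIdFromUrl("https://docs.google.com/document/d/the_doc_id/edit")
//       returns "the_doc_id".
//   ExtractDocIdFromUrl("https://drive.google.com/open?id=the_doc_id")
//       returns "the_doc_id".
//   ExtractDocIdFromUrl("https://www.google.com/search?q=x") returns "".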
std::string ExtractDocIdFromUrl(const std::string& url) {
  static const base::NoDestructor<RE2> docs_url_pattern(
      "\\b("  // The first group matches the whole URL.
      // Domain.
      "(?:https?://)?(?:"
      // Keep the hosts consistent with `ValidHostPrefix()`.
      "spreadsheets|docs|drive|script|sites|jamboard"
      ")[0-9]?\\.google\\.com"
      "(?::[0-9]+)?\\/"  // Port.
      "(?:\\S*)"  // Non-whitespace chars.
      "(?:"
      // Doc url prefix to match /d/{id}. (?:e/)? deviates from google3.
      "(?:/d/(?:e/)?(?P<path_docid>[0-9a-zA-Z\\-\\_]+))"
      "|"
      // Docs id expr to match a valid id parameter.
      "(?:(?:\\?|&|&amp;)"
      "(?:id|docid|key|docID|DocId)=(?P<query_docid>[0-9a-zA-Z\\-\\_]+))"
      "|"
      // Folder url prefix to match /folders/{folder_id}.
      "(?:/folders/(?P<folder_docid>[0-9a-zA-Z\\-\\_]+))"
      "|"
      // Sites url prefix.
      "(?:/?s/)(?P<sites_docid>[0-9a-zA-Z\\-\\_]+)"
      "(?:/p/[0-9a-zA-Z\\-\\_]+)?/edit"
      "|"
      // Jam url.
      "(?:d/)(?P<jam_docid>[0-9a-zA-Z\\-\\_]+)/(?:edit|viewer)"
      ")"
      // Other valid chars.
      "(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)"
      // Summarization details.
      "(?:summarizationDetails=[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/"
      "\\?(?:%5B)(?:%5D)]*)?"
      // Other valid chars.
      "(?:[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]*)"
      // Fragment.
      "(?:(#[0-9a-zA-Z$\\-\\_\\.\\+\\!\\*\'\\,;:@&=/\\?]+)?)"
      ")");
  std::vector<std::string_view> matched_doc_ids(
      docs_url_pattern->NumberOfCapturingGroups() + 1);
  // ANCHOR_START deviates from google3, which uses UNANCHORED. Using
  // ANCHOR_START prevents incorrectly matching non-drive URLs that merely
  // contain a drive URL; e.g.,
  // url-parser.com/?url=https://docs.google.com/document/d/(id)/edit.
  if (!docs_url_pattern->Match(url, 0, url.size(), RE2::ANCHOR_START,
                               matched_doc_ids.data(),
                               matched_doc_ids.size())) {
    return std::string();
  }
  for (const auto& doc_id_group : docs_url_pattern->NamedCapturingGroups()) {
    std::string_view identified_doc_id = matched_doc_ids[doc_id_group.second];
    if (!identified_doc_id.empty()) {
      return std::string(identified_doc_id);
    }
  }
  return std::string();
}

}  // namespace

namespace url_deduplication {
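
// Illustrative behavior (hypothetical inputs, assuming the regex above
// matches as described):
//   StripExtraParams(GURL("https://docs.google.com/document/d/the_doc_id/edit"))
//       returns GURL("https://drive.google.com/open?id=the_doc_id").
//   StripExtraParams(GURL("https://example.com/")) returns an empty GURL.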
GURL DocsURLStripHandler::StripExtraParams(GURL url) {
  if (!url.is_valid()) {
    return GURL();
  }

  // A memoization cache. Only updated if `ExtractDocIdFromUrl()` was
  // attempted. That's the most expensive part of this algorithm, and memoizing
  // the earlier trivial checks would worsen performance by pushing out more
  // useful cache entries.
  static base::NoDestructor<base::LRUCache<GURL, GURL>> cache(10);
  const auto& cached = cache->Get(url);
  if (cached != cache->end()) {
    return cached->second;
  }

  // Early exit to avoid unnecessary and more involved checks. Don't update
  // the cache for trivial cases to avoid pushing out a more useful entry.
  if (!url.DomainIs("google.com")) {
    return GURL();
  }

  // We aim to prevent duplicate Drive URLs from appearing between the Drive
  // document search provider and history/bookmark entries.
  // All URLs are canonicalized to a GURL form only used for deduplication and
  // not guaranteed to be usable for navigation.
  // Drive redirects are already handled by the regex in
  // `ExtractDocIdFromUrl()`. The logic below handles google.com redirects;
  // e.g., google.com/url?q=<url>.
  std::string url_str;
  std::string url_str_host;
  if (url.host() == "www.google.com" && url.path() == "/url") {
    if ((!net::GetValueForKeyInQuery(url, "q", &url_str) || url_str.empty()) &&
        (!net::GetValueForKeyInQuery(url, "url", &url_str) ||
         url_str.empty())) {
      return GURL();
    }
    url_str_host = GURL(url_str).host();
  } else {
    url_str = url.spec();
    url_str_host = url.host();
  }

  // Recheck the domain, since a google URL could redirect to a non-google URL.
  if (!base::EndsWith(url_str_host, "google.com",
                      base::CompareCase::INSENSITIVE_ASCII)) {
    return GURL();
  }

  // Filter out non-doc hosts. Do this before unescaping the URL below, as
  // unescaping can be expensive and valid hosts don't contain escapable chars.
  // Do this after simplifying the google.com redirect above, as that changes
  // the host.
  if (!ValidHostPrefix(url_str_host)) {
    return GURL();
  }

  // Unescape `url_str`.
  url_str = base::UnescapeURLComponent(
      url_str,
      base::UnescapeRule::PATH_SEPARATORS |
          base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS);

  const std::string id = ExtractDocIdFromUrl(url_str);

  // Canonicalize to the /open form without any extra args.
  // This is similar to what we expect from the server.
  GURL deduping_url =
      id.empty() ? GURL() : GURL("https://drive.google.com/open?id=" + id);
  cache->Put(url, deduping_url);
  return deduping_url;
}

}  // namespace url_deduplication