File: html_based_username_detector.cc

package info (click to toggle)
chromium 139.0.7258.127-1
links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (260 lines) | stat: -rw-r--r-- 11,201 bytes
parent folder | download | duplicates (5)
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/content/renderer/html_based_username_detector.h"

#include <algorithm>
#include <array>
#include <string>
#include <utility>
#include <vector>

#include "base/containers/contains.h"
#include "base/containers/flat_set.h"
#include "base/containers/span.h"
#include "base/i18n/case_conversion.h"
#include "base/memory/raw_span.h"
#include "base/strings/string_split.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/content/renderer/form_autofill_util.h"
#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
#include "components/autofill/core/common/form_data.h"
#include "third_party/blink/public/web/web_form_element.h"

using blink::WebFormControlElement;
using blink::WebFormElement;
using blink::WebInputElement;

namespace autofill {

namespace {

// Characters treated as token separators when HTML attribute values are split
// in AppendValueAndShortTokens(). Besides punctuation and the space character,
// the ASCII digits are listed too, so digits also split tokens and are dropped
// from the computed values.
constexpr char16_t kDelimiters[] =
    u"$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";

// Minimum length a word must have in order not to be considered a short word.
// Short words are not searched for as substrings of attribute values
// (especially after delimiters have been removed), because a short word may
// simply be a part of another word. A short word only counts when it forms a
// whole token, i.e. when it is enclosed between delimiters.
constexpr int kMinimumWordLength = 4;

// For each input element that can be a username, developer and user group
// values are computed. The user group value includes what a user sees: label,
// placeholder, aria-label (all are stored in FormFieldData.label). The
// developer group value consists of name and id attribute values.
// For each group the set of short tokens (tokens shorter than
// |kMinimumWordLength|) is computed as well.
struct UsernameFieldData {
  // Renderer id of the field this data was computed for.
  FieldRendererId renderer_id;
  // Lowercased, delimiter-free name attribute value followed by the id
  // attribute value; a '$' sentinel is inserted before the id part whenever the
  // name part is non-empty.
  std::u16string developer_value;
  // Tokens of the name and id attribute values that are shorter than
  // |kMinimumWordLength|.
  base::flat_set<std::u16string> developer_short_tokens;
  // Lowercased, delimiter-free form of FormFieldData.label.
  std::u16string user_value;
  // Tokens of the label that are shorter than |kMinimumWordLength|.
  base::flat_set<std::u16string> user_short_tokens;
};

// Words that the algorithm looks for are split into multiple categories based
// on feature reliability.
// A category may contain a latin dictionary and a non-latin dictionary. It is
// mandatory that it has a latin one, but a non-latin might be missing.
// "Latin" translations are the translations of the words for which the
// original translation is similar to the romanized translation (translation of
// the word only using ISO basic Latin alphabet).
// "Non-latin" translations are the translations of the words that have custom,
// country specific characters.
struct CategoryOfWords {
  // Searched in both the user-visible value and the developer value of a field.
  const base::raw_span<const std::u16string_view> latin_dictionary;
  // Searched only in the user-visible value of a field. May be empty.
  const base::raw_span<const std::u16string_view> non_latin_dictionary;
};

// Folds |raw_value| into the accumulated data of one field group:
//  * |raw_value| is lowercased and split on |kDelimiters|; the resulting
//    tokens are concatenated and appended to |*field_data_value|. If
//    |*field_data_value| already holds something, a '$' sentinel is appended
//    first.
//  * Every token shorter than |kMinimumWordLength| is additionally inserted
//    into |*field_data_short_tokens|.
void AppendValueAndShortTokens(
    const std::u16string& raw_value,
    std::u16string* field_data_value,
    base::flat_set<std::u16string>* field_data_short_tokens) {
  const std::u16string lowered = base::i18n::ToLower(raw_value);

  // The '$' guard keeps the tail of a previously appended value (e.g. the
  // field name) and the head of this one (e.g. the field id) from forming an
  // accidental word.
  if (!field_data_value->empty()) {
    field_data_value->push_back('$');
  }

  const std::vector<std::u16string_view> pieces = base::SplitStringPiece(
      lowered, kDelimiters, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

  field_data_value->reserve(field_data_value->size() + lowered.size());

  // Short tokens are gathered first and inserted in bulk, because a single
  // range insertion into a |base::flat_set| is preferable to many small ones.
  std::vector<std::u16string> short_pieces;
  for (const std::u16string_view& piece : pieces) {
    const bool is_short = piece.size() < kMinimumWordLength;
    if (is_short) {
      short_pieces.emplace_back(piece);
    }
    field_data_value->append(piece);
  }
  field_data_short_tokens->insert(short_pieces.begin(), short_pieces.end());
}

// Builds the detector data for |field|: its renderer id, the developer value
// (name attribute followed by id attribute), the user value (label), and the
// short-token set of each group.
UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) {
  UsernameFieldData result;
  result.renderer_id = field.renderer_id();

  // Developer group: name first, then id, both accumulated into one value.
  std::u16string* const developer_value = &result.developer_value;
  base::flat_set<std::u16string>* const developer_tokens =
      &result.developer_short_tokens;
  AppendValueAndShortTokens(field.name(), developer_value, developer_tokens);
  AppendValueAndShortTokens(field.id_attribute(), developer_value,
                            developer_tokens);

  // User group: only the label contributes.
  AppendValueAndShortTokens(field.label(), &result.user_value,
                            &result.user_short_tokens);
  return result;
}

// Computes detector data for the fields of |form_data| that can be username
// fields and appends it to |*possible_usernames_data|, preserving the order in
// which the fields appear in the form. A field is skipped when its name is
// empty or when it is a password input.
void InferUsernameFieldData(
    const FormData& form_data,
    std::vector<UsernameFieldData>* possible_usernames_data) {
  for (const FormFieldData& field : form_data.fields()) {
    // The two exclusion criteria are independent, so they are combined with
    // ||. Combining them with && would only skip nameless password inputs and
    // would let every named password field through as a username candidate.
    if (field.name().empty() ||
        field.form_control_type() == FormControlType::kInputPassword) {
      continue;
    }
    possible_usernames_data->push_back(ComputeUsernameFieldData(field));
  }
}

// Returns true if at least one word of |dictionary| matches the computed field
// information. A word shorter than |kMinimumWordLength| matches only if it is
// an element of |short_tokens|; any other word matches if it occurs as a
// substring of |value|.
bool CheckFieldWithDictionary(
    const std::u16string& value,
    const base::flat_set<std::u16string>& short_tokens,
    base::span<const std::u16string_view> dictionary) {
  const auto matches_field = [&value,
                              &short_tokens](std::u16string_view word) {
    if (word.length() < kMinimumWordLength) {
      // Short words must be whole tokens.
      return short_tokens.find(word) != short_tokens.end();
    }
    // Long words may appear anywhere inside the value.
    return value.find(word) != std::u16string::npos;
  };
  return std::any_of(dictionary.begin(), dictionary.end(), matches_field);
}

// Check if any word from |category| is encountered in computed field
// information (|possible_username|).
bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
                              const CategoryOfWords& category) {
  // For user value, search in latin and non-latin dictionaries, because this
  // value is user visible. For developer value, only look up in latin
  /// dictionaries.
  return CheckFieldWithDictionary(possible_username.user_value,
                                  possible_username.user_short_tokens,
                                  category.latin_dictionary) ||
         CheckFieldWithDictionary(possible_username.user_value,
                                  possible_username.user_short_tokens,
                                  category.non_latin_dictionary) ||
         CheckFieldWithDictionary(possible_username.developer_value,
                                  possible_username.developer_short_tokens,
                                  category.latin_dictionary);
}

// Drops from |*possible_usernames_data| every element whose computed values
// contain at least one negative word; such fields definitely cannot be
// usernames.
void RemoveFieldsWithNegativeWords(
    std::vector<UsernameFieldData>* possible_usernames_data) {
  static constexpr CategoryOfWords kNegativeCategory = {kNegativeLatin,
                                                        kNegativeNonLatin};

  const auto has_negative_word = [](const UsernameFieldData& candidate) {
    return ContainsWordFromCategory(candidate, kNegativeCategory);
  };
  std::erase_if(*possible_usernames_data, has_negative_word);
}

// Check if any word from the given category (|category|) appears in fields from
// the form (|possible_usernames_data|). If the category words appear in more
// than 2 fields, do nothing, because it may just be a prefix. If the words
// appears in 1 or 2 fields, the first field is added to |username_predictions|.
void FindWordsFromCategoryInForm(
    const std::vector<UsernameFieldData>& possible_usernames_data,
    const CategoryOfWords& category,
    std::vector<FieldRendererId>* username_predictions) {
  // Auxiliary element that contains the first field (in order of appearance in
  // the form) in which a substring is encountered.
  FieldRendererId chosen_field_renderer_id;

  size_t fields_found = 0;
  for (const UsernameFieldData& field_data : possible_usernames_data) {
    if (ContainsWordFromCategory(field_data, category)) {
      if (fields_found == 0) {
        chosen_field_renderer_id = field_data.renderer_id;
      }
      fields_found++;
    }
  }

  if (fields_found > 0 && fields_found <= 2)
    if (!base::Contains(*username_predictions, chosen_field_renderer_id))
      username_predictions->push_back(chosen_field_renderer_id);
}

// Runs the username detection on |form_data| and appends the detected fields
// to |*username_predictions| in the order of decreasing reliability.
// |username_predictions| must be non-null and empty on entry. Fields whose
// values contain a negative word are discarded before the positive categories
// are searched.
void FindUsernameFieldInternal(
    const FormData& form_data,
    std::vector<FieldRendererId>* username_predictions) {
  DCHECK(username_predictions);
  DCHECK(username_predictions->empty());

  // Each category pairs its latin dictionary with its non-latin counterpart.
  // The username category must use |kUsernameNonLatin| as the second entry;
  // repeating |kUsernameLatin| there would mean non-latin username words are
  // never matched against user-visible values.
  static constexpr CategoryOfWords kUsernameCategory = {kUsernameLatin,
                                                        kUsernameNonLatin};
  static constexpr CategoryOfWords kUserCategory = {kUserLatin, kUserNonLatin};
  // The technical and weak categories have no non-latin dictionary.
  static constexpr CategoryOfWords kTechnicalCategory = {kTechnicalWords, {}};
  static constexpr CategoryOfWords kWeakCategory = {kWeakWords, {}};
  // These categories contain words that point to username field.
  // Order of categories is vital: the detector searches for words in descending
  // order of probability to point to a username field.
  static constexpr auto kPositiveCategories = std::to_array<CategoryOfWords>(
      {kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory});
  std::vector<UsernameFieldData> possible_usernames_data;

  InferUsernameFieldData(form_data, &possible_usernames_data);
  RemoveFieldsWithNegativeWords(&possible_usernames_data);

  // These are the searches performed by the username detector.
  for (const CategoryOfWords& category : kPositiveCategories) {
    FindWordsFromCategoryInForm(possible_usernames_data, category,
                                username_predictions);
  }
}

}  // namespace

const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes(
    const FormData& form_data,
    UsernameDetectorCache* username_detector_cache) {
  // The cache will store the object referenced in the return value, so it must
  // exist. It can be empty.
  DCHECK(username_detector_cache);

  auto [form_position, cache_miss] = username_detector_cache->emplace(
      form_data.renderer_id(), std::vector<FieldRendererId>());

  if (cache_miss) {
    std::vector<FieldRendererId> username_predictions;
    FindUsernameFieldInternal(form_data, &username_predictions);
    if (!username_predictions.empty())
      form_position->second = std::move(username_predictions);
  }
  return form_position->second;
}

}  // namespace autofill