1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260
|
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "components/autofill/content/renderer/html_based_username_detector.h"
#include <algorithm>
#include <array>
#include <string>
#include <utility>
#include <vector>
#include "base/containers/contains.h"
#include "base/containers/flat_set.h"
#include "base/containers/span.h"
#include "base/i18n/case_conversion.h"
#include "base/memory/raw_span.h"
#include "base/strings/string_split.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/content/renderer/form_autofill_util.h"
#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
#include "components/autofill/core/common/form_data.h"
#include "third_party/blink/public/web/web_form_element.h"
using blink::WebFormControlElement;
using blink::WebFormElement;
using blink::WebInputElement;
namespace autofill {
namespace {
// Characters that act as token separators when HTML attribute values are
// split: punctuation, the space character and the decimal digits. Passed as
// the separator set when tokenizing in AppendValueAndShortTokens().
constexpr char16_t kDelimiters[] =
u"$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";
// Length threshold that separates "short" words from regular ones: a word with
// fewer than |kMinimumWordLength| characters is short. Short words are not
// searched for as substrings of the delimiter-stripped attribute values,
// because a short word may just be a part of another word. A short word only
// counts when it appears as a whole token, i.e. enclosed between delimiters.
constexpr int kMinimumWordLength = 4;
// Data computed for every field that may be a username. Two groups of values
// are kept:
//  * The user group holds what a user sees: label, placeholder, aria-label
//    (all are stored in FormFieldData.label).
//  * The developer group holds the name and id attribute values.
// For each group the set of short tokens (tokens shorter than
// |kMinimumWordLength|) is stored as well.
struct UsernameFieldData {
// Renderer id of the field this data was computed for.
FieldRendererId renderer_id;
// Lowercased, delimiter-stripped name and id attribute values, joined by a
// '$' sentinel.
std::u16string developer_value;
// Short tokens found in the name and id attribute values.
base::flat_set<std::u16string> developer_short_tokens;
// Lowercased, delimiter-stripped label of the field.
std::u16string user_value;
// Short tokens found in the label of the field.
base::flat_set<std::u16string> user_short_tokens;
};
// The words the algorithm looks for are split into multiple categories based
// on feature reliability.
// A category may contain a latin dictionary and a non-latin dictionary. The
// latin one is mandatory, the non-latin one may be missing (i.e. be an empty
// span).
// "Latin" translations are the translations of the words for which the
// original translation is similar to the romanized translation (translation of
// the word only using ISO basic Latin alphabet).
// "Non-latin" translations are the translations of the words that have custom,
// country specific characters.
struct CategoryOfWords {
// Words written with the ISO basic Latin alphabet. Always present.
const base::raw_span<const std::u16string_view> latin_dictionary;
// Words containing country specific characters. May be empty.
const base::raw_span<const std::u16string_view> non_latin_dictionary;
};
// Folds |raw_value| into the accumulated data of one value group.
//
// |raw_value| is lowercased and split on |kDelimiters|. The resulting tokens
// are concatenated (i.e. the delimiters are dropped) and appended to
// |*field_data_value|; if |*field_data_value| already holds something, a '$'
// sentinel is inserted first so that the end of the previous value and the
// start of the new one cannot accidentally form a word. Every token shorter
// than |kMinimumWordLength| is additionally recorded in
// |*field_data_short_tokens|.
void AppendValueAndShortTokens(
    const std::u16string& raw_value,
    std::u16string* field_data_value,
    base::flat_set<std::u16string>* field_data_short_tokens) {
  const std::u16string lowered = base::i18n::ToLower(raw_value);

  // Guard between consecutive values (e.g. field name and id).
  if (!field_data_value->empty()) {
    field_data_value->push_back('$');
  }
  field_data_value->reserve(field_data_value->size() + lowered.size());

  // The pieces are views into |lowered|, which outlives the loop.
  std::vector<std::u16string> collected_short_tokens;
  for (std::u16string_view piece :
       base::SplitStringPiece(lowered, kDelimiters, base::TRIM_WHITESPACE,
                              base::SPLIT_WANT_NONEMPTY)) {
    field_data_value->append(piece);
    if (piece.size() < kMinimumWordLength) {
      collected_short_tokens.emplace_back(piece);
    }
  }

  // A single bulk insertion is the preferred way to fill a |base::flat_set|.
  field_data_short_tokens->insert(collected_short_tokens.begin(),
                                  collected_short_tokens.end());
}
// Builds the UsernameFieldData for |field|: the developer value is assembled
// from the field's name followed by its id attribute, the user value from its
// label. The short-token sets of both groups are filled along the way.
UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) {
  UsernameFieldData result;
  result.renderer_id = field.renderer_id();

  // Developer group: name first, then id (separated by the '$' sentinel).
  std::u16string* const developer_value = &result.developer_value;
  base::flat_set<std::u16string>* const developer_tokens =
      &result.developer_short_tokens;
  AppendValueAndShortTokens(field.name(), developer_value, developer_tokens);
  AppendValueAndShortTokens(field.id_attribute(), developer_value,
                            developer_tokens);

  // User group: the label only.
  AppendValueAndShortTokens(field.label(), &result.user_value,
                            &result.user_short_tokens);
  return result;
}
// Appends to |*possible_usernames_data| the computed UsernameFieldData of
// every field of |form_data| that may be a username.
//
// A field is not a candidate if either of the following holds:
//  * it has no name, or
//  * it is a password field.
// The two conditions are independent, hence they are combined with `||`.
// Requiring both at once would let named password fields through as username
// candidates.
void InferUsernameFieldData(
    const FormData& form_data,
    std::vector<UsernameFieldData>* possible_usernames_data) {
  for (const FormFieldData& field : form_data.fields()) {
    if (field.name().empty() ||
        field.form_control_type() == FormControlType::kInputPassword) {
      continue;
    }
    possible_usernames_data->push_back(ComputeUsernameFieldData(field));
  }
}
// Returns true if at least one word of |dictionary| is found in the computed
// field information. A word shorter than |kMinimumWordLength| matches only if
// it is an element of |short_tokens|; any other word matches if it occurs as a
// substring of |value|.
bool CheckFieldWithDictionary(
    const std::u16string& value,
    const base::flat_set<std::u16string>& short_tokens,
    base::span<const std::u16string_view> dictionary) {
  const auto matches_field = [&](std::u16string_view entry) {
    const bool is_short_word = entry.length() < kMinimumWordLength;
    // Short words must be whole tokens; long words may be substrings.
    return is_short_word ? short_tokens.find(entry) != short_tokens.end()
                         : value.find(entry) != std::u16string::npos;
  };
  return std::any_of(dictionary.begin(), dictionary.end(), matches_field);
}
// Check if any word from |category| is encountered in computed field
// information (|possible_username|).
bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
const CategoryOfWords& category) {
// For user value, search in latin and non-latin dictionaries, because this
// value is user visible. For developer value, only look up in latin
/// dictionaries.
return CheckFieldWithDictionary(possible_username.user_value,
possible_username.user_short_tokens,
category.latin_dictionary) ||
CheckFieldWithDictionary(possible_username.user_value,
possible_username.user_short_tokens,
category.non_latin_dictionary) ||
CheckFieldWithDictionary(possible_username.developer_value,
possible_username.developer_short_tokens,
category.latin_dictionary);
}
// Remove from |possible_usernames_data| the elements that definitely cannot be
// usernames, because their computed values contain at least one negative word.
void RemoveFieldsWithNegativeWords(
std::vector<UsernameFieldData>* possible_usernames_data) {
static constexpr CategoryOfWords kNegativeCategory = {kNegativeLatin,
kNegativeNonLatin};
std::erase_if(
*possible_usernames_data, [](const UsernameFieldData& possible_username) {
return ContainsWordFromCategory(possible_username, kNegativeCategory);
});
}
// Check if any word from the given category (|category|) appears in fields from
// the form (|possible_usernames_data|). If the category words appear in more
// than 2 fields, do nothing, because it may just be a prefix. If the words
// appears in 1 or 2 fields, the first field is added to |username_predictions|.
void FindWordsFromCategoryInForm(
const std::vector<UsernameFieldData>& possible_usernames_data,
const CategoryOfWords& category,
std::vector<FieldRendererId>* username_predictions) {
// Auxiliary element that contains the first field (in order of appearance in
// the form) in which a substring is encountered.
FieldRendererId chosen_field_renderer_id;
size_t fields_found = 0;
for (const UsernameFieldData& field_data : possible_usernames_data) {
if (ContainsWordFromCategory(field_data, category)) {
if (fields_found == 0) {
chosen_field_renderer_id = field_data.renderer_id;
}
fields_found++;
}
}
if (fields_found > 0 && fields_found <= 2)
if (!base::Contains(*username_predictions, chosen_field_renderer_id))
username_predictions->push_back(chosen_field_renderer_id);
}
// Runs the HTML based username detection on |form_data| and appends the
// detected fields to |*username_predictions| in the order of decreasing
// reliability. |username_predictions| must be non-null and empty on entry.
//
// Steps:
//  1. Compute the candidate data for the fields of the form.
//  2. Discard candidates that contain a negative word.
//  3. Search the remaining candidates for words of each positive category.
void FindUsernameFieldInternal(
    const FormData& form_data,
    std::vector<FieldRendererId>* username_predictions) {
  DCHECK(username_predictions);
  DCHECK(username_predictions->empty());

  // Each category pairs a latin dictionary with its non-latin counterpart, if
  // one exists. The username category must use the non-latin username
  // dictionary as its second member; passing the latin dictionary twice would
  // mean labels in non-latin scripts are never matched for this category.
  static constexpr CategoryOfWords kUsernameCategory = {kUsernameLatin,
                                                        kUsernameNonLatin};
  static constexpr CategoryOfWords kUserCategory = {kUserLatin, kUserNonLatin};
  static constexpr CategoryOfWords kTechnicalCategory = {kTechnicalWords, {}};
  static constexpr CategoryOfWords kWeakCategory = {kWeakWords, {}};

  // These categories contain words that point to username field.
  // Order of categories is vital: the detector searches for words in descending
  // order of probability to point to a username field.
  static constexpr auto kPositiveCategories = std::to_array<CategoryOfWords>(
      {kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory});

  std::vector<UsernameFieldData> possible_usernames_data;
  InferUsernameFieldData(form_data, &possible_usernames_data);
  RemoveFieldsWithNegativeWords(&possible_usernames_data);

  // These are the searches performed by the username detector.
  for (const CategoryOfWords& category : kPositiveCategories) {
    FindWordsFromCategoryInForm(possible_usernames_data, category,
                                username_predictions);
  }
}
} // namespace
const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes(
const FormData& form_data,
UsernameDetectorCache* username_detector_cache) {
// The cache will store the object referenced in the return value, so it must
// exist. It can be empty.
DCHECK(username_detector_cache);
auto [form_position, cache_miss] = username_detector_cache->emplace(
form_data.renderer_id(), std::vector<FieldRendererId>());
if (cache_miss) {
std::vector<FieldRendererId> username_predictions;
FindUsernameFieldInternal(form_data, &username_predictions);
if (!username_predictions.empty())
form_position->second = std::move(username_predictions);
}
return form_position->second;
}
} // namespace autofill
|