File: html_based_username_detector.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (260 lines) | stat: -rw-r--r-- 11,201 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/content/renderer/html_based_username_detector.h"

#include <algorithm>
#include <array>
#include <string>
#include <utility>
#include <vector>

#include "base/containers/contains.h"
#include "base/containers/flat_set.h"
#include "base/containers/span.h"
#include "base/i18n/case_conversion.h"
#include "base/memory/raw_span.h"
#include "base/strings/string_split.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/content/renderer/form_autofill_util.h"
#include "components/autofill/content/renderer/html_based_username_detector_vocabulary.h"
#include "components/autofill/core/common/form_data.h"
#include "third_party/blink/public/web/web_form_element.h"

using blink::WebFormControlElement;
using blink::WebFormElement;
using blink::WebInputElement;

namespace autofill {

namespace {

// List of separators that can appear in HTML attribute values.
constexpr char16_t kDelimiters[] =
    u"$\"\'?%*@!\\/&^#:+~`;,>|<.[](){}-_ 0123456789";

// Minimum length of a word, in order not to be considered short word. Short
// words will not be searched in attribute values (especially after delimiters
// removing), because a short word may be a part of another word. A short word
// should be enclosed between delimiters, otherwise an occurrence doesn't count.
constexpr int kMinimumWordLength = 4;

// For each input element that can be a username, developer and user group
// values are computed. The user group value includes what a user sees: label,
// placeholder, aria-label (all are stored in FormFieldData.label). The
// developer group value consists of name and id attribute values.
// For each group the set of short tokens (tokens shorter than
// |kMinimumWordLength|) is computed as well.
struct UsernameFieldData {
  FieldRendererId renderer_id;
  std::u16string developer_value;
  base::flat_set<std::u16string> developer_short_tokens;
  std::u16string user_value;
  base::flat_set<std::u16string> user_short_tokens;
};

// Words that the algorithm looks for are split into multiple categories based
// on feature reliability.
// A category may contain a latin dictionary and a non-latin dictionary. It is
// mandatory that it has a latin one, but a non-latin might be missing.
// "Latin" translations are the translations of the words for which the
// original translation is similar to the romanized translation (translation of
// the word only using ISO basic Latin alphabet).
// "Non-latin" translations are the translations of the words that have custom,
// country specific characters.
struct CategoryOfWords {
  const base::raw_span<const std::u16string_view> latin_dictionary;
  const base::raw_span<const std::u16string_view> non_latin_dictionary;
};

// 1. Removes delimiters from |raw_value| and appends the remainder to
// |*field_data_value|. A sentinel symbol is added first if |*field_data_value|
// is not empty.
// 2. Tokenizes and appends short tokens (shorter than |kMinimumWordLength|)
// from |raw_value| to |*field_data_short_tokens|, if any.
void AppendValueAndShortTokens(
    const std::u16string& raw_value,
    std::u16string* field_data_value,
    base::flat_set<std::u16string>* field_data_short_tokens) {
  const std::u16string lowercase_value = base::i18n::ToLower(raw_value);
  std::vector<std::u16string_view> tokens =
      base::SplitStringPiece(lowercase_value, kDelimiters,
                             base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

  // When computing the developer value, '$' safety guard is being added
  // between field name and id, so that forming of accidental words is
  // prevented.
  if (!field_data_value->empty())
    field_data_value->push_back('$');

  field_data_value->reserve(field_data_value->size() + lowercase_value.size());
  std::vector<std::u16string> short_tokens;
  for (const std::u16string_view& token : tokens) {
    if (token.size() < kMinimumWordLength)
      short_tokens.emplace_back(token);
    field_data_value->append(token);
  }
  // It is better to insert elements to a |base::flat_set| in one operation.
  field_data_short_tokens->insert(short_tokens.begin(), short_tokens.end());
}

// For the given |input_element|, compute developer and user value, along with
// sets of short tokens, and returns it.
UsernameFieldData ComputeUsernameFieldData(const FormFieldData& field) {
  UsernameFieldData field_data;
  field_data.renderer_id = field.renderer_id();

  AppendValueAndShortTokens(field.name(), &field_data.developer_value,
                            &field_data.developer_short_tokens);
  AppendValueAndShortTokens(field.id_attribute(), &field_data.developer_value,
                            &field_data.developer_short_tokens);
  AppendValueAndShortTokens(field.label(), &field_data.user_value,
                            &field_data.user_short_tokens);
  return field_data;
}

void InferUsernameFieldData(
    const FormData& form_data,
    std::vector<UsernameFieldData>* possible_usernames_data) {
  for (const FormFieldData& field : form_data.fields()) {
    if (field.name().empty() &&
        field.form_control_type() == FormControlType::kInputPassword) {
      continue;
    }
    possible_usernames_data->push_back(ComputeUsernameFieldData(field));
  }
}

// Check if any word from |dictionary| is encountered in computed field
// information (i.e. |value|, |tokens|).
bool CheckFieldWithDictionary(
    const std::u16string& value,
    const base::flat_set<std::u16string>& short_tokens,
    base::span<const std::u16string_view> dictionary) {
  for (std::u16string_view word : dictionary) {
    if (word.length() < kMinimumWordLength) {
      // Treat short words by looking them up in the tokens set.
      if (short_tokens.find(word) != short_tokens.end()) {
        return true;
      }
    } else {
      // Treat long words by looking them up as a substring in |value|.
      if (value.find(word) != std::string::npos) {
        return true;
      }
    }
  }
  return false;
}

// Check if any word from |category| is encountered in computed field
// information (|possible_username|).
bool ContainsWordFromCategory(const UsernameFieldData& possible_username,
                              const CategoryOfWords& category) {
  // For user value, search in latin and non-latin dictionaries, because this
  // value is user visible. For developer value, only look up in latin
  /// dictionaries.
  return CheckFieldWithDictionary(possible_username.user_value,
                                  possible_username.user_short_tokens,
                                  category.latin_dictionary) ||
         CheckFieldWithDictionary(possible_username.user_value,
                                  possible_username.user_short_tokens,
                                  category.non_latin_dictionary) ||
         CheckFieldWithDictionary(possible_username.developer_value,
                                  possible_username.developer_short_tokens,
                                  category.latin_dictionary);
}

// Remove from |possible_usernames_data| the elements that definitely cannot be
// usernames, because their computed values contain at least one negative word.
void RemoveFieldsWithNegativeWords(
    std::vector<UsernameFieldData>* possible_usernames_data) {
  static constexpr CategoryOfWords kNegativeCategory = {kNegativeLatin,
                                                        kNegativeNonLatin};

  std::erase_if(
      *possible_usernames_data, [](const UsernameFieldData& possible_username) {
        return ContainsWordFromCategory(possible_username, kNegativeCategory);
      });
}

// Check if any word from the given category (|category|) appears in fields from
// the form (|possible_usernames_data|). If the category words appear in more
// than 2 fields, do nothing, because it may just be a prefix. If the words
// appears in 1 or 2 fields, the first field is added to |username_predictions|.
void FindWordsFromCategoryInForm(
    const std::vector<UsernameFieldData>& possible_usernames_data,
    const CategoryOfWords& category,
    std::vector<FieldRendererId>* username_predictions) {
  // Auxiliary element that contains the first field (in order of appearance in
  // the form) in which a substring is encountered.
  FieldRendererId chosen_field_renderer_id;

  size_t fields_found = 0;
  for (const UsernameFieldData& field_data : possible_usernames_data) {
    if (ContainsWordFromCategory(field_data, category)) {
      if (fields_found == 0) {
        chosen_field_renderer_id = field_data.renderer_id;
      }
      fields_found++;
    }
  }

  if (fields_found > 0 && fields_found <= 2)
    if (!base::Contains(*username_predictions, chosen_field_renderer_id))
      username_predictions->push_back(chosen_field_renderer_id);
}

// Find username elements if there is no cached result for the given form and
// add them to |username_predictions| in the order of decreasing reliability.
void FindUsernameFieldInternal(
    const FormData& form_data,
    std::vector<FieldRendererId>* username_predictions) {
  DCHECK(username_predictions);
  DCHECK(username_predictions->empty());

  static constexpr CategoryOfWords kUsernameCategory = {kUsernameLatin,
                                                        kUsernameLatin};
  static constexpr CategoryOfWords kUserCategory = {kUserLatin, kUserNonLatin};
  static constexpr CategoryOfWords kTechnicalCategory = {kTechnicalWords, {}};
  static constexpr CategoryOfWords kWeakCategory = {kWeakWords, {}};
  // These categories contain words that point to username field.
  // Order of categories is vital: the detector searches for words in descending
  // order of probability to point to a username field.
  static constexpr auto kPositiveCategories = std::to_array<CategoryOfWords>(
      {kUsernameCategory, kUserCategory, kTechnicalCategory, kWeakCategory});
  std::vector<UsernameFieldData> possible_usernames_data;

  InferUsernameFieldData(form_data, &possible_usernames_data);
  RemoveFieldsWithNegativeWords(&possible_usernames_data);

  // These are the searches performed by the username detector.
  for (const CategoryOfWords& category : kPositiveCategories) {
    FindWordsFromCategoryInForm(possible_usernames_data, category,
                                username_predictions);
  }
}

}  // namespace

const std::vector<FieldRendererId>& GetPredictionsFieldBasedOnHtmlAttributes(
    const FormData& form_data,
    UsernameDetectorCache* username_detector_cache) {
  // The cache will store the object referenced in the return value, so it must
  // exist. It can be empty.
  DCHECK(username_detector_cache);

  auto [form_position, cache_miss] = username_detector_cache->emplace(
      form_data.renderer_id(), std::vector<FieldRendererId>());

  if (cache_miss) {
    std::vector<FieldRendererId> username_predictions;
    FindUsernameFieldInternal(form_data, &username_predictions);
    if (!username_predictions.empty())
      form_position->second = std::move(username_predictions);
  }
  return form_position->second;
}

}  // namespace autofill