File: language_detection_util.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (373 lines) | stat: -rw-r--r-- 14,581 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
// Copyright 2014 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/translate/core/language_detection/language_detection_util.h"

#include <stddef.h>

#include <array>
#include <string_view>

#include "base/containers/fixed_flat_set.h"
#include "base/logging.h"
#include "base/metrics/histogram_base.h"
#include "base/metrics/histogram_functions.h"
#include "base/metrics/histogram_macros.h"
#include "base/metrics/metrics_hashes.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "components/language/core/common/language_util.h"
#include "components/language_detection/core/constants.h"
#include "components/translate/core/common/translate_metrics.h"
#include "components/translate/core/language_detection/chinese_script_classifier.h"
#include "third_party/cld_3/src/src/nnet_language_identifier.h"

namespace {

// Similar language code list. Some languages are very similar and difficult
// for CLD to distinguish.
// Each entry pairs a language code with the id of the group it belongs to.
struct SimilarLanguageCode {
  // Language code for this entry. GetSimilarLanguageGroupCode() matches it as
  // a prefix of the language being looked up.
  const char* const code;
  // Group id. Entries with the same id are considered similar to each other;
  // GetSimilarLanguageGroupCode() returns 0 when no entry matches.
  int group;
};

// Table consulted by GetSimilarLanguageGroupCode(). Bosnian ("bs") and
// Croatian ("hr") share group 1; Hindi ("hi") and Nepali ("ne") share group 2.
constexpr auto kSimilarLanguageCodes = std::to_array<SimilarLanguageCode>({
    {"bs", 1},
    {"hr", 1},
    {"hi", 2},
    {"ne", 2},
});

// Checks |kSimilarLanguageCodes| and returns group code.
int GetSimilarLanguageGroupCode(const std::string& language) {
  for (size_t i = 0; i < std::size(kSimilarLanguageCodes); ++i) {
    if (language.find(kSimilarLanguageCodes[i].code) != 0)
      continue;
    return kSimilarLanguageCodes[i].group;
  }
  return 0;
}

// Normalizes |code| in place. The steps run in a fixed order:
//   1. translate::CorrectLanguageCodeTypo() repairs well-known format errors.
//   2. If the result fails translate::IsValidLanguageCode(), |code| is cleared.
//   3. Otherwise it is passed through language::ToTranslateLanguageSynonym().
// A null pointer or an empty string is left untouched.
void ApplyLanguageCodeCorrection(std::string* code) {
  if (code == nullptr || code->empty()) {
    return;
  }

  // Typo correction comes first so that validation sees the repaired form.
  translate::CorrectLanguageCodeTypo(code);

  if (translate::IsValidLanguageCode(*code)) {
    language::ToTranslateLanguageSynonym(code);
  } else {
    // Not a well-formed code: report "no language" to the caller.
    code->clear();
  }
}

// Picks the page's declared language. The html lang attribute (|html_lang|) is
// preferred; when it is empty or does not survive correction, the
// Content-Language value (|content_lang|) is corrected and used instead.
// Returns an empty string when neither yields a usable code.
std::string GetHTMLOrHTTPContentLanguage(const std::string& content_lang,
                                         const std::string& html_lang) {
  // Try the html lang attribute first.
  std::string from_html = html_lang;
  ApplyLanguageCodeCorrection(&from_html);
  if (!from_html.empty()) {
    return from_html;
  }

  // Fall back to Content-Language. Correction is a no-op on an empty string,
  // so an empty |content_lang| simply yields an empty result.
  std::string from_http = content_lang;
  ApplyLanguageCodeCorrection(&from_http);
  return from_http;
}

// Returns true when the detection model can supply a sub code that the page's
// declared language lacks. This is only the case for Chinese: the page says
// plain "zh" and the model detected some "zh-*" variant.
bool CanModelComplementSubCode(const std::string& page_language,
                               const std::string& model_detected_language) {
  // Translate server cannot treat general Chinese. If Content-Language and
  // the detection model agree that the language is Chinese and Content-Language
  // doesn't know which dialect is used, the model language has priority.
  // TODO(hajimehoshi): How about the other dialects like zh-MO?
  if (page_language != "zh") {
    return false;
  }
  return base::StartsWith(model_detected_language, "zh-",
                          base::CompareCase::INSENSITIVE_ASCII);
}

}  // namespace

namespace translate {

std::string FilterDetectedLanguage(const std::string& utf8_text,
                                   const std::string& detected_language,
                                   bool is_detection_reliable) {
  // Ignore unreliable, "unknown", and xx-Latn predictions that are currently
  // not supported.
  if (!is_detection_reliable)
    return language_detection::kUnknownLanguageCode;
  // TODO(crbug.com/40169055): Determine if ar-Latn and hi-Latn need to be added
  // for the TFLite-based detection model.
  if (detected_language == "bg-Latn" || detected_language == "el-Latn" ||
      detected_language == "ja-Latn" || detected_language == "ru-Latn" ||
      detected_language == "zh-Latn" ||
      detected_language == chrome_lang_id::NNetLanguageIdentifier::kUnknown) {
    return language_detection::kUnknownLanguageCode;
  }

  if (detected_language == "zh") {
    // If prediction is "zh" (Chinese), then we need to determine whether the
    // text is zh-Hant (Chinese Traditional) or zh-Hans (Chinese Simplified).
    translate::ChineseScriptClassifier zh_classifier;

    // The Classify function returns either "zh-Hant" or "zh-Hans".
    // Convert to the old-style language codes used by the Translate API.
    const std::string zh_classification = zh_classifier.Classify(utf8_text);
    if (zh_classification == "zh-Hant")
      return "zh-TW";
    if (zh_classification == "zh-Hans")
      return "zh-CN";
    return language_detection::kUnknownLanguageCode;
  }
  // The detection is reliable and none of the cases that are not handled by the
  // language detection model.
  return detected_language;
}

// Returns the ISO 639 language code of the specified |utf8_text|, or 'unknown'
// if it failed. |is_model_reliable| will be set as true if CLD says the
// detection is reliable and |model_reliability_score| will provide the model's
// confidence in that prediction.
std::string DetermineTextLanguage(const std::string& utf8_text,
                                  bool* is_model_reliable,
                                  float& model_reliability_score) {
  // Make a prediction.
  base::TimeTicks lang_id_start = base::TimeTicks::Now();
  chrome_lang_id::NNetLanguageIdentifier lang_id;
  const chrome_lang_id::NNetLanguageIdentifier::Result lang_id_result =
      lang_id.FindTopNMostFreqLangs(utf8_text, /*num_langs=*/1).at(0);
  base::UmaHistogramTimes("Translate.CLD3.TopLanguageEvaluationDuration",
                          base::TimeTicks::Now() - lang_id_start);
  const bool is_detection_reliable = lang_id_result.is_reliable;
  const float model_probability = lang_id_result.probability;
  const std::string& detected_language = lang_id_result.language;

  // Update histograms.
  const base::HistogramBase::Sample32 pred_lang_hash =
      static_cast<base::HistogramBase::Sample32>(
          base::HashMetricName(detected_language));
  base::UmaHistogramSparse("Translate.CLD3.LanguageDetected", pred_lang_hash);
  if (detected_language != chrome_lang_id::NNetLanguageIdentifier::kUnknown) {
    UMA_HISTOGRAM_PERCENTAGE("Translate.CLD3.LanguagePercentage",
                             static_cast<int>(100 * lang_id_result.proportion));
  }

  if (is_model_reliable != nullptr) {
    *is_model_reliable = is_detection_reliable;
  }
  model_reliability_score = model_probability;
  return FilterDetectedLanguage(utf8_text, detected_language,
                                is_detection_reliable);
}

// Runs the detection model on |contents| and then reconciles its prediction
// with the page's declared language (|html_lang| / Content-Language |code|)
// through the string-based DeterminePageLanguage() overload.
// |model_detected_language| and |is_model_reliable| are optional outputs;
// |model_detected_language| receives the prediction as it was before
// language::ToTranslateLanguageSynonym() is applied.
// |model_reliability_score| always receives the model's score.
std::string DeterminePageLanguage(const std::string& code,
                                  const std::string& html_lang,
                                  const std::u16string& contents,
                                  std::string* model_detected_language,
                                  bool* is_model_reliable,
                                  float& model_reliability_score) {
  // First determine the language of the text contents.
  bool text_is_reliable = false;
  float text_score = 0.0;
  std::string text_language = DetermineTextLanguage(
      base::UTF16ToUTF8(contents), &text_is_reliable, text_score);

  // Publish the raw model results to the caller.
  if (model_detected_language) {
    *model_detected_language = text_language;
  }
  if (is_model_reliable) {
    *is_model_reliable = text_is_reliable;
  }
  model_reliability_score = text_score;

  // The comparison against the declared language uses the synonym form.
  language::ToTranslateLanguageSynonym(&text_language);
  return DeterminePageLanguage(code, html_lang, text_language,
                               text_is_reliable);
}

// Determines the page language from the declared values only (html lang
// attribute, then Content-Language |code|), without consulting the model.
// Reports |language_verification_type| and returns
// language_detection::kUnknownLanguageCode when no usable code is declared.
std::string DeterminePageLanguageNoModel(
    const std::string& code,
    const std::string& html_lang,
    LanguageVerificationType language_verification_type) {
  translate::ReportLanguageVerification(language_verification_type);

  std::string declared_language = GetHTMLOrHTTPContentLanguage(code, html_lang);
  if (declared_language.empty()) {
    return language_detection::kUnknownLanguageCode;
  }
  return declared_language;
}

// Now consider the web page language details along with the contents language.
std::string DeterminePageLanguage(const std::string& code,
                                  const std::string& html_lang,
                                  const std::string& model_detected_language,
                                  bool is_model_reliable) {
  std::string language = GetHTMLOrHTTPContentLanguage(code, html_lang);
  // If |language| is empty, just use model result even though it might be
  // language_detection::kUnknownLanguageCode.
  if (language.empty()) {
    translate::ReportLanguageVerification(
        translate::LanguageVerificationType::kModelOnly);
    return model_detected_language;
  }

  // If |model_detected_language| is empty, just use |language|.
  if (model_detected_language.empty() ||
      model_detected_language == language_detection::kUnknownLanguageCode) {
    translate::ReportLanguageVerification(
        translate::LanguageVerificationType::kModelUnknown);
    return language;
  }

  if (CanModelComplementSubCode(language, model_detected_language)) {
    translate::ReportLanguageVerification(
        translate::LanguageVerificationType::kModelComplementsCountry);
    return model_detected_language;
  }

  if (IsSameOrSimilarLanguages(language, model_detected_language)) {
    translate::ReportLanguageVerification(
        translate::LanguageVerificationType::kModelAgrees);
    return language;
  }

  if (MaybeServerWrongConfiguration(language, model_detected_language)) {
    translate::ReportLanguageVerification(
        translate::LanguageVerificationType::kModelOverrides);
    return model_detected_language;
  }

  // Content-Language value might be wrong because model says that this page is
  // written in another language with confidence. In this case, Chrome doesn't
  // rely on any of the language codes, and gives up suggesting a translation.
  translate::ReportLanguageVerification(
      translate::LanguageVerificationType::kModelDisagrees);
  return language_detection::kUnknownLanguageCode;
}

// Repairs frequent formatting mistakes in |code|, in place:
//   - keeps only the first entry of a comma-separated list,
//   - strips surrounding ASCII whitespace,
//   - turns the first underscore into a dash,
//   - lower-cases everything before the first dash and upper-cases the rest
//     (or lower-cases the whole string when there is no dash).
// |code| must not be null.
void CorrectLanguageCodeTypo(std::string* code) {
  DCHECK(code);

  // There may be more than one language specified; just keep the first one.
  const size_t comma_pos = code->find(',');
  if (comma_pos != std::string::npos) {
    code->erase(comma_pos);
  }
  base::TrimWhitespaceASCII(*code, base::TRIM_ALL, code);

  // An underscore instead of a dash is a frequent mistake.
  const size_t underscore_pos = code->find('_');
  if (underscore_pos != std::string::npos) {
    (*code)[underscore_pos] = '-';
  }

  // Without a dash the whole code is lower-cased.
  const size_t dash_pos = code->find('-');
  if (dash_pos == std::string::npos) {
    *code = base::ToLowerASCII(*code);
    return;
  }

  // Lower-case the part before the dash; upper-case the dash and what follows.
  *code = base::ToLowerASCII(code->substr(0, dash_pos)) +
          base::ToUpperASCII(code->substr(dash_pos));
}

// Roughly checks that |code| looks like /[a-zA-Z]{1,3}(-[a-zA-Z]{2})?/: a main
// part of one to three ASCII letters, optionally followed by a dash and a
// two-letter sub code. The parts are whitespace-trimmed before being checked.
bool IsValidLanguageCode(const std::string& code) {
  // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
  const std::vector<std::string_view> parts = base::SplitStringPiece(
      code, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  if (parts.empty() || parts.size() > 2) {
    return false;
  }

  // True when every character of |text| is an ASCII letter.
  const auto is_ascii_letters = [](std::string_view text) {
    for (char c : text) {
      if (!base::IsAsciiAlpha(c)) {
        return false;
      }
    }
    return true;
  };

  const std::string_view main_code = parts[0];
  if (main_code.empty() || main_code.size() > 3 ||
      !is_ascii_letters(main_code)) {
    return false;
  }

  // A bare main code is enough.
  if (parts.size() == 1) {
    return true;
  }

  const std::string_view sub_code = parts[1];
  return sub_code.size() == 2 && is_ascii_letters(sub_code);
}

// Returns true when |page_language| and |model_detected_language| have the
// same main language part (the text before the first dash; the country/sub
// code is ignored), or when both belong to the same non-zero group according
// to GetSimilarLanguageGroupCode(). Returns false if either code splits into
// no parts at all.
bool IsSameOrSimilarLanguages(const std::string& page_language,
                              const std::string& model_detected_language) {
  // The pieces are views into the arguments, which outlive this function
  // body, so no per-chunk string copies are needed.
  const std::vector<std::string_view> page_chunks = base::SplitStringPiece(
      page_language, "-", base::TRIM_WHITESPACE, base::SPLIT_WANT_ALL);
  if (page_chunks.empty()) {
    return false;
  }

  const std::vector<std::string_view> model_chunks = base::SplitStringPiece(
      model_detected_language, "-", base::TRIM_WHITESPACE,
      base::SPLIT_WANT_ALL);
  if (model_chunks.empty()) {
    return false;
  }

  // Language code part of |page_language| is matched to the one of
  // |model_detected_language|. Country code is ignored here.
  if (page_chunks[0] == model_chunks[0]) {
    return true;
  }

  // Otherwise the two are only "similar" if both are in the similar language
  // list and belong to the same group; group 0 means "not in the list".
  const int page_group = GetSimilarLanguageGroupCode(page_language);
  return page_group != 0 &&
         page_group == GetSimilarLanguageGroupCode(model_detected_language);
}

bool IsServerWrongConfigurationLanguage(const std::string& language_code) {
  // Well-known languages which often have wrong server configuration of
  // Content-Language: en.
  constexpr auto kSet = base::MakeFixedFlatSet<std::string_view>(
      {"es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it",
       "th"});

  return kSet.contains(language_code);
}

// Returns true when the declared |page_language| is probably a server default
// rather than the real page language: it starts with "en" (compared
// case-insensitively) and |model_detected_language| is one of the languages
// accepted by IsServerWrongConfigurationLanguage().
bool MaybeServerWrongConfiguration(const std::string& page_language,
                                   const std::string& model_detected_language) {
  // A server provides a language meta information representing "en-*". But it
  // might be just a default value due to missing user configuration.
  // Let's trust |model_detected_language| if the determined language is not
  // difficult to distinguish from English, and the language is one of
  // well-known languages which often provide "en-*" meta information
  // mistakenly. Any |page_language| that is not "en-*" is respected as is.
  const bool declared_english = base::StartsWith(
      page_language, "en", base::CompareCase::INSENSITIVE_ASCII);
  return declared_english &&
         IsServerWrongConfigurationLanguage(model_detected_language);
}

}  // namespace translate