File: transliterator.cc

package info (click to toggle)
chromium 138.0.7204.183-1~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm-proposed-updates
  • size: 6,080,960 kB
  • sloc: cpp: 34,937,079; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,954; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,811; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (166 lines) | stat: -rw-r--r-- 6,487 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
// Copyright 2025 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/core/browser/data_model/transliterator.h"

#include <memory>

#include "base/containers/fixed_flat_set.h"
#include "base/containers/flat_map.h"
#include "base/feature_list.h"
#include "base/i18n/transliterator.h"
#include "base/i18n/unicodestring.h"
#include "base/memory/ptr_util.h"
#include "base/metrics/histogram_functions.h"
#include "base/strings/string_util.h"
#include "base/time/time.h"
#include "components/autofill/core/browser/country_type.h"
#include "components/autofill/core/browser/data_model/addresses/address.h"
#include "components/autofill/core/common/autofill_features.h"

namespace autofill {

namespace {

// Identifies the transliteration rule set to apply to a string. Also used as
// the key under which the corresponding transliterator is cached.
enum class TransliterationId {
  // ICU "Katakana-Hiragana" transliteration.
  kKatakanaToHiragana,
  // ICU "Hiragana-Katakana" transliteration.
  kHiraganaToKatakana,
  // Simplified version of the ICU "::de-ASCII" transliteration: umlauts are
  // expanded ("ö" becomes "oe", "ä" becomes "ae", "ü" becomes "ue") and then
  // the `kDefault` rules are applied.
  kGerman,
  // Converts to lowercase and removes diacritics.
  kDefault,
};

// Country codes for which `TransliterationId::kGerman` may be used instead of
// `TransliterationId::kDefault`. Membership alone is not sufficient: the
// German rules are additionally gated on the
// `features::kAutofillEnableGermanTransliteration` feature.
static constexpr auto kCountriesWithGermanTransliteration =
    base::MakeFixedFlatSet<std::string_view>(
        {"AT", "BE", "CH", "DE", "IT", "LI", "LU"});

base::flat_map<TransliterationId,
               std::unique_ptr<const base::i18n::Transliterator>>&
GetTransliteratorsMap() {
  using TransliteratorCache =
      base::flat_map<TransliterationId,
                     std::unique_ptr<const base::i18n::Transliterator>>;

  // ICU does not cache transliterators that are created from rules, and
  // creating them caused ANR errors on iOS and Android. Until the rules can be
  // generated at compile time, every transliterator is therefore kept in
  // memory for the whole lifetime of the browser: the map lives in a
  // `base::NoDestructor` and is never torn down.
  static base::NoDestructor<TransliteratorCache> cache;
  return *cache;
}

// Builds a new transliterator for `id`. Records whether the creation succeeded
// in the "Autofill.TransliteratorInitStatus" histogram and returns nullptr on
// failure.
std::unique_ptr<base::i18n::Transliterator> CreateTransliterator(
    TransliterationId id) {
  // Creates the rule-based "NormalizeForAddresses" transliterator.
  // `country_specific_rules` (possibly empty) run first, followed by the
  // common normalization steps, in this order:
  //   "::NFD;" decomposes and normalizes ("â" becomes "a" and " ̂").
  //   "::[:Nonspacing Mark:] Remove;" removes the " ̂".
  //   "::Lower;" converts the result to lower case.
  //   "::NFC;" re-composes the decomposed characters.
  //   "::Latin-ASCII;" converts various other Latin characters to an ASCII
  //     representation (e.g. "ł", which does not get decomposed, to "l"; "ß"
  //     to "ss").
  const auto make_normalizer = [](std::string country_specific_rules) {
    country_specific_rules +=
        "::NFD; ::[:Nonspacing Mark:] Remove; ::Lower; ::NFC; ::Latin-ASCII;";
    return base::i18n::CreateTransliteratorFromRules("NormalizeForAddresses",
                                                     country_specific_rules);
  };

  std::unique_ptr<base::i18n::Transliterator> result;
  switch (id) {
    case TransliterationId::kKatakanaToHiragana:
      result = base::i18n::CreateTransliterator("Katakana-Hiragana");
      break;
    case TransliterationId::kHiraganaToKatakana:
      result = base::i18n::CreateTransliterator("Hiragana-Katakana");
      break;
    case TransliterationId::kGerman:
      // A simplified version of the "::de-ASCII" transliteration, which
      // follows DIN 5007-2 ("ö" becomes "oe"). Upper case umlauts map to lower
      // case replacements because "::Lower" would lower-case them anyway.
      result = make_normalizer(
          "[ö {o \u0308} Ö {O \u0308}] → oe;"
          "[ä {a \u0308} Ä {A \u0308}] → ae;"
          "[ü {u \u0308} Ü {U \u0308}] → ue;");
      break;
    case TransliterationId::kDefault:
      result = make_normalizer(std::string());
      break;
  }

  const bool creation_succeeded = result != nullptr;
  base::UmaHistogramBoolean("Autofill.TransliteratorInitStatus",
                            creation_succeeded);
  return result;
}

// Returns the cached transliterator for `transliteration_id`, creating it on
// the first request. May return nullptr if the transliterator cannot be
// initialized; in that case the nullptr entry stays in the cache, so creation
// is not attempted again on later calls. Lookup and creation are serialized by
// a function-local lock.
const base::i18n::Transliterator* GetCachedTransliterator(
    TransliterationId transliteration_id) {
  static base::NoDestructor<base::Lock> cache_lock;
  base::AutoLock scoped_lock(*cache_lock);

  auto& cache = GetTransliteratorsMap();
  const auto [entry, is_new_entry] =
      cache.try_emplace(transliteration_id, nullptr);
  if (!is_new_entry) {
    return entry->second.get();
  }

  // First request for this id: fill the placeholder that was just inserted.
  entry->second = CreateTransliterator(transliteration_id);
  return entry->second.get();
}

std::u16string Transliterate(std::u16string_view value,
                             TransliterationId transliteration_id) {
  if (value.empty()) {
    return std::u16string(value);
  }

  base::Time transliterator_creation_time = base::Time::Now();
  const base::i18n::Transliterator* transliterator =
      GetCachedTransliterator(transliteration_id);
  // TODO(crbug.com/399657187): Remove once the issue is resolved.
  base::UmaHistogramTimes("Autofill.TransliteratorCreationTime",
                          base::Time::Now() - transliterator_creation_time);

  // Transliterator initialization failed.
  if (!transliterator) {
    return base::i18n::ToLower(value);
  }
  base::ScopedUmaHistogramTimer logger("Autofill.TransliterationDuration");
  return transliterator->Transliterate(value);
}
}  // namespace

std::u16string RemoveDiacriticsAndConvertToLowerCase(
    std::u16string_view value,
    const AddressCountryCode& country_code) {
  TransliterationId transliteration_id =
      kCountriesWithGermanTransliteration.contains(country_code.value()) &&
              base::FeatureList::IsEnabled(
                  features::kAutofillEnableGermanTransliteration)
          ? TransliterationId::kGerman
          : TransliterationId::kDefault;
  return Transliterate(value, transliteration_id);
}

std::u16string TransliterateAlternativeName(std::u16string_view value,
                                            bool inverse_transliteration) {
  return Transliterate(value, inverse_transliteration
                                  ? TransliterationId::kHiraganaToKatakana
                                  : TransliterationId::kKatakanaToHiragana);
}

// Test-only helper: drops every cached transliterator so that the next lookup
// creates it from scratch. In production the transliterators are meant to stay
// alive for the lifetime of the browser.
//
// NOTE: the cache is cleared without taking the lock that
// `GetCachedTransliterator()` holds, and clearing destroys the cached objects,
// so this must not run concurrently with any transliteration call.
void ClearCachedTransliterators() {
  auto& cached_transliterators = GetTransliteratorsMap();
  cached_transliterators.clear();
}

}  // namespace autofill