File: diacritic_utils.cc

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (63 lines) | stat: -rw-r--r-- 2,646 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Copyright 2022 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "chromeos/ash/components/string_matching/diacritic_utils.h"

#include <string>
#include <vector>

#include "base/containers/fixed_flat_map.h"

namespace ash::string_matching {

// Returns a copy of `str` in which every accented Latin character listed in
// the table below is replaced by its unaccented base letter. Letter case is
// preserved. Any UTF-16 code unit that is not a key of the table (including
// a base letter followed by a separate combining mark) is copied through
// unchanged, so the result always has the same length as `str`.
const std::u16string RemoveDiacritics(const std::u16string& str) {
  // For the initial implementation of diacritic-insensitive search:
  // 1) Intentionally only covering Latin-script accented characters.
  // 2) Only 1-to-1 character mappings are handled, i.e. ligatures that would
  //    expand to two letters ("æ > ae; œ > oe; Æ > AE; Œ > OE") are not
  //    converted by this implementation. The implemented mappings are listed
  //    below:

  // "[ á à â ä ā å ] > a; "
  // "[ Á À Â Ä Ā Å ] > A; "
  // "[ é è ê ë ē   ] > e; "
  // "[ É È Ê Ë Ē   ] > E; "
  // "[ í ì î ï ī   ] > i; "
  // "[ Í Ì Î Ï Ī   ] > I; "
  // "[ ó ò ô ö ō ø ] > o; "
  // "[ Ó Ò Ô Ö Ō Ø ] > O; "
  // "[ ú ù û ü ū   ] > u; "
  // "[ Ú Ù Û Ü Ū   ] > U; "
  // "[ ý ỳ ŷ ÿ ȳ   ] > y; "
  // "[ Ý Ỳ Ŷ Ÿ Ȳ   ] > Y; "
  // "ç > c; ñ > n; "
  // "Ç > C; Ñ > N;"

  // clang-format off
  static constexpr auto kConversionMap =
    base::MakeFixedFlatMap<char16_t, char16_t>({
      {u'á', u'a'}, {u'à', u'a'}, {u'â', u'a'}, {u'ä', u'a'}, {u'ā', u'a'}, {u'å', u'a'},
      {u'Á', u'A'}, {u'À', u'A'}, {u'Â', u'A'}, {u'Ä', u'A'}, {u'Ā', u'A'}, {u'Å', u'A'},
      {u'é', u'e'}, {u'è', u'e'}, {u'ê', u'e'}, {u'ë', u'e'}, {u'ē', u'e'},
      {u'É', u'E'}, {u'È', u'E'}, {u'Ê', u'E'}, {u'Ë', u'E'}, {u'Ē', u'E'},
      {u'í', u'i'}, {u'ì', u'i'}, {u'î', u'i'}, {u'ï', u'i'}, {u'ī', u'i'},
      {u'Í', u'I'}, {u'Ì', u'I'}, {u'Î', u'I'}, {u'Ï', u'I'}, {u'Ī', u'I'},
      {u'ó', u'o'}, {u'ò', u'o'}, {u'ô', u'o'}, {u'ö', u'o'}, {u'ō', u'o'}, {u'ø', u'o'},
      {u'Ó', u'O'}, {u'Ò', u'O'}, {u'Ô', u'O'}, {u'Ö', u'O'}, {u'Ō', u'O'}, {u'Ø', u'O'},
      {u'ú', u'u'}, {u'ù', u'u'}, {u'û', u'u'}, {u'ü', u'u'}, {u'ū', u'u'},
      {u'Ú', u'U'}, {u'Ù', u'U'}, {u'Û', u'U'}, {u'Ü', u'U'}, {u'Ū', u'U'},
      {u'ý', u'y'}, {u'ỳ', u'y'}, {u'ŷ', u'y'}, {u'ÿ', u'y'}, {u'ȳ', u'y'},
      {u'Ý', u'Y'}, {u'Ỳ', u'Y'}, {u'Ŷ', u'Y'}, {u'Ÿ', u'Y'}, {u'Ȳ', u'Y'},
      {u'ç', u'c'}, {u'Ç', u'C'}, {u'ñ', u'n'}, {u'Ñ', u'N'},
      });
  // clang-format on

  std::u16string result;
  // Each input code unit yields exactly one output code unit, so the final
  // length is known up front; reserving avoids repeated reallocation while
  // appending.
  result.reserve(str.size());
  for (const char16_t letter : str) {
    const auto it = kConversionMap.find(letter);
    result.push_back(it == kConversionMap.end() ? letter : it->second);
  }
  return result;
}

}  // namespace ash::string_matching