File: text_encoding_detector.cc

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (88 lines) | stat: -rw-r--r-- 4,208 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
/*
 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "third_party/blink/renderer/platform/text/text_encoding_detector.h"

#include "build/build_config.h"
#include "third_party/blink/renderer/platform/weborigin/kurl.h"
#include "third_party/blink/renderer/platform/wtf/text/text_encoding.h"
#include "third_party/ced/src/compact_enc_det/compact_enc_det.h"

// third_party/ced/src/util/encodings/encodings.h, which is included
// by the include above, undefs UNICODE because that is a macro used
// internally in ced. If we later in the same translation unit do
// anything related to Windows or Windows headers those will then use
// the ASCII versions which we do not want. To avoid that happening in
// jumbo builds, we redefine UNICODE again here.
#if BUILDFLAG(IS_WIN)
#define UNICODE 1
#endif  // BUILDFLAG(IS_WIN)

namespace blink {

bool DetectTextEncoding(base::span<const uint8_t> bytes,
                        const char* hint_encoding_name,
                        const KURL& hint_url,
                        const char* hint_user_language,
                        WTF::TextEncoding* detected_encoding) {
  *detected_encoding = WTF::TextEncoding();
  // In general, do not use language hint. This helps get more
  // deterministic encoding detection results across devices. Note that local
  // file resources can still benefit from the hint.
  Language language = UNKNOWN_LANGUAGE;
  if (hint_url.Protocol() == "file")
    LanguageFromCode(hint_user_language, &language);
  int consumed_bytes;
  bool is_reliable;
  auto chars = base::as_chars(bytes);
  Encoding encoding = CompactEncDet::DetectEncoding(
      chars.data(), chars.size(), hint_url.GetString().Ascii().c_str(), nullptr,
      nullptr, EncodingNameAliasToEncoding(hint_encoding_name), language,
      CompactEncDet::WEB_CORPUS,
      false,  // Include 7-bit encodings to detect ISO-2022-JP
      &consumed_bytes, &is_reliable);

  if (encoding == UNKNOWN_ENCODING)
    *detected_encoding = WTF::UnknownEncoding();
  else
    *detected_encoding = WTF::TextEncoding(MimeEncodingName(encoding));

  // Should return false if the detected encoding is UTF8. This helps prevent
  // modern web sites from neglecting proper encoding labelling and simply
  // relying on browser-side encoding detection. Encoding detection is supposed
  // to work for web sites with legacy encoding only (so this doesn't have to
  // be applied to local file resources).
  // Detection failure leads |TextResourceDecoder| to use its default encoding
  // determined from system locale or TLD.
  return !(encoding == UNKNOWN_ENCODING ||
           (hint_url.Protocol() != "file" && encoding == UTF8));
}

}  // namespace blink