File: TextEncodingDetectorTest.cpp

package info (click to toggle)
chromium-browser 57.0.2987.98-1~deb8u1
  • links: PTS, VCS
  • area: main
  • in suites: jessie
  • size: 2,637,852 kB
  • ctags: 2,544,394
  • sloc: cpp: 12,815,961; ansic: 3,676,222; python: 1,147,112; asm: 526,608; java: 523,212; xml: 286,794; perl: 92,654; sh: 86,408; objc: 73,271; makefile: 27,698; cs: 18,487; yacc: 13,031; tcl: 12,957; pascal: 4,875; ml: 4,716; lex: 3,904; sql: 3,862; ruby: 1,982; lisp: 1,508; php: 1,368; exp: 404; awk: 325; csh: 117; jsp: 39; sed: 37
file content (86 lines) | stat: -rw-r--r-- 3,520 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "platform/text/TextEncodingDetector.h"

#include "testing/gtest/include/gtest/gtest.h"
#include "wtf/text/TextEncoding.h"

namespace blink {

TEST(TextResourceDecoderTest, RespectIso2022Jp) {
  // ISO-2022-JP is the only 7-bit encoding defined in WHATWG standard.
  std::string iso2022jp =
      " \x1B"
      "$BKL3$F;F|K\\%O%`%U%!%$%?!<%:$,%=%U%H%P%s%/$H$N%W%l!<%*%U$r@)$7!\"";
  WTF::TextEncoding encoding;
  bool result = detectTextEncoding(iso2022jp.c_str(), iso2022jp.length(),
                                   nullptr, nullptr, nullptr, &encoding);
  EXPECT_TRUE(result);
  EXPECT_EQ(WTF::TextEncoding("ISO-2022-JP"), encoding);
}

TEST(TextResourceDecoderTest, Ignore7BitEncoding) {
  // 7-bit encodings except ISO-2022-JP are not supported by WHATWG.
  // They should be detected as plain text (US-ASCII).
  std::string hzGb2312 =
      " ~{\x54\x42\x31\x7D\x37\x22\x55\x39\x35\x3D\x3D\x71~} abc";
  WTF::TextEncoding encoding;
  bool result = detectTextEncoding(hzGb2312.c_str(), hzGb2312.length(), nullptr,
                                   nullptr, nullptr, &encoding);
  EXPECT_TRUE(result);
  EXPECT_EQ(WTF::TextEncoding("US-ASCII"), encoding);
}

TEST(TextEncodingDetectorTest, UrlHintHelpsEUCJP) {
  std::string eucjpBytes =
      "<TITLE>"
      "\xA5\xD1\xA5\xEF\xA1\xBC\xA5\xC1\xA5\xE3\xA1\xBC\xA5\xC8\xA1\xC3\xC5\xEA"
      "\xBB\xF1\xBE\xF0\xCA\xF3\xA4\xCE\xA5\xD5\xA5\xA3\xA5\xB9\xA5\xB3</"
      "TITLE>";
  WTF::TextEncoding encoding;
  bool result = detectTextEncoding(eucjpBytes.c_str(), eucjpBytes.length(),
                                   nullptr, nullptr, nullptr, &encoding);
  EXPECT_TRUE(result);
  EXPECT_EQ(WTF::TextEncoding("GBK"), encoding)
      << "Without language hint, it's detected as GBK";

  result = detectTextEncoding(eucjpBytes.c_str(), eucjpBytes.length(), nullptr,
                              "http://example.co.jp/", nullptr, &encoding);
  EXPECT_TRUE(result);
  EXPECT_EQ(WTF::TextEncoding("EUC-JP"), encoding)
      << "With URL hint including '.jp', it's detected as EUC-JP";
}

TEST(TextEncodingDetectorTest, LanguageHintHelpsEUCJP) {
  std::string eucjpBytes =
      "<TITLE>"
      "\xA5\xD1\xA5\xEF\xA1\xBC\xA5\xC1\xA5\xE3\xA1\xBC\xA5\xC8\xA1\xC3\xC5\xEA"
      "\xBB\xF1\xBE\xF0\xCA\xF3\xA4\xCE\xA5\xD5\xA5\xA3\xA5\xB9\xA5\xB3</"
      "TITLE>";
  WTF::TextEncoding encoding;
  bool result = detectTextEncoding(eucjpBytes.c_str(), eucjpBytes.length(),
                                   nullptr, nullptr, nullptr, &encoding);
  EXPECT_TRUE(result);
  EXPECT_EQ(WTF::TextEncoding("GBK"), encoding)
      << "Without language hint, it's detected as GBK";

  result = detectTextEncoding(eucjpBytes.c_str(), eucjpBytes.length(), nullptr,
                              nullptr, "ja", &encoding);
  EXPECT_TRUE(result);
  EXPECT_EQ(WTF::TextEncoding("EUC-JP"), encoding)
      << "With language hint 'ja', it's detected as EUC-JP";
}

TEST(TextEncodingDetectorTest, UTF8DetectionShouldFail) {
  std::string utf8Bytes =
      "tnegirjji gosa gii beare s\xC3\xA1htt\xC3\xA1 \xC4\x8D\xC3"
      "\xA1llit artihkkaliid. Maid don s\xC3\xA1ht\xC3\xA1t dievasmah";
  WTF::TextEncoding encoding;
  bool result = detectTextEncoding(utf8Bytes.c_str(), utf8Bytes.length(),
                                   nullptr, nullptr, nullptr, &encoding);
  EXPECT_FALSE(result);
}

}  // namespace blink