1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349
|
# frozen_string_literal: true
return if RUBY_ENGINE != "ruby"
require_relative "test_helper"
module Prism
class EncodingTest < TestCase
# Every value a single byte can hold.
codepoints_1byte = (0...0x100)

# Encodings exercised on every run, each mapped to the codepoints to check.
# The hash is extended in place when the exhaustive run is requested.
encodings =
  [Encoding::ASCII_8BIT, Encoding::US_ASCII, Encoding::Windows_1253]
    .to_h { |default_encoding| [default_encoding, codepoints_1byte] }
# By default we don't test every codepoint in these encodings because it
# takes a very long time. Setting the PRISM_TEST_ALL_ENCODINGS environment
# variable opts in to the exhaustive tables below.
if ENV["PRISM_TEST_ALL_ENCODINGS"]
  # Every value that fits in two bytes.
  codepoints_2bytes = 0...0x10000

  # Every value below 0x110000.
  codepoints_unicode = (0...0x110000)

  # Every two-byte value, plus the same values combined with 0x8F0000.
  codepoints_eucjp = [
    *(0...0x10000),
    *(0...0x10000).map { |bytes| bytes | 0x8F0000 }
  ]

  # Candidate byte sequences of one to four bytes, each packed into a single
  # integer with the first byte in the most significant position.
  codepoints_emacs_mule = [
    *(0...0x80),
    *((0x81...0x90).flat_map { |byte1| (0x90...0x100).map { |byte2| byte1 << 8 | byte2 } }),
    *((0x90...0x9C).flat_map { |byte1| (0xA0...0x100).flat_map { |byte2| (0xA0...0x100).map { |byte3| byte1 << 16 | byte2 << 8 | byte3 } } }),
    # Four-byte form: 0x9C, then byte2, byte3, byte4. byte2 has to occupy
    # bits 16..23; shifting byte3 into both middle positions would drop the
    # second byte entirely and repeat the same values for every byte2.
    *((0xF0...0xF5).flat_map { |byte2| (0xA0...0x100).flat_map { |byte3| (0xA0...0x100).map { |byte4| 0x9C << 24 | byte2 << 16 | byte3 << 8 | byte4 } } }),
  ]

  # One-, two- and four-byte candidates, packed the same way.
  codepoints_gb18030 = [
    *(0...0x80),
    *((0x81..0xFE).flat_map { |byte1| (0x40...0x100).map { |byte2| byte1 << 8 | byte2 } }),
    *((0x81..0xFE).flat_map { |byte1| (0x30...0x40).flat_map { |byte2| (0x81..0xFE).flat_map { |byte3| (0x2F...0x41).map { |byte4| byte1 << 24 | byte2 << 16 | byte3 << 8 | byte4 } } } }),
  ]

  # One-byte, two-byte, and four-byte (0x8E-led) candidates.
  codepoints_euc_tw = [
    *(0..0x7F),
    *(0xA1..0xFF).flat_map { |byte1| (0xA1..0xFF).map { |byte2| (byte1 << 8) | byte2 } },
    *(0xA1..0xB0).flat_map { |byte2| (0xA1..0xFF).flat_map { |byte3| (0xA1..0xFF).map { |byte4| 0x8E << 24 | byte2 << 16 | byte3 << 8 | byte4 } } }
  ]

  # Extend the default table in place so the generated tests below cover
  # every listed encoding.
  encodings.merge!(
    Encoding::CP850 => codepoints_1byte,
    Encoding::CP852 => codepoints_1byte,
    Encoding::CP855 => codepoints_1byte,
    Encoding::GB1988 => codepoints_1byte,
    Encoding::IBM437 => codepoints_1byte,
    Encoding::IBM720 => codepoints_1byte,
    Encoding::IBM737 => codepoints_1byte,
    Encoding::IBM775 => codepoints_1byte,
    Encoding::IBM852 => codepoints_1byte,
    Encoding::IBM855 => codepoints_1byte,
    Encoding::IBM857 => codepoints_1byte,
    Encoding::IBM860 => codepoints_1byte,
    Encoding::IBM861 => codepoints_1byte,
    Encoding::IBM862 => codepoints_1byte,
    Encoding::IBM863 => codepoints_1byte,
    Encoding::IBM864 => codepoints_1byte,
    Encoding::IBM865 => codepoints_1byte,
    Encoding::IBM866 => codepoints_1byte,
    Encoding::IBM869 => codepoints_1byte,
    Encoding::ISO_8859_1 => codepoints_1byte,
    Encoding::ISO_8859_2 => codepoints_1byte,
    Encoding::ISO_8859_3 => codepoints_1byte,
    Encoding::ISO_8859_4 => codepoints_1byte,
    Encoding::ISO_8859_5 => codepoints_1byte,
    Encoding::ISO_8859_6 => codepoints_1byte,
    Encoding::ISO_8859_7 => codepoints_1byte,
    Encoding::ISO_8859_8 => codepoints_1byte,
    Encoding::ISO_8859_9 => codepoints_1byte,
    Encoding::ISO_8859_10 => codepoints_1byte,
    Encoding::ISO_8859_11 => codepoints_1byte,
    Encoding::ISO_8859_13 => codepoints_1byte,
    Encoding::ISO_8859_14 => codepoints_1byte,
    Encoding::ISO_8859_15 => codepoints_1byte,
    Encoding::ISO_8859_16 => codepoints_1byte,
    Encoding::KOI8_R => codepoints_1byte,
    Encoding::KOI8_U => codepoints_1byte,
    Encoding::MACCENTEURO => codepoints_1byte,
    Encoding::MACCROATIAN => codepoints_1byte,
    Encoding::MACCYRILLIC => codepoints_1byte,
    Encoding::MACGREEK => codepoints_1byte,
    Encoding::MACICELAND => codepoints_1byte,
    Encoding::MACROMAN => codepoints_1byte,
    Encoding::MACROMANIA => codepoints_1byte,
    Encoding::MACTHAI => codepoints_1byte,
    Encoding::MACTURKISH => codepoints_1byte,
    Encoding::MACUKRAINE => codepoints_1byte,
    Encoding::TIS_620 => codepoints_1byte,
    Encoding::Windows_1250 => codepoints_1byte,
    Encoding::Windows_1251 => codepoints_1byte,
    Encoding::Windows_1252 => codepoints_1byte,
    Encoding::Windows_1254 => codepoints_1byte,
    Encoding::Windows_1255 => codepoints_1byte,
    Encoding::Windows_1256 => codepoints_1byte,
    Encoding::Windows_1257 => codepoints_1byte,
    Encoding::Windows_1258 => codepoints_1byte,
    Encoding::Windows_874 => codepoints_1byte,
    Encoding::Big5 => codepoints_2bytes,
    Encoding::Big5_HKSCS => codepoints_2bytes,
    Encoding::Big5_UAO => codepoints_2bytes,
    Encoding::CP949 => codepoints_2bytes,
    Encoding::CP950 => codepoints_2bytes,
    Encoding::CP951 => codepoints_2bytes,
    Encoding::EUC_KR => codepoints_2bytes,
    Encoding::GBK => codepoints_2bytes,
    Encoding::GB12345 => codepoints_2bytes,
    Encoding::GB2312 => codepoints_2bytes,
    Encoding::MACJAPANESE => codepoints_2bytes,
    Encoding::Shift_JIS => codepoints_2bytes,
    Encoding::SJIS_DoCoMo => codepoints_2bytes,
    Encoding::SJIS_KDDI => codepoints_2bytes,
    Encoding::SJIS_SoftBank => codepoints_2bytes,
    Encoding::Windows_31J => codepoints_2bytes,
    Encoding::UTF_8 => codepoints_unicode,
    Encoding::UTF8_MAC => codepoints_unicode,
    Encoding::UTF8_DoCoMo => codepoints_unicode,
    Encoding::UTF8_KDDI => codepoints_unicode,
    Encoding::UTF8_SoftBank => codepoints_unicode,
    Encoding::CESU_8 => codepoints_unicode,
    Encoding::CP51932 => codepoints_eucjp,
    Encoding::EUC_JP => codepoints_eucjp,
    Encoding::EUCJP_MS => codepoints_eucjp,
    Encoding::EUC_JIS_2004 => codepoints_eucjp,
    Encoding::EMACS_MULE => codepoints_emacs_mule,
    Encoding::STATELESS_ISO_2022_JP => codepoints_emacs_mule,
    Encoding::STATELESS_ISO_2022_JP_KDDI => codepoints_emacs_mule,
    Encoding::GB18030 => codepoints_gb18030,
    Encoding::EUC_TW => codepoints_euc_tw
  )
end
# Generate one test per alias of every encoding in the table; each checks
# that codepoints are parsed correctly when the magic comment uses that alias.
# The "locale" alias is skipped (presumably because it depends on the
# environment the tests run in -- confirm).
encodings.each do |encoding, codepoints|
  aliases = encoding.names.reject { |name| name == "locale" }

  aliases.each do |name|
    define_method(:"test_encoding_#{name}") { assert_encoding(encoding, name, codepoints) }
  end
end
# Generate one test per encoding in the table; each checks that the encoding
# flags on string nodes are set correctly. The inputs are eight single escape
# sequences followed by every ordered pair of them joined together.
escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
escapes += escapes.product(escapes).map(&:join)

encodings.each_key do |encoding|
  define_method(:"test_encoding_flags_#{encoding.name}") { assert_encoding_flags(encoding, escapes) }
end
def test_coding
result = Prism.parse("# coding: utf-8\n'string'")
actual = result.value.statements.body.first.unescaped.encoding
assert_equal Encoding.find("utf-8"), actual
end
# Spaces, tabs, carriage returns and vertical tabs around the colon of the
# magic comment do not stop the encoding from being picked up.
def test_coding_with_whitespace
  string = Prism.parse("# coding \t \r \v : \t \v \r ascii-8bit \n'string'").value.statements.body.first

  assert_equal(Encoding.find("ascii-8bit"), string.unescaped.encoding)
end
def test_emacs_style
result = Prism.parse("# -*- coding: utf-8 -*-\n'string'")
actual = result.value.statements.body.first.unescaped.encoding
assert_equal Encoding.find("utf-8"), actual
end
# Scanning for special characters has to respect the encoding of the file.
# The source holds the bytes 0x81 0x5C inside a %w list; 0x5C on its own is
# an ASCII backslash, but the test expects both bytes to come back untouched
# as the element's content, tagged as Shift_JIS.
def test_strpbrk_multibyte
  source = <<~RUBY
    # encoding: Shift_JIS
    %w[\x81\x5c]
  RUBY

  result = Prism.parse(source)
  assert result.errors.empty?

  word = result.value.statements.body.first.elements.first
  assert_equal((+"\x81\x5c").force_encoding(Encoding::Shift_JIS), word.unescaped)
end
# Suffixed spellings of utf-8 in the magic comment all resolve to UTF-8.
def test_utf_8_variations
  utf8 = Encoding.find("utf-8")

  ["utf-8-unix", "utf-8-dos", "utf-8-mac", "utf-8-*"].each do |variation|
    string = Prism.parse("# coding: #{variation}\n'string'").value.statements.body.first
    assert_equal(utf8, string.unescaped.encoding)
  end
end
# The very first lexed token -- the magic comment itself -- already carries
# the encoding that the comment declares.
def test_first_lexed_token
  first_token = Prism.lex("# encoding: ascii-8bit").value.first.first

  assert_equal(Encoding.find("ascii-8bit"), first_token.value.encoding)
end
# A node's source slice holds the original bytes and is tagged with the
# encoding declared by the magic comment.
def test_slice_encoding
  program = Prism.parse("# encoding: Shift_JIS\nア").value
  slice = program.slice
  expected = (+"ア").force_encoding(Encoding::SHIFT_JIS)

  assert_equal(expected, slice)
  assert_equal(Encoding::SHIFT_JIS, slice.encoding)
end
private
# Evaluation context in which referencing an undefined constant yields the
# constant's name instead of raising.
class ConstantContext < BasicObject
  class << self
    # Echo back the name of the constant that could not be found.
    def const_missing(const)
      const
    end
  end
end

# Returns a fresh ConstantContext to evaluate source against.
def constant_context
  ConstantContext.new
end
# Evaluation context in which calling an undefined method yields the method's
# name instead of raising.
class IdentifierContext < BasicObject
  # Echo back the name of the missing method; any arguments are ignored.
  def method_missing(identifier, *_arguments)
    identifier
  end
end

# Returns a fresh IdentifierContext to evaluate source against.
def identifier_context
  IdentifierContext.new
end
def assert_encoding_constant(name, character)
source = "# encoding: #{name}\n#{character}"
expected = constant_context.instance_eval(source)
result = Prism.parse(source)
assert result.success?
actual = result.value.statements.body.last
assert_kind_of ConstantReadNode, actual
assert_equal expected, actual.name
end
def assert_encoding_identifier(name, character)
source = "# encoding: #{name}\n#{character}"
expected = identifier_context.instance_eval(source)
result = Prism.parse(source)
assert result.success?
actual = result.value.statements.body.last
assert_kind_of CallNode, actual
assert_equal expected, actual.name
end
# Check that we can properly parse every codepoint in the given encoding.
#
# encoding - the Encoding used to turn each codepoint into a character
# name     - the alias written into the magic comment
# range    - the codepoints to walk
def assert_encoding(encoding, name, range)
  # I'm not entirely sure, but I believe these codepoints are incorrect in
  # their parsing in CRuby. They all report as matching `[[:lower:]]` but
  # then they are parsed as constants. This is because CRuby determines if
  # an identifier is a constant or not by case folding it down to lowercase
  # and checking if there is a difference. And even though they report
  # themselves as lowercase, their case fold is different. I have reported
  # this bug upstream.
  skipped =
    case encoding
    when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8
      [
        0x01c5, 0x01c8, 0x01cb, 0x01f2, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b,
        0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b,
        0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab,
        0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fbc, 0x1fcc, 0x1ffc,
      ]
    when Encoding::Windows_1253
      [0xb5]
    end

  range = range.to_a - skipped if skipped

  range.each do |codepoint|
    begin
      character = codepoint.chr(encoding)

      if character.match?(/[[:alpha:]]/)
        # Letters: uppercase ones must parse as constants, the rest as
        # identifiers.
        if character.match?(/[[:upper:]]/)
          assert_encoding_constant(name, character)
        else
          assert_encoding_identifier(name, character)
        end
      elsif character.match?(/[[:alnum:]]/)
        # Alphanumeric but not a letter: prefix with an underscore and
        # expect an identifier.
        assert_encoding_identifier(name, "_#{character}")
      else
        # Anything else is embedded in a regexp comment group, except for
        # "/" and "{", which are skipped.
        unless ["/", "{"].include?(character)
          source = "# encoding: #{name}\n/(?##{character})/\n"
          assert Prism.parse(source).success?
        end
      end
    rescue RangeError
      # A RangeError raised while handling this codepoint is taken to mean it
      # is not valid in the encoding, so the fallback source built from its
      # hex value must be rejected.
      source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}"
      refute Prism.parse(source).success?
    end
  end
end
# For every escape sequence, wrap it in a double-quoted string under an
# `# encoding:` magic comment and compare what CRuby and Prism make of it.
#
# The expected value is the encoding of the string produced by `eval`, or,
# when CRuby raises a "UTF-8 mixed within" syntax error, the portion of that
# error's message captured between ": " and the next newline. The actual
# value is derived from the forced-encoding flags on Prism's string node, or
# from Prism's first error message when it mentions "mixed". Any other
# failure on either side is re-raised.
def assert_encoding_flags(encoding, escapes)
  escapes.each do |escaped|
    source = "# encoding: #{encoding.name}\n\"#{escaped}\""

    expected =
      begin
        eval(source).encoding
      rescue SyntaxError => syntax_error
        raise unless syntax_error.message.include?("UTF-8 mixed within")

        syntax_error.message[/: (.+?)\n/, 1]
      end

    result = Prism.parse(source)

    actual =
      if result.success?
        string = result.value.statements.body.first

        if string.forced_utf8_encoding?
          Encoding::UTF_8
        elsif string.forced_binary_encoding?
          Encoding::ASCII_8BIT
        else
          encoding
        end
      else
        first_error = result.errors.first
        raise first_error.message unless first_error.message.include?("mixed")

        first_error.message
      end

    assert_equal expected, actual
  end
end
end
end
|