1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
|
# frozen_string_literal: true
module ActiveSupport
module Multibyte
module Unicode
extend self
# A list of all available normalization forms.
# See https://www.unicode.org/reports/tr15/tr15-29.html for more
# information about normalization.
NORMALIZATION_FORMS = [:c, :kc, :d, :kd]
NORMALIZATION_FORM_ALIASES = { # :nodoc:
c: :nfc,
d: :nfd,
kc: :nfkc,
kd: :nfkd
}
# The Unicode version that is supported by the implementation
UNICODE_VERSION = RbConfig::CONFIG["UNICODE_VERSION"]
# The default normalization used for operations that require
# normalization. It can be set to any of the normalizations
# in NORMALIZATION_FORMS.
#
# ActiveSupport::Multibyte::Unicode.default_normalization_form = :c
attr_accessor :default_normalization_form
@default_normalization_form = :kc
# Unpack the string at grapheme boundaries. Returns a list of character
# lists.
#
# Unicode.unpack_graphemes('क्षि') # => [[2325, 2381], [2359], [2367]]
# Unicode.unpack_graphemes('Café') # => [[67], [97], [102], [233]]
def unpack_graphemes(string)
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode#unpack_graphemes is deprecated and will be
removed from Rails 6.1. Use string.scan(/\X/).map(&:codepoints) instead.
MSG
string.scan(/\X/).map(&:codepoints)
end
# Reverse operation of unpack_graphemes.
#
# Unicode.pack_graphemes(Unicode.unpack_graphemes('क्षि')) # => 'क्षि'
def pack_graphemes(unpacked)
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode#pack_graphemes is deprecated and will be
removed from Rails 6.1. Use array.flatten.pack("U*") instead.
MSG
unpacked.flatten.pack("U*")
end
# Decompose composed characters to the decomposed form.
def decompose(type, codepoints)
if type == :compatibility
codepoints.pack("U*").unicode_normalize(:nfkd).codepoints
else
codepoints.pack("U*").unicode_normalize(:nfd).codepoints
end
end
# Compose decomposed characters to the composed form.
def compose(codepoints)
codepoints.pack("U*").unicode_normalize(:nfc).codepoints
end
# Rubinius' String#scrub, however, doesn't support ASCII-incompatible chars.
if !defined?(Rubinius)
# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
# resulting in a valid UTF-8 string.
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
# encoding is entirely CP1252 or ISO-8859-1.
def tidy_bytes(string, force = false)
return string if string.empty?
return recode_windows1252_chars(string) if force
string.scrub { |bad| recode_windows1252_chars(bad) }
end
else
def tidy_bytes(string, force = false)
return string if string.empty?
return recode_windows1252_chars(string) if force
# We can't transcode to the same format, so we choose a nearly-identical encoding.
# We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
# CP1252 when we get errors. The final string will be 'converted' back to UTF-8
# before returning.
reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_16LE)
source = string.dup
out = "".force_encoding(Encoding::UTF_16LE)
loop do
reader.primitive_convert(source, out)
_, _, _, error_bytes, _ = reader.primitive_errinfo
break if error_bytes.nil?
out << error_bytes.encode(Encoding::UTF_16LE, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end
reader.finish
out.encode!(Encoding::UTF_8)
end
end
# Returns the KC normalization of the string by default. NFKC is
# considered the best normalization form for passing strings to databases
# and validations.
#
# * <tt>string</tt> - The string to perform normalization on.
# * <tt>form</tt> - The form you want to normalize in. Should be one of
# the following: <tt>:c</tt>, <tt>:kc</tt>, <tt>:d</tt>, or <tt>:kd</tt>.
# Default is ActiveSupport::Multibyte::Unicode.default_normalization_form.
def normalize(string, form = nil)
form ||= @default_normalization_form
# See https://www.unicode.org/reports/tr15, Table 1
if alias_form = NORMALIZATION_FORM_ALIASES[form]
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode#normalize is deprecated and will be
removed from Rails 6.1. Use String#unicode_normalize(:#{alias_form}) instead.
MSG
string.unicode_normalize(alias_form)
else
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode#normalize is deprecated and will be
removed from Rails 6.1. Use String#unicode_normalize instead.
MSG
raise ArgumentError, "#{form} is not a valid normalization variant", caller
end
end
%w(downcase upcase swapcase).each do |method|
define_method(method) do |string|
ActiveSupport::Deprecation.warn(<<-MSG.squish)
ActiveSupport::Multibyte::Unicode##{method} is deprecated and
will be removed from Rails 6.1. Use String methods directly.
MSG
string.send(method)
end
end
private
def recode_windows1252_chars(string)
string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
end
end
end
end
|