1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
require 'htmlentities/legacy'
#
# HTML entity encoding and decoding for Ruby
#
class HTMLEntities
  VERSION = '4.0.0'

  # Flavors accepted by HTMLEntities.new.
  FLAVORS = %w[html4 xhtml1].freeze

  # Instructions accepted by #encode.
  INSTRUCTIONS = [:basic, :named, :decimal, :hexadecimal].freeze

  # Raised by #encode when it is given an instruction not in INSTRUCTIONS.
  class InstructionError < RuntimeError
  end

  # Raised by #initialize when the requested flavor is not in FLAVORS.
  class UnknownFlavor < RuntimeError
  end

  #
  # Create a new HTMLEntities coder for the specified flavor.
  # Available flavors are 'html4' and 'xhtml1' (the default).
  # The only difference in functionality between the two is in the handling
  # of the apos (apostrophe) named entity, which is not defined in HTML4.
  #
  # The flavor is converted with to_s and downcased before validation, so
  # symbols and mixed-case names are accepted.
  #
  # Raises UnknownFlavor if the flavor is not one of FLAVORS.
  #
  def initialize(flavor='xhtml1')
    @flavor = flavor.to_s.downcase
    raise UnknownFlavor, "Unknown flavor #{flavor}" unless FLAVORS.include?(@flavor)
  end

  #
  # Decode entities in a string into their UTF-8 equivalents and return the
  # result as a new string. Obviously, if your string is not already in
  # UTF-8, you'd better convert it before using this method, or the output
  # will be mixed up.
  #
  # Named, decimal and hexadecimal entities are all handled in a single
  # pass, so text produced by decoding one entity is never decoded a second
  # time (e.g. "&amp;#60;" becomes "&#60;", not "<").
  #
  # Unknown named entities are not converted. Numeric entities that do not
  # denote a valid Unicode scalar value (surrogates, or anything above
  # U+10FFFF) are also left untouched rather than producing invalid UTF-8.
  #
  def decode(source)
    # Build the regexp first: it may trigger the lazy load of the mapping.
    regexp = entity_regexp
    source.to_s.gsub(regexp) do
      match = Regexp.last_match
      if match[1]
        codepoint = map[match[1]]
        codepoint ? [codepoint].pack('U') : match[0]
      elsif match[2]
        decode_codepoint(match[2].to_i, match[0])
      else
        decode_codepoint(match[3].to_i(16), match[0])
      end
    end
  end

  #
  # Encode codepoints into their corresponding entities and return the
  # result as a new string (the source is never modified). Various
  # operations are possible, and may be specified in order:
  #
  # :basic :: Convert the five XML entities ('"<>&)
  # :named :: Convert non-ASCII characters to their named HTML 4.01 equivalent
  # :decimal :: Convert non-ASCII characters to decimal entities (e.g. &#1234;)
  # :hexadecimal :: Convert non-ASCII characters to hexadecimal entities (e.g. &#x12ab;)
  #
  # You can specify the commands in any order, but they will be executed in
  # the order listed above to ensure that entity ampersands are not
  # clobbered and that named entities are replaced before numeric ones.
  # If both :decimal and :hexadecimal are given, :decimal wins.
  #
  # The basic characters are always encoded: as named entities when :basic
  # or :named is given, otherwise in the requested numeric form. ASCII
  # control characters (0x00-0x1f) are encoded along with non-ASCII ones.
  #
  # If no instructions are specified, :basic will be used.
  #
  # Examples:
  #   coder.encode(str) - XML-safe
  #   coder.encode(str, :basic, :decimal) - XML-safe and 7-bit clean
  #   coder.encode(str, :basic, :named, :decimal) - 7-bit clean, with all
  #   non-ASCII characters replaced with their named entity where possible,
  #   and decimal equivalents otherwise.
  #
  # Raises InstructionError if any instruction is not in INSTRUCTIONS.
  #
  # Note: It is the program's responsibility to ensure that the source
  # contains valid UTF-8 before calling this method.
  #
  def encode(source, *instructions)
    string = source.to_s.dup
    if instructions.empty?
      instructions = [:basic]
    else
      unknown_instructions = instructions - INSTRUCTIONS
      unless unknown_instructions.empty?
        raise InstructionError,
              "unknown encode_entities command(s): #{unknown_instructions.inspect}"
      end
    end

    basic_entity_encoder =
      if instructions.include?(:basic) || instructions.include?(:named)
        :encode_named
      elsif instructions.include?(:decimal)
        :encode_decimal
      else
        # Instructions were validated above, so only :hexadecimal remains.
        :encode_hexadecimal
      end
    # Going through encode_extended keeps the character if the encoder has
    # no entity for it, instead of replacing it with an empty string.
    string.gsub!(basic_entity_regexp) { |char| encode_extended([basic_entity_encoder], char) }

    extended_entity_encoders = []
    extended_entity_encoders << :encode_named if instructions.include?(:named)
    if instructions.include?(:decimal)
      extended_entity_encoders << :encode_decimal
    elsif instructions.include?(:hexadecimal)
      extended_entity_encoders << :encode_hexadecimal
    end
    unless extended_entity_encoders.empty?
      string.gsub!(extended_entity_regexp) { |char| encode_extended(extended_entity_encoders, char) }
    end

    string
  end

  private

  # Entity name => codepoint table for the current flavor. The flavor's
  # mapping file is required lazily, on first use.
  def map
    @map ||= begin
      require "htmlentities/#{@flavor}"
      HTMLEntities::MAPPINGS[@flavor]
    end
  end

  # Characters that are always encoded. The apostrophe is left out for
  # HTML4 because that flavor has no apos entity.
  def basic_entity_regexp
    @basic_entity_regexp ||= (@flavor.start_with?('html') ? /[<>"&]/ : /[<>'"&]/)
  end

  # Characters handled by the :named/:decimal/:hexadecimal instructions:
  # ASCII control characters and every non-ASCII character. Expressed with
  # character classes rather than raw UTF-8 byte ranges, which are rejected
  # as invalid multibyte escapes by encoding-aware regexps.
  # For HTML4 the apostrophe is handled here, since the basic pass skips it.
  def extended_entity_regexp
    @extended_entity_regexp ||= begin
      pattern = '[\x00-\x1f]|[^\x00-\x7f]'
      pattern += "|'" if @flavor == 'html4'
      Regexp.new(pattern)
    end
  end

  # Single regexp matching named (capture 1), decimal (capture 2) and
  # hexadecimal (capture 3) entities. The length bounds on names are taken
  # from the shortest and longest names in the mapping.
  def entity_regexp
    @entity_regexp ||= begin
      min_length, max_length = map.keys.map { |name| name.length }.minmax
      /&(?:([a-z][a-z0-9]{#{min_length - 1},#{max_length - 1}})|#([0-9]{1,7})|#x([0-9a-f]{1,6}));/i
    end
  end

  # Codepoint => entity name table (the inverse of #map).
  def reverse_map
    @reverse_map ||= map.invert
  end

  # Convert a numeric character reference to UTF-8, or return the original
  # entity text when the codepoint is not a valid Unicode scalar value.
  def decode_codepoint(codepoint, original)
    return original if codepoint > 0x10FFFF || codepoint.between?(0xD800, 0xDFFF)
    [codepoint].pack('U')
  end

  # Named entity for the character, or nil if the mapping has none.
  def encode_named(char)
    name = reverse_map[char.unpack('U').first]
    name && "&#{name};"
  end

  # Decimal numeric entity for the character.
  def encode_decimal(char)
    "&##{char.unpack('U').first};"
  end

  # Hexadecimal numeric entity for the character.
  def encode_hexadecimal(char)
    "&#x#{char.unpack('U').first.to_s(16)};"
  end

  # Try each encoder in turn and return the first non-nil result; if none
  # of them can encode the character, return it unchanged.
  def encode_extended(encoders, char)
    encoders.each do |encoder|
      encoded = __send__(encoder, char)
      return encoded if encoded
    end
    char
  end
end
|