1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193
|
require 'nokogiri'
require 'htmlentities'
# SAX handler that rebuilds markup while a document is being parsed and stops
# emitting content once an estimated length budget (+max_length+) is used up.
#
# The parser drives the public callbacks below; the rebuilt markup accumulates
# in +truncated_string+.
#
# Options read by the constructor:
# * +:max_length+            - budget the running length estimate is compared to.
# * +:count_tags+            - when truthy, the text of opening/closing tags counts
#   toward the budget; otherwise tags count as zero.
# * +:count_tail+            - (default +false+) when truthy, the budget starts
#   with the tail's length already spent.
# * +:tail+                  - string inserted where the content is cut.
# * +:filtered_attributes+   - (default +[]+) attribute names left out of
#   emitted opening tags.
# * +:filtered_tags+         - (default +[]+) element names whose whole subtree
#   is skipped.
# * +:tail_before_final_tag+ - (default +false+) when truthy, the tail is not
#   attached to the cut text; it is added while the document is being closed.
# * +:comments+              - (default +false+) when truthy, comments are
#   emitted; otherwise they are dropped.
class TruncatedSaxDocument < Nokogiri::XML::SAX::Document
  # Elements that are never emitted (their children still are).
  # Not frozen, so existing code that extends these lists keeps working.
  IGNORABLE_TAGS = %w[html head body]
  # Elements emitted as self-closing tags, with no closing tag.
  SINGLE_TAGS = %w[br img]

  # Delimiters of a markup comment.
  COMMENT_OPENING = '<!--'
  COMMENT_CLOSING = '-->'

  attr_reader :truncated_string, :max_length, :max_length_reached, :tail,
              :count_tags, :filtered_attributes, :filtered_tags, :ignored_levels

  # @param options [Hash] see the option list in the class documentation
  def initialize(options)
    @html_coder = HTMLEntities.new
    capture_options options
    init_parsing_state
  end

  # SAX callback: an element was opened.
  #
  # @param name [String] element name
  # @param attributes [Array] key/value pairs; defaults to +[]+ to match the
  #   signature of the Nokogiri base class
  def start_element(name, attributes = [])
    # The nesting level is tracked even after the budget is spent so that the
    # matching end_element can undo it.
    enter_ignored_level if filtered_tags.include?(name)
    return if @max_length_reached || ignorable_tag?(name) || ignore_mode?

    @closing_tags.push name unless single_tag_element?(name)
    append_to_truncated_string opening_tag(name, attributes), overriden_tag_length
  end

  # SAX callback: a run of text was read.
  #
  # Text is HTML-encoded before it is appended; the budget is charged with the
  # length of the unencoded text.
  #
  # @param decoded_string [String] text with entities already decoded
  def characters(decoded_string)
    return if @max_length_reached || ignore_mode?

    # Inclusive index of the last character that still fits.
    remaining_length = max_length - @estimated_length - 1

    # Text that fits the remaining budget exactly also takes the truncation
    # branch, because the comparison is made against the inclusive end index.
    if decoded_string.length > remaining_length
      append_truncated_text decoded_string[0..remaining_length]
    else
      append_to_truncated_string @html_coder.encode(decoded_string), decoded_string.length
    end
  end

  # SAX callback: a comment was read. Ignored unless the +:comments+ option is
  # set, the budget is not yet spent, and we are not inside a filtered tag.
  #
  # @param string [String] comment text without its delimiters
  def comment(string)
    return unless @comments
    # Comments inside a filtered subtree are dropped like every other node.
    return if @max_length_reached || ignore_mode?

    process_comment string
  end

  # SAX callback: an element was closed.
  #
  # @param name [String] element name
  def end_element(name)
    if filtered_tags.include?(name) && ignore_mode?
      exit_ignored_level
      return
    end
    return if @max_length_reached || ignorable_tag?(name) || ignore_mode?
    return if single_tag_element?(name) # already emitted as self-closing

    @closing_tags.pop
    append_to_truncated_string closing_tag(name), overriden_tag_length
  end

  # SAX callback: parsing finished. When the budget was spent, adds the tail
  # (if not added yet) and closes every element that is still open.
  def end_document
    close_truncated_document if max_length_reached
  end

  private

  # Copies the supported options into instance variables.
  def capture_options(options)
    @max_length = options[:max_length]
    @count_tags = options[:count_tags]
    @count_tail = options.fetch(:count_tail, false)
    @tail = options[:tail]
    @filtered_attributes = options[:filtered_attributes] || []
    @filtered_tags = options[:filtered_tags] || []
    @tail_before_final_tag = options.fetch(:tail_before_final_tag, false)
    @comments = options.fetch(:comments, false)
  end

  # Resets everything that changes while a document is parsed.
  def init_parsing_state
    @truncated_string = ""
    @closing_tags = []
    @estimated_length = @count_tail ? tail_length : 0
    @max_length_reached = false
    @tail_appended = false
    @ignored_levels = 0
    # A counted tail can use up the whole budget before any content is seen.
    # Without this check the remaining length would go negative and the
    # negative index would select (almost) the entire text.
    check_max_length_reached if max_length
  end

  # Length charged for the tail: 1 when the tail is a single named or numeric
  # character reference (e.g. "&hellip;" or "&#8230;"), its size otherwise.
  def tail_length
    tail_text = tail.to_s
    tail_text =~ /\A&#?\w+;\z/ ? 1 : tail_text.length
  end

  # Appends text that had to be cut. Only the text is HTML-encoded; the tail is
  # appended verbatim, exactly as close_truncated_document does.
  def append_truncated_text(text)
    encoded_text = @html_coder.encode(text)
    if @tail_before_final_tag
      append_to_truncated_string encoded_text, text.length
    else
      @tail_appended = true
      append_to_truncated_string "#{encoded_text}#{tail}", text.length + tail.to_s.length
    end
  end

  # Appends a comment, truncating it when it does not fit. The full length of
  # the emitted comment is charged to the budget, whatever +count_tags+ says.
  def process_comment(string)
    remaining_length = max_length - @estimated_length - 1
    tag = comment_tag(string)
    tag = truncate_comment(string, remaining_length) if tag.length > remaining_length
    append_to_truncated_string tag
  end

  def comment_tag(comment)
    "#{COMMENT_OPENING}#{comment}#{COMMENT_CLOSING}"
  end

  # Builds a well-formed comment holding as much of +comment+ as fits after
  # the opening delimiter. The body is cut, never the delimiters, so the result
  # is always opened and closed properly.
  def truncate_comment(comment, remaining_length)
    body_budget = [remaining_length + 1 - COMMENT_OPENING.length, 0].max
    kept = comment[0, body_budget]
    return comment_tag(kept) if @tail_before_final_tag

    @tail_appended = true
    comment_tag("#{kept}#{tail}")
  end

  def single_tag_element?(name)
    SINGLE_TAGS.include?(name)
  end

  # Appends +string+ to the output and charges the budget with
  # +overriden_length+ when given, or with the string's own length.
  def append_to_truncated_string(string, overriden_length = nil)
    @truncated_string << string
    increase_estimated_length(overriden_length || string.length)
  end

  def opening_tag(name, attributes)
    attributes_string = attributes_to_string(attributes)
    terminator = single_tag_element?(name) ? '/>' : '>'
    "<#{name}#{attributes_string}#{terminator}"
  end

  # Returns the attribute list with a leading space, or "" when nothing is
  # left to emit.
  def attributes_to_string(attributes)
    return "" if attributes.empty?

    concatenate_attributes_declaration(attributes).rstrip
  end

  # Renders every non-filtered attribute as key='encoded value', each followed
  # by a space, after one leading space.
  def concatenate_attributes_declaration(attributes)
    declarations = attributes.map do |key, value|
      next if @filtered_attributes.include?(key)

      "#{key}='#{@html_coder.encode(value)}' "
    end
    " #{declarations.compact.join}"
  end

  def closing_tag(name)
    "</#{name}>"
  end

  def increase_estimated_length(amount)
    @estimated_length += amount
    check_max_length_reached
  end

  # Latches the flag; it is never reset while a document is parsed.
  def check_max_length_reached
    @max_length_reached = true if @estimated_length >= max_length
  end

  # Adds the tail (unless it was already attached to cut content) and closes
  # all elements that are still open, innermost first.
  def close_truncated_document
    append_tail_between_closing_tags if @tail_before_final_tag
    append_to_truncated_string tail.to_s unless @tail_appended
    append_closing_tags
  end

  def append_closing_tags
    @closing_tags.reverse_each { |name| append_to_truncated_string closing_tag(name) }
  end

  # Length override for tags: nil means "use the tag's real length",
  # 0 means tags are free.
  def overriden_tag_length
    @count_tags ? nil : 0
  end

  def ignorable_tag?(name)
    artificial_root_name?(name) || IGNORABLE_TAGS.include?(name.downcase)
  end

  def artificial_root_name?(name)
    name == Truncato::ARTIFICIAL_ROOT_NAME
  end

  # When more than one element is open, closes the innermost one first so the
  # tail that follows ends up outside it.
  def append_tail_between_closing_tags
    return unless @closing_tags.length > 1

    append_to_truncated_string closing_tag(@closing_tags.pop)
  end

  def enter_ignored_level
    @ignored_levels += 1
  end

  def exit_ignored_level
    @ignored_levels -= 1
  end

  # True while parsing inside at least one filtered tag.
  def ignore_mode?
    @ignored_levels > 0
  end
end
|