File: truncated_sax_document.rb

package info (click to toggle)
ruby-truncato 0.7.12-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 84 kB
  • sloc: ruby: 194; makefile: 3
file content (193 lines) | stat: -rw-r--r-- 5,196 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
require 'nokogiri'
require 'htmlentities'

# SAX handler that re-serialises the markup it is fed into +truncated_string+
# and stops emitting content once an estimated length budget (+max_length+)
# has been used up. When the budget is reached, +end_document+ appends the
# tail (unless it was already written) and closes every element still open.
#
# Recognised option keys (see +capture_options+):
#   :max_length            - length budget compared against the estimated length
#   :count_tags            - when truthy, tag markup counts towards the budget
#   :count_tail            - when true, the tail is pre-counted (default false)
#   :tail                  - string appended where the text is cut
#   :filtered_attributes   - attribute names dropped from opening tags (default [])
#   :filtered_tags         - tag names dropped together with their content (default [])
#   :tail_before_final_tag - place the tail just before the last closing tag (default false)
#   :comments              - when true, comments are copied to the output (default false)
class TruncatedSaxDocument < Nokogiri::XML::SAX::Document
  # Tags that are never written to the output; their content still is.
  IGNORABLE_TAGS = %w(html head body).freeze

  # Tags written in self-closing form and never pushed on the closing stack.
  SINGLE_TAGS = %w{br img}.freeze

  attr_reader :truncated_string, :max_length, :max_length_reached, :tail,
              :count_tags, :filtered_attributes, :filtered_tags, :ignored_levels

  # @param options [Hash] truncation settings, see the class comment
  def initialize(options)
    @html_coder = HTMLEntities.new
    capture_options(options)
    init_parsing_state
  end

  # SAX callback: an element was opened.
  #
  # Filtered tags bump the ignore depth before anything else, so the matching
  # +end_element+ can always undo it, even after the budget has been reached.
  def start_element(name, attributes)
    enter_ignored_level if filtered_tags.include?(name)
    return if @max_length_reached || ignorable_tag?(name) || ignore_mode?

    @closing_tags.push(name) unless single_tag_element?(name)
    append_to_truncated_string(opening_tag(name, attributes), overriden_tag_length)
  end

  # SAX callback: a run of (already decoded) text.
  #
  # The text is cut when it does not fit the remaining budget. It is
  # re-encoded for output, but only its decoded length is counted.
  def characters(decoded_string)
    return if @max_length_reached || ignore_mode?

    remaining_length = max_length - @estimated_length - 1
    string_to_append =
      if decoded_string.length > remaining_length
        truncate_string(decoded_string, remaining_length)
      else
        decoded_string
      end
    append_to_truncated_string(@html_coder.encode(string_to_append), string_to_append.length)
  end

  # SAX callback: a comment. Only copied when the :comments option is on.
  #
  # Comments inside a filtered tag are dropped like any other content of
  # that tag.
  def comment(string)
    return unless @comments
    return if @max_length_reached || ignore_mode?

    process_comment(string)
  end

  # SAX callback: an element was closed.
  def end_element(name)
    if filtered_tags.include?(name) && ignore_mode?
      exit_ignored_level
      return
    end

    return if @max_length_reached || ignorable_tag?(name) || ignore_mode?
    return if single_tag_element?(name)

    @closing_tags.pop
    append_to_truncated_string(closing_tag(name), overriden_tag_length)
  end

  # SAX callback: parsing finished. Completes the output only when it was cut.
  def end_document
    close_truncated_document if max_length_reached
  end

  private

  # Copies the supported options into instance variables, applying defaults.
  def capture_options(options)
    @max_length = options[:max_length]
    @count_tags = options[:count_tags]
    @count_tail = options.fetch(:count_tail, false)
    @tail = options[:tail]
    @filtered_attributes = options[:filtered_attributes] || []
    @filtered_tags = options[:filtered_tags] || []
    @tail_before_final_tag = options.fetch(:tail_before_final_tag, false)
    @comments = options.fetch(:comments, false)
  end

  # Appends a comment, cutting it when it does not fit the remaining budget.
  def process_comment(string)
    tag = comment_tag(string)
    remaining_length = max_length - @estimated_length - 1
    # truncate_comment keeps remaining_length + 1 characters, so a comment of
    # exactly that size fits as it is and must not receive a second "-->".
    fits = tag.length <= remaining_length + 1
    append_to_truncated_string(fits ? tag : truncate_comment(tag, remaining_length))
  end

  def comment_tag(comment)
    "<!--#{comment}-->"
  end

  # Resets the per-parse state. Relies on the options captured beforehand.
  def init_parsing_state
    @truncated_string = ""
    @closing_tags = []
    @estimated_length = @count_tail ? tail_length : 0
    @max_length_reached = false
    @tail_appended = false
    @ignored_levels = 0
  end

  # Length the tail contributes to the budget: a tail that consists solely of
  # one named entity (e.g. "&hellip;") counts as a single character.
  def tail_length
    # \A and \z anchor the whole string; ^ and $ would match a single line.
    tail =~ /\A&\w+;\z/ ? 1 : tail.length
  end

  def single_tag_element?(name)
    SINGLE_TAGS.include?(name)
  end

  # Appends +string+ to the output and charges +overriden_length+ (or the
  # string's own length when nil) against the budget.
  def append_to_truncated_string(string, overriden_length = nil)
    @truncated_string << string
    increase_estimated_length(overriden_length || string.length)
  end

  def opening_tag(name, attributes)
    attributes_string = attributes_to_string(attributes)
    if single_tag_element?(name)
      "<#{name}#{attributes_string}/>"
    else
      "<#{name}#{attributes_string}>"
    end
  end

  # Serialises attributes with a leading space, or "" when nothing is left
  # to write.
  def attributes_to_string(attributes)
    return "" if attributes.empty?

    concatenate_attributes_declaration(attributes).rstrip
  end

  # Builds " key='value' key2='value2' ", skipping filtered attribute names.
  def concatenate_attributes_declaration(attributes)
    attributes.each_with_object(' '.dup) do |(key, value), declaration|
      next if @filtered_attributes.include?(key)

      declaration << "#{key}='#{@html_coder.encode(value)}' "
    end
  end

  def closing_tag(name)
    "</#{name}>"
  end

  def increase_estimated_length(amount)
    @estimated_length += amount
    check_max_length_reached
  end

  def check_max_length_reached
    @max_length_reached = true if @estimated_length >= max_length
  end

  # Cuts text to remaining_length + 1 characters. Unless the tail is placed
  # later (tail_before_final_tag), it is appended right at the cut.
  def truncate_string(string, remaining_length)
    kept = string[0..remaining_length]
    return kept if @tail_before_final_tag

    @tail_appended = true
    "#{kept}#{tail}"
  end

  # Cuts a serialised comment to remaining_length + 1 characters and always
  # re-closes it, so markup appended afterwards is not swallowed by an open
  # comment. Only called for comments that do not fit entirely.
  def truncate_comment(string, remaining_length)
    kept = string[0..remaining_length]
    return "#{kept}-->" if @tail_before_final_tag

    @tail_appended = true
    "#{kept}#{tail}-->"
  end

  # Writes the tail (if still pending) and closes every open element.
  def close_truncated_document
    append_tail_between_closing_tags if @tail_before_final_tag
    append_to_truncated_string(tail) unless @tail_appended
    append_closing_tags
  end

  # Closes the open elements innermost first.
  def append_closing_tags
    @closing_tags.reverse_each { |name| append_to_truncated_string(closing_tag(name)) }
  end

  # Length charged for tag markup: nil means "use the real length", 0 means
  # tags are free.
  def overriden_tag_length
    @count_tags ? nil : 0
  end

  def ignorable_tag?(name)
    artificial_root_name?(name) || IGNORABLE_TAGS.include?(name.downcase)
  end

  # Truncato::ARTIFICIAL_ROOT_NAME is defined outside this file; presumably it
  # names a wrapper element added around the input before parsing - confirm.
  def artificial_root_name?(name)
    name == Truncato::ARTIFICIAL_ROOT_NAME
  end

  # Closes the innermost open element ahead of the tail, provided at least
  # one other element stays open to enclose the tail.
  def append_tail_between_closing_tags
    return unless @closing_tags.length > 1

    append_to_truncated_string(closing_tag(@closing_tags.pop))
  end

  def enter_ignored_level
    @ignored_levels += 1
  end

  def exit_ignored_level
    @ignored_levels -= 1
  end

  # True while the parser is inside at least one filtered tag.
  def ignore_mode?
    @ignored_levels > 0
  end
end