File: encoder.rb

package info (click to toggle)
ruby-htree 0.8%2Bdfsg-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 520 kB
  • sloc: ruby: 5,928; makefile: 23
file content (342 lines) | stat: -rw-r--r-- 9,814 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
if !"".respond_to?(:encode)
  require 'iconv'
end

module HTree
  class DummyEncodingConverter
    def initialize(encoding)
      @encoding = encoding
    end

    def primitive_convert(src, dst, destination_buffer=nil, destination_byteoffset=nil, destination_bytesize=nil, opts=nil)
      dst << src
      src.clear
      :source_buffer_empty
    end

    def convert(str)
      str
    end

    def finish
      ""
    end
  end

  class Encoder
    # Returns the name of the charset the encoder assumes for its input
    # strings by default.
    #
    # When the Encoding constant is defined, this is the name of
    # Encoding.default_external.  Otherwise it is the MIME charset mapped
    # from $KCODE through KcodeCharset:
    #
    # - 'ISO-8859-1' when $KCODE=='NONE'
    # - 'UTF-8' when $KCODE=='UTF8'
    # - 'EUC-JP' when $KCODE=='EUC'
    # - 'Shift_JIS' when $KCODE=='SJIS'
    #
    # The $KCODE mapping ignores EUC-KR and the single-byte charsets other
    # than ISO-8859-1.
    def Encoder.internal_charset
      return KcodeCharset[$KCODE] unless Object.const_defined? :Encoding
      Encoding.default_external.name
    end

    # Creates an encoder converting text from +internal_encoding+ to
    # +output_encoding+.
    #
    # output_encoding::   name of the charset of the generated output.
    # internal_encoding:: name of the charset of the strings handed to the
    #                     encoder; defaults to
    #                     HTree::Encoder.internal_charset.
    #
    # Besides the main converter, one converter is prepared for every entry
    # of SubCharset[output_encoding]; they are used to find out whether the
    # output also fits into one of those smaller charsets.
    #
    # HTML output mode starts disabled.
    def initialize(output_encoding, internal_encoding=HTree::Encoder.internal_charset)
      # dup: the buffer is appended to in place, so it must not be a
      # (possibly frozen) literal.
      @buf = ''.dup
      @internal_encoding = internal_encoding
      @output_encoding = output_encoding
      @ic = new_charset_converter(output_encoding)
      @charpat = FirstCharPattern[internal_encoding]
      @subcharset_list = SubCharset[output_encoding] || []
      @subcharset_ic = {}
      @subcharset_list.each {|subcharset|
        @subcharset_ic[subcharset] = new_charset_converter(subcharset)
      }
      @html_output = false
    end

    # Builds the converter from @internal_encoding to +charset+: a
    # DummyEncodingConverter when both name the same encoding, an
    # Encoding::Converter otherwise, or an Iconv instance when
    # Encoding::Converter is not defined.
    def new_charset_converter(charset)
      if defined? Encoding::Converter
        if same_charset?(@internal_encoding, charset)
          DummyEncodingConverter.new(@internal_encoding)
        else
          Encoding::Converter.new(@internal_encoding, charset)
        end
      else
        Iconv.new(charset, @internal_encoding)
      end
    end
    private :new_charset_converter

    # True when +name_a+ and +name_b+ designate the same encoding, including
    # different spellings or aliases of it ('utf-8' vs 'UTF-8').
    # Encoding::Converter cannot be created for such an identity conversion.
    # Names unknown to Encoding.find are compared literally only.
    def same_charset?(name_a, name_b)
      return true if name_a == name_b
      enc_a = Encoding.find(name_a)
      !enc_a.nil? && enc_a == Encoding.find(name_b)
    rescue ArgumentError, TypeError
      false
    end
    private :same_charset?

    # :stopdoc:
    # Returns the HTML-output flag: false after construction, otherwise the
    # value last assigned through html_output=.
    def html_output?
      @html_output
    end

    # Sets the HTML-output flag.  While it is true, output_slash_if_xml emits
    # nothing and the output_cdata_* methods take their HTML branch.
    def html_output=(flag)
      @html_output = flag
    end

    # Runs the +body+ callable, wrapped by +pre+ and +post+ when HTML output
    # is enabled.
    #
    # +pre+ and +body+ are called without arguments; +post+ is called with
    # +out+.  Returns +out+.
    def output_cdata_content_do(out, pre, body, post)
      # Read the flag once so that the callbacks cannot change the decision.
      html = @html_output
      pre.call if html
      body.call
      post.call(out) if html
      out
    end

    # Emits a '/' through output_string unless HTML output is enabled.
    def output_slash_if_xml
      output_string('/') unless @html_output
    end

    # Outputs the children of a CDATA-content element.
    #
    # Without HTML output every node of +content+ is asked to output itself
    # with +context+.  With HTML output only the HTree::Text nodes are kept;
    # they are concatenated and written through Text#output_cdata.
    def output_cdata_content(content, context)
      unless @html_output
        return content.each {|node| node.output(self, context) }
      end
      # xxx: should raise an error for non-text node?
      text_nodes = content.grep(HTree::Text)
      HTree::Text.concat(*text_nodes).output_cdata(self)
    end

    # Joins +args+ into one string and writes it with output_string.
    #
    # Raises ArgumentError when the joined string contains '</'.
    def output_cdata_for_html(*args)
      joined = args.join('')
      raise ArgumentError, "cdata contains '</' : #{joined.inspect}" if %r{</} =~ joined
      output_string joined
    end

    # Appends one piece of output to the buffer.
    #
    # internal_str:: the text in the internal encoding.
    # external_str:: the same text already converted to the output encoding;
    #                when omitted it is obtained from the main converter.
    #
    # Every sub-charset converter is fed +internal_str+ as well; a
    # sub-charset is dropped as soon as its converter fails or yields
    # something different from +external_str+.
    #
    # Returns nil.
    def output_string(internal_str, external_str=nil)
      external_str ||=
        if @ic.respond_to? :convert
          @ic.convert(internal_str)
        else
          @ic.iconv(internal_str)
        end
      if @buf.empty? && @buf.respond_to?(:force_encoding) # xxx: should be fixed Ruby itself
        @buf.force_encoding(external_str.encoding)
      end
      @buf << external_str
      @subcharset_ic.delete_if {|_subcharset, sub_ic|
        if sub_ic.respond_to? :convert
          begin
            sub_ic.convert(internal_str) != external_str
          rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
            true
          end
        else
          begin
            sub_ic.iconv(internal_str) != external_str
          rescue Iconv::Failure
            true
          end
        end
      }
      nil
    end

    # Converts +string+ to the output encoding and appends it to the buffer
    # through output_string, substituting a replacement for every part that
    # cannot be converted instead of raising.
    #
    # With converters responding to +convert+ (Encoding::Converter style):
    # - an invalid byte sequence is replaced by '?';
    # - a character undefined in the output encoding is replaced by the
    #   result of encoding it to US-ASCII with the :xml => :text option.
    #
    # With Iconv, the first character of the unconvertible remainder is
    # replaced by a decimal character reference ("&#N;"), or by '?' when no
    # character can be extracted or its code point cannot be determined.
    #
    # Returns nil.
    def output_text(string)
      if string.respond_to? :encode
        # Bring the string to the internal encoding unless it is US-ASCII or
        # already tagged with that encoding name.
        if string.encoding != Encoding::US_ASCII &&
           string.encoding.to_s != @internal_encoding
          string = string.encode(@internal_encoding)
        end
        # Work on a binary-tagged copy; the caller's string is not modified.
        string = string.dup.force_encoding("ASCII-8BIT")
      end
      # Each pass converts what it can; after a failure the loop resumes with
      # the remainder.  The loop is left only through return or raise.
      while true
        if @ic.respond_to? :convert
          if string
            # primitive_convert consumes its source buffer, so hand it a copy
            # and keep +string+ intact for the size arithmetic below.
            src = string.dup
            res = @ic.primitive_convert(src, dst="", nil, nil, :partial_input => true)
          else
            res = @ic.primitive_convert(nil, dst="")
          end
          case res
          when :invalid_byte_sequence
            # dst: output produced so far; src: input not yet consumed.
            success = dst
            failed = src
            _, _, _, error_bytes, _ = @ic.primitive_errinfo
            # Length of the leading part of +string+ that produced +success+:
            # everything but the unread remainder and the offending bytes.
            preconv_bytesize = string.bytesize - failed.bytesize - error_bytes.bytesize
            output_string string[0, preconv_bytesize], success
            # Continue with whatever the converter hands back plus the unread
            # remainder; the offending bytes themselves become '?'.
            string = @ic.putback + failed
            output_string '?'
            next
          when :undefined_conversion
            success = dst
            failed = src
            # enc1 is used below as the encoding error_bytes is expressed in.
            _, enc1, _, error_bytes, _ = @ic.primitive_errinfo
            preconv_bytesize = string.bytesize - failed.bytesize - error_bytes.bytesize
            output_string string[0, preconv_bytesize], success
            string = @ic.putback + failed
            # Emit the unrepresentable character in XML-escaped ASCII form.
            output_string error_bytes.encode('US-ASCII', enc1, :xml=>:text)
            next
          when :source_buffer_empty, :finished
            # Everything was converted.
            output_string string, dst
            return
          else
            raise "unexpected encoding converter result: #{res}"
          end
        else
          # Iconv path.
          begin
            output_string string, @ic.iconv(string)
            return
          rescue Iconv::IllegalSequence, Iconv::InvalidCharacter => e
            # NOTE(review): presumably e.success is the converted prefix and
            # e.failed the unconverted rest -- confirm against Iconv docs.
            success = e.success
            failed = e.failed
          end
          output_string string[0, string.length - failed.length], success
        end
        # Only the Iconv path reaches this point: every branch of the case
        # above ends in next, return or raise.
        if FirstCharPattern[@internal_encoding] !~ failed
          # No complete character at the start of +failed+: skip its first
          # element and emit '?'.
          # xxx: should be configurable?
          #raise ArgumentError, "cannot extract first character: #{e.failed.dump}"
          string = failed[1, failed.length-1]
          output_string '?'
        else
          # $& is the matched first character, $' what follows it.
          char = $&
          rest = $'
          begin
            if char.respond_to? :encode
              excs = [Encoding::UndefinedConversionError,
                      Encoding::InvalidByteSequenceError]
              ucode = char.encode("UTF-8", @internal_encoding).unpack("U")[0]
            else
              excs = [Iconv::IllegalSequence, Iconv::InvalidCharacter]
              ucode = Iconv.conv("UTF-8", @internal_encoding, char).unpack("U")[0]
            end
            # Decimal numeric character reference for the code point.
            char = "&##{ucode};"
          rescue *excs
            # xxx: should be configurable?
            char = '?'
          end
          output_string char
          string = rest
        end
      end
    end

    # Character references substituted for markup-significant characters by
    # output_dynamic_text and output_dynamic_attvalue.
    ChRef = {
      '&' => '&amp;', '<' => '&lt;',
      '>' => '&gt;',  '"' => '&quot;',
    }

    # Escapes +string+ for use as element content and passes the result to
    # output_text.
    #
    # An object responding to +rcdata+ contributes that value with only '<'
    # and '>' replaced ('&' is left alone, presumably because rcdata is
    # already reference-escaped).  Anything else is converted with +to_s+ and
    # has '&', '<' and '>' replaced.
    def output_dynamic_text(string)
      escaped =
        if string.respond_to? :rcdata
          string.rcdata.gsub(/[<>]/) {|ch| ChRef[ch] }
        else
          string.to_s.gsub(/[&<>]/) {|ch| ChRef[ch] }
        end
      output_text(escaped)
    end

    # Escapes +string+ for use inside a double-quoted attribute value and
    # passes the result to output_text.
    #
    # An object responding to +rcdata+ contributes that value with '<', '>'
    # and '"' replaced ('&' is left alone, presumably because rcdata is
    # already reference-escaped).  Anything else is converted with +to_s+ and
    # has '&', '<', '>' and '"' replaced.
    def output_dynamic_attvalue(string)
      escaped =
        if string.respond_to? :rcdata
          string.rcdata.gsub(/[<>"]/) {|ch| ChRef[ch] }
        else
          string.to_s.gsub(/[&<>"]/) {|ch| ChRef[ch] }
        end
      output_text(escaped)
    end

    # :startdoc:

    # Terminates the conversion and returns the output buffer.
    #
    # The closing output of the main converter (+finish+, or +close+ for
    # converters without +finish+) is appended to the buffer.  Each remaining
    # sub-charset converter is terminated the same way; a sub-charset is
    # dropped when its closing output differs or its converter fails.
    def finish
      external_str = @ic.respond_to?(:finish) ? @ic.finish : @ic.close
      @buf << external_str
      @subcharset_ic.delete_if {|_subcharset, sub_ic|
        if sub_ic.respond_to? :finish
          begin
            sub_ic.finish != external_str
          rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
            true
          end
        else
          begin
            sub_ic.close != external_str
          rescue Iconv::Failure
            true
          end
        end
      }
      @buf
    end

    # Terminates the conversion (see #finish) and returns the output prefixed
    # with an XML declaration.
    #
    # The declared encoding is #minimal_charset, evaluated after the
    # conversion has been finished; the declaration itself is converted from
    # US-ASCII to the output encoding.
    def finish_with_xmldecl
      body = finish
      decl = "<?xml version=\"1.0\" encoding=\"#{minimal_charset}\"?>"
      encoded_decl =
        if decl.respond_to? :encode
          decl.encode(@output_encoding, 'US-ASCII')
        else
          Iconv.conv(@output_encoding, 'US-ASCII', decl)
        end
      encoded_decl + body
    end

    # Returns the first charset of the sub-charset list whose converter is
    # still registered, or the output encoding when none is left.
    def minimal_charset
      surviving = @subcharset_list.find {|subcharset| @subcharset_ic.include? subcharset }
      surviving || @output_encoding
    end

    # :stopdoc:

    # Maps the values of $KCODE to MIME charset names; consulted by
    # Encoder.internal_charset when the Encoding constant is not defined.
    KcodeCharset = {
      'EUC'  => 'EUC-JP', 'SJIS' => 'Shift_JIS',
      'UTF8' => 'UTF-8',  'NONE' => 'ISO-8859-1',
    }

    # Regexps matching exactly one character of each supported internal
    # charset, keyed by charset name.  The patterns are written at byte level
    # (note the /n flag); the multi-line ones also use /x, so the layout
    # whitespace inside them is not part of the pattern.  The UTF-8 pattern
    # accepts lead bytes \xc0-\xfd, i.e. also 5- and 6-byte sequences.
    SingleCharPattern = {
      'EUC-JP' => /(?:
         [\x00-\x7f]
        |[\xa1-\xfe][\xa1-\xfe]
        |\x8e[\xa1-\xfe]
        |\x8f[\xa1-\xfe][\xa1-\xfe])/nx,
      'Shift_JIS' => /(?:
         [\x00-\x7f]
        |[\x81-\x9f][\x40-\x7e\x80-\xfc]
        |[\xa1-\xdf]
        |[\xe0-\xfc][\x40-\x7e\x80-\xfc])/nx,
      'UTF-8' => /(?:
         [\x00-\x7f]
        |[\xc0-\xdf][\x80-\xbf]
        |[\xe0-\xef][\x80-\xbf][\x80-\xbf]
        |[\xf0-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]
        |[\xf8-\xfb][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf]
        |[\xfc-\xfd][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf][\x80-\xbf])/nx,
      'ISO-8859-1' => /[\x00-\xff]/n
    }

    # For every charset of SingleCharPattern, the same pattern anchored at
    # the start of the string (\A), used to split off the first character.
    FirstCharPattern = {}
    SingleCharPattern.each_pair do |charset_name, single_char|
      FirstCharPattern[charset_name] = /\A#{single_char}/
    end

    # Maps an output charset to the list of smaller charsets, in order of
    # preference, that the output may turn out to fit in.  Charsets not
    # listed here fall back to ['US-ASCII']; the UTF-16 variants have none.
    SubCharset = Hash.new(['US-ASCII'])
    SubCharset.update(
      'ISO-2022-JP-2' => ['US-ASCII', 'ISO-2022-JP'],
      'ISO-2022-JP-3' => ['US-ASCII', 'ISO-2022-JP'],
      'UTF-16BE' => [],
      'UTF-16LE' => [],
      'UTF-16' => []
    )

    # :startdoc:
  end
end