File: encoding_reader.rb

package info (click to toggle)
ruby-nokogiri 1.18.2%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 8,076 kB
  • sloc: ansic: 38,893; xml: 27,665; ruby: 27,285; java: 15,348; cpp: 7,107; yacc: 244; sh: 208; makefile: 154; sed: 14
file content (121 lines) | stat: -rw-r--r-- 3,627 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# frozen_string_literal: true

module Nokogiri
  module HTML4
    # Libxml2's parser has poor support for encoding detection.  First, it does not recognize the
    # HTML5-style meta charset declaration.  Second, even if it successfully detects an encoding
    # hint, it does not re-decode or re-parse the preceding part, which may be garbled.
    #
    # EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
    # emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
    # hint is found.

    # :nodoc: all
    class EncodingReader
      class EncodingFound < StandardError
        attr_reader :found_encoding

        def initialize(encoding)
          @found_encoding = encoding
          super(format("encoding found: %s", encoding))
        end
      end

      class SAXHandler < Nokogiri::XML::SAX::Document
        attr_reader :encoding

        def initialize
          @encoding = nil
          super
        end

        def start_element(name, attrs = [])
          return unless name == "meta"

          attr = Hash[attrs]
          (charset = attr["charset"]) &&
            (@encoding = charset)
          (http_equiv = attr["http-equiv"]) &&
            http_equiv.match(/\AContent-Type\z/i) &&
            (content = attr["content"]) &&
            (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
            (@encoding = m[1])
        end
      end

      class JumpSAXHandler < SAXHandler
        def initialize(jumptag)
          @jumptag = jumptag
          super()
        end

        def start_element(name, attrs = [])
          super
          throw(@jumptag, @encoding) if @encoding
          throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
        end
      end

      def self.detect_encoding(chunk)
        (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
          (return Nokogiri.XML(m[1]).encoding)

        if Nokogiri.jruby?
          (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
            (return m[4])
          catch(:encoding_found) do
            Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
            nil
          end
        else
          handler = SAXHandler.new
          parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
          begin
            parser << chunk
          rescue
            Nokogiri::SyntaxError
          end
          handler.encoding
        end
      end

      def initialize(io)
        @io = io
        @firstchunk = nil
        @encoding_found = nil
      end

      # This method is used by the C extension so that
      # Nokogiri::HTML4::Document#read_io() does not leak memory when
      # EncodingFound is raised.
      attr_reader :encoding_found

      def read(len)
        # no support for a call without len

        unless @firstchunk
          (@firstchunk = @io.read(len)) || return

          # This implementation expects that the first call from
          # htmlReadIO() is made with a length long enough (~1KB) to
          # achieve advanced encoding detection.
          if (encoding = EncodingReader.detect_encoding(@firstchunk))
            # The first chunk is stored for the next read in retry.
            raise @encoding_found = EncodingFound.new(encoding)
          end
        end
        @encoding_found = nil

        ret = @firstchunk.slice!(0, len)
        if (len -= ret.length) > 0
          (rest = @io.read(len)) && ret << (rest)
        end
        if ret.empty?
          nil
        else
          ret
        end
      end
    end
  end
end