File: sgml-parser.rb

package info (click to toggle)
ruby-feedparser 0.11.4-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 228 kB
  • sloc: ruby: 1,871; sh: 24; makefile: 5
file content (331 lines) | stat: -rw-r--r-- 7,986 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# A parser for SGML, using the derived class as static DTD.
# from http://raa.ruby-lang.org/project/html-parser
module FeedParser
  # Event-driven tokenizer for SGML-like markup (HTML in practice).
  #
  # Text is pushed in with #feed (any number of times) and flushed with
  # #close.  While scanning, the parser calls hook methods on itself:
  #
  # * handle_data(text), handle_comment(text), handle_special(text)
  # * for a start tag <foo ...>: start_foo(attrs) if defined (the element is
  #   then pushed on the open-element stack), else do_foo(attrs) if defined
  #   (nothing is pushed), else unknown_starttag(tag, attrs)
  # * for an end tag </foo>: end_foo if defined, else unknown_endtag(tag)
  # * unknown_charref(digits) and unknown_entityref(name)
  #
  # Subclasses provide the behaviour by defining/overriding these hooks; the
  # default hooks in this class do nothing.  Tag and attribute names are
  # lower-cased before being reported; attrs is an array of [name, value]
  # pairs.
  class SGMLParser
    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')

    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
    Charref = /&#([0-9]+);/

    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

    # Entity names that handle_entityref expands into literal text.
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, report_unbalanced prints diagnostics.
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Discards buffered input and returns the parser to its initial state.
    def reset
      @rawdata = ''
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
    end

    # Returns true when an element named +gi+ is currently open, i.e. it was
    # pushed by a start_<gi> handler and has not been closed yet.
    def has_context(gi)
      @stack.include? gi
    end

    # From now on everything fed to the parser is passed to handle_data
    # verbatim; no markup is recognised any more (until #reset).
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enters literal mode: start tags, comments and <!...> declarations are
    # treated as plain text until the next end tag has been processed.
    # Arguments are accepted and ignored.
    def setliteral(*args)
      @literal = true
    end

    # Appends +data+ to the input buffer and processes as much of it as can
    # be parsed unambiguously; an unfinished construct at the end stays
    # buffered for the next call.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Signals end of input: whatever is still buffered is flushed as data.
    def close
      goahead(true)
    end

    # Main scanning loop over the buffered input.
    #
    # _end:: true when no more input will follow; leftover text that could
    #        not be parsed is then emitted through handle_data instead of
    #        being kept in the buffer.
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          handle_data(rawdata[i..(n-1)])
          i = n
          break
        end
        # Everything up to the next '&' or '<' is plain character data.
        j = rawdata.index(Interesting, i) || n
        handle_data(rawdata[i..(j-1)]) if i < j
        i = j
        break if i == n
        if rawdata[i] == '<'
          if rawdata.index(Starttagopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            # parse_starttag returns the absolute position after the tag,
            # or nil when the closing bracket has not arrived yet.
            k = parse_starttag(i)
            break unless k
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            # Any end tag terminates literal mode.
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            # parse_comment returns a length, not a position.
            k = parse_comment(i)
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            # parse_special returns a length, not a position.
            k = parse_special(i)
            break unless k
            i += k
            next
          end
        elsif rawdata[i] == '&'
          if rawdata.index(Charref, i) == i
            ref = Regexp.last_match
            i += ref[0].length
            handle_charref(ref[1])
            i -= 1 unless rawdata[i-1] == ';'
            next
          end
          if rawdata.index(Entityref, i) == i
            ref = Regexp.last_match
            i += ref[0].length
            handle_entityref(ref[1])
            i -= 1 unless rawdata[i-1] == ';'
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end
        # We get here only if incomplete matches but
        # nothing else
        match = rawdata.index(Incomplete, i)
        unless match == i
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        j = match + Regexp.last_match(0).length
        break if j == n # Really incomplete
        handle_data(rawdata[i..(j-1)])
        i = j
      end
      # end while
      if _end && i < n
        handle_data(@rawdata[i..(n-1)])
        i = n
      end
      # Keep only the unconsumed tail for the next call.
      @rawdata = rawdata[i..-1]
    end

    # Parses the comment starting at offset +i+ and reports its body through
    # handle_comment.
    #
    # Returns the number of characters consumed, or nil when the comment
    # terminator is not in the buffer yet.  Raises RuntimeError when the text
    # at +i+ does not start with "<!--".
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      match = rawdata.index(Commentclose, i)
      return nil unless match
      matched_length = Regexp.last_match(0).length
      handle_comment(rawdata[i+4..(match-1)])
      match + matched_length - i
    end

    # Parses the start tag beginning at offset +i+ and dispatches it through
    # finish_starttag.
    #
    # Returns the offset just past the tag, or nil when no closing '<' or '>'
    # is in the buffer yet.  Raises RuntimeError when no tag name is found.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i+1] == '>'
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        unless rawdata.index(Tagfind, i + 1)
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        name = Regexp.last_match(0)
        k = i + 1 + name.length
        tag = name.downcase
        @lasttag = tag
      end
      while k < j
        found = Attrfind.match(rawdata, k)
        # An attribute has to begin before the bracket that ends this tag;
        # otherwise text following the tag (e.g. "hello" in "<br />hello")
        # would be reported as an attribute.
        break unless found && found.begin(0) < j
        attrname, rest, attrvalue = found[1], found[2], found[3]
        if !rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0] == "'" && attrvalue[-1] == "'") ||
              (attrvalue[0] == '"' && attrvalue[-1] == '"')
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        # Continue right after the match, which may have started past k when
        # stray characters such as '/' had to be skipped.
        k = found.end(0)
      end
      j += 1 if rawdata[j] == '>'
      finish_starttag(tag, attrs)
      j
    end

    # Parses the end tag beginning at offset +i+ and dispatches it through
    # finish_endtag.
    #
    # Returns the offset just past the tag, or nil when no closing '<' or '>'
    # is in the buffer yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = rawdata[i+2..j-1].strip.downcase
      j += 1 if rawdata[j] == '>'
      finish_endtag(tag)
      j
    end

    # Dispatches a start tag to the matching handler.
    #
    # Returns 1 when start_<tag> handled it (the element is pushed on the
    # stack), 0 when do_<tag> handled it, and -1 when unknown_starttag was
    # called.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if self.respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      end
      method = 'do_' + tag
      if self.respond_to?(method)
        handle_starttag(tag, method, attrs)
        0
      else
        unknown_starttag(tag, attrs)
        -1
      end
    end

    # Dispatches an end tag.
    #
    # An empty +tag+ (from "</>") closes the most recently opened element.
    # A named tag closes the innermost open element of that name together
    # with everything opened after it.  A tag that is not open is reported
    # through unknown_endtag unless an end_<tag> handler exists, in which
    # case it is silently ignored.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include? tag
          method = 'end_' + tag
          unknown_endtag(tag) unless self.respond_to?(method)
          return
        end
        # Innermost (last) occurrence: "</b>" inside "<b><i><b>" must close
        # only the inner <b>, not the whole stack.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parses the <!...> declaration starting at offset +i+ and reports the
    # text between the brackets (including the leading '!') through
    # handle_special.
    #
    # Returns the number of characters consumed, or nil when no closing
    # bracket is in the buffer yet.
    def parse_special(i)
      rawdata = @rawdata
      match = rawdata.index(Endbracket, i+1)
      return nil unless match
      matched_length = Regexp.last_match(0).length
      handle_special(rawdata[i+1..(match-1)])
      match - i + matched_length
    end

    # Invokes the start handler +method+ with the attribute list.
    def handle_starttag(tag, method, attrs)
      self.send(method, attrs)
    end

    # Invokes the end handler +method+.
    def handle_endtag(tag, method)
      self.send(method)
    end

    # Prints a diagnostic about an unbalanced end tag and the current element
    # stack when the parser was created with verbose = true.  The parser does
    # not call this itself.
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Handles a character reference; +name+ is the text between "&#" and
    # ";".  Anything containing a digit is forwarded to unknown_charref,
    # everything else is emitted as data.
    def handle_charref(name)
      if name =~ /[0-9]+/
        unknown_charref(name)
      else
        handle_data(name)
      end
    end

    # Handles an entity reference; +name+ is the text between "&" and ";".
    # Names listed in Entitydefs are emitted as data, all others are
    # forwarded to unknown_entityref.
    def handle_entityref(name)
      table = Entitydefs
      if table.include?(name)
        handle_data(table[name])
      else
        unknown_entityref(name)
      end
    end

    # Hook: character data.  Does nothing by default.
    def handle_data(data)
    end

    # Hook: body of a comment.  Does nothing by default.
    def handle_comment(data)
    end

    # Hook: content of a <!...> declaration.  Does nothing by default.
    def handle_special(data)
    end

    # Hook: start tag without start_/do_ handler.  Does nothing by default.
    def unknown_starttag(tag, attrs)
    end

    # Hook: end tag without end_ handler.  Does nothing by default.
    def unknown_endtag(tag)
    end

    # Hook: numeric character reference.  Does nothing by default.
    def unknown_charref(ref)
    end

    # Hook: entity reference not in Entitydefs.  Does nothing by default.
    def unknown_entityref(ref)
    end
  end
end