1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331
|
# A parser for SGML, using the derived class as a static DTD.
# From http://raa.ruby-lang.org/project/html-parser
module FeedParser
  # Event-driven tokenizer for SGML-style markup.
  #
  # Text is supplied incrementally through #feed and flushed with #close.
  # While scanning, the parser dispatches to methods on itself, so a subclass
  # acts as the DTD:
  #
  # * <tt>start_TAG(attrs)</tt> / <tt>end_TAG</tt> -- container elements; the
  #   tag name is pushed on an internal stack when opened and popped when
  #   closed.
  # * <tt>do_TAG(attrs)</tt> -- elements that are not tracked on the stack.
  # * +handle_data+, +handle_comment+, +handle_special+ and the
  #   <tt>unknown_*</tt> hooks -- everything else. All of them are no-ops here.
  #
  # +attrs+ is an array of <tt>[name, value]</tt> pairs; names are downcased
  # and surrounding quotes are removed from values.
  class SGMLParser
    # Regular expressions used for parsing:
    Interesting = /[&<]/
    Incomplete = Regexp.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|' +
                                '<([a-zA-Z][^<>]*|/([a-zA-Z][^<>]*)?|' +
                                '![^<>]*)?')
    Entityref = /&([a-zA-Z][-.a-zA-Z0-9]*);/
    Charref = /&#([0-9]+);/
    Starttagopen = /<[>a-zA-Z]/
    Endtagopen = /<\/[<>a-zA-Z]/
    Endbracket = /[<>]/
    Special = /<![^<>]*>/
    Commentopen = /<!--/
    Commentclose = /--[ \t\n]*>/
    Tagfind = /[a-zA-Z][a-zA-Z0-9.-]*/
    Attrfind = Regexp.compile('[\s,]*([a-zA-Z_][a-zA-Z_0-9.-]*)' +
                              '(\s*=\s*' +
                              "('[^']*'" +
                              '|"[^"]*"' +
                              '|[-~a-zA-Z0-9,./:+*%?!()_#=]*))?')

    # Entity names that #handle_entityref expands by itself.
    Entitydefs =
      {'lt'=>'<', 'gt'=>'>', 'amp'=>'&', 'quot'=>'"', 'apos'=>'\''}

    # verbose:: when true, #report_unbalanced prints diagnostics.
    def initialize(verbose=false)
      @verbose = verbose
      reset
    end

    # Discards buffered input and returns the parser to its initial state.
    def reset
      @rawdata = ''
      @stack = []
      @lasttag = '???'
      @nomoretags = false
      @literal = false
    end

    # Returns true when an element named +gi+ is currently open.
    def has_context(gi)
      @stack.include? gi
    end

    # From now on all input is passed to #handle_data verbatim.
    def setnomoretags
      @nomoretags = true
      @literal = true
    end

    # Enters literal mode: start tags, comments and declarations are passed
    # through as data until the next end tag is parsed. Arguments are ignored.
    def setliteral(*args)
      @literal = true
    end

    # Appends +data+ to the buffer and processes as much of it as possible.
    # A trailing construct that is still incomplete stays buffered.
    def feed(data)
      @rawdata << data
      goahead(false)
    end

    # Processes whatever is still buffered, treating it as final input.
    def close
      goahead(true)
    end

    # Main scanning loop over the buffered input.
    #
    # _end:: when true, text left over after the loop is flushed through
    #        #handle_data instead of being kept for the next #feed.
    #
    # Raises RuntimeError if the scanner stops on a character that is neither
    # "<" nor "&" (this would indicate an internal inconsistency).
    def goahead(_end)
      rawdata = @rawdata
      i = 0
      n = rawdata.length
      while i < n
        if @nomoretags
          handle_data(rawdata[i..(n - 1)])
          i = n
          break
        end

        # Everything up to the next "<" or "&" is plain character data.
        j = rawdata.index(Interesting, i) || n
        handle_data(rawdata[i..(j - 1)]) if i < j
        i = j
        break if i == n

        case rawdata[i, 1]
        when '<'
          if rawdata.index(Starttagopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_starttag(i)
            break unless k # tag not complete yet; wait for more input
            i = k
            next
          end
          if rawdata.index(Endtagopen, i) == i
            k = parse_endtag(i)
            break unless k
            i = k
            @literal = false
            next
          end
          if rawdata.index(Commentopen, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_comment(i) # number of characters consumed
            break unless k
            i += k
            next
          end
          if rawdata.index(Special, i) == i
            if @literal
              handle_data(rawdata[i, 1])
              i += 1
              next
            end
            k = parse_special(i) # number of characters consumed
            break unless k
            i += k
            next
          end
        when '&'
          if rawdata.index(Charref, i) == i
            ref = Regexp.last_match
            i += ref[0].length
            handle_charref(ref[1])
            i -= 1 unless rawdata[i - 1, 1] == ';'
            next
          end
          if rawdata.index(Entityref, i) == i
            ref = Regexp.last_match
            i += ref[0].length
            handle_entityref(ref[1])
            i -= 1 unless rawdata[i - 1, 1] == ';'
            next
          end
        else
          raise RuntimeError, 'neither < nor & ??'
        end

        # We get here only if incomplete matches but nothing else.
        match = rawdata.index(Incomplete, i)
        unless match == i
          handle_data(rawdata[i, 1])
          i += 1
          next
        end
        j = match + Regexp.last_match(0).length
        break if j == n # Really incomplete
        handle_data(rawdata[i..(j - 1)])
        i = j
      end

      if _end && i < n
        handle_data(rawdata[i..(n - 1)])
        i = n
      end
      # Keep only the unprocessed tail for the next call.
      @rawdata = rawdata[i..-1]
    end

    # Parses the comment starting at index +i+ of the buffer and reports its
    # body through #handle_comment.
    #
    # Returns the number of characters consumed, or nil when the comment
    # terminator has not been received yet.
    # Raises RuntimeError if the buffer does not hold "<!--" at +i+.
    def parse_comment(i)
      rawdata = @rawdata
      if rawdata[i, 4] != '<!--'
        raise RuntimeError, 'unexpected call to handle_comment'
      end
      close_at = rawdata.index(Commentclose, i)
      return nil unless close_at
      close_length = Regexp.last_match(0).length
      handle_comment(rawdata[(i + 4)..(close_at - 1)])
      close_at + close_length - i
    end

    # Parses the start tag beginning at index +i+, collects its attributes
    # and dispatches it through #finish_starttag.
    #
    # Returns the index just past the tag, or nil when no closing bracket has
    # been received yet.
    # Raises RuntimeError if no tag name can be found.
    def parse_starttag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      attrs = []
      if rawdata[i + 1, 1] == '>'
        # SGML shorthand: <> == <last open tag seen>
        k = j
        tag = @lasttag
      else
        unless rawdata.index(Tagfind, i + 1)
          raise RuntimeError, 'unexpected call to parse_starttag'
        end
        name = Regexp.last_match(0)
        k = i + 1 + name.length
        tag = name.downcase
        @lasttag = tag
      end
      while k < j
        attr_at = rawdata.index(Attrfind, k)
        # The search is not anchored, so ignore anything that starts at or
        # after the closing bracket: it belongs to the following text.
        break unless attr_at && attr_at < j
        found = Regexp.last_match
        attrname, rest, attrvalue = found[1], found[2], found[3]
        if !rest
          attrvalue = '' # was: = attrname
        elsif (attrvalue[0, 1] == "'" && attrvalue[-1, 1] == "'") ||
              (attrvalue[0, 1] == '"' && attrvalue[-1, 1] == '"')
          attrvalue = attrvalue[1..-2]
        end
        attrs << [attrname.downcase, attrvalue]
        # Resume right after this match, wherever it started, so that no
        # text is scanned (and reported) twice.
        k = attr_at + found[0].length
      end
      j += 1 if rawdata[j, 1] == '>'
      finish_starttag(tag, attrs)
      j
    end

    # Parses the end tag beginning at index +i+ and dispatches it through
    # #finish_endtag.
    #
    # Returns the index just past the tag, or nil when no closing bracket has
    # been received yet.
    def parse_endtag(i)
      rawdata = @rawdata
      j = rawdata.index(Endbracket, i + 1)
      return nil unless j
      tag = rawdata[(i + 2)..(j - 1)].strip.downcase
      j += 1 if rawdata[j, 1] == '>'
      finish_endtag(tag)
      j
    end

    # Dispatches a start tag to the matching handler.
    #
    # Returns 1 when <tt>start_TAG</tt> handled it (the tag is pushed on the
    # stack), 0 when <tt>do_TAG</tt> handled it, and -1 when neither exists
    # and #unknown_starttag was called.
    def finish_starttag(tag, attrs)
      method = 'start_' + tag
      if respond_to?(method)
        @stack << tag
        handle_starttag(tag, method, attrs)
        return 1
      end
      method = 'do_' + tag
      if respond_to?(method)
        handle_starttag(tag, method, attrs)
        return 0
      end
      unknown_starttag(tag, attrs)
      -1
    end

    # Closes open elements in response to an end tag.
    #
    # An empty +tag+ (from "</>") closes the most recently opened element.
    # Otherwise every element down to and including the innermost open
    # +tag+ is closed, innermost first. A tag that is not open is reported
    # through #unknown_endtag unless an <tt>end_TAG</tt> method exists, in
    # which case it is silently ignored.
    def finish_endtag(tag)
      if tag == ''
        found = @stack.length - 1
        if found < 0
          unknown_endtag(tag)
          return
        end
      else
        unless @stack.include?(tag)
          unknown_endtag(tag) unless respond_to?('end_' + tag)
          return
        end
        # rindex picks the innermost open element, so closing one of several
        # nested same-name elements leaves the outer ones open.
        found = @stack.rindex(tag)
      end
      while @stack.length > found
        tag = @stack[-1]
        method = 'end_' + tag
        if respond_to?(method)
          handle_endtag(tag, method)
        else
          unknown_endtag(tag)
        end
        @stack.pop
      end
    end

    # Parses the "<!...>" declaration starting at index +i+ and reports the
    # text between the brackets through #handle_special.
    #
    # Returns the number of characters consumed, or nil when no closing
    # bracket has been received yet.
    def parse_special(i)
      rawdata = @rawdata
      close_at = rawdata.index(Endbracket, i + 1)
      return nil unless close_at
      close_length = Regexp.last_match(0).length
      handle_special(rawdata[(i + 1)..(close_at - 1)])
      close_at - i + close_length
    end

    # Invokes the start/do handler named +method+ with +attrs+.
    def handle_starttag(tag, method, attrs)
      send(method, attrs)
    end

    # Invokes the end handler named +method+.
    def handle_endtag(tag, method)
      send(method)
    end

    # Prints a diagnostic about an unbalanced end tag when the parser was
    # created with verbose set; does nothing otherwise.
    def report_unbalanced(tag)
      if @verbose
        print '*** Unbalanced </' + tag + '>', "\n"
        print '*** Stack:', @stack.inspect, "\n"
      end
    end

    # Handles a character reference; +name+ is the text between "&#" and ";".
    # References containing digits are passed to #unknown_charref, anything
    # else is emitted as data.
    def handle_charref(name)
      if name =~ /[0-9]+/
        unknown_charref(name)
      else
        handle_data(name)
      end
    end

    # Handles an entity reference; +name+ is the text between "&" and ";".
    # Names listed in Entitydefs are emitted as data, the rest go to
    # #unknown_entityref.
    def handle_entityref(name)
      if Entitydefs.include?(name)
        handle_data(Entitydefs[name])
      else
        unknown_entityref(name)
      end
    end

    # Hook: receives character data. No-op by default.
    def handle_data(data)
    end

    # Hook: receives the body of a comment. No-op by default.
    def handle_comment(data)
    end

    # Hook: receives the body of a "<!...>" declaration. No-op by default.
    def handle_special(data)
    end

    # Hook: called for a start tag with no start_/do_ handler.
    def unknown_starttag(tag, attrs)
    end

    # Hook: called for an end tag with no matching handler.
    def unknown_endtag(tag)
    end

    # Hook: called by #handle_charref for numeric character references.
    def unknown_charref(ref)
    end

    # Hook: called for entity names missing from Entitydefs.
    def unknown_entityref(ref)
    end
  end
end
|