# DOCTYPE support for REXML: the DocType node and the declaration nodes
# (ATTLIST, ELEMENT, ENTITY, NOTATION) found in its internal subset.
require "rexml/parent"
require "rexml/parseexception"
module REXML
##
# Represents an XML DOCTYPE declaration; that is, the contents of <!DOCTYPE
# ... >. DOCTYPES can be used to declare the DTD of a document, as well as
# being used to declare entities used in the document.
class DocType < Parent
  START = "<!DOCTYPE"
  START_RE = /\A\s*#{START}\s/um
  STOP = ">"
  STOP_RE = />/u
  SYSTEM = "SYSTEM"
  PUBLIC = "PUBLIC"
  OPEN_RE = /\A\s*\[/u
  # Group 1: the text between "<!DOCTYPE" and the first "[" or ">".
  # Group 2: that terminating character ("[" opens an internal subset).
  PATTERN_RE = /\s*#{START}\s+(.*?)(\[|>)/um

  ## name is the name of the doctype
  # external_id is the referenced DTD, if given
  attr_reader :name, :external_id

  ##
  # Constructor
  # @param first can be multiple types.
  #   String:  used as the name; +parent+ is then stored as external_id.
  #   DocType: name and external_id are copied from it (children are not).
  #   Source:  the DOCTYPE declaration is consumed from the source and, if
  #            it opens an internal subset, the declarations in it are
  #            added as children.
  # @param parent the parent of this object, except when +first+ is a
  #   String, in which case it is the external_id (or nil).
  # @raise ParseException if a Source holds no complete DOCTYPE
  #   declaration or the declaration has no name.
  # @raise RuntimeError if identifier literals appear without a
  #   SYSTEM/PUBLIC keyword.
  def initialize(first, parent = nil)
    if first.kind_of? String
      super()
      @name = first
      @external_id = parent
    elsif first.kind_of? DocType
      super(parent)
      @name = first.name
      @external_id = first.external_id
    elsif first.kind_of? Source
      super(parent)
      md = first.match(PATTERN_RE, true)
      # Without this guard a missing or unterminated DOCTYPE surfaced as a
      # NoMethodError on nil.
      if md.nil?
        raise ParseException.new("DOCTYPE declaration not found or not terminated", first)
      end
      identity = md[1]
      close = md[2]

      parts = /^([!\*\w]+)(\s+\w+)?(\s+["'].*?['"])?(\s+['"].*?["'])?/u.match(identity)
      raise ParseException.new("DOCTYPE is missing a name", first) if parts.nil?
      @name = parts[1]
      @pub_sys = parts[2] && parts[2].strip
      @long_name = parts[3] && parts[3].strip
      @uri = parts[4] && parts[4].strip

      # Only the two known keywords start an external id; anything else is
      # ignored ("done, or junk"). dup keeps @pub_sys untouched by the
      # appends below.
      @external_id = [SYSTEM, PUBLIC].include?(@pub_sys) ? @pub_sys.dup : nil

      # Literals with no SYSTEM/PUBLIC keyword in front mean the doctype
      # was malformed.
      if (@long_name || @uri) && @external_id.nil?
        raise "malformed DOCTYPE declaration #{parts[0]}"
      end
      @external_id << " #@long_name" if @long_name
      @external_id << " #@uri" if @uri

      parse_entities(first) unless close == ">"
    end
  end

  # Returns a new DocType carrying this one's name and external_id. The
  # copy is built through the DocType branch of the constructor, which
  # does not copy children.
  def clone
    DocType.new(self)
  end

  # Appends this declaration, and any child declarations inside "[ ... ]",
  # to +output+ using <<.
  # @param output the object written to; only << is called on it here
  # @param indent indentation level passed to the indent helper (not
  #   defined in this file; presumably inherited); children are written
  #   with indent + 2
  def write(output, indent = 0)
    indent(output, indent)
    output << START
    output << ' '
    output << @name
    output << " #@external_id" unless @external_id.nil?
    unless @children.empty?
      next_indent = indent + 2
      output << ' ['
      @children.each do |child|
        output << "\n"
        child.write(output, next_indent)
      end
      output << "\n"
      output << "]"
    end
    output << STOP
  end

  # Stream-parses a DOCTYPE: reports it through listener.doctype(name,
  # pub_sys, long_name, uri) and, when an internal subset follows, hands
  # each entry in it to the listener as well.
  # @raise ParseException if the source holds no complete DOCTYPE
  # @raise RuntimeError if the declaration has no name
  def DocType.parse_stream(source, listener)
    md = source.match(PATTERN_RE, true)
    if md.nil?
      raise ParseException.new("DOCTYPE declaration not found or not terminated", source)
    end
    identity = md[1]
    close = md[2]

    parts = /^(\w+)(\s+\w+)?(\s+["'].*?['"])?(\s+['"].*?["'])?/u.match(identity)
    raise "DOCTYPE is missing a name" if parts.nil?
    pub_sys = parts[2] && parts[2].strip
    long_name = parts[3] && parts[3].strip
    uri = parts[4] && parts[4].strip
    listener.doctype parts[1], pub_sys, long_name, uri

    return if close == ">"
    parse_entities_source source, listener
  end

  # NOTE: the two class methods below are internal helpers, but they stay
  # public: a bare `private` does not apply to `def DocType.x` methods, and
  # they are invoked with an explicit DocType receiver.

  # Walks the internal subset of a DOCTYPE. For each entry up to the
  # closing "]" it yields either the name of a parameter-entity reference
  # (a String, from "%name;") or the class that should consume the entry
  # from +source+. The closing "]>" is consumed before returning.
  # @raise ParseException if the subset is not terminated
  # @raise RuntimeError on an entry of unknown kind
  def DocType.parser(source)
    # Peek at the next entry without consuming it; the yielded class (or
    # the reference branch below) is responsible for consuming.
    peek = lambda do
      entry = source.match(/\s*(.*?)>/um)
      if entry.nil?
        raise ParseException.new("Invalid end of DOCTYPE declaration \"#{source.buffer}\"", source)
      end
      entry
    end

    md = peek.call
    until md[1].strip == "]"
      case md[1]
      when /^%/ #/u
        ref = source.match(/^\s*%(.*?);/um, true)
        if ref.nil?
          raise ParseException.new("Unterminated parameter entity reference in DOCTYPE", source)
        end
        yield ref[1]
      when AttlistDecl::START_RE
        yield AttlistDecl
      when ElementDecl::START_RE
        yield ElementDecl
      when EntityDecl::START_RE
        yield EntityDecl
      when NotationDecl::START_RE
        yield NotationDecl
      when Comment::START_RE
        yield Comment
      when Instruction::START_RE
        yield Instruction
      else
        raise "illegal entry \"#{md[1]}\" in DOCTYPE\n(match data was '#{md[0]}'"
      end
      md = peek.call
    end
    # Errors raised above propagate unchanged, as before: the former
    # rescue clauses only re-raised them.
    source.match(/\s*]\s*>/um, true)
  end

  # Stream counterpart of parse_entities: parameter-entity references go
  # to listener.entity, every other entry to the yielded class's
  # parse_source.
  def DocType.parse_entities_source(source, listener)
    DocType.parser(source) do |arg|
      if arg.kind_of? String
        listener.entity arg
      else
        arg.parse_source source, listener
      end
    end
  end

  private

  # Reads the internal subset from +src+, adding one child node per
  # declaration, comment or instruction found.
  def parse_entities(src)
    DocType.parser(src) do |arg|
      if arg.kind_of? String
        add_entity_sub arg
      else
        self.add(arg.new(src))
      end
    end
  end

  # Hook for a parameter-entity reference ("%name;") found in the internal
  # subset. Intentionally empty: such references are currently dropped.
  def add_entity_sub(ent)
  end
end
# We don't really handle any of these since we're not a validating
# parser, so we can be pretty dumb about them. All we need to be able
# to do is spew them back out on a write()
class Declaration < Child
  # Consumes one declaration from +src+ using the subclass's pattern and
  # keeps the text of capture group 1 for later output.
  # @raise ParseException if +src+ does not hold a matching declaration
  def initialize(src)
    super()
    md = src.match(pattern, true)
    raise ParseException.new("malformed declaration in DOCTYPE", src) if md.nil?
    @string = md[1]
  end

  # The captured declaration text.
  def to_s
    @string
  end

  # Appends the declaration to +output+, preceded by +indent+ spaces.
  def write(output, indent)
    output << (' ' * indent) if indent > 0
    output << @string
  end

  # Stream variant: consumes one declaration from +source+ and passes its
  # text to the listener method named after the receiving class
  # (AttlistDecl -> attlistdecl, ElementDecl -> elementdecl).
  #
  # Fixes over the previous version, which could not run:
  # * it read an undefined local `src` instead of +source+;
  # * it called the instance method `pattern` from class context; the
  #   subclass's PATTERN_RE constant is used instead;
  # * it sent the module-qualified inspect string ("rexml::attlistdecl").
  #   NOTE(review): the unqualified lower-case name is assumed to be the
  #   intended listener callback - confirm against the listener interface.
  def Declaration.parse_source(source, listener)
    md = source.match(self::PATTERN_RE, true)
    raise ParseException.new("malformed declaration in DOCTYPE", source) if md.nil?
    listener.send name.split("::").last.downcase, md[1]
  end
end
# An <!ATTLIST ...> declaration inside a DOCTYPE.
class AttlistDecl < Declaration
  START = "<!ATTLIST"
  # Recognises the start of an ATTLIST declaration.
  START_RE = /^\s*#{START}/um
  # Group 1 is the whole declaration, closing ">" included.
  PATTERN_RE = /\s*(#{START}.*?>)/um

  # The expression used to consume this kind of declaration.
  def pattern() PATTERN_RE end
end
# An <!ELEMENT ...> declaration inside a DOCTYPE.
class ElementDecl < Declaration
  START = "<!ELEMENT"
  # Recognises the start of an ELEMENT declaration.
  START_RE = /^\s*#{START}/um
  # Group 1 is the whole declaration. The closing ">" is kept inside the
  # group: the captured text is what gets written back out, and nothing
  # else appends a terminator, so leaving it outside produced an
  # unterminated "<!ELEMENT ..." on output.
  PATTERN_RE = /^\s*(#{START}.*?>)/um

  # The expression used to consume this kind of declaration.
  def pattern
    PATTERN_RE
  end
end
# An <!ENTITY ...> declaration inside a DOCTYPE. Recognised forms:
#   <!ENTITY name PUBLIC "..." "...">
#   <!ENTITY name SYSTEM "...">
#   <!ENTITY name "...">
#   <!ENTITY % name "...">
class EntityDecl < Child
  START = "<!ENTITY"
  START_RE = /^\s*#{START}/um
  PUBLIC = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+PUBLIC\s+((["']).*?\3)\s+((["']).*?\5)\s*>/um
  SYSTEM = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+SYSTEM\s+((["']).*?\3)(?:\s+NDATA\s+\w+)?\s*>/um
  PLAIN = /^\s*#{START}\s+(\w+)\s+((["']).*?\3)\s*>/um
  PERCENT = /^\s*#{START}\s+%\s+(\w+)\s+((["']).*?\3)\s*>/um
  # Tells whether matched declaration text carries the "%" marker.
  PARAMETER_RE = /\A\s*#{START}\s+%\s/u

  # Consumes one entity declaration from +src+.
  # @raise ParseException if none of the recognised forms matches
  def initialize(src)
    super()
    # Peek with each form in turn, then consume with the one that fits.
    form = [PUBLIC, SYSTEM, PLAIN, PERCENT].find { |re| src.match(re) }
    raise ParseException.new("failed Entity match", src) if form.nil?
    md = src.match(form, true)

    @name = md[1]
    if form.equal?(PUBLIC)
      @middle = "PUBLIC"
      @content = "#{md[2]} #{md[4]}"
    elsif form.equal?(SYSTEM)
      @middle = "SYSTEM"
      # NOTE(review): an optional NDATA clause is matched by SYSTEM but not
      # captured, so it is not reproduced by to_s.
      @content = md[2]
    else
      @middle = ""
      @content = md[2]
    end
    # The patterns do not capture the "%" marker, so recover it from the
    # matched text; otherwise a parameter entity would be written back as
    # a general entity.
    @parameter = (md[0] =~ PARAMETER_RE) ? true : false
  end

  # The declaration as markup, including the closing ">" (previously
  # omitted, which left the written declaration unterminated).
  def to_s
    rv = "<!ENTITY "
    rv << "% " if @parameter
    rv << "#@name "
    rv << "#@middle " if @middle.size > 0
    rv << @content
    rv << ">"
    rv
  end

  # Appends the declaration to +output+, preceded by +indent+ spaces.
  def write(output, indent)
    output << (' ' * indent) if indent > 0
    output << to_s
  end

  # Stream variant: consumes one entity declaration from +source+ and
  # passes its text (runs of repeated whitespace squeezed) to
  # listener.entitydecl.
  #
  # The previous version referenced PATTERN_RE, which this class does not
  # define, and sent the module-qualified inspect string as the method
  # name. NOTE(review): "entitydecl" is assumed to be the intended
  # listener callback - confirm against the listener interface.
  # @raise ParseException if none of the recognised forms matches
  def EntityDecl.parse_source(source, listener)
    form = [PUBLIC, SYSTEM, PLAIN, PERCENT].find { |re| source.match(re) }
    raise ParseException.new("failed Entity match", source) if form.nil?
    md = source.match(form, true)
    thing = md[0].squeeze " \t\n\r"
    listener.send :entitydecl, thing
  end
end
# A <!NOTATION name PUBLIC|SYSTEM "..."> declaration inside a DOCTYPE.
class NotationDecl < Child
  START = "<!NOTATION"
  START_RE = /^\s*#{START}/um
  #PATTERN_RE = /^\s*(#{START}.*?>)/um
  # Groups: 1 = name, 2 = keyword, 3 = quoted literal (quotes included).
  PUBLIC = /^\s*#{START}\s+(\w[\w-]*)\s+(PUBLIC)\s+((["']).*?\4)\s*>/um
  SYSTEM = /^\s*#{START}\s+(\w[\w-]*)\s+(SYSTEM)\s+((["']).*?\4)\s*>/um

  # Consumes one notation declaration from +src+.
  # @raise ParseException if neither the PUBLIC nor the SYSTEM form matches
  def initialize(src)
    super()
    # Peek with each form in turn, then consume with the one that fits.
    form = [PUBLIC, SYSTEM].find { |re| src.match(re) }
    if form.nil?
      raise ParseException.new( "error parsing notation: no matching pattern", src )
    end
    md = src.match(form, true)
    @name = md[1]
    @middle = md[2]
    @rest = md[3]
  end

  # The declaration as markup.
  def to_s
    "<!NOTATION #@name #@middle #@rest>"
  end

  # Appends the declaration to +output+, preceded by +indent+ spaces.
  def write(output, indent)
    output << (' ' * indent) if indent > 0
    output << to_s
  end

  # Stream variant: consumes one notation declaration from +source+ and
  # passes its text (runs of repeated whitespace squeezed) to
  # listener.notationdecl.
  #
  # The previous version referenced PATTERN_RE, which is commented out in
  # this class, and sent the module-qualified inspect string as the method
  # name. NOTE(review): "notationdecl" is assumed to be the intended
  # listener callback - confirm against the listener interface.
  # @raise ParseException if neither form matches
  def NotationDecl.parse_source(source, listener)
    form = [PUBLIC, SYSTEM].find { |re| source.match(re) }
    if form.nil?
      raise ParseException.new( "error parsing notation: no matching pattern", source )
    end
    md = source.match(form, true)
    thing = md[0].squeeze " \t\n\r"
    listener.send :notationdecl, thing
  end
end
end
|