File: doctype.rb

package info (click to toggle)
librexml-ruby 1.2.5-1
  • links: PTS
  • area: main
  • in suites: woody
  • size: 792 kB
  • ctags: 655
  • sloc: ruby: 3,778; xml: 1,609; java: 109; makefile: 43
file content (317 lines) | stat: -rw-r--r-- 7,847 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
require "rexml/parent"
require "rexml/parseexception"

module REXML
	##
	# Represents an XML DOCTYPE declaration; that is, the contents of <!DOCTYPE
	# ... >.  DOCTYPES can be used to declare the DTD of a document, as well as
	# being used to declare entities used in the document.
	class DocType < Parent
		START = "<!DOCTYPE"
		START_RE = /\A\s*#{START}\s/um
		STOP = ">"
		STOP_RE = />/u
		SYSTEM = "SYSTEM"
		PUBLIC = "PUBLIC"
		OPEN_RE = /\A\s*\[/u
		# Group 1 is everything between "<!DOCTYPE" and the first "[" or ">";
		# group 2 is that terminating character.
		PATTERN_RE = /\s*#{START}\s+(.*?)(\[|>)/um

		## name is the name of the doctype
		# external_id is the referenced DTD, if given
		attr_reader :name, :external_id

		##
		# Constructor
		# @param first can be multiple types.  If String, it becomes the name
		# and +parent+ is stored as the external_id.  If DocType, its name and
		# external_id are copied.  If Source, the DOCTYPE declaration is
		# consumed from the source and parsed, including any internal subset.
		# Any other type leaves the object uninitialised.
		# @param parent If first is a String, this value is stored as the
		# external_id.  Otherwise it is handed to the superclass constructor.
		# @raise ParseException if the source holds no complete DOCTYPE
		# declaration, or the declaration has no name
		# @raise RuntimeError if quoted literals are present without a
		# preceding SYSTEM or PUBLIC keyword
		def initialize( first, parent=nil )
			if first.kind_of? String
				super()
				@name = first
				@external_id = parent
			elsif first.kind_of? DocType
				super(parent)
				@name = first.name
				@external_id = first.external_id
			elsif first.kind_of? Source
				super(parent)
				md = first.match( PATTERN_RE, true )
				# Without this guard a truncated declaration fails with a
				# NoMethodError on nil instead of a parse error.
				raise ParseException.new("malformed DOCTYPE declaration", first) if md.nil?
				identity = md[1]
				close = md[2]

				# name, optional keyword, then up to two quoted literals
				id_md = /^([!\*\w]+)(\s+\w+)?(\s+["'].*?['"])?(\s+['"].*?["'])?/u.match( identity )
				raise ParseException.new("DOCTYPE is missing a name", first) if id_md.nil?
				@name = id_md[1]

				@pub_sys = id_md[2] && id_md[2].strip
				@long_name = id_md[3] && id_md[3].strip
				@uri = id_md[4] && id_md[4].strip

				# Only SYSTEM and PUBLIC start an external id; any other keyword
				# is ignored.  dup keeps the appends below off @pub_sys.
				@external_id = nil
				if @pub_sys == SYSTEM || @pub_sys == PUBLIC
					@external_id = @pub_sys.dup
				end

				if @long_name || @uri
					# Quoted literals with no SYSTEM/PUBLIC keyword to attach to
					raise "malformed DOCTYPE declaration #{id_md[0]}" if @external_id.nil?
					@external_id << " #@long_name" if @long_name
					@external_id << " #@uri" if @uri
				end

				# A "[" terminator means an internal subset follows
				parse_entities first unless close == ">"
			end
		end

		##
		# Builds a new DocType from this one through the copy branch of the
		# constructor, which copies only name and external_id.
		def clone
			DocType.new self
		end

		##
		# Writes the declaration to +output+ using <<.
		# @param output the object to append to
		# @param indent the indentation level handed to the inherited indent
		# helper; children are written at indent + 2
		def write( output, indent=0 )
			indent( output, indent )
			output << START
			output << ' '
			output << @name
			output << " #@external_id" unless @external_id.nil?
			unless @children.empty?
				child_indent = indent + 2
				output << ' ['
				@children.each { |child|
					output << "\n"
					child.write( output, child_indent )
				}
				output << "\n"
				output << "]"
			end
			output << STOP
		end

		##
		# Stream variant of the constructor: consumes the DOCTYPE declaration
		# from +source+ and reports it through listener.doctype, then reports
		# any internal subset through parse_entities_source.
		# @raise ParseException if the source holds no complete declaration
		# @raise RuntimeError if the declaration has no name
		def DocType.parse_stream source, listener
			md = source.match( PATTERN_RE, true )
			raise ParseException.new("malformed DOCTYPE declaration", source) if md.nil?
			identity = md[1]
			close = md[2]

			id_md = /^(\w+)(\s+\w+)?(\s+["'].*?['"])?(\s+['"].*?["'])?/u.match( identity )
			raise "DOCTYPE is missing a name" if id_md.nil?

			name = id_md[1]
			pub_sys = id_md[2] && id_md[2].strip
			long_name = id_md[3] && id_md[3].strip
			uri = id_md[4] && id_md[4].strip

			listener.doctype name, pub_sys, long_name, uri
			return if close == ">"
			parse_entities_source source, listener
		end

		##
		# Walks the internal subset of a DOCTYPE.  Each entry is peeked at
		# without being consumed, then handed to the block: parameter entity
		# references are consumed here and yielded as a String (the text
		# between "%" and ";"); every other entry is yielded as the
		# declaration class that is expected to consume it from +source+.
		# Finally the closing "]>" is consumed.
		# @raise ParseException if the subset ends before "]>" or a parameter
		# entity reference is not terminated
		# @raise RuntimeError on an entry that matches no known declaration
		def DocType.parser source
			md = source.match(/\s*(.*?)>/um)
			raise ParseException.new( "Invalid end of DOCTYPE declaration \"#{source.buffer}\"", source ) if md.nil?
			until md[1].strip == "]"
				case md[1]
				when /^%/
					md = source.match(/^\s*%(.*?);/um, true)
					raise ParseException.new( "Invalid parameter entity reference in DOCTYPE", source ) if md.nil?
					yield md[1]
				when AttlistDecl::START_RE
					yield AttlistDecl
				when ElementDecl::START_RE
					yield ElementDecl
				when EntityDecl::START_RE
					yield EntityDecl
				when NotationDecl::START_RE
					yield NotationDecl
				when Comment::START_RE
					yield Comment
				when Instruction::START_RE
					yield Instruction
				else
					raise "illegal entry \"#{md[1]}\" in DOCTYPE\n(match data was '#{md[0]}'"
				end
				md = source.match(/\s*(.*?)>/um)
				raise ParseException.new( "Invalid end of DOCTYPE declaration \"#{source.buffer}\"", source ) if md.nil?
			end
			source.match(/\s*\]\s*>/um, true)
		end

		##
		# Stream handling of the internal subset: parameter entity references
		# go to listener.entity, every declaration class is asked to parse
		# itself from the source and report to the listener.
		def DocType.parse_entities_source source, listener
			DocType.parser source do |arg|
				if arg.kind_of? String
					listener.entity arg
				else
					arg.parse_source source, listener
				end
			end
		end

		# "private" does not apply to the "def DocType.xxx" methods above, so
		# it is placed here, in front of the instance methods it does affect.
		private

		##
		# Tree handling of the internal subset: every declaration found in
		# +src+ is instantiated and added as a child of this DocType.
		def parse_entities src
			DocType.parser src do |arg|
				if arg.kind_of? String
					add_entity_sub arg
				else
					self.add( arg.new(src) )
				end
			end
		end

		##
		# Called for each parameter entity reference met in the internal
		# subset.  Intentionally empty: such references are discarded.
		def add_entity_sub ent
		end
	end

	# We don't really handle any of these since we're not a validating
	# parser, so we can be pretty dumb about them.  All we need to be able
	# to do is spew them back out on a write()

	##
	# Base class for declarations that are kept as one opaque string.
	# Subclasses must supply a PATTERN_RE constant whose first group captures
	# the text to keep, and a +pattern+ instance method returning it.
	class Declaration < Child
		##
		# Consumes one declaration from +src+ and keeps the captured text.
		# @param src the source to read from; it is advanced past the match
		# @raise ParseException if the source does not match +pattern+
		def initialize src
			super()
			md = src.match( pattern, true )
			raise ParseException.new( "failed declaration match", src ) if md.nil?
			@string = md[1]
		end

		# Returns the declaration text as captured from the source
		def to_s
			@string
		end

		##
		# Appends the declaration to +output+ using <<.
		# @param indent indentation level; each level is three spaces
		def write( output, indent )
			output << ('   '*indent) if indent > 0
			output << @string
		end

		##
		# Stream variant of the constructor: consumes one declaration from
		# +source+ and passes the captured text to the listener.
		#
		# The listener method is the lower-cased class name without its
		# module prefix (for example "attlistdecl").
		# NOTE(review): assumes the listener implements methods with these
		# names -- confirm against the listener interface.
		# @raise ParseException if the source does not match PATTERN_RE
		def Declaration.parse_source source, listener
			# self::PATTERN_RE resolves on the subclass this is called on;
			# the +pattern+ instance method is not reachable from here.
			md = source.match( self::PATTERN_RE, true )
			raise ParseException.new( "failed declaration match", source ) if md.nil?
			listener.send name.split("::").last.downcase, md[1]
		end
	end
	
	class AttlistDecl < Declaration
		# Literal text that opens an attribute-list declaration
		START = "<!ATTLIST"
		# Recognises the opening text at the start of a line, after optional
		# whitespace
		START_RE = /^\s*#{START}/um
		# Group 1 holds the declaration up to and including the first ">"
		PATTERN_RE = /\s*(#{START}.*?>)/um

		##
		# Returns the regular expression the Declaration constructor matches
		# against the source.
		def pattern
			return PATTERN_RE
		end
	end

	class ElementDecl < Declaration
		# Literal text that opens an element declaration
		START = "<!ELEMENT"
		# Recognises the opening text at the start of a line, after optional
		# whitespace
		START_RE = /^\s*#{START}/um
		# Group 1 holds the declaration up to and including the first ">".
		# The ">" has to be inside the group: Declaration stores group 1 and
		# writes it back verbatim, so leaving it out emits an unterminated
		# declaration.  This matches AttlistDecl::PATTERN_RE.
		PATTERN_RE = /^\s*(#{START}.*?>)/um

		##
		# Returns the regular expression the Declaration constructor matches
		# against the source.
		def pattern
			PATTERN_RE
		end
	end

	##
	# An <!ENTITY ...> declaration.  The four forms handled are:
	#   <!ENTITY [% ]name PUBLIC "..." "...">
	#   <!ENTITY [% ]name SYSTEM "..."[ NDATA name]>
	#   <!ENTITY name "...">
	#   <!ENTITY % name "...">
	# The optional NDATA part of a SYSTEM entity is matched by a
	# non-capturing group, so it is not kept and not written back.
	class EntityDecl < Child
		START = "<!ENTITY"
		START_RE = /^\s*#{START}/um
		# Whole declaration up to the first ">"; used by parse_source
		PATTERN_RE = /^\s*(#{START}.*?>)/um
		PUBLIC = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+PUBLIC\s+((["']).*?\3)\s+((["']).*?\5)\s*>/um
		SYSTEM = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+SYSTEM\s+((["']).*?\3)(?:\s+NDATA\s+\w+)?\s*>/um
		PLAIN = /^\s*#{START}\s+(\w+)\s+((["']).*?\3)\s*>/um
		PERCENT = /^\s*#{START}\s+%\s+(\w+)\s+((["']).*?\3)\s*>/um
		# Applied to the matched text to tell whether a "%" follows the keyword
		PEDECL_RE = /\A\s*#{START}\s+%\s/um

		##
		# Consumes one entity declaration from +src+.
		# @param src the source to read from; it is advanced past the match
		# @raise ParseException if none of the four forms matches
		def initialize src
			super()
			md = nil
			# Each form is tested first and only then matched with consume=true
			if src.match( PUBLIC )
				md = src.match( PUBLIC, true )
				@middle = "PUBLIC"
				@content = "#{md[2]} #{md[4]}"
			elsif src.match( SYSTEM )
				md = src.match( SYSTEM, true )
				@middle = "SYSTEM"
				@content = md[2]
			elsif src.match( PLAIN )
				md = src.match( PLAIN, true )
				@middle = ""
				@content = md[2]
			elsif src.match( PERCENT )
				md = src.match( PERCENT, true )
				@middle = ""
				@content = md[2]
			end
			raise ParseException.new("failed Entity match", src) if md.nil?
			@name = md[1]
			# Remember the "%" so that to_s does not turn a parameter entity
			# into a general entity.
			@reference = !PEDECL_RE.match( md[0] ).nil?
		end

		##
		# Returns the declaration as markup, including the "%" of a parameter
		# entity and the closing ">".
		def to_s
			rv = "<!ENTITY "
			rv << "% " if @reference
			rv << "#@name "
			rv << "#@middle " if @middle.size > 0
			rv << @content
			rv << ">"
			rv
		end

		##
		# Appends the declaration to +output+ using <<.
		# @param indent indentation level; each level is three spaces
		def write( output, indent )
			output << ('   '*indent) if indent > 0
			output << to_s
		end

		##
		# Stream variant of the constructor: consumes text up to the first
		# ">" from +source+, squeezes repeated whitespace characters, and
		# passes the result to the listener.
		#
		# The listener method is the lower-cased class name without its
		# module prefix ("entitydecl").
		# NOTE(review): assumes the listener implements a method with this
		# name -- confirm against the listener interface.
		# @raise ParseException if the source does not match PATTERN_RE
		def EntityDecl.parse_source source, listener
			md = source.match( PATTERN_RE, true )
			raise ParseException.new("failed Entity match", source) if md.nil?
			thing = md[0].squeeze " \t\n\r"
			listener.send name.split("::").last.downcase, thing
		end
	end

	##
	# A <!NOTATION ...> declaration with one quoted literal after the PUBLIC
	# or SYSTEM keyword.
	class NotationDecl < Child
		START = "<!NOTATION"
		START_RE = /^\s*#{START}/um
		# Whole declaration up to the first ">"; used by parse_source
		PATTERN_RE = /^\s*(#{START}.*?>)/um
		PUBLIC = /^\s*#{START}\s+(\w[\w-]*)\s+(PUBLIC)\s+((["']).*?\4)\s*>/um
		SYSTEM = /^\s*#{START}\s+(\w[\w-]*)\s+(SYSTEM)\s+((["']).*?\4)\s*>/um

		##
		# Consumes one notation declaration from +src+.
		# @param src the source to read from; it is advanced past the match
		# @raise ParseException if neither form matches
		def initialize src
			super()
			# Each form is tested first and only then matched with consume=true
			if src.match( PUBLIC )
				md = src.match( PUBLIC, true )
			elsif src.match( SYSTEM )
				md = src.match( SYSTEM, true )
			else
				raise ParseException.new( "error parsing notation: no matching pattern", src )
			end
			@name = md[1]
			@middle = md[2]
			@rest = md[3]
		end

		# Returns the declaration as markup
		def to_s
			"<!NOTATION #@name #@middle #@rest>"
		end

		##
		# Appends the declaration to +output+ using <<.
		# @param indent indentation level; each level is three spaces
		def write( output, indent )
			output << ('   '*indent) if indent > 0
			output << to_s
		end

		##
		# Stream variant of the constructor: consumes text up to the first
		# ">" from +source+, squeezes repeated whitespace characters, and
		# passes the result to the listener.
		#
		# The listener method is the lower-cased class name without its
		# module prefix ("notationdecl").
		# NOTE(review): assumes the listener implements a method with this
		# name -- confirm against the listener interface.
		# @raise ParseException if the source does not match PATTERN_RE
		def NotationDecl.parse_source source, listener
			md = source.match( PATTERN_RE, true )
			raise ParseException.new( "error parsing notation: no matching pattern", source ) if md.nil?
			thing = md[0].squeeze " \t\n\r"
			listener.send name.split("::").last.downcase, thing
		end
	end
end