1 require 'rexml/parseexception'
2 require 'rexml/undefinednamespaceexception'
3 require 'rexml/source'
4 require 'set'
5
6 module REXML
7 module Parsers
8 # = Using the Pull Parser
9 # <em>This API is experimental, and subject to change.</em>
10 # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
11 # while parser.has_next?
12 # res = parser.next
13 # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
14 # end
15 # See the PullEvent class for information on the content of the results.
16 # The data is identical to the arguments passed for the various events to
17 # the StreamListener API.
18 #
19 # Notice that:
20 # parser = PullParser.new( "<a>BAD DOCUMENT" )
21 # while parser.has_next?
22 # res = parser.next
23 # raise res[1] if res.error?
24 # end
25 #
26 # Nat Price gave me some good ideas for the API.
27 class BaseParser
# ---------------------------------------------------------------------
# Name building blocks. These are plain strings that get interpolated
# into the regular expressions below.
# ---------------------------------------------------------------------
NCNAME_STR= '[\w:][\-\w\d.]*'
NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"

NAMECHAR = '[\-\w\d\.:]'
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
# Named entity reference, decimal character reference or hexadecimal
# character reference. Only the named form fills the capture group.
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/

# ---------------------------------------------------------------------
# Markup recognisers. The *_START patterns classify what is at the head
# of the input; the *_PATTERN patterns capture the construct's content.
# ---------------------------------------------------------------------
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A<!--/u
COMMENT_PATTERN = /<!--(.*?)-->/um
CDATA_START = /\A<!\[CDATA\[/u
CDATA_END = /^\s*\]\s*>/um
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
XMLDECL_START = /\A<\?xml\s/u
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
INSTRUCTION_START = /\A<\?/u
INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
# Groups: 1 = qualified name, 2 = prefix, 3 = local name,
# 4 = raw attribute text, 5 = quote character, 6 = "/" of an empty tag.
TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um

# Pseudo-attributes of the XML declaration.
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
# Whitespace after "=" is optional, as it is for VERSION and ENCODING.
# The previous pattern demanded exactly one whitespace character there,
# so standalone="yes" was never recognised.
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um

# ---------------------------------------------------------------------
# DOCTYPE internal subset declarations.
# ---------------------------------------------------------------------
ENTITY_START = /^\s*<!ENTITY/
IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
ELEMENTDECL_START = /^\s*<!ELEMENT/um
ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
ATTDEF_RE = /#{ATTDEF}/
ATTLISTDECL_START = /^\s*<!ATTLIST/um
ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
NOTATIONDECL_START = /^\s*<!NOTATION/um
PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um

# Character data: everything up to (not including) the next "<".
TEXT_PATTERN = /\A([^<]*)/um

# Entity constants
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
NDATADECL = "\\s+NDATA\\s+#{NAME}"
PEREFERENCE = "%#{NAME};"
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um

# An ampersand that does not start a named entity reference.
EREFERENCE = /&(?!#{NAME};)/

# Predefined entities. Each value is
#   [ pattern matching the escaped form, escaped form,
#     raw character, pattern matching the raw character ]
# normalize uses elements 3 and 1; unnormalize uses elements 0 and 2.
# The escaped forms must be the real entity references: with the raw
# character in every slot both conversions silently do nothing.
DEFAULT_ENTITIES = {
  'gt' => [/&gt;/, '&gt;', '>', />/],
  'lt' => [/&lt;/, '&lt;', '<', /</],
  'quot' => [/&quot;/, '&quot;', '"', /"/],
  "apos" => [/&apos;/, "&apos;", "'", /'/]
}


######################################################################
# These are patterns to identify common markup errors, to make the
# error messages more informative.
######################################################################
MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
108
# Builds a parser for +source+. All state is set up by #stream=, so the
# accepted argument types are whatever #stream= accepts.
def initialize( source )
  self.stream = source
end
112
# Registers +listener+ so that its +receive+ method is called with every
# event returned by #pull.
#
# The first registration wraps #pull on this one instance: the current
# implementation is kept under the name +_old_pull+ and a replacement is
# defined that forwards each event to all listeners before returning it.
# Later registrations only append to the listener list.
def add_listener( listener )
  unless defined?(@listeners) && @listeners
    @listeners = []
    instance_eval <<-EOL
      alias :_old_pull :pull
      def pull
        event = _old_pull
        @listeners.each do |listener|
          listener.receive event
        end
        event
      end
    EOL
  end
  @listeners << listener
end
129
130 attr_reader :source
131
# Points the parser at a new input and resets all parse state.
# +source+ is handed to SourceFactory.create_from; the open-tag list,
# the pushed-back event stack, the entity list and the namespace stack
# each start out as a fresh empty array.
def stream=( source )
  @source = SourceFactory.create_from( source )
  @closed = @document_status = nil
  @tags, @stack, @entities, @nsstack = [], [], [], []
end
141
# Reports the source's current position when the source object offers a
# +position+ method, and 0 otherwise.
def position
  # FIXME: 0 is only a placeholder for sources that cannot report a position.
  @source.respond_to?( :position ) ? @source.position : 0
end
150
# Returns true if there are no more events: the source has no input left
# and no pushed-back events are waiting on the stack.
def empty?
  @source.empty? && @stack.empty?
end
155
# Returns true if there are more events, i.e. the source still has input
# or at least one pushed-back event is waiting. Synonymous with !empty?
def has_next?
  !@source.empty? || !@stack.empty?
end
160
# Pushes +token+ back onto the head of the event stream, so it is the
# next event #pull hands out. Any number of events can be pushed back.
def unshift token
  @stack.insert( 0, token )
end
166
# Looks ahead at the event at position +depth+ without consuming it; the
# next event to be pulled is at depth 0.
#
# Events that have not been parsed yet are pulled and appended to the
# pushed-back stack, so peeking deep pre-parses (and buffers) that much of
# the document. A +depth+ of -1 pulls until the parser reports empty? and
# returns the last event then held on the stack.
#
# Raises a RuntimeError for any +depth+ below -1.
def peek depth=0
  raise %Q[Illegal argument "#{depth}"] if depth < -1
  pulled = []
  if depth == -1
    pulled << pull until empty?
  else
    wanted = depth + 1
    pulled << pull while @stack.size + pulled.size < wanted
  end
  @stack += pulled unless pulled.empty?
  @stack[depth]
end
186
# Returns the next event as an Array whose first element names the event:
#
#   [ :xmldecl, version, encoding, standalone ]
#   [ :start_doctype, name, pub_sys, long_name, uri ]
#   [ :externalentity, text ], [ :elementdecl, text ]
#   [ :entitydecl, name, ... ], [ :attlistdecl, element, pairs, contents ]
#   [ :notationdecl, name, PUBLIC|SYSTEM, public_id, system_id ]
#   [ :end_doctype ]
#   [ :start_element, name, attributes ], [ :end_element, name ]
#   [ :text, raw_text ], [ :cdata, text ], [ :comment, text ]
#   [ :processing_instruction, target, content ]
#   [ :end_document ]
#
# Pushed-back events (see #unshift and #peek) are returned before any new
# input is parsed. An empty-element tag produces its :start_element on
# this call and the matching :end_element on the next one.
#
# Raises REXML::ParseException for malformed input and
# REXML::UndefinedNamespaceException for an undeclared prefix. Any other
# StandardError raised while parsing document content is wrapped in a
# REXML::ParseException.
def pull
  # Second half of an empty-element tag.
  if @closed
    x, @closed = @closed, nil
    return [ :end_element, x ]
  end
  return [ :end_document ] if empty?
  return @stack.shift if @stack.size > 0
  @source.read if @source.buffer.size<2

  # --- Prolog: nothing but whitespace/declarations seen so far ---------
  if @document_status == nil
    word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
    word = word[1] unless word.nil?
    case word
    when COMMENT_START
      return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
    when XMLDECL_START
      results = @source.match( XMLDECL_PATTERN, true )[1]
      version = VERSION.match( results )
      version = version[1] unless version.nil?
      encoding = ENCODING.match(results)
      encoding = encoding[1] unless encoding.nil?
      @source.encoding = encoding
      standalone = STANDALONE.match(results)
      standalone = standalone[1] unless standalone.nil?
      return [ :xmldecl, version, encoding, standalone ]
    when INSTRUCTION_START
      return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
    when DOCTYPE_START
      md = @source.match( DOCTYPE_PATTERN, true )
      # Namespace prefixes declared through ATTLIST defaults land here.
      @nsstack.unshift(Set.new)
      identity = md[1]
      close = md[2]
      identity =~ IDENTITY
      name = $1
      raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
      pub_sys = $2.nil? ? nil : $2.strip
      long_name = $4.nil? ? nil : $4.strip
      uri = $6.nil? ? nil : $6.strip
      args = [ :start_doctype, name, pub_sys, long_name, uri ]
      if close == ">"
        # No internal subset: queue the matching :end_doctype right away.
        @document_status = :after_doctype
        @source.read if @source.buffer.size<2
        @source.match(/^\s*/um, true)
        @stack << [ :end_doctype ]
      else
        @document_status = :in_doctype
      end
      return args
    when /^\s+/
      # Leading whitespace: stay in the prolog; it is reported as text below.
    else
      @document_status = :after_doctype
      @source.read if @source.buffer.size<2
      @source.match(/\s*/um, true)
    end
  end

  # --- DOCTYPE internal subset -----------------------------------------
  if @document_status == :in_doctype
    md = @source.match(/\s*(.*?>)/um)
    # No ">" anywhere in the remaining input: report it as a parse error
    # instead of failing on nil below.
    raise REXML::ParseException.new( "Malformed DOCTYPE: unterminated declaration", @source ) if md.nil?
    case md[1]
    when SYSTEMENTITY
      match = @source.match( SYSTEMENTITY, true )[1]
      return [ :externalentity, match ]

    when ELEMENTDECL_START
      return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

    when ENTITY_START
      entity_md = @source.match( ENTITYDECL, true )
      raise REXML::ParseException.new( "Malformed entity declaration", @source ) if entity_md.nil?
      match = entity_md.to_a.compact
      match[0] = :entitydecl
      ref = false
      if match[1] == '%'
        # Parameter entity: drop the marker here, re-append it at the end.
        ref = true
        match.delete_at 1
      end
      # Now we have to sort out what kind of entity reference this is
      if match[2] == 'SYSTEM'
        # External reference
        match[3] = match[3][1..-2] # strip the surrounding quotes
        match.delete_at(4) if match.size > 4 # Chop out NDATA decl
        # match is [ :entitydecl, name, SYSTEM, literal(, ndata)? ]
      elsif match[2] == 'PUBLIC'
        # External reference
        match[3] = match[3][1..-2] # PUBID
        match[4] = match[4][1..-2] # HREF
        # match is [ :entitydecl, name, PUBLIC, pubid, href ]
      else
        match[2] = match[2][1..-2]
        match.pop if match.size == 4
        # match is [ :entitydecl, name, value ]
      end
      match << '%' if ref
      return match
    when ATTLISTDECL_START
      md = @source.match( ATTLISTDECL_PATTERN, true )
      raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
      element = md[1]
      contents = md[0]

      pairs = {}
      values = md[0].scan( ATTDEF_RE )
      values.each do |attdef|
        unless attdef[3] == "#IMPLIED"
          attdef.compact!
          val = attdef[3]
          val = attdef[4] if val == "#FIXED "
          pairs[attdef[0]] = val
          if attdef[0] =~ /^xmlns:(.*)/
            @nsstack[0] << $1
          end
        end
      end
      return [ :attlistdecl, element, pairs, contents ]
    when NOTATIONDECL_START
      if @source.match( PUBLIC )
        md = @source.match( PUBLIC, true )
        vals = [md[1],md[2],md[4],md[6]]
      elsif @source.match( SYSTEM )
        md = @source.match( SYSTEM, true )
        vals = [md[1],md[2],nil,md[4]]
      else
        raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
      end
      return [ :notationdecl, *vals ]
    when CDATA_END
      # "]>" closes the internal subset.
      @document_status = :after_doctype
      @source.match( CDATA_END, true )
      return [ :end_doctype ]
    end
  end

  # --- Document content --------------------------------------------------
  begin
    if @source.buffer[0] == ?<
      if @source.buffer[1] == ?/
        @nsstack.shift
        last_tag = @tags.pop
        md = @source.match( CLOSE_MATCH, true )
        # A close tag that does not even parse used to fail on md[1] and
        # surface as a generic "Exception parsing" error.
        if md.nil?
          raise REXML::ParseException.new( "Missing end tag for '#{last_tag}'", @source )
        end
        raise REXML::ParseException.new( "Missing end tag for "+
          "'#{last_tag}' (got \"#{md[1]}\")",
          @source) unless last_tag == md[1]
        return [ :end_element, last_tag ]
      elsif @source.buffer[1] == ?!
        md = @source.match(/\A(\s*[^>]*>)/um)
        raise REXML::ParseException.new("Malformed node", @source) unless md
        if md[0][2] == ?-
          md = @source.match( COMMENT_PATTERN, true )
          return [ :comment, md[1] ] if md
        else
          md = @source.match( CDATA_PATTERN, true )
          return [ :cdata, md[1] ] if md
        end
        raise REXML::ParseException.new( "Declarations can only occur "+
          "in the doctype declaration.", @source)
      elsif @source.buffer[1] == ??
        md = @source.match( INSTRUCTION_PATTERN, true )
        return [ :processing_instruction, md[1], md[2] ] if md
        raise REXML::ParseException.new( "Bad instruction declaration",
          @source)
      else
        # Get the next tag
        md = @source.match(TAG_MATCH, true)
        unless md
          # Check for missing attribute quotes
          raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
          raise REXML::ParseException.new("malformed XML: missing tag start", @source)
        end
        attributes = {}
        prefixes = Set.new
        prefixes << md[2] if md[2]
        @nsstack.unshift(curr_ns=Set.new)
        if md[4].size > 0
          attrs = md[4].scan( ATTRIBUTE_PATTERN )
          # $' is whatever follows the last attribute that scan matched.
          raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
          # ATTRIBUTE_PATTERN groups: qualified name, prefix, local part,
          # quote character, value.
          attrs.each { |attr_name, prefix, local_part, _quote, value|
            if prefix == "xmlns"
              if local_part == "xml"
                # Compare the attribute VALUE; the quote character was
                # compared before, which rejected the legal binding too.
                if value != "http://www.w3.org/XML/1998/namespace"
                  msg = "The 'xml' prefix must not be bound to any other namespace "+
                  "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                  raise REXML::ParseException.new( msg, @source, self )
                end
              elsif local_part == "xmlns"
                msg = "The 'xmlns' prefix must not be declared "+
                "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                raise REXML::ParseException.new( msg, @source, self)
              end
              curr_ns << local_part
            elsif prefix
              prefixes << prefix unless prefix == "xml"
            end
            attributes[attr_name] = value
          }
        end

        # Verify that all of the prefixes have been defined
        prefixes.each do |prefix|
          unless @nsstack.any? { |declared| declared.member?(prefix) }
            raise REXML::UndefinedNamespaceException.new(prefix,@source,self)
          end
        end

        if md[6]
          # Empty-element tag: remember the name so the next pull emits
          # the matching :end_element.
          @closed = md[1]
          @nsstack.shift
        else
          @tags.push( md[1] )
        end
        return [ :start_element, md[1], attributes ]
      end
    else
      md = @source.match( TEXT_PATTERN, true )
      if md[0].length == 0
        @source.match( /(\s+)/, true )
      end
      return [ :text, md[1] ]
    end
  rescue REXML::UndefinedNamespaceException
    raise
  rescue REXML::ParseException
    raise
  rescue StandardError => error
    # Only ordinary errors are wrapped; signals and exit requests
    # (Interrupt, SystemExit, ...) propagate untouched.
    raise REXML::ParseException.new( "Exception parsing",
      @source, self, error )
  end
  # Every branch above returns or raises; kept as a defensive fallback.
  return [ :dummy ]
end
422
# Resolves the entity named +reference+ to its unnormalized replacement
# text. +entities+ (which may be nil) is consulted first, then the
# predefined entities in DEFAULT_ENTITIES. Returns nil when the name is
# not known to either.
def entity( reference, entities )
  resolved = entities ? entities[ reference ] : nil
  unless resolved
    builtin = DEFAULT_ENTITIES[ reference ]
    resolved = builtin ? builtin[2] : nil
  end
  resolved ? unnormalize( resolved, entities ) : nil
end
432
# Escapes +input+ for inclusion in XML and returns the escaped copy; the
# argument itself is never modified.
#
# input::         the raw text
# entities::      optional Hash of entity name => replacement text; each
#                 occurrence of a replacement text becomes "&name;"
# entity_filter:: optional collection of entity names from +entities+
#                 that must NOT be substituted
#
# Ampersands that do not start a named entity reference are escaped first
# so that the references inserted afterwards are left alone; finally the
# raw forms of the predefined entities in DEFAULT_ENTITIES are escaped.
def normalize( input, entities=nil, entity_filter=nil )
  # dup rather than clone: a frozen input must still yield a mutable copy.
  copy = input.dup
  # Doing it like this rather than in a loop improves the speed
  copy.gsub!( EREFERENCE, '&amp;' )
  if entities
    entities.each do |key, value|
      # The filter holds entity names, so it is checked against the key.
      next if entity_filter and entity_filter.include?( key )
      copy.gsub!( value, "&#{key};" )
    end
  end
  copy.gsub!( EREFERENCE, '&amp;' )
  DEFAULT_ENTITIES.each_value do |forms|
    # forms[3] matches the raw character, forms[1] is its escaped form.
    copy.gsub!( forms[3], forms[1] )
  end
  copy
end
448
# Unescapes +string+ and returns the result; the argument itself is never
# modified.
#
# string::   the escaped text
# entities:: optional Hash of entity name => replacement text, passed on
#            to #entity for lookup
# filter::   optional collection of entity names that must be left as
#            references
#
# Steps, in order: line ends ("\r\n" and "\r") become "\n"; decimal and
# hexadecimal character references are decoded; named references are
# replaced by what #entity returns for them; the predefined entities are
# replaced; and the ampersand entity is decoded last so that text it
# uncovers is not unescaped a second time.
def unnormalize( string, entities=nil, filter=nil )
  # dup rather than clone: a frozen input must still yield a mutable copy.
  rv = string.dup
  rv.gsub!( /\r\n?/, "\n" )
  matches = rv.scan( REFERENCE_RE )
  return rv if matches.size == 0
  # Character references; leading zeros are skipped so that the number is
  # not read as octal by Integer().
  rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
    m = $1
    m = "0#{m}" if m[0] == ?x
    [Integer(m)].pack('U*')
  }
  # REFERENCE_RE captures only the name of a named reference; character
  # references contribute nil and are dropped here.
  names = matches.collect { |groups| groups[0] }.compact
  if names.size > 0
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      entity_value = entity( entity_reference, entities )
      if entity_value
        # Literal pattern plus block form: neither regexp metacharacters
        # in the name nor backslashes in the value get interpreted.
        rv.gsub!( "&#{entity_reference};" ) { entity_value }
      end
    end
    names.each do |entity_reference|
      next if filter and filter.include?(entity_reference)
      er = DEFAULT_ENTITIES[entity_reference]
      rv.gsub!( er[0], er[2] ) if er
    end
    rv.gsub!( /&amp;/, '&' )
  end
  rv
end
481 end
482 end
483 end
484
485 =begin
486 case event[0]
487 when :start_element
488 when :text
489 when :end_element
490 when :processing_instruction
491 when :cdata
492 when :comment
493 when :xmldecl
494 when :start_doctype
495 when :end_doctype
496 when :externalentity
497 when :elementdecl
498 when :entity
499 when :attlistdecl
500 when :notationdecl
501 when :end_doctype
502 end
503 =end