1 require 'rexml/encoding'
2
3 module REXML
4 # Generates Source-s. USE THIS CLASS.
class SourceFactory
  # Wraps +arg+ in the kind of Source that matches what it is.
  #
  # @param arg a String, an IO-like object (anything that responds to
  #   +read+, +readline+ and +eof?+), or an existing Source
  # @return a Source for a String, an IOSource for an IO-like object, or
  #   +arg+ itself when it already is a Source
  # @raise [RuntimeError] when +arg+ is none of the above; this method never
  #   returns nil
  def self.create_from(arg)
    if arg.kind_of?(String)
      Source.new(arg)
    elsif [:read, :readline, :eof?].all? { |name| arg.respond_to?(name) }
      # The former extra test for :nil? was dropped: every Object responds
      # to it, so it never filtered anything out.
      IOSource.new(arg)
    elsif arg.kind_of?(Source)
      arg
    else
      raise "#{arg.class} is not a valid input stream. It must walk \n"+
        "like either a String, an IO, or a Source."
    end
  end
end
25
26 # A Source can be searched for patterns, and wraps buffers and other
27 # objects and provides consumption of text
28 class Source
29 include Encoding
30 # The current buffer (what we're going to read next)
31 attr_reader :buffer
32 # The line number of the last consumed text
33 attr_reader :line
34 attr_reader :encoding
35
36 # Constructor
37 # @param arg must be a String, and should be a valid XML document
38 # @param encoding if non-null, sets the encoding of the source to this
39 # value, overriding all encoding detection
def initialize(arg, encoding=nil)
  # @orig keeps the input exactly as given; @buffer is the part that has
  # not been consumed yet and initially refers to the same object.
  @orig = arg
  @buffer = arg
  # An explicit encoding wins; otherwise one is detected from the text.
  self.encoding = encoding || check_encoding( @buffer )
  @line = 0
end
49
50
51 # Inherited from Encoding
52 # Overridden to support optimized en/decoding
def encoding=(enc)
  # The inherited setter reports whether anything needs to be done;
  # when it answers with a false value this override does nothing more.
  return unless super
  # Remember how '>' looks in the new encoding.
  @line_break = encode( '>' )
  # Anything other than UTF-8 has to be converted; the buffer is decoded
  # before the flag is stored, in the same order as before.
  needs_decoding = (enc != UTF_8)
  @buffer = decode(@buffer) if needs_decoding
  @to_utf = needs_decoding
end
63
64 # Scans the source for a given pattern. Note, that this is not your
65 # usual scan() method. For one thing, the pattern argument has some
66 # requirements; for another, the source can be consumed. You can easily
67 # confuse this method. Originally, the patterns were easier
68 # to construct and this method more robust, because this method
69 # generated search regexes on the fly; however, this was
70 # computationally expensive and slowed down the entire REXML package
71 # considerably, since this is by far the most commonly called method.
72 # @param pattern must be a Regexp, and must be in the form of
73 # /^\s*(#{your pattern, with no groups})(.*)/. The first group
74 # will be returned; the second group is used if the consume flag is
75 # set.
76 # @param consume if true, the pattern returned will be consumed, leaving
77 # everything after it in the Source.
78 # @return the pattern, if found, or nil if the Source is empty or the
79 # pattern is not found.
def scan(pattern, cons=false)
  # Without a buffer there is nothing to scan.
  return nil if @buffer.nil?
  found = @buffer.scan(pattern)
  # When asked to consume, keep only the text after the last match.
  @buffer = Regexp.last_match.post_match if cons && !found.empty?
  found
end
86
# Deliberately does nothing and returns nil in this class. IOSource
# overrides it to pull more text from its stream.
def read
  nil
end
89
# Drops everything up to and including the first match of +pattern+.
# @param pattern a Regexp
# @return the remaining buffer when the pattern matched, nil (buffer left
#   untouched) when it did not
def consume( pattern )
  md = pattern.match( @buffer )
  @buffer = md.post_match if md
end
93
# Matches +pattern+ against the buffer without consuming anything.
# @param char not used by this implementation
# @param pattern a Regexp
# @return the MatchData, or nil when the pattern does not match
def match_to( char, pattern )
  pattern.match( @buffer )
end
97
# Matches +pattern+ against the buffer and, on success, consumes the text
# up to and including the match.
# @param char not used by this implementation
# @param pattern a Regexp
# @return the MatchData, or nil when the pattern does not match
def match_to_consume( char, pattern )
  md = pattern.match( @buffer )
  # Only advance on a successful match. Assigning the post-match text
  # unconditionally would replace the buffer with nil after a miss and
  # throw away all remaining input.
  @buffer = md.post_match if md
  md
end
103
# Matches +pattern+ against the buffer.
# @param pattern a Regexp
# @param cons when true and the pattern matches, the buffer is cut down to
#   the text following the match
# @return the MatchData, or nil when the pattern does not match
def match(pattern, cons=false)
  md = pattern.match( @buffer )
  @buffer = md.post_match if cons && md
  md
end
109
110 # @return true if the Source is exhausted
def empty?
  # A missing buffer counts as exhausted too: scan already treats a nil
  # buffer as "nothing left to scan", so both must agree.
  @buffer.nil? || @buffer == ""
end
114
# Offset of the unconsumed text inside the original input.
# @return the offset, or nil when the buffer cannot be found in the
#   original text
def position
  # The buffer is what remains at the END of the original text, so look
  # for its last occurrence. Searching from the front would report an
  # earlier, unrelated occurrence whenever the remaining text also appears
  # sooner (and 0 for an exhausted buffer).
  @orig.rindex( @buffer )
end
118
119 # @return the current line in the source
def current_line
  # Zero-based index of the line on which the unconsumed text starts, or
  # nil when that cannot be determined.
  #
  # The earlier version split the text on whitespace (String#split without
  # arguments) and compared whole words against the head of the buffer, so
  # it practically never found anything. Here the offset of the remaining
  # text is located and the newlines in front of it are counted.
  return nil if @buffer.nil?
  offset = @orig.rindex( @buffer )
  return nil if offset.nil?
  @orig[0, offset].count( "\n" )
end
126 end
127
128 # A Source that wraps an IO. See the Source class for method
129 # documentation
130 class IOSource < Source
131 #attr_reader :block_size
132
133 # block_size has been deprecated
def initialize(arg, block_size=500, encoding=nil)
  # block_size is accepted for backward compatibility only; it is never read.
  @er_source = @source = arg
  @to_utf = false

  # Determining the encoding is a deceptively difficult issue to resolve.
  # First, the first two bytes are checked for a UTF-16 byte order mark.
  # Then the encoding is assumed to be at least ASCII enough for '>', and
  # the stream is read up to the first one. That yields the XML
  # declaration, if there is one; the superclass constructor then inspects
  # that first chunk.
  @buffer = ""
  # read(2) returns nil at end of input; fall back to an empty head.
  str = @source.read( 2 ) || ""
  # Compare numeric byte values. Indexing a String yields a one-character
  # String on Ruby 1.9 and later, which never equals an Integer, so the
  # byte order mark tests must not rely on str[0] / str[1].
  head = str.unpack( 'C*' )
  if encoding
    # NOTE(review): the explicit encoding is applied here but is not passed
    # on to Source#initialize below, which runs check_encoding on the first
    # chunk again -- confirm that this is intended.
    self.encoding = encoding
  elsif head == [0xfe, 0xff]
    @line_break = "\000>"
  elsif head == [0xff, 0xfe]
    @line_break = ">\000"
  elsif head == [0xef, 0xbb]
    # Possibly a UTF-8 byte order mark: fetch the third byte and drop the
    # mark only if it is complete.
    third = @source.read( 1 )
    str += third if third
    str = '' if str.unpack( 'C*' )[2] == 0xbf
    @line_break = ">"
  else
    @line_break = ">"
  end
  # readline raises EOFError when nothing is left after the head (empty or
  # very short input); treat that as an empty first chunk.
  first_chunk = begin
    @source.readline( @line_break )
  rescue EOFError
    ""
  end
  super( str + first_chunk )
end
162
def scan(pattern, cons=false)
  rv = super
  # The inherited scan answers nil when there is no buffer at all; there
  # is nothing that could be extended in that case.
  return rv if rv.nil?
  # This section is very similar to the one in match(), but slightly
  # different: doing it this way is a touch faster with scan(), enough to
  # warrant duplicating some code.
  if rv.empty?
    # Pull in more input, one @line_break-terminated chunk at a time, until
    # the pattern shows up in the buffer or the stream is used up.
    until @buffer =~ pattern or @source.nil?
      begin
        str = @source.readline(@line_break)
        str = decode(str) if @to_utf and str
        @buffer << str
      rescue StandardError => e
        # Conversion failures are passed on to the caller. The class is
        # looked up defensively: naming Iconv directly in a rescue clause
        # raises NameError on interpreters without Iconv, which used to
        # turn every end-of-input into a crash.
        raise if defined?(Iconv::IllegalSequence) && e.kind_of?(Iconv::IllegalSequence)
        # Anything else (typically EOFError) means the stream is finished.
        @source = nil
      end
    end
    rv = super
  end
  # Tainting is unavailable on newer interpreters; mark the result only
  # where the method still exists.
  rv.taint if rv.respond_to?(:taint)
  rv
end
189
# Appends the next @line_break-terminated chunk of the stream to the
# buffer, decoding it first when the source is not UTF-8.
# @return the buffer, or nil when reading failed and the stream was dropped
def read
  str = @source.readline(@line_break)
  str = decode(str) if @to_utf and str
  @buffer << str
rescue StandardError
  # Any ordinary failure (end of input, closed stream, a source that is
  # already nil) ends reading. Only StandardError is caught so that
  # Interrupt, SystemExit and similar are no longer swallowed.
  @source = nil
end
199
# Consumes the text up to and including the first match of +pattern+,
# delegating to #match with consumption switched on.
# @param pattern a Regexp
# @return whatever #match returns
def consume( pattern )
  consume_on_match = true
  match( pattern, consume_on_match )
end
203
# Matches +pattern+ against the buffer, reading further chunks from the
# stream until the pattern matches or the stream is used up.
# @param pattern a Regexp
# @param cons when true, the buffer is cut down to the text after the match
# @return the MatchData, or nil when the pattern never matched
def match( pattern, cons=false )
  rv = pattern.match(@buffer)
  @buffer = rv.post_match if cons and rv
  until rv or !@source
    begin
      str = @source.readline(@line_break)
      str = decode(str) if @to_utf and str
      @buffer << str
      rv = pattern.match(@buffer)
      @buffer = rv.post_match if cons and rv
    rescue
      # Any failure while reading (typically end of input) ends the stream.
      @source = nil
    end
  end
  # Tainting is unavailable on newer interpreters; mark the result only
  # where the method still exists.
  rv.taint if rv.respond_to?(:taint)
  rv
end
221
# @return a true value only when the buffer is empty AND the stream has
#   nothing more to offer (it was dropped or is at end of input)
def empty?
  buffer_drained = super
  return buffer_drained unless buffer_drained
  @source.nil? || @source.eof?
end
225
# Current offset in the underlying stream.
# @return the stream's +pos+, or 0 when the stream is a pipe or cannot
#   report a position
def position
  # Not every accepted input is a File: the factory only requires read,
  # readline and eof?, so stat and pos may be missing (StringIO, for
  # example, has no stat).
  return 0 if @er_source.respond_to?(:stat) && @er_source.stat.pipe?
  return 0 unless @er_source.respond_to?(:pos)
  @er_source.pos
end
229
230 # @return the current line in the source
231 def current_line
232 begin
233 pos = @er_source.pos # The byte position in the source
234 lineno = @er_source.lineno # The XML < position in the source
235 @er_source.rewind
236 line = 0 # The \r\n position in the source
237 begin
238 while @er_source.pos < pos
239 @er_source.readline
240 line += 1
241 end
242 rescue
243 end
244 rescue IOError
245 pos = -1
246 line = -1
247 end
248 [pos, lineno, line]
249 end
250 end
251 end