1 # coding: utf-8
2 # frozen_string_literal: true
3
4
5
6 module Umu
7
8 module Lexical
9
10 module Lexer
11
# Lexer state that recognizes one ordinary token at the current scanner
# position: numbers, selectors, string openers, words, symbols and brakets.
#
# NOTE(review): self.loc, self.braket_stack and the __make_*__ helpers are
# not defined in this file -- presumably inherited from Abstract; confirm.
class Token < Abstract

    # Regexp source for a single word, kept as a String so that it can be
    # embedded into the larger patterns below.  In order it accepts:
    # leading underscores, one letter, alphanumerics, any number of
    # hyphen-joined alphanumeric segments, trailing underscores,
    # an optional '?' or '!', and trailing apostrophes.
    IDENT_WORD = '(_*[[:alpha:]][[:alnum:]]*(\-[[:alnum:]]+)*_*[\?!]?\'*)'
    # See -> https://qiita.com/Takayuki_Nakano/items/8d38beaddb84b488d683

    # A word immediately followed by '::'.  Capture 1 is the word itself.
    MODULE_DIRECTORY_PATTERN = Regexp.new("#{IDENT_WORD}::")

    # A word with an optional one-character prefix and an optional ':' suffix.
    # Captures: 1 = prefix ('@', '$' or '.'), 2 = word,
    #           3 = inner hyphen group of IDENT_WORD, 4 = trailing ':'.
    IDENT_PATTERN = Regexp.new(['([@$\.])?', IDENT_WORD, '(:)?'].join)


    # Set (word => true) of reserved words.
    # A word listed twice aborts while this file is being loaded.
    RESERVED_WORDS = [
        '__FILE__', '__LINE__',
        'and', 'assert',
        'case', 'cond',
        'delay', 'do',
        'else', 'elsif',
        'fun',
        'if', 'import', 'in',
        'kind-of?',
        'let',
        'mod',
        'of',
        'pow',
        'rec',
        'struct', 'structure',
        'then',
        'val',
        'where',

        # --- Not used, but reserved for future ---

        # For pattern matching
        'as',

        # For pragma
        'export', 'pragma', 'use',

        # For module language
        'functor', 'signat', 'signature',

        # For data type declaration
        'data', 'datum', 'type',

        # For class declaration
        'abstract', 'alias', 'class', 'def',
        'has', 'is-a', 'protocol', 'self', 'super', 'with',

        # For infix operator declaration
        'infixl', 'infixr',

        # For continuation
        'callcc', 'throw',

        # For exception
        'begin', 'ensure', 'raise', 'rescue',

        # For lazy evaluation
        'lazy',

        # For non-determinism
        'none', 'or'
    ].each_with_object({}) { |word, table|
        table.merge!(word => true) { |clash, _, _|
            ASSERT.abort format("Duplicated reserved-word: '%s'", clash)
        }
    }


    # Set (symbol => true) of reserved symbols.
    # A symbol listed twice aborts while this file is being loaded.
    RESERVED_SYMBOLS = [
        '=', '$', '!', '_', ',',
        '&', '|',
        '&&', '||',
        '.', ':', ';',
        '..', '::', ';;',
        '->', '<-',
        '<<', '>>', '<|', '|>',

        # Redefinable symbols
        '+', '-', '*', '/', '^',
        '==', '<>', '<', '>', '<=', '>=', '<=>',
        '++',
        ':=',

        # Not used, but reserved for future
        '?',        # Propagating errors in DO-expression
        '...',      # Range (exclude last value)
        ':/:',      # Junction for component oriented design
        ']|['       # Guard separator for concurrency
    ].each_with_object({}) { |symbol, table|
        table.merge!(symbol => true) { |clash, _, _|
            ASSERT.abort format("Duplicated reserved-symbol: '%s'", clash)
        }
    }


    # Set (symbol => true) of symbols that are lexed as identifiers
    # instead of reserved symbols.
    IDENTIFIER_SYMBOLS = [
        '!!'        # Peek operator for reference type
    ].each_with_object({}) { |symbol, table|
        table.merge!(symbol => true) { |clash, _, _|
            ASSERT.abort format("Duplicated identifier-symbol: '%s'", clash)
        }
    }


    # Every [begin-braket, end-braket] pair known to the lexer.
    BRAKET_PAIRS = [
        ['(',   ')'],   # Tuple, etc
        ['[',   ']'],   # List
        ['{',   '}'],   # Lambda, etc
        ['.(',  ')'],   # Message
        ['.[',  ']'],   # Apply Message
        ['%[',  ']'],   # Polymorphic pattern
        ['%S(', ')'],   # S-Expression
        ['%{',  '}'],   # Embedded-expression in S-expression (and Map)
        ['&(',  ')'],   # Instance message
        ['&[',  ']'],   # Cell stream
        ['&{',  '}'],   # Memorized and Suspended stream
        ['$(',  ')'],   # Named tuple modifier


        # Not used, but reserved for future

        ['%q[', ']'],   # Queue
        ['%v[', ']'],   # Vector
        ['%a[', ']'],   # Array
        ['%J[', ']'],   # JSON
        ['%X[', ']'],   # XML
        ['%(',  ')'],   # Set
        ['@[',  ']'],   # Assoc -- Key-Value list
        ['@(',  ')'],   # Dict -- Key-Value set
        ['$[',  ']']    # Exchange -- Communication channel for concurrency
    ]


    # Lookup: begin-braket => the end-braket that must close it.
    BRAKET_MAP_OF_BEGIN_TO_END =
        BRAKET_PAIRS.each_with_object({}) { |(opener, closer), table|
            table.merge!(opener => closer) { |clash, _, _|
                ASSERT.abort format("Duplicated begin-braket: '%s'", clash)
            }
        }


    # Set (begin-braket => true).
    BEGIN_BRAKET_SYMBOLS =
        BRAKET_PAIRS.each_with_object({}) { |(opener, _closer), table|
            table.merge!(opener => true) { |clash, _, _|
                ASSERT.abort format("Duplicated begin-braket: '%s'", clash)
            }
        }


    # Set (end-braket => true).  Several begin-brakets share one end-braket,
    # so repeated keys are expected here and are not treated as an error.
    END_BRAKET_SYMBOLS =
        BRAKET_PAIRS.each_with_object({}) { |(_opener, closer), table|
            table[closer] = true
        }


    # One escaped Regexp per symbol from the four tables above, ordered from
    # the longest symbol to the shortest so that trying them in sequence
    # yields the longest match.  A symbol present in more than one table
    # aborts while this file is being loaded.
    SYMBOL_PATTERNS = [
        RESERVED_SYMBOLS,
        IDENTIFIER_SYMBOLS,
        BEGIN_BRAKET_SYMBOLS,
        END_BRAKET_SYMBOLS
    ].each_with_object({}) { |table, merged|
        merged.merge!(table) { |clash, _, _|
            ASSERT.abort format("Duplicated symbol: '%s'", clash)
        }
    }.keys.sort { |a, b|
        b.length <=> a.length   # For longest-match
    }.map { |symbol|
        Regexp.new(Regexp.escape(symbol))
    }


    # Scan exactly one token from +scanner+ (must be a ::StringScanner).
    #
    # The alternatives are tried in a fixed order and the first one that
    # matches is taken: number, number-selector, string opener,
    # module directory, word, and finally symbol / braket.
    #
    # Returns a 4-element Array:
    #   [event name (Symbol), matched text, Array of tokens, next]
    # where +next+ is whatever the __make_*__ helper returns
    # (presumably the lexer state to continue with -- TODO confirm).
    #
    # Raises X::LexicalError when
    # * a prefixed word ('@', '$' or '.') also ends with ':',
    # * an end-braket appears while the braket stack is empty,
    # * an end-braket does not pair with the innermost begin-braket,
    # * nothing matches at the current position.
    def lex(scanner)
        ASSERT.kind_of scanner, ::StringScanner

        # Float or Int
        if scanner.scan(/[+-]?\d+(\.\d+)?/)
            text = scanner.matched

            # Capture 1 is the fractional part, present only for a float
            number = if scanner[1]
                            LT.make_float   self.loc, scanner.matched.to_f
                        else
                            LT.make_integer self.loc, scanner.matched.to_i
                        end

            [:Number, text, [number], __make_separator__]

        # Number-Selector
        elsif scanner.scan(/\$(\d+)/)
            text     = scanner.matched
            selector = LT.make_number_selector(self.loc, scanner[1].to_i)

            [:NumberSelector, text, [selector], __make_separator__]

        # Begin-String
        elsif scanner.scan(/(@)?"/)
            text = scanner.matched

            # A leading '@' (capture 1) selects the symbolized-string helper
            following = if scanner[1]
                            __make_symbolized_string__('')
                        else
                            __make_string__('')
                        end

            [:BeginString, text, [], following]

        # Module identifier word
        elsif scanner.scan(MODULE_DIRECTORY_PATTERN)
            dir_name = scanner[1]
            text     = scanner.matched
            dir      = LT.make_module_directory(self.loc, dir_name)

            [:Word, text, [dir], __make_separator__]

        # Symbol, Message, Reserved-word or Identifier-word
        elsif scanner.scan(IDENT_PATTERN)
            prefix = scanner[1]
            name   = scanner[2]
            colon  = scanner[4]
            text   = scanner.matched

            word =
                if prefix && colon
                    # A prefixed word can not be a label at the same time
                    raise X::LexicalError.new(
                        self.loc,
                        "Invalid character: ':' in word: '%s'",
                        scanner.matched
                    )
                elsif prefix
                    case prefix
                    when '@' then LT.make_symbol(self.loc, name)
                    when '$' then LT.make_label_selector(self.loc, name)
                    when '.' then LT.make_message(self.loc, name)
                    else          ASSERT.abort prefix
                    end
                elsif colon
                    LT.make_label(self.loc, name)
                elsif RESERVED_WORDS[name]
                    LT.make_reserved_word(self.loc, name)
                else
                    LT.make_identifier(self.loc, name)
                end

            [:Word, text, [word], __make_separator__]

        # Reserved-symbol or Identifier-symbol
        elsif SYMBOL_PATTERNS.any? { |pattern| scanner.scan pattern }
            sym = scanner.matched

            if RESERVED_SYMBOLS[sym]
                [
                    :ReservedSymbol,
                    scanner.matched,
                    [LT.make_reserved_symbol(self.loc, sym)],
                    __make_separator__
                ]
            elsif IDENTIFIER_SYMBOLS[sym]
                [
                    :IdentifierSymbol,
                    scanner.matched,
                    [LT.make_identifier(self.loc, sym)],
                    __make_separator__
                ]
            elsif BEGIN_BRAKET_SYMBOLS[sym]
                # Push the begin-braket on top of the braket stack
                [
                    :BeginBraket,
                    scanner.matched,
                    [LT.make_reserved_symbol(self.loc, sym)],
                    __make_separator__(self.loc, [sym] + self.braket_stack)
                ]
            elsif END_BRAKET_SYMBOLS[sym]
                # Pop the innermost begin-braket; nil means the stack is empty
                opener, *remaining = self.braket_stack
                unless opener
                    raise X::LexicalError.new(
                        self.loc,
                        "Unexpected end-braket: '%s'", sym
                    )
                end

                # Everything on the stack must be a known begin-braket
                expected = BRAKET_MAP_OF_BEGIN_TO_END[opener]
                ASSERT.abort self.inspect unless expected

                unless sym == expected
                    raise X::LexicalError.new(
                        self.loc,
                        "Mismatch of brakets: '%s' .... '%s'",
                        opener, sym
                    )
                end

                [
                    :EndBraket,
                    scanner.matched,
                    [LT.make_reserved_symbol(self.loc, sym)],
                    __make_separator__(self.loc, remaining)
                ]
            else
                # SYMBOL_PATTERNS is built from exactly the four tables above
                ASSERT.abort sym
            end

        # Unmatched
        else
            raise X::LexicalError.new(
                self.loc,
                "Can't recognized as token: '%s'", scanner.inspect
            )
        end
    end
end
371
372 end # Umu::Lexical::Lexer
373
374 end # Umu::Lexical
375
376 end # Umu