1 # coding: utf-8
2 # frozen_string_literal: true
3
4
5 require 'strscan'
6
7 require_relative 'lexer/abstract'
8 require_relative 'lexer/separator'
9 require_relative 'lexer/comment'
10 require_relative 'lexer/token'
11 require_relative 'lexer/string'
12 require_relative 'lexer/entry'
13
14
15
16 =begin
17 The lexical analyzer is modeled as a finite state machine with a stack.
18 Here we show the definition of a model using a fictitious domain
19 specific language (DSL).
20 This domain specific language was inspired by Estelle,
21 a formal specification description language for communication protocols.
22
23
24 字句解析器は、スタック付き有限状態機械としてモデル化される。
25 ここでは、架空のドメイン固有言語(DSL)によるモデルの定義を示す。
26 このドメイン固有言語は、通信プロトコルの
27 形式的仕様記述言語であるEstelleから発想を得て考案した。
28
29 Estelleに関する文献を以下に示す。
30 * 通信システムの形式記述技法の標準化:
31 Estelle言語の特質と処理系の現状と動向
32 情報処理, Vol.31 No.1, Jan. 1990, 岡田康治, IPSJ-MGN310109
33 * プロトコル言語
34 1994年, 監修 水野忠則, 発行 (株)カットシステム, ISBN 4-906391-08-7
35
36
37 #######################################
38 ##### State Machine Specification #####
39 #######################################
40
41 %STRUCTURE Lexer {
42 %DATA Pattern {
43 | PatBeginComment String
44 | PatEndComment String
45 | PatNewline String
46 | PatSpace String
47 | PatBeginString String
48 | PatEndString String
49 | PatBeginBraket String
50 | PatEndBraket String
51 | PatNumber String
52 | PatWord String
53 | PatAny String
54 }
55
56 %DATA Token {
57 | TokNewline (String, Int) /* NL(String) */
58 | TokSpace (String, Int) /* SP(String) */
59 | TokComment (String, Int) /* COMMENT(String) */
60
61 | TokBeginBraket (String, Int)
62 | TokEndBraket (String, Int)
63
64 | TokReserved (Symbol, Int) /* IF, THEN, ELSE, ... etc */
65
66 | TokIdentifier (Symbol, Int) /* ID(Symbol) or */
67 /* MSG(Symbol) or */
68 /* LABEL(Symbol) or */
69 /* DIR(Symbol) or */
70 /* NSEL(Int) or */
71 /* LSEL(Symbol) */
72
73 | TokNumber (Number, Int) /* INT(Int) or */
74 /* FLOAT(Float) */
75
76 | TokSymbol (Symbol, Int) /* SYM(Symbol) */
77 | TokString (String, Int) /* STRING(String) */
78 }
79
80 %INPUT %EVENT = Pattern
81 %OUTPUT %EVENT = Token
82
83
84 %STRUCTURE Abstraction {
85 %ABSTRACT %STATE Abstract %HAS {
86 line-num: Int,
87 braket-stack: String List
88 }
89
90 %ABSTRACT %STATE String %IS-A Abstract %HAS {
91 buf: String
92 }
93 }
94
95 %STATE Separator %IS-A Abstraction::Abstract
96
97 %STATE Comment %IS-A Abstraction::Abstract %HAS {
98 buf: String,
99 saved-line-num: Int,
100 comment-depth: Int
101 }
102
103 %STATE Token %IS-A Abstraction::Abstract
104
105 %STATE BasicString %IS-A Abstraction::String
106 %STATE SymbolizedString %IS-A Abstraction::String
107
108 %INITIAL %STATE = %NEW Separator {line-num: 1 braket-stack: []}
109
110 %VAL braket-map = %{"(" -> ")", "[" -> "]", "{" -> "}", ...}
111
112
113 %TRANSITION {
114 %FROM Separator {line-num:}
115 %WHEN PatBeginComment _
116 %TO Comment {
117 buf: "" saved-line-num: line-num comment-depth: 1
118 }
119 %WHEN PatNewline matched
120 %OUTPUT TokNewline (matched, line-num)
121 %TO %SAME {line-num: line-num + 1}
122 %WHEN PatSpace matched
123 %OUTPUT TokSpace (matched, line-num)
124 %WHEN %ANY
125 %TO Token
126 | %FROM Comment {
127 line-num: buf:
128 saved-line-num: saved-line-num
129 comment-depth: depth
130 }
131 %WHEN PatBeginComment matched
132 %TO %SAME {buf: buf ^ matched comment-depth: depth + 1}
133 %WHEN PatEndComment matched
134 ! %IF depth <= 1 %THEN
135 %OUTPUT TokComment (buf, saved-line-num)
136 %TO Separator
137 %ELSE
138 %TO %SAME {buf: buf ^ matched comment-depth: depth - 1}
139 %WHEN PatNewline matched
140 %TO %SAME {line-num: line-num + 1 buf: buf ^ matched}
141 %WHEN PatAny matched
142 %TO %SAME {buf: buf ^ matched}
143 %WHEN %ANY
144 %ABORT "No case"
145 | %FROM Token {line-num: braket-stack: stack}
146 %WHEN PatBeginString
147 ! %IF symbol? %THEN
148 %TO SymbolizedString {buf: ""}
149 %ELSE
150 %TO BasicString {buf: ""}
151 %WHEN PatBeginBraket matched
152 %OUTPUT TokBeginBraket (matched, line-num)
153 %TO Separator {braket-stack: [matched|stack]}
154 %WHEN PatEndBraket matched-eb
155 ! %CASE stack %OF {
156 [] -> %ERROR "Unexpected end-braket"
157 | [bb|stack'] ->
158 %CASE braket-map.(lookup bb) %OF {
159 | None -> %ABORT ("Unknown bracket: " ^ bb)
160 | Some found-eb ->
161 %IF matched-eb == found-eb %THEN
162 %OUTPUT TokEndBraket (matched-eb, line-num)
163 %TO Separator {braket-stack: stack'}
164 %ELSE
165 %ERROR "Mismatched brakets"
166 }
167 }
168 %WHEN PatNumber matched
169 %OUTPUT TokNumber (to-number matched, line-num)
170 %TO Separator
171 %WHEN PatWord matched
172 ! %IF symbol? %THEN
173 %OUTPUT TokSymbol (to-symbol matched, line-num)
174 %ELSE
175 %IF reserved? %THEN
176 %OUTPUT TokReserved (to-symbol matched, line-num)
177 %ELSE
178 %OUTPUT TokIdentifier (to-symbol matched, line-num)
179 %TO Separator
180 %WHEN %ANY
181 %ERROR "Can't be recognized as a token"
182 | %FROM Abstraction::String {buf: ..}
183 %WHEN PatNewline
184 %ERROR "Unexpected end-of-string"
185 %WHEN PatAny matched
186 %TO %SAME {buf: buf ^ matched}
187 | %FROM BasicString {line-num:, buf: ..}
188 %WHEN PatEndString
189 %OUTPUT TokString (buf, line-num)
190 %TO Separator
191 %WHEN %ANY
192 %ABORT
193 | %FROM SymbolizedString {line-num:, buf: ..}
194 %WHEN PatEndString
195 %OUTPUT TokSymbol (to-symbol buf, line-num)
196 %TO Separator
197 %WHEN %ANY
198 %ABORT
199 }
200 }
201 =end