File: lexical/lexer.rb

Code

Code

   1  # coding: utf-8
   2  # frozen_string_literal: true
   3 
   4 
   5  require 'strscan'
   6 
   7  require_relative 'lexer/abstract'
   8  require_relative 'lexer/separator'
   9  require_relative 'lexer/comment'
  10  require_relative 'lexer/token'
  11  require_relative 'lexer/string'
  12  require_relative 'lexer/entry'
  13 
  14 
  15 
  16  =begin
  17  The lexical analyzer is modeled as a finite state machine with a stack.
  18  Here we show the definition of the model using a fictitious
  19  domain-specific language (DSL).
  20  This domain-specific language was inspired by Estelle,
  21  a formal specification description language for communication protocols.
  22 
  23 
  24  字句解析器は、スタック付き有限状態機械としてモデル化される。
  25  ここでは、架空のドメイン固有言語(DSL)によるモデルの定義を示す。
  26  このドメイン固有言語は、通信プロトコルの
  27  形式的仕様記述言語であるEstelleから発想を得て考案した。
  28 
  29  Estelleに関する文献を以下に示す。
  30  * 通信システムの形式記述技法の標準化:
  31      Estelle言語の特質と処理系の現状と動向
  32      情報処理, Vol.31 No.1, Jan. 1990, 岡田康治, IPSJ-MGN310109
  33  * プロトコル言語
  34      1994年, 監修 水野忠則, 発行 (株)カットシステム, ISBN 4-906391-08-7
  35 
  36 
  37  #######################################
  38  ##### State Machine Specification  ####
  39  #######################################
  40 
  41  %STRUCTURE Lexer {
  42      %DATA Pattern {
  43        | PatBeginComment  String
  44        | PatEndComment    String
  45        | PatNewline       String
  46        | PatSpace         String
  47        | PatBeginString   String
  48        | PatEndString     String
  49        | PatBeginBraket   String
  50        | PatEndBraket     String
  51        | PatNumber        String
  52        | PatWord          String
  53        | PatAny           String
  54      }
  55 
  56      %DATA Token {
  57        | TokNewline     (String, Int)    /* NL(String)       */
  58        | TokSpace       (String, Int)    /* SP(String)       */
  59        | TokComment     (String, Int)    /* COMMENT(String)  */
  60 
  61        | TokBeginBraket (String, Int)
  62        | TokEndBraket   (String, Int)
  63 
  64        | TokReserved    (Symbol, Int)    /* IF, THEN, ELSE, ... etc */
  65 
  66        | TokIdentifier  (Symbol, Int)    /* ID(Symbol) or    */
  67                                          /* MSG(Symbol) or   */
  68                                          /* LABEL(Symbol) or */
  69                                          /* DIR(Symbol) or   */
  70                                          /* NSEL(Int) or     */
  71                                          /* LSEL(Symbol)     */
  72 
  73        | TokNumber      (Number, Int)    /* INT(Int) or      */
  74                                          /* FLOAT(Float)     */
  75 
  76        | TokSymbol      (Symbol, Int)    /* SYM(Symbol)      */
  77        | TokString      (String, Int)    /* STRING(String)   */
  78      }
  79 
  80      %INPUT %EVENT  = Pattern
  81      %OUTPUT %EVENT = Token
  82 
  83 
  84      %STRUCTURE Abstraction {
  85          %ABSTRACT %STATE Abstract %HAS {
  86              line-num:       Int,
  87              braket-stack:   String List
  88          }
  89 
  90          %ABSTRACT %STATE String %IS-A Abstract %HAS {
  91              buf:            String
  92          }
  93      }
  94 
  95      %STATE Separator %IS-A Abstraction::Abstract
  96 
  97      %STATE Comment %IS-A Abstraction::Abstract %HAS {
  98          buf:            String,
  99          saved-line-num: Int,
 100          comment-depth:  Int
 101      }
 102 
 103      %STATE Token %IS-A Abstraction::Abstract
 104 
 105      %STATE BasicString      %IS-A Abstraction::String
 106      %STATE SymbolizedString %IS-A Abstraction::String
 107 
 108      %INITIAL %STATE = %NEW Separator {line-num: 1 braket-stack: []}
 109 
 110      %VAL braket-map = %{"(" -> ")", "[" -> "]", "{" -> "}", ....}
 111 
 112 
 113      %TRANSITION {
 114        %FROM Separator {line-num:}
 115          %WHEN PatBeginComment _
 116              %TO Comment {
 117                      buf: "" saved-line-num: line-num comment-depth: 1
 118                  }
 119          %WHEN PatNewline matched
 120              %OUTPUT TokNewline (matched, line-num)
 121              %TO %SAME {line-num: line-num + 1}
 122          %WHEN PatSpace matched
 123              %OUTPUT TokSpace (matched, line-num)
 124          %WHEN %ANY
 125              %TO Token
 126      | %FROM Comment {
 127              line-num:, buf:
 128              saved-line-num: saved-line-num
 129              comment-depth: depth
 130          }
 131          %WHEN PatBeginComment matched
 132              %TO %SAME {buf: buf ^ matched comment-depth: depth + 1}
 133          %WHEN PatEndComment matched
 134              ! %IF depth <= 1 %THEN
 135                  %OUTPUT TokComment (buf, saved-line-num)
 136                  %TO Separator
 137                %ELSE
 138                  %TO %SAME {buf: buf ^ matched comment-depth: depth - 1}
 139          %WHEN PatNewline matched
 140              %TO %SAME {line-num: line-num + 1 buf: buf ^ matched}
 141          %WHEN PatAny matched
 142              %TO %SAME {buf: buf ^ matched}
 143          %WHEN %ANY
 144              %ABORT "No case"
 145      | %FROM Token {line-num: braket-stack: stack}
 146          %WHEN PatBeginString
 147              ! %IF symbol? %THEN
 148                  %TO SymbolizedString {buf: ""}
 149                %ELSE
 150                  %TO BasicString {buf: ""}
 151          %WHEN PatBeginBraket matched
 152              %OUTPUT TokBeginBraket (matched, line-num)
 153              %TO Separator {braket-stack: [matched|stack]}
 154          %WHEN PatEndBraket matched-eb
 155              ! %CASE stack %OF {
 156                  [] -> %ERROR "Unexpected end-braket"
 157                | [bb|stack'] ->
 158                      %CASE braket-map.(lookup bb) %OF {
 159                        | NONE -> %ABORT ("Unknown bracket: " ^ bb)
 160                        | Some found-eb ->
 161                              %IF matched-eb == found-eb %THEN
 162                                  %OUTPUT TokEndBraket (matched-eb, line-num)
 163                                  %TO Separator {braket-stack: stack'}
 164                              %ELSE
 165                                  %ERROR "Mismatched brakets"
 166                      }
 167              }
 168          %WHEN PatNumber matched
 169              %OUTPUT TokNumber (to-number matched, line-num)
 170              %TO Separator
 171          %WHEN PatWord matched
 172              ! %IF symbol? %THEN
 173                  %OUTPUT TokSymbol (to-symbol matched, line-num)
 174                %ELSE
 175                  %IF reserved? %THEN
 176                      %OUTPUT TokReserved (to-symbol matched, line-num)
 177                  %ELSE
 178                      %OUTPUT TokIdentifier (to-symbol matched, line-num)
 179              %TO Separator
 180          %WHEN %ANY
 181              %ERROR "Can't be recognized as a token"
 182      | %FROM Abstraction::String {buf: ..}
 183          %WHEN PatNewline
 184              %ERROR "Unexpected end-of-string"
 185          %WHEN PatAny matched
 186              %TO %SAME {buf: buf ^ matched}
 187      | %FROM BasicString {line-num:, buf: ..}
 188          %WHEN PatEndString
 189              %OUTPUT TokString (buf, line-num)
 190              %TO Separator
 191          %WHEN %ANY
 192              %ABORT
 193      | %FROM SymbolizedString {line-num:, buf: ..}
 194          %WHEN PatEndString
 195              %OUTPUT TokSymbol (buf, line-num)
 196              %TO Separator
 197          %WHEN %ANY
 198              %ABORT
 199      }
 200  }
 201  =end