Nim/compiler/parsecfg.nim

#
#
#            Nimrod's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

# A HIGH-PERFORMANCE configuration file parser;
# the Nimrod version of this file is part of the
# standard library.

import
  llstream, nhashes, strutils, nimlexbase

type
  # Kinds of events that `next` hands back to the caller.
  TCfgEventKind* = enum
    cfgEof,                   # end of file reached
    cfgSectionStart,          # a ``[section]`` has been parsed
    cfgKeyValuePair,          # a ``key=value`` pair has been detected
    cfgOption,                # a ``--key=value`` command line option
    cfgError # an error occurred during parsing; msg contains the
             # error message
  # A single parsing event. Which fields exist depends on `kind`.
  TCfgEvent* = object of TObject
    case kind*: TCfgEventKind
    of cfgEof:
        nil

    of cfgSectionStart:
        section*: string      # literal of the symbol that followed '['

    of cfgKeyValuePair, cfgOption:
        key*, value*: string  # value stays "" when no '='/':' part follows

    of cfgError:
        msg*: string          # text produced by errorStr


  # Token kinds produced by rawGetTok. tkSymbol is used both for bare
  # words (getSymbol) and for quoted string literals (getString).
  TTokKind* = enum
    tkInvalid, tkEof,         # order is important here!
    tkSymbol, tkEquals, tkColon, tkBracketLe, tkBracketRi, tkDashDash
  # A lexer token: its kind plus the text collected for it.
  TToken*{.final.} = object   # a token
    kind*: TTokKind           # the type of the token
    literal*: string          # the parsed (string) literal

  # Parser state stored in TCfgParser.state. `open` sets startState;
  # commaState is not referenced by any code visible in this file.
  TParserState* = enum
    startState, commaState
  # The parser itself: a base lexer extended with a one-token lookahead.
  TCfgParser* = object of TBaseLexer
    tok*: TToken              # lookahead token that `next` dispatches on
    state*: TParserState      # set to startState by `open`
    filename*: string         # reported by getFilename and errorStr


# Public interface. These are forward declarations only; the bodies
# follow in the implementation part of this file.
# Open: binds `c` to `inputStream`, stores `filename` and reads the first
# token.
proc Open*(c: var TCfgParser, filename: string, inputStream: PLLStream)
# Close: closes the underlying base lexer.
proc Close*(c: var TCfgParser)
# next: returns the next parsing event, chosen by the current lookahead
# token.
proc next*(c: var TCfgParser): TCfgEvent
# getColumn / getLine / getFilename: current position information.
proc getColumn*(c: TCfgParser): int
proc getLine*(c: TCfgParser): int
proc getFilename*(c: TCfgParser): string
# errorStr: formats `msg` as "file(line, column) Error: msg".
proc errorStr*(c: TCfgParser, msg: string): string
# implementation

const
  # Characters that may continue a bare symbol (see getSymbol): ASCII
  # letters, digits, '_' and every byte in the range 0x80..0xFF.
  SymChars: TCharSet = {'a'..'z', 'A'..'Z', '0'..'9', '_', '\x80'..'\xFF'}

# Forward declaration: `open` calls rawGetTok before its body appears.
proc rawGetTok(c: var TCfgParser, tok: var TToken)
# Binds the parser to `inputStream`, resets its own fields and reads the
# first token so that `next` always has a lookahead token to inspect.
proc open(c: var TCfgParser, filename: string, inputStream: PLLStream) =
  openBaseLexer(c, inputStream)
  # reset the parser-specific fields
  c.tok.literal = ""
  c.tok.kind = tkInvalid
  c.state = startState
  c.filename = filename
  # prime the one-token lookahead
  rawGetTok(c, c.tok)

# Shuts the parser down by closing the underlying base lexer.
proc close(c: var TCfgParser) =
  c.closeBaseLexer()

# Column of the current buffer position, as computed by getColNumber.
proc getColumn(c: TCfgParser): int =
  return getColNumber(c, c.bufPos)

# Line number currently tracked by the base lexer.
proc getLine(c: TCfgParser): int =
  return c.linenumber

# File name that was handed to `open`.
proc getFilename(c: TCfgParser): string =
  return c.filename

# Consumes at most one hexadecimal digit at the current buffer position.
# If there is one, `xi` is shifted left by four bits, the digit's value is
# or-ed in and the position advances; otherwise nothing changes.
proc handleHexChar(c: var TCfgParser, xi: var int) =
  var ch = c.buf[c.bufpos]
  var digit = -1              # -1 means "not a hex digit"
  if ch in {'0'..'9'}:
    digit = ord(ch) - ord('0')
  elif ch in {'a'..'f'}:
    digit = ord(ch) - ord('a') + 10
  elif ch in {'A'..'F'}:
    digit = ord(ch) - ord('A') + 10
  if digit >= 0:
    xi = (xi shl 4) or digit
    inc(c.bufpos)

# Consumes a run of decimal digits at the current buffer position and
# accumulates their value into `xi` (base 10).
proc handleDecChars(c: var TCfgParser, xi: var int) =
  while true:
    var ch = c.buf[c.bufpos]
    if not (ch in {'0'..'9'}): break
    xi = (xi * 10) + (ord(ch) - ord('0'))
    inc(c.bufpos)

# Translates one backslash escape and appends the result to `tok.literal`.
# On entry `c.bufpos` points at the backslash. Escape letters are accepted
# in either case. Supported forms:
#   \n                      the "\n" string
#   \r \c \l \f \e \a \b \v \t   the matching control character
#   \' \" \\                the character itself
#   \xH or \xHH             up to two hex digits
#   \DDD                    a decimal code, which must be <= 255
# Anything else, or a decimal code above 255, sets `tok.kind` to tkInvalid
# and appends nothing.
proc getEscapedChar(c: var TCfgParser, tok: var TToken) =
  var
    code: int
    sel: char
    repl: char
    isSimple: bool
  inc(c.bufpos)               # step over the backslash
  sel = c.buf[c.bufpos]
  # First try the escapes that map to exactly one replacement character.
  isSimple = true
  case sel
  of 'r', 'R', 'c', 'C': repl = CR
  of 'l', 'L': repl = LF
  of 'f', 'F': repl = FF
  of 'e', 'E': repl = ESC
  of 'a', 'A': repl = BEL
  of 'b', 'B': repl = BACKSPACE
  of 'v', 'V': repl = VT
  of 't', 'T': repl = Tabulator
  of '\'', '\"', '\\': repl = sel
  else: isSimple = false
  if isSimple:
    add(tok.literal, repl)
    inc(c.bufpos)
  elif sel in {'n', 'N'}:
    add(tok.literal, "\n")
    inc(c.bufpos)
  elif sel in {'x', 'X'}:
    inc(c.bufpos)             # step over the 'x'
    code = 0
    handleHexChar(c, code)
    handleHexChar(c, code)
    add(tok.literal, chr(code))
  elif sel in {'0'..'9'}:
    code = 0
    handleDecChars(c, code)
    if code > 255: tok.kind = tkInvalid
    else: add(tok.literal, chr(code))
  else:
    tok.kind = tkInvalid

# Consumes a line terminator at `pos`, if there is one.
# For CR or LF the work is delegated to the base lexer's HandleCR/HandleLF
# and their result is returned; for any other character `pos` comes back
# unchanged. Callers in this file reload their local `buf` from `c.buf`
# afterwards (presumably because the handlers may refill the buffer).
#
# The handlers are qualified with `nimlexbase`, the module this file
# actually imports; the former `lexbase.` qualifier named a module that is
# not imported here.
proc HandleCRLF(c: var TCfgParser, pos: int): int =
  case c.buf[pos]
  of CR: result = nimlexbase.HandleCR(c, pos)
  of LF: result = nimlexbase.HandleLF(c, pos)
  else: result = pos

# Lexes a string literal into `tok`. On entry `c.bufPos` must point at the
# opening double quote.
#
# Two forms are recognized:
# * a long literal delimited by three double quotes: a line break directly
#   after the opening quotes is skipped, every later line break is stored
#   as "\n", and no escape processing takes place;
# * an ordinary literal: it must end on the same line; backslash escapes
#   are translated by getEscapedChar unless `rawMode` is true, in which
#   case backslashes are copied verbatim.
#
# `tok.kind` becomes tkSymbol on success. An unterminated literal yields
# tkInvalid and leaves `c.bufpos` on the offending CR/LF/EndOfFile
# character. In the long-literal case the position must not be advanced
# past the EndOfFile sentinel, so the three closing quotes are skipped
# only when they were actually seen.
proc getString(c: var TCfgParser, tok: var TToken, rawMode: bool) =
  var
    pos: int
    ch: char
    buf: cstring
  pos = c.bufPos + 1          # skip "
  buf = c.buf                 # put `buf` in a register
  tok.kind = tkSymbol
  if (buf[pos] == '\"') and (buf[pos + 1] == '\"'):
    # long string literal:
    inc(pos, 2)               # skip ""
    # skip leading newline:
    pos = HandleCRLF(c, pos)
    buf = c.buf
    while true:
      case buf[pos]
      of '\"':
        if (buf[pos + 1] == '\"') and (buf[pos + 2] == '\"'):
          inc(pos, 3)         # skip the three closing quotes
          break
        add(tok.literal, '\"')
        inc(pos)
      of CR, LF:
        pos = HandleCRLF(c, pos)
        buf = c.buf
        tok.literal = tok.literal & "\n"
      of nimlexbase.EndOfFile:
        # unterminated literal: stay on the sentinel so that the next
        # token is reported as end of file
        tok.kind = tkInvalid
        break
      else:
        add(tok.literal, buf[pos])
        inc(pos)
    c.bufpos = pos
  else:
    # ordinary string literal
    while true:
      ch = buf[pos]
      if ch == '\"':
        inc(pos)              # skip '"'
        break
      if ch in {CR, LF, nimlexbase.EndOfFile}:
        tok.kind = tkInvalid
        break
      if (ch == '\\') and not rawMode:
        # getEscapedChar works on c.bufPos, so sync the cursor both ways
        c.bufPos = pos
        getEscapedChar(c, tok)
        pos = c.bufPos
      else:
        add(tok.literal, ch)
        inc(pos)
    c.bufpos = pos

# Lexes a bare symbol into `tok`. The character at the current position is
# always taken, whatever it is; after that, characters are taken for as
# long as they belong to SymChars. `tok.kind` is set to tkSymbol.
proc getSymbol(c: var TCfgParser, tok: var TToken) =
  var
    cursor: int
    data: cstring
  cursor = c.bufpos
  data = c.buf
  # the first character is consumed unconditionally
  add(tok.literal, data[cursor])
  inc(cursor)
  while data[cursor] in SymChars:
    add(tok.literal, data[cursor])
    inc(cursor)
  c.bufpos = cursor
  tok.kind = tkSymbol

# Advances `c.bufpos` past everything that is not part of a token:
# spaces, tabs, line terminators and comments. A comment starts with '#'
# or ';' and runs up to (not including) the next CR, LF or EndOfFile.
#
# EndOfFile is qualified with `nimlexbase`, the module this file imports;
# the former `lexbase.` qualifier named a module that is not imported here.
proc skip(c: var TCfgParser) =
  var
    buf: cstring
    pos: int
  pos = c.bufpos
  buf = c.buf
  while true:
    case buf[pos]
    of ' ', Tabulator:
      inc(pos)
    of '#', ';':
      # comment: run to the end of the line; the terminator itself is
      # handled by the next loop iteration
      while not (buf[pos] in {CR, LF, nimlexbase.EndOfFile}): inc(pos)
    of CR, LF:
      pos = HandleCRLF(c, pos)
      buf = c.buf             # reload after the base lexer handled the line
    else:
      break                   # EndOfFile also leaves the loop
  c.bufpos = pos

# Reads the next token from the input into `tok`.
# Whitespace and comments are skipped first. The token kind is chosen from
# the first significant character:
#   '='        tkEquals
#   '-'        tkDashDash (a second '-' is consumed if present; a single
#              '-' is accepted too and also reported with literal "--")
#   ':'        tkColon
#   '[' / ']'  tkBracketLe / tkBracketRi
#   '"'        string literal with escape processing (getString)
#   r" or R"   raw string literal (getString with rawMode = true)
#   EndOfFile  tkEof; the position is not advanced
#   otherwise  a bare symbol (getSymbol)
#
# EndOfFile is qualified with `nimlexbase`, the module this file imports;
# the former `lexbase.` qualifier named a module that is not imported here.
proc rawGetTok(c: var TCfgParser, tok: var TToken) =
  tok.kind = tkInvalid
  setlen(tok.literal, 0)
  skip(c)
  case c.buf[c.bufpos]
  of '=':
    tok.kind = tkEquals
    inc(c.bufpos)
    tok.literal = "="
  of '-':
    inc(c.bufPos)
    if c.buf[c.bufPos] == '-': inc(c.bufPos)
    tok.kind = tkDashDash
    tok.literal = "--"
  of ':':
    tok.kind = tkColon
    inc(c.bufpos)
    tok.literal = ":"
  of 'r', 'R':
    if c.buf[c.bufPos + 1] == '\"':
      inc(c.bufPos)           # skip the 'r'; getString expects the quote
      getString(c, tok, true)
    else:
      getSymbol(c, tok)
  of '[':
    tok.kind = tkBracketLe
    inc(c.bufpos)
    tok.literal = "["
  of ']':
    tok.kind = tkBracketRi
    inc(c.bufpos)
    tok.literal = "]"
  of '\"':
    getString(c, tok, false)
  of nimlexbase.EndOfFile:
    tok.kind = tkEof
  else: getSymbol(c, tok)

# Builds an error message of the form "file(line, column) Error: msg"
# from the parser's current position.
proc errorStr(c: TCfgParser, msg: string): string =
  var lineText = $(getLine(c))
  var colText = $(getColumn(c))
  result = "$1($2, $3) Error: $4" % [c.filename, lineText, colText, msg]

# Parses `key`, `key.sub.sub`, `key = value` or `key : value`, starting at
# the current lookahead token, and returns an event of the given `kind`
# (the callers in this file pass cfgKeyValuePair or cfgOption).
#
# * The key must be a symbol; dotted continuations are joined with '.'.
# * The value part is optional; without it `value` is "".
# * On a syntax error the returned event has kind cfgError and `msg` set.
#
# Once the event has become a cfgError, the `value` field is no longer
# written: it belongs to the key/value branch of the variant object, and
# touching it after the kind was switched accessed an inactive branch
# (for example on input `a. = b`). The tokens consumed are the same as
# before, so error recovery is unaffected.
proc getKeyValPair(c: var TCfgParser, kind: TCfgEventKind): TCfgEvent =
  if c.tok.kind == tkSymbol:
    result.kind = kind
    result.key = c.tok.literal
    result.value = ""
    rawGetTok(c, c.tok)
    # dotted key: each "." must be followed by another symbol
    while c.tok.literal == ".":
      add(result.key, '.')
      rawGetTok(c, c.tok)
      if c.tok.kind == tkSymbol:
        add(result.key, c.tok.literal)
        rawGetTok(c, c.tok)
      else:
        result.kind = cfgError
        result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
        break
    if c.tok.kind in {tkEquals, tkColon}:
      rawGetTok(c, c.tok)
      if c.tok.kind != tkSymbol:
        result.kind = cfgError
        result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
      elif result.kind != cfgError:
        # only a still-valid key/value event owns a `value` field
        result.value = c.tok.literal
      rawGetTok(c, c.tok)
  else:
    result.kind = cfgError
    result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
    rawGetTok(c, c.tok)

# Returns the next parsing event. The decision is made from the current
# lookahead token `c.tok`:
#   tkEof        -> cfgEof (no token is consumed)
#   tkDashDash   -> an option, parsed by getKeyValPair
#   tkSymbol     -> a key/value pair, parsed by getKeyValPair
#   tkBracketLe  -> a section header `[name]`
#   anything else is reported as cfgError and the token is skipped
proc next(c: var TCfgParser): TCfgEvent =
  case c.tok.kind
  of tkInvalid, tkBracketRi, tkEquals, tkColon:
    # a token that cannot start any construct
    result.kind = cfgError
    result.msg = errorStr(c, "invalid token: " & c.tok.literal)
    rawGetTok(c, c.tok)
  of tkBracketLe:
    rawGetTok(c, c.tok)       # move on to the section name
    if c.tok.kind != tkSymbol:
      result.kind = cfgError
      result.msg = errorStr(c, "symbol expected, but found: " & c.tok.literal)
    else:
      result.kind = cfgSectionStart
      result.section = c.tok.literal
    rawGetTok(c, c.tok)       # move on to the expected ']'
    if c.tok.kind != tkBracketRi:
      result.kind = cfgError
      result.msg = errorStr(c, "\']\' expected, but found: " & c.tok.literal)
    else:
      rawGetTok(c, c.tok)
  of tkSymbol:
    result = getKeyValPair(c, cfgKeyValuePair)
  of tkDashDash:
    rawGetTok(c, c.tok)       # drop the dashes, the key follows
    result = getKeyValPair(c, cfgOption)
  of tkEof:
    result.kind = cfgEof