mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-28 17:04:41 +00:00
1378 lines
46 KiB
Nim
1378 lines
46 KiB
Nim
#
|
||
#
|
||
# The Nim Compiler
|
||
# (c) Copyright 2015 Andreas Rumpf
|
||
#
|
||
# See the file "copying.txt", included in this
|
||
# distribution, for details about the copyright.
|
||
#
|
||
|
||
# This lexer is handwritten for efficiency. I used an elegant buffering
|
||
# scheme which I have not seen anywhere else:
|
||
# We guarantee that a whole line is in the buffer. Thus only when scanning
|
||
# the \n or \r character we have to check whether we need to read in the next
|
||
# chunk. (\n or \r already need special handling for incrementing the line
|
||
# counter; choosing both \n and \r allows the lexer to properly read Unix,
|
||
# DOS or Macintosh text files, even when it is not the native format.
|
||
|
||
import
|
||
options, msgs, platform, idents, nimlexbase, llstream,
|
||
wordrecg, lineinfos, pathutils
|
||
|
||
import std/[hashes, parseutils, strutils]
|
||
|
||
when defined(nimPreviewSlimSystem):
|
||
import std/[assertions, formatfloat]
|
||
|
||
const
  # Character classes shared with the 'highlite' module (see note below).
  numChars*: set[char] = {'0'..'9', 'a'..'z', 'A'..'Z'} ## chars that may occur inside a numeric literal (incl. hex digits and suffix letters)
  SymChars*: set[char] = {'a'..'z', 'A'..'Z', '0'..'9', '\x80'..'\xFF'} ## continuation chars of an identifier; >= \x80 admits UTF-8 bytes
  SymStartChars*: set[char] = {'a'..'z', 'A'..'Z', '\x80'..'\xFF'} ## chars that may start an identifier (no digits)
  OpChars*: set[char] = {'+', '-', '*', '/', '\\', '<', '>', '!', '?', '^', '.',
    '|', '=', '%', '&', '$', '@', '~', ':'} ## ASCII chars that form operator tokens
  UnaryMinusWhitelist = {' ', '\t', '\n', '\r', ',', ';', '(', '[', '{'} ## chars that may precede a '-' for it to count as unary minus

# don't forget to update the 'highlite' module if these charsets should change
type
  TokType* = enum
    ## Every token kind the lexer can produce. The enum's ordinal layout is
    ## load-bearing: keyword lookup maps IdentCache ids onto the contiguous
    ## keyword range (see tokKeywordLow/tokKeywordHigh below).
    tkInvalid = "tkInvalid", tkEof = "[EOF]", # order is important here!
    tkSymbol = "tkSymbol", # keywords:
    # keywords must stay alphabetical and contiguous; they are addressed by
    # `TokType(ident.id + ord(tkSymbol))` in getSymbol.
    tkAddr = "addr", tkAnd = "and", tkAs = "as", tkAsm = "asm",
    tkBind = "bind", tkBlock = "block", tkBreak = "break", tkCase = "case", tkCast = "cast",
    tkConcept = "concept", tkConst = "const", tkContinue = "continue", tkConverter = "converter",
    tkDefer = "defer", tkDiscard = "discard", tkDistinct = "distinct", tkDiv = "div", tkDo = "do",
    tkElif = "elif", tkElse = "else", tkEnd = "end", tkEnum = "enum", tkExcept = "except", tkExport = "export",
    tkFinally = "finally", tkFor = "for", tkFrom = "from", tkFunc = "func",
    tkIf = "if", tkImport = "import", tkIn = "in", tkInclude = "include", tkInterface = "interface",
    tkIs = "is", tkIsnot = "isnot", tkIterator = "iterator",
    tkLet = "let",
    tkMacro = "macro", tkMethod = "method", tkMixin = "mixin", tkMod = "mod", tkNil = "nil", tkNot = "not", tkNotin = "notin",
    tkObject = "object", tkOf = "of", tkOr = "or", tkOut = "out",
    tkProc = "proc", tkPtr = "ptr", tkRaise = "raise", tkRef = "ref", tkReturn = "return",
    tkShl = "shl", tkShr = "shr", tkStatic = "static",
    tkTemplate = "template",
    tkTry = "try", tkTuple = "tuple", tkType = "type", tkUsing = "using",
    tkVar = "var", tkWhen = "when", tkWhile = "while", tkXor = "xor",
    tkYield = "yield", # end of keywords

    # literal tokens; ranges like tkIntLit..tkInt64Lit are used by `$`:
    tkIntLit = "tkIntLit", tkInt8Lit = "tkInt8Lit", tkInt16Lit = "tkInt16Lit",
    tkInt32Lit = "tkInt32Lit", tkInt64Lit = "tkInt64Lit",
    tkUIntLit = "tkUIntLit", tkUInt8Lit = "tkUInt8Lit", tkUInt16Lit = "tkUInt16Lit",
    tkUInt32Lit = "tkUInt32Lit", tkUInt64Lit = "tkUInt64Lit",
    tkFloatLit = "tkFloatLit", tkFloat32Lit = "tkFloat32Lit",
    tkFloat64Lit = "tkFloat64Lit", tkFloat128Lit = "tkFloat128Lit",
    tkStrLit = "tkStrLit", tkRStrLit = "tkRStrLit", tkTripleStrLit = "tkTripleStrLit",
    tkGStrLit = "tkGStrLit", tkGTripleStrLit = "tkGTripleStrLit", tkCharLit = "tkCharLit",
    tkCustomLit = "tkCustomLit",

    # punctuation; tkColon.. is also the anchor for operator-keyword mapping
    # in endOperator (oprLow offset):
    tkParLe = "(", tkParRi = ")", tkBracketLe = "[",
    tkBracketRi = "]", tkCurlyLe = "{", tkCurlyRi = "}",
    tkBracketDotLe = "[.", tkBracketDotRi = ".]",
    tkCurlyDotLe = "{.", tkCurlyDotRi = ".}",
    tkParDotLe = "(.", tkParDotRi = ".)",
    tkComma = ",", tkSemiColon = ";",
    tkColon = ":", tkColonColon = "::", tkEquals = "=",
    tkDot = ".", tkDotDot = "..", tkBracketLeColon = "[:",
    tkOpr, tkComment, tkAccent = "`",
    # these are fake tokens used by renderer.nim
    tkSpaces, tkInfixOpr, tkPrefixOpr, tkPostfixOpr, tkHideableStart, tkHideableEnd

  TokTypes* = set[TokType] ## convenience set type over token kinds
const
  weakTokens = {tkComma, tkSemiColon, tkColon,
                tkParRi, tkParDotRi, tkBracketRi, tkBracketDotRi,
                tkCurlyRi} # \
    # tokens that should not be considered for previousToken
  # Bounds of the contiguous keyword range inside TokType; getSymbol relies
  # on the keywords sitting directly after tkSymbol and before tkIntLit.
  tokKeywordLow* = succ(tkSymbol)
  tokKeywordHigh* = pred(tkIntLit)
type
  NumericalBase* = enum
    ## Base a numeric literal was written in.
    base10, # base10 is listed as the first element,
            # so that it is the correct default value
    base2, base8, base16

  TokenSpacing* = enum
    ## Whitespace information around a token (used by tooling/renderer).
    tsLeading, tsTrailing, tsEof

  Token* = object # a Nim token
    tokType*: TokType # the type of the token
    base*: NumericalBase # the numerical base; only valid for int
                         # or float literals
    spacing*: set[TokenSpacing] # spaces around token
    indent*: int # the indentation; != -1 if the token has been
                 # preceded with indentation
    ident*: PIdent # the parsed identifier
    iNumber*: BiggestInt # the parsed integer literal
    fNumber*: BiggestFloat # the parsed floating point literal
    literal*: string # the parsed (string) literal; and
                     # documentation comments are here too
    line*, col*: int # 1-based line / 0-based column of the token start
    when defined(nimpretty):
      offsetA*, offsetB*: int # used for pretty printing so that literals
                              # like 0b01 or r"\L" are unaffected
      commentOffsetA*, commentOffsetB*: int

  ErrorHandler* = proc (conf: ConfigRef; info: TLineInfo; msg: TMsgKind; arg: string)
    ## Optional hook that receives diagnostics instead of msgs.message.
  Lexer* = object of TBaseLexer
    ## Lexer state on top of the buffered base lexer (nimlexbase).
    fileIdx*: FileIndex
    indentAhead*: int # if > 0 an indentation has already been read
                      # this is needed because scanning comments
                      # needs so much look-ahead
    currLineIndent*: int
    errorHandler*: ErrorHandler # if nil, diagnostics go through msgs
    cache*: IdentCache # interned identifiers; shared with the parser
    when defined(nimsuggest):
      previousToken: TLineInfo # used to attach the cursor to the prior token
    tokenEnd*: TLineInfo
    previousTokenEnd*: TLineInfo
    config*: ConfigRef
proc getLineInfo*(L: Lexer, tok: Token): TLineInfo {.inline.} =
  ## Line information for the start of `tok` within `L`'s file; under
  ## nimpretty the source/comment byte offsets are carried over as well.
  result = newLineInfo(L.fileIdx, tok.line, tok.col)
  when defined(nimpretty):
    result.offsetA = tok.offsetA
    result.offsetB = tok.offsetB
    result.commentOffsetA = tok.commentOffsetA
    result.commentOffsetB = tok.commentOffsetB
proc isKeyword*(kind: TokType): bool =
  ## True when `kind` falls inside the contiguous keyword range of TokType.
  result = kind in tokKeywordLow..tokKeywordHigh
template ones(n): untyped =
  ## Bit mask with the lowest `n` bits set; helper for the UTF-8 encoder.
  (1 shl n) - 1
proc isNimIdentifier*(s: string): bool =
  ## True if `s` scans as a Nim identifier: a SymStartChars first char,
  ## followed by SymChars, where any '_' must be followed by a SymChars
  ## char (so "a__b" is rejected; a single trailing '_' passes this scan).
  if s.len == 0 or s[0] notin SymStartChars:
    return false
  var idx = 1
  while idx < s.len:
    if s[idx] == '_':
      inc idx                          # skip the underscore itself
      if idx >= s.len: break           # trailing '_' is accepted here
      if s[idx] notin SymChars:
        return false                   # '__' or '_<bad char>'
    elif s[idx] notin SymChars:
      return false
    inc idx
  result = true
proc `$`*(tok: Token): string =
  ## Human-readable rendering of a token: the numeric/string payload for
  ## literals, the token's display name for punctuation, else the identifier.
  case tok.tokType
  of tkIntLit..tkInt64Lit:
    result = $tok.iNumber
  of tkFloatLit..tkFloat64Lit:
    result = $tok.fNumber
  of tkInvalid, tkStrLit..tkCharLit, tkComment:
    result = tok.literal
  of tkParLe..tkColon, tkEof, tkAccent:
    result = $tok.tokType
  else:
    result = if tok.ident != nil: tok.ident.s else: ""
proc prettyTok*(tok: Token): string =
  ## Like `$`, but prefixes keyword tokens with "keyword " for diagnostics.
  if tok.tokType.isKeyword:
    result = "keyword " & tok.ident.s
  else:
    result = $tok
proc printTok*(conf: ConfigRef; tok: Token) =
  ## Debug helper: writes "line:col<TAB>kind rendering" through msgs.
  # xxx factor with toLocation
  let position = $tok.line & ":" & $tok.col
  msgWriteln(conf, position & "\t" & $tok.tokType & " " & $tok)
proc openLexer*(lex: var Lexer, fileIdx: FileIndex, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  ## Initialize `lex` to scan `inputstream`, resetting indentation state
  ## and wiring in the identifier cache and configuration.
  openBaseLexer(lex, inputstream)
  # honor the stream's starting line (e.g. for code embedded in other files):
  inc(lex.lineNumber, inputstream.lineOffset)
  lex.fileIdx = fileIdx
  lex.cache = cache
  lex.config = config
  lex.indentAhead = -1   # no buffered indentation yet
  lex.currLineIndent = 0
  when defined(nimsuggest):
    lex.previousToken.fileIndex = fileIdx
proc openLexer*(lex: var Lexer, filename: AbsoluteFile, inputstream: PLLStream;
                cache: IdentCache; config: ConfigRef) =
  ## Convenience overload: resolves `filename` to a FileIndex first.
  let idx = fileInfoIdx(config, filename)
  openLexer(lex, idx, inputstream, cache, config)
proc closeLexer*(lex: var Lexer) =
  ## Tear down the lexer; folds the scanned line count into the global
  ## compilation statistics when a config is attached.
  let conf = lex.config
  if conf != nil:
    inc(conf.linesCompiled, lex.lineNumber)
  closeBaseLexer(lex)
proc getLineInfo(L: Lexer): TLineInfo =
  ## Line information for the lexer's current scan position.
  let col = getColNumber(L, L.bufpos)
  result = newLineInfo(L.fileIdx, L.lineNumber, col)
proc dispMessage(L: Lexer; info: TLineInfo; msg: TMsgKind; arg: string) =
  ## Dispatch a diagnostic: through the caller-installed errorHandler when
  ## present, otherwise through the standard msgs machinery.
  if L.errorHandler != nil:
    L.errorHandler(L.config, info, msg, arg)
  else:
    msgs.message(L.config, info, msg, arg)
proc lexMessage*(L: Lexer, msg: TMsgKind, arg = "") =
  ## Report `msg` at the lexer's current scan position.
  dispMessage(L, getLineInfo(L), msg, arg)
proc lexMessageTok*(L: Lexer, msg: TMsgKind, tok: Token, arg = "") =
  ## Report `msg` at the position of `tok` instead of the scan position.
  let info = newLineInfo(L.fileIdx, tok.line, tok.col)
  dispMessage(L, info, msg, arg)
proc lexMessagePos(L: var Lexer, msg: TMsgKind, pos: int, arg = "") =
  ## Report `msg` at buffer offset `pos`, translated to a column on the
  ## current line.
  let info = newLineInfo(L.fileIdx, L.lineNumber, pos - L.lineStart)
  dispMessage(L, info, msg, arg)
proc matchTwoChars(L: Lexer, first: char, second: set[char]): bool =
  ## True when the current buffer char equals `first` and the char right
  ## after it is in `second`. Does not advance the buffer.
  if L.buf[L.bufpos] != first:
    return false
  result = L.buf[L.bufpos + 1] in second
template tokenBegin(tok, pos) {.dirty.} =
  ## Marks the start of a token. Being {.dirty.}, it injects `colA` into the
  ## caller's scope for the matching tokenEnd* template to consume.
  when defined(nimsuggest):
    var colA = getColNumber(L, pos)
  when defined(nimpretty):
    tok.offsetA = L.offsetBase + pos
template tokenEnd(tok, pos) {.dirty.} =
  ## Marks the end of a token; under nimsuggest, if the IDE's tracked cursor
  ## position falls inside [colA, colB] on this line, snaps the track column
  ## to the token start. Uses `colA` injected by tokenBegin.
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)+1
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.col = colA.int16
    colA = 0  # consume the mark so later tokenEnd* calls don't rematch
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template tokenEndIgnore(tok, pos) =
  ## Like tokenEnd, but when the tracked cursor falls inside this token
  ## (a string literal or comment) the track position is invalidated
  ## instead of attached — suggestions make no sense there.
  when defined(nimsuggest):
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos.fileIndex = trackPosInvalidFileIdx
      L.config.m.trackPos.line = 0'u16
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template tokenEndPrevious(tok, pos) =
  when defined(nimsuggest):
    # when we detect the cursor in whitespace, we attach the track position
    # to the token that came before that, but only if we haven't detected
    # the cursor in a string literal or comment:
    let colB = getColNumber(L, pos)
    if L.fileIdx == L.config.m.trackPos.fileIndex and L.config.m.trackPos.col in colA..colB and
        L.lineNumber == L.config.m.trackPos.line.int and L.config.ideCmd in {ideSug, ideCon}:
      L.config.m.trackPos = L.previousToken
      L.config.m.trackPosAttached = true
    colA = 0
  when defined(nimpretty):
    tok.offsetB = L.offsetBase + pos
template eatChar(L: var Lexer, t: var Token, replacementChar: char) =
  ## Append `replacementChar` to the token's literal and advance the buffer
  ## past the (possibly different) source char.
  add(t.literal, replacementChar)
  inc(L.bufpos)
template eatChar(L: var Lexer, t: var Token) =
  ## Append the current buffer char to the token's literal and advance.
  add(t.literal, L.buf[L.bufpos])
  inc(L.bufpos)
proc getNumber(L: var Lexer, result: var Token) =
  ## Scans a numeric literal (int/uint/float of any width, any base, or a
  ## custom literal like 123'u) starting at L.bufpos. Fills result's
  ## tokType, base, literal and iNumber/fNumber, and reports lexer errors
  ## for malformed literals. Advances L.bufpos past the literal + suffix.

  proc matchUnderscoreChars(L: var Lexer, tok: var Token, chars: set[char]): Natural =
    ## Consume a digit run from `chars`, allowing single interior
    ## underscores; returns the number of real digits consumed.
    var pos = L.bufpos # use registers for pos, buf
    result = 0
    while true:
      if L.buf[pos] in chars:
        tok.literal.add(L.buf[pos])
        inc(pos)
        inc(result)
      else:
        break
      if L.buf[pos] == '_':
        if L.buf[pos+1] notin chars:
          lexMessage(L, errGenerated,
            "only single underscores may occur in a token and token may not " &
            "end with an underscore: e.g. '1__1' and '1_' are invalid")
          break
        tok.literal.add('_')
        inc(pos)
    L.bufpos = pos

  proc matchChars(L: var Lexer, tok: var Token, chars: set[char]) =
    ## Consume a plain run of chars from `chars` (no underscore handling).
    var pos = L.bufpos # use registers for pos, buf
    while L.buf[pos] in chars:
      tok.literal.add(L.buf[pos])
      inc(pos)
    L.bufpos = pos

  proc lexMessageLitNum(L: var Lexer, msg: string, startpos: int, msgKind = errGenerated) =
    # Used to get slightly human friendlier err messages.
    # Re-scans the whole literal-ish text around `startpos` so the error
    # message shows the entire offending literal, then restores L.bufpos.
    const literalishChars = {'A'..'Z', 'a'..'z', '0'..'9', '_', '.', '\''}
    var msgPos = L.bufpos
    var t = Token(literal: "")
    L.bufpos = startpos # Use L.bufpos as pos because of matchChars
    matchChars(L, t, literalishChars)
    # We must verify +/- specifically so that we're not past the literal
    if L.buf[L.bufpos] in {'+', '-'} and
        L.buf[L.bufpos - 1] in {'e', 'E'}:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, literalishChars)
    if L.buf[L.bufpos] in literalishChars:
      t.literal.add(L.buf[L.bufpos])
      inc(L.bufpos)
      matchChars(L, t, {'0'..'9'})
    L.bufpos = msgPos
    lexMessage(L, msgKind, msg % t.literal)

  var
    xi: BiggestInt        # bit-pattern accumulator for non-decimal bases
    isBase10 = true
    numDigits = 0
  const
    # 'c', 'C' is deprecated
    baseCodeChars = {'X', 'x', 'o', 'b', 'B', 'c', 'C'}
    literalishChars = baseCodeChars + {'A'..'F', 'a'..'f', '0'..'9', '_', '\''}
    floatTypes = {tkFloatLit, tkFloat32Lit, tkFloat64Lit, tkFloat128Lit}
  result.tokType = tkIntLit # int literal until we know better
  result.literal = ""
  result.base = base10
  tokenBegin(result, L.bufpos)

  var isPositive = true
  if L.buf[L.bufpos] == '-':
    eatChar(L, result)
    isPositive = false

  let startpos = L.bufpos

  template setNumber(field, value) =
    # applies the sign recorded above
    field = (if isPositive: value else: -value)

  # First stage: find out base, make verifications, build token literal string
  # {'c', 'C'} is added for deprecation reasons to provide a clear error message
  if L.buf[L.bufpos] == '0' and L.buf[L.bufpos + 1] in baseCodeChars + {'c', 'C', 'O'}:
    isBase10 = false
    eatChar(L, result, '0')
    case L.buf[L.bufpos]
    of 'c', 'C':
      lexMessageLitNum(L,
                       "$1 will soon be invalid for oct literals; Use '0o' " &
                       "for octals. 'c', 'C' prefix",
                       startpos,
                       warnDeprecated)
      eatChar(L, result, 'c')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'O':
      lexMessageLitNum(L, "$1 is an invalid int literal; For octal literals " &
                          "use the '0o' prefix.", startpos)
    of 'x', 'X':
      eatChar(L, result, 'x')
      numDigits = matchUnderscoreChars(L, result, {'0'..'9', 'a'..'f', 'A'..'F'})
    of 'o':
      eatChar(L, result, 'o')
      numDigits = matchUnderscoreChars(L, result, {'0'..'7'})
    of 'b', 'B':
      eatChar(L, result, 'b')
      numDigits = matchUnderscoreChars(L, result, {'0'..'1'})
    else:
      internalError(L.config, getLineInfo(L), "getNumber")
    if numDigits == 0:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)
  else:
    # decimal: integer part, optional fraction, optional exponent
    discard matchUnderscoreChars(L, result, {'0'..'9'})
    if (L.buf[L.bufpos] == '.') and (L.buf[L.bufpos + 1] in {'0'..'9'}):
      result.tokType = tkFloatLit
      eatChar(L, result, '.')
      discard matchUnderscoreChars(L, result, {'0'..'9'})
    if L.buf[L.bufpos] in {'e', 'E'}:
      result.tokType = tkFloatLit
      eatChar(L, result)
      if L.buf[L.bufpos] in {'+', '-'}:
        eatChar(L, result)
      discard matchUnderscoreChars(L, result, {'0'..'9'})
  let endpos = L.bufpos

  # Second stage, find out if there's a datatype suffix and handle it
  var postPos = endpos

  if L.buf[postPos] in {'\'', 'f', 'F', 'd', 'D', 'i', 'I', 'u', 'U'}:
    let errPos = postPos
    var customLitPossible = false
    if L.buf[postPos] == '\'':
      inc(postPos)
      customLitPossible = true

    if L.buf[postPos] in SymChars:
      var suffix = newStringOfCap(10)
      while true:
        suffix.add L.buf[postPos]
        inc postPos
        if L.buf[postPos] notin SymChars+{'_'}: break
      let suffixAsLower = suffix.toLowerAscii
      case suffixAsLower
      of "f", "f32": result.tokType = tkFloat32Lit
      of "d", "f64": result.tokType = tkFloat64Lit
      of "f128": result.tokType = tkFloat128Lit
      of "i8": result.tokType = tkInt8Lit
      of "i16": result.tokType = tkInt16Lit
      of "i32": result.tokType = tkInt32Lit
      of "i64": result.tokType = tkInt64Lit
      of "u": result.tokType = tkUIntLit
      of "u8": result.tokType = tkUInt8Lit
      of "u16": result.tokType = tkUInt16Lit
      of "u32": result.tokType = tkUInt32Lit
      of "u64": result.tokType = tkUInt64Lit
      elif customLitPossible:
        # remember the position of the `'` so that the parser doesn't
        # have to reparse the custom literal:
        result.iNumber = len(result.literal)
        result.literal.add '\''
        result.literal.add suffix
        result.tokType = tkCustomLit
      else:
        lexMessageLitNum(L, "invalid number suffix: '$1'", errPos)
    else:
      lexMessageLitNum(L, "invalid number suffix: '$1'", errPos)

  # Is there still a literalish char awaiting? Then it's an error!
  if L.buf[postPos] in literalishChars or
      (L.buf[postPos] == '.' and L.buf[postPos + 1] in {'0'..'9'}):
    lexMessageLitNum(L, "invalid number: '$1'", startpos)

  if result.tokType != tkCustomLit:
    # Third stage, extract actual number
    L.bufpos = startpos # restore position
    var pos = startpos
    try:
      if (L.buf[pos] == '0') and (L.buf[pos + 1] in baseCodeChars):
        inc(pos, 2)
        xi = 0 # it is a base prefix

        case L.buf[pos - 1]
        of 'b', 'B':
          result.base = base2
          while pos < endpos:
            if L.buf[pos] != '_':
              xi = `shl`(xi, 1) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
        # 'c', 'C' is deprecated (a warning is issued elsewhere)
        of 'o', 'c', 'C':
          result.base = base8
          while pos < endpos:
            if L.buf[pos] != '_':
              xi = `shl`(xi, 3) or (ord(L.buf[pos]) - ord('0'))
            inc(pos)
        of 'x', 'X':
          result.base = base16
          while pos < endpos:
            case L.buf[pos]
            of '_':
              inc(pos)
            of '0'..'9':
              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('0'))
              inc(pos)
            of 'a'..'f':
              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('a') + 10)
              inc(pos)
            of 'A'..'F':
              xi = `shl`(xi, 4) or (ord(L.buf[pos]) - ord('A') + 10)
              inc(pos)
            else:
              break
        else:
          internalError(L.config, getLineInfo(L), "getNumber")

        # sign-extend / truncate the accumulated bit pattern to the target
        # width; float32/64 reinterpret the bits:
        case result.tokType
        of tkIntLit, tkInt64Lit: setNumber result.iNumber, xi
        of tkInt8Lit: setNumber result.iNumber, ashr(xi shl 56, 56)
        of tkInt16Lit: setNumber result.iNumber, ashr(xi shl 48, 48)
        of tkInt32Lit: setNumber result.iNumber, ashr(xi shl 32, 32)
        of tkUIntLit, tkUInt64Lit: setNumber result.iNumber, xi
        of tkUInt8Lit: setNumber result.iNumber, xi and 0xff
        of tkUInt16Lit: setNumber result.iNumber, xi and 0xffff
        of tkUInt32Lit: setNumber result.iNumber, xi and 0xffffffff
        of tkFloat32Lit:
          setNumber result.fNumber, (cast[ptr float32](addr(xi)))[]
          # note: this code is endian neutral!
          # XXX: Test this on big endian machine!
        of tkFloat64Lit, tkFloatLit:
          setNumber result.fNumber, (cast[ptr float64](addr(xi)))[]
        else: internalError(L.config, getLineInfo(L), "getNumber")

        # Bounds checks. Non decimal literals are allowed to overflow the range of
        # the datatype as long as their pattern don't overflow _bitwise_, hence
        # below checks of signed sizes against uint*.high is deliberate:
        # (0x80'u8 = 128, 0x80'i8 = -128, etc == OK)
        if result.tokType notin floatTypes:
          let outOfRange =
            case result.tokType
            of tkUInt8Lit, tkUInt16Lit, tkUInt32Lit: result.iNumber != xi
            of tkInt8Lit: (xi > BiggestInt(uint8.high))
            of tkInt16Lit: (xi > BiggestInt(uint16.high))
            of tkInt32Lit: (xi > BiggestInt(uint32.high))
            else: false

          if outOfRange:
            #echo "out of range num: ", result.iNumber, " vs ", xi
            lexMessageLitNum(L, "number out of range: '$1'", startpos)

      else:
        # base-10 path: parse from the collected literal text
        case result.tokType
        of floatTypes:
          result.fNumber = parseFloat(result.literal)
        of tkUInt64Lit, tkUIntLit:
          var iNumber: uint64 = uint64(0)
          var len: int = 0
          try:
            len = parseBiggestUInt(result.literal, iNumber)
          except ValueError:
            raise newException(OverflowDefect, "number out of range: " & result.literal)
          if len != result.literal.len:
            raise newException(ValueError, "invalid integer: " & result.literal)
          result.iNumber = cast[int64](iNumber)
        else:
          var iNumber: int64 = int64(0)
          var len: int = 0
          try:
            len = parseBiggestInt(result.literal, iNumber)
          except ValueError:
            raise newException(OverflowDefect, "number out of range: " & result.literal)
          if len != result.literal.len:
            raise newException(ValueError, "invalid integer: " & result.literal)
          result.iNumber = iNumber

        # Explicit bounds checks.
        let outOfRange =
          case result.tokType
          of tkInt8Lit: result.iNumber > int8.high or result.iNumber < int8.low
          of tkUInt8Lit: result.iNumber > BiggestInt(uint8.high) or result.iNumber < 0
          of tkInt16Lit: result.iNumber > int16.high or result.iNumber < int16.low
          of tkUInt16Lit: result.iNumber > BiggestInt(uint16.high) or result.iNumber < 0
          of tkInt32Lit: result.iNumber > int32.high or result.iNumber < int32.low
          of tkUInt32Lit: result.iNumber > BiggestInt(uint32.high) or result.iNumber < 0
          else: false

        if outOfRange:
          lexMessageLitNum(L, "number out of range: '$1'", startpos)

        # Promote int literal to int64? Not always necessary, but more consistent
        if result.tokType == tkIntLit:
          if result.iNumber > high(int32) or result.iNumber < low(int32):
            result.tokType = tkInt64Lit

    except ValueError:
      lexMessageLitNum(L, "invalid number: '$1'", startpos)
    except OverflowDefect, RangeDefect:
      lexMessageLitNum(L, "number out of range: '$1'", startpos)
  tokenEnd(result, postPos-1)
  L.bufpos = postPos
proc handleHexChar(L: var Lexer, xi: var int; position: range[0..4]) =
  ## Folds one hex digit at L.bufpos into `xi` (shift-left-4 + digit) and
  ## advances. `position` marks which digit of an escape is expected; a
  ## quote terminator is tolerated without error only past the first digits.
  template invalid() =
    lexMessage(L, errGenerated,
      "expected a hex digit, but found: " & L.buf[L.bufpos] &
      "; maybe prepend with 0")

  case L.buf[L.bufpos]
  of '0'..'9':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('0'))
    inc(L.bufpos)
  of 'a'..'f':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('a') + 10)
    inc(L.bufpos)
  of 'A'..'F':
    xi = (xi shl 4) or (ord(L.buf[L.bufpos]) - ord('A') + 10)
    inc(L.bufpos)
  of '"', '\'':
    if position <= 1: invalid()
    # do not progress the bufpos here.
    if position == 0: inc(L.bufpos)
  else:
    invalid()
    # Need to progress for `nim check`
    inc(L.bufpos)
proc handleDecChars(L: var Lexer, xi: var int) =
  ## Accumulates a run of decimal digits at L.bufpos into `xi` (base 10),
  ## advancing past each digit consumed.
  while L.buf[L.bufpos] in {'0'..'9'}:
    let digit = ord(L.buf[L.bufpos]) - ord('0')
    xi = xi * 10 + digit
    inc(L.bufpos)
proc addUnicodeCodePoint(s: var string, i: int) =
  ## Appends code point `i` to `s` as UTF-8, choosing the 1..6 byte encoding
  ## by magnitude. 5/6-byte forms cover values beyond standard Unicode.
  let i = cast[uint](i)
  # inlined toUTF-8 to avoid unicode and strutils dependencies.
  let pos = s.len
  if i <= 127:
    s.setLen(pos+1)
    s[pos+0] = chr(i)
  elif i <= 0x07FF:
    s.setLen(pos+2)
    s[pos+0] = chr((i shr 6) or 0b110_00000)
    s[pos+1] = chr((i and ones(6)) or 0b10_0000_00)
  elif i <= 0xFFFF:
    s.setLen(pos+3)
    s[pos+0] = chr(i shr 12 or 0b1110_0000)
    s[pos+1] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i and ones(6) or 0b10_0000_00)
  elif i <= 0x001FFFFF:
    s.setLen(pos+4)
    s[pos+0] = chr(i shr 18 or 0b1111_0000)
    s[pos+1] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i and ones(6) or 0b10_0000_00)
  elif i <= 0x03FFFFFF:
    s.setLen(pos+5)
    s[pos+0] = chr(i shr 24 or 0b111110_00)
    s[pos+1] = chr(i shr 18 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+4] = chr(i and ones(6) or 0b10_0000_00)
  elif i <= 0x7FFFFFFF:
    s.setLen(pos+6)
    s[pos+0] = chr(i shr 30 or 0b1111110_0)
    s[pos+1] = chr(i shr 24 and ones(6) or 0b10_0000_00)
    s[pos+2] = chr(i shr 18 and ones(6) or 0b10_0000_00)
    s[pos+3] = chr(i shr 12 and ones(6) or 0b10_0000_00)
    s[pos+4] = chr(i shr 6 and ones(6) or 0b10_0000_00)
    s[pos+5] = chr(i and ones(6) or 0b10_0000_00)
proc getEscapedChar(L: var Lexer, tok: var Token) =
  ## Scans one backslash escape at L.bufpos (which points at the '\') and
  ## appends the decoded char(s) to tok.literal. \p and \u are rejected
  ## inside character literals; decimal/hex/unicode escapes are validated.
  inc(L.bufpos) # skip '\'
  case L.buf[L.bufpos]
  of 'n', 'N':
    tok.literal.add('\L')
    inc(L.bufpos)
  of 'p', 'P':
    # platform-specific newline (from the target config)
    if tok.tokType == tkCharLit:
      lexMessage(L, errGenerated, "\\p not allowed in character literal")
    tok.literal.add(L.config.target.tnl)
    inc(L.bufpos)
  of 'r', 'R', 'c', 'C':
    tok.literal.add(CR)
    inc(L.bufpos)
  of 'l', 'L':
    tok.literal.add(LF)
    inc(L.bufpos)
  of 'f', 'F':
    tok.literal.add(FF)
    inc(L.bufpos)
  of 'e', 'E':
    tok.literal.add(ESC)
    inc(L.bufpos)
  of 'a', 'A':
    tok.literal.add(BEL)
    inc(L.bufpos)
  of 'b', 'B':
    tok.literal.add(BACKSPACE)
    inc(L.bufpos)
  of 'v', 'V':
    tok.literal.add(VT)
    inc(L.bufpos)
  of 't', 'T':
    tok.literal.add('\t')
    inc(L.bufpos)
  of '\'', '\"':
    tok.literal.add(L.buf[L.bufpos])
    inc(L.bufpos)
  of '\\':
    tok.literal.add('\\')
    inc(L.bufpos)
  of 'x', 'X':
    # \xHH: exactly two hex digits
    inc(L.bufpos)
    var xi = 0
    handleHexChar(L, xi, 1)
    handleHexChar(L, xi, 2)
    tok.literal.add(chr(xi))
  of 'u', 'U':
    # \uHHHH or \u{H+}: encoded as UTF-8 into the literal
    if tok.tokType == tkCharLit:
      lexMessage(L, errGenerated, "\\u not allowed in character literal")
    inc(L.bufpos)
    var xi = 0
    if L.buf[L.bufpos] == '{':
      inc(L.bufpos)
      var start = L.bufpos
      while L.buf[L.bufpos] != '}':
        handleHexChar(L, xi, 0)
      if start == L.bufpos:
        lexMessage(L, errGenerated,
          "Unicode codepoint cannot be empty")
      inc(L.bufpos)
      if xi > 0x10FFFF:
        let hex = ($L.buf)[start..L.bufpos-2]
        lexMessage(L, errGenerated,
          "Unicode codepoint must be lower than 0x10FFFF, but was: " & hex)
    else:
      handleHexChar(L, xi, 1)
      handleHexChar(L, xi, 2)
      handleHexChar(L, xi, 3)
      handleHexChar(L, xi, 4)
    addUnicodeCodePoint(tok.literal, xi)
  of '0'..'9':
    # \DDD decimal escape; multi-digit forms starting with 0 get a warning
    if matchTwoChars(L, '0', {'0'..'9'}):
      lexMessage(L, warnOctalEscape)
    var xi = 0
    handleDecChars(L, xi)
    if (xi <= 255): tok.literal.add(chr(xi))
    else: lexMessage(L, errGenerated, "invalid character constant")
  else: lexMessage(L, errGenerated, "invalid character constant")
proc handleCRLF(L: var Lexer, pos: int): int =
  ## Advances over a CR or LF at `pos` via the base lexer (which refills the
  ## buffer and bumps the line counter); returns `pos` unchanged otherwise.
  template registerLine =
    # NOTE(review): `col` is computed but not used here — presumably a
    # leftover hook for line-length checking; confirm before removing.
    let col = L.getColNumber(pos)

  case L.buf[pos]
  of CR:
    registerLine()
    result = nimlexbase.handleCR(L, pos)
  of LF:
    registerLine()
    result = nimlexbase.handleLF(L, pos)
  else: result = pos
type
  StringMode = enum
    ## How a string literal is scanned: `normal` processes escapes,
    ## `raw` (r"...") and `generalized` (ident"...") do not.
    normal,
    raw,
    generalized
proc getString(L: var Lexer, tok: var Token, mode: StringMode) =
  ## Scans a string literal starting at the opening '"'. Produces
  ## tkTripleStrLit for \"\"\"...\"\"\" (raw, multi-line), otherwise
  ## tkStrLit/tkRStrLit; in non-normal modes "" inside means a literal '"'.
  var pos = L.bufpos
  var line = L.lineNumber # save linenumber for better error message
  tokenBegin(tok, pos - ord(mode == raw))
  inc pos # skip "
  if L.buf[pos] == '\"' and L.buf[pos+1] == '\"':
    tok.tokType = tkTripleStrLit # long string literal:
    inc(pos, 2) # skip ""
    # skip leading newline:
    if L.buf[pos] in {' ', '\t'}:
      var newpos = pos+1
      while L.buf[newpos] in {' ', '\t'}: inc newpos
      if L.buf[newpos] in {CR, LF}: pos = newpos
    pos = handleCRLF(L, pos)
    while true:
      case L.buf[pos]
      of '\"':
        # a run of exactly three quotes ends the literal; the longest-match
        # rule (4th char not '"') lets """" contain quotes near the end
        if L.buf[pos+1] == '\"' and L.buf[pos+2] == '\"' and
            L.buf[pos+3] != '\"':
          tokenEndIgnore(tok, pos+2)
          L.bufpos = pos + 3 # skip the three """
          break
        tok.literal.add('\"')
        inc(pos)
      of CR, LF:
        tokenEndIgnore(tok, pos)
        pos = handleCRLF(L, pos)
        tok.literal.add("\n")   # normalize all line endings to \n
      of nimlexbase.EndOfFile:
        tokenEndIgnore(tok, pos)
        # report at the line where the literal began, then restore:
        var line2 = L.lineNumber
        L.lineNumber = line
        lexMessagePos(L, errGenerated, L.lineStart, "closing \"\"\" expected, but end of file reached")
        L.lineNumber = line2
        L.bufpos = pos
        break
      else:
        tok.literal.add(L.buf[pos])
        inc(pos)
  else:
    # ordinary string literal
    if mode != normal: tok.tokType = tkRStrLit
    else: tok.tokType = tkStrLit
    while true:
      let c = L.buf[pos]
      if c == '\"':
        if mode != normal and L.buf[pos+1] == '\"':
          inc(pos, 2)
          tok.literal.add('"')  # "" escapes a quote in raw/generalized mode
        else:
          tokenEndIgnore(tok, pos)
          inc(pos) # skip '"'
          break
      elif c in {CR, LF, nimlexbase.EndOfFile}:
        tokenEndIgnore(tok, pos)
        lexMessage(L, errGenerated, "closing \" expected")
        break
      elif (c == '\\') and mode == normal:
        L.bufpos = pos
        getEscapedChar(L, tok)
        pos = L.bufpos
      else:
        tok.literal.add(c)
        inc(pos)
    L.bufpos = pos
proc getCharacter(L: var Lexer; tok: var Token) =
  ## Scans a character literal 'c' (possibly with an escape). If no closing
  ## quote follows and the literal was preceded by a backtick, backtracks
  ## and treats it as the quoted operator `'` instead.
  tokenBegin(tok, L.bufpos)
  let startPos = L.bufpos
  inc(L.bufpos) # skip '
  let c = L.buf[L.bufpos]
  case c
  of '\0'..pred(' '), '\'':
    lexMessage(L, errGenerated, "invalid character literal")
    tok.literal = $c
  of '\\': getEscapedChar(L, tok)
  else:
    tok.literal = $c
    inc(L.bufpos)
  if L.buf[L.bufpos] == '\'':
    tokenEndIgnore(tok, L.bufpos)
    inc(L.bufpos) # skip '
  else:
    if startPos > 0 and L.buf[startPos-1] == '`':
      # `'` in backticks: rewind to just past the quote
      tok.literal = "'"
      L.bufpos = startPos+1
    else:
      lexMessage(L, errGenerated, "missing closing ' for character literal")
    tokenEndIgnore(tok, L.bufpos)
const
  UnicodeOperatorStartChars = {'\226', '\194', '\195'}
  # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔")
  # all start with one of these.
  # (these are the leading bytes of their UTF-8 encodings)
type
  UnicodeOprPred = enum
    ## Precedence class of a unicode operator: multiplicative or additive.
    Mul, Add
proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) =
  ## Matches a recognized unicode operator at `buf[pos]` by its UTF-8 byte
  ## sequence; returns (byte length, precedence class), or length 0 when
  ## the bytes do not form one of the whitelisted operators.
  template m(len): untyped = (int8(len), Mul)
  template a(len): untyped = (int8(len), Add)
  result = 0.m
  case buf[pos]
  of '\226':
    # 3-byte sequences starting with E2
    if buf[pos+1] == '\136':
      if buf[pos+2] == '\152': result = 3.m # ∘
      elif buf[pos+2] == '\153': result = 3.m # ∙
      elif buf[pos+2] == '\167': result = 3.m # ∧
      elif buf[pos+2] == '\168': result = 3.a # ∨
      elif buf[pos+2] == '\169': result = 3.m # ∩
      elif buf[pos+2] == '\170': result = 3.a # ∪
    elif buf[pos+1] == '\138':
      if buf[pos+2] == '\147': result = 3.m # ⊓
      elif buf[pos+2] == '\148': result = 3.a # ⊔
      elif buf[pos+2] == '\149': result = 3.a # ⊕
      elif buf[pos+2] == '\150': result = 3.a # ⊖
      elif buf[pos+2] == '\151': result = 3.m # ⊗
      elif buf[pos+2] == '\152': result = 3.m # ⊘
      elif buf[pos+2] == '\153': result = 3.m # ⊙
      elif buf[pos+2] == '\155': result = 3.m # ⊛
      elif buf[pos+2] == '\158': result = 3.a # ⊞
      elif buf[pos+2] == '\159': result = 3.a # ⊟
      elif buf[pos+2] == '\160': result = 3.m # ⊠
      elif buf[pos+2] == '\161': result = 3.m # ⊡
    elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★
  of '\194':
    if buf[pos+1] == '\177': result = 2.a # ±
  of '\195':
    if buf[pos+1] == '\151': result = 2.m # ×
  else:
    discard
||
proc getSymbol(L: var Lexer, tok: var Token) =
  ## Scans an identifier/keyword. Hashes case-insensitively (sans '_') while
  ## scanning so the IdentCache lookup is a single pass; keyword ids map
  ## straight onto the keyword range of TokType. Flags mixed-case/underscore
  ## spellings of keywords for style checking.
  var h: Hash = 0
  var pos = L.bufpos
  tokenBegin(tok, pos)
  var suspicious = false
  while true:
    var c = L.buf[pos]
    case c
    of 'a'..'z', '0'..'9':
      h = h !& ord(c)
      inc(pos)
    of 'A'..'Z':
      c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
      h = h !& ord(c)
      inc(pos)
      suspicious = true
    of '_':
      if L.buf[pos+1] notin SymChars:
        lexMessage(L, errGenerated, "invalid token: trailing underscore")
        break
      inc(pos)   # underscores don't contribute to the hash
      suspicious = true
    of '\x80'..'\xFF':
      # stop before a unicode operator; other high bytes are ident chars
      if c in UnicodeOperatorStartChars and unicodeOprLen(L.buf, pos)[0] != 0:
        break
      else:
        h = h !& ord(c)
        inc(pos)
    else: break
  tokenEnd(tok, pos-1)
  h = !$h
  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, h)
  if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
      (tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
    tok.tokType = tkSymbol
  else:
    tok.tokType = TokType(tok.ident.id + ord(tkSymbol))
    if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}:
      lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s)
  L.bufpos = pos
proc endOperator(L: var Lexer, tok: var Token, pos: int,
                 hash: Hash) {.inline.} =
  ## Finalizes an operator token spanning `L.bufpos ..< pos`: interns the
  ## spelling under the finished `hash` and maps the well-known operators
  ## onto their dedicated token kinds; everything else becomes `tkOpr`.
  let finished = !$hash
  tok.ident = L.cache.getIdent(cast[cstring](addr(L.buf[L.bufpos])), pos - L.bufpos, finished)
  if tok.ident.id >= oprLow and tok.ident.id <= oprHigh:
    tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
  else:
    tok.tokType = tkOpr
  L.bufpos = pos
|
||
|
||
proc getOperator(L: var Lexer, tok: var Token) =
  ## Scans an operator composed of `OpChars` and/or the allowed unicode
  ## operators. Afterwards it records in `tok.spacing` whether the operator
  ## is followed by spaces (`tsTrailing`) or by the end of the line/file
  ## (`tsEof`), which the parser uses for unary/binary disambiguation.
  var i = L.bufpos
  tokenBegin(tok, i)
  var hashVal: Hash = 0
  while true:
    let ch = L.buf[i]
    if ch in OpChars:
      hashVal = hashVal !& ord(ch)
      inc(i)
    elif ch in UnicodeOperatorStartChars:
      let oprLen = unicodeOprLen(L.buf, i)[0]
      if oprLen == 0: break
      # hash every byte of the multi-byte operator:
      for _ in 0..<oprLen:
        hashVal = hashVal !& ord(L.buf[i])
        inc i
    else:
      break
  endOperator(L, tok, i, hashVal)
  tokenEnd(tok, i-1)
  # advance i but don't store it in L.bufpos so the next token (which might
  # be an operator too) gets the preceding spaces:
  tok.spacing = tok.spacing - {tsTrailing, tsEof}
  var sawSpace = false
  while L.buf[i] == ' ':
    inc i
    sawSpace = true
  if L.buf[i] in {CR, LF, nimlexbase.EndOfFile}:
    tok.spacing.incl(tsEof)
  elif sawSpace:
    tok.spacing.incl(tsTrailing)
|
||
|
||
proc getPrecedence*(tok: Token): int =
  ## Calculates the precedence of the given token.
  ## For `tkOpr` the precedence is derived from the operator's first
  ## character; operators ending in '=' (assignment-like) drop to 1 and
  ## arrow-like operators (ending in `->`, `~>` or `=>`) have precedence 0.
  ## Keyword operators have fixed levels; any other token yields -10.
  const
    MulPred = 9
    PlusPred = 8
  case tok.tokType
  of tkOpr:
    let firstChar = tok.ident.s[0]

    # arrow like?
    if tok.ident.s.len > 1 and tok.ident.s[^1] == '>' and
        tok.ident.s[^2] in {'-', '~', '='}: return 0

    # a trailing '=' marks an assignment-like operator (except for the
    # comparison family handled explicitly below):
    let endsInEq = tok.ident.s[^1] == '='
    template considerAsgn(value: untyped) =
      result = if endsInEq: 1 else: value

    case firstChar
    of '$', '^': considerAsgn(10)
    of '*', '%', '/', '\\': considerAsgn(MulPred)
    of '~': result = 8
    of '+', '-', '|': considerAsgn(PlusPred)
    of '&': considerAsgn(7)
    of '=', '<', '>', '!': result = 5
    of '.': considerAsgn(6)
    of '?': result = 2
    of UnicodeOperatorStartChars:
      if endsInEq:
        result = 1
      else:
        let (oprLen, pred) = unicodeOprLen(cstring(tok.ident.s), 0)
        result =
          if oprLen == 0: 2
          elif pred == Mul: MulPred
          else: PlusPred
    else: considerAsgn(2)
  of tkDiv, tkMod, tkShl, tkShr: result = 9
  of tkDotDot: result = 6
  of tkIn, tkNotin, tkIs, tkIsnot, tkOf, tkAs, tkFrom: result = 5
  of tkAnd: result = 4
  of tkOr, tkXor, tkPtr, tkRef: result = 3
  else: return -10
|
||
|
||
proc skipMultiLineComment(L: var Lexer; tok: var Token; start: int;
                          isDoc: bool) =
  ## Consumes a multi line comment starting at buffer position `start`.
  ## `isDoc` selects the documentation flavor (`##[ ... ]##`) whose text is
  ## collected into `tok.literal` with the comment's baseline indentation
  ## stripped; plain `#[ ... ]#` comments are merely skipped (except under
  ## `nimpretty`, which keeps the raw text). Nesting of the same flavor is
  ## supported.
  var pos = start
  var toStrip = 0
  tokenBegin(tok, pos)
  # detect the amount of indentation:
  if isDoc:
    toStrip = getColNumber(L, pos)
    while L.buf[pos] == ' ':
      inc pos
      inc toStrip
    while L.buf[pos] in {CR, LF}: # skip blank lines
      pos = handleCRLF(L, pos)
      toStrip = 0
      while L.buf[pos] == ' ':
        inc pos
        inc toStrip
  var nesting = 0
  while true:
    case L.buf[pos]
    of '#':
      if isDoc:
        # only '##[' opens a nested doc comment; the '#' itself is content
        if L.buf[pos+1] == '#' and L.buf[pos+2] == '[':
          inc nesting
        tok.literal.add '#'
      elif L.buf[pos+1] == '[':
        inc nesting
      inc pos
    of ']':
      if isDoc:
        if L.buf[pos+1] == '#' and L.buf[pos+2] == '#':
          if nesting == 0:
            tokenEndIgnore(tok, pos+2)
            inc(pos, 3)
            break
          dec nesting
        tok.literal.add ']'
      elif L.buf[pos+1] == '#':
        if nesting == 0:
          tokenEndIgnore(tok, pos+1)
          inc(pos, 2)
          break
        dec nesting
      inc pos
    of CR, LF:
      tokenEndIgnore(tok, pos)
      pos = handleCRLF(L, pos)
      # strip leading whitespace:
      when defined(nimpretty): tok.literal.add "\L"
      if isDoc:
        when not defined(nimpretty): tok.literal.add "\n"
        # strip at most `toStrip` leading spaces of every continuation line:
        var c = toStrip
        while L.buf[pos] == ' ' and c > 0:
          inc pos
          dec c
    of nimlexbase.EndOfFile:
      # unterminated comment: report and give up
      tokenEndIgnore(tok, pos)
      lexMessagePos(L, errGenerated, pos, "end of multiline comment expected")
      break
    else:
      if isDoc or defined(nimpretty): tok.literal.add L.buf[pos]
      inc(pos)
  L.bufpos = pos
  when defined(nimpretty):
    tok.commentOffsetB = L.offsetBase + pos - 1
|
||
|
||
proc scanComment(L: var Lexer, tok: var Token) =
  ## Scans a documentation comment (`##` or `##[`) into `tok.literal`.
  ## Consecutive `##` lines are joined with "\n" and the comment's baseline
  ## indentation (determined from the first non-blank comment line) is
  ## stripped from continuation lines. Plain `#` comments never reach this
  ## proc: `skip` consumes them.
  var pos = L.bufpos
  tok.tokType = tkComment
  assert L.buf[pos+1] == '#'
  when defined(nimpretty):
    tok.commentOffsetA = L.offsetBase + pos

  if L.buf[pos+2] == '[':
    # '##[' — multi line doc comment
    skipMultiLineComment(L, tok, pos+3, true)
    return
  tokenBegin(tok, pos)
  inc(pos, 2)

  var toStrip = 0
  var stripInit = false

  while true:
    if not stripInit:  # find baseline indentation inside comment
      while L.buf[pos] == ' ':
        inc pos
        inc toStrip
      if L.buf[pos] in {CR, LF}:  # don't set toStrip in blank comment lines
        toStrip = 0
      else: # found first non-whitespace character
        stripInit = true
    # copy the rest of the comment line verbatim:
    while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
      tok.literal.add(L.buf[pos])
      inc(pos)
    tokenEndIgnore(tok, pos)
    pos = handleCRLF(L, pos)
    var indent = 0
    while L.buf[pos] == ' ':
      inc(pos)
      inc(indent)

    if L.buf[pos] == '#' and L.buf[pos+1] == '#':
      # adjacent '##' line continues the same comment token
      tok.literal.add "\n"
      inc(pos, 2)
      if stripInit:
        var c = toStrip
        while L.buf[pos] == ' ' and c > 0:
          inc pos
          dec c
    else:
      if L.buf[pos] > ' ':
        # remember the indentation for the token following the comment:
        L.indentAhead = indent
      tokenEndIgnore(tok, pos)
      break
  L.bufpos = pos
  when defined(nimpretty):
    tok.commentOffsetB = L.offsetBase + pos - 1
|
||
|
||
proc skip(L: var Lexer, tok: var Token) =
  ## Skips whitespace, line breaks and non-documentation comments before the
  ## next token. Records leading-space information in `tok.spacing`, the new
  ## line's indentation in `tok.indent`/`L.currLineIndent`, and reports tabs
  ## as errors. Stops at a `##` doc comment so it becomes a token of its own.
  ## Under `nimpretty` the skipped comments are captured into `tok` instead
  ## of being discarded.
  var pos = L.bufpos
  tokenBegin(tok, pos)
  tok.spacing.excl(tsLeading)
  when defined(nimpretty):
    var hasComment = false
    var commentIndent = L.currLineIndent
    tok.commentOffsetA = L.offsetBase + pos
    tok.commentOffsetB = tok.commentOffsetA
    tok.line = -1
  while true:
    case L.buf[pos]
    of ' ':
      inc(pos)
      tok.spacing.incl(tsLeading)
    of '\t':
      lexMessagePos(L, errGenerated, pos, "tabs are not allowed, use spaces instead")
      inc(pos)
    of CR, LF:
      tokenEndPrevious(tok, pos)
      pos = handleCRLF(L, pos)
      # measure the indentation of the fresh line; multi line comments may
      # be embedded in the indentation:
      var indent = 0
      while true:
        if L.buf[pos] == ' ':
          inc(pos)
          inc(indent)
        elif L.buf[pos] == '#' and L.buf[pos+1] == '[':
          when defined(nimpretty):
            hasComment = true
            if tok.line < 0:
              tok.line = L.lineNumber
              commentIndent = indent
          skipMultiLineComment(L, tok, pos+2, false)
          pos = L.bufpos
        else:
          break
      tok.spacing.excl(tsLeading)
      when defined(nimpretty):
        if L.buf[pos] == '#' and tok.line < 0: commentIndent = indent
      # a line that starts with real code (or a doc comment) establishes
      # the token's indentation:
      if L.buf[pos] > ' ' and (L.buf[pos] != '#' or L.buf[pos+1] == '#'):
        tok.indent = indent
        L.currLineIndent = indent
        break
    of '#':
      # do not skip documentation comment:
      if L.buf[pos+1] == '#': break
      when defined(nimpretty):
        hasComment = true
        if tok.line < 0:
          tok.line = L.lineNumber

      if L.buf[pos+1] == '[':
        skipMultiLineComment(L, tok, pos+2, false)
        pos = L.bufpos
      else:
        # single line '#' comment: skip to end of line
        tokenBegin(tok, pos)
        while L.buf[pos] notin {CR, LF, nimlexbase.EndOfFile}:
          when defined(nimpretty): tok.literal.add L.buf[pos]
          inc(pos)
        tokenEndIgnore(tok, pos+1)
        when defined(nimpretty):
          tok.commentOffsetB = L.offsetBase + pos + 1
    else:
      break # EndOfFile also leaves the loop
  tokenEndPrevious(tok, pos-1)
  L.bufpos = pos
  when defined(nimpretty):
    if hasComment:
      tok.commentOffsetB = L.offsetBase + pos - 1
      tok.tokType = tkComment
      tok.indent = commentIndent
|
||
|
||
proc rawGetTok*(L: var Lexer, tok: var Token) =
  ## Reads the next token from `L` into `tok`: skips whitespace and plain
  ## comments, records line/column/indentation, then dispatches on the first
  ## character to the specialized scanners (symbols/keywords, operators,
  ## string/char/number literals, punctuation). Under `nimsuggest` it also
  ## tracks token positions for cursor attachment.
  template atTokenEnd() {.dirty.} =
    when defined(nimsuggest):
      L.previousTokenEnd.line = L.tokenEnd.line
      L.previousTokenEnd.col = L.tokenEnd.col
      L.tokenEnd.line = tok.line.uint16
      L.tokenEnd.col = getColNumber(L, L.bufpos).int16
      # we attach the cursor to the last *strong* token
      if tok.tokType notin weakTokens:
        L.previousToken.line = tok.line.uint16
        L.previousToken.col = tok.col.int16

  reset(tok)
  # an indentation value may have been buffered by `scanComment`/`skip`:
  if L.indentAhead >= 0:
    tok.indent = L.indentAhead
    L.currLineIndent = L.indentAhead
    L.indentAhead = -1
  else:
    tok.indent = -1
  skip(L, tok)
  when defined(nimpretty):
    if tok.tokType == tkComment:
      L.indentAhead = L.currLineIndent
      return
  let c = L.buf[L.bufpos]
  tok.line = L.lineNumber
  tok.col = getColNumber(L, L.bufpos)
  # 'r'/'R' are excluded here because they may start a raw string literal:
  if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars:
    getSymbol(L, tok)
  else:
    case c
    of UnicodeOperatorStartChars:
      # a high byte is either a unicode operator or part of an identifier:
      if unicodeOprLen(L.buf, L.bufpos)[0] != 0:
        getOperator(L, tok)
      else:
        getSymbol(L, tok)
    of '#':
      scanComment(L, tok)
    of '*':
      # '*:' is unfortunately a special case, because it is two tokens in
      # 'var v*: int'.
      if L.buf[L.bufpos+1] == ':' and L.buf[L.bufpos+2] notin OpChars:
        var h = 0 !& ord('*')
        endOperator(L, tok, L.bufpos+1, h)
      else:
        getOperator(L, tok)
    of ',':
      tok.tokType = tkComma
      inc(L.bufpos)
    of 'r', 'R':
      # raw string literal r"..." vs. ordinary identifier:
      if L.buf[L.bufpos + 1] == '\"':
        inc(L.bufpos)
        getString(L, tok, raw)
      else:
        getSymbol(L, tok)
    of '(':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkParDotLe
        inc(L.bufpos)
      else:
        tok.tokType = tkParLe
        when defined(nimsuggest):
          if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col < L.config.m.trackPos.col and
              tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideCon:
            L.config.m.trackPos.col = tok.col.int16
    of ')':
      tok.tokType = tkParRi
      inc(L.bufpos)
    of '[':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkBracketDotLe
        inc(L.bufpos)
      elif L.buf[L.bufpos] == ':':
        tok.tokType = tkBracketLeColon
        inc(L.bufpos)
      else:
        tok.tokType = tkBracketLe
    of ']':
      tok.tokType = tkBracketRi
      inc(L.bufpos)
    of '.':
      when defined(nimsuggest):
        if L.fileIdx == L.config.m.trackPos.fileIndex and tok.col+1 == L.config.m.trackPos.col and
            tok.line == L.config.m.trackPos.line.int and L.config.ideCmd == ideSug:
          tok.tokType = tkDot
          L.config.m.trackPos.col = tok.col.int16
          inc(L.bufpos)
          atTokenEnd()
          return
      if L.buf[L.bufpos+1] == ']':
        tok.tokType = tkBracketDotRi
        inc(L.bufpos, 2)
      elif L.buf[L.bufpos+1] == '}':
        tok.tokType = tkCurlyDotRi
        inc(L.bufpos, 2)
      elif L.buf[L.bufpos+1] == ')':
        tok.tokType = tkParDotRi
        inc(L.bufpos, 2)
      else:
        getOperator(L, tok)
    of '{':
      inc(L.bufpos)
      if L.buf[L.bufpos] == '.' and L.buf[L.bufpos+1] != '.':
        tok.tokType = tkCurlyDotLe
        inc(L.bufpos)
      else:
        tok.tokType = tkCurlyLe
    of '}':
      tok.tokType = tkCurlyRi
      inc(L.bufpos)
    of ';':
      tok.tokType = tkSemiColon
      inc(L.bufpos)
    of '`':
      tok.tokType = tkAccent
      inc(L.bufpos)
    of '_':
      inc(L.bufpos)
      # a lone '_' is a valid symbol; '_' followed by symbol chars is invalid
      # here because identifiers may not *start* with '_':
      if L.buf[L.bufpos] notin SymChars+{'_'}:
        tok.tokType = tkSymbol
        tok.ident = L.cache.getIdent("_")
      else:
        tok.literal = $c
        tok.tokType = tkInvalid
        lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
    of '\"':
      # check for generalized raw string literal:
      let mode = if L.bufpos > 0 and L.buf[L.bufpos-1] in SymChars: generalized else: normal
      getString(L, tok, mode)
      if mode == generalized:
        # tkRStrLit -> tkGStrLit
        # tkTripleStrLit -> tkGTripleStrLit
        inc(tok.tokType, 2)
    of '\'':
      tok.tokType = tkCharLit
      getCharacter(L, tok)
      # NOTE(review): re-assignment after getCharacter looks redundant unless
      # getCharacter can change tok.tokType — confirm before removing.
      tok.tokType = tkCharLit
    of '0'..'9':
      getNumber(L, tok)
      let c = L.buf[L.bufpos]
      if c in SymChars+{'_'}:
        # a unicode operator directly after a number is fine; anything else
        # glued to the literal is an error:
        if c in UnicodeOperatorStartChars and
            unicodeOprLen(L.buf, L.bufpos)[0] != 0:
          discard
        else:
          lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
    of '-':
      if L.buf[L.bufpos+1] in {'0'..'9'} and
          (L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist):
        # x)-23 # binary minus
        # ,-23 # unary minus
        # \n-78 # unary minus? Yes.
        # =-3 # parsed as `=-` anyway
        getNumber(L, tok)
        let c = L.buf[L.bufpos]
        if c in SymChars+{'_'}:
          if c in UnicodeOperatorStartChars and
              unicodeOprLen(L.buf, L.bufpos)[0] != 0:
            discard
          else:
            lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
      else:
        getOperator(L, tok)
    else:
      if c in OpChars:
        getOperator(L, tok)
      elif c == nimlexbase.EndOfFile:
        tok.tokType = tkEof
        tok.indent = 0
      else:
        tok.literal = $c
        tok.tokType = tkInvalid
        lexMessage(L, errGenerated, "invalid token: " & c & " (\\" & $(ord(c)) & ')')
        inc(L.bufpos)
  atTokenEnd()
|
||
|
||
proc getIndentWidth*(fileIdx: FileIndex, inputstream: PLLStream;
                     cache: IdentCache; config: ConfigRef): int =
  ## Tokenizes `inputstream` until it finds the first indented token that
  ## directly follows a ':'/'=' or a `type`/`const`/`let`/`var`/`using`
  ## section keyword and returns that indentation; returns 0 when no such
  ## indentation is found before EOF.
  result = 0
  var lex: Lexer = default(Lexer)
  var tok: Token = default(Token)
  openLexer(lex, fileIdx, inputstream, cache, config)
  var previous = tkEof
  while tok.tokType != tkEof:
    rawGetTok(lex, tok)
    # only indentation introduced by a block opener counts:
    if tok.indent > 0 and
        previous in {tkColon, tkEquals, tkType, tkConst, tkLet, tkVar, tkUsing}:
      result = tok.indent
      break
    previous = tok.tokType
  closeLexer(lex)
|
||
|
||
proc getPrecedence*(ident: PIdent): int =
  ## assumes ident is binary operator already
  # map a keyword ident onto its keyword token, everything else is `tkOpr`,
  # then reuse the Token-based overload:
  let kind =
    if ident.id >= ord(tokKeywordLow) - ord(tkSymbol) and
        ident.id <= ord(tokKeywordHigh) - ord(tkSymbol):
      TokType(ident.id + ord(tkSymbol))
    else:
      tkOpr
  getPrecedence(Token(ident: ident, tokType: kind))
|