unicode operator bugfixes (#18802)

This commit is contained in:
Andreas Rumpf
2021-09-04 17:49:27 +02:00
committed by GitHub
parent e8dad482a3
commit 5c85e480a6
2 changed files with 63 additions and 47 deletions

View File

@@ -838,50 +838,6 @@ proc getCharacter(L: var Lexer; tok: var Token) =
lexMessage(L, errGenerated, "missing closing ' for character literal")
tokenEndIgnore(tok, L.bufpos)
proc getSymbol(L: var Lexer, tok: var Token) =
var h: Hash = 0
var pos = L.bufpos
tokenBegin(tok, pos)
var suspicious = false
while true:
var c = L.buf[pos]
case c
of 'a'..'z', '0'..'9', '\x80'..'\xFF':
h = h !& ord(c)
inc(pos)
of 'A'..'Z':
c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
h = h !& ord(c)
inc(pos)
suspicious = true
of '_':
if L.buf[pos+1] notin SymChars:
lexMessage(L, errGenerated, "invalid token: trailing underscore")
break
inc(pos)
suspicious = true
else: break
tokenEnd(tok, pos-1)
h = !$h
tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
(tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
tok.tokType = tkSymbol
else:
tok.tokType = TokType(tok.ident.id + ord(tkSymbol))
if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}:
lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s)
L.bufpos = pos
proc endOperator(L: var Lexer, tok: var Token, pos: int,
hash: Hash) {.inline.} =
var h = !$hash
tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr
else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
L.bufpos = pos
const
UnicodeOperatorStartChars = {'\226', '\194', '\195'}
# the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ⊔")
@@ -925,6 +881,56 @@ proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) =
else:
discard
proc getSymbol(L: var Lexer, tok: var Token) =
var h: Hash = 0
var pos = L.bufpos
tokenBegin(tok, pos)
var suspicious = false
while true:
var c = L.buf[pos]
case c
of 'a'..'z', '0'..'9':
h = h !& ord(c)
inc(pos)
of 'A'..'Z':
c = chr(ord(c) + (ord('a') - ord('A'))) # toLower()
h = h !& ord(c)
inc(pos)
suspicious = true
of '_':
if L.buf[pos+1] notin SymChars:
lexMessage(L, errGenerated, "invalid token: trailing underscore")
break
inc(pos)
suspicious = true
of '\x80'..'\xFF':
if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and unicodeOprLen(L.buf, pos)[0] != 0:
break
else:
h = h !& ord(c)
inc(pos)
else: break
tokenEnd(tok, pos-1)
h = !$h
tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
if (tok.ident.id < ord(tokKeywordLow) - ord(tkSymbol)) or
(tok.ident.id > ord(tokKeywordHigh) - ord(tkSymbol)):
tok.tokType = tkSymbol
else:
tok.tokType = TokType(tok.ident.id + ord(tkSymbol))
if suspicious and {optStyleHint, optStyleError} * L.config.globalOptions != {}:
lintReport(L.config, getLineInfo(L), tok.ident.s.normalize, tok.ident.s)
L.bufpos = pos
proc endOperator(L: var Lexer, tok: var Token, pos: int,
hash: Hash) {.inline.} =
var h = !$hash
tok.ident = L.cache.getIdent(addr(L.buf[L.bufpos]), pos - L.bufpos, h)
if (tok.ident.id < oprLow) or (tok.ident.id > oprHigh): tok.tokType = tkOpr
else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
L.bufpos = pos
proc getOperator(L: var Lexer, tok: var Token) =
var pos = L.bufpos
tokenBegin(tok, pos)
@@ -1346,7 +1352,11 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
getNumber(L, tok)
let c = L.buf[L.bufpos]
if c in SymChars+{'_'}:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and
unicodeOprLen(L.buf, L.bufpos)[0] != 0:
discard
else:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
of '-':
if L.buf[L.bufpos+1] in {'0'..'9'} and
(L.bufpos-1 == 0 or L.buf[L.bufpos-1] in UnaryMinusWhitelist):
@@ -1357,7 +1367,11 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
getNumber(L, tok)
let c = L.buf[L.bufpos]
if c in SymChars+{'_'}:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
if c in UnicodeOperatorStartChars and unicodeOperators in L.config.features and
unicodeOprLen(L.buf, L.bufpos)[0] != 0:
discard
else:
lexMessage(L, errGenerated, "invalid token: no whitespace between number and identifier")
else:
getOperator(L, tok)
else:

View File

@@ -5,8 +5,10 @@ proc `⊙=`(x: var int, y: int) = x *= y
proc `⊞++`(x, y: int): int = x + y
const a = 9
var x = 45
x ⊙= 9 ⊞++ 4 ⊙ 3
x ⊙= a⊞++4⊙3
var y = 45
y *= 9 + 4 * 3