From c2b20516d33520b1d339b447ece32ade8625fefc Mon Sep 17 00:00:00 2001 From: Andreas Rumpf Date: Fri, 3 Sep 2021 17:31:16 +0200 Subject: [PATCH] implemented Unicode operators (#18789) * implemented Unicode operators; refs https://github.com/nim-lang/RFCs/issues/388 * bugfix * better test * arguably more elegant implementation * Update changelog.md Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com> Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com> --- changelog.md | 7 +++ compiler/lexer.nim | 80 +++++++++++++++++++++++++++--- compiler/options.nim | 3 +- doc/manual.rst | 19 +++++++ tests/lexer/nim.cfg | 1 + tests/lexer/tunicode_operators.nim | 14 ++++++ 6 files changed, 117 insertions(+), 7 deletions(-) create mode 100644 tests/lexer/nim.cfg create mode 100644 tests/lexer/tunicode_operators.nim diff --git a/changelog.md b/changelog.md index 10824cfc6b..bc4898c9d0 100644 --- a/changelog.md +++ b/changelog.md @@ -445,6 +445,13 @@ proc mysort(s: seq; cmp: proc(a, b: T): int) {.effectsOf: cmp.} To enable the new effect system, use --experimental:strictEffects. +- Nim now supports a small subset of Unicode operators as operator symbols. + The supported symbols are: "∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔". + To enable this feature, use `--experimental:unicodeOperators`. Note that due + to parser limitations you **cannot** enable this feature via a + pragma `{.experimental: "unicodeOperators".}` reliably. + + ## Compiler changes - Added `--declaredLocs` to show symbol declaration location in messages. 
diff --git a/compiler/lexer.nim b/compiler/lexer.nim index c7a9aa47f5..ddf98661a8 100644 --- a/compiler/lexer.nim +++ b/compiler/lexer.nim @@ -882,15 +882,66 @@ proc endOperator(L: var Lexer, tok: var Token, pos: int, else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon)) L.bufpos = pos +const + UnicodeOperatorStartChars = {'\226', '\194', '\195'} + # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔") + # all start with one of these. + +type + UnicodeOprPred = enum + Mul, Add + +proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) = + template m(len): untyped = (int8(len), Mul) + template a(len): untyped = (int8(len), Add) + result = 0.m + case buf[pos] + of '\226': + if buf[pos+1] == '\136': + if buf[pos+2] == '\152': result = 3.m # ∘ + elif buf[pos+2] == '\153': result = 3.m # ∙ + elif buf[pos+2] == '\167': result = 3.m # ∧ + elif buf[pos+2] == '\168': result = 3.a # ∨ + elif buf[pos+2] == '\169': result = 3.m # ∩ + elif buf[pos+2] == '\170': result = 3.a # ∪ + elif buf[pos+1] == '\138': + if buf[pos+2] == '\147': result = 3.m # ⊓ + elif buf[pos+2] == '\148': result = 3.a # ⊔ + elif buf[pos+2] == '\149': result = 3.a # ⊕ + elif buf[pos+2] == '\150': result = 3.a # ⊖ + elif buf[pos+2] == '\151': result = 3.m # ⊗ + elif buf[pos+2] == '\152': result = 3.m # ⊘ + elif buf[pos+2] == '\153': result = 3.m # ⊙ + elif buf[pos+2] == '\155': result = 3.m # ⊛ + elif buf[pos+2] == '\158': result = 3.a # ⊞ + elif buf[pos+2] == '\159': result = 3.a # ⊟ + elif buf[pos+2] == '\160': result = 3.m # ⊠ + elif buf[pos+2] == '\161': result = 3.m # ⊡ + elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★ + of '\194': + if buf[pos+1] == '\177': result = 2.a # ± + of '\195': + if buf[pos+1] == '\151': result = 2.m # × + else: + discard + proc getOperator(L: var Lexer, tok: var Token) = var pos = L.bufpos tokenBegin(tok, pos) var h: Hash = 0 while true: var c = L.buf[pos] - if c notin OpChars: break - h = h !& ord(c) - 
inc(pos) + if c in OpChars: + h = h !& ord(c) + inc(pos) + elif c in UnicodeOperatorStartChars and unicodeOperators in L.config.features: + let oprLen = unicodeOprLen(L.buf, pos)[0] + if oprLen == 0: break + for i in 0..', '!': result = 5 of '.': considerAsgn(6) of '?': result = 2 + of UnicodeOperatorStartChars: + if tok.ident.s[^1] == '=': + result = 1 + else: + let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0) + if len != 0: + result = if pred == Mul: MulPred else: PlusPred + else: + result = 2 else: considerAsgn(2) of tkDiv, tkMod, tkShl, tkShr: result = 9 of tkDotDot: result = 6 @@ -1167,10 +1230,15 @@ proc rawGetTok*(L: var Lexer, tok: var Token) = var c = L.buf[L.bufpos] tok.line = L.lineNumber tok.col = getColNumber(L, L.bufpos) - if c in SymStartChars - {'r', 'R'}: + if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars: getSymbol(L, tok) else: case c + of UnicodeOperatorStartChars: + if unicodeOperators in L.config.features and unicodeOprLen(L.buf, L.bufpos)[0] != 0: + getOperator(L, tok) + else: + getSymbol(L, tok) of '#': scanComment(L, tok) of '*': diff --git a/compiler/options.nim b/compiler/options.nim index 89a16a49cf..ea302aed63 100644 --- a/compiler/options.nim +++ b/compiler/options.nim @@ -205,7 +205,8 @@ type views, strictNotNil, overloadableEnums, - strictEffects + strictEffects, + unicodeOperators LegacyFeature* = enum allowSemcheckedAstModification, diff --git a/doc/manual.rst b/doc/manual.rst index 02c688968e..997253f16a 100644 --- a/doc/manual.rst +++ b/doc/manual.rst @@ -700,6 +700,25 @@ contain a dot: `{..}` are the three tokens `{`:tok:, `..`:tok:, `}`:tok: and not the two tokens `{.`:tok:, `.}`:tok:. 
+Unicode Operators +----------------- + +Under the `--experimental:unicodeOperators` switch these Unicode operators are +also parsed as operators:: + + ∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ # same priority as * (multiplication) + ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔ # same priority as + (addition) + + +If enabled, Unicode operators can be combined with non-Unicode operator +symbols. The usual precedence extensions then apply; for example, `⊠=` is an +assignment-like operator just like `*=` is. + +No Unicode normalization step is performed. + +**Note**: Due to parser limitations one **cannot** enable this feature via a +pragma `{.experimental: "unicodeOperators".}` reliably. + Syntax ====== diff --git a/tests/lexer/nim.cfg b/tests/lexer/nim.cfg new file mode 100644 index 0000000000..f7a301a10c --- /dev/null +++ b/tests/lexer/nim.cfg @@ -0,0 +1 @@ +--experimental:unicodeOperators diff --git a/tests/lexer/tunicode_operators.nim b/tests/lexer/tunicode_operators.nim new file mode 100644 index 0000000000..74fcbb763c --- /dev/null +++ b/tests/lexer/tunicode_operators.nim @@ -0,0 +1,14 @@ +#{.experimental: "unicodeOperators".} + +proc `⊙`(x, y: int): int = x * y +proc `⊙=`(x: var int, y: int) = x *= y + +proc `⊞++`(x, y: int): int = x + y + +var x = 45 +x ⊙= 9 ⊞++ 4 ⊙ 3 + +var y = 45 +y *= 9 + 4 * 3 + +assert x == y