From c2b20516d33520b1d339b447ece32ade8625fefc Mon Sep 17 00:00:00 2001
From: Andreas Rumpf <rumpf_a@web.de>
Date: Fri, 3 Sep 2021 17:31:16 +0200
Subject: [PATCH] implemented Unicode operators (#18789)

* implemented Unicode operators; refs https://github.com/nim-lang/RFCs/issues/388

* bugfix

* better test

* arguably more elegant implementation

* Update changelog.md

Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com>

Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com>
---
 changelog.md                       |  7 +++
 compiler/lexer.nim                 | 80 +++++++++++++++++++++++++++---
 compiler/options.nim               |  3 +-
 doc/manual.rst                     | 19 +++++++
 tests/lexer/nim.cfg                |  1 +
 tests/lexer/tunicode_operators.nim | 14 ++++++
 6 files changed, 117 insertions(+), 7 deletions(-)
 create mode 100644 tests/lexer/nim.cfg
 create mode 100644 tests/lexer/tunicode_operators.nim

diff --git a/changelog.md b/changelog.md
index 10824cfc6b..bc4898c9d0 100644
--- a/changelog.md
+++ b/changelog.md
@@ -445,6 +445,13 @@ proc mysort(s: seq; cmp: proc(a, b: T): int) {.effectsOf: cmp.}
   To enable the new effect system, use --experimental:strictEffects.
 
 
+- Nim now supports a small subset of Unicode operators as operator symbols.
+  The supported symbols are: "∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔".
+  To enable this feature, use `--experimental:unicodeOperators`. Note that, due
+  to parser limitations, you **cannot** reliably enable this feature via a
+  pragma `{.experimental: "unicodeOperators".}`.
+
+
 ## Compiler changes
 
 - Added `--declaredLocs` to show symbol declaration location in messages.
diff --git a/compiler/lexer.nim b/compiler/lexer.nim
index c7a9aa47f5..ddf98661a8 100644
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -882,15 +882,66 @@ proc endOperator(L: var Lexer, tok: var Token, pos: int,
   else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
   L.bufpos = pos
 
+const
+  UnicodeOperatorStartChars = {'\226', '\194', '\195'}
+    # lead bytes of the UTF-8 encodings of the allowed Unicode operators
+    # ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔")
+
+type
+  UnicodeOprPred = enum
+    Mul, Add
+
+proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) =
+  template m(len): untyped = (int8(len), Mul)
+  template a(len): untyped = (int8(len), Add)
+  result = 0.m
+  case buf[pos]
+  of '\226':
+    if buf[pos+1] == '\136':
+      if buf[pos+2] == '\152': result = 3.m # ∘
+      elif buf[pos+2] == '\153': result = 3.m # ∙
+      elif buf[pos+2] == '\167': result = 3.m # ∧
+      elif buf[pos+2] == '\168': result = 3.a # ∨
+      elif buf[pos+2] == '\169': result = 3.m # ∩
+      elif buf[pos+2] == '\170': result = 3.a # ∪
+    elif buf[pos+1] == '\138':
+      if buf[pos+2] == '\147': result = 3.m # ⊓
+      elif buf[pos+2] == '\148': result = 3.a # ⊔
+      elif buf[pos+2] == '\149': result = 3.a # ⊕
+      elif buf[pos+2] == '\150': result = 3.a # ⊖
+      elif buf[pos+2] == '\151': result = 3.m # ⊗
+      elif buf[pos+2] == '\152': result = 3.m # ⊘
+      elif buf[pos+2] == '\153': result = 3.m # ⊙
+      elif buf[pos+2] == '\155': result = 3.m # ⊛
+      elif buf[pos+2] == '\158': result = 3.a # ⊞
+      elif buf[pos+2] == '\159': result = 3.a # ⊟
+      elif buf[pos+2] == '\160': result = 3.m # ⊠
+      elif buf[pos+2] == '\161': result = 3.m # ⊡
+    elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★
+  of '\194':
+    if buf[pos+1] == '\177': result = 2.a # ±
+  of '\195':
+    if buf[pos+1] == '\151': result = 2.m # ×
+  else:
+    discard
+
 proc getOperator(L: var Lexer, tok: var Token) =
   var pos = L.bufpos
   tokenBegin(tok, pos)
   var h: Hash = 0
   while true:
     var c = L.buf[pos]
-    if c notin OpChars: break
-    h = h !& ord(c)
-    inc(pos)
+    if c in OpChars:
+      h = h !& ord(c)
+      inc(pos)
+    elif c in UnicodeOperatorStartChars and unicodeOperators in L.config.features:
+      let oprLen = unicodeOprLen(L.buf, pos)[0]
+      if oprLen == 0: break
+      for i in 0..<oprLen:
+        h = h !& ord(L.buf[pos])
+        inc pos
+    else:
+      break
   endOperator(L, tok, pos, h)
   tokenEnd(tok, pos-1)
   # advance pos but don't store it in L.bufpos so the next token (which might
@@ -904,6 +955,9 @@ proc getOperator(L: var Lexer, tok: var Token) =
 
 proc getPrecedence*(tok: Token): int =
   ## Calculates the precedence of the given token.
+  const
+    MulPred = 9
+    PlusPred = 8
   case tok.tokType
   of tkOpr:
     let relevantChar = tok.ident.s[0]
@@ -917,13 +971,22 @@ proc getPrecedence*(tok: Token): int =
 
     case relevantChar
     of '$', '^': considerAsgn(10)
-    of '*', '%', '/', '\\': considerAsgn(9)
+    of '*', '%', '/', '\\': considerAsgn(MulPred)
     of '~': result = 8
-    of '+', '-', '|': considerAsgn(8)
+    of '+', '-', '|': considerAsgn(PlusPred)
     of '&': considerAsgn(7)
     of '=', '<', '>', '!': result = 5
     of '.': considerAsgn(6)
     of '?': result = 2
+    of UnicodeOperatorStartChars:
+      if tok.ident.s[^1] == '=':
+        result = 1
+      else:
+        let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0)
+        if len != 0:
+          result = if pred == Mul: MulPred else: PlusPred
+        else:
+          result = 2
     else: considerAsgn(2)
   of tkDiv, tkMod, tkShl, tkShr: result = 9
   of tkDotDot: result = 6
@@ -1167,10 +1230,15 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
   var c = L.buf[L.bufpos]
   tok.line = L.lineNumber
   tok.col = getColNumber(L, L.bufpos)
-  if c in SymStartChars - {'r', 'R'}:
+  if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars:
     getSymbol(L, tok)
   else:
     case c
+    of UnicodeOperatorStartChars:
+      if unicodeOperators in L.config.features and unicodeOprLen(L.buf, L.bufpos)[0] != 0:
+        getOperator(L, tok)
+      else:
+        getSymbol(L, tok)
     of '#':
       scanComment(L, tok)
     of '*':
diff --git a/compiler/options.nim b/compiler/options.nim
index 89a16a49cf..ea302aed63 100644
--- a/compiler/options.nim
+++ b/compiler/options.nim
@@ -205,7 +205,8 @@ type
     views,
     strictNotNil,
     overloadableEnums,
-    strictEffects
+    strictEffects,
+    unicodeOperators
 
   LegacyFeature* = enum
     allowSemcheckedAstModification,
diff --git a/doc/manual.rst b/doc/manual.rst
index 02c688968e..997253f16a 100644
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -700,6 +700,25 @@ contain a dot: `{..}` are the three tokens `{`:tok:, `..`:tok:, `}`:tok:
 and not the two tokens `{.`:tok:, `.}`:tok:.
 
 
+Unicode Operators
+-----------------
+
+Under the `--experimental:unicodeOperators` switch these Unicode operators are
+also parsed as operators::
+
+  ∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓   # same priority as * (multiplication)
+  ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔             # same priority as + (addition)
+
+
+If enabled, Unicode operators can be combined with non-Unicode operator
+symbols. The usual precedence extensions then apply; for example, `⊠=` is an
+assignment-like operator, just as `*=` is.
+
+No Unicode normalization step is performed.
+
+**Note**: Due to parser limitations, one **cannot** reliably enable this
+feature via a pragma `{.experimental: "unicodeOperators".}`.
+
 
 Syntax
 ======
diff --git a/tests/lexer/nim.cfg b/tests/lexer/nim.cfg
new file mode 100644
index 0000000000..f7a301a10c
--- /dev/null
+++ b/tests/lexer/nim.cfg
@@ -0,0 +1 @@
+--experimental:unicodeOperators
diff --git a/tests/lexer/tunicode_operators.nim b/tests/lexer/tunicode_operators.nim
new file mode 100644
index 0000000000..74fcbb763c
--- /dev/null
+++ b/tests/lexer/tunicode_operators.nim
@@ -0,0 +1,14 @@
+#{.experimental: "unicodeOperators".}
+
+proc `⊙`(x, y: int): int = x * y
+proc `⊙=`(x: var int, y: int) = x *= y
+
+proc `⊞++`(x, y: int): int = x + y
+
+var x = 45
+x ⊙= 9 ⊞++ 4 ⊙ 3
+
+var y = 45
+y *= 9 + 4 * 3
+
+assert x == y