implemented Unicode operators (#18789)

* implemented Unicode operators; refs https://github.com/nim-lang/RFCs/issues/388 * bugfix * better test * arguably more elegant implementation * Update changelog.md Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com> Co-authored-by: konsumlamm <44230978+konsumlamm@users.noreply.github.com>
2026-01-24 05:20:48 +00:00 · 2021-09-03 17:31:16 +02:00
parent 905fae51f6
commit c2b20516d3
6 changed files with 117 additions and 7 deletions
--- a/changelog.md
+++ b/changelog.md
@@ -445,6 +445,13 @@ proc mysort(s: seq; cmp: proc(a, b: T): int) {.effectsOf: cmp.}
  To enable the new effect system, use --experimental:strictEffects.


+- Nim now supports a small subset of Unicode operators as operator symbols.
+  The supported symbols are: "∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔".
+  To enable this feature, use `--experimental:unicodeOperators`. Note that due
+  to parser limitations you **cannot** enable this feature via a
+  pragma `{.experimental: "unicodeOperators".}` reliably.
+
+
 ## Compiler changes

 - Added `--declaredLocs` to show symbol declaration location in messages.
--- a/compiler/lexer.nim
+++ b/compiler/lexer.nim
@@ -882,15 +882,66 @@ proc endOperator(L: var Lexer, tok: var Token, pos: int,
  else: tok.tokType = TokType(tok.ident.id - oprLow + ord(tkColon))
  L.bufpos = pos

+const
+  UnicodeOperatorStartChars = {'\226', '\194', '\195'} # UTF-8 lead bytes 0xE2, 0xC2, 0xC3
+    # the allowed unicode characters ("∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓ ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔")
+    # all start with one of these.
+
+type
+  UnicodeOprPred = enum # precedence class reported by `unicodeOprLen`
+    Mul, Add # `getPrecedence` maps Mul to MulPred and Add to PlusPred
+
+proc unicodeOprLen(buf: cstring; pos: int): (int8, UnicodeOprPred) = # (byte length, precedence class) of the Unicode operator at buf[pos]
+  template m(len): untyped = (int8(len), Mul) # shorthand: `len`-byte operator of class Mul
+  template a(len): untyped = (int8(len), Add) # shorthand: `len`-byte operator of class Add
+  result = 0.m # length 0 means "no supported operator here"; callers test the length against 0
+  case buf[pos]
+  of '\226': # lead byte 0xE2: every match below is a three-byte sequence
+    if buf[pos+1] == '\136': # second byte 0x88
+      if buf[pos+2] == '\152': result = 3.m # ∘
+      elif buf[pos+2] == '\153': result = 3.m # ∙
+      elif buf[pos+2] == '\167': result = 3.m # ∧
+      elif buf[pos+2] == '\168': result = 3.a # ∨
+      elif buf[pos+2] == '\169': result = 3.m # ∩
+      elif buf[pos+2] == '\170': result = 3.a # ∪
+    elif buf[pos+1] == '\138': # second byte 0x8A
+      if buf[pos+2] == '\147': result = 3.m # ⊓
+      elif buf[pos+2] == '\148': result = 3.a # ⊔
+      elif buf[pos+2] == '\149': result = 3.a # ⊕
+      elif buf[pos+2] == '\150': result = 3.a # ⊖
+      elif buf[pos+2] == '\151': result = 3.m # ⊗
+      elif buf[pos+2] == '\152': result = 3.m # ⊘
+      elif buf[pos+2] == '\153': result = 3.m # ⊙
+      elif buf[pos+2] == '\155': result = 3.m # ⊛
+      elif buf[pos+2] == '\158': result = 3.a # ⊞
+      elif buf[pos+2] == '\159': result = 3.a # ⊟
+      elif buf[pos+2] == '\160': result = 3.m # ⊠
+      elif buf[pos+2] == '\161': result = 3.m # ⊡
+    elif buf[pos+1] == '\152' and buf[pos+2] == '\133': result = 3.m # ★
+  of '\194': # lead byte 0xC2: two-byte sequence
+    if buf[pos+1] == '\177': result = 2.a # ±
+  of '\195': # lead byte 0xC3: two-byte sequence
+    if buf[pos+1] == '\151': result = 2.m # ×
+  else:
+    discard # any other lead byte: keep the default (0, Mul)
+
 proc getOperator(L: var Lexer, tok: var Token) =
  var pos = L.bufpos
  tokenBegin(tok, pos)
  var h: Hash = 0
  while true:
    var c = L.buf[pos]
-    if c notin OpChars: break
-    h = h !& ord(c)
-    inc(pos)
+    if c in OpChars:
+      h = h !& ord(c)
+      inc(pos)
+    elif c in UnicodeOperatorStartChars and unicodeOperators in L.config.features:
+      let oprLen = unicodeOprLen(L.buf, pos)[0]
+      if oprLen == 0: break
+      for i in 0..<oprLen:
+        h = h !& ord(L.buf[pos])
+        inc pos
+    else:
+      break
  endOperator(L, tok, pos, h)
  tokenEnd(tok, pos-1)
  # advance pos but don't store it in L.bufpos so the next token (which might
@@ -904,6 +955,9 @@ proc getOperator(L: var Lexer, tok: var Token) =

 proc getPrecedence*(tok: Token): int =
  ## Calculates the precedence of the given token.
+  const
+    MulPred = 9
+    PlusPred = 8
  case tok.tokType
  of tkOpr:
    let relevantChar = tok.ident.s[0]
@@ -917,13 +971,22 @@ proc getPrecedence*(tok: Token): int =

    case relevantChar
    of '$', '^': considerAsgn(10)
-    of '*', '%', '/', '\\': considerAsgn(9)
+    of '*', '%', '/', '\\': considerAsgn(MulPred)
    of '~': result = 8
-    of '+', '-', '|': considerAsgn(8)
+    of '+', '-', '|': considerAsgn(PlusPred)
    of '&': considerAsgn(7)
    of '=', '<', '>', '!': result = 5
    of '.': considerAsgn(6)
    of '?': result = 2
+    of UnicodeOperatorStartChars:
+      if tok.ident.s[^1] == '=':
+        result = 1
+      else:
+        let (len, pred) = unicodeOprLen(cstring(tok.ident.s), 0)
+        if len != 0:
+          result = if pred == Mul: MulPred else: PlusPred
+        else:
+          result = 2
    else: considerAsgn(2)
  of tkDiv, tkMod, tkShl, tkShr: result = 9
  of tkDotDot: result = 6
@@ -1167,10 +1230,15 @@ proc rawGetTok*(L: var Lexer, tok: var Token) =
  var c = L.buf[L.bufpos]
  tok.line = L.lineNumber
  tok.col = getColNumber(L, L.bufpos)
-  if c in SymStartChars - {'r', 'R'}:
+  if c in SymStartChars - {'r', 'R'} - UnicodeOperatorStartChars:
    getSymbol(L, tok)
  else:
    case c
+    of UnicodeOperatorStartChars:
+      if unicodeOperators in L.config.features and unicodeOprLen(L.buf, L.bufpos)[0] != 0:
+        getOperator(L, tok)
+      else:
+        getSymbol(L, tok)
    of '#':
      scanComment(L, tok)
    of '*':
--- a/compiler/options.nim
+++ b/compiler/options.nim
@@ -205,7 +205,8 @@ type
    views,
    strictNotNil,
    overloadableEnums,
-    strictEffects
+    strictEffects,
+    unicodeOperators

  LegacyFeature* = enum
    allowSemcheckedAstModification,
--- a/doc/manual.rst
+++ b/doc/manual.rst
@@ -700,6 +700,25 @@ contain a dot: `{..}` are the three tokens `{`:tok:, `..`:tok:, `}`:tok:
 and not the two tokens `{.`:tok:, `.}`:tok:.


+Unicode Operators
+-----------------
+
+Under the `--experimental:unicodeOperators` switch these Unicode operators are
+also parsed as operators::
+
+  ∙ ∘ × ★ ⊗ ⊘ ⊙ ⊛ ⊠ ⊡ ∩ ∧ ⊓   # same priority as * (multiplication)
+  ± ⊕ ⊖ ⊞ ⊟ ∪ ∨ ⊔             # same priority as + (addition)
+
+
+If enabled, Unicode operators can be combined with non-Unicode operator
+symbols. The usual precedence extensions then apply; for example, `⊠=` is an
+assignment-like operator, just like `*=` is.
+
+No Unicode normalization step is performed.
+
+**Note**: Due to parser limitations one **cannot** enable this feature via a
+pragma `{.experimental: "unicodeOperators".}` reliably.
+

 Syntax
 ======
--- a/tests/lexer/nim.cfg
+++ b/tests/lexer/nim.cfg
@@ -0,0 +1 @@
+--experimental:unicodeOperators
--- a/tests/lexer/tunicode_operators.nim
+++ b/tests/lexer/tunicode_operators.nim
@@ -0,0 +1,14 @@
+#{.experimental: "unicodeOperators".} # left disabled; tests/lexer/nim.cfg passes the switch instead
+
+proc `⊙`(x, y: int): int = x * y # Unicode stand-in for `*`
+proc `⊙=`(x: var int, y: int) = x *= y # Unicode stand-in for `*=`
+
+proc `⊞++`(x, y: int): int = x + y # Unicode operator followed by ASCII operator characters
+
+var x = 45
+x ⊙= 9 ⊞++ 4 ⊙ 3 # expected to group as x ⊙= (9 ⊞++ (4 ⊙ 3))
+
+var y = 45
+y *= 9 + 4 * 3 # ASCII reference expression with the same shape
+
+assert x == y # equal results show the Unicode operators got the matching precedences