From 1138cf5234674e7942abc6bf94e88d798fb4d0e0 Mon Sep 17 00:00:00 2001
From: Hans Raaf <hara@oderwat.de>
Date: Wed, 25 Mar 2015 14:26:01 +0100
Subject: [PATCH 1/3] Some procs to deal with Rune position base indexing.

It can't be perfect but at least one can index on rune position
efficiently.
---
 lib/pure/unicode.nim | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index eeb1b607d6..7f44786e32 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -183,6 +183,25 @@ proc `$`*(runes: seq[Rune]): string =
   result = ""
   for rune in runes: result.add(rune.toUTF8)
 
+proc runeOffset*(s: string, pos:int): int =
+  ## Returns the byte position of unicode character at position in s
+  var
+    i = 0
+    o = 0
+  while i < pos:
+    o += runeLenAt(s, o)
+    inc i
+  o
+
+proc rune*(s: string, pos:int): Rune =
+  ## Returns the unicode character at position pos
+  fastRuneAt(s, runeOffset(s, pos), result, false)
+
+proc runeStr*(s: string, pos:int): string =
+  ## Returns the unicode character at position pos as UTF8 String
+  let o = runeOffset(s, pos)
+  s[o.. (o+runeLenAt(s, o)-1)]
+
 const
   alphaRanges = [
     0x00d8,  0x00f6,  #  -

From ac6de565ec82c5cdd3bbc3d90dc72836e985eca8 Mon Sep 17 00:00:00 2001
From: Hans Raaf <hara@oderwat.de>
Date: Fri, 27 Mar 2015 23:31:12 +0100
Subject: [PATCH 2/3] More work in optimizing, names and added substr().

This is a work in progress. I added a Unicode substring proc and tried
to handle edge cases more consistently.
---
 lib/pure/unicode.nim | 46 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 40 insertions(+), 6 deletions(-)

diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index 7f44786e32..586111e37a 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -183,25 +183,59 @@ proc `$`*(runes: seq[Rune]): string =
   result = ""
   for rune in runes: result.add(rune.toUTF8)
 
-proc runeOffset*(s: string, pos:int): int =
-  ## Returns the byte position of unicode character at position in s
+proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
+  ## Returns the byte position of unicode character
+  ## at position pos in s with an optional start byte position.
+  ## returns the special value -1 if it runs out of the string
   var
     i = 0
-    o = 0
+    o = start
   while i < pos:
     o += runeLenAt(s, o)
+    if o >= s.len:
+      return -1
+      #raise newException(IndexError, "Position out of bounds")
     inc i
-  o
+  return o
 
-proc rune*(s: string, pos:int): Rune =
+proc runeAtPos*(s: string, pos: int): Rune =
   ## Returns the unicode character at position pos
   fastRuneAt(s, runeOffset(s, pos), result, false)
 
-proc runeStr*(s: string, pos:int): string =
+proc runeStrAtPos*(s: string, pos: Natural): string =
   ## Returns the unicode character at position pos as UTF8 String
   let o = runeOffset(s, pos)
   s[o.. (o+runeLenAt(s, o)-1)]
 
+proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
+  ## Returns the UTF-8 substring starting at codepoint pos
+  ## with len codepoints. If pos or len is negativ they count from
+  ## the end of the string. If len is not given it means the longest
+  ## possible string. This reensembles how substr() in PHP works.
+  if pos < 0: 
+    # offset from the end could be optimized further
+    var o = runeLen(s) + pos
+    if o < 0: o = 0
+    result = runeSubStr(s, o, len)
+  else:
+    let o = runeOffset(s, pos)
+    if o < 0:
+      result = ""
+    elif len == int.high:
+      result = s[o.. s.len-1]
+    elif len < 0:
+      # offset from the end could be optimized further
+      let e = runeLen(s) + len
+      if e <= 0:
+        result = ""
+      else:
+        result = s[o.. runeOffset(s, e)-1]
+    else: 
+      var e = runeOffset(s, len, o)
+      if e < 0:
+        e = s.len
+      result = s[o.. e-1]
+
 const
   alphaRanges = [
     0x00d8,  0x00f6,  #  -

From 2791915d7fb06c1d5d3eb0b8356881ed5a12c120 Mon Sep 17 00:00:00 2001
From: Hans Raaf <hara@oderwat.de>
Date: Sat, 28 Mar 2015 00:56:09 +0100
Subject: [PATCH 3/3] Optimized end offsets and added tests.

I hope this also shows that there are use cases. I still think the user
should be warned about performance issues with these procs, so I added
such warnings to the doc comments.
---
 lib/pure/unicode.nim | 105 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 93 insertions(+), 12 deletions(-)

diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim
index 586111e37a..5d302c9dc4 100644
--- a/lib/pure/unicode.nim
+++ b/lib/pure/unicode.nim
@@ -187,6 +187,10 @@ proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
   ## Returns the byte position of unicode character
   ## at position pos in s with an optional start byte position.
   ## returns the special value -1 if it runs out of the string
+  ##
+  ## Beware: This can lead to unoptimized code and slow execution!
+  ## Most problems are solved more efficiently by using an iterator
+  ## or conversion to a seq of Rune.
   var
     i = 0
     o = start
@@ -194,29 +198,71 @@ proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
     o += runeLenAt(s, o)
     if o >= s.len:
       return -1
-      #raise newException(IndexError, "Position out of bounds")
     inc i
   return o
 
 proc runeAtPos*(s: string, pos: int): Rune =
   ## Returns the unicode character at position pos
+  ##
+  ## Beware: This can lead to unoptimized code and slow execution!
+  ## Most problems are solved more efficiently by using an iterator
+  ## or conversion to a seq of Rune.
   fastRuneAt(s, runeOffset(s, pos), result, false)
 
 proc runeStrAtPos*(s: string, pos: Natural): string =
   ## Returns the unicode character at position pos as UTF8 String
+  ##
+  ## Beware: This can lead to unoptimized code and slow execution!
+  ## Most problems are solved more efficiently by using an iterator
+  ## or conversion to a seq of Rune.
   let o = runeOffset(s, pos)
   s[o.. (o+runeLenAt(s, o)-1)]
 
-proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
+proc runeReverseOffset*(s: string, rev:Positive): (int, int) =
+  ## Returns a tuple with the byte offset of the
+  ## unicode character at position ``rev`` in s counting
+  ## from the end (starting with 1) and the total
+  ## number of runes in the string. Returns a negative value
+  ## for offset if there are too few runes in the string to
+  ## satisfy the request.
+  ##
+  ## Beware: This can lead to unoptimized code and slow execution!
+  ## Most problems are solved more efficiently by using an iterator
+  ## or conversion to a seq of Rune.
+  var
+    a = rev.int
+    o = 0
+    x = 0
+  while o < s.len:
+    o += runeLenAt(s, o)
+    # Once ``o`` is ``rev`` runes ahead, ``x`` trails it rune by rune,
+    # so ``x`` stops on the ``rev``-th rune from the end.
+    if a <= 0:
+      x += runeLenAt(s, x)
+    dec a
+  if a > 0:
+    return (-a, rev.int-a)
+  return (x, -a+rev.int)
+
+proc runeSubStr*(s: string, pos:int, len:int = int.high): string =
   ## Returns the UTF-8 substring starting at codepoint pos
   ## with len codepoints. If pos or len is negativ they count from
   ## the end of the string. If len is not given it means the longest
-  ## possible string. This reensembles how substr() in PHP works.
-  if pos < 0: 
-    # offset from the end could be optimized further
-    var o = runeLen(s) + pos
-    if o < 0: o = 0
-    result = runeSubStr(s, o, len)
+  ## possible string.
+  ##
+  ## (Needs some examples)
+  if pos < 0:
+    let (o, rl) = runeReverseOffset(s, -pos)
+    if len >= rl:
+      result = s[o.. s.len-1]
+    elif len < 0:
+      let e = rl + len
+      if e < 0:
+        result = ""
+      else:
+        result = s[o.. runeOffset(s, e-(rl+pos) , o)-1]
+    else:
+      result = s[o.. runeOffset(s, len, o)-1]
   else:
     let o = runeOffset(s, pos)
     if o < 0:
@@ -224,13 +270,13 @@ proc runeSubStr*(s: string, pos: int, len: int = int.high): string =
     elif len == int.high:
       result = s[o.. s.len-1]
     elif len < 0:
-      # offset from the end could be optimized further
-      let e = runeLen(s) + len
+      let (e, rl) = runeReverseOffset(s, -len)
+      discard rl
       if e <= 0:
         result = ""
       else:
-        result = s[o.. runeOffset(s, e)-1]
-    else: 
+        result = s[o.. e-1]
+    else:
       var e = runeOffset(s, len, o)
       if e < 0:
         e = s.len
@@ -1413,3 +1459,38 @@ when isMainModule:
   const test = "as⃝"
   doAssert lastRune(test, test.len-1)[1] == 3
   doAssert graphemeLen("è", 0) == 2
+
+  # test for rune positioning and runeSubStr()
+  let s = "Hänsel  ««: 10,00€"
+
+  doAssert(runeReverseOffset(s, 1) == (20, 18))
+  doAssert(runeReverseOffset(s, 19) == (-1, 18))
+
+  doAssert(runeStrAtPos(s, 0) == "H")
+  doAssert(runeSubStr(s, 0, 1) == "H")
+  doAssert(runeStrAtPos(s, 10) == ":")
+  doAssert(runeSubStr(s, 10, 1) == ":")
+  doAssert(runeStrAtPos(s, 9) == "«")
+  doAssert(runeSubStr(s, 9, 1) == "«")
+  doAssert(runeStrAtPos(s, 17) == "€")
+  doAssert(runeSubStr(s, 17, 1) == "€")
+  # echo runeStrAtPos(s, 18) # index error
+
+  doAssert(runeSubStr(s, 0) ==  "Hänsel  ««: 10,00€")
+  doAssert(runeSubStr(s, -18) ==  "Hänsel  ««: 10,00€")
+  doAssert(runeSubStr(s, 10) == ": 10,00€")
+  doAssert(runeSubStr(s, 18) == "")
+  doAssert(runeSubStr(s, 0, 10) == "Hänsel  ««")
+
+  doAssert(runeSubStr(s, 12) == "10,00€")
+  doAssert(runeSubStr(s, -6) == "10,00€")
+
+  doAssert(runeSubStr(s, 12, 5) == "10,00")
+  doAssert(runeSubStr(s, 12, -1) == "10,00")
+  doAssert(runeSubStr(s, -6, 5) == "10,00")
+  doAssert(runeSubStr(s, -6, -1) == "10,00")
+
+  doAssert(runeSubStr(s, 0, 100) ==  "Hänsel  ««: 10,00€")
+  doAssert(runeSubStr(s, -100, 100) ==  "Hänsel  ««: 10,00€")
+  doAssert(runeSubStr(s, 0, -100) == "")
+  doAssert(runeSubStr(s, 100, -100) == "")