From 1138cf5234674e7942abc6bf94e88d798fb4d0e0 Mon Sep 17 00:00:00 2001 From: Hans Raaf Date: Wed, 25 Mar 2015 14:26:01 +0100 Subject: [PATCH 1/3] Some procs to deal with rune-position-based indexing. It can't be perfect, but at least one can index on rune position efficiently. --- lib/pure/unicode.nim | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index eeb1b607d6..7f44786e32 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -183,6 +183,25 @@ proc `$`*(runes: seq[Rune]): string = result = "" for rune in runes: result.add(rune.toUTF8) +proc runeOffset*(s: string, pos:int): int = + ## Returns the byte position of unicode character at position in s + var + i = 0 + o = 0 + while i < pos: + o += runeLenAt(s, o) + inc i + o + +proc rune*(s: string, pos:int): Rune = + ## Returns the unicode character at position pos + fastRuneAt(s, runeOffset(s, pos), result, false) + +proc runeStr*(s: string, pos:int): string = + ## Returns the unicode character at position pos as UTF8 String + let o = runeOffset(s, pos) + s[o.. (o+runeLenAt(s, o)-1)] + const alphaRanges = [ 0x00d8, 0x00f6, # - From ac6de565ec82c5cdd3bbc3d90dc72836e985eca8 Mon Sep 17 00:00:00 2001 From: Hans Raaf Date: Fri, 27 Mar 2015 23:31:12 +0100 Subject: [PATCH 2/3] More work on optimizing and naming; added substr(). This is work in progress. I added a Unicode substring. Tried to handle edge cases more consistently, too. 
--- lib/pure/unicode.nim | 46 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 7f44786e32..586111e37a 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -183,25 +183,59 @@ proc `$`*(runes: seq[Rune]): string = result = "" for rune in runes: result.add(rune.toUTF8) -proc runeOffset*(s: string, pos:int): int = - ## Returns the byte position of unicode character at position in s +proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int = + ## Returns the byte position of unicode character + ## at position pos in s with an optional start byte position. + ## returns the special value -1 if it runs out of the string var i = 0 - o = 0 + o = start while i < pos: o += runeLenAt(s, o) + if o >= s.len: + return -1 + #raise newException(IndexError, "Position out of bounds") inc i - o + return o -proc rune*(s: string, pos:int): Rune = +proc runeAtPos*(s: string, pos: int): Rune = ## Returns the unicode character at position pos fastRuneAt(s, runeOffset(s, pos), result, false) -proc runeStr*(s: string, pos:int): string = +proc runeStrAtPos*(s: string, pos: Natural): string = ## Returns the unicode character at position pos as UTF8 String let o = runeOffset(s, pos) s[o.. (o+runeLenAt(s, o)-1)] +proc runeSubStr*(s: string, pos: int, len: int = int.high): string = + ## Returns the UTF-8 substring starting at codepoint pos + ## with len codepoints. If pos or len is negativ they count from + ## the end of the string. If len is not given it means the longest + ## possible string. This reensembles how substr() in PHP works. + if pos < 0: + # offset from the end could be optimized further + var o = runeLen(s) + pos + if o < 0: o = 0 + result = runeSubStr(s, o, len) + else: + let o = runeOffset(s, pos) + if o < 0: + result = "" + elif len == int.high: + result = s[o.. 
s.len-1] + elif len < 0: + # offset from the end could be optimized further + let e = runeLen(s) + len + if e <= 0: + result = "" + else: + result = s[o.. runeOffset(s, e)-1] + else: + var e = runeOffset(s, len, o) + if e < 0: + e = s.len + result = s[o.. e-1] + const alphaRanges = [ 0x00d8, 0x00f6, # - From 2791915d7fb06c1d5d3eb0b8356881ed5a12c120 Mon Sep 17 00:00:00 2001 From: Hans Raaf Date: Sat, 28 Mar 2015 00:56:09 +0100 Subject: [PATCH 3/3] Optimized end offsets and added tests. I hope this also shows that there are use cases. I still think the user should get warned about performance issues with those procs, which I added to the doc comments. --- lib/pure/unicode.nim | 105 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 12 deletions(-) diff --git a/lib/pure/unicode.nim b/lib/pure/unicode.nim index 586111e37a..5d302c9dc4 100644 --- a/lib/pure/unicode.nim +++ b/lib/pure/unicode.nim @@ -187,6 +187,10 @@ proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int = ## Returns the byte position of unicode character ## at position pos in s with an optional start byte position. ## returns the special value -1 if it runs out of the string + ## + ## Beware: This can lead to unoptimized code and slow execution! + ## Most problems are solve more efficient by using an iterator + ## or conversion to a seq of Rune. var i = 0 o = start @@ -194,29 +198,71 @@ proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int = o += runeLenAt(s, o) if o >= s.len: return -1 - #raise newException(IndexError, "Position out of bounds") inc i return o proc runeAtPos*(s: string, pos: int): Rune = ## Returns the unicode character at position pos + ## + ## Beware: This can lead to unoptimized code and slow execution! + ## Most problems are solve more efficient by using an iterator + ## or conversion to a seq of Rune. 
fastRuneAt(s, runeOffset(s, pos), result, false) proc runeStrAtPos*(s: string, pos: Natural): string = ## Returns the unicode character at position pos as UTF8 String + ## + ## Beware: This can lead to unoptimized code and slow execution! + ## Most problems are solve more efficient by using an iterator + ## or conversion to a seq of Rune. let o = runeOffset(s, pos) s[o.. (o+runeLenAt(s, o)-1)] -proc runeSubStr*(s: string, pos: int, len: int = int.high): string = +proc runeReverseOffset*(s: string, rev:Positive): (int, int) = + ## Returns a tuple with the the byte offset of the + ## unicode character at position ``rev`` in s counting + ## from the end (starting with 1) and the total + ## number of runes in the string. Returns a negative value + ## for offset if there are to few runes in the string to + ## satisfy the request. + ## + ## Beware: This can lead to unoptimized code and slow execution! + ## Most problems are solve more efficient by using an iterator + ## or conversion to a seq of Rune. + var + a = rev.int + o = 0 + x = 0 + while o < s.len: + let r = runeLenAt(s, o) + o += r + if a < 0: + x += r + dec a + + if a > 0: + return (-a, rev.int-a) + return (x, -a+rev.int) + +proc runeSubStr*(s: string, pos:int, len:int = int.high): string = ## Returns the UTF-8 substring starting at codepoint pos ## with len codepoints. If pos or len is negativ they count from ## the end of the string. If len is not given it means the longest - ## possible string. This reensembles how substr() in PHP works. - if pos < 0: - # offset from the end could be optimized further - var o = runeLen(s) + pos - if o < 0: o = 0 - result = runeSubStr(s, o, len) + ## possible string. + ## + ## (Needs some examples) + if pos < 0: + let (o, rl) = runeReverseOffset(s, -pos) + if len >= rl: + result = s[o.. s.len-1] + elif len < 0: + let e = rl + len + if e < 0: + result = "" + else: + result = s[o.. runeOffset(s, e-(rl+pos) , o)-1] + else: + result = s[o.. 
runeOffset(s, len, o)-1] else: let o = runeOffset(s, pos) if o < 0: @@ -224,13 +270,13 @@ proc runeSubStr*(s: string, pos: int, len: int = int.high): string = elif len == int.high: result = s[o.. s.len-1] elif len < 0: - # offset from the end could be optimized further - let e = runeLen(s) + len + let (e, rl) = runeReverseOffset(s, -len) + discard rl if e <= 0: result = "" else: - result = s[o.. runeOffset(s, e)-1] - else: + result = s[o.. e-1] + else: var e = runeOffset(s, len, o) if e < 0: e = s.len @@ -1413,3 +1459,38 @@ when isMainModule: const test = "as⃝" doAssert lastRune(test, test.len-1)[1] == 3 doAssert graphemeLen("è", 0) == 2 + + # test for rune positioning and runeSubStr() + let s = "Hänsel ««: 10,00€" + + doAssert(runeReverseOffset(s, 1) == (20, 18)) + doAssert(runeReverseOffset(s, 19) == (-1, 18)) + + doAssert(runeStrAtPos(s, 0) == "H") + doAssert(runeSubStr(s, 0, 1) == "H") + doAssert(runeStrAtPos(s, 10) == ":") + doAssert(runeSubStr(s, 10, 1) == ":") + doAssert(runeStrAtPos(s, 9) == "«") + doAssert(runeSubStr(s, 9, 1) == "«") + doAssert(runeStrAtPos(s, 17) == "€") + doAssert(runeSubStr(s, 17, 1) == "€") + # echo runeStrAtPos(s, 18) # index error + + doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€") + doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€") + doAssert(runeSubStr(s, 10) == ": 10,00€") + doAssert(runeSubStr(s, 18) == "") + doAssert(runeSubStr(s, 0, 10) == "Hänsel ««") + + doAssert(runeSubStr(s, 12) == "10,00€") + doAssert(runeSubStr(s, -6) == "10,00€") + + doAssert(runeSubStr(s, 12, 5) == "10,00") + doAssert(runeSubStr(s, 12, -1) == "10,00") + doAssert(runeSubStr(s, -6, 5) == "10,00") + doAssert(runeSubStr(s, -6, -1) == "10,00") + + doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€") + doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€") + doAssert(runeSubStr(s, 0, -100) == "") + doAssert(runeSubStr(s, 100, -100) == "")