mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-29 01:14:41 +00:00
Merge pull request #4264 from oderwat/add-unicode-rune-offsets
Add unicode rune offsets
This commit is contained in:
@@ -183,6 +183,105 @@ proc `$`*(runes: seq[Rune]): string =
|
||||
result = ""
|
||||
for rune in runes: result.add(rune.toUTF8)
|
||||
|
||||
proc runeOffset*(s: string, pos:Natural, start: Natural = 0): int =
|
||||
## Returns the byte position of unicode character
|
||||
## at position pos in s with an optional start byte position.
|
||||
## returns the special value -1 if it runs out of the string
|
||||
##
|
||||
## Beware: This can lead to unoptimized code and slow execution!
|
||||
## Most problems are solve more efficient by using an iterator
|
||||
## or conversion to a seq of Rune.
|
||||
var
|
||||
i = 0
|
||||
o = start
|
||||
while i < pos:
|
||||
o += runeLenAt(s, o)
|
||||
if o >= s.len:
|
||||
return -1
|
||||
inc i
|
||||
return o
|
||||
|
||||
proc runeAtPos*(s: string, pos: int): Rune =
|
||||
## Returns the unicode character at position pos
|
||||
##
|
||||
## Beware: This can lead to unoptimized code and slow execution!
|
||||
## Most problems are solve more efficient by using an iterator
|
||||
## or conversion to a seq of Rune.
|
||||
fastRuneAt(s, runeOffset(s, pos), result, false)
|
||||
|
||||
proc runeStrAtPos*(s: string, pos: Natural): string =
|
||||
## Returns the unicode character at position pos as UTF8 String
|
||||
##
|
||||
## Beware: This can lead to unoptimized code and slow execution!
|
||||
## Most problems are solve more efficient by using an iterator
|
||||
## or conversion to a seq of Rune.
|
||||
let o = runeOffset(s, pos)
|
||||
s[o.. (o+runeLenAt(s, o)-1)]
|
||||
|
||||
proc runeReverseOffset*(s: string, rev:Positive): (int, int) =
|
||||
## Returns a tuple with the the byte offset of the
|
||||
## unicode character at position ``rev`` in s counting
|
||||
## from the end (starting with 1) and the total
|
||||
## number of runes in the string. Returns a negative value
|
||||
## for offset if there are to few runes in the string to
|
||||
## satisfy the request.
|
||||
##
|
||||
## Beware: This can lead to unoptimized code and slow execution!
|
||||
## Most problems are solve more efficient by using an iterator
|
||||
## or conversion to a seq of Rune.
|
||||
var
|
||||
a = rev.int
|
||||
o = 0
|
||||
x = 0
|
||||
while o < s.len:
|
||||
let r = runeLenAt(s, o)
|
||||
o += r
|
||||
if a < 0:
|
||||
x += r
|
||||
dec a
|
||||
|
||||
if a > 0:
|
||||
return (-a, rev.int-a)
|
||||
return (x, -a+rev.int)
|
||||
|
||||
proc runeSubStr*(s: string, pos:int, len:int = int.high): string =
|
||||
## Returns the UTF-8 substring starting at codepoint pos
|
||||
## with len codepoints. If pos or len is negativ they count from
|
||||
## the end of the string. If len is not given it means the longest
|
||||
## possible string.
|
||||
##
|
||||
## (Needs some examples)
|
||||
if pos < 0:
|
||||
let (o, rl) = runeReverseOffset(s, -pos)
|
||||
if len >= rl:
|
||||
result = s[o.. s.len-1]
|
||||
elif len < 0:
|
||||
let e = rl + len
|
||||
if e < 0:
|
||||
result = ""
|
||||
else:
|
||||
result = s[o.. runeOffset(s, e-(rl+pos) , o)-1]
|
||||
else:
|
||||
result = s[o.. runeOffset(s, len, o)-1]
|
||||
else:
|
||||
let o = runeOffset(s, pos)
|
||||
if o < 0:
|
||||
result = ""
|
||||
elif len == int.high:
|
||||
result = s[o.. s.len-1]
|
||||
elif len < 0:
|
||||
let (e, rl) = runeReverseOffset(s, -len)
|
||||
discard rl
|
||||
if e <= 0:
|
||||
result = ""
|
||||
else:
|
||||
result = s[o.. e-1]
|
||||
else:
|
||||
var e = runeOffset(s, len, o)
|
||||
if e < 0:
|
||||
e = s.len
|
||||
result = s[o.. e-1]
|
||||
|
||||
const
|
||||
alphaRanges = [
|
||||
0x00d8, 0x00f6, # -
|
||||
@@ -1360,3 +1459,38 @@ when isMainModule:
|
||||
const test = "as⃝"
|
||||
doAssert lastRune(test, test.len-1)[1] == 3
|
||||
doAssert graphemeLen("è", 0) == 2
|
||||
|
||||
# test for rune positioning and runeSubStr()
|
||||
let s = "Hänsel ««: 10,00€"
|
||||
|
||||
doAssert(runeReverseOffset(s, 1) == (20, 18))
|
||||
doAssert(runeReverseOffset(s, 19) == (-1, 18))
|
||||
|
||||
doAssert(runeStrAtPos(s, 0) == "H")
|
||||
doAssert(runeSubStr(s, 0, 1) == "H")
|
||||
doAssert(runeStrAtPos(s, 10) == ":")
|
||||
doAssert(runeSubStr(s, 10, 1) == ":")
|
||||
doAssert(runeStrAtPos(s, 9) == "«")
|
||||
doAssert(runeSubStr(s, 9, 1) == "«")
|
||||
doAssert(runeStrAtPos(s, 17) == "€")
|
||||
doAssert(runeSubStr(s, 17, 1) == "€")
|
||||
# echo runeStrAtPos(s, 18) # index error
|
||||
|
||||
doAssert(runeSubStr(s, 0) == "Hänsel ««: 10,00€")
|
||||
doAssert(runeSubStr(s, -18) == "Hänsel ««: 10,00€")
|
||||
doAssert(runeSubStr(s, 10) == ": 10,00€")
|
||||
doAssert(runeSubStr(s, 18) == "")
|
||||
doAssert(runeSubStr(s, 0, 10) == "Hänsel ««")
|
||||
|
||||
doAssert(runeSubStr(s, 12) == "10,00€")
|
||||
doAssert(runeSubStr(s, -6) == "10,00€")
|
||||
|
||||
doAssert(runeSubStr(s, 12, 5) == "10,00")
|
||||
doAssert(runeSubStr(s, 12, -1) == "10,00")
|
||||
doAssert(runeSubStr(s, -6, 5) == "10,00")
|
||||
doAssert(runeSubStr(s, -6, -1) == "10,00")
|
||||
|
||||
doAssert(runeSubStr(s, 0, 100) == "Hänsel ««: 10,00€")
|
||||
doAssert(runeSubStr(s, -100, 100) == "Hänsel ««: 10,00€")
|
||||
doAssert(runeSubStr(s, 0, -100) == "")
|
||||
doAssert(runeSubStr(s, 100, -100) == "")
|
||||
|
||||
Reference in New Issue
Block a user