mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-29 01:14:41 +00:00
* Refactor: rename proc to func * Fix punycode.decode function This function could only properly decode punycodes containing a single encoded unicode character. As soon as there was more than one punycode character group to decode it produced invalid output - the number of characters was correct, but their position was not. * Update tpunycode.nim Co-authored-by: Clay Sweetser <Varriount@users.noreply.github.com>
210 lines
5.6 KiB
Nim
210 lines
5.6 KiB
Nim
#
|
|
#
|
|
# Nim's Runtime Library
|
|
# (c) Copyright 2016 Andreas Rumpf
|
|
#
|
|
# See the file "copying.txt", included in this
|
|
# distribution, for details about the copyright.
|
|
#
|
|
|
|
## Implements a representation of Unicode with the limited
|
|
## ASCII character subset.
|
|
|
|
import strutils
|
|
import unicode
|
|
|
|
# issue #3045
|
|
|
|
const
|
|
Base = 36
|
|
TMin = 1
|
|
TMax = 26
|
|
Skew = 38
|
|
Damp = 700
|
|
InitialBias = 72
|
|
InitialN = 128
|
|
Delimiter = '-'
|
|
|
|
type
|
|
PunyError* = object of ValueError
|
|
|
|
func decodeDigit(x: char): int {.raises: [PunyError].} =
|
|
if '0' <= x and x <= '9':
|
|
result = ord(x) - (ord('0') - 26)
|
|
elif 'A' <= x and x <= 'Z':
|
|
result = ord(x) - ord('A')
|
|
elif 'a' <= x and x <= 'z':
|
|
result = ord(x) - ord('a')
|
|
else:
|
|
raise newException(PunyError, "Bad input")
|
|
|
|
func encodeDigit(digit: int): Rune {.raises: [PunyError].} =
|
|
if 0 <= digit and digit < 26:
|
|
result = Rune(digit + ord('a'))
|
|
elif 26 <= digit and digit < 36:
|
|
result = Rune(digit + (ord('0') - 26))
|
|
else:
|
|
raise newException(PunyError, "internal error in punycode encoding")
|
|
|
|
func isBasic(c: char): bool = ord(c) < 0x80
|
|
func isBasic(r: Rune): bool = int(r) < 0x80
|
|
|
|
func adapt(delta, numPoints: int, first: bool): int =
|
|
var d = if first: delta div Damp else: delta div 2
|
|
d += d div numPoints
|
|
var k = 0
|
|
while d > ((Base-TMin)*TMax) div 2:
|
|
d = d div (Base - TMin)
|
|
k += Base
|
|
result = k + (Base - TMin + 1) * d div (d + Skew)
|
|
|
|
func encode*(prefix, s: string): string {.raises: [PunyError].} =
|
|
## Encode a string that may contain Unicode.
|
|
## Prepend `prefix` to the result
|
|
result = prefix
|
|
var (d, n, bias) = (0, InitialN, InitialBias)
|
|
var (b, remaining) = (0, 0)
|
|
for r in s.runes:
|
|
if r.isBasic:
|
|
# basic Ascii character
|
|
inc b
|
|
result.add($r)
|
|
else:
|
|
# special character
|
|
inc remaining
|
|
|
|
var h = b
|
|
if b > 0:
|
|
result.add(Delimiter) # we have some Ascii chars
|
|
while remaining != 0:
|
|
var m: int = high(int32)
|
|
for r in s.runes:
|
|
if m > int(r) and int(r) >= n:
|
|
m = int(r)
|
|
d += (m - n) * (h + 1)
|
|
if d < 0:
|
|
raise newException(PunyError, "invalid label " & s)
|
|
n = m
|
|
for r in s.runes:
|
|
if int(r) < n:
|
|
inc d
|
|
if d < 0:
|
|
raise newException(PunyError, "invalid label " & s)
|
|
continue
|
|
if int(r) > n:
|
|
continue
|
|
var q = d
|
|
var k = Base
|
|
while true:
|
|
var t = k - bias
|
|
if t < TMin:
|
|
t = TMin
|
|
elif t > TMax:
|
|
t = TMax
|
|
if q < t:
|
|
break
|
|
result.add($encodeDigit(t + (q - t) mod (Base - t)))
|
|
q = (q - t) div (Base - t)
|
|
k += Base
|
|
result.add($encodeDigit(q))
|
|
bias = adapt(d, h + 1, h == b)
|
|
d = 0
|
|
inc h
|
|
dec remaining
|
|
inc d
|
|
inc n
|
|
|
|
func encode*(s: string): string {.raises: [PunyError].} =
|
|
## Encode a string that may contain Unicode. Prefix is empty.
|
|
result = encode("", s)
|
|
|
|
func decode*(encoded: string): string {.raises: [PunyError].} =
|
|
## Decode a Punycode-encoded string
|
|
var
|
|
n = InitialN
|
|
i = 0
|
|
bias = InitialBias
|
|
var d = rfind(encoded, Delimiter)
|
|
var output: seq[Rune]
|
|
|
|
if d > 0:
|
|
# found Delimiter
|
|
for j in 0..<d:
|
|
var c = encoded[j] # char
|
|
if not c.isBasic:
|
|
raise newException(PunyError, "Encoded contains a non-basic char")
|
|
output.add(Rune(c)) # add the character
|
|
inc d
|
|
else:
|
|
d = 0 # set to first index
|
|
|
|
while (d < len(encoded)):
|
|
var oldi = i
|
|
var w = 1
|
|
var k = Base
|
|
while true:
|
|
if d == len(encoded):
|
|
raise newException(PunyError, "Bad input: " & encoded)
|
|
var c = encoded[d]; inc d
|
|
var digit = int(decodeDigit(c))
|
|
if digit > (high(int32) - i) div w:
|
|
raise newException(PunyError, "Too large a value: " & $digit)
|
|
i += digit * w
|
|
var t: int
|
|
if k <= bias:
|
|
t = TMin
|
|
elif k >= bias + TMax:
|
|
t = TMax
|
|
else:
|
|
t = k - bias
|
|
if digit < t:
|
|
break
|
|
w *= Base - t
|
|
k += Base
|
|
bias = adapt(i - oldi, len(output) + 1, oldi == 0)
|
|
|
|
if i div (len(output) + 1) > high(int32) - n:
|
|
raise newException(PunyError, "Value too large")
|
|
|
|
n += i div (len(output) + 1)
|
|
i = i mod (len(output) + 1)
|
|
insert(output, Rune(n), i)
|
|
inc i
|
|
|
|
result = $output
|
|
|
|
runnableExamples:
|
|
static:
|
|
block:
|
|
doAssert encode("") == ""
|
|
doAssert encode("a") == "a-"
|
|
doAssert encode("A") == "A-"
|
|
doAssert encode("3") == "3-"
|
|
doAssert encode("-") == "--"
|
|
doAssert encode("--") == "---"
|
|
doAssert encode("abc") == "abc-"
|
|
doAssert encode("London") == "London-"
|
|
doAssert encode("Lloyd-Atkinson") == "Lloyd-Atkinson-"
|
|
doAssert encode("This has spaces") == "This has spaces-"
|
|
doAssert encode("ü") == "tda"
|
|
doAssert encode("München") == "Mnchen-3ya"
|
|
doAssert encode("Mnchen-3ya") == "Mnchen-3ya-"
|
|
doAssert encode("München-Ost") == "Mnchen-Ost-9db"
|
|
doAssert encode("Bahnhof München-Ost") == "Bahnhof Mnchen-Ost-u6b"
|
|
block:
|
|
doAssert decode("") == ""
|
|
doAssert decode("a-") == "a"
|
|
doAssert decode("A-") == "A"
|
|
doAssert decode("3-") == "3"
|
|
doAssert decode("--") == "-"
|
|
doAssert decode("---") == "--"
|
|
doAssert decode("abc-") == "abc"
|
|
doAssert decode("London-") == "London"
|
|
doAssert decode("Lloyd-Atkinson-") == "Lloyd-Atkinson"
|
|
doAssert decode("This has spaces-") == "This has spaces"
|
|
doAssert decode("tda") == "ü"
|
|
doAssert decode("Mnchen-3ya") == "München"
|
|
doAssert decode("Mnchen-3ya-") == "Mnchen-3ya"
|
|
doAssert decode("Mnchen-Ost-9db") == "München-Ost"
|
|
doAssert decode("Bahnhof Mnchen-Ost-u6b") == "Bahnhof München-Ost"
|