mirror of
https://github.com/nim-lang/Nim.git
synced 2025-12-30 01:44:37 +00:00
added encodings stdlib
This commit is contained in:
@@ -79,6 +79,10 @@ String handling
|
||||
|
||||
* `unicode <unicode.html>`_
|
||||
This module provides support to handle the Unicode UTF-8 encoding.
|
||||
|
||||
* `encodings <encodings.html>`_
|
||||
Converts between different character encodings. On UNIX, this uses
|
||||
the ``iconv`` library, on Windows the Windows API.
|
||||
|
||||
* `pegs <pegs.html>`_
|
||||
This module contains procedures and operators for handling PEGs.
|
||||
|
||||
445
lib/pure/encodings.nim
Normal file
445
lib/pure/encodings.nim
Normal file
@@ -0,0 +1,445 @@
|
||||
#
|
||||
#
|
||||
# Nimrod's Runtime Library
|
||||
# (c) Copyright 2011 Andreas Rumpf
|
||||
#
|
||||
# See the file "copying.txt", included in this
|
||||
# distribution, for details about the copyright.
|
||||
#
|
||||
|
||||
## Converts between different character encodings. On UNIX, this uses
|
||||
## the `iconv`:idx: library, on Windows the Windows API.
|
||||
|
||||
import os, parseutils, strutils
|
||||
|
||||
when defined(windows):
  type
    TCodePage = distinct int32    # numeric Windows code page identifier
    PConverter* = object {.pure.} ## can convert between two character sets
      dest, src: TCodePage

else:
  type
    TConverter = object {.pure, final.}  # opaque; only used through a pointer
    PConverter* = ptr TConverter ## can convert between two character sets
|
||||
|
||||
type
  EInvalidEncoding* = object of EInvalidValue ## raised when an encoding is
                                              ## unknown or a converter
                                              ## cannot be created
|
||||
|
||||
when defined(windows):
|
||||
proc EqEncodingNames(a, b: string): bool =
  ## Compares two encoding names, ignoring character case and at most one
  ## ``'-'`` or ``'_'`` separator before each compared character, so that
  ## ``"UTF-8"``, ``"utf_8"`` and ``"utf8"`` are all considered equal.
  ## Returns true iff both names are consumed completely.
  var i = 0
  var j = 0
  while i < a.len and j < b.len:
    if a[i] in {'-', '_'}: inc i
    if b[j] in {'-', '_'}: inc j
    # skipping a separator may have moved an index to the end of its
    # string; stop here instead of indexing past the last character:
    if i >= a.len or j >= b.len: break
    if a[i].toLower != b[j].toLower: return false
    inc i
    inc j
  result = i == a.len and j == b.len
|
||||
|
||||
const
  # Maps Windows code page numbers to their names. An empty name means the
  # code page has no name in this table; `codePageToName` then falls back to
  # the number itself.
  winEncodings = [
    (037, "IBM037"),  # IBM EBCDIC US-Canada
    (437, "IBM437"),  # OEM United States
    (500, "IBM500"),  # IBM EBCDIC International
    (708, "ASMO-708"),  # Arabic (ASMO 708)
    (709, "ASMO_449"),  # Arabic (ASMO-449+, BCON V4)
    (710, ""),  # Arabic - Transparent Arabic
    (720, "DOS-720"),  # Arabic (Transparent ASMO); Arabic (DOS)
    (737, "ibm737"),  # OEM Greek (formerly 437G); Greek (DOS)
    (775, "ibm775"),  # OEM Baltic; Baltic (DOS)
    (850, "ibm850"),  # OEM Multilingual Latin 1; Western European (DOS)
    (852, "ibm852"),  # OEM Latin 2; Central European (DOS)
    (855, "IBM855"),  # OEM Cyrillic (primarily Russian)
    (857, "ibm857"),  # OEM Turkish; Turkish (DOS)
    (858, "IBM00858"),  # OEM Multilingual Latin 1 + Euro symbol
    (860, "IBM860"),  # OEM Portuguese; Portuguese (DOS)
    (861, "ibm861"),  # OEM Icelandic; Icelandic (DOS)
    (862, "DOS-862"),  # OEM Hebrew; Hebrew (DOS)
    (863, "IBM863"),  # OEM French Canadian; French Canadian (DOS)
    (864, "IBM864"),  # OEM Arabic; Arabic (864)
    (865, "IBM865"),  # OEM Nordic; Nordic (DOS)
    (866, "cp866"),  # OEM Russian; Cyrillic (DOS)
    (869, "ibm869"),  # OEM Modern Greek; Greek, Modern (DOS)
    (870, "IBM870"),  # IBM EBCDIC Multilingual/ROECE (Latin 2)
    (874, "windows-874"),  # ANSI/OEM Thai; Thai (Windows)
    (875, "cp875"),  # IBM EBCDIC Greek Modern
    (932, "shift_jis"),  # ANSI/OEM Japanese; Japanese (Shift-JIS)
    (936, "gb2312"),  # ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
    (949, "ks_c_5601-1987"),  # ANSI/OEM Korean (Unified Hangul Code)
    (950, "big5"),  # ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
    (1026, "IBM1026"),  # IBM EBCDIC Turkish (Latin 5)
    (1047, "IBM01047"),  # IBM EBCDIC Latin 1/Open System
    (1140, "IBM01140"),  # IBM EBCDIC US-Canada (037 + Euro symbol)
    (1141, "IBM01141"),  # IBM EBCDIC Germany (20273 + Euro symbol)
    (1142, "IBM01142"),  # IBM EBCDIC Denmark-Norway (20277 + Euro symbol)
    (1143, "IBM01143"),  # IBM EBCDIC Finland-Sweden (20278 + Euro symbol)
    (1144, "IBM01144"),  # IBM EBCDIC Italy (20280 + Euro symbol)
    (1145, "IBM01145"),  # IBM EBCDIC Latin America-Spain (20284 + Euro symbol)
    (1146, "IBM01146"),  # IBM EBCDIC United Kingdom (20285 + Euro symbol)
    (1147, "IBM01147"),  # IBM EBCDIC France (20297 + Euro symbol)
    (1148, "IBM01148"),  # IBM EBCDIC International (500 + Euro symbol)
    (1149, "IBM01149"),  # IBM EBCDIC Icelandic (20871 + Euro symbol)
    (1200, "utf-16"),  # Unicode UTF-16, little endian byte order (BMP of ISO 10646); available only to managed applications
    (1201, "unicodeFFFE"),  # Unicode UTF-16, big endian byte order; available only to managed applications
    (1250, "windows-1250"),  # ANSI Central European; Central European (Windows)
    (1251, "windows-1251"),  # ANSI Cyrillic; Cyrillic (Windows)
    (1252, "windows-1252"),  # ANSI Latin 1; Western European (Windows)
    (1253, "windows-1253"),  # ANSI Greek; Greek (Windows)
    (1254, "windows-1254"),  # ANSI Turkish; Turkish (Windows)
    (1255, "windows-1255"),  # ANSI Hebrew; Hebrew (Windows)
    (1256, "windows-1256"),  # ANSI Arabic; Arabic (Windows)
    (1257, "windows-1257"),  # ANSI Baltic; Baltic (Windows)
    (1258, "windows-1258"),  # ANSI/OEM Vietnamese; Vietnamese (Windows)
    (1361, "Johab"),  # Korean (Johab)
    (10000, "macintosh"),  # MAC Roman; Western European (Mac)
    (10001, "x-mac-japanese"),  # Japanese (Mac)
    (10002, "x-mac-chinesetrad"),  # MAC Traditional Chinese (Big5); Chinese Traditional (Mac)
    (10003, "x-mac-korean"),  # Korean (Mac)
    (10004, "x-mac-arabic"),  # Arabic (Mac)
    (10005, "x-mac-hebrew"),  # Hebrew (Mac)
    (10006, "x-mac-greek"),  # Greek (Mac)
    (10007, "x-mac-cyrillic"),  # Cyrillic (Mac)
    (10008, "x-mac-chinesesimp"),  # MAC Simplified Chinese (GB 2312); Chinese Simplified (Mac)
    (10010, "x-mac-romanian"),  # Romanian (Mac)
    (10017, "x-mac-ukrainian"),  # Ukrainian (Mac)
    (10021, "x-mac-thai"),  # Thai (Mac)
    (10029, "x-mac-ce"),  # MAC Latin 2; Central European (Mac)
    (10079, "x-mac-icelandic"),  # Icelandic (Mac)
    (10081, "x-mac-turkish"),  # Turkish (Mac)
    (10082, "x-mac-croatian"),  # Croatian (Mac)
    (12000, "utf-32"),  # Unicode UTF-32, little endian byte order; available only to managed applications
    (12001, "utf-32BE"),  # Unicode UTF-32, big endian byte order; available only to managed applications
    (20000, "x-Chinese_CNS"),  # CNS Taiwan; Chinese Traditional (CNS)
    (20001, "x-cp20001"),  # TCA Taiwan
    (20002, "x_Chinese-Eten"),  # Eten Taiwan; Chinese Traditional (Eten)
    (20003, "x-cp20003"),  # IBM5550 Taiwan
    (20004, "x-cp20004"),  # TeleText Taiwan
    (20005, "x-cp20005"),  # Wang Taiwan
    (20105, "x-IA5"),  # IA5 (IRV International Alphabet No. 5, 7-bit); Western European (IA5)
    (20106, "x-IA5-German"),  # IA5 German (7-bit)
    (20107, "x-IA5-Swedish"),  # IA5 Swedish (7-bit)
    (20108, "x-IA5-Norwegian"),  # IA5 Norwegian (7-bit)
    (20127, "us-ascii"),  # US-ASCII (7-bit)
    (20261, "x-cp20261"),  # T.61
    (20269, "x-cp20269"),  # ISO 6937 Non-Spacing Accent
    (20273, "IBM273"),  # IBM EBCDIC Germany
    (20277, "IBM277"),  # IBM EBCDIC Denmark-Norway
    (20278, "IBM278"),  # IBM EBCDIC Finland-Sweden
    (20280, "IBM280"),  # IBM EBCDIC Italy
    (20284, "IBM284"),  # IBM EBCDIC Latin America-Spain
    (20285, "IBM285"),  # IBM EBCDIC United Kingdom
    (20290, "IBM290"),  # IBM EBCDIC Japanese Katakana Extended
    (20297, "IBM297"),  # IBM EBCDIC France
    (20420, "IBM420"),  # IBM EBCDIC Arabic
    (20423, "IBM423"),  # IBM EBCDIC Greek
    (20424, "IBM424"),  # IBM EBCDIC Hebrew
    (20833, "x-EBCDIC-KoreanExtended"),  # IBM EBCDIC Korean Extended
    (20838, "IBM-Thai"),  # IBM EBCDIC Thai
    (20866, "koi8-r"),  # Russian (KOI8-R); Cyrillic (KOI8-R)
    (20871, "IBM871"),  # IBM EBCDIC Icelandic
    (20880, "IBM880"),  # IBM EBCDIC Cyrillic Russian
    (20905, "IBM905"),  # IBM EBCDIC Turkish
    (20924, "IBM00924"),  # IBM EBCDIC Latin 1/Open System (1047 + Euro symbol)
    (20932, "EUC-JP"),  # Japanese (JIS 0208-1990 and 0212-1990)
    (20936, "x-cp20936"),  # Simplified Chinese (GB2312); Chinese Simplified (GB2312-80)
    (20949, "x-cp20949"),  # Korean Wansung
    (21025, "cp1025"),  # IBM EBCDIC Cyrillic Serbian-Bulgarian
    (21027, ""),  # (deprecated)
    (21866, "koi8-u"),  # Ukrainian (KOI8-U); Cyrillic (KOI8-U)
    (28591, "iso-8859-1"),  # ISO 8859-1 Latin 1; Western European (ISO)
    (28592, "iso-8859-2"),  # ISO 8859-2 Central European; Central European (ISO)
    (28593, "iso-8859-3"),  # ISO 8859-3 Latin 3
    (28594, "iso-8859-4"),  # ISO 8859-4 Baltic
    (28595, "iso-8859-5"),  # ISO 8859-5 Cyrillic
    (28596, "iso-8859-6"),  # ISO 8859-6 Arabic
    (28597, "iso-8859-7"),  # ISO 8859-7 Greek
    (28598, "iso-8859-8"),  # ISO 8859-8 Hebrew; Hebrew (ISO-Visual)
    (28599, "iso-8859-9"),  # ISO 8859-9 Turkish
    (28603, "iso-8859-13"),  # ISO 8859-13 Estonian
    (28605, "iso-8859-15"),  # ISO 8859-15 Latin 9
    (29001, "x-Europa"),  # Europa 3
    (38598, "iso-8859-8-i"),  # ISO 8859-8 Hebrew; Hebrew (ISO-Logical)
    (50220, "iso-2022-jp"),  # ISO 2022 Japanese with no halfwidth Katakana; Japanese (JIS)
    (50221, "csISO2022JP"),  # ISO 2022 Japanese with halfwidth Katakana; Japanese (JIS-Allow 1 byte Kana)
    (50222, "iso-2022-jp"),  # ISO 2022 Japanese JIS X 0201-1989; Japanese (JIS-Allow 1 byte Kana - SO/SI)
    (50225, "iso-2022-kr"),  # ISO 2022 Korean
    (50227, "x-cp50227"),  # ISO 2022 Simplified Chinese; Chinese Simplified (ISO 2022)
    (50229, ""),  # ISO 2022 Traditional Chinese
    (50930, ""),  # EBCDIC Japanese (Katakana) Extended
    (50931, ""),  # EBCDIC US-Canada and Japanese
    (50933, ""),  # EBCDIC Korean Extended and Korean
    (50935, ""),  # EBCDIC Simplified Chinese Extended and Simplified Chinese
    (50936, ""),  # EBCDIC Simplified Chinese
    (50937, ""),  # EBCDIC US-Canada and Traditional Chinese
    (50939, ""),  # EBCDIC Japanese (Latin) Extended and Japanese
    (51932, "euc-jp"),  # EUC Japanese
    (51936, "EUC-CN"),  # EUC Simplified Chinese; Chinese Simplified (EUC)
    (51949, "euc-kr"),  # EUC Korean
    (51950, ""),  # EUC Traditional Chinese
    (52936, "hz-gb-2312"),  # HZ-GB2312 Simplified Chinese; Chinese Simplified (HZ)
    (54936, "GB18030"),  # Windows XP and later: GB18030 Simplified Chinese (4 byte); Chinese Simplified (GB18030)
    (57002, "x-iscii-de"),  # ISCII Devanagari
    (57003, "x-iscii-be"),  # ISCII Bengali
    (57004, "x-iscii-ta"),  # ISCII Tamil
    (57005, "x-iscii-te"),  # ISCII Telugu
    (57006, "x-iscii-as"),  # ISCII Assamese
    (57007, "x-iscii-or"),  # ISCII Oriya
    (57008, "x-iscii-ka"),  # ISCII Kannada
    (57009, "x-iscii-ma"),  # ISCII Malayalam
    (57010, "x-iscii-gu"),  # ISCII Gujarati
    (57011, "x-iscii-pa"),  # ISCII Punjabi
    (65000, "utf-7"),  # Unicode (UTF-7)
    (65001, "utf-8")]  # Unicode (UTF-8)
|
||||
|
||||
when false:
  # Disabled on purpose: nothing in this module calls GetCPInfo yet.
  type
    TCpInfo = object {.pure.}
      MaxCharSize: int32
      DefaultChar: array[0..1, char]
      LeadByte: array[0..11, char]

  proc GetCPInfo(CodePage: TCodePage, lpCPInfo: var TCpInfo): int32 {.
    importc: "GetCPInfo", stdcall, dynlib: "kernel32".}
|
||||
|
||||
proc nameToCodePage(name: string): TCodePage =
  ## Maps an encoding name to a Windows code page. `name` may be a plain
  ## code page number (``"1252"``), a name listed in `winEncodings`
  ## (compared with `EqEncodingNames`), or the ``cp<number>`` spelling
  ## (``"CP1252"``) which is also this module's default source encoding.
  ## Returns ``TCodePage(-1)`` if no code page matches.
  var nameAsInt: int
  if parseInt(name, nameAsInt) == 0: nameAsInt = -1
  for no, na in items(winEncodings):
    if no == nameAsInt or EqEncodingNames(na, name): return TCodePage(no)
  # Fallback that only applies to names the table lookup above rejected:
  # accept "cp" followed by nothing but the digits of a known code page.
  if name.len > 2 and name[0] in {'c', 'C'} and name[1] in {'p', 'P'}:
    var cpNumber: int
    if parseInt(name, cpNumber, 2) == name.len - 2:
      for no, na in items(winEncodings):
        if no == cpNumber: return TCodePage(no)
  result = TCodePage(-1)
|
||||
|
||||
proc codePageToName(c: TCodePage): string =
  ## Returns the name registered for code page `c` in `winEncodings`, the
  ## decimal number if the entry has an empty name, or ``""`` if `c` is not
  ## in the table at all.
  result = ""
  var wanted = int(c)
  for no, na in items(winEncodings):
    if no == wanted:
      if na.len == 0: result = $no
      else: result = na
      break
|
||||
|
||||
# Win32: identifier of the code page reported by kernel32's GetACP.
proc GetACP(): TCodePage {.
  importc: "GetACP", stdcall, dynlib: "kernel32".}
|
||||
|
||||
# Win32: converts `cbMultiByte` bytes in code page `CodePage` to UTF-16.
# The `cch*` size is counted in wide characters; `convert` treats a return
# value of 0 as failure and passes a nil buffer with size 0 to query the
# required capacity.
proc MultiByteToWideChar(CodePage: TCodePage, dwFlags: int32,
                         lpMultiByteStr: cstring, cbMultiByte: cint,
                         lpWideCharStr: cstring, cchWideChar: cint): cint {.
  importc: "MultiByteToWideChar", stdcall, dynlib: "kernel32".}
|
||||
|
||||
# Win32: converts `cchWideChar` UTF-16 code units to code page `CodePage`.
# `convert` treats a return value of 0 as failure and passes a nil buffer
# with size 0 to query the required capacity. The two trailing parameters
# default to nil and are not used by this module.
proc WideCharToMultiByte(CodePage: TCodePage, dwFlags: int32,
                         lpWideCharStr: cstring, cchWideChar: cint,
                         lpMultiByteStr: cstring, cbMultiByte: cint,
                         lpDefaultChar: cstring=nil,
                         lpUsedDefaultChar: pointer=nil): cint {.
  importc: "WideCharToMultiByte", stdcall, dynlib: "kernel32".}
|
||||
|
||||
else:
|
||||
# Candidate shared libraries that may export the iconv entry points.
when not defined(haiku):
  const iconvDll = "(libc.so.6|libiconv.so)"
else:
  const iconvDll = "(libc.so.6|libiconv.so|libtextencoding.so)"

# Prefix put in front of the imported iconv symbol names; "lib" only on
# 32-bit PowerPC Mac OS X.
when not (defined(macosx) and defined(powerpc32)):
  const prefix = ""
else:
  const prefix = "lib"
|
||||
|
||||
# errno values that `convert` checks after a failed `iconv` call. They are
# taken from <errno.h> instead of being hard-coded, because the numbers are
# not the same on every Unix (EILSEQ in particular differs between Linux,
# Mac OS X and the individual BSDs) and a hard-coded list leaves EILSEQ
# undefined on any platform it does not mention.
var
  E2BIG {.importc: "E2BIG", header: "<errno.h>".}: cint
  EINVAL {.importc: "EINVAL", header: "<errno.h>".}: cint
  EILSEQ {.importc: "EILSEQ", header: "<errno.h>".}: cint

# the C library's error indicator of the last failed call:
var errno {.importc, header: "<errno.h>".}: cint
|
||||
|
||||
# Creates a conversion descriptor; note that the *target* encoding is the
# first argument.
proc iconvOpen(tocode, fromcode: cstring): PConverter {.
  cdecl, dynlib: iconvDll, importc: prefix & "iconv_open".}

# Releases a descriptor obtained from `iconvOpen`.
proc iconvClose(c: PConverter) {.
  cdecl, dynlib: iconvDll, importc: prefix & "iconv_close".}

# Converts from `inbuf` to `outbuf`; the buffer pointers and remaining byte
# counts are passed by reference and read back by the caller after the call.
proc iconv(c: PConverter, inbuf: var cstring, inbytesLeft: var int,
           outbuf: var cstring, outbytesLeft: var int): int {.
  cdecl, dynlib: iconvDll, importc: prefix & "iconv".}

# Same C function, typed so that nil can be passed for the input side; used
# by `convert` to flush the converter.
proc iconv(c: PConverter, inbuf: pointer, inbytesLeft: pointer,
           outbuf: var cstring, outbytesLeft: var int): int {.
  cdecl, dynlib: iconvDll, importc: prefix & "iconv".}
|
||||
|
||||
proc getCurrentEncoding*(): string =
  ## returns the name of the current encoding. On Windows this is the name
  ## of the code page reported by ``GetACP``; on Unix it is always "UTF-8".
  when not defined(windows):
    result = "UTF-8"
  else:
    result = codePageToName(GetACP())
|
||||
|
||||
proc open*(destEncoding = "UTF-8", srcEncoding = "CP1252"): PConverter =
  ## opens a converter that can convert from `srcEncoding` to `destEncoding`.
  ## Raises `EInvalidEncoding` if it cannot fulfill the request.
  when not defined(windows):
    # iconv_open expects the target encoding first, then the source:
    result = iconvOpen(destEncoding, srcEncoding)
    # iconv_open reports failure as (iconv_t)-1, not as a null pointer;
    # the nil test is kept as an additional safety net:
    if result == nil or cast[int](result) == -1:
      raise newException(EInvalidEncoding,
        "cannot create encoding converter from " &
        srcEncoding & " to " & destEncoding)
  else:
    result.dest = nameToCodePage(destEncoding)
    result.src = nameToCodePage(srcEncoding)
    if int(result.dest) == -1:
      raise newException(EInvalidEncoding,
        "cannot find encoding " & destEncoding)
    if int(result.src) == -1:
      raise newException(EInvalidEncoding,
        "cannot find encoding " & srcEncoding)
|
||||
|
||||
proc close*(c: PConverter) =
  ## releases the resources held by the converter `c`. Only the iconv based
  ## implementation has something to release; on Windows this does nothing.
  when not defined(windows): iconvClose(c)
|
||||
|
||||
when defined(windows):
|
||||
|
||||
proc convert*(c: PConverter, s: string): string =
  ## converts `s` to `destEncoding` that was given to the converter `c`. It
  ## is assumed that `s` is in `srcEncoding`. The text is first converted to
  ## UTF-16 LE and from there to the destination code page. `OSError` is
  ## called if the Windows API reports a failure.

  # special case: empty string: needed because MultiByteToWideChar
  # returns 0 in case of error:
  if s.len == 0: return ""
  # educated guess of the number of UTF-16 code units needed:
  var cap = s.len + s.len shr 2
  # The buffer gets a real length (2 bytes per code unit) so that the API
  # only writes inside the string; it is trimmed afterwards.
  var wide = newString(cap*2)
  var m = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32,
                              lpMultiByteStr = cstring(s),
                              cbMultiByte = cint(s.len),
                              lpWideCharStr = cstring(wide),
                              cchWideChar = cint(cap))
  if m == 0:
    # try again; ask for capacity:
    cap = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32,
                              lpMultiByteStr = cstring(s),
                              cbMultiByte = cint(s.len),
                              lpWideCharStr = nil,
                              cchWideChar = cint(0))
    # and do the conversion properly:
    wide = newString(cap*2)
    m = MultiByteToWideChar(CodePage = c.src, dwFlags = 0'i32,
                            lpMultiByteStr = cstring(s),
                            cbMultiByte = cint(s.len),
                            lpWideCharStr = cstring(wide),
                            cchWideChar = cint(cap))
    if m == 0: OSError()
  assert(m <= cap) # the API never reports more than the buffer can hold
  setLen(wide, m*2)

  # if already utf-16 LE, no further need to do something:
  if int(c.dest) == 1200: return wide

  # otherwise the fun starts again, this time counting bytes:
  cap = s.len + s.len shr 2
  result = newString(cap)
  m = WideCharToMultiByte(
    CodePage = c.dest,
    dwFlags = 0'i32,
    lpWideCharStr = cstring(wide),
    cchWideChar = cint(wide.len div 2),
    lpMultiByteStr = cstring(result),
    cbMultiByte = cint(cap))
  if m == 0:
    # try again; ask for capacity:
    cap = WideCharToMultiByte(
      CodePage = c.dest,
      dwFlags = 0'i32,
      lpWideCharStr = cstring(wide),
      cchWideChar = cint(wide.len div 2),
      lpMultiByteStr = nil,
      cbMultiByte = cint(0))
    # and do the conversion properly:
    result = newString(cap)
    m = WideCharToMultiByte(
      CodePage = c.dest,
      dwFlags = 0'i32,
      lpWideCharStr = cstring(wide),
      cchWideChar = cint(wide.len div 2),
      lpMultiByteStr = cstring(result),
      cbMultiByte = cint(cap))
    if m == 0: OSError()
  assert(m <= cap)
  setLen(result, m)
|
||||
|
||||
else:
|
||||
|
||||
proc growOutput(buf: var string, dst: var cstring, outLen: var int,
                extra: int) =
  ## Enlarges `buf` by `extra` bytes. Because growing may move the string,
  ## the write position `dst` is re-derived from its offset into `buf`, and
  ## `outLen` is set to the space that is left behind that position.
  var offset = cast[int](dst) - cast[int](cstring(buf))
  setLen(buf, len(buf) + extra)
  dst = cast[cstring](cast[int](cstring(buf)) + offset)
  outLen = len(buf) - offset

proc convert*(c: PConverter, s: string): string =
  ## converts `s` to `destEncoding` that was given to the converter `c`. It
  ## is assumed that `s` is in `srcEncoding`. Bytes that iconv rejects
  ## (EILSEQ/EINVAL) are copied to the output unchanged; any other iconv
  ## error is reported via `OSError`.
  result = newString(s.len)
  var inLen = len(s)
  var outLen = len(result)
  var src = cstring(s)
  var dst = cstring(result)
  var iconvres: int
  while inLen > 0:
    iconvres = iconv(c, src, inLen, dst, outLen)
    if iconvres == -1:
      var lerr = errno
      if lerr == EILSEQ or lerr == EINVAL:
        # unknown char, skip: copy the byte verbatim, but make sure there
        # is room for it first
        if outLen < 1: growOutput(result, dst, outLen, inLen*2+5)
        dst[0] = src[0]
        src = cast[cstring](cast[int](src) + 1)
        dst = cast[cstring](cast[int](dst) + 1)
        dec(inLen)
        dec(outLen)
      elif lerr == E2BIG:
        # 5 is minimally one utf-8 char
        growOutput(result, dst, outLen, inLen*2+5)
      else:
        OSError()
  # iconv has a buffer that needs flushing, specially if the last char is
  # not '\0'. The result of *this* call decides whether to retry:
  iconvres = iconv(c, nil, nil, dst, outLen)
  if iconvres == -1 and errno == E2BIG:
    growOutput(result, dst, outLen, inLen*2+5)
    discard iconv(c, nil, nil, dst, outLen)
  # trim output buffer
  setLen(result, len(result) - outLen)
|
||||
|
||||
proc convert*(s: string, destEncoding = "UTF-8",
              srcEncoding = "CP1252"): string =
  ## converts `s` to `destEncoding`. It is assumed that `s` is in
  ## `srcEncoding`. A converter is opened, used once and closed again, which
  ## is more convenient but also likely less efficient than re-using a
  ## converter for several strings.
  var conv = open(destEncoding, srcEncoding)
  try:
    result = conv.convert(s)
  finally:
    conv.close()
|
||||
|
||||
when isMainModule:
  # Round trip: UTF-8 -> CP1252 and back with the default arguments.
  var original = "öäüß"
  var asCp1252 = convert(original, "CP1252", "UTF-8")
  echo(convert(asCp1252))
|
||||
|
||||
|
||||
5
todo.txt
5
todo.txt
@@ -4,10 +4,9 @@ High priority (version 0.8.12)
|
||||
|
||||
* add --deadlock_prevention:on|off switch? timeout for locks?
|
||||
* built-in serialization
|
||||
- bug: invoking a generic iterator twice triggers a code gen bug
|
||||
- bug: invoking a generic iterator twice triggers a code gen bug (titer2)
|
||||
- pegs: the anchor '^' does not work because many procs use a linear search
|
||||
and matchLen()
|
||||
- conversion between character sets
|
||||
|
||||
|
||||
version 0.9.0
|
||||
@@ -43,6 +42,7 @@ version 0.9.XX
|
||||
|
||||
- distinct types for array/seq indexes
|
||||
- GC: marker procs for native Nimrod GC and Boehm GC
|
||||
- code concerning 'assert' is wasteful and unnecessarily complex
|
||||
- implicit ref/ptr->var conversion; the compiler may store an object
|
||||
implicitly on the heap for write barrier efficiency
|
||||
- resizing of strings/sequences could take into account the memory that
|
||||
@@ -52,7 +52,6 @@ version 0.9.XX
|
||||
is hard because of partial evaluation --> symbol files will fix this as
|
||||
a side effect
|
||||
- EcmaScript needs a new and better code gen: simply adapt the C code gen to it
|
||||
- prefer proc in current module over other procs with same overloading result?
|
||||
- generalized case statement (requires better transf)
|
||||
- tlastmod returns wrong results on BSD (Linux, MacOS X: works)
|
||||
- nested tuple unpacking
|
||||
|
||||
@@ -54,6 +54,7 @@ Additions
|
||||
- Added ``intsets`` module which contains a specialized int set data type.
|
||||
- Added ``scgi`` module.
|
||||
- Added ``smtp`` module.
|
||||
- Added ``encodings`` module.
|
||||
- Added ``re.findAll``, ``pegs.findAll``.
|
||||
- Added ``os.findExe``.
|
||||
- Added ``parseutils.parseUntil`` and ``parseutils.parseWhile``.
|
||||
|
||||
@@ -39,7 +39,7 @@ srcdoc: "pure/xmlparser;pure/htmlparser;pure/xmltree;pure/colors"
|
||||
srcdoc: "pure/json;pure/base64;pure/scgi;pure/redis;impure/graphics"
|
||||
srcdoc: "impure/rdstdin;wrappers/zmq;wrappers/sphinx"
|
||||
srcdoc: "pure/collections/tables;pure/collections/sets;pure/collections/lists"
|
||||
srcdoc: "pure/collections/intsets"
|
||||
srcdoc: "pure/collections/intsets;pure/encodings"
|
||||
|
||||
webdoc: "wrappers/libcurl;pure/md5;wrappers/mysql;wrappers/iup"
|
||||
webdoc: "wrappers/sqlite3;wrappers/postgres;wrappers/tinyc"
|
||||
|
||||
Reference in New Issue
Block a user