Nim/lib/pure/unidecode/unidecode.nim

#
#
#            Nim's Runtime Library
#        (c) Copyright 2012 Andreas Rumpf
#
#    See the file "copying.txt", included in this
#    distribution, for details about the copyright.
#

## This module is based on Python's [Unidecode](https://pypi.org/project/Unidecode/)
## module by Tomaz Solc, which in turn is based on the
## [Text::Unidecode](https://metacpan.org/pod/Text::Unidecode)
## Perl module by Sean M. Burke.
##
## It provides a `unidecode proc <#unidecode,string>`_ that does
## Unicode to ASCII transliterations: It finds the sequence of ASCII characters
## that is the closest approximation to the Unicode string.
##
## For example, the closest to string "Äußerst" in ASCII is "Ausserst". Some
## information is lost in this transformation, of course, since several Unicode
## strings can be transformed to the same ASCII representation. So this is a
## strictly one-way transformation. However, a human reader will probably
## still be able to guess from the context what the original string was.
##
## This module needs the data file `unidecode.dat` to work: This file is
## embedded as a resource into your application by default. You can also
## define the symbol `--define:noUnidecodeTable` during compile time and
## use the `loadUnidecodeTable proc <#loadUnidecodeTable>`_ to initialize
## this module.

import std/unicode

when defined(noUnidecodeTable):
  # No embedded data: the table starts out empty and is filled in at run
  # time by `loadUnidecodeTable`.
  # shared is fine for threading:
  var translationTable: seq[string]
else:
  import std/strutils

  # Embed the data file at compile time; every line of the file becomes one
  # entry of the table.
  const translationTable = slurp("unidecode/unidecode.dat").splitLines()

proc loadUnidecodeTable*(datafile = "unidecode.dat") =
  ## Loads the datafile that `unidecode <#unidecode,string>`_ needs to work.
  ## This is only required if the module was compiled with the
  ## `--define:noUnidecodeTable` switch. This needs to be called by the
  ## main thread before any thread can make a call to `unidecode`.
  ##
  ## Each line of `datafile` becomes one table entry, in file order. The
  ## table always holds at least `0xffff` entries; a file with more lines
  ## than that makes the table grow instead of raising an `IndexDefect`.
  ## Without `--define:noUnidecodeTable` this proc does nothing.
  when defined(noUnidecodeTable):
    # Start from a fresh table so that repeated calls do not accumulate.
    newSeq(translationTable, 0xffff)
    var count = 0
    for line in lines(datafile):
      if count < translationTable.len:
        translationTable[count] = line
      else:
        # More lines than preallocated slots: append rather than index
        # out of bounds.
        translationTable.add(line)
      inc(count)

proc unidecode*(s: string): string =
  ## Finds the sequence of ASCII characters that is the closest approximation
  ## to the UTF-8 string `s`.
  ##
  ## Runes up to 127 are copied through unchanged. Every other rune is
  ## replaced by the translation table entry at index `code point - 128`;
  ## runes whose index lies outside the table are dropped from the result.
  runnableExamples:
    doAssert unidecode("北京") == "Bei Jing "
    doAssert unidecode("Äußerst") == "Ausserst"

  # Capacity hint only; the result may end up shorter or longer than `s`.
  result = newStringOfCap(s.len)
  for r in runes(s):
    let c = int(r)
    if c <=% 127:
      add(result, chr(c))
    elif c >= 128:
      # Bounds-check the index that is actually used for the lookup (not the
      # raw code point), so the last 128 entries of the table stay reachable.
      # NOTE(review): the 128 offset means entry 0 is used for code point 128
      # -- confirm this matches the layout of unidecode.dat.
      let idx = c - 128
      if idx < translationTable.len:
        add(result, translationTable[idx])