import warnings
from pathlib import Path
from typing import Dict, Iterator, Optional

# Lazily built table for str.translate(): code point -> ASCII replacement.
# Stays None until preload_translator() is called for the first time.
Translator = None  # type: Optional[Dict[int, str]]


def preload_translator() -> Dict[int, str]:
    """Build (once) and return the code point -> replacement table.

    Every ``x*.py`` module that sits next to this file is imported and its
    ``data`` sequence is read.  The hexadecimal part of the module name is
    the high part of the code point (``x04e`` covers ``0x4e00``-``0x4eff``),
    the position inside ``data`` is the low byte.

    Only non-ASCII code points whose entry is a ``str`` are stored; entries
    of any other type (for example ``None``) are treated as "no replacement".

    The table is cached in the module-level ``Translator``; later calls
    return the very same dictionary object.
    """
    global Translator

    if Translator is None:
        table = {}  # type: Dict[int, str]

        for section_file in Path(__file__).parent.glob('x*.py'):
            stem = section_file.stem
            # 'x04e' -> '0x04e' -> 0x04e; shifted to become the first code
            # point of the 256-entry section.
            first_codepoint = int(f'0{stem}', base=16) << 8
            section = __import__(
                f'unidecode.{stem}', globals(), locals(), ['data']
            ).data

            for codepoint, repl in enumerate(section, first_codepoint):
                if codepoint > 127 and isinstance(repl, str):
                    table[codepoint] = repl

        # Publish only the fully built table.
        Translator = table

    return Translator


def _unidecode_translate_replace_iterator(string: str, replace_str: str) -> Iterator[int]:
    """Yield the byte values of *string* with non-ASCII characters replaced.

    ASCII characters are yielded as their ordinal; every other character is
    replaced by the UTF-8 bytes of *replace_str*.
    """
    replace_bytes = replace_str.encode()

    for char in string:
        char_ord = ord(char)

        if char_ord > 127:
            yield from replace_bytes
        else:
            yield char_ord


def _first_unmapped_index(string: str, table: Dict[int, str], default: int) -> int:
    """Return the index in *string* of the first non-ASCII char missing from *table*.

    *default* is returned when no such character exists.
    """
    for index, char in enumerate(string):
        codepoint = ord(char)

        if codepoint > 127 and codepoint not in table:
            return index

    return default


def unidecode_translate(
    string: str, errors: str = 'ignore', replace_str: str = '?', check_surrogates: bool = False
) -> str:
    """Transliterate an Unicode object into an ASCII string

    This method is usually faster than unidecode_expect_nonascii/unidecode,
    but it uses more memory.  To reduce first call time, invoke
    preload_translator to preload the translation table.

    >>> unidecode_translate("\u5317\u4EB0")
    'Bei Jing '

    *errors* selects what happens to characters without a replacement:
    'ignore' drops them, 'strict' raises UnidecodeError, 'replace' inserts
    *replace_str* and 'preserve' keeps the original character.  Any other
    value raises UnidecodeError.

    When *check_surrogates* is true, a RuntimeWarning is issued for every
    surrogate code point (U+D800-U+DFFF) found in *string*.

    See unidecode_expect_nonascii.
    """
    # Reject a bad mode before doing any work.
    if errors not in ('ignore', 'strict', 'replace', 'preserve'):
        raise UnidecodeError(f'invalid value for errors parameter {errors}')

    if check_surrogates:
        for char in string:
            if 0xd800 <= ord(char) <= 0xdfff:
                # repr() keeps the lone surrogate out of the message text.
                warnings.warn(
                    f'Surrogate character {char!r} will be ignored. '
                    'You might be using a narrow Python build.',
                    RuntimeWarning, 2
                )

    table = preload_translator()
    translated = string.translate(table)

    if errors == 'preserve':
        return translated

    if errors == 'replace' and replace_str != '?':
        # The codec's own 'replace' handler can only insert '?'.
        return bytes(_unidecode_translate_replace_iterator(translated, replace_str)).decode()

    try:
        # 'ignore' and 'replace' never raise here; only 'strict' can.
        return translated.encode('ascii', errors=errors).decode('ascii')

    except UnicodeEncodeError as exc:
        offending = exc.object[exc.start]
        # exc.start is an offset into the *translated* text, which is usually
        # longer than the input; report the position in the caller's string.
        index = _first_unmapped_index(string, table, exc.start)

        raise UnidecodeError(
            f'no replacement found for character {offending!r} '
            f'in position {index}',
            index
        ) from None