Varia's website https://varia.zone
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

139 lines
4.1 KiB

# -*- coding: utf-8 -*-
# vi:tabstop=4:expandtab:sw=4
"""Transliterate Unicode text into plain 7-bit ASCII.
Example usage:
>>> from unidecode import unidecode
>>> unidecode("\u5317\u4EB0")
'Bei Jing '
The transliteration uses a straightforward map, and doesn't have alternatives
for the same character based on language, position, or anything else.
A standard string object will be returned. If you need bytes, use:
>>> unidecode("Κνωσός").encode("ascii")
b'Knosos'
"""
import warnings
from typing import Dict, Optional, Sequence
# Per-section cache of replacement tables, keyed by the high bits of the
# code point (codepoint >> 8). A value of None records that no table module
# could be imported for that section, so the import is not retried.
Cache = {} # type: Dict[int, Optional[Sequence[Optional[str]]]]
class UnidecodeError(ValueError):
    """Raised for Unidecode-related errors.

    The index attribute contains the index of the character that caused
    the error, or None when no character index was supplied.
    """

    def __init__(self, message: str, index: Optional[int] = None) -> None:
        """Initialise the error.

        message is the human-readable description passed on to ValueError.
        index is the position of the offending character in the input
        string, if known.
        """
        super().__init__(message)
        self.index = index
def unidecode_expect_ascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate an Unicode object into an ASCII string

    >>> unidecode("\u5317\u4EB0")
    'Bei Jing '

    This function first tries to convert the string using ASCII codec.
    If it fails (because of non-ASCII characters), it falls back to
    transliteration using the character tables.

    This is approx. five times faster if the string only contains ASCII
    characters, but slightly slower than unidecode_expect_nonascii if
    non-ASCII characters are present.

    errors specifies what to do with characters that have not been
    found in replacement tables. The default is 'ignore' which ignores
    the character. 'strict' raises an UnidecodeError. 'replace'
    substitutes the character with replace_str (default is '?').
    'preserve' keeps the original character.

    Note that if 'preserve' is used the returned string might not be
    ASCII!
    """
    try:
        # The encode call is used purely as an "is this all ASCII?" probe;
        # the resulting bytes are intentionally discarded.
        string.encode('ASCII')
    except UnicodeEncodeError:
        # Fall through to the table-based path below. The fallback is not
        # called from inside this handler so that errors it raises are not
        # chained to the UnicodeEncodeError.
        pass
    else:
        # Pure ASCII input needs no transliteration; return it unchanged.
        return string

    return _unidecode(string, errors, replace_str)
def unidecode_expect_nonascii(string: str, errors: str = 'ignore', replace_str: str = '?') -> str:
    """Transliterate an Unicode object into an ASCII string

    The string is handed straight to the table-based transliteration,
    without first probing whether it is already pure ASCII.

    The errors and replace_str arguments have the same meaning as in
    unidecode_expect_ascii.
    """
    transliterated = _unidecode(string, errors, replace_str)
    return transliterated
# Default public entry point: an alias for unidecode_expect_ascii.
unidecode = unidecode_expect_ascii
def _get_repl_str(char: str) -> Optional[str]:
codepoint = ord(char)
if codepoint < 0x80:
# Already ASCII
return str(char)
if codepoint > 0xeffff:
# No data on characters in Private Use Area and above.
return None
if 0xd800 <= codepoint <= 0xdfff:
warnings.warn( "Surrogate character %r will be ignored. "
"You might be using a narrow Python build." % (char,),
RuntimeWarning, 2)
section = codepoint >> 8 # Chop off the last two hex digits
position = codepoint % 256 # Last two hex digits
try:
table = Cache[section]
except KeyError:
try:
mod = __import__('unidecode.x%03x'%(section), globals(), locals(), ['data'])
except ImportError:
# No data on this character
Cache[section] = None
return None
Cache[section] = table = mod.data
if table and len(table) > position:
return table[position]
else:
return None
def _unidecode(string: str, errors: str, replace_str:str) -> str:
    """Transliterate string character by character and join the results.

    Each character is passed to _get_repl_str. When that yields None the
    errors argument decides what happens: 'ignore' drops the character,
    'strict' raises UnidecodeError carrying the character's index,
    'replace' substitutes replace_str and 'preserve' keeps the original
    character. Any other value raises UnidecodeError, but only once a
    character without a replacement is actually encountered.
    """
    pieces = []

    for position, character in enumerate(string):
        replacement = _get_repl_str(character)

        if replacement is not None:
            # Common case: a replacement was found.
            pieces.append(replacement)
            continue

        # No replacement available: apply the requested error policy.
        if errors == 'ignore':
            replacement = ''
        elif errors == 'strict':
            raise UnidecodeError('no replacement found for character %r '
                                 'in position %d' % (character, position), position)
        elif errors == 'replace':
            replacement = replace_str
        elif errors == 'preserve':
            replacement = character
        else:
            raise UnidecodeError('invalid value for errors parameter %r' % (errors,))

        pieces.append(replacement)

    return ''.join(pieces)