Source code for medkit.text.utils.decoding
__all__ = ["get_ascii_from_unicode"]
import logging
import unidecode
def get_ascii_from_unicode(text: str, keep_length: bool = True, logger=None) -> str:
    """
    Return the (closest) ascii text when possible.

    Parameters
    ----------
    text:
        The unicode text to decode to ascii
    keep_length
        If True, every character whose ascii transliteration is not exactly
        one character long is kept unchanged in the returned string, so the
        output has the same length as `text` and each position still
        corresponds to the same input character. The returned string may
        therefore still contain non-ascii characters.
        If False, the whole text is transliterated and its length may change.
    logger
        Logger to pass if one wants to keep caller information.
        Defaults to this module's logger.

    Returns
    -------
    str
        The closest ascii text
    """
    if logger is None:
        logger = logging.getLogger(__name__)

    if not keep_length:
        return unidecode.unidecode(text)

    # Decide character by character rather than comparing total lengths:
    # a character transliterated to an empty string can compensate for one
    # that expands to several characters, leaving the total length equal
    # while shifting every following position.
    pieces = []
    special_chars = set()
    for char in text:
        if ord(char) < 128:
            # Already ascii, nothing to transliterate
            pieces.append(char)
            continue
        decoded = unidecode.unidecode(char)
        if len(decoded) == 1:
            pieces.append(decoded)
        else:
            # Transliteration would change the length: keep the original
            pieces.append(char)
            special_chars.add(char)
    output = "".join(pieces)

    if special_chars:
        logger.info(
            "Some characters can't be decoded to ascii without changing length."
            " Strategy is to keep these special characters: %s \n",
            special_chars,
        )
        logger.debug("original text:\t%s\ndecoded text:\t%s\n", text, output)
    return output