12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
- import re
- import unicodedata
- import regex
# Non-ASCII letters that "NFKD" normalization does not decompose, mapped to a
# case-preserving ASCII transliteration (lower-case keys map to lower-case
# text, upper-case keys map to upper-case text).
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    # Upper-case thorn maps to upper-case text like every other capital here;
    # it previously mapped to lower-case "th", which silently changed the case.
    "Þ": "TH",
    "ł": "l",
    "Ł": "L",
}
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and blank out markers, symbols, and punctuation.

    The input is decomposed with "NFKD" normalization, then every resulting
    character is handled by the first rule that matches:

    1. characters contained in `keep` pass through untouched;
    2. keys of ADDITIONAL_DIACRITICS are replaced by their mapped text;
    3. nonspacing marks (category "Mn") are dropped;
    4. any other mark, symbol, or punctuation (category starting with
       "M", "S", or "P") becomes a single space;
    5. everything else is kept as is.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKD", s):
        if ch in keep:
            pieces.append(ch)
            continue
        if ch in ADDITIONAL_DIACRITICS:
            pieces.append(ADDITIONAL_DIACRITICS[ch])
            continue

        category = unicodedata.category(ch)
        if category == "Mn":
            # combining accent split off by NFKD: drop it entirely
            continue
        pieces.append(" " if category[0] in "MSP" else ch)
    return "".join(pieces)
def remove_symbols(s: str):
    """
    Blank out markers, symbols, and punctuation while keeping diacritics.

    The input is first normalized with "NFKC"; every character whose Unicode
    category starts with "M", "S", or "P" is then replaced by a single space
    and all other characters are kept unchanged.
    """
    cleaned = []
    for ch in unicodedata.normalize("NFKC", s):
        is_symbol = unicodedata.category(ch)[0] in "MSP"
        cleaned.append(" " if is_symbol else ch)
    return "".join(cleaned)
class BasicTextNormalizer:
    """
    Callable text normalizer.

    Calling an instance lower-cases the text, deletes spans opened by "<" or
    "[" and closed by ">" or "]", deletes non-empty parenthesized spans, runs
    the configured cleaning function, optionally separates the pieces matched
    by the `regex` pattern "\\X" with spaces, and collapses whitespace runs.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # Choose the cleaning function once, so __call__ never branches on it.
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        text = s.lower()
        # remove words between brackets
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # remove words between parenthesis
        text = re.sub(r"\(([^)]+?)\)", "", text)
        text = self.clean(text).lower()

        if self.split_letters:
            pieces = regex.findall(r"\X", text, regex.U)
            text = " ".join(pieces)

        # replace any successive whitespace characters with a space
        return re.sub(r"\s+", " ", text)
|