basic.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. import re
  2. import unicodedata
  3. import regex
  4. # non-ASCII letters that are not separated by "NFKD" normalization
  5. ADDITIONAL_DIACRITICS = {
  6. "œ": "oe",
  7. "Œ": "OE",
  8. "ø": "o",
  9. "Ø": "O",
  10. "æ": "ae",
  11. "Æ": "AE",
  12. "ß": "ss",
  13. "ẞ": "SS",
  14. "đ": "d",
  15. "Đ": "D",
  16. "ð": "d",
  17. "Ð": "D",
  18. "þ": "th",
  19. "Þ": "th",
  20. "ł": "l",
  21. "Ł": "L",
  22. }
  23. def remove_symbols_and_diacritics(s: str, keep=""):
  24. """
  25. Replace any other markers, symbols, and punctuations with a space,
  26. and drop any diacritics (category 'Mn' and some manual mappings)
  27. """
  28. return "".join(
  29. c
  30. if c in keep
  31. else ADDITIONAL_DIACRITICS[c]
  32. if c in ADDITIONAL_DIACRITICS
  33. else ""
  34. if unicodedata.category(c) == "Mn"
  35. else " "
  36. if unicodedata.category(c)[0] in "MSP"
  37. else c
  38. for c in unicodedata.normalize("NFKD", s)
  39. )
  40. def remove_symbols(s: str):
  41. """
  42. Replace any other markers, symbols, punctuations with a space, keeping diacritics
  43. """
  44. return "".join(
  45. " " if unicodedata.category(c)[0] in "MSP" else c
  46. for c in unicodedata.normalize("NFKC", s)
  47. )
  48. class BasicTextNormalizer:
  49. def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
  50. self.clean = (
  51. remove_symbols_and_diacritics if remove_diacritics else remove_symbols
  52. )
  53. self.split_letters = split_letters
  54. def __call__(self, s: str):
  55. s = s.lower()
  56. s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
  57. s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
  58. s = self.clean(s).lower()
  59. if self.split_letters:
  60. s = " ".join(regex.findall(r"\X", s, regex.U))
  61. s = re.sub(
  62. r"\s+", " ", s
  63. ) # replace any successive whitespace characters with a space
  64. return s