english.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
  1. import json
  2. import os
  3. import re
  4. from fractions import Fraction
  5. from typing import Iterator, List, Match, Optional, Union
  6. from more_itertools import windowed
  7. from .basic import remove_symbols_and_diacritics
  8. class EnglishNumberNormalizer:
  9. """
  10. Convert any spelled-out numbers into arabic numbers, while handling:
  11. - remove any commas
  12. - keep the suffixes such as: `1960s`, `274th`, `32nd`, etc.
  13. - spell out currency symbols after the number. e.g. `$20 million` -> `20000000 dollars`
  14. - spell out `one` and `ones`
  15. - interpret successive single-digit numbers as nominal: `one oh one` -> `101`
  16. """
  17. def __init__(self):
  18. super().__init__()
  19. self.zeros = {"o", "oh", "zero"}
  20. self.ones = {
  21. name: i
  22. for i, name in enumerate(
  23. [
  24. "one",
  25. "two",
  26. "three",
  27. "four",
  28. "five",
  29. "six",
  30. "seven",
  31. "eight",
  32. "nine",
  33. "ten",
  34. "eleven",
  35. "twelve",
  36. "thirteen",
  37. "fourteen",
  38. "fifteen",
  39. "sixteen",
  40. "seventeen",
  41. "eighteen",
  42. "nineteen",
  43. ],
  44. start=1,
  45. )
  46. }
  47. self.ones_plural = {
  48. "sixes" if name == "six" else name + "s": (value, "s")
  49. for name, value in self.ones.items()
  50. }
  51. self.ones_ordinal = {
  52. "zeroth": (0, "th"),
  53. "first": (1, "st"),
  54. "second": (2, "nd"),
  55. "third": (3, "rd"),
  56. "fifth": (5, "th"),
  57. "twelfth": (12, "th"),
  58. **{
  59. name + ("h" if name.endswith("t") else "th"): (value, "th")
  60. for name, value in self.ones.items()
  61. if value > 3 and value != 5 and value != 12
  62. },
  63. }
  64. self.ones_suffixed = {**self.ones_plural, **self.ones_ordinal}
  65. self.tens = {
  66. "twenty": 20,
  67. "thirty": 30,
  68. "forty": 40,
  69. "fifty": 50,
  70. "sixty": 60,
  71. "seventy": 70,
  72. "eighty": 80,
  73. "ninety": 90,
  74. }
  75. self.tens_plural = {
  76. name.replace("y", "ies"): (value, "s") for name, value in self.tens.items()
  77. }
  78. self.tens_ordinal = {
  79. name.replace("y", "ieth"): (value, "th")
  80. for name, value in self.tens.items()
  81. }
  82. self.tens_suffixed = {**self.tens_plural, **self.tens_ordinal}
  83. self.multipliers = {
  84. "hundred": 100,
  85. "thousand": 1_000,
  86. "million": 1_000_000,
  87. "billion": 1_000_000_000,
  88. "trillion": 1_000_000_000_000,
  89. "quadrillion": 1_000_000_000_000_000,
  90. "quintillion": 1_000_000_000_000_000_000,
  91. "sextillion": 1_000_000_000_000_000_000_000,
  92. "septillion": 1_000_000_000_000_000_000_000_000,
  93. "octillion": 1_000_000_000_000_000_000_000_000_000,
  94. "nonillion": 1_000_000_000_000_000_000_000_000_000_000,
  95. "decillion": 1_000_000_000_000_000_000_000_000_000_000_000,
  96. }
  97. self.multipliers_plural = {
  98. name + "s": (value, "s") for name, value in self.multipliers.items()
  99. }
  100. self.multipliers_ordinal = {
  101. name + "th": (value, "th") for name, value in self.multipliers.items()
  102. }
  103. self.multipliers_suffixed = {
  104. **self.multipliers_plural,
  105. **self.multipliers_ordinal,
  106. }
  107. self.decimals = {*self.ones, *self.tens, *self.zeros}
  108. self.preceding_prefixers = {
  109. "minus": "-",
  110. "negative": "-",
  111. "plus": "+",
  112. "positive": "+",
  113. }
  114. self.following_prefixers = {
  115. "pound": "£",
  116. "pounds": "£",
  117. "euro": "€",
  118. "euros": "€",
  119. "dollar": "$",
  120. "dollars": "$",
  121. "cent": "¢",
  122. "cents": "¢",
  123. }
  124. self.prefixes = set(
  125. list(self.preceding_prefixers.values())
  126. + list(self.following_prefixers.values())
  127. )
  128. self.suffixers = {
  129. "per": {"cent": "%"},
  130. "percent": "%",
  131. }
  132. self.specials = {"and", "double", "triple", "point"}
  133. self.words = set(
  134. [
  135. key
  136. for mapping in [
  137. self.zeros,
  138. self.ones,
  139. self.ones_suffixed,
  140. self.tens,
  141. self.tens_suffixed,
  142. self.multipliers,
  143. self.multipliers_suffixed,
  144. self.preceding_prefixers,
  145. self.following_prefixers,
  146. self.suffixers,
  147. self.specials,
  148. ]
  149. for key in mapping
  150. ]
  151. )
  152. self.literal_words = {"one", "ones"}
  153. def process_words(self, words: List[str]) -> Iterator[str]:
  154. prefix: Optional[str] = None
  155. value: Optional[Union[str, int]] = None
  156. skip = False
  157. def to_fraction(s: str):
  158. try:
  159. return Fraction(s)
  160. except ValueError:
  161. return None
  162. def output(result: Union[str, int]):
  163. nonlocal prefix, value
  164. result = str(result)
  165. if prefix is not None:
  166. result = prefix + result
  167. value = None
  168. prefix = None
  169. return result
  170. if len(words) == 0:
  171. return
  172. for prev, current, next in windowed([None] + words + [None], 3):
  173. if skip:
  174. skip = False
  175. continue
  176. next_is_numeric = next is not None and re.match(r"^\d+(\.\d+)?$", next)
  177. has_prefix = current[0] in self.prefixes
  178. current_without_prefix = current[1:] if has_prefix else current
  179. if re.match(r"^\d+(\.\d+)?$", current_without_prefix):
  180. # arabic numbers (potentially with signs and fractions)
  181. f = to_fraction(current_without_prefix)
  182. assert f is not None
  183. if value is not None:
  184. if isinstance(value, str) and value.endswith("."):
  185. # concatenate decimals / ip address components
  186. value = str(value) + str(current)
  187. continue
  188. else:
  189. yield output(value)
  190. prefix = current[0] if has_prefix else prefix
  191. if f.denominator == 1:
  192. value = f.numerator # store integers as int
  193. else:
  194. value = current_without_prefix
  195. elif current not in self.words:
  196. # non-numeric words
  197. if value is not None:
  198. yield output(value)
  199. yield output(current)
  200. elif current in self.zeros:
  201. value = str(value or "") + "0"
  202. elif current in self.ones:
  203. ones = self.ones[current]
  204. if value is None:
  205. value = ones
  206. elif isinstance(value, str) or prev in self.ones:
  207. if (
  208. prev in self.tens and ones < 10
  209. ): # replace the last zero with the digit
  210. assert value[-1] == "0"
  211. value = value[:-1] + str(ones)
  212. else:
  213. value = str(value) + str(ones)
  214. elif ones < 10:
  215. if value % 10 == 0:
  216. value += ones
  217. else:
  218. value = str(value) + str(ones)
  219. else: # eleven to nineteen
  220. if value % 100 == 0:
  221. value += ones
  222. else:
  223. value = str(value) + str(ones)
  224. elif current in self.ones_suffixed:
  225. # ordinal or cardinal; yield the number right away
  226. ones, suffix = self.ones_suffixed[current]
  227. if value is None:
  228. yield output(str(ones) + suffix)
  229. elif isinstance(value, str) or prev in self.ones:
  230. if prev in self.tens and ones < 10:
  231. assert value[-1] == "0"
  232. yield output(value[:-1] + str(ones) + suffix)
  233. else:
  234. yield output(str(value) + str(ones) + suffix)
  235. elif ones < 10:
  236. if value % 10 == 0:
  237. yield output(str(value + ones) + suffix)
  238. else:
  239. yield output(str(value) + str(ones) + suffix)
  240. else: # eleven to nineteen
  241. if value % 100 == 0:
  242. yield output(str(value + ones) + suffix)
  243. else:
  244. yield output(str(value) + str(ones) + suffix)
  245. value = None
  246. elif current in self.tens:
  247. tens = self.tens[current]
  248. if value is None:
  249. value = tens
  250. elif isinstance(value, str):
  251. value = str(value) + str(tens)
  252. else:
  253. if value % 100 == 0:
  254. value += tens
  255. else:
  256. value = str(value) + str(tens)
  257. elif current in self.tens_suffixed:
  258. # ordinal or cardinal; yield the number right away
  259. tens, suffix = self.tens_suffixed[current]
  260. if value is None:
  261. yield output(str(tens) + suffix)
  262. elif isinstance(value, str):
  263. yield output(str(value) + str(tens) + suffix)
  264. else:
  265. if value % 100 == 0:
  266. yield output(str(value + tens) + suffix)
  267. else:
  268. yield output(str(value) + str(tens) + suffix)
  269. elif current in self.multipliers:
  270. multiplier = self.multipliers[current]
  271. if value is None:
  272. value = multiplier
  273. elif isinstance(value, str) or value == 0:
  274. f = to_fraction(value)
  275. p = f * multiplier if f is not None else None
  276. if f is not None and p.denominator == 1:
  277. value = p.numerator
  278. else:
  279. yield output(value)
  280. value = multiplier
  281. else:
  282. before = value // 1000 * 1000
  283. residual = value % 1000
  284. value = before + residual * multiplier
  285. elif current in self.multipliers_suffixed:
  286. multiplier, suffix = self.multipliers_suffixed[current]
  287. if value is None:
  288. yield output(str(multiplier) + suffix)
  289. elif isinstance(value, str):
  290. f = to_fraction(value)
  291. p = f * multiplier if f is not None else None
  292. if f is not None and p.denominator == 1:
  293. yield output(str(p.numerator) + suffix)
  294. else:
  295. yield output(value)
  296. yield output(str(multiplier) + suffix)
  297. else: # int
  298. before = value // 1000 * 1000
  299. residual = value % 1000
  300. value = before + residual * multiplier
  301. yield output(str(value) + suffix)
  302. value = None
  303. elif current in self.preceding_prefixers:
  304. # apply prefix (positive, minus, etc.) if it precedes a number
  305. if value is not None:
  306. yield output(value)
  307. if next in self.words or next_is_numeric:
  308. prefix = self.preceding_prefixers[current]
  309. else:
  310. yield output(current)
  311. elif current in self.following_prefixers:
  312. # apply prefix (dollars, cents, etc.) only after a number
  313. if value is not None:
  314. prefix = self.following_prefixers[current]
  315. yield output(value)
  316. else:
  317. yield output(current)
  318. elif current in self.suffixers:
  319. # apply suffix symbols (percent -> '%')
  320. if value is not None:
  321. suffix = self.suffixers[current]
  322. if isinstance(suffix, dict):
  323. if next in suffix:
  324. yield output(str(value) + suffix[next])
  325. skip = True
  326. else:
  327. yield output(value)
  328. yield output(current)
  329. else:
  330. yield output(str(value) + suffix)
  331. else:
  332. yield output(current)
  333. elif current in self.specials:
  334. if next not in self.words and not next_is_numeric:
  335. # apply special handling only if the next word can be numeric
  336. if value is not None:
  337. yield output(value)
  338. yield output(current)
  339. elif current == "and":
  340. # ignore "and" after hundreds, thousands, etc.
  341. if prev not in self.multipliers:
  342. if value is not None:
  343. yield output(value)
  344. yield output(current)
  345. elif current == "double" or current == "triple":
  346. if next in self.ones or next in self.zeros:
  347. repeats = 2 if current == "double" else 3
  348. ones = self.ones.get(next, 0)
  349. value = str(value or "") + str(ones) * repeats
  350. skip = True
  351. else:
  352. if value is not None:
  353. yield output(value)
  354. yield output(current)
  355. elif current == "point":
  356. if next in self.decimals or next_is_numeric:
  357. value = str(value or "") + "."
  358. else:
  359. # should all have been covered at this point
  360. raise ValueError(f"Unexpected token: {current}")
  361. else:
  362. # all should have been covered at this point
  363. raise ValueError(f"Unexpected token: {current}")
  364. if value is not None:
  365. yield output(value)
  366. def preprocess(self, s: str):
  367. # replace "<number> and a half" with "<number> point five"
  368. results = []
  369. segments = re.split(r"\band\s+a\s+half\b", s)
  370. for i, segment in enumerate(segments):
  371. if len(segment.strip()) == 0:
  372. continue
  373. if i == len(segments) - 1:
  374. results.append(segment)
  375. else:
  376. results.append(segment)
  377. last_word = segment.rsplit(maxsplit=2)[-1]
  378. if last_word in self.decimals or last_word in self.multipliers:
  379. results.append("point five")
  380. else:
  381. results.append("and a half")
  382. s = " ".join(results)
  383. # put a space at number/letter boundary
  384. s = re.sub(r"([a-z])([0-9])", r"\1 \2", s)
  385. s = re.sub(r"([0-9])([a-z])", r"\1 \2", s)
  386. # but remove spaces which could be a suffix
  387. s = re.sub(r"([0-9])\s+(st|nd|rd|th|s)\b", r"\1\2", s)
  388. return s
  389. def postprocess(self, s: str):
  390. def combine_cents(m: Match):
  391. try:
  392. currency = m.group(1)
  393. integer = m.group(2)
  394. cents = int(m.group(3))
  395. return f"{currency}{integer}.{cents:02d}"
  396. except ValueError:
  397. return m.string
  398. def extract_cents(m: Match):
  399. try:
  400. return f"¢{int(m.group(1))}"
  401. except ValueError:
  402. return m.string
  403. # apply currency postprocessing; "$2 and ¢7" -> "$2.07"
  404. s = re.sub(r"([€£$])([0-9]+) (?:and )?¢([0-9]{1,2})\b", combine_cents, s)
  405. s = re.sub(r"[€£$]0.([0-9]{1,2})\b", extract_cents, s)
  406. # write "one(s)" instead of "1(s)", just for the readability
  407. s = re.sub(r"\b1(s?)\b", r"one\1", s)
  408. return s
  409. def __call__(self, s: str):
  410. s = self.preprocess(s)
  411. s = " ".join(word for word in self.process_words(s.split()) if word is not None)
  412. s = self.postprocess(s)
  413. return s
  414. class EnglishSpellingNormalizer:
  415. """
  416. Applies British-American spelling mappings as listed in [1].
  417. [1] https://www.tysto.com/uk-us-spelling-list.html
  418. """
  419. def __init__(self):
  420. mapping_path = os.path.join(os.path.dirname(__file__), "english.json")
  421. self.mapping = json.load(open(mapping_path))
  422. def __call__(self, s: str):
  423. return " ".join(self.mapping.get(word, word) for word in s.split())
  424. class EnglishTextNormalizer:
  425. def __init__(self):
  426. self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
  427. self.replacers = {
  428. # common contractions
  429. r"\bwon't\b": "will not",
  430. r"\bcan't\b": "can not",
  431. r"\blet's\b": "let us",
  432. r"\bain't\b": "aint",
  433. r"\by'all\b": "you all",
  434. r"\bwanna\b": "want to",
  435. r"\bgotta\b": "got to",
  436. r"\bgonna\b": "going to",
  437. r"\bi'ma\b": "i am going to",
  438. r"\bimma\b": "i am going to",
  439. r"\bwoulda\b": "would have",
  440. r"\bcoulda\b": "could have",
  441. r"\bshoulda\b": "should have",
  442. r"\bma'am\b": "madam",
  443. # contractions in titles/prefixes
  444. r"\bmr\b": "mister ",
  445. r"\bmrs\b": "missus ",
  446. r"\bst\b": "saint ",
  447. r"\bdr\b": "doctor ",
  448. r"\bprof\b": "professor ",
  449. r"\bcapt\b": "captain ",
  450. r"\bgov\b": "governor ",
  451. r"\bald\b": "alderman ",
  452. r"\bgen\b": "general ",
  453. r"\bsen\b": "senator ",
  454. r"\brep\b": "representative ",
  455. r"\bpres\b": "president ",
  456. r"\brev\b": "reverend ",
  457. r"\bhon\b": "honorable ",
  458. r"\basst\b": "assistant ",
  459. r"\bassoc\b": "associate ",
  460. r"\blt\b": "lieutenant ",
  461. r"\bcol\b": "colonel ",
  462. r"\bjr\b": "junior ",
  463. r"\bsr\b": "senior ",
  464. r"\besq\b": "esquire ",
  465. # prefect tenses, ideally it should be any past participles, but it's harder..
  466. r"'d been\b": " had been",
  467. r"'s been\b": " has been",
  468. r"'d gone\b": " had gone",
  469. r"'s gone\b": " has gone",
  470. r"'d done\b": " had done", # "'s done" is ambiguous
  471. r"'s got\b": " has got",
  472. # general contractions
  473. r"n't\b": " not",
  474. r"'re\b": " are",
  475. r"'s\b": " is",
  476. r"'d\b": " would",
  477. r"'ll\b": " will",
  478. r"'t\b": " not",
  479. r"'ve\b": " have",
  480. r"'m\b": " am",
  481. }
  482. self.standardize_numbers = EnglishNumberNormalizer()
  483. self.standardize_spellings = EnglishSpellingNormalizer()
  484. def __call__(self, s: str):
  485. s = s.lower()
  486. s = re.sub(r"[<\[][^>\]]*[>\]]", "", s) # remove words between brackets
  487. s = re.sub(r"\(([^)]+?)\)", "", s) # remove words between parenthesis
  488. s = re.sub(self.ignore_patterns, "", s)
  489. s = re.sub(r"\s+'", "'", s) # when there's a space before an apostrophe
  490. for pattern, replacement in self.replacers.items():
  491. s = re.sub(pattern, replacement, s)
  492. s = re.sub(r"(\d),(\d)", r"\1\2", s) # remove commas between digits
  493. s = re.sub(r"\.([^0-9]|$)", r" \1", s) # remove periods not followed by numbers
  494. s = remove_symbols_and_diacritics(s, keep=".%$¢€£") # keep numeric symbols
  495. s = self.standardize_numbers(s)
  496. s = self.standardize_spellings(s)
  497. # now remove prefix/suffix symbols that are not preceded/followed by numbers
  498. s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
  499. s = re.sub(r"([^0-9])%", r"\1 ", s)
  500. s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space
  501. return s