test_normalizer.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596
  1. import pytest
  2. from whisper.normalizers import EnglishTextNormalizer
  3. from whisper.normalizers.english import (
  4. EnglishNumberNormalizer,
  5. EnglishSpellingNormalizer,
  6. )
  7. @pytest.mark.parametrize("std", [EnglishNumberNormalizer(), EnglishTextNormalizer()])
  8. def test_number_normalizer(std):
  9. assert std("two") == "2"
  10. assert std("thirty one") == "31"
  11. assert std("five twenty four") == "524"
  12. assert std("nineteen ninety nine") == "1999"
  13. assert std("twenty nineteen") == "2019"
  14. assert std("two point five million") == "2500000"
  15. assert std("four point two billions") == "4200000000s"
  16. assert std("200 thousand") == "200000"
  17. assert std("200 thousand dollars") == "$200000"
  18. assert std("$20 million") == "$20000000"
  19. assert std("€52.4 million") == "€52400000"
  20. assert std("£77 thousands") == "£77000s"
  21. assert std("two double o eight") == "2008"
  22. assert std("three thousand twenty nine") == "3029"
  23. assert std("forty three thousand two hundred sixty") == "43260"
  24. assert std("forty three thousand two hundred and sixty") == "43260"
  25. assert std("nineteen fifties") == "1950s"
  26. assert std("thirty first") == "31st"
  27. assert std("thirty three thousand and three hundred and thirty third") == "33333rd"
  28. assert std("three billion") == "3000000000"
  29. assert std("millions") == "1000000s"
  30. assert std("july third twenty twenty") == "july 3rd 2020"
  31. assert std("august twenty sixth twenty twenty one") == "august 26th 2021"
  32. assert std("3 14") == "3 14"
  33. assert std("3.14") == "3.14"
  34. assert std("3 point 2") == "3.2"
  35. assert std("3 point 14") == "3.14"
  36. assert std("fourteen point 4") == "14.4"
  37. assert std("two point two five dollars") == "$2.25"
  38. assert std("two hundred million dollars") == "$200000000"
  39. assert std("$20.1 million") == "$20100000"
  40. assert std("ninety percent") == "90%"
  41. assert std("seventy six per cent") == "76%"
  42. assert std("double oh seven") == "007"
  43. assert std("double zero seven") == "007"
  44. assert std("nine one one") == "911"
  45. assert std("nine double one") == "911"
  46. assert std("one triple oh one") == "10001"
  47. assert std("two thousandth") == "2000th"
  48. assert std("thirty two thousandth") == "32000th"
  49. assert std("minus 500") == "-500"
  50. assert std("positive twenty thousand") == "+20000"
  51. assert std("two dollars and seventy cents") == "$2.70"
  52. assert std("3 cents") == "¢3"
  53. assert std("$0.36") == "¢36"
  54. assert std("three euros and sixty five cents") == "€3.65"
  55. assert std("three and a half million") == "3500000"
  56. assert std("forty eight and a half dollars") == "$48.5"
  57. assert std("b747") == "b 747"
  58. assert std("10 th") == "10th"
  59. assert std("10th") == "10th"
  60. def test_spelling_normalizer():
  61. std = EnglishSpellingNormalizer()
  62. assert std("mobilisation") == "mobilization"
  63. assert std("cancelation") == "cancellation"
  64. def test_text_normalizer():
  65. std = EnglishTextNormalizer()
  66. assert std("Let's") == "let us"
  67. assert std("he's like") == "he is like"
  68. assert std("she's been like") == "she has been like"
  69. assert std("10km") == "10 km"
  70. assert std("10mm") == "10 mm"
  71. assert std("RC232") == "rc 232"
  72. assert (
  73. std("Mr. Park visited Assoc. Prof. Kim Jr.")
  74. == "mister park visited associate professor kim junior"
  75. )