test_tokenizer.py

from whisper.tokenizer import get_tokenizer


def test_tokenizer():
    # The GPT-2 tokenizer backs the English-only models; the multilingual
    # tokenizer is the one used by Whisper's multilingual models.
    gpt2_tokenizer = get_tokenizer(multilingual=False)
    multilingual_tokenizer = get_tokenizer(multilingual=True)

    # Korean pangram, roughly: "I want to ride a squirrel's worn wheel."
    text = "다람쥐 헌 쳇바퀴에 타고파"
    gpt2_tokens = gpt2_tokenizer.encode(text)
    multilingual_tokens = multilingual_tokenizer.encode(text)

    # Both tokenizers must round-trip the text losslessly, and the
    # multilingual tokenizer should represent Korean in fewer tokens.
    assert gpt2_tokenizer.decode(gpt2_tokens) == text
    assert multilingual_tokenizer.decode(multilingual_tokens) == text
    assert len(gpt2_tokens) > len(multilingual_tokens)
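

# A minimal sketch for running the test as a standalone script; this
# entry point is an addition for convenience (the repo's tests are
# normally collected and run via pytest), and works here only because
# the test function takes no fixtures.
if __name__ == "__main__":
    test_tokenizer()
    print("test_tokenizer passed")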