
Update tokenizer.py (#1163)

Jong Wook Kim committed 2 years ago · commit b5851c6c40
1 changed file with 1 addition and 2 deletions

whisper/tokenizer.py: +1 -2

@@ -6,7 +6,6 @@ from functools import cached_property, lru_cache
 from typing import Dict, List, Optional, Tuple
 
 import tiktoken
-from tiktoken_ext.openai_public import gpt2
 
 LANGUAGES = {
     "en": "english",
@@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"):
     return tiktoken.Encoding(
         name=os.path.basename(vocab_path),
         explicit_n_vocab=n_vocab,
-        pat_str=gpt2()["pat_str"],
+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
         mergeable_ranks=ranks,
         special_tokens=special_tokens,
     )
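
With the pattern inlined, whisper no longer needs the tiktoken_ext plugin import. To see what the pattern actually does, here is a minimal sketch (the name GPT2_PAT and the sample text are illustrative only); the \p{L} and \p{N} Unicode property classes require the third-party regex module rather than the standard re:

# Minimal sketch: the inlined pat_str pre-splits text into word-level
# chunks before BPE merging is applied to each chunk. Uses `regex`
# because `re` does not support \p{...} Unicode property classes.
import regex

GPT2_PAT = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

print(regex.findall(GPT2_PAT, "Hello world, it's 2023!"))
# ['Hello', ' world', ',', ' it', "'s", ' 2023', '!']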