2 년 전 · b5851c6c40
--- a/whisper/tokenizer.py
+++ b/whisper/tokenizer.py
@@ -6,7 +6,6 @@ from functools import cached_property, lru_cache
 
				 from typing import Dict, List, Optional, Tuple
			
 
				 
			
 
				 import tiktoken
			
 
				-from tiktoken_ext.openai_public import gpt2
			
 
				 
			
 
				 LANGUAGES = {
			
 
				     "en": "english",
			
@@ -352,7 +351,7 @@ def get_encoding(name: str = "gpt2"):
 
				     return tiktoken.Encoding(
			
 
				         name=os.path.basename(vocab_path),
			
 
				         explicit_n_vocab=n_vocab,
			
 
				-        pat_str=gpt2()["pat_str"],
			
 
				+        pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""",
			
 
				         mergeable_ranks=ranks,
			
 
				         special_tokens=special_tokens,
			
 
				     )