@@ -1,7 +1,17 @@
+import pytest
+
 from whisper.tokenizer import get_tokenizer
 
 
-def test_tokenizer():
+@pytest.mark.parametrize("multilingual", [True, False])
+def test_tokenizer(multilingual):
+    tokenizer = get_tokenizer(multilingual=multilingual)
+    assert tokenizer.sot in tokenizer.sot_sequence
+    assert len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens)
+    assert all(c < tokenizer.timestamp_begin for c in tokenizer.all_language_tokens)
+
+
+def test_multilingual_tokenizer():
     gpt2_tokenizer = get_tokenizer(multilingual=False)
     multilingual_tokenizer = get_tokenizer(multilingual=True)
 
@@ -20,5 +30,5 @@ def test_split_on_unicode():
     tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
     words, word_tokens = multilingual_tokenizer.split_tokens_on_unicode(tokens)
 
-    assert words == [" elle", " est", " l", "'", "�", "é", "rit", "oire"]
+    assert words == [" elle", " est", " l", "'", "\ufffd", "é", "rit", "oire"]
     assert word_tokens == [[8404], [871], [287], [6], [246], [526], [3210], [20378]]
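
For a quick look at what these assertions check outside the test suite, the same attributes and methods can be inspected directly. The snippet below is an illustrative sketch, assuming a local `whisper` install; it uses only the names that already appear in the diff above (`sot`, `sot_sequence`, `all_language_codes`, `all_language_tokens`, `timestamp_begin`, `split_tokens_on_unicode`).

```python
from whisper.tokenizer import get_tokenizer

# Illustrative sketch, assuming whisper is installed locally.
tokenizer = get_tokenizer(multilingual=True)

# The start-of-transcript token appears in the SOT sequence.
print(tokenizer.sot in tokenizer.sot_sequence)

# One language token per language code, all below the timestamp token range.
print(len(tokenizer.all_language_codes) == len(tokenizer.all_language_tokens))
print(all(t < tokenizer.timestamp_begin for t in tokenizer.all_language_tokens))

# split_tokens_on_unicode pairs each decoded text fragment with the tokens
# that produced it, using the same token ids as the test above.
tokens = [8404, 871, 287, 6, 246, 526, 3210, 20378]
words, word_tokens = tokenizer.split_tokens_on_unicode(tokens)
print(list(zip(words, word_tokens)))
```

Because of the `parametrize` marker, pytest collects `test_tokenizer` twice, once with `multilingual=True` and once with `multilingual=False`.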