| 
					
				 | 
			
			
				@@ -874,7 +874,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "from IPython.display import display, HTML\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "from whisper.tokenizer import get_tokenizer\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "from dtw import dtw\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        "from scipy.signal import medfilt\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "from scipy.ndimage import median_filter\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "%matplotlib inline\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "%config InlineBackend.figure_format = \"retina\"" 
			 | 
		
	
	
		
			
				| 
					
				 | 
			
			
				@@ -3610,7 +3610,7 @@ 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "    weights = torch.cat(QKs)  # layers * heads * tokens * frames    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "    weights = weights[:, :, :, : duration // AUDIO_SAMPLES_PER_TOKEN].cpu()\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				-        "    weights = medfilt(weights, (1, 1, 1, medfilt_width))\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				+        "    weights = median_filter(weights, (1, 1, 1, medfilt_width))\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "    weights = torch.tensor(weights * qk_scale).softmax(dim=-1)\n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "    \n", 
			 | 
		
	
		
			
				 | 
				 | 
			
			
				         "    w = weights / weights.norm(dim=-2, keepdim=True)\n", 
			 |