The commands below will install the Python packages needed to use Whisper models and evaluate the transcription results.
! pip install git+https://github.com/openai/whisper.git
! pip install jiwer
The following will load the test-clean split of the LibriSpeech corpus using torchaudio.
import os
import numpy as np
try:
import tensorflow # required in Colab to avoid protobuf compatibility issues
except ImportError:
pass
import torch
import pandas as pd
import whisper
import torchaudio
from tqdm.notebook import tqdm
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
class LibriSpeech(torch.utils.data.Dataset):
    """
    Thin wrapper over torchaudio's LibriSpeech yielding (log-Mel, transcript) pairs.

    Each waveform is padded or trimmed to 30 seconds, so the tail of the
    small number of utterances that run longer than that is discarded.
    """

    def __init__(self, split="test-clean", device=DEVICE):
        self.device = device
        # The corpus is cached under ~/.cache and downloaded on first use.
        cache_root = os.path.expanduser("~/.cache")
        self.dataset = torchaudio.datasets.LIBRISPEECH(
            root=cache_root,
            url=split,
            download=True,
        )

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        # Only the first three fields of each record are used here.
        waveform, sample_rate, text, _, _, _ = self.dataset[item]
        # Only 16 kHz audio is accepted.
        assert sample_rate == 16000

        samples = waveform.flatten()
        fixed_length = whisper.pad_or_trim(samples)
        on_device = fixed_length.to(self.device)
        return (whisper.log_mel_spectrogram(on_device), text)
# Build the evaluation set and serve it in batches of 16 utterances.
dataset = LibriSpeech(split="test-clean")
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=16)
The following will take a few minutes to transcribe all utterances in the dataset.
# Load the "base.en" checkpoint and report what kind of model it is.
model = whisper.load_model("base.en")

# Count parameters with Tensor.numel() so the total is always an exact int.
# np.prod over a shape yields a float for a 0-dim tensor, which would turn the
# sum into a float and print a trailing ".0" in the comma-separated output.
num_parameters = sum(p.numel() for p in model.parameters())

print(
    f"Model is {'multilingual' if model.is_multilingual else 'English-only'} "
    f"and has {num_parameters:,} parameters."
)
Model is English-only and has 71,825,408 parameters.
# Short-form transcription: decode as English and predict without timestamps.
options = whisper.DecodingOptions(without_timestamps=True, language="en")

# Model outputs and ground-truth transcripts, accumulated in dataset order.
hypotheses, references = [], []

for mels, texts in tqdm(loader):
    decoded = model.decode(mels, options)
    hypotheses.extend(result.text for result in decoded)
    references.extend(texts)
0%| | 0/164 [00:00<?, ?it/s]
# Put each prediction next to its reference transcript, one row per utterance.
data = pd.DataFrame({"hypothesis": hypotheses, "reference": references})
data
| | hypothesis | reference |
---|---|---|
0 | He hoped there would be stew for dinner, turni... | HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... |
1 | Stuffered into you, his belly counseled him. | STUFF IT INTO YOU HIS BELLY COUNSELLED HIM |
2 | After early nightfall the yellow lamps would l... | AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... |
3 | Hello Bertie, any good in your mind? | HELLO BERTIE ANY GOOD IN YOUR MIND |
4 | Number 10. Fresh Nelly is waiting on you. Good... | NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... |
... | ... | ... |
2615 | Oh, to shoot my soul's full meaning into futur... | OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... |
2616 | Then I, long tried by natural ills, received t... | THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... |
2617 | I love thee freely as men strive for right. I ... | I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... |
2618 | I love thee with the passion put to use, in my... | I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... |
2619 | I love thee with the love I seemed to lose wit... | I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... |
2620 rows × 2 columns
Now, we use our English normalizer implementation to standardize the transcription and calculate the WER.
import jiwer
from whisper.normalizers import EnglishTextNormalizer
# Apply the same English text normalizer to predictions and references,
# storing the results in new "<column>_clean" columns.
normalizer = EnglishTextNormalizer()
for column in ("hypothesis", "reference"):
    data[f"{column}_clean"] = [normalizer(text) for text in data[column]]
data
| | hypothesis | reference | hypothesis_clean | reference_clean |
---|---|---|---|---|
0 | He hoped there would be stew for dinner, turni... | HE HOPED THERE WOULD BE STEW FOR DINNER TURNIP... | he hoped there would be stew for dinner turnip... | he hoped there would be stew for dinner turnip... |
1 | Stuffered into you, his belly counseled him. | STUFF IT INTO YOU HIS BELLY COUNSELLED HIM | stuffered into you his belly counseled him | stuff it into you his belly counseled him |
2 | After early nightfall the yellow lamps would l... | AFTER EARLY NIGHTFALL THE YELLOW LAMPS WOULD L... | after early nightfall the yellow lamps would l... | after early nightfall the yellow lamps would l... |
3 | Hello Bertie, any good in your mind? | HELLO BERTIE ANY GOOD IN YOUR MIND | hello bertie any good in your mind | hello bertie any good in your mind |
4 | Number 10. Fresh Nelly is waiting on you. Good... | NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD ... | number 10 fresh nelly is waiting on you good n... | number 10 fresh nelly is waiting on you good n... |
... | ... | ... | ... | ... |
2615 | Oh, to shoot my soul's full meaning into futur... | OH TO SHOOT MY SOUL'S FULL MEANING INTO FUTURE... | 0 to shoot my soul is full meaning into future... | 0 to shoot my soul is full meaning into future... |
2616 | Then I, long tried by natural ills, received t... | THEN I LONG TRIED BY NATURAL ILLS RECEIVED THE... | then i long tried by natural ills received the... | then i long tried by natural ills received the... |
2617 | I love thee freely as men strive for right. I ... | I LOVE THEE FREELY AS MEN STRIVE FOR RIGHT I L... | i love thee freely as men strive for right i l... | i love thee freely as men strive for right i l... |
2618 | I love thee with the passion put to use, in my... | I LOVE THEE WITH THE PASSION PUT TO USE IN MY ... | i love thee with the passion put to use in my ... | i love thee with the passion put to use in my ... |
2619 | I love thee with the love I seemed to lose wit... | I LOVE THEE WITH A LOVE I SEEMED TO LOSE WITH ... | i love thee with the love i seemed to lose wit... | i love thee with a love i seemed to lose with ... |
2620 rows × 4 columns
# Word error rate over the whole test set, computed on the normalized text.
reference_texts = list(data["reference_clean"])
hypothesis_texts = list(data["hypothesis_clean"])
wer = jiwer.wer(reference_texts, hypothesis_texts)
print(f"WER: {wer * 100:.2f} %")
WER: 4.26 %