# transcribe.py

  1. import argparse
  2. import os
  3. import warnings
  4. from typing import TYPE_CHECKING, Optional, Tuple, Union
  5. import numpy as np
  6. import torch
  7. import tqdm
  8. from .audio import (
  9. FRAMES_PER_SECOND,
  10. HOP_LENGTH,
  11. N_FRAMES,
  12. SAMPLE_RATE,
  13. log_mel_spectrogram,
  14. pad_or_trim,
  15. )
  16. from .decoding import DecodingOptions, DecodingResult
  17. from .timing import add_word_timestamps
  18. from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
  19. from .utils import (
  20. exact_div,
  21. format_timestamp,
  22. get_writer,
  23. make_safe,
  24. optional_float,
  25. optional_int,
  26. str2bool,
  27. )
  28. if TYPE_CHECKING:
  29. from .model import Whisper
  30. def transcribe(
  31. model: "Whisper",
  32. audio: Union[str, np.ndarray, torch.Tensor],
  33. *,
  34. verbose: Optional[bool] = None,
  35. temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
  36. compression_ratio_threshold: Optional[float] = 2.4,
  37. logprob_threshold: Optional[float] = -1.0,
  38. no_speech_threshold: Optional[float] = 0.6,
  39. condition_on_previous_text: bool = True,
  40. initial_prompt: Optional[str] = None,
  41. word_timestamps: bool = False,
  42. prepend_punctuations: str = "\"'“¿([{-",
  43. append_punctuations: str = "\"'.。,,!!??::”)]}、",
  44. **decode_options,
  45. ):
  46. """
  47. Transcribe an audio file using Whisper
  48. Parameters
  49. ----------
  50. model: Whisper
  51. The Whisper model instance
  52. audio: Union[str, np.ndarray, torch.Tensor]
  53. The path to the audio file to open, or the audio waveform
  54. verbose: bool
  55. Whether to display the text being decoded to the console. If True, displays all the details,
  56. If False, displays minimal details. If None, does not display anything
  57. temperature: Union[float, Tuple[float, ...]]
  58. Temperature for sampling. It can be a tuple of temperatures, which will be successively used
  59. upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
  60. compression_ratio_threshold: float
  61. If the gzip compression ratio is above this value, treat as failed
  62. logprob_threshold: float
  63. If the average log probability over sampled tokens is below this value, treat as failed
  64. no_speech_threshold: float
  65. If the no_speech probability is higher than this value AND the average log probability
  66. over sampled tokens is below `logprob_threshold`, consider the segment as silent
  67. condition_on_previous_text: bool
  68. if True, the previous output of the model is provided as a prompt for the next window;
  69. disabling may make the text inconsistent across windows, but the model becomes less prone to
  70. getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
  71. word_timestamps: bool
  72. Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
  73. and include the timestamps for each word in each segment.
  74. prepend_punctuations: str
  75. If word_timestamps is True, merge these punctuation symbols with the next word
  76. append_punctuations: str
  77. If word_timestamps is True, merge these punctuation symbols with the previous word
  78. initial_prompt: Optional[str]
  79. Optional text to provide as a prompt for the first window. This can be used to provide, or
  80. "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
  81. to make it more likely to predict those word correctly.
  82. decode_options: dict
  83. Keyword arguments to construct `DecodingOptions` instances
  84. Returns
  85. -------
  86. A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
  87. the spoken language ("language"), which is detected when `decode_options["language"]` is None.
  88. """
  89. dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
  90. if model.device == torch.device("cpu"):
  91. if torch.cuda.is_available():
  92. warnings.warn("Performing inference on CPU when CUDA is available")
  93. if dtype == torch.float16:
  94. warnings.warn("FP16 is not supported on CPU; using FP32 instead")
  95. dtype = torch.float32
  96. if dtype == torch.float32:
  97. decode_options["fp16"] = False
  98. mel = log_mel_spectrogram(audio)
  99. if decode_options.get("language", None) is None:
  100. if not model.is_multilingual:
  101. decode_options["language"] = "en"
  102. else:
  103. if verbose:
  104. print(
  105. "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
  106. )
  107. mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
  108. _, probs = model.detect_language(mel_segment)
  109. decode_options["language"] = max(probs, key=probs.get)
  110. if verbose is not None:
  111. print(
  112. f"Detected language: {LANGUAGES[decode_options['language']].title()}"
  113. )
  114. language: str = decode_options["language"]
  115. task: str = decode_options.get("task", "transcribe")
  116. tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
  117. if word_timestamps and task == "translate":
  118. warnings.warn("Word-level timestamps on translations may not be reliable.")
  119. def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
  120. temperatures = (
  121. [temperature] if isinstance(temperature, (int, float)) else temperature
  122. )
  123. decode_result = None
  124. for t in temperatures:
  125. kwargs = {**decode_options}
  126. if t > 0:
  127. # disable beam_size and patience when t > 0
  128. kwargs.pop("beam_size", None)
  129. kwargs.pop("patience", None)
  130. else:
  131. # disable best_of when t == 0
  132. kwargs.pop("best_of", None)
  133. options = DecodingOptions(**kwargs, temperature=t)
  134. decode_result = model.decode(segment, options)
  135. needs_fallback = False
  136. if (
  137. compression_ratio_threshold is not None
  138. and decode_result.compression_ratio > compression_ratio_threshold
  139. ):
  140. needs_fallback = True # too repetitive
  141. if (
  142. logprob_threshold is not None
  143. and decode_result.avg_logprob < logprob_threshold
  144. ):
  145. needs_fallback = True # average log probability is too low
  146. if not needs_fallback:
  147. break
  148. return decode_result
  149. seek = 0
  150. input_stride = exact_div(
  151. N_FRAMES, model.dims.n_audio_ctx
  152. ) # mel frames per output token: 2
  153. time_precision = (
  154. input_stride * HOP_LENGTH / SAMPLE_RATE
  155. ) # time per output token: 0.02 (seconds)
  156. all_tokens = []
  157. all_segments = []
  158. prompt_reset_since = 0
  159. if initial_prompt is not None:
  160. initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
  161. all_tokens.extend(initial_prompt_tokens)
  162. else:
  163. initial_prompt_tokens = []
  164. def new_segment(
  165. *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
  166. ):
  167. text_tokens = [token for token in tokens.tolist() if token < tokenizer.eot]
  168. return {
  169. "id": len(all_segments),
  170. "seek": seek,
  171. "start": start,
  172. "end": end,
  173. "text": tokenizer.decode(text_tokens),
  174. "tokens": text_tokens,
  175. "temperature": result.temperature,
  176. "avg_logprob": result.avg_logprob,
  177. "compression_ratio": result.compression_ratio,
  178. "no_speech_prob": result.no_speech_prob,
  179. }
  180. # show the progress bar when verbose is False (if True, transcribed text will be printed)
  181. num_frames = mel.shape[-1]
  182. with tqdm.tqdm(
  183. total=num_frames, unit="frames", disable=verbose is not False
  184. ) as pbar:
  185. while seek < num_frames:
  186. time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
  187. mel_segment = mel[:, seek:]
  188. segment_size = min(mel_segment.shape[-1], N_FRAMES)
  189. segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
  190. mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
  191. decode_options["prompt"] = all_tokens[prompt_reset_since:]
  192. result: DecodingResult = decode_with_fallback(mel_segment)
  193. tokens = torch.tensor(result.tokens)
  194. if no_speech_threshold is not None:
  195. # no voice activity check
  196. should_skip = result.no_speech_prob > no_speech_threshold
  197. if (
  198. logprob_threshold is not None
  199. and result.avg_logprob > logprob_threshold
  200. ):
  201. # don't skip if the logprob is high enough, despite the no_speech_prob
  202. should_skip = False
  203. if should_skip:
  204. seek += segment_size # fast-forward to the next segment boundary
  205. continue
  206. previous_seek = seek
  207. current_segments = []
  208. current_tokens = []
  209. timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
  210. consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[
  211. 0
  212. ].add_(1)
  213. if (
  214. len(consecutive) > 0
  215. ): # if the output contains two consecutive timestamp tokens
  216. if ended_with_single_timestamp := timestamp_tokens[-2:].tolist() == [
  217. False,
  218. True,
  219. ]:
  220. consecutive = consecutive.tolist() + [len(tokens)]
  221. last_slice = 0
  222. for current_slice in consecutive:
  223. sliced_tokens = tokens[last_slice:current_slice]
  224. start_timestamp_pos = (
  225. sliced_tokens[0].item() - tokenizer.timestamp_begin
  226. )
  227. end_timestamp_pos = (
  228. sliced_tokens[-1].item() - tokenizer.timestamp_begin
  229. )
  230. current_segments.append(
  231. new_segment(
  232. start=time_offset + start_timestamp_pos * time_precision,
  233. end=time_offset + end_timestamp_pos * time_precision,
  234. tokens=sliced_tokens,
  235. result=result,
  236. )
  237. )
  238. current_tokens.append(sliced_tokens.tolist())
  239. last_slice = current_slice
  240. if ended_with_single_timestamp:
  241. # single timestamp at the end means no speech after the last timestamp.
  242. seek += segment_size
  243. else:
  244. # otherwise, ignore the unfinished segment and seek to the last timestamp
  245. last_timestamp_pos = (
  246. tokens[last_slice - 1].item() - tokenizer.timestamp_begin
  247. )
  248. seek += last_timestamp_pos * input_stride
  249. all_tokens.extend(tokens[: last_slice + 1].tolist())
  250. else:
  251. duration = segment_duration
  252. timestamps = tokens[timestamp_tokens.nonzero().flatten()]
  253. if (
  254. len(timestamps) > 0
  255. and timestamps[-1].item() != tokenizer.timestamp_begin
  256. ):
  257. # no consecutive timestamps but it has a timestamp; use the last one.
  258. last_timestamp_pos = (
  259. timestamps[-1].item() - tokenizer.timestamp_begin
  260. )
  261. duration = last_timestamp_pos * time_precision
  262. current_segments.append(
  263. new_segment(
  264. start=time_offset,
  265. end=time_offset + duration,
  266. tokens=tokens,
  267. result=result,
  268. )
  269. )
  270. current_tokens.append(tokens.tolist())
  271. seek += segment_size
  272. if not condition_on_previous_text or result.temperature > 0.5:
  273. # do not feed the prompt tokens if a high temperature was used
  274. prompt_reset_since = len(all_tokens)
  275. if word_timestamps:
  276. add_word_timestamps(
  277. segments=current_segments,
  278. model=model,
  279. tokenizer=tokenizer,
  280. mel=mel_segment,
  281. num_frames=segment_size,
  282. prepend_punctuations=prepend_punctuations,
  283. append_punctuations=append_punctuations,
  284. )
  285. word_end_timestamps = [
  286. w["end"] for s in current_segments for w in s["words"]
  287. ]
  288. if len(consecutive) > 0 and len(word_end_timestamps) > 0:
  289. seek_shift = round(
  290. (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
  291. )
  292. if seek_shift > 0:
  293. seek = previous_seek + seek_shift
  294. if verbose:
  295. for segment in current_segments:
  296. start, end, text = segment["start"], segment["end"], segment["text"]
  297. line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
  298. print(make_safe(line))
  299. # if a segment is instantaneous or does not contain text, clear it
  300. for i, segment in enumerate(current_segments):
  301. if segment["start"] == segment["end"] or segment["text"].strip() == "":
  302. segment["text"] = ""
  303. segment["tokens"] = []
  304. segment["words"] = []
  305. current_tokens[i] = []
  306. all_segments.extend(current_segments)
  307. all_tokens.extend(
  308. [token for segment in current_tokens for token in segment]
  309. )
  310. # update progress bar
  311. pbar.update(min(num_frames, seek) - previous_seek)
  312. return dict(
  313. text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
  314. segments=all_segments,
  315. language=language,
  316. )
  317. def cli():
  318. from . import available_models
  319. # fmt: off
  320. parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  321. parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
  322. parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
  323. parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
  324. parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
  325. parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
  326. parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
  327. parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
  328. parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
  329. parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
  330. parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
  331. parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
  332. parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
  333. parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
  334. parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
  335. parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
  336. parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
  337. parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
  338. parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
  339. parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
  340. parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
  341. parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
  342. parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
  343. parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
  344. parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
  345. parser.add_argument("--append_punctuations", type=str, default="\"\'.。,,!!??::”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
  346. parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
  347. # fmt: on
  348. args = parser.parse_args().__dict__
  349. model_name: str = args.pop("model")
  350. model_dir: str = args.pop("model_dir")
  351. output_dir: str = args.pop("output_dir")
  352. output_format: str = args.pop("output_format")
  353. device: str = args.pop("device")
  354. os.makedirs(output_dir, exist_ok=True)
  355. if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
  356. if args["language"] is not None:
  357. warnings.warn(
  358. f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
  359. )
  360. args["language"] = "en"
  361. temperature = args.pop("temperature")
  362. if (increment := args.pop("temperature_increment_on_fallback")) is not None:
  363. temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
  364. else:
  365. temperature = [temperature]
  366. if (threads := args.pop("threads")) > 0:
  367. torch.set_num_threads(threads)
  368. from . import load_model
  369. model = load_model(model_name, device=device, download_root=model_dir)
  370. writer = get_writer(output_format, output_dir)
  371. for audio_path in args.pop("audio"):
  372. result = transcribe(model, audio_path, temperature=temperature, **args)
  373. writer(result, audio_path)
# Run the command-line interface when this module is executed as a script.
if __name__ == "__main__":
    cli()