transcribe.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. import argparse
  2. import os
  3. import warnings
  4. from typing import TYPE_CHECKING, Optional, Tuple, Union
  5. import numpy as np
  6. import torch
  7. import tqdm
  8. from .audio import (
  9. FRAMES_PER_SECOND,
  10. HOP_LENGTH,
  11. N_FRAMES,
  12. N_SAMPLES,
  13. SAMPLE_RATE,
  14. log_mel_spectrogram,
  15. pad_or_trim,
  16. )
  17. from .decoding import DecodingOptions, DecodingResult
  18. from .timing import add_word_timestamps
  19. from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
  20. from .utils import (
  21. exact_div,
  22. format_timestamp,
  23. get_writer,
  24. make_safe,
  25. optional_float,
  26. optional_int,
  27. str2bool,
  28. )
  29. if TYPE_CHECKING:
  30. from .model import Whisper
  31. def transcribe(
  32. model: "Whisper",
  33. audio: Union[str, np.ndarray, torch.Tensor],
  34. *,
  35. verbose: Optional[bool] = None,
  36. temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
  37. compression_ratio_threshold: Optional[float] = 2.4,
  38. logprob_threshold: Optional[float] = -1.0,
  39. no_speech_threshold: Optional[float] = 0.6,
  40. condition_on_previous_text: bool = True,
  41. initial_prompt: Optional[str] = None,
  42. word_timestamps: bool = False,
  43. prepend_punctuations: str = "\"'“¿([{-",
  44. append_punctuations: str = "\"'.。,,!!??::”)]}、",
  45. **decode_options,
  46. ):
  47. """
  48. Transcribe an audio file using Whisper
  49. Parameters
  50. ----------
  51. model: Whisper
  52. The Whisper model instance
  53. audio: Union[str, np.ndarray, torch.Tensor]
  54. The path to the audio file to open, or the audio waveform
  55. verbose: bool
  56. Whether to display the text being decoded to the console. If True, displays all the details,
  57. If False, displays minimal details. If None, does not display anything
  58. temperature: Union[float, Tuple[float, ...]]
  59. Temperature for sampling. It can be a tuple of temperatures, which will be successively used
  60. upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
  61. compression_ratio_threshold: float
  62. If the gzip compression ratio is above this value, treat as failed
  63. logprob_threshold: float
  64. If the average log probability over sampled tokens is below this value, treat as failed
  65. no_speech_threshold: float
  66. If the no_speech probability is higher than this value AND the average log probability
  67. over sampled tokens is below `logprob_threshold`, consider the segment as silent
  68. condition_on_previous_text: bool
  69. if True, the previous output of the model is provided as a prompt for the next window;
  70. disabling may make the text inconsistent across windows, but the model becomes less prone to
  71. getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
  72. word_timestamps: bool
  73. Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
  74. and include the timestamps for each word in each segment.
  75. prepend_punctuations: str
  76. If word_timestamps is True, merge these punctuation symbols with the next word
  77. append_punctuations: str
  78. If word_timestamps is True, merge these punctuation symbols with the previous word
  79. initial_prompt: Optional[str]
  80. Optional text to provide as a prompt for the first window. This can be used to provide, or
  81. "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
  82. to make it more likely to predict those word correctly.
  83. decode_options: dict
  84. Keyword arguments to construct `DecodingOptions` instances
  85. Returns
  86. -------
  87. A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
  88. the spoken language ("language"), which is detected when `decode_options["language"]` is None.
  89. """
  90. dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
  91. if model.device == torch.device("cpu"):
  92. if torch.cuda.is_available():
  93. warnings.warn("Performing inference on CPU when CUDA is available")
  94. if dtype == torch.float16:
  95. warnings.warn("FP16 is not supported on CPU; using FP32 instead")
  96. dtype = torch.float32
  97. if dtype == torch.float32:
  98. decode_options["fp16"] = False
  99. # Pad 30-seconds of silence to the input audio, for slicing
  100. mel = log_mel_spectrogram(audio, padding=N_SAMPLES)
  101. content_frames = mel.shape[-1] - N_FRAMES
  102. if decode_options.get("language", None) is None:
  103. if not model.is_multilingual:
  104. decode_options["language"] = "en"
  105. else:
  106. if verbose:
  107. print(
  108. "Detecting language using up to the first 30 seconds. Use `--language` to specify the language"
  109. )
  110. mel_segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
  111. _, probs = model.detect_language(mel_segment)
  112. decode_options["language"] = max(probs, key=probs.get)
  113. if verbose is not None:
  114. print(
  115. f"Detected language: {LANGUAGES[decode_options['language']].title()}"
  116. )
  117. language: str = decode_options["language"]
  118. task: str = decode_options.get("task", "transcribe")
  119. tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
  120. if word_timestamps and task == "translate":
  121. warnings.warn("Word-level timestamps on translations may not be reliable.")
  122. def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
  123. temperatures = (
  124. [temperature] if isinstance(temperature, (int, float)) else temperature
  125. )
  126. decode_result = None
  127. for t in temperatures:
  128. kwargs = {**decode_options}
  129. if t > 0:
  130. # disable beam_size and patience when t > 0
  131. kwargs.pop("beam_size", None)
  132. kwargs.pop("patience", None)
  133. else:
  134. # disable best_of when t == 0
  135. kwargs.pop("best_of", None)
  136. options = DecodingOptions(**kwargs, temperature=t)
  137. decode_result = model.decode(segment, options)
  138. needs_fallback = False
  139. if (
  140. compression_ratio_threshold is not None
  141. and decode_result.compression_ratio > compression_ratio_threshold
  142. ):
  143. needs_fallback = True # too repetitive
  144. if (
  145. logprob_threshold is not None
  146. and decode_result.avg_logprob < logprob_threshold
  147. ):
  148. needs_fallback = True # average log probability is too low
  149. if (
  150. no_speech_threshold is not None
  151. and decode_result.no_speech_prob > no_speech_threshold
  152. ):
  153. needs_fallback = False # silence
  154. if not needs_fallback:
  155. break
  156. return decode_result
  157. seek = 0
  158. input_stride = exact_div(
  159. N_FRAMES, model.dims.n_audio_ctx
  160. ) # mel frames per output token: 2
  161. time_precision = (
  162. input_stride * HOP_LENGTH / SAMPLE_RATE
  163. ) # time per output token: 0.02 (seconds)
  164. all_tokens = []
  165. all_segments = []
  166. prompt_reset_since = 0
  167. if initial_prompt is not None:
  168. initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
  169. all_tokens.extend(initial_prompt_tokens)
  170. else:
  171. initial_prompt_tokens = []
  172. def new_segment(
  173. *, start: float, end: float, tokens: torch.Tensor, result: DecodingResult
  174. ):
  175. tokens = tokens.tolist()
  176. text_tokens = [token for token in tokens if token < tokenizer.eot]
  177. return {
  178. "seek": seek,
  179. "start": start,
  180. "end": end,
  181. "text": tokenizer.decode(text_tokens),
  182. "tokens": tokens,
  183. "temperature": result.temperature,
  184. "avg_logprob": result.avg_logprob,
  185. "compression_ratio": result.compression_ratio,
  186. "no_speech_prob": result.no_speech_prob,
  187. }
  188. # show the progress bar when verbose is False (if True, transcribed text will be printed)
  189. with tqdm.tqdm(
  190. total=content_frames, unit="frames", disable=verbose is not False
  191. ) as pbar:
  192. last_speech_timestamp = 0.0
  193. while seek < content_frames:
  194. time_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
  195. mel_segment = mel[:, seek : seek + N_FRAMES]
  196. segment_size = min(N_FRAMES, content_frames - seek)
  197. segment_duration = segment_size * HOP_LENGTH / SAMPLE_RATE
  198. mel_segment = pad_or_trim(mel_segment, N_FRAMES).to(model.device).to(dtype)
  199. decode_options["prompt"] = all_tokens[prompt_reset_since:]
  200. result: DecodingResult = decode_with_fallback(mel_segment)
  201. tokens = torch.tensor(result.tokens)
  202. if no_speech_threshold is not None:
  203. # no voice activity check
  204. should_skip = result.no_speech_prob > no_speech_threshold
  205. if (
  206. logprob_threshold is not None
  207. and result.avg_logprob > logprob_threshold
  208. ):
  209. # don't skip if the logprob is high enough, despite the no_speech_prob
  210. should_skip = False
  211. if should_skip:
  212. seek += segment_size # fast-forward to the next segment boundary
  213. continue
  214. previous_seek = seek
  215. current_segments = []
  216. timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
  217. single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
  218. consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
  219. consecutive.add_(1)
  220. if len(consecutive) > 0:
  221. # if the output contains two consecutive timestamp tokens
  222. slices = consecutive.tolist()
  223. if single_timestamp_ending:
  224. slices.append(len(tokens))
  225. last_slice = 0
  226. for current_slice in slices:
  227. sliced_tokens = tokens[last_slice:current_slice]
  228. start_timestamp_pos = (
  229. sliced_tokens[0].item() - tokenizer.timestamp_begin
  230. )
  231. end_timestamp_pos = (
  232. sliced_tokens[-1].item() - tokenizer.timestamp_begin
  233. )
  234. current_segments.append(
  235. new_segment(
  236. start=time_offset + start_timestamp_pos * time_precision,
  237. end=time_offset + end_timestamp_pos * time_precision,
  238. tokens=sliced_tokens,
  239. result=result,
  240. )
  241. )
  242. last_slice = current_slice
  243. if single_timestamp_ending:
  244. # single timestamp at the end means no speech after the last timestamp.
  245. seek += segment_size
  246. else:
  247. # otherwise, ignore the unfinished segment and seek to the last timestamp
  248. last_timestamp_pos = (
  249. tokens[last_slice - 1].item() - tokenizer.timestamp_begin
  250. )
  251. seek += last_timestamp_pos * input_stride
  252. else:
  253. duration = segment_duration
  254. timestamps = tokens[timestamp_tokens.nonzero().flatten()]
  255. if (
  256. len(timestamps) > 0
  257. and timestamps[-1].item() != tokenizer.timestamp_begin
  258. ):
  259. # no consecutive timestamps but it has a timestamp; use the last one.
  260. last_timestamp_pos = (
  261. timestamps[-1].item() - tokenizer.timestamp_begin
  262. )
  263. duration = last_timestamp_pos * time_precision
  264. current_segments.append(
  265. new_segment(
  266. start=time_offset,
  267. end=time_offset + duration,
  268. tokens=tokens,
  269. result=result,
  270. )
  271. )
  272. seek += segment_size
  273. if word_timestamps:
  274. add_word_timestamps(
  275. segments=current_segments,
  276. model=model,
  277. tokenizer=tokenizer,
  278. mel=mel_segment,
  279. num_frames=segment_size,
  280. prepend_punctuations=prepend_punctuations,
  281. append_punctuations=append_punctuations,
  282. last_speech_timestamp=last_speech_timestamp,
  283. )
  284. word_end_timestamps = [
  285. w["end"] for s in current_segments for w in s["words"]
  286. ]
  287. if len(word_end_timestamps) > 0:
  288. last_speech_timestamp = word_end_timestamps[-1]
  289. if not single_timestamp_ending and len(word_end_timestamps) > 0:
  290. seek_shift = round(
  291. (word_end_timestamps[-1] - time_offset) * FRAMES_PER_SECOND
  292. )
  293. if seek_shift > 0:
  294. seek = previous_seek + seek_shift
  295. if verbose:
  296. for segment in current_segments:
  297. start, end, text = segment["start"], segment["end"], segment["text"]
  298. line = f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"
  299. print(make_safe(line))
  300. # if a segment is instantaneous or does not contain text, clear it
  301. for i, segment in enumerate(current_segments):
  302. if segment["start"] == segment["end"] or segment["text"].strip() == "":
  303. segment["text"] = ""
  304. segment["tokens"] = []
  305. segment["words"] = []
  306. all_segments.extend(
  307. [
  308. {"id": i, **segment}
  309. for i, segment in enumerate(
  310. current_segments, start=len(all_segments)
  311. )
  312. ]
  313. )
  314. all_tokens.extend(
  315. [token for segment in current_segments for token in segment["tokens"]]
  316. )
  317. if not condition_on_previous_text or result.temperature > 0.5:
  318. # do not feed the prompt tokens if a high temperature was used
  319. prompt_reset_since = len(all_tokens)
  320. # update progress bar
  321. pbar.update(min(content_frames, seek) - previous_seek)
  322. return dict(
  323. text=tokenizer.decode(all_tokens[len(initial_prompt_tokens) :]),
  324. segments=all_segments,
  325. language=language,
  326. )
  327. def cli():
  328. from . import available_models
  329. # fmt: off
  330. parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  331. parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
  332. parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
  333. parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
  334. parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
  335. parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
  336. parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
  337. parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
  338. parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
  339. parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
  340. parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
  341. parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
  342. parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
  343. parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
  344. parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
  345. parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
  346. parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
  347. parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
  348. parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
  349. parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
  350. parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
  351. parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
  352. parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
  353. parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
  354. parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
  355. parser.add_argument("--append_punctuations", type=str, default="\"\'.。,,!!??::”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
  356. parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
  357. parser.add_argument("--max_line_width", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of characters in a line before breaking the line")
  358. parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --word_timestamps True) the maximum number of lines in a segment")
  359. parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
  360. # fmt: on
  361. args = parser.parse_args().__dict__
  362. model_name: str = args.pop("model")
  363. model_dir: str = args.pop("model_dir")
  364. output_dir: str = args.pop("output_dir")
  365. output_format: str = args.pop("output_format")
  366. device: str = args.pop("device")
  367. os.makedirs(output_dir, exist_ok=True)
  368. if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
  369. if args["language"] is not None:
  370. warnings.warn(
  371. f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
  372. )
  373. args["language"] = "en"
  374. temperature = args.pop("temperature")
  375. if (increment := args.pop("temperature_increment_on_fallback")) is not None:
  376. temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
  377. else:
  378. temperature = [temperature]
  379. if (threads := args.pop("threads")) > 0:
  380. torch.set_num_threads(threads)
  381. from . import load_model
  382. model = load_model(model_name, device=device, download_root=model_dir)
  383. writer = get_writer(output_format, output_dir)
  384. word_options = ["highlight_words", "max_line_count", "max_line_width"]
  385. if not args["word_timestamps"]:
  386. for option in word_options:
  387. if args[option]:
  388. parser.error(f"--{option} requires --word_timestamps True")
  389. if args["max_line_count"] and not args["max_line_width"]:
  390. warnings.warn("--max_line_count has no effect without --max_line_width")
  391. writer_args = {arg: args.pop(arg) for arg in word_options}
  392. for audio_path in args.pop("audio"):
  393. result = transcribe(model, audio_path, temperature=temperature, **args)
  394. writer(result, audio_path, writer_args)
# Script entry point: run the command-line interface only when this module is
# executed directly, not when it is imported.
# NOTE(review): the module uses package-relative imports, so direct execution
# presumably has to go through `python -m <package>.transcribe` — confirm.
if __name__ == "__main__":
    cli()