transcribe.py

import argparse
import os
import warnings
from typing import Optional, Tuple, Union, Callable, TYPE_CHECKING

import numpy as np
import torch
import tqdm

from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram
from .decoding import DecodingOptions, DecodingResult
from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
from .utils import exact_div, format_timestamp, make_safe, optional_int, optional_float, str2bool, get_writer

if TYPE_CHECKING:
    from .model import Whisper

def transcribe(
    model: "Whisper",
    audio: Union[str, np.ndarray, torch.Tensor],
    *,
    verbose: Optional[bool] = None,
    temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    compression_ratio_threshold: Optional[float] = 2.4,
    logprob_threshold: Optional[float] = -1.0,
    no_speech_threshold: Optional[float] = 0.6,
    condition_on_previous_text: bool = True,
    initial_prompt: Optional[str] = None,
    progress_callback: Optional[Callable[[float], None]] = None,
    **decode_options,
):
    """
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    verbose: bool
        Whether to display the text being decoded to the console. If True, displays all the details;
        if False, displays minimal details. If None, does not display anything

    temperature: Union[float, Tuple[float, ...]]
        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.

    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed

    logprob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed

    no_speech_threshold: float
        If the no_speech probability is higher than this value AND the average log probability
        over sampled tokens is below `logprob_threshold`, consider the segment as silent

    condition_on_previous_text: bool
        If True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    initial_prompt: Optional[str]
        Optional text to provide as a prompt for the first window.

    progress_callback: Optional[Callable[[float], None]]
        Optional callback invoked before each decoding window with the fraction of input
        frames processed so far (a float in [0, 1]).

    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
    """
    dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
    if model.device == torch.device("cpu"):
        if torch.cuda.is_available():
            warnings.warn("Performing inference on CPU when CUDA is available")
        if dtype == torch.float16:
            warnings.warn("FP16 is not supported on CPU; using FP32 instead")
            dtype = torch.float32

    if dtype == torch.float32:
        decode_options["fp16"] = False

    mel = log_mel_spectrogram(audio)

    if decode_options.get("language", None) is None:
        if not model.is_multilingual:
            decode_options["language"] = "en"
        else:
            if verbose:
                print("Detecting language using up to the first 30 seconds. Use `--language` to specify the language")
            segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
            _, probs = model.detect_language(segment)
            decode_options["language"] = max(probs, key=probs.get)
            if verbose is not None:
                print(f"Detected language: {LANGUAGES[decode_options['language']].title()}")

    language = decode_options["language"]
    task = decode_options.get("task", "transcribe")
    tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)

    def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
        temperatures = [temperature] if isinstance(temperature, (int, float)) else temperature
        decode_result = None

        for t in temperatures:
            kwargs = {**decode_options}
            if t > 0:
                # disable beam_size and patience when t > 0
                kwargs.pop("beam_size", None)
                kwargs.pop("patience", None)
            else:
                # disable best_of when t == 0
                kwargs.pop("best_of", None)

            options = DecodingOptions(**kwargs, temperature=t)
            decode_result = model.decode(segment, options)

            needs_fallback = False
            if compression_ratio_threshold is not None and decode_result.compression_ratio > compression_ratio_threshold:
                needs_fallback = True  # too repetitive
            if logprob_threshold is not None and decode_result.avg_logprob < logprob_threshold:
                needs_fallback = True  # average log probability is too low

            if not needs_fallback:
                break

        return decode_result
    seek = 0
    input_stride = exact_div(
        N_FRAMES, model.dims.n_audio_ctx
    )  # mel frames per output token: 2
    time_precision = (
        input_stride * HOP_LENGTH / SAMPLE_RATE
    )  # time per output token: 0.02 (seconds)
    all_tokens = []
    all_segments = []
    prompt_reset_since = 0

    if initial_prompt is not None:
        initial_prompt_tokens = tokenizer.encode(" " + initial_prompt.strip())
        all_tokens.extend(initial_prompt_tokens)
    else:
        initial_prompt_tokens = []

    def add_segment(
        *, start: float, end: float, text_tokens: torch.Tensor, result: DecodingResult
    ):
        text = tokenizer.decode([token for token in text_tokens if token < tokenizer.eot])
        if len(text.strip()) == 0:  # skip empty text output
            return

        all_segments.append(
            {
                "id": len(all_segments),
                "seek": seek,
                "start": start,
                "end": end,
                "text": text,
                "tokens": text_tokens.tolist(),
                "temperature": result.temperature,
                "avg_logprob": result.avg_logprob,
                "compression_ratio": result.compression_ratio,
                "no_speech_prob": result.no_speech_prob,
            }
        )
        if verbose:
            print(make_safe(f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"))
    # show the progress bar when verbose is False (otherwise the transcribed text will be printed)
    num_frames = mel.shape[-1]
    previous_seek_value = seek

    with tqdm.tqdm(total=num_frames, unit='frames', disable=verbose is not False) as pbar:
        while seek < num_frames:
            if progress_callback is not None:
                progress_value = seek / num_frames
                progress_callback(progress_value)
            timestamp_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
            segment = pad_or_trim(mel[:, seek:], N_FRAMES).to(model.device).to(dtype)
            segment_duration = segment.shape[-1] * HOP_LENGTH / SAMPLE_RATE

            decode_options["prompt"] = all_tokens[prompt_reset_since:]
            result: DecodingResult = decode_with_fallback(segment)
            tokens = torch.tensor(result.tokens)

            if no_speech_threshold is not None:
                # no voice activity check
                should_skip = result.no_speech_prob > no_speech_threshold
                if logprob_threshold is not None and result.avg_logprob > logprob_threshold:
                    # don't skip if the logprob is high enough, despite the no_speech_prob
                    should_skip = False

                if should_skip:
                    seek += segment.shape[-1]  # fast-forward to the next segment boundary
                    continue

            timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
            consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0].add_(1)
            if len(consecutive) > 0:  # if the output contains two consecutive timestamp tokens
                last_slice = 0
                for current_slice in consecutive:
                    sliced_tokens = tokens[last_slice:current_slice]
                    start_timestamp_position = (
                        sliced_tokens[0].item() - tokenizer.timestamp_begin
                    )
                    end_timestamp_position = (
                        sliced_tokens[-1].item() - tokenizer.timestamp_begin
                    )
                    add_segment(
                        start=timestamp_offset + start_timestamp_position * time_precision,
                        end=timestamp_offset + end_timestamp_position * time_precision,
                        text_tokens=sliced_tokens[1:-1],
                        result=result,
                    )
                    last_slice = current_slice
                last_timestamp_position = (
                    tokens[last_slice - 1].item() - tokenizer.timestamp_begin
                )
                seek += last_timestamp_position * input_stride
                all_tokens.extend(tokens[: last_slice + 1].tolist())
            else:
                duration = segment_duration
                timestamps = tokens[timestamp_tokens.nonzero().flatten()]
                if len(timestamps) > 0 and timestamps[-1].item() != tokenizer.timestamp_begin:
                    # no consecutive timestamps but it has a timestamp; use the last one.
                    # single timestamp at the end means no speech after the last timestamp.
                    last_timestamp_position = timestamps[-1].item() - tokenizer.timestamp_begin
                    duration = last_timestamp_position * time_precision

                add_segment(
                    start=timestamp_offset,
                    end=timestamp_offset + duration,
                    text_tokens=tokens,
                    result=result,
                )

                seek += segment.shape[-1]
                all_tokens.extend(tokens.tolist())

            if not condition_on_previous_text or result.temperature > 0.5:
                # do not feed the prompt tokens if a high temperature was used
                prompt_reset_since = len(all_tokens)

            # update progress bar
            pbar.update(min(num_frames, seek) - previous_seek_value)
            previous_seek_value = seek

    return dict(
        text=tokenizer.decode(all_tokens[len(initial_prompt_tokens):]),
        segments=all_segments,
        language=language
    )
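
# Example usage (a minimal sketch for illustration, not part of the original
# module): calling `transcribe` directly with this fork's `progress_callback`
# hook, which receives the fraction of frames processed before each window.
# The file name "audio.mp3" and the printed format are assumptions.
#
#     import whisper
#     from whisper.transcribe import transcribe
#
#     model = whisper.load_model("small")
#     result = transcribe(
#         model,
#         "audio.mp3",
#         progress_callback=lambda fraction: print(f"{fraction:.0%} processed"),
#     )
#     print(result["text"])
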

def cli():
    from . import available_models

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supersedes MKL_NUM_THREADS/OMP_NUM_THREADS")

    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    model_dir: str = args.pop("model_dir")
    output_dir: str = args.pop("output_dir")
    output_format: str = args.pop("output_format")
    device: str = args.pop("device")
    os.makedirs(output_dir, exist_ok=True)

    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        if args["language"] is not None:
            warnings.warn(f"{model_name} is an English-only model but received '{args['language']}'; using English instead.")
        args["language"] = "en"

    temperature = args.pop("temperature")
    if (increment := args.pop("temperature_increment_on_fallback")) is not None:
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
    else:
        temperature = [temperature]

    if (threads := args.pop("threads")) > 0:
        torch.set_num_threads(threads)

    from . import load_model
    model = load_model(model_name, device=device, download_root=model_dir)

    writer = get_writer(output_format, output_dir)
    for audio_path in args.pop("audio"):
        result = transcribe(model, audio_path, temperature=temperature, **args)
        writer(result, audio_path)


if __name__ == '__main__':
    cli()
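
# Example CLI invocation (a sketch; assumes the package is installed with the
# usual `whisper` console entry point pointing at `cli`, and that "audio.mp3"
# and the out/ directory are placeholders):
#
#     whisper audio.mp3 --model small --language en --output_dir out --output_format srt
#
# Equivalent direct invocation of this module, assuming the package is named
# `whisper` (the relative imports above prevent running the file by path):
#
#     python -m whisper.transcribe audio.mp3 --model small --output_format srt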