# transcribe.py
  1. import argparse
  2. import os
  3. import warnings
  4. from typing import Optional, Tuple, Union, TYPE_CHECKING
  5. import numpy as np
  6. import torch
  7. import tqdm
  8. from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram
  9. from .decoding import DecodingOptions, DecodingResult
  10. from .tokenizer import LANGUAGES, TO_LANGUAGE_CODE, get_tokenizer
  11. from .utils import exact_div, format_timestamp, make_safe, optional_int, optional_float, str2bool, get_writer
  12. if TYPE_CHECKING:
  13. from .model import Whisper
  14. def transcribe(
  15. model: "Whisper",
  16. audio: Union[str, np.ndarray, torch.Tensor],
  17. *,
  18. verbose: Optional[bool] = None,
  19. temperature: Union[float, Tuple[float, ...]] = (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
  20. compression_ratio_threshold: Optional[float] = 2.4,
  21. logprob_threshold: Optional[float] = -1.0,
  22. no_speech_threshold: Optional[float] = 0.6,
  23. condition_on_previous_text: bool = True,
  24. **decode_options,
  25. ):
  26. """
  27. Transcribe an audio file using Whisper
  28. Parameters
  29. ----------
  30. model: Whisper
  31. The Whisper model instance
  32. audio: Union[str, np.ndarray, torch.Tensor]
  33. The path to the audio file to open, or the audio waveform
  34. verbose: bool
  35. Whether to display the text being decoded to the console. If True, displays all the details,
  36. If False, displays minimal details. If None, does not display anything
  37. temperature: Union[float, Tuple[float, ...]]
  38. Temperature for sampling. It can be a tuple of temperatures, which will be successively used
  39. upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.
  40. compression_ratio_threshold: float
  41. If the gzip compression ratio is above this value, treat as failed
  42. logprob_threshold: float
  43. If the average log probability over sampled tokens is below this value, treat as failed
  44. no_speech_threshold: float
  45. If the no_speech probability is higher than this value AND the average log probability
  46. over sampled tokens is below `logprob_threshold`, consider the segment as silent
  47. condition_on_previous_text: bool
  48. if True, the previous output of the model is provided as a prompt for the next window;
  49. disabling may make the text inconsistent across windows, but the model becomes less prone to
  50. getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.
  51. decode_options: dict
  52. Keyword arguments to construct `DecodingOptions` instances
  53. Returns
  54. -------
  55. A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
  56. the spoken language ("language"), which is detected when `decode_options["language"]` is None.
  57. """
  58. dtype = torch.float16 if decode_options.get("fp16", True) else torch.float32
  59. if model.device == torch.device("cpu"):
  60. if torch.cuda.is_available():
  61. warnings.warn("Performing inference on CPU when CUDA is available")
  62. if dtype == torch.float16:
  63. warnings.warn("FP16 is not supported on CPU; using FP32 instead")
  64. dtype = torch.float32
  65. if dtype == torch.float32:
  66. decode_options["fp16"] = False
  67. mel = log_mel_spectrogram(audio)
  68. if decode_options.get("language", None) is None:
  69. if not model.is_multilingual:
  70. decode_options["language"] = "en"
  71. else:
  72. if verbose:
  73. print("Detecting language using up to the first 30 seconds. Use `--language` to specify the language")
  74. segment = pad_or_trim(mel, N_FRAMES).to(model.device).to(dtype)
  75. _, probs = model.detect_language(segment)
  76. decode_options["language"] = max(probs, key=probs.get)
  77. if verbose is not None:
  78. print(f"Detected language: {LANGUAGES[decode_options['language']].title()}")
  79. language = decode_options["language"]
  80. task = decode_options.get("task", "transcribe")
  81. tokenizer = get_tokenizer(model.is_multilingual, language=language, task=task)
  82. def decode_with_fallback(segment: torch.Tensor) -> DecodingResult:
  83. temperatures = [temperature] if isinstance(temperature, (int, float)) else temperature
  84. decode_result = None
  85. for t in temperatures:
  86. kwargs = {**decode_options}
  87. if t > 0:
  88. # disable beam_size and patience when t > 0
  89. kwargs.pop("beam_size", None)
  90. kwargs.pop("patience", None)
  91. else:
  92. # disable best_of when t == 0
  93. kwargs.pop("best_of", None)
  94. options = DecodingOptions(**kwargs, temperature=t)
  95. decode_result = model.decode(segment, options)
  96. needs_fallback = False
  97. if compression_ratio_threshold is not None and decode_result.compression_ratio > compression_ratio_threshold:
  98. needs_fallback = True # too repetitive
  99. if logprob_threshold is not None and decode_result.avg_logprob < logprob_threshold:
  100. needs_fallback = True # average log probability is too low
  101. if not needs_fallback:
  102. break
  103. return decode_result
  104. seek = 0
  105. input_stride = exact_div(
  106. N_FRAMES, model.dims.n_audio_ctx
  107. ) # mel frames per output token: 2
  108. time_precision = (
  109. input_stride * HOP_LENGTH / SAMPLE_RATE
  110. ) # time per output token: 0.02 (seconds)
  111. all_tokens = []
  112. all_segments = []
  113. prompt_reset_since = 0
  114. initial_prompt = decode_options.pop("initial_prompt", None) or []
  115. if initial_prompt:
  116. initial_prompt = tokenizer.encode(" " + initial_prompt.strip())
  117. all_tokens.extend(initial_prompt)
  118. def add_segment(
  119. *, start: float, end: float, text_tokens: torch.Tensor, result: DecodingResult
  120. ):
  121. text = tokenizer.decode([token for token in text_tokens if token < tokenizer.eot])
  122. if len(text.strip()) == 0: # skip empty text output
  123. return
  124. all_segments.append(
  125. {
  126. "id": len(all_segments),
  127. "seek": seek,
  128. "start": start,
  129. "end": end,
  130. "text": text,
  131. "tokens": text_tokens.tolist(),
  132. "temperature": result.temperature,
  133. "avg_logprob": result.avg_logprob,
  134. "compression_ratio": result.compression_ratio,
  135. "no_speech_prob": result.no_speech_prob,
  136. }
  137. )
  138. if verbose:
  139. print(make_safe(f"[{format_timestamp(start)} --> {format_timestamp(end)}] {text}"))
  140. # show the progress bar when verbose is False (otherwise the transcribed text will be printed)
  141. num_frames = mel.shape[-1]
  142. previous_seek_value = seek
  143. with tqdm.tqdm(total=num_frames, unit='frames', disable=verbose is not False) as pbar:
  144. while seek < num_frames:
  145. timestamp_offset = float(seek * HOP_LENGTH / SAMPLE_RATE)
  146. segment = pad_or_trim(mel[:, seek:], N_FRAMES).to(model.device).to(dtype)
  147. segment_duration = segment.shape[-1] * HOP_LENGTH / SAMPLE_RATE
  148. decode_options["prompt"] = all_tokens[prompt_reset_since:]
  149. result: DecodingResult = decode_with_fallback(segment)
  150. tokens = torch.tensor(result.tokens)
  151. if no_speech_threshold is not None:
  152. # no voice activity check
  153. should_skip = result.no_speech_prob > no_speech_threshold
  154. if logprob_threshold is not None and result.avg_logprob > logprob_threshold:
  155. # don't skip if the logprob is high enough, despite the no_speech_prob
  156. should_skip = False
  157. if should_skip:
  158. seek += segment.shape[-1] # fast-forward to the next segment boundary
  159. continue
  160. timestamp_tokens: torch.Tensor = tokens.ge(tokenizer.timestamp_begin)
  161. consecutive = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0].add_(1)
  162. if len(consecutive) > 0: # if the output contains two consecutive timestamp tokens
  163. last_slice = 0
  164. for current_slice in consecutive:
  165. sliced_tokens = tokens[last_slice:current_slice]
  166. start_timestamp_position = (
  167. sliced_tokens[0].item() - tokenizer.timestamp_begin
  168. )
  169. end_timestamp_position = (
  170. sliced_tokens[-1].item() - tokenizer.timestamp_begin
  171. )
  172. add_segment(
  173. start=timestamp_offset + start_timestamp_position * time_precision,
  174. end=timestamp_offset + end_timestamp_position * time_precision,
  175. text_tokens=sliced_tokens[1:-1],
  176. result=result,
  177. )
  178. last_slice = current_slice
  179. last_timestamp_position = (
  180. tokens[last_slice - 1].item() - tokenizer.timestamp_begin
  181. )
  182. seek += last_timestamp_position * input_stride
  183. all_tokens.extend(tokens[: last_slice + 1].tolist())
  184. else:
  185. duration = segment_duration
  186. timestamps = tokens[timestamp_tokens.nonzero().flatten()]
  187. if len(timestamps) > 0 and timestamps[-1].item() != tokenizer.timestamp_begin:
  188. # no consecutive timestamps but it has a timestamp; use the last one.
  189. # single timestamp at the end means no speech after the last timestamp.
  190. last_timestamp_position = timestamps[-1].item() - tokenizer.timestamp_begin
  191. duration = last_timestamp_position * time_precision
  192. add_segment(
  193. start=timestamp_offset,
  194. end=timestamp_offset + duration,
  195. text_tokens=tokens,
  196. result=result,
  197. )
  198. seek += segment.shape[-1]
  199. all_tokens.extend(tokens.tolist())
  200. if not condition_on_previous_text or result.temperature > 0.5:
  201. # do not feed the prompt tokens if a high temperature was used
  202. prompt_reset_since = len(all_tokens)
  203. # update progress bar
  204. pbar.update(min(num_frames, seek) - previous_seek_value)
  205. previous_seek_value = seek
  206. return dict(text=tokenizer.decode(all_tokens[len(initial_prompt):]), segments=all_segments, language=language)
  207. def cli():
  208. from . import available_models
  209. parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  210. parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
  211. parser.add_argument("--model", default="small", choices=available_models(), help="name of the Whisper model to use")
  212. parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
  213. parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
  214. parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
  215. parser.add_argument("--output_format", "-f", type=str, default="all", choices=["txt", "vtt", "srt", "tsv", "json", "all"], help="format of the output file; if not specified, all available formats will be produced")
  216. parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
  217. parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
  218. parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
  219. parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
  220. parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
  221. parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
  222. parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
  223. parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
  224. parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
  225. parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
  226. parser.add_argument("--condition_on_previous_text", type=str2bool, default=True, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
  227. parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
  228. parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
  229. parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
  230. parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
  231. parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
  232. parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
  233. args = parser.parse_args().__dict__
  234. model_name: str = args.pop("model")
  235. model_dir: str = args.pop("model_dir")
  236. output_dir: str = args.pop("output_dir")
  237. output_format: str = args.pop("output_format")
  238. device: str = args.pop("device")
  239. os.makedirs(output_dir, exist_ok=True)
  240. if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
  241. if args["language"] is not None:
  242. warnings.warn(f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead.")
  243. args["language"] = "en"
  244. temperature = args.pop("temperature")
  245. temperature_increment_on_fallback = args.pop("temperature_increment_on_fallback")
  246. if temperature_increment_on_fallback is not None:
  247. temperature = tuple(np.arange(temperature, 1.0 + 1e-6, temperature_increment_on_fallback))
  248. else:
  249. temperature = [temperature]
  250. threads = args.pop("threads")
  251. if threads > 0:
  252. torch.set_num_threads(threads)
  253. from . import load_model
  254. model = load_model(model_name, device=device, download_root=model_dir)
  255. writer = get_writer(output_format, output_dir)
  256. for audio_path in args.pop("audio"):
  257. result = transcribe(model, audio_path, temperature=temperature, **args)
  258. writer(result, audio_path)
  259. if __name__ == '__main__':
  260. cli()