# whisperX/whisperx/transcribe.py

import argparse
import gc
import os
import warnings

import numpy as np
import torch

from .alignment import align, load_align_model
from .asr import load_model
from .audio import load_audio
from .diarize import DiarizationPipeline, assign_word_speakers
from .utils import (LANGUAGES, TO_LANGUAGE_CODE, get_writer, optional_float,
                    optional_int, str2bool)


def cli():
    """Command-line entry point: transcribe, optionally align and diarize audio.

    Parses the command line and then runs the pipeline in stages, releasing
    each stage's model before the next stage loads its own:

    1. VAD + ASR over every input file (``load_model`` / ``model.transcribe``).
    2. Alignment (``load_align_model`` / ``align``) unless ``--no_align``.
    3. Speaker assignment (``DiarizationPipeline`` / ``assign_word_speakers``)
       when ``--diarize`` is given.
    4. Writing every result with the writer returned by ``get_writer``.

    Exits via ``parser.error`` when ``--highlight_words``, ``--max_line_count``
    or ``--max_line_width`` is combined with ``--no_align``.
    """
    # fmt: off
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
    parser.add_argument("--model", default="small", help="name of the Whisper model to use")
    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
    parser.add_argument("--batch_size", default=8, type=int, help="batch size to use for inference")

    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json"], help="format of the output file; if not specified, all available formats will be produced")
    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")

    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")

    # alignment params
    parser.add_argument("--align_model", default=None, help="Name of phoneme-level ASR model to do alignment")
    parser.add_argument("--interpolate_method", default="nearest", choices=["nearest", "linear", "ignore"], help="For word .srt, method to assign timestamps to non-aligned words, or merge them into neighbouring.")
    parser.add_argument("--no_align", action='store_true', help="Do not perform phoneme alignment")

    # vad params
    parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
    parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")

    # diarization params
    parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
    parser.add_argument("--min_speakers", default=None, type=int)
    parser.add_argument("--max_speakers", default=None, type=int)

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")

    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
    parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")

    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")

    parser.add_argument("--max_line_width", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")
    parser.add_argument("--max_line_count", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of lines in a segment")
    parser.add_argument("--highlight_words", type=str2bool, default=False, help="(not possible with --no_align) underline each word as it is spoken in srt and vtt")

    # parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
    # parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
    # parser.add_argument("--append_punctuations", type=str, default="\"\'.。,，!！?？:：”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")

    parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")
    # parser.add_argument("--model_flush", action="store_true", help="Flush memory from each model after use, reduces GPU requirement but slower processing >1 audio file.")
    parser.add_argument("--tmp_dir", default=None, help="Temporary directory to write audio file if input if not .wav format (only for VAD).")
    # fmt: on

    # Work on the namespace as a plain dict; options are popped as they are consumed.
    args = parser.parse_args().__dict__
    model_name: str = args.pop("model")
    batch_size: int = args.pop("batch_size")
    output_dir: str = args.pop("output_dir")
    output_format: str = args.pop("output_format")
    device: str = args.pop("device")
    # model_flush: bool = args.pop("model_flush")
    os.makedirs(output_dir, exist_ok=True)

    tmp_dir: str = args.pop("tmp_dir")
    if tmp_dir is not None:
        os.makedirs(tmp_dir, exist_ok=True)

    # Kept under a distinct name so the loaded alignment model below does not
    # shadow the user-supplied model name.
    align_model_name: str = args.pop("align_model")
    interpolate_method: str = args.pop("interpolate_method")
    no_align: bool = args.pop("no_align")

    hf_token: str = args.pop("hf_token")
    vad_onset: float = args.pop("vad_onset")
    vad_offset: float = args.pop("vad_offset")

    diarize: bool = args.pop("diarize")
    min_speakers: int = args.pop("min_speakers")
    max_speakers: int = args.pop("max_speakers")

    # TODO: check model loading works.

    # Models whose name ends in ".en" are treated as English-only: force the language.
    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
        if args["language"] is not None:
            warnings.warn(
                f"{model_name} is an English-only model but received '{args['language']}'; using English instead."
            )
        args["language"] = "en"

    # `--language` also accepts title-cased names (see `choices` above). Map
    # those to their code so the ASR model, the alignment model and the
    # per-result language comparison all see the same form.
    # NOTE(review): assumes TO_LANGUAGE_CODE maps lowercase language names to
    # the codes used as LANGUAGES keys — confirm in utils.
    language = args["language"]
    if language is not None and language not in LANGUAGES:
        args["language"] = TO_LANGUAGE_CODE.get(language.lower(), language)

    # Build the fallback temperature schedule: start, start + increment, ... up to 1.0.
    temperature = args.pop("temperature")
    if (increment := args.pop("temperature_increment_on_fallback")) is not None:
        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
    else:
        temperature = [temperature]

    # Guard against None before comparing: the flag is declared with
    # `optional_int` (NOTE(review): presumably that can yield None — confirm).
    threads = args.pop("threads")
    if threads is not None and threads > 0:
        torch.set_num_threads(threads)

    asr_options = {
        "beam_size": args.pop("beam_size"),
        "patience": args.pop("patience"),
        "length_penalty": args.pop("length_penalty"),
        "temperatures": temperature,
        "compression_ratio_threshold": args.pop("compression_ratio_threshold"),
        "log_prob_threshold": args.pop("logprob_threshold"),
        "no_speech_threshold": args.pop("no_speech_threshold"),
        # NOTE(review): hard-coded; the --condition_on_previous_text flag is
        # parsed but not forwarded here — confirm this is intentional.
        "condition_on_previous_text": False,
        "initial_prompt": args.pop("initial_prompt"),
    }

    writer = get_writer(output_format, output_dir)
    word_options = ["highlight_words", "max_line_count", "max_line_width"]
    if no_align:
        # These formatting options are rejected when alignment is disabled.
        for option in word_options:
            if args[option]:
                parser.error(f"--{option} not possible with --no_align")
    if args["max_line_count"] and not args["max_line_width"]:
        warnings.warn("--max_line_count has no effect without --max_line_width")
    writer_args = {arg: args.pop(arg) for arg in word_options}

    # Part 1: VAD & ASR Loop
    results = []
    # model = load_model(model_name, device=device, download_root=model_dir)
    model = load_model(model_name, device=device, language=args['language'], asr_options=asr_options, vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset},)

    for audio_path in args.pop("audio"):
        audio = load_audio(audio_path)
        # >> VAD & ASR
        print(">>Performing transcription...")
        result = model.transcribe(audio, batch_size=batch_size)
        results.append((result, audio_path))

    # Unload Whisper and VAD
    del model
    gc.collect()
    torch.cuda.empty_cache()

    # Part 2: Align Loop
    if not no_align:
        tmp_results = results
        results = []
        align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified
        align_model, align_metadata = load_align_model(align_language, device, model_name=align_model_name)
        for result, audio_path in tmp_results:
            # >> Align
            if len(tmp_results) > 1:
                input_audio = audio_path
            else:
                # lazily load audio from part 1
                input_audio = audio

            if align_model is not None and len(result["segments"]) > 0:
                # Resolve the language once so the comparison, the log line and
                # the reload all use the same value even if the key is absent.
                result_language = result.get("language", "en")
                if result_language != align_metadata["language"]:
                    # load new language
                    print(f"New language found ({result_language})! Previous was ({align_metadata['language']}), loading new alignment model for new language...")
                    align_model, align_metadata = load_align_model(result_language, device)
                print(">>Performing alignment...")
                result = align(result["segments"], align_model, align_metadata, input_audio, device, interpolate_method=interpolate_method)
            results.append((result, audio_path))

        # Unload align model
        del align_model
        gc.collect()
        torch.cuda.empty_cache()

    # >> Diarize
    if diarize:
        if hf_token is None:
            print("Warning, no --hf_token used, needs to be saved in environment variable, otherwise will throw error loading diarization model...")
        tmp_results = results
        results = []
        diarize_model = DiarizationPipeline(use_auth_token=hf_token)
        for result, input_audio_path in tmp_results:
            diarize_segments = diarize_model(input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers)
            results_segments, word_segments = assign_word_speakers(diarize_segments, result["segments"])
            # Only the speaker-labelled segments and word segments are carried forward.
            result = {"segments": results_segments, "word_segments": word_segments}
            results.append((result, input_audio_path))

    # >> Write
    for result, audio_path in results:
        writer(result, audio_path, writer_args)

# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    cli()
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								import argparse
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
+								import gc
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								import os
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								import warnings
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								import numpy as np
 								import torch
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
 								from .alignment import align, load_align_model
 								from .asr import load_model
 								from .audio import load_audio
-												.wav conversion, handle audio with no detected speech

											
										
										
											2023-03-31 23:02:38 +01:00
+								from .diarize import DiarizationPipeline, assign_word_speakers
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								from .utils import (LANGUAGES, TO_LANGUAGE_CODE, get_writer, optional_float,
 								                    optional_int, str2bool)
-												support batch processing

											
										
										
											2023-02-01 19:41:20 +00:00
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								def cli():
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    # fmt: off
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 								    parser.add_argument("audio", nargs="+", type=str, help="audio file(s) to transcribe")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    parser.add_argument("--model", default="small", help="name of the Whisper model to use")
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
 								    parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    parser.add_argument("--batch_size", default=8, type=int, help="device to use for PyTorch inference")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json"], help="format of the output file; if not specified, all available formats will be produced")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages")
 								    parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
 								    parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), help="language spoken in the audio, specify None to perform language detection")
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    # alignment params
-												support huggingface + model select based on lang.

											
										
										
											2022-12-20 19:54:55 +00:00
+								    parser.add_argument("--align_model", default=None, help="Name of phoneme-level ASR model to do alignment")
-												new logic, diarization, vad filtering

											
										
										
											2023-01-24 15:02:08 +00:00
+								    parser.add_argument("--interpolate_method", default="nearest", choices=["nearest", "linear", "ignore"], help="For word .srt, method to assign timestamps to non-aligned words, or merge them into neighbouring.")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    parser.add_argument("--no_align", action='store_true', help="Do not perform phoneme alignment")
-												new logic, diarization, vad filtering

											
										
										
											2023-01-24 15:02:08 +00:00
+								    # vad params
-												fix tfile naming

											
										
										
											2023-03-30 19:24:42 +01:00
+								    parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
 								    parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
-												new logic, diarization, vad filtering

											
										
										
											2023-01-24 15:02:08 +00:00
+								    # diarization params
-												update readme

											
										
										
											2023-02-01 22:09:11 +00:00
+								    parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
-												new logic, diarization, vad filtering

											
										
										
											2023-01-24 15:02:08 +00:00
+								    parser.add_argument("--min_speakers", default=None, type=int)
 								    parser.add_argument("--max_speakers", default=None, type=int)
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
 								    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
 								    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
 								    parser.add_argument("--beam_size", type=optional_int, default=5, help="number of beams in beam search, only applicable when temperature is zero")
 								    parser.add_argument("--patience", type=float, default=None, help="optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search")
 								    parser.add_argument("--length_penalty", type=float, default=None, help="optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default")
 								    parser.add_argument("--suppress_tokens", type=str, default="-1", help="comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations")
 								    parser.add_argument("--initial_prompt", type=str, default=None, help="optional text to provide as a prompt for the first window.")
-												.wav conversion, handle audio with no detected speech

											
										
										
											2023-03-31 23:02:38 +01:00
+								    parser.add_argument("--condition_on_previous_text", type=str2bool, default=False, help="if True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop")
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    parser.add_argument("--fp16", type=str2bool, default=True, help="whether to perform inference in fp16; True by default")
 								    parser.add_argument("--temperature_increment_on_fallback", type=optional_float, default=0.2, help="temperature to increase when falling back when the decoding fails to meet either of the thresholds below")
 								    parser.add_argument("--compression_ratio_threshold", type=optional_float, default=2.4, help="if the gzip compression ratio is higher than this value, treat the decoding as failed")
 								    parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
 								    parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
 								    parser.add_argument("--max_line_width", type=optional_int, default=None, help="(not possible with --no_align) the maximum number of characters in a line before breaking the line")
 								    parser.add_argument("--max_line_count", type=optional_int, default=None, help="(requires --no_align) the maximum number of lines in a segment")
 								    parser.add_argument("--highlight_words", type=str2bool, default=False, help="(requires --word_timestamps True) underline each word as it is spoken in srt and vtt")
 								    # parser.add_argument("--word_timestamps", type=str2bool, default=False, help="(experimental) extract word-level timestamps and refine the results based on them")
 								    # parser.add_argument("--prepend_punctuations", type=str, default="\"\'“¿([{-", help="if word_timestamps is True, merge these punctuation symbols with the next word")
 								    # parser.add_argument("--append_punctuations", type=str, default="\"\'.。,，!！?？:：”)]}、", help="if word_timestamps is True, merge these punctuation symbols with the previous word")
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
-												Update transcribe.py

added the ability to include HF access token in order to use PyAnnote models
											
										
										
											2023-01-26 00:42:35 +02:00
+								    parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")
-												.wav conversion, handle audio with no detected speech

											
										
										
											2023-03-31 23:02:38 +01:00
+								    # parser.add_argument("--model_flush", action="store_true", help="Flush memory from each model after use, reduces GPU requirement but slower processing >1 audio file.")
-												fix tfile naming

											
										
										
											2023-03-30 19:24:42 +01:00
+								    parser.add_argument("--tmp_dir", default=None, help="Temporary directory to write audio file if input if not .wav format (only for VAD).")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    # fmt: on
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    args = parser.parse_args().__dict__
 								    model_name: str = args.pop("model")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    batch_size: int = args.pop("batch_size")
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    output_dir: str = args.pop("output_dir")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    output_format: str = args.pop("output_format")
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    device: str = args.pop("device")
-												.wav conversion, handle audio with no detected speech

											
										
										
											2023-03-31 23:02:38 +01:00
+								    # model_flush: bool = args.pop("model_flush")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    os.makedirs(output_dir, exist_ok=True)
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
-												.wav conversion, handle audio with no detected speech

											
										
										
											2023-03-31 23:02:38 +01:00
+								    tmp_dir: str = args.pop("tmp_dir")
 								    if tmp_dir is not None:
 								        os.makedirs(tmp_dir, exist_ok=True)
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    align_model: str = args.pop("align_model")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    interpolate_method: str = args.pop("interpolate_method")
 								    no_align: bool = args.pop("no_align")
-												Update transcribe.py

added the ability to include HF access token in order to use PyAnnote models
											
										
										
											2023-01-26 00:42:35 +02:00
+								    hf_token: str = args.pop("hf_token")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    vad_onset: float = args.pop("vad_onset")
 								    vad_offset: float = args.pop("vad_offset")
-												vad filter

											
										
										
											2023-01-20 12:54:20 +00:00
-												new logic, diarization, vad filtering

											
										
										
											2023-01-24 15:02:08 +00:00
+								    diarize: bool = args.pop("diarize")
 								    min_speakers: int = args.pop("min_speakers")
 								    max_speakers: int = args.pop("max_speakers")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    # TODO: check model loading works.
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
 								    if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
 								        if args["language"] is not None:
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								            warnings.warn(
 								                f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
 								            )
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								        args["language"] = "en"
 								    temperature = args.pop("temperature")
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    if (increment := args.pop("temperature_increment_on_fallback")) is not None:
 								        temperature = tuple(np.arange(temperature, 1.0 + 1e-6, increment))
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								    else:
 								        temperature = [temperature]
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    if (threads := args.pop("threads")) > 0:
-												init commit

											
										
										
											2022-12-14 18:59:12 +00:00
+								        torch.set_num_threads(threads)
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    asr_options = {
 								        "beam_size": args.pop("beam_size"),
 								        "patience": args.pop("patience"),
 								        "length_penalty": args.pop("length_penalty"),
 								        "temperatures": temperature,
 								        "compression_ratio_threshold": args.pop("compression_ratio_threshold"),
 								        "log_prob_threshold": args.pop("logprob_threshold"),
 								        "no_speech_threshold": args.pop("no_speech_threshold"),
 								        "condition_on_previous_text": False,
 								        "initial_prompt": args.pop("initial_prompt"),
 								    }
-												support huggingface + model select based on lang.

											
										
										
											2022-12-20 19:54:55 +00:00
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
+								    writer = get_writer(output_format, output_dir)
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    word_options = ["highlight_words", "max_line_count", "max_line_width"]
 								    if no_align:
 								        for option in word_options:
 								            if args[option]:
 								                parser.error(f"--{option} requires --word_timestamps True")
 								    if args["max_line_count"] and not args["max_line_width"]:
 								        warnings.warn("--max_line_count has no effect without --max_line_width")
 								    writer_args = {arg: args.pop(arg) for arg in word_options}
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
+								    # Part 1: VAD & ASR Loop
 								    results = []
 								    tmp_results = []
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    # model = load_model(model_name, device=device, download_root=model_dir)
 								    model = load_model(model_name, device=device, language=args['language'], asr_options=asr_options, vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset},)
-												handle tmp wav file better

											
										
										
											2023-04-01 00:06:40 +01:00
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								    for audio_path in args.pop("audio"):
 								        audio = load_audio(audio_path)
-												handle tmp wav file better

											
										
										
											2023-04-01 00:06:40 +01:00
+								        # >> VAD & ASR
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								        print(">>Performing transcription...")
 								        result = model.transcribe(audio, batch_size=batch_size)
 								        results.append((result, audio_path))
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
 								    # Unload Whisper and VAD
 								    del model
 								    gc.collect()
 								    torch.cuda.empty_cache()
 								    # Part 2: Align Loop
 								    if not no_align:
 								        tmp_results = results
 								        results = []
 								        align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified
 								        align_model, align_metadata = load_align_model(align_language, device, model_name=align_model)
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								        for result, audio_path in tmp_results:
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
+								            # >> Align
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								            if len(tmp_results) > 1:
 								                input_audio = audio_path
 								            else:
 								                # lazily load audio from part 1
 								                input_audio = audio
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
+								            if align_model is not None and len(result["segments"]) > 0:
 								                if result.get("language", "en") != align_metadata["language"]:
 								                    # load new language
 								                    print(f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language...")
 								                    align_model, align_metadata = load_align_model(result["language"], device)
 								                print(">>Performing alignment...")
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								                result = align(result["segments"], align_model, align_metadata, input_audio, device, interpolate_method=interpolate_method)
 								            results.append((result, audio_path))
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
 								        # Unload align model
 								        del align_model
 								        gc.collect()
 								        torch.cuda.empty_cache()
 								    # >> Diarize
 								    if diarize:
 								        if hf_token is None:
 								            print("Warning, no --hf_token used, needs to be saved in environment variable, otherwise will throw error loading diarization model...")
 								        tmp_results = results
 								        results = []
 								        diarize_model = DiarizationPipeline(use_auth_token=hf_token)
 								        for result, input_audio_path in tmp_results:
-												.wav conversion, handle audio with no detected speech

											
										
										
											2023-03-31 23:02:38 +01:00
+								            diarize_segments = diarize_model(input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers)
-												handle tmp wav file better

											
										
										
											2023-04-01 00:06:40 +01:00
+								            results_segments, word_segments = assign_word_speakers(diarize_segments, result["segments"])
 								            result = {"segments": results_segments, "word_segments": word_segments}
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
+								            results.append((result, input_audio_path))
-												handle tmp wav file better

											
										
										
											2023-04-01 00:06:40 +01:00
-												opti the inference loop

											
										
										
											2023-04-09 15:58:55 +08:00
+								    # >> Write
 								    for result, audio_path in results:
-												v3 init

											
										
										
											2023-04-24 21:08:43 +01:00
+								        writer(result, audio_path, writer_args)
-												skeleton v2

											
										
										
											2023-03-30 05:31:57 +01:00
-												new logic, diarization, vad filtering

											
										
										
											2023-01-24 15:02:08 +00:00
# Script entry point: invoke the command-line interface only when this file
# is executed as the main program, so that importing the module has no side
# effects beyond defining `cli`.
if __name__ == "__main__":
    cli()