From c3de5e9580045947083b0a760a43059f68bad1d5 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <32404268+MahmoudAshraf97@users.noreply.github.com>
Date: Thu, 26 Jan 2023 00:36:29 +0200
Subject: [PATCH 1/3] Update README.md

fixed model name

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3a24550..9fae9fc 100644
--- a/README.md
+++ b/README.md
@@ -85,7 +85,7 @@ Run whisper on example segment (using default params)
 
 For increased timestamp accuracy, at the cost of higher gpu mem, use bigger models and VAD filtering e.g.
 
-    whisperx examples/sample01.wav --model large.en --vad_filter --align_model WAV2VEC2_ASR_LARGE_LV60K_960H
+    whisperx examples/sample01.wav --model large-v2 --vad_filter --align_model WAV2VEC2_ASR_LARGE_LV60K_960H
 
 Result using *WhisperX* with forced alignment to wav2vec2.0 large:
 

From e7773358a3f3f7096f9fdfb1b9a33a8146c6585f Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <32404268+MahmoudAshraf97@users.noreply.github.com>
Date: Thu, 26 Jan 2023 00:42:35 +0200
Subject: [PATCH 2/3] Update transcribe.py

added the ability to include HF access token in order to use PyAnnote models

---
 whisperx/transcribe.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index 7f07f3c..4acc9c5 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -385,7 +385,8 @@ def cli():
     parser.add_argument("--logprob_threshold", type=optional_float, default=-1.0, help="if the average log probability is lower than this value, treat the decoding as failed")
     parser.add_argument("--no_speech_threshold", type=optional_float, default=0.6, help="if the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence")
     parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
-
+    parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")
+
     args = parser.parse_args().__dict__
     model_name: str = args.pop("model")
     model_dir: str = args.pop("model_dir")
@@ -397,7 +398,8 @@ def cli():
     align_extend: float = args.pop("align_extend")
     align_from_prev: bool = args.pop("align_from_prev")
     interpolate_method: bool = args.pop("interpolate_method")
-
+
+    hf_token: str = args.pop("hf_token")
     vad_filter: bool = args.pop("vad_filter")
     vad_input: bool = args.pop("vad_input")
 
@@ -410,12 +412,14 @@
         vad_input = pd.read_csv(vad_input, header=None, sep= " ")
     elif vad_filter:
         from pyannote.audio import Pipeline
-        vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection")
+        vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
+                                                use_auth_token=hf_token)
 
     diarize_pipeline = None
     if diarize:
         from pyannote.audio import Pipeline
-        diarize_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1")
+        diarize_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
+                                                    use_auth_token=hf_token)
 
     os.makedirs(output_dir, exist_ok=True)
 
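For context on what PATCH 2/3 enables: the pyannote pipelines used for VAD and diarization are gated on the Hugging Face Hub, so `Pipeline.from_pretrained` needs an access token. Below is a minimal standalone sketch of the same loading pattern, not part of the patch itself; it assumes pyannote.audio 2.x, that the gated-model user agreements have been accepted, and it uses placeholder values for the token and audio path.

    # Minimal sketch (assumptions: pyannote.audio 2.x, gated-model agreements accepted).
    from pyannote.audio import Pipeline

    hf_token = "hf_xxx"                   # placeholder: your Hugging Face access token
    audio_file = "examples/sample01.wav"  # example file referenced in the README

    # Same call the patch makes inside cli(), shown standalone.
    vad_pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                            use_auth_token=hf_token)

    # Run VAD and print the detected speech regions.
    speech = vad_pipeline(audio_file)
    for region in speech.get_timeline().support():
        print(f"speech {region.start:.2f}s -> {region.end:.2f}s")
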
From 99b6e79fbffaba9301888d254996a975beb516c0 Mon Sep 17 00:00:00 2001
From: Mahmoud Ashraf <32404268+MahmoudAshraf97@users.noreply.github.com>
Date: Thu, 26 Jan 2023 00:56:10 +0200
Subject: [PATCH 3/3] Update README.md

added additional instructions to use PyAnnote modules

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 9fae9fc..af5dec7 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,8 @@ This repository refines the timestamps of openAI's Whisper model via forced alig
 - Character level timestamps (see `*.char.ass` file output)
 - Diarization (still in beta, add `--diarization`)
+To enable VAD filtering and Diarization, include your Hugging Face access token (which you can generate [here](https://huggingface.co/settings/tokens)) after the `--hf_token` argument, and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation), [Voice Activity Detection (VAD)](https://huggingface.co/pyannote/voice-activity-detection), and [Speaker Diarization](https://huggingface.co/pyannote/speaker-diarization)
+
 
 Setup ⚙️
 
 Install this package using
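
The README addition in PATCH 3/3 tells users to pass `--hf_token` together with VAD filtering or diarization. Under the hood, the diarization path loads the gated `pyannote/speaker-diarization@2.1` pipeline with that same token; the following rough standalone sketch (again with placeholder token and audio path, pyannote.audio 2.x assumed) shows what that pipeline does once access is granted.

    # Minimal sketch, not the whisperx CLI itself: run the gated diarization pipeline.
    from pyannote.audio import Pipeline

    hf_token = "hf_xxx"  # placeholder: your Hugging Face access token
    diarize_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization@2.1",
                                                use_auth_token=hf_token)

    # Label speaker turns in the example audio and print them.
    diarization = diarize_pipeline("examples/sample01.wav")
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        print(f"{speaker}: {turn.start:.2f}s -> {turn.end:.2f}s")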