From 14e593f60b8036a7308cb019d33d7bc6215a35bd Mon Sep 17 00:00:00 2001 From: awerks Date: Wed, 16 Aug 2023 16:08:25 +0200 Subject: [PATCH 01/11] Update alignment.py --- whisperx/alignment.py | 1 + 1 file changed, 1 insertion(+) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 8024bf8..0628f42 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -50,6 +50,7 @@ DEFAULT_ALIGN_MODELS_HF = { "ko": "kresnik/wav2vec2-large-xlsr-korean", "ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu", "te": "anuragshas/wav2vec2-large-xlsr-53-telugu", + "hi": "theainerd/Wav2Vec2-large-xlsr-hindi" } From 4acb5b3abcf667a9bbf36891e6cd9fc556a7a0a4 Mon Sep 17 00:00:00 2001 From: awerks Date: Wed, 16 Aug 2023 16:11:46 +0200 Subject: [PATCH 02/11] Update asr.py --- whisperx/asr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/whisperx/asr.py b/whisperx/asr.py index ecc2765..ba65a95 100644 --- a/whisperx/asr.py +++ b/whisperx/asr.py @@ -285,7 +285,10 @@ class FasterWhisperPipeline(Pipeline): segments: List[SingleSegment] = [] batch_size = batch_size or self._batch_size + total_segments = len(vad_segments) for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)): + percent_complete = ((idx + 1) / total_segments) * 100 + print(f"Progress: {percent_complete:.2f}%...") text = out['text'] if batch_size in [0, 1, None]: text = text[0] From 1bb4839b0f8d5d10a19ac64aae91623d8ad96db0 Mon Sep 17 00:00:00 2001 From: awerks Date: Wed, 16 Aug 2023 16:13:28 +0200 Subject: [PATCH 03/11] Update alignment.py --- whisperx/alignment.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 0628f42..bda322c 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -162,9 +162,15 @@ def align( segment["sentence_spans"] = sentence_spans aligned_segments: List[SingleAlignedSegment] = [] - + total_segments = len(list(transcript)) + transcript = iter(transcript) + # 2. Get prediction matrix from alignment model & align for sdx, segment in enumerate(transcript): + + percent_complete = ((sdx + 1) / total_segments) * 100 + print(f"Progress: {percent_complete:.2f}%...") + t1 = segment["start"] t2 = segment["end"] text = segment["text"] From 72685d0398a2923241a8055fade1fd2dbf17a77d Mon Sep 17 00:00:00 2001 From: awerks Date: Wed, 16 Aug 2023 16:15:24 +0200 Subject: [PATCH 04/11] Update asr.py --- whisperx/asr.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/whisperx/asr.py b/whisperx/asr.py index ba65a95..8b1e5ca 100644 --- a/whisperx/asr.py +++ b/whisperx/asr.py @@ -247,7 +247,7 @@ class FasterWhisperPipeline(Pipeline): return final_iterator def transcribe( - self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None + self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, print_progress = False ) -> TranscriptionResult: if isinstance(audio, str): audio = load_audio(audio) @@ -287,8 +287,9 @@ class FasterWhisperPipeline(Pipeline): batch_size = batch_size or self._batch_size total_segments = len(vad_segments) for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)): - percent_complete = ((idx + 1) / total_segments) * 100 - print(f"Progress: {percent_complete:.2f}%...") + if print_progress: + percent_complete = ((idx + 1) / total_segments) * 100 + print(f"Progress: {percent_complete:.2f}%...") text = out['text'] if batch_size in [0, 1, None]: text = text[0] From 65688208c9696ad479f608ab4cd2e66bf789eea8 Mon Sep 17 00:00:00 2001 From: awerks Date: Wed, 16 Aug 2023 16:18:00 +0200 Subject: [PATCH 05/11] Update alignment.py --- whisperx/alignment.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index bda322c..859c617 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -98,6 +98,7 @@ def align( device: str, interpolate_method: str = "nearest", return_char_alignments: bool = False, + print_progress = False ) -> AlignedTranscriptionResult: """ Align phoneme recognition predictions to known transcription. @@ -116,9 +117,16 @@ def align( model_lang = align_model_metadata["language"] model_type = align_model_metadata["type"] + total_segments = len(list(transcript)) + transcript = iter(transcript) + # 1. Preprocess to keep only characters in dictionary for sdx, segment in enumerate(transcript): # strip spaces at beginning / end, but keep track of the amount. + if print_progress: + percent_complete = ((sdx + 1) / total_segments) * 100 + print(f"Progress: {percent_complete:.2f}%...") + num_leading = len(segment["text"]) - len(segment["text"].lstrip()) num_trailing = len(segment["text"]) - len(segment["text"].rstrip()) text = segment["text"] @@ -162,15 +170,10 @@ def align( segment["sentence_spans"] = sentence_spans aligned_segments: List[SingleAlignedSegment] = [] - total_segments = len(list(transcript)) - transcript = iter(transcript) # 2. Get prediction matrix from alignment model & align for sdx, segment in enumerate(transcript): - percent_complete = ((sdx + 1) / total_segments) * 100 - print(f"Progress: {percent_complete:.2f}%...") - t1 = segment["start"] t2 = segment["end"] text = segment["text"] From cb3ed4ab9d92937703993e2a653d70dfa420c73a Mon Sep 17 00:00:00 2001 From: awerks Date: Wed, 16 Aug 2023 16:22:29 +0200 Subject: [PATCH 06/11] Update transcribe.py --- whisperx/transcribe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index 1cc144e..49788bd 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -73,6 +73,8 @@ def cli(): parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS") parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models") + + parser.add_argument("--print_progress", type=str2bool, default = False, help = "if True, progress will be printed in transcribe() and align() methods.") # fmt: on args = parser.parse_args().__dict__ @@ -104,6 +106,7 @@ def cli(): diarize: bool = args.pop("diarize") min_speakers: int = args.pop("min_speakers") max_speakers: int = args.pop("max_speakers") + print_progress: bool = args.pop("print_progress") if model_name.endswith(".en") and args["language"] not in {"en", "English"}: @@ -156,7 +159,7 @@ def cli(): audio = load_audio(audio_path) # >> VAD & ASR print(">>Performing transcription...") - result = model.transcribe(audio, batch_size=batch_size) + result = model.transcribe(audio, batch_size=batch_size, print_progress=print_progress) results.append((result, audio_path)) # Unload Whisper and VAD @@ -184,7 +187,7 @@ def cli(): print(f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language...") align_model, align_metadata = load_align_model(result["language"], device) print(">>Performing alignment...") - result = align(result["segments"], align_model, align_metadata, input_audio, device, interpolate_method=interpolate_method, return_char_alignments=return_char_alignments) + result = align(result["segments"], align_model, align_metadata, input_audio, device, interpolate_method=interpolate_method, return_char_alignments=return_char_alignments, print_progress=print_progress) results.append((result, audio_path)) From d2d840f06ce32a10f5184fff771ada4c3a53324c Mon Sep 17 00:00:00 2001 From: awerks Date: Thu, 17 Aug 2023 14:45:23 +0200 Subject: [PATCH 07/11] Update utils.py --- whisperx/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/whisperx/utils.py b/whisperx/utils.py index 36c7543..c68e224 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -225,6 +225,9 @@ class SubtitlesWriter(ResultWriter): highlight_words: bool = options["highlight_words"] max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width preserve_segments = max_line_count is None or raw_max_line_width is None + + if len(result["segments"]) == 0: + return def iterate_subtitles(): line_len = 0 From ea7bb91a5614b222f77df72f9448207051998902 Mon Sep 17 00:00:00 2001 From: awerks Date: Thu, 17 Aug 2023 14:49:57 +0200 Subject: [PATCH 08/11] Update asr.py --- whisperx/asr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/whisperx/asr.py b/whisperx/asr.py index 8b1e5ca..bef3cd8 100644 --- a/whisperx/asr.py +++ b/whisperx/asr.py @@ -247,7 +247,7 @@ class FasterWhisperPipeline(Pipeline): return final_iterator def transcribe( - self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, print_progress = False + self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, print_progress = False, combined_progress=False ) -> TranscriptionResult: if isinstance(audio, str): audio = load_audio(audio) @@ -288,7 +288,8 @@ class FasterWhisperPipeline(Pipeline): total_segments = len(vad_segments) for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)): if print_progress: - percent_complete = ((idx + 1) / total_segments) * 100 + base_progress = ((idx + 1) / total_segments) * 100 + percent_complete = base_progress / 2 if combined_progress else base_progress print(f"Progress: {percent_complete:.2f}%...") text = out['text'] if batch_size in [0, 1, None]: From abbb66b58e97ad6f98a2224ec31a222bbad04bcb Mon Sep 17 00:00:00 2001 From: awerks Date: Thu, 17 Aug 2023 14:53:53 +0200 Subject: [PATCH 09/11] Update alignment.py --- whisperx/alignment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index 859c617..c0d9c84 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -98,7 +98,8 @@ def align( device: str, interpolate_method: str = "nearest", return_char_alignments: bool = False, - print_progress = False + print_progress = False, + combined_progress = False ) -> AlignedTranscriptionResult: """ Align phoneme recognition predictions to known transcription. @@ -124,7 +125,8 @@ def align( for sdx, segment in enumerate(transcript): # strip spaces at beginning / end, but keep track of the amount. if print_progress: - percent_complete = ((sdx + 1) / total_segments) * 100 + base_progress = ((sdx + 1) / total_segments) * 100 + percent_complete = (50 + base_progress / 2) if combined_progress else base_progress print(f"Progress: {percent_complete:.2f}%...") num_leading = len(segment["text"]) - len(segment["text"].lstrip()) From 6cb7267dc2b566d2cdb880288944bf6854e7b946 Mon Sep 17 00:00:00 2001 From: awerks Date: Thu, 17 Aug 2023 14:56:54 +0200 Subject: [PATCH 10/11] Update alignment.py --- whisperx/alignment.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index c0d9c84..d98c290 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -99,7 +99,8 @@ def align( interpolate_method: str = "nearest", return_char_alignments: bool = False, print_progress = False, - combined_progress = False + combined_progress = False, + total_segments = 0 ) -> AlignedTranscriptionResult: """ Align phoneme recognition predictions to known transcription. @@ -118,9 +119,6 @@ def align( model_lang = align_model_metadata["language"] model_type = align_model_metadata["type"] - total_segments = len(list(transcript)) - transcript = iter(transcript) - # 1. Preprocess to keep only characters in dictionary for sdx, segment in enumerate(transcript): # strip spaces at beginning / end, but keep track of the amount. From 4e28492dbd412ad4286e01474154817495013e1a Mon Sep 17 00:00:00 2001 From: awerks Date: Thu, 17 Aug 2023 14:57:53 +0200 Subject: [PATCH 11/11] Update alignment.py --- whisperx/alignment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/whisperx/alignment.py b/whisperx/alignment.py index d98c290..5608f49 100644 --- a/whisperx/alignment.py +++ b/whisperx/alignment.py @@ -98,9 +98,9 @@ def align( device: str, interpolate_method: str = "nearest", return_char_alignments: bool = False, - print_progress = False, - combined_progress = False, - total_segments = 0 + print_progress: bool = False, + combined_progress: bool = False, + total_segments: int = 0 ) -> AlignedTranscriptionResult: """ Align phoneme recognition predictions to known transcription.