From 14e593f60b8036a7308cb019d33d7bc6215a35bd Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Wed, 16 Aug 2023 16:08:25 +0200
Subject: [PATCH 01/11] alignment.py: add default Hindi alignment model

---
 whisperx/alignment.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 8024bf8..0628f42 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -50,6 +50,7 @@ DEFAULT_ALIGN_MODELS_HF = {
     "ko": "kresnik/wav2vec2-large-xlsr-korean",
     "ur": "kingabzpro/wav2vec2-large-xls-r-300m-Urdu",
     "te": "anuragshas/wav2vec2-large-xlsr-53-telugu",
+    "hi": "theainerd/Wav2Vec2-large-xlsr-hindi"
 }
 
 

From 4acb5b3abcf667a9bbf36891e6cd9fc556a7a0a4 Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Wed, 16 Aug 2023 16:11:46 +0200
Subject: [PATCH 02/11] asr.py: print percent progress while transcribing

---
 whisperx/asr.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/whisperx/asr.py b/whisperx/asr.py
index ecc2765..ba65a95 100644
--- a/whisperx/asr.py
+++ b/whisperx/asr.py
@@ -285,7 +285,10 @@ class FasterWhisperPipeline(Pipeline):
 
         segments: List[SingleSegment] = []
         batch_size = batch_size or self._batch_size
+        total_segments = len(vad_segments)
         for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
+            percent_complete = ((idx + 1) / total_segments) * 100
+            print(f"Progress: {percent_complete:.2f}%...")
             text = out['text']
             if batch_size in [0, 1, None]:
                 text = text[0]

From 1bb4839b0f8d5d10a19ac64aae91623d8ad96db0 Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Wed, 16 Aug 2023 16:13:28 +0200
Subject: [PATCH 03/11] alignment.py: print percent progress while aligning

---
 whisperx/alignment.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 0628f42..bda322c 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -162,9 +162,15 @@ def align(
         segment["sentence_spans"] = sentence_spans
     
     aligned_segments: List[SingleAlignedSegment] = []
-
+    total_segments = len(list(transcript))
+    transcript = iter(transcript)
+    
     # 2. Get prediction matrix from alignment model & align
     for sdx, segment in enumerate(transcript):
+        
+        percent_complete = ((sdx + 1) / total_segments) * 100
+        print(f"Progress: {percent_complete:.2f}%...")
+        
         t1 = segment["start"]
         t2 = segment["end"]
         text = segment["text"]

From 72685d0398a2923241a8055fade1fd2dbf17a77d Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Wed, 16 Aug 2023 16:15:24 +0200
Subject: [PATCH 04/11] asr.py: gate progress output behind print_progress

---
 whisperx/asr.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/whisperx/asr.py b/whisperx/asr.py
index ba65a95..8b1e5ca 100644
--- a/whisperx/asr.py
+++ b/whisperx/asr.py
@@ -247,7 +247,7 @@ class FasterWhisperPipeline(Pipeline):
         return final_iterator
 
     def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None
+        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, print_progress = False
     ) -> TranscriptionResult:
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -287,8 +287,9 @@ class FasterWhisperPipeline(Pipeline):
         batch_size = batch_size or self._batch_size
         total_segments = len(vad_segments)
         for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
-            percent_complete = ((idx + 1) / total_segments) * 100
-            print(f"Progress: {percent_complete:.2f}%...")
+            if print_progress:
+                percent_complete = ((idx + 1) / total_segments) * 100
+                print(f"Progress: {percent_complete:.2f}%...")
             text = out['text']
             if batch_size in [0, 1, None]:
                 text = text[0]

From 65688208c9696ad479f608ab4cd2e66bf789eea8 Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Wed, 16 Aug 2023 16:18:00 +0200
Subject: [PATCH 05/11] alignment.py: add print_progress; report in preprocessing loop

---
 whisperx/alignment.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index bda322c..859c617 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -98,6 +98,7 @@ def align(
     device: str,
     interpolate_method: str = "nearest",
     return_char_alignments: bool = False,
+    print_progress = False
 ) -> AlignedTranscriptionResult:
     """
     Align phoneme recognition predictions to known transcription.
@@ -116,9 +117,16 @@ def align(
     model_lang = align_model_metadata["language"]
     model_type = align_model_metadata["type"]
 
+    total_segments = len(list(transcript))
+    transcript = iter(transcript)
+
     # 1. Preprocess to keep only characters in dictionary
     for sdx, segment in enumerate(transcript):
         # strip spaces at beginning / end, but keep track of the amount.
+        if print_progress:
+            percent_complete = ((sdx + 1) / total_segments) * 100
+            print(f"Progress: {percent_complete:.2f}%...")
+            
         num_leading = len(segment["text"]) - len(segment["text"].lstrip())
         num_trailing = len(segment["text"]) - len(segment["text"].rstrip())
         text = segment["text"]
@@ -162,15 +170,10 @@ def align(
         segment["sentence_spans"] = sentence_spans
     
     aligned_segments: List[SingleAlignedSegment] = []
-    total_segments = len(list(transcript))
-    transcript = iter(transcript)
     
     # 2. Get prediction matrix from alignment model & align
     for sdx, segment in enumerate(transcript):
         
-        percent_complete = ((sdx + 1) / total_segments) * 100
-        print(f"Progress: {percent_complete:.2f}%...")
-        
         t1 = segment["start"]
         t2 = segment["end"]
         text = segment["text"]

From cb3ed4ab9d92937703993e2a653d70dfa420c73a Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Wed, 16 Aug 2023 16:22:29 +0200
Subject: [PATCH 06/11] transcribe.py: add --print_progress CLI option

---
 whisperx/transcribe.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index 1cc144e..49788bd 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -73,6 +73,8 @@ def cli():
     parser.add_argument("--threads", type=optional_int, default=0, help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
 
     parser.add_argument("--hf_token", type=str, default=None, help="Hugging Face Access Token to access PyAnnote gated models")
+
+    parser.add_argument("--print_progress", type=str2bool, default = False, help = "if True, progress will be printed in transcribe() and align() methods.")
     # fmt: on
 
     args = parser.parse_args().__dict__
@@ -104,6 +106,7 @@ def cli():
     diarize: bool = args.pop("diarize")
     min_speakers: int = args.pop("min_speakers")
     max_speakers: int = args.pop("max_speakers")
+    print_progress: bool = args.pop("print_progress")
 
 
     if model_name.endswith(".en") and args["language"] not in {"en", "English"}:
@@ -156,7 +159,7 @@ def cli():
         audio = load_audio(audio_path)
         # >> VAD & ASR
         print(">>Performing transcription...")
-        result = model.transcribe(audio, batch_size=batch_size)
+        result = model.transcribe(audio, batch_size=batch_size, print_progress=print_progress)
         results.append((result, audio_path))
 
     # Unload Whisper and VAD
@@ -184,7 +187,7 @@ def cli():
                     print(f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language...")
                     align_model, align_metadata = load_align_model(result["language"], device)
                 print(">>Performing alignment...")
-                result = align(result["segments"], align_model, align_metadata, input_audio, device, interpolate_method=interpolate_method, return_char_alignments=return_char_alignments)
+                result = align(result["segments"], align_model, align_metadata, input_audio, device, interpolate_method=interpolate_method, return_char_alignments=return_char_alignments, print_progress=print_progress)
 
             results.append((result, audio_path))
 

From d2d840f06ce32a10f5184fff771ada4c3a53324c Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Thu, 17 Aug 2023 14:45:23 +0200
Subject: [PATCH 07/11] utils.py: return early in SubtitlesWriter when result has no segments

---
 whisperx/utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/whisperx/utils.py b/whisperx/utils.py
index 36c7543..c68e224 100644
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -225,6 +225,9 @@ class SubtitlesWriter(ResultWriter):
         highlight_words: bool = options["highlight_words"]
         max_line_width = 1000 if raw_max_line_width is None else raw_max_line_width
         preserve_segments = max_line_count is None or raw_max_line_width is None
+        
+        if len(result["segments"]) == 0:
+            return
 
         def iterate_subtitles():
             line_len = 0

From ea7bb91a5614b222f77df72f9448207051998902 Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Thu, 17 Aug 2023 14:49:57 +0200
Subject: [PATCH 08/11] asr.py: add combined_progress (transcription reports 0-50%)

---
 whisperx/asr.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/whisperx/asr.py b/whisperx/asr.py
index 8b1e5ca..bef3cd8 100644
--- a/whisperx/asr.py
+++ b/whisperx/asr.py
@@ -247,7 +247,7 @@ class FasterWhisperPipeline(Pipeline):
         return final_iterator
 
     def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, print_progress = False
+        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, print_progress = False, combined_progress=False
     ) -> TranscriptionResult:
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -288,7 +288,8 @@ class FasterWhisperPipeline(Pipeline):
         total_segments = len(vad_segments)
         for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
             if print_progress:
-                percent_complete = ((idx + 1) / total_segments) * 100
+                base_progress = ((idx + 1) / total_segments) * 100
+                percent_complete = base_progress / 2 if combined_progress else base_progress
                 print(f"Progress: {percent_complete:.2f}%...")
             text = out['text']
             if batch_size in [0, 1, None]:

From abbb66b58e97ad6f98a2224ec31a222bbad04bcb Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Thu, 17 Aug 2023 14:53:53 +0200
Subject: [PATCH 09/11] alignment.py: add combined_progress (alignment reports 50-100%)

---
 whisperx/alignment.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index 859c617..c0d9c84 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -98,7 +98,8 @@ def align(
     device: str,
     interpolate_method: str = "nearest",
     return_char_alignments: bool = False,
-    print_progress = False
+    print_progress = False,
+    combined_progress = False
 ) -> AlignedTranscriptionResult:
     """
     Align phoneme recognition predictions to known transcription.
@@ -124,7 +125,8 @@ def align(
     for sdx, segment in enumerate(transcript):
         # strip spaces at beginning / end, but keep track of the amount.
         if print_progress:
-            percent_complete = ((sdx + 1) / total_segments) * 100
+            base_progress = ((sdx + 1) / total_segments) * 100
+            percent_complete = (50 + base_progress / 2) if combined_progress else base_progress
             print(f"Progress: {percent_complete:.2f}%...")
             
         num_leading = len(segment["text"]) - len(segment["text"].lstrip())

From 6cb7267dc2b566d2cdb880288944bf6854e7b946 Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Thu, 17 Aug 2023 14:56:54 +0200
Subject: [PATCH 10/11] alignment.py: accept total_segments as a parameter (default 0)

---
 whisperx/alignment.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index c0d9c84..d98c290 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -99,7 +99,8 @@ def align(
     interpolate_method: str = "nearest",
     return_char_alignments: bool = False,
     print_progress = False,
-    combined_progress = False
+    combined_progress = False,
+    total_segments = 0
 ) -> AlignedTranscriptionResult:
     """
     Align phoneme recognition predictions to known transcription.
@@ -118,9 +119,6 @@ def align(
     model_lang = align_model_metadata["language"]
     model_type = align_model_metadata["type"]
 
-    total_segments = len(list(transcript))
-    transcript = iter(transcript)
-
     # 1. Preprocess to keep only characters in dictionary
     for sdx, segment in enumerate(transcript):
         # strip spaces at beginning / end, but keep track of the amount.

From 4e28492dbd412ad4286e01474154817495013e1a Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Thu, 17 Aug 2023 14:57:53 +0200
Subject: [PATCH 11/11] alignment.py: annotate types of progress parameters

---
 whisperx/alignment.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/whisperx/alignment.py b/whisperx/alignment.py
index d98c290..5608f49 100644
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -98,9 +98,9 @@ def align(
     device: str,
     interpolate_method: str = "nearest",
     return_char_alignments: bool = False,
-    print_progress = False,
-    combined_progress = False,
-    total_segments = 0
+    print_progress: bool = False,
+    combined_progress: bool = False,
+    total_segments: int = 0
 ) -> AlignedTranscriptionResult:
     """
     Align phoneme recognition predictions to known transcription.