6 Commits

3 changed files with 14 additions and 5 deletions

View File

@@ -450,8 +450,8 @@ def align(
                 "end": srow["end"],
                 "text": text,
                 "words": word_list,
-                # "word-segments": wseg,
-                # "char-segments": cseg
+                "word-segments": wseg,
+                "char-segments": cseg
             }
         )
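
This hunk re-enables the per-segment alignment tables, so each aligned segment now carries "word-segments" and "char-segments" alongside the plain "words" list. A rough sketch of how a caller might inspect the extra keys, assuming `result` is the dict returned by the alignment step and that the tables are pandas DataFrames (as the CLI change further down implies):

# Sketch only: dump the re-enabled alignment tables for each segment.
def show_alignment_tables(result: dict) -> None:
    for seg in result["segments"]:
        print(seg["text"])
        print(seg["word-segments"])   # per-word timing table (assumed pandas DataFrame)
        print(seg["char-segments"])   # per-character timing table (assumed pandas DataFrame)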

View File

@@ -207,7 +207,7 @@ class FasterWhisperPipeline(Pipeline):
         return final_iterator
     def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None
+        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0
     ):
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -232,7 +232,7 @@ class FasterWhisperPipeline(Pipeline):
         segments = []
         batch_size = batch_size or self._batch_size
-        for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size)):
+        for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
             text = out['text']
             if batch_size in [0, 1, None]:
                 text = text[0]
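
With these two hunks, transcribe() accepts num_workers and simply forwards it to the underlying pipeline __call__, so batched decoding can use parallel data-loading workers. A usage sketch; whisperx.load_model and whisperx.load_audio are assumed to behave as in the project README, and the model name, paths, and numbers are illustrative only:

import whisperx

# Assumed entry points; exact signatures may differ between versions.
model = whisperx.load_model("large-v2", device="cuda")
audio = whisperx.load_audio("audio.mp3")

# batch_size groups 30 s chunks per forward pass; num_workers is the new
# knob forwarded to the pipeline's data loader (0 keeps the old behaviour).
result = model.transcribe(audio, batch_size=16, num_workers=2)
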
@@ -251,7 +251,10 @@ class FasterWhisperPipeline(Pipeline):
     def detect_language(self, audio: np.ndarray):
-        segment = log_mel_spectrogram(audio[: N_SAMPLES], padding=0)
+        if audio.shape[0] < N_SAMPLES:
+            print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
+        segment = log_mel_spectrogram(audio[: N_SAMPLES],
+                                      padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0])
         encoder_output = self.model.encode(segment)
         results = self.model.model.detect_language(encoder_output)
         language_token, language_probability = results[0][0]
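
detect_language() previously always called log_mel_spectrogram with padding=0, so clips shorter than the 30 s detection window produced a truncated mel segment; the change pads short clips out to N_SAMPLES and prints a warning. A small sketch of the padding arithmetic, assuming the usual Whisper constants (SAMPLE_RATE = 16000, N_SAMPLES = 30 * SAMPLE_RATE):

SAMPLE_RATE = 16_000
N_SAMPLES = 30 * SAMPLE_RATE                 # 480_000 samples = one 30 s window

def detection_padding(n_samples: int) -> int:
    # Samples of padding needed so the log-mel segment always spans 30 s.
    return 0 if n_samples >= N_SAMPLES else N_SAMPLES - n_samples

print(detection_padding(10 * SAMPLE_RATE))   # 320000: a 10 s clip is padded by 20 s
print(detection_padding(45 * SAMPLE_RATE))   # 0: long audio is only cropped to 30 s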

View File

@@ -203,6 +203,12 @@ def cli():
     # >> Write
     for result, audio_path in results:
+        # Remove pandas dataframes from result so that
+        # we can serialize the result with json
+        for seg in result["segments"]:
+            seg.pop("word-segments", None)
+            seg.pop("char-segments", None)
         writer(result, audio_path, writer_args)
 if __name__ == "__main__":
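
The keys popped here are the DataFrames re-enabled in the first file; json cannot serialize them, so the CLI drops them before handing the result to the writer. A minimal illustration of the failure and the fix (the json.dumps call is only a stand-in for whatever the writer does internally):

import json
import pandas as pd

seg = {"text": "hello world",
       "word-segments": pd.DataFrame({"word": ["hello", "world"]})}

try:
    json.dumps(seg)
except TypeError as err:
    print(err)                        # DataFrame is not JSON serializable

seg.pop("word-segments", None)
seg.pop("char-segments", None)        # missing key is fine thanks to the default
print(json.dumps(seg))                # serializes cleanly now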