Merge 399010fd12 into d700b56c9c

Revert "docs: add troubleshooting section for libcudnn dependencies in README"
This reverts commit 6fe0a8784a. Revert the commit now that the issue is fixed. Signed-off-by: CHEN, CHUN <jim60105@gmail.com>
2025-07-01 18:17:27 -04:00 · 2025-06-13 16:23:14 +00:00 · 2025-06-14 00:22:57 +08:00 · 2025-06-14 00:21:53 +08:00 · 2025-06-14 00:21:53 +08:00
6 changed files with 1087 additions and 1106 deletions
--- a/README.md
+++ b/README.md
@@ -97,25 +97,6 @@ uv sync --all-extras --dev

 You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.

-### Common Issues & Troubleshooting 🔧
-
-#### libcudnn Dependencies (GPU Users)
-
-If you're using WhisperX with GPU support and encounter errors like:
-
- `Could not load library libcudnn_ops_infer.so.8`
- `Unable to load any of {libcudnn_cnn.so.9.1.0, libcudnn_cnn.so.9.1, libcudnn_cnn.so.9, libcudnn_cnn.so}`
- `libcudnn_ops_infer.so.8: cannot open shared object file: No such file or directory`
-
-This means your system is missing the CUDA Deep Neural Network library (cuDNN). This library is needed for GPU acceleration but isn't always installed by default.
-
-**Install cuDNN (example for apt based systems):**
-
-```bash
-sudo apt update
-sudo apt install libcudnn8 libcudnn8-dev -y
-```
-
 ### Speaker Diarization

 To **enable Speaker Diarization**, include your Hugging Face access token (read) that you can generate from [Here](https://huggingface.co/settings/tokens) after the `--hf_token` argument and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation-3.0) and [Speaker-Diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) (if you choose to use Speaker-Diarization 2.x, follow requirements [here](https://huggingface.co/pyannote/speaker-diarization) instead.)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,7 +2,7 @@
 urls = { repository = "https://github.com/m-bain/whisperx" }
 authors = [{ name = "Max Bain" }]
 name = "whisperx"
-version = "3.4.0"
+version = "3.3.4"
 description = "Time-Accurate Automatic Speech Recognition using Whisper."
 readme = "README.md"
 requires-python = ">=3.9, <3.13"
@@ -13,11 +13,11 @@ dependencies = [
    "faster-whisper>=1.1.1",
    "nltk>=3.9.1",
    "numpy>=2.0.2",
-    "onnxruntime>=1.19",
+    "onnxruntime>=1.19,<1.20.0",
    "pandas>=2.2.3",
    "pyannote-audio>=3.3.2",
-    "torch>=2.5.1",
-    "torchaudio>=2.5.1",
+    "torch<2.4.0",
+    "torchaudio",
    "transformers>=4.48.0",
 ]

--- a/uv.lock
+++ b/uv.lock
--- a/whisperx/main.py
+++ b/whisperx/main.py
@@ -44,7 +44,6 @@ def cli():
    parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
    parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
    parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")
-    parser.add_argument("--speaker_embeddings", action="store_true", help="Include speaker embeddings in JSON output (only works with --diarize)")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
--- a/whisperx/diarize.py
+++ b/whisperx/diarize.py
@@ -26,81 +26,25 @@ class DiarizationPipeline:
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
-        return_embeddings: bool = False,
-    ) -> Union[tuple[pd.DataFrame, Optional[dict[str, list[float]]]], pd.DataFrame]:
-        """
-        Perform speaker diarization on audio.
-
-        Args:
-            audio: Path to audio file or audio array
-            num_speakers: Exact number of speakers (if known)
-            min_speakers: Minimum number of speakers to detect
-            max_speakers: Maximum number of speakers to detect
-            return_embeddings: Whether to return speaker embeddings
-
-        Returns:
-            If return_embeddings is True:
-                Tuple of (diarization dataframe, speaker embeddings dictionary)
-            Otherwise:
-                Just the diarization dataframe
-        """
+    ):
        if isinstance(audio, str):
            audio = load_audio(audio)
        audio_data = {
            'waveform': torch.from_numpy(audio[None, :]),
            'sample_rate': SAMPLE_RATE
        }
-
-        if return_embeddings:
-            diarization, embeddings = self.model(
-                audio_data,
-                num_speakers=num_speakers,
-                min_speakers=min_speakers,
-                max_speakers=max_speakers,
-                return_embeddings=True,
-            )
-        else:
-            diarization = self.model(
-                audio_data,
-                num_speakers=num_speakers,
-                min_speakers=min_speakers,
-                max_speakers=max_speakers,
-            )
-            embeddings = None
-
-        diarize_df = pd.DataFrame(diarization.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
+        segments = self.model(audio_data, num_speakers = num_speakers, min_speakers=min_speakers, max_speakers=max_speakers)
+        diarize_df = pd.DataFrame(segments.itertracks(yield_label=True), columns=['segment', 'label', 'speaker'])
        diarize_df['start'] = diarize_df['segment'].apply(lambda x: x.start)
        diarize_df['end'] = diarize_df['segment'].apply(lambda x: x.end)
-
-        if return_embeddings and embeddings is not None:
-            speaker_embeddings = {speaker: embeddings[s].tolist() for s, speaker in enumerate(diarization.labels())}
-            return diarize_df, speaker_embeddings
-        
-        # For backwards compatibility
-        if return_embeddings:
-            return diarize_df, None
-        else:
-            return diarize_df
+        return diarize_df


 def assign_word_speakers(
    diarize_df: pd.DataFrame,
    transcript_result: Union[AlignedTranscriptionResult, TranscriptionResult],
-    speaker_embeddings: Optional[dict[str, list[float]]] = None,
-    fill_nearest: bool = False,
-) -> Union[AlignedTranscriptionResult, TranscriptionResult]:
-    """
-    Assign speakers to words and segments in the transcript.
-
-    Args:
-        diarize_df: Diarization dataframe from DiarizationPipeline
-        transcript_result: Transcription result to augment with speaker labels
-        speaker_embeddings: Optional dictionary mapping speaker IDs to embedding vectors
-        fill_nearest: If True, assign speakers even when there's no direct time overlap
-
-    Returns:
-        Updated transcript_result with speaker assignments and optionally embeddings
-    """
+    fill_nearest=False,
+) -> dict:
    transcript_segments = transcript_result["segments"]
    for seg in transcript_segments:
        # assign speaker to segment (if any)
@@ -131,12 +75,8 @@ def assign_word_speakers(
                        # sum over speakers
                        speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
                        word["speaker"] = speaker
-
-    # Add speaker embeddings to the result if provided
-    if speaker_embeddings is not None:
-        transcript_result["speaker_embeddings"] = speaker_embeddings
-
-    return transcript_result
+        
+    return transcript_result            


 class Segment:
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -59,10 +59,6 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
    max_speakers: int = args.pop("max_speakers")
    diarize_model_name: str = args.pop("diarize_model")
    print_progress: bool = args.pop("print_progress")
-    return_speaker_embeddings: bool = args.pop("speaker_embeddings")
-
-    if return_speaker_embeddings and not diarize:
-        warnings.warn("--speaker_embeddings has no effect without --diarize")

    if args["language"] is not None:
        args["language"] = args["language"].lower()
@@ -213,13 +209,10 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
        results = []
        diarize_model = DiarizationPipeline(model_name=diarize_model_name, use_auth_token=hf_token, device=device)
        for result, input_audio_path in tmp_results:
-            diarize_segments, speaker_embeddings = diarize_model(
-                input_audio_path, 
-                min_speakers=min_speakers, 
-                max_speakers=max_speakers, 
-                return_embeddings=return_speaker_embeddings
+            diarize_segments = diarize_model(
+                input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers
            )
-            result = assign_word_speakers(diarize_segments, result, speaker_embeddings)
+            result = assign_word_speakers(diarize_segments, result)
            results.append((result, input_audio_path))
    # >> Write
    for result, audio_path in results:
Author	SHA1	Message	Date
陳鈞	b94778fd60	Merge `399010fd12` into `d700b56c9c`	2025-06-13 16:23:14 +00:00
CHEN, CHUN	399010fd12	Revert "docs: add troubleshooting section for libcudnn dependencies in README" This reverts commit `6fe0a8784a`. Revert the commit now that the issue is fixed. Signed-off-by: CHEN, CHUN <jim60105@gmail.com>	2025-06-14 00:22:57 +08:00
CHEN, CHUN	d3dcb1175f	chore: restrict onnxruntime to version 1.19 for python 3.9 compatibility - Restrict the onnxruntime dependency to versions >=1.19 and <1.20.0 to avoid potential compatibility issues. Signed-off-by: CHEN, CHUN <jim60105@gmail.com>	2025-06-14 00:21:53 +08:00
CHEN, CHUN	4f99f1f67c	chore: restrict torch version to below 2.4 in dependencies. torch depends on libcudnn9 from version 2.4.0 onward. If we restrict torch<2.4.0, there is no need to manually install libcudnn8, and it also saves about 1GB of disk space. - Update torch dependency to be below version 2.4.0 instead of at least 2.5.1 - Change torchaudio dependency to have no minimum version specified Signed-off-by: CHEN, CHUN <jim60105@gmail.com>	2025-06-14 00:21:53 +08:00