Mirror of https://github.com/m-bain/whisperX.git (synced 2025-07-01 18:17:27 -04:00)

Compare commits: v3.3.4...a326c15268 (6 commits)
Commits in this range (SHA1):

- a326c15268
- b343241253
- 6fe0a8784a
- 5012650d0f
- 108bd0c400
- ae7ea9f4b6
.github/workflows/build-and-release.yml (vendored, 3 lines changed)

```diff
@@ -17,6 +17,9 @@ jobs:
           version: "0.5.14"
           python-version: "3.9"
 
+      - name: Check if lockfile is up to date
+        run: uv lock --check
+
       - name: Build package
         run: uv build
```
.github/workflows/python-compatibility.yml (vendored, 3 lines changed)

```diff
@@ -23,6 +23,9 @@ jobs:
           version: "0.5.14"
           python-version: ${{ matrix.python-version }}
 
+      - name: Check if lockfile is up to date
+        run: uv lock --check
+
       - name: Install the project
         run: uv sync --all-extras
```
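The step added to both workflows, `uv lock --check`, exits non-zero when `uv.lock` is out of sync with `pyproject.toml`, so CI now fails fast on a stale lockfile instead of building or installing from one.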
README.md (19 lines changed)

```diff
@@ -97,6 +97,25 @@ uv sync --all-extras --dev
 
 You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
 
+### Common Issues & Troubleshooting 🔧
+
+#### libcudnn Dependencies (GPU Users)
+
+If you're using WhisperX with GPU support and encounter errors like:
+
+- `Could not load library libcudnn_ops_infer.so.8`
+- `Unable to load any of {libcudnn_cnn.so.9.1.0, libcudnn_cnn.so.9.1, libcudnn_cnn.so.9, libcudnn_cnn.so}`
+- `libcudnn_ops_infer.so.8: cannot open shared object file: No such file or directory`
+
+This means your system is missing the CUDA Deep Neural Network library (cuDNN). This library is needed for GPU acceleration but isn't always installed by default.
+
+**Install cuDNN (example for apt based systems):**
+
+```bash
+sudo apt update
+sudo apt install libcudnn8 libcudnn8-dev -y
+```
+
 ### Speaker Diarization
 
 To **enable Speaker Diarization**, include your Hugging Face access token (read) that you can generate from [Here](https://huggingface.co/settings/tokens) after the `--hf_token` argument and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation-3.0) and [Speaker-Diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) (if you choose to use Speaker-Diarization 2.x, follow requirements [here](https://huggingface.co/pyannote/speaker-diarization) instead.)
```
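If the install succeeds but you want to confirm which cuDNN your Python environment actually picks up, PyTorch reports the build it loads. A minimal diagnostic sketch (an editor's illustration, not part of the diff; assumes a CUDA-enabled torch build):

```python
# Check whether CUDA and cuDNN are usable, and which cuDNN build torch sees.
import torch

print("CUDA available: ", torch.cuda.is_available())
print("cuDNN available:", torch.backends.cudnn.is_available())
print("cuDNN version:  ", torch.backends.cudnn.version())  # e.g. 8902 for cuDNN 8.9.2
```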
uv.lock (generated, 2 lines changed)

```diff
@@ -2787,7 +2787,7 @@ wheels = [
 
 [[package]]
 name = "whisperx"
-version = "3.3.3"
+version = "3.3.4"
 source = { editable = "." }
 dependencies = [
     { name = "ctranslate2" },
```
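The only substantive lockfile change is the regenerated `version` field, keeping `uv.lock` aligned with the 3.3.4 release; this is exactly the condition the new `uv lock --check` step enforces.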
```diff
@@ -43,6 +43,7 @@ def cli():
     parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
     parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
     parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
+    parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")
 
     parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")
```
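Since `--diarize_model` defaults to `pyannote/speaker-diarization-3.1`, existing command lines are unaffected; a hypothetical override would look like `whisperx audio.wav --diarize --diarize_model <model-id> --hf_token <token>`.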
```diff
@@ -58,11 +58,33 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
             str(sr),
             "-",
         ]
-        out = subprocess.run(cmd, capture_output=True, check=True).stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            bufsize=10 * 1024 * 1024
+        )
+
+        out = bytearray()
+        while True:
+            chunk = process.stdout.read(1024 * 1024)
+            if not chunk:
+                break
+            out.extend(chunk)
+
+        stderr_output = process.stderr.read()
+        return_code = process.wait()
+
+        if return_code != 0:
+            raise RuntimeError(f"FFmpeg process failed with error: {stderr_output.decode()}")
+
+        if len(out) % 2 != 0:
+            raise ValueError("Audio buffer size is not aligned to int16.")
+
+        return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
+    except Exception as e:
+        raise RuntimeError(f"Error loading audio file {file}: {str(e)}")
+
+
 def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
```
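The rewrite streams ffmpeg's stdout in 1 MiB chunks into a `bytearray` instead of buffering the whole decode through `subprocess.run`, then checks the exit code and int16 alignment before converting to normalized float32. A minimal usage sketch (assumes whisperx is installed, `ffmpeg` is on `PATH`, and `sample.wav` is a hypothetical local file):

```python
import numpy as np
from whisperx.audio import SAMPLE_RATE, load_audio

# load_audio decodes to mono float32 PCM at 16 kHz, scaled to [-1.0, 1.0].
audio = load_audio("sample.wav")
assert audio.dtype == np.float32
assert np.abs(audio).max() <= 1.0  # int16 samples divided by 32768
print(f"Loaded {len(audio) / SAMPLE_RATE:.1f}s of audio")
```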
```diff
@@ -11,13 +11,14 @@ from whisperx.types import TranscriptionResult, AlignedTranscriptionResult
 class DiarizationPipeline:
     def __init__(
         self,
-        model_name="pyannote/speaker-diarization-3.1",
+        model_name=None,
         use_auth_token=None,
         device: Optional[Union[str, torch.device]] = "cpu",
     ):
         if isinstance(device, str):
             device = torch.device(device)
-        self.model = Pipeline.from_pretrained(model_name, use_auth_token=use_auth_token).to(device)
+        model_config = model_name or "pyannote/speaker-diarization-3.1"
+        self.model = Pipeline.from_pretrained(model_config, use_auth_token=use_auth_token).to(device)
 
     def __call__(
         self,
```
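With the default now `model_name=None`, the fallback is resolved inside the constructor, so callers may omit the argument or pin a model explicitly. A hedged sketch (`"HF_TOKEN"` is a placeholder for a real Hugging Face access token):

```python
from whisperx.diarize import DiarizationPipeline

# model_name=None falls back to "pyannote/speaker-diarization-3.1".
default_pipeline = DiarizationPipeline(use_auth_token="HF_TOKEN", device="cuda")

# Or pin the diarization model explicitly.
pinned_pipeline = DiarizationPipeline(
    model_name="pyannote/speaker-diarization-3.1",
    use_auth_token="HF_TOKEN",
    device="cuda",
)
```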
```diff
@@ -57,6 +57,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
     diarize: bool = args.pop("diarize")
     min_speakers: int = args.pop("min_speakers")
     max_speakers: int = args.pop("max_speakers")
+    diarize_model_name: str = args.pop("diarize_model")
     print_progress: bool = args.pop("print_progress")
 
     if args["language"] is not None:
@@ -204,8 +205,9 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
         )
         tmp_results = results
         print(">>Performing diarization...")
+        print(">>Using model:", diarize_model_name)
         results = []
-        diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)
+        diarize_model = DiarizationPipeline(model_name=diarize_model_name, use_auth_token=hf_token, device=device)
         for result, input_audio_path in tmp_results:
             diarize_segments = diarize_model(
                 input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers
```
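Together with the new `--diarize_model` argument in `cli()`, these hunks carry the user's model choice through `transcribe_task` into `DiarizationPipeline`, logging it (`>>Using model: ...`) before diarization runs.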