7 Commits

9 changed files with 66 additions and 10 deletions

View File

@ -17,6 +17,9 @@ jobs:
version: "0.5.14"
python-version: "3.9"
- name: Check if lockfile is up to date
run: uv lock --check
- name: Build package
run: uv build

View File

@ -23,6 +23,9 @@ jobs:
version: "0.5.14"
python-version: ${{ matrix.python-version }}
- name: Check if lockfile is up to date
run: uv lock --check
- name: Install the project
run: uv sync --all-extras

View File

@ -97,6 +97,25 @@ uv sync --all-extras --dev
You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
### Common Issues & Troubleshooting 🔧
#### libcudnn Dependencies (GPU Users)
If you're using WhisperX with GPU support and encounter errors like:
- `Could not load library libcudnn_ops_infer.so.8`
- `Unable to load any of {libcudnn_cnn.so.9.1.0, libcudnn_cnn.so.9.1, libcudnn_cnn.so.9, libcudnn_cnn.so}`
- `libcudnn_ops_infer.so.8: cannot open shared object file: No such file or directory`
This means your system is missing the CUDA Deep Neural Network library (cuDNN). This library is needed for GPU acceleration but isn't always installed by default.
**Install cuDNN (example for apt based systems):**
```bash
sudo apt update
sudo apt install libcudnn8 libcudnn8-dev -y
```
### Speaker Diarization
To **enable Speaker Diarization**, include your Hugging Face access token (read) that you can generate from [Here](https://huggingface.co/settings/tokens) after the `--hf_token` argument and accept the user agreement for the following models: [Segmentation](https://huggingface.co/pyannote/segmentation-3.0) and [Speaker-Diarization-3.1](https://huggingface.co/pyannote/speaker-diarization-3.1) (if you choose to use Speaker-Diarization 2.x, follow requirements [here](https://huggingface.co/pyannote/speaker-diarization) instead.)
@ -170,7 +189,7 @@ result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment
# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model
# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model
# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
@ -179,7 +198,7 @@ result = whisperx.align(result["segments"], model_a, metadata, audio, device, re
print(result["segments"]) # after alignment
# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model_a
# 3. Assign speaker labels
diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

2
uv.lock generated
View File

@ -2787,7 +2787,7 @@ wheels = [
[[package]]
name = "whisperx"
version = "3.3.3"
version = "3.3.4"
source = { editable = "." }
dependencies = [
{ name = "ctranslate2" },

View File

@ -43,6 +43,7 @@ def cli():
parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")
parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")

View File

@ -5,7 +5,7 @@ C. Max Bain
import math
from dataclasses import dataclass
from typing import Iterable, Optional, Union, List
from typing import Iterable, Union, List, Callable, Optional
import numpy as np
import pandas as pd
@ -120,6 +120,7 @@ def align(
return_char_alignments: bool = False,
print_progress: bool = False,
combined_progress: bool = False,
on_progress: Callable[[int, int], None] = None
) -> AlignedTranscriptionResult:
"""
Align phoneme recognition predictions to known transcription.
@ -148,6 +149,9 @@ def align(
base_progress = ((sdx + 1) / total_segments) * 100
percent_complete = (50 + base_progress / 2) if combined_progress else base_progress
print(f"Progress: {percent_complete:.2f}%...")
if on_progress:
on_progress(sdx + 1, total_segments)
num_leading = len(segment["text"]) - len(segment["text"].lstrip())
num_trailing = len(segment["text"]) - len(segment["text"].rstrip())

View File

@ -1,6 +1,8 @@
import os
from typing import List, Optional, Union
from dataclasses import replace
import warnings
from typing import List, Union, Optional, NamedTuple, Callable
from enum import Enum
import ctranslate2
import faster_whisper
@ -103,6 +105,12 @@ class FasterWhisperPipeline(Pipeline):
# - add support for timestamp mode
# - add support for custom inference kwargs
class TranscriptionState(Enum):
LOADING_AUDIO = "loading_audio"
GENERATING_VAD_SEGMENTS = "generating_vad_segments"
TRANSCRIBING = "transcribing"
FINISHED = "finished"
def __init__(
self,
model: WhisperModel,
@ -197,8 +205,12 @@ class FasterWhisperPipeline(Pipeline):
print_progress=False,
combined_progress=False,
verbose=False,
on_progress: Callable[[TranscriptionState, Optional[int], Optional[int]], None] = None,
) -> TranscriptionResult:
if isinstance(audio, str):
if on_progress:
on_progress(self.__class__.TranscriptionState.LOADING_AUDIO)
audio = load_audio(audio)
def data(audio, segments):
@ -216,6 +228,8 @@ class FasterWhisperPipeline(Pipeline):
else:
waveform = Pyannote.preprocess_audio(audio)
merge_chunks = Pyannote.merge_chunks
if on_progress:
on_progress(self.__class__.TranscriptionState.GENERATING_VAD_SEGMENTS)
vad_segments = self.vad_model({"waveform": waveform, "sample_rate": SAMPLE_RATE})
vad_segments = merge_chunks(
@ -255,16 +269,22 @@ class FasterWhisperPipeline(Pipeline):
segments: List[SingleSegment] = []
batch_size = batch_size or self._batch_size
total_segments = len(vad_segments)
if on_progress:
on_progress(self.__class__.TranscriptionState.TRANSCRIBING, 0, total_segments)
for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
if print_progress:
base_progress = ((idx + 1) / total_segments) * 100
percent_complete = base_progress / 2 if combined_progress else base_progress
print(f"Progress: {percent_complete:.2f}%...")
if on_progress:
on_progress(self.__class__.TranscriptionState.TRANSCRIBING, idx + 1, total_segments)
text = out['text']
if batch_size in [0, 1, None]:
text = text[0]
if verbose:
print(f"Transcript: [{round(vad_segments[idx]['start'], 3)} --> {round(vad_segments[idx]['end'], 3)}] {text}")
segments.append(
{
"text": text,
@ -273,6 +293,9 @@ class FasterWhisperPipeline(Pipeline):
}
)
if on_progress:
on_progress(self.__class__.TranscriptionState.FINISHED)
# revert the tokenizer if multilingual inference is enabled
if self.preset_language is None:
self.tokenizer = None

View File

@ -11,13 +11,14 @@ from whisperx.types import TranscriptionResult, AlignedTranscriptionResult
class DiarizationPipeline:
def __init__(
self,
model_name="pyannote/speaker-diarization-3.1",
model_name=None,
use_auth_token=None,
device: Optional[Union[str, torch.device]] = "cpu",
):
if isinstance(device, str):
device = torch.device(device)
self.model = Pipeline.from_pretrained(model_name, use_auth_token=use_auth_token).to(device)
model_config = model_name or "pyannote/speaker-diarization-3.1"
self.model = Pipeline.from_pretrained(model_config, use_auth_token=use_auth_token).to(device)
def __call__(
self,

View File

@ -57,6 +57,7 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
diarize: bool = args.pop("diarize")
min_speakers: int = args.pop("min_speakers")
max_speakers: int = args.pop("max_speakers")
diarize_model_name: str = args.pop("diarize_model")
print_progress: bool = args.pop("print_progress")
if args["language"] is not None:
@ -204,8 +205,9 @@ def transcribe_task(args: dict, parser: argparse.ArgumentParser):
)
tmp_results = results
print(">>Performing diarization...")
print(">>Using model:", diarize_model_name)
results = []
diarize_model = DiarizationPipeline(use_auth_token=hf_token, device=device)
diarize_model = DiarizationPipeline(model_name=diarize_model_name, use_auth_token=hf_token, device=device)
for result, input_audio_path in tmp_results:
diarize_segments = diarize_model(
input_audio_path, min_speakers=min_speakers, max_speakers=max_speakers