Merge pull request #570 from hidenori-endo/main

Drop ffmpeg-python dependency and call ffmpeg directly.
2025-07-01 18:17:27 -04:00 · 2023-11-09 18:39:53 +00:00
parent a2af569838 6703d2774b
commit 52fbe5c26f
2 changed files with 22 additions and 10 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@ torch>=2
 torchaudio>=2
 faster-whisper>=0.8
 transformers
-ffmpeg-python>=0.2
 pandas
 setuptools>=65
 nltk
--- a/whisperx/audio.py
+++ b/whisperx/audio.py
@@ -1,8 +1,8 @@
 import os
+import subprocess
 from functools import lru_cache
 from typing import Optional, Union
-import ffmpeg
 import numpy as np
 import torch
 import torch.nn.functional as F
@@ -40,14 +40,27 @@ def load_audio(file: str, sr: int = SAMPLE_RATE):
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    try:
-        # This launches a subprocess to decode audio while down-mixing and resampling as necessary.
+        # Launches a subprocess to decode audio while down-mixing and resampling as necessary.
-        # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+        # Requires the ffmpeg CLI to be installed.
-        out, _ = (
+        cmd = [
-            ffmpeg.input(file, threads=0)
+            "ffmpeg",
-            .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=sr)
+            "-nostdin",
-            .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True)
+            "-threads",
-        )
+            "0",
-    except ffmpeg.Error as e:
+            "-i",
+            file,
+            "-f",
+            "s16le",
+            "-ac",
+            "1",
+            "-acodec",
+            "pcm_s16le",
+            "-ar",
+            str(sr),
+            "-",
+        ]
+        out = subprocess.run(cmd, capture_output=True, check=True).stdout
+    except subprocess.CalledProcessError as e:
         raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
     return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0