From ae7ea9f4b644db7b41a317de04c5e2458539c500 Mon Sep 17 00:00:00 2001 From: tcohenpoliglotnews <160532532+tcohenpoliglotnews@users.noreply.github.com> Date: Mon, 3 Mar 2025 22:51:45 -0500 Subject: [PATCH] fix crashes when loading large audio files Fixes an issue where `np.frombuffer()` would crash on large audio files due to excessive memory usage or misaligned buffer sizes. --- whisperx/audio.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/whisperx/audio.py b/whisperx/audio.py index 42f97b8..b3aa778 100644 --- a/whisperx/audio.py +++ b/whisperx/audio.py @@ -58,11 +58,33 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray: str(sr), "-", ] - out = subprocess.run(cmd, capture_output=True, check=True).stdout - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e - return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + process = subprocess.Popen( + cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + bufsize=10 * 1024 * 1024 + ) + + out = bytearray() + while True: + chunk = process.stdout.read(1024 * 1024) + if not chunk: + break + out.extend(chunk) + + stderr_output = process.stderr.read() + return_code = process.wait() + + if return_code != 0: + raise RuntimeError(f"FFmpeg process failed with error: {stderr_output.decode()}") + + if len(out) % 2 != 0: + raise ValueError("Audio buffer size is not aligned to int16.") + + return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0 + except Exception as e: + raise RuntimeError(f"Error loading audio file {file}: {str(e)}") def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):