From ae7ea9f4b644db7b41a317de04c5e2458539c500 Mon Sep 17 00:00:00 2001
From: tcohenpoliglotnews
 <160532532+tcohenpoliglotnews@users.noreply.github.com>
Date: Mon, 3 Mar 2025 22:51:45 -0500
Subject: [PATCH] fix crashes when loading large audio files

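Fix an issue where `np.frombuffer()` could crash on large audio files, either from excessive memory usage or because the FFmpeg output length was not a multiple of the 2-byte int16 sample size. FFmpeg's stdout is now read in 1 MiB chunks, and the buffer length is validated before it is converted.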
---
 whisperx/audio.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/whisperx/audio.py b/whisperx/audio.py
index 42f97b8..b3aa778 100644
--- a/whisperx/audio.py
+++ b/whisperx/audio.py
@@ -58,11 +58,33 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
             str(sr),
             "-",
         ]
-        out = subprocess.run(cmd, capture_output=True, check=True).stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
 
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            bufsize=10 * 1024 * 1024
+        )
+
+        out = bytearray()
+        while True:
+            chunk = process.stdout.read(1024 * 1024)
+            if not chunk:
+                break
+            out.extend(chunk)
+
+        stderr_output = process.stderr.read()
+        return_code = process.wait()
+
+        if return_code != 0:
+            raise RuntimeError(f"FFmpeg process failed with error: {stderr_output.decode()}")
+
+        if len(out) % 2 != 0:
+            raise ValueError("Audio buffer size is not aligned to int16.")
+        
+        return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
+    except Exception as e:
+        raise RuntimeError(f"Error loading audio file {file}: {str(e)}")
 
 
 def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):