Merge ae7ea9f4b6 into d700b56c9c

docs: add missing torch import to Python usage example in README
fix crashes when loading large audio files
2025-07-01 18:17:27 -04:00 · 2025-06-17 19:02:36 +02:00 · 2025-06-08 03:34:49 -06:00 · 2025-03-03 22:51:45 -05:00
2 changed files with 28 additions and 6 deletions
--- a/README.md
+++ b/README.md
@@ -189,7 +189,7 @@ result = model.transcribe(audio, batch_size=batch_size)
 print(result["segments"]) # before alignment

 # delete model if low on GPU resources
-# import gc; gc.collect(); torch.cuda.empty_cache(); del model
+# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model

 # 2. Align whisper output
 model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
@@ -198,7 +198,7 @@ result = whisperx.align(result["segments"], model_a, metadata, audio, device, re
 print(result["segments"]) # after alignment

 # delete model if low on GPU resources
-# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
+# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model_a

 # 3. Assign speaker labels
 diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
--- a/whisperx/audio.py
+++ b/whisperx/audio.py
@@ -58,11 +58,33 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
            str(sr),
            "-",
        ]
-        out = subprocess.run(cmd, capture_output=True, check=True).stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            bufsize=10 * 1024 * 1024
+        )
+
+        out = bytearray()
+        while True:
+            chunk = process.stdout.read(1024 * 1024)
+            if not chunk:
+                break
+            out.extend(chunk)
+
+        stderr_output = process.stderr.read()
+        return_code = process.wait()
+
+        if return_code != 0:
+            raise RuntimeError(f"FFmpeg process failed with error: {stderr_output.decode()}")
+
+        if len(out) % 2 != 0:
+            raise ValueError("Audio buffer size is not aligned to int16.")
+        
+        return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
+    except Exception as e:
+        raise RuntimeError(f"Error loading audio file {file}: {str(e)}")


 def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
Author	SHA1	Message	Date
tcohenpoliglotnews	ac1a189e74	Merge `ae7ea9f4b6` into `d700b56c9c`	2025-06-17 19:02:36 +02:00
Kirill	d700b56c9c	docs: add missing torch import to Python usage example in README	2025-06-08 03:34:49 -06:00
tcohenpoliglotnews	ae7ea9f4b6	fix crashes when loading large audio files — Fixes issue where `np.frombuffer()` would crash on large audio files due to excessive memory usage or misaligned buffer sizes.	2025-03-03 22:51:45 -05:00