Mirror of https://github.com/m-bain/whisperX.git, synced 2025-07-01 10:07:28 -04:00

Compare commits: a326c15268 ... ac1a189e74 (3 commits)

Commits in this range:
- ac1a189e74
- d700b56c9c
- ae7ea9f4b6
@@ -189,7 +189,7 @@ result = model.transcribe(audio, batch_size=batch_size)
 print(result["segments"]) # before alignment

 # delete model if low on GPU resources
-# import gc; gc.collect(); torch.cuda.empty_cache(); del model
+# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model

 # 2. Align whisper output
 model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
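The point of this change is only to add `import torch` to the cleanup hint, so the one-liner also runs in a session where torch has not been imported yet. Spelled out, the hint from this hunk amounts to roughly the following sketch, where `model` is the transcription model from the surrounding README example:

    import gc
    import torch

    gc.collect()               # reclaim unreferenced Python objects
    torch.cuda.empty_cache()   # release cached CUDA memory back to the driver
    del model                  # drop the transcription model loaded earlier in the example

As the README comment says, this is only worth doing when GPU memory is tight.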
@@ -198,7 +198,7 @@ result = whisperx.align(result["segments"], model_a, metadata, audio, device, re
 print(result["segments"]) # after alignment

 # delete model if low on GPU resources
-# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a
+# import gc; import torch; gc.collect(); torch.cuda.empty_cache(); del model_a

 # 3. Assign speaker labels
 diarize_model = whisperx.diarize.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)
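For context, the README's speaker-assignment step continues past the end of this hunk roughly as follows; this is not part of the diff and is sketched from the upstream example, so check the README for the exact calls:

    diarize_segments = diarize_model(audio)
    result = whisperx.assign_word_speakers(diarize_segments, result)
    print(result["segments"])  # segments now carry speaker labels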
@@ -58,11 +58,33 @@ def load_audio(file: str, sr: int = SAMPLE_RATE) -> np.ndarray:
             str(sr),
             "-",
         ]
-        out = subprocess.run(cmd, capture_output=True, check=True).stdout
-    except subprocess.CalledProcessError as e:
-        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
-
-    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            bufsize=10 * 1024 * 1024
+        )
+
+        out = bytearray()
+        while True:
+            chunk = process.stdout.read(1024 * 1024)
+            if not chunk:
+                break
+            out.extend(chunk)
+
+        stderr_output = process.stderr.read()
+        return_code = process.wait()
+
+        if return_code != 0:
+            raise RuntimeError(f"FFmpeg process failed with error: {stderr_output.decode()}")
+
+        if len(out) % 2 != 0:
+            raise ValueError("Audio buffer size is not aligned to int16.")
+
+        return np.frombuffer(out, np.int16).astype(np.float32) / 32768.0
+    except Exception as e:
+        raise RuntimeError(f"Error loading audio file {file}: {str(e)}")


 def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
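The rewritten load_audio streams ffmpeg's stdout in 1 MiB chunks instead of collecting it with a single subprocess.run call, checks the ffmpeg return code and that the buffer length is a multiple of the int16 sample size, and still returns a normalized float32 mono array. From the caller's side nothing changes; a minimal usage sketch, with a placeholder file path:

    import whisperx

    audio = whisperx.load_audio("example.wav")  # "example.wav" is a placeholder; decoded to 16 kHz mono by default
    print(audio.dtype, audio.shape)             # float32, one-dimensional sample array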