Accept alternative VAD methods. Extend to use Silero VAD.

2025-07-01 18:17:27 -04:00 · 2024-09-26 10:28:52 +02:00
parent 10b05fc43f
commit 79eb8fa53d
8 changed files with 262 additions and 101 deletions
--- a/whisperx/vads/silero.py
+++ b/whisperx/vads/silero.py
@ -0,0 +1,62 @@
+from io import IOBase
+from pathlib import Path
+from typing import Mapping, Text
+from typing import Optional
+from typing import Union
+
+import torch
+
+from whisperx.diarize import Segment as SegmentX
+from whisperx.vads.vad import Vad
+
+AudioFile = Union[Text, Path, IOBase, Mapping]  # audio input alias; Silero.__call__ indexes it as a mapping
+
+
+class Silero(Vad):
+    # check again default values
+    def __init__(self, **kwargs):
+        print(">>Performing voice activity detection using Silero...")
+        super().__init__(kwargs['vad_onset'])
+
+        self.vad_onset = kwargs['vad_onset']
+        self.chunk_size = kwargs['chunk_size']
+        self.vad_pipeline, vad_utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                                                      model='silero_vad',
+                                                      force_reload=False,
+                                                      onnx=False,
+                                                      trust_repo=True)
+        (self.get_speech_timestamps, _, self.read_audio, _, _) = vad_utils
+
+    def __call__(self, audio: AudioFile, **kwargs):
+        """use silero to get segments of speech"""
+        # Only accept 16000 Hz for now.
+        # Note: Silero models support both 8000 and 16000 Hz. Although other values are not directly supported,
+        # multiples of 16000 (e.g. 32000 or 48000) are cast to 16000 inside of the JIT model!
+        sample_rate = audio["sample_rate"]
+        if sample_rate != 16000:
+            raise ValueError("Only 16000Hz sample rate is allowed")
+
+        timestamps = self.get_speech_timestamps(audio["waveform"],
+                                                model=self.vad_pipeline,
+                                                sampling_rate=sample_rate,
+                                                max_speech_duration_s=self.chunk_size,
+                                                threshold=self.vad_onset
+                                                # min_silence_duration_ms = self.min_duration_off/1000
+                                                # min_speech_duration_ms = self.min_duration_on/1000
+                                                # ...
+                                                # See silero documentation for full option list
+                                                )
+        return [SegmentX(i['start'] / sample_rate, i['end'] / sample_rate, "UNKNOWN") for i in timestamps]
+
+    @staticmethod
+    def preprocess_audio(audio):
+        return audio
+
+    @staticmethod
+    def merge_chunks(segments,
+                     chunk_size,
+                     onset: float = 0.5,
+                     offset: Optional[float] = None,
+                     ):
+        assert chunk_size > 0
+        return Vad.merge_chunks(segments, chunk_size, onset, offset)