update setup.py to install pyannote.audio==3.1.1, update diarize.py to include num_speakers; to fix Issue #592

2025-07-01 18:17:27 -04:00 · 2023-12-26 13:01:49 +01:00
parent 8540ff5985
commit 8ae6416594
15 changed files with 2366 additions and 3 deletions
--- a/build/lib/whisperx/vad.py
+++ b/build/lib/whisperx/vad.py
@ -0,0 +1,311 @@
+import hashlib
+import os
+import urllib
+from typing import Callable, Optional, Text, Union
+
+import numpy as np
+import pandas as pd
+import torch
+from pyannote.audio import Model
+from pyannote.audio.core.io import AudioFile
+from pyannote.audio.pipelines import VoiceActivityDetection
+from pyannote.audio.pipelines.utils import PipelineModel
+from pyannote.core import Annotation, Segment, SlidingWindowFeature
+from tqdm import tqdm
+
+from .diarize import Segment as SegmentX
+
+VAD_SEGMENTATION_URL = "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin"
+
+def load_vad_model(device, vad_onset=0.500, vad_offset=0.363, use_auth_token=None, model_fp=None):
+    model_dir = torch.hub._get_torch_home()
+    os.makedirs(model_dir, exist_ok = True)
+    if model_fp is None:
+        model_fp = os.path.join(model_dir, "whisperx-vad-segmentation.bin")
+    if os.path.exists(model_fp) and not os.path.isfile(model_fp):
+        raise RuntimeError(f"{model_fp} exists and is not a regular file")
+
+    if not os.path.isfile(model_fp):
+        with urllib.request.urlopen(VAD_SEGMENTATION_URL) as source, open(model_fp, "wb") as output:
+            with tqdm(
+                total=int(source.info().get("Content-Length")),
+                ncols=80,
+                unit="iB",
+                unit_scale=True,
+                unit_divisor=1024,
+            ) as loop:
+                while True:
+                    buffer = source.read(8192)
+                    if not buffer:
+                        break
+
+                    output.write(buffer)
+                    loop.update(len(buffer))
+
+    model_bytes = open(model_fp, "rb").read()
+    if hashlib.sha256(model_bytes).hexdigest() != VAD_SEGMENTATION_URL.split('/')[-2]:
+        raise RuntimeError(
+            "Model has been downloaded but the SHA256 checksum does not not match. Please retry loading the model."
+        )
+
+    vad_model = Model.from_pretrained(model_fp, use_auth_token=use_auth_token)
+    hyperparameters = {"onset": vad_onset, 
+                    "offset": vad_offset,
+                    "min_duration_on": 0.1,
+                    "min_duration_off": 0.1}
+    vad_pipeline = VoiceActivitySegmentation(segmentation=vad_model, device=torch.device(device))
+    vad_pipeline.instantiate(hyperparameters)
+
+    return vad_pipeline
+
+class Binarize:
+    """Binarize detection scores using hysteresis thresholding, with min-cut operation
+    to ensure not segments are longer than max_duration.
+
+    Parameters
+    ----------
+    onset : float, optional
+        Onset threshold. Defaults to 0.5.
+    offset : float, optional
+        Offset threshold. Defaults to `onset`.
+    min_duration_on : float, optional
+        Remove active regions shorter than that many seconds. Defaults to 0s.
+    min_duration_off : float, optional
+        Fill inactive regions shorter than that many seconds. Defaults to 0s.
+    pad_onset : float, optional
+        Extend active regions by moving their start time by that many seconds.
+        Defaults to 0s.
+    pad_offset : float, optional
+        Extend active regions by moving their end time by that many seconds.
+        Defaults to 0s.
+    max_duration: float
+        The maximum length of an active segment, divides segment at timestamp with lowest score.
+    Reference
+    ---------
+    Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
+    RNN-based Voice Activity Detection", InterSpeech 2015.
+
+    Modified by Max Bain to include WhisperX's min-cut operation 
+    https://arxiv.org/abs/2303.00747
+    
+    Pyannote-audio
+    """
+
+    def __init__(
+        self,
+        onset: float = 0.5,
+        offset: Optional[float] = None,
+        min_duration_on: float = 0.0,
+        min_duration_off: float = 0.0,
+        pad_onset: float = 0.0,
+        pad_offset: float = 0.0,
+        max_duration: float = float('inf')
+    ):
+
+        super().__init__()
+
+        self.onset = onset
+        self.offset = offset or onset
+
+        self.pad_onset = pad_onset
+        self.pad_offset = pad_offset
+
+        self.min_duration_on = min_duration_on
+        self.min_duration_off = min_duration_off
+
+        self.max_duration = max_duration
+
+    def __call__(self, scores: SlidingWindowFeature) -> Annotation:
+        """Binarize detection scores
+        Parameters
+        ----------
+        scores : SlidingWindowFeature
+            Detection scores.
+        Returns
+        -------
+        active : Annotation
+            Binarized scores.
+        """
+
+        num_frames, num_classes = scores.data.shape
+        frames = scores.sliding_window
+        timestamps = [frames[i].middle for i in range(num_frames)]
+
+        # annotation meant to store 'active' regions
+        active = Annotation()
+        for k, k_scores in enumerate(scores.data.T):
+
+            label = k if scores.labels is None else scores.labels[k]
+
+            # initial state
+            start = timestamps[0]
+            is_active = k_scores[0] > self.onset
+            curr_scores = [k_scores[0]]
+            curr_timestamps = [start]
+            t = start
+            for t, y in zip(timestamps[1:], k_scores[1:]):
+                # currently active
+                if is_active: 
+                    curr_duration = t - start
+                    if curr_duration > self.max_duration:
+                        search_after = len(curr_scores) // 2
+                        # divide segment
+                        min_score_div_idx = search_after + np.argmin(curr_scores[search_after:])
+                        min_score_t = curr_timestamps[min_score_div_idx]
+                        region = Segment(start - self.pad_onset, min_score_t + self.pad_offset)
+                        active[region, k] = label
+                        start = curr_timestamps[min_score_div_idx]
+                        curr_scores = curr_scores[min_score_div_idx+1:]
+                        curr_timestamps = curr_timestamps[min_score_div_idx+1:]
+                    # switching from active to inactive
+                    elif y < self.offset:
+                        region = Segment(start - self.pad_onset, t + self.pad_offset)
+                        active[region, k] = label
+                        start = t
+                        is_active = False
+                        curr_scores = []
+                        curr_timestamps = []
+                    curr_scores.append(y)
+                    curr_timestamps.append(t)
+                # currently inactive
+                else:
+                    # switching from inactive to active
+                    if y > self.onset:
+                        start = t
+                        is_active = True
+
+            # if active at the end, add final region
+            if is_active:
+                region = Segment(start - self.pad_onset, t + self.pad_offset)
+                active[region, k] = label
+
+        # because of padding, some active regions might be overlapping: merge them.
+        # also: fill same speaker gaps shorter than min_duration_off
+        if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
+            if self.max_duration < float("inf"):
+                raise NotImplementedError(f"This would break current max_duration param")
+            active = active.support(collar=self.min_duration_off)
+
+        # remove tracks shorter than min_duration_on
+        if self.min_duration_on > 0:
+            for segment, track in list(active.itertracks()):
+                if segment.duration < self.min_duration_on:
+                    del active[segment, track]
+
+        return active
+
+
+class VoiceActivitySegmentation(VoiceActivityDetection):
+    def __init__(
+        self,
+        segmentation: PipelineModel = "pyannote/segmentation",
+        fscore: bool = False,
+        use_auth_token: Union[Text, None] = None,
+        **inference_kwargs,
+    ):
+
+        super().__init__(segmentation=segmentation, fscore=fscore, use_auth_token=use_auth_token, **inference_kwargs)
+
+    def apply(self, file: AudioFile, hook: Optional[Callable] = None) -> Annotation:
+        """Apply voice activity detection
+
+        Parameters
+        ----------
+        file : AudioFile
+            Processed file.
+        hook : callable, optional
+            Hook called after each major step of the pipeline with the following
+            signature: hook("step_name", step_artefact, file=file)
+
+        Returns
+        -------
+        speech : Annotation
+            Speech regions.
+        """
+
+        # setup hook (e.g. for debugging purposes)
+        hook = self.setup_hook(file, hook=hook)
+
+        # apply segmentation model (only if needed)
+        # output shape is (num_chunks, num_frames, 1)
+        if self.training:
+            if self.CACHED_SEGMENTATION in file:
+                segmentations = file[self.CACHED_SEGMENTATION]
+            else:
+                segmentations = self._segmentation(file)
+                file[self.CACHED_SEGMENTATION] = segmentations
+        else:
+            segmentations: SlidingWindowFeature = self._segmentation(file)
+
+        return segmentations
+
+
+def merge_vad(vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_duration_on=0.0):
+
+    active = Annotation()
+    for k, vad_t in enumerate(vad_arr):
+        region = Segment(vad_t[0] - pad_onset, vad_t[1] + pad_offset)
+        active[region, k] = 1
+
+
+    if pad_offset > 0.0 or pad_onset > 0.0 or min_duration_off > 0.0:
+        active = active.support(collar=min_duration_off)
+    
+    # remove tracks shorter than min_duration_on
+    if min_duration_on > 0:
+        for segment, track in list(active.itertracks()):
+            if segment.duration < min_duration_on:
+                    del active[segment, track]
+    
+    active = active.for_json()
+    active_segs = pd.DataFrame([x['segment'] for x in active['content']])
+    return active_segs
+
+def merge_chunks(
+    segments,
+    chunk_size,
+    onset: float = 0.5,
+    offset: Optional[float] = None,
+):
+    """
+    Merge operation described in paper
+    """
+    curr_end = 0
+    merged_segments = []
+    seg_idxs = []
+    speaker_idxs = []
+
+    assert chunk_size > 0
+    binarize = Binarize(max_duration=chunk_size, onset=onset, offset=offset)
+    segments = binarize(segments)
+    segments_list = []
+    for speech_turn in segments.get_timeline():
+        segments_list.append(SegmentX(speech_turn.start, speech_turn.end, "UNKNOWN"))
+
+    if len(segments_list) == 0:
+        print("No active speech found in audio")
+        return []
+    # assert segments_list, "segments_list is empty."
+    # Make sur the starting point is the start of the segment.
+    curr_start = segments_list[0].start
+
+    for seg in segments_list:
+        if seg.end - curr_start > chunk_size and curr_end-curr_start > 0:
+            merged_segments.append({
+                "start": curr_start,
+                "end": curr_end,
+                "segments": seg_idxs,
+            })
+            curr_start = seg.start
+            seg_idxs = []
+            speaker_idxs = []
+        curr_end = seg.end
+        seg_idxs.append((seg.start, seg.end))
+        speaker_idxs.append(seg.speaker)
+    # add final
+    merged_segments.append({ 
+                "start": curr_start,
+                "end": curr_end,
+                "segments": seg_idxs,
+            })    
+    return merged_segments