whisperX/whisperx/vads/vad.py

from typing import Optional

import pandas as pd
from pyannote.core import Annotation, Segment


class Vad:
    def __init__(self, vad_onset):
        if not (0 < vad_onset < 1):
            raise ValueError(
                "vad_onset is a decimal value between 0 and 1."
            )

    @staticmethod
    def preprocess_audio(audio):
        pass

    # keep merge_chunks as static so it can be also used by manually assigned vad_model (see 'load_model')
    @staticmethod
    def merge_chunks(segments,
                     chunk_size,
                     onset: float,
                     offset: Optional[float]):
        """
         Merge operation described in paper
         """
        curr_end = 0
        merged_segments = []
        seg_idxs: list[tuple]= []
        speaker_idxs: list[Optional[str]] = []

        curr_start = segments[0].start
        for seg in segments:
            if seg.end - curr_start > chunk_size and curr_end - curr_start > 0:
                merged_segments.append({
                    "start": curr_start,
                    "end": curr_end,
                    "segments": seg_idxs,
                })
                curr_start = seg.start
                seg_idxs = []
                speaker_idxs = []
            curr_end = seg.end
            seg_idxs.append((seg.start, seg.end))
            speaker_idxs.append(seg.speaker)
        # add final
        merged_segments.append({
            "start": curr_start,
            "end": curr_end,
            "segments": seg_idxs,
        })

        return merged_segments

    # Unused function
    @staticmethod
    def merge_vad(vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_duration_on=0.0):
        active = Annotation()
        for k, vad_t in enumerate(vad_arr):
            region = Segment(vad_t[0] - pad_onset, vad_t[1] + pad_offset)
            active[region, k] = 1

        if pad_offset > 0.0 or pad_onset > 0.0 or min_duration_off > 0.0:
            active = active.support(collar=min_duration_off)

        # remove tracks shorter than min_duration_on
        if min_duration_on > 0:
            for segment, track in list(active.itertracks()):
                if segment.duration < min_duration_on:
                    del active[segment, track]

        active = active.for_json()
        active_segs = pd.DataFrame([x['segment'] for x in active['content']])
        return active_segs