mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
55 lines
1.5 KiB
Python
55 lines
1.5 KiB
Python
from typing import Optional
|
|
|
|
import pandas as pd
|
|
from pyannote.core import Annotation, Segment
|
|
|
|
|
|
class Vad:
    """Base class holding shared voice-activity-detection (VAD) helpers.

    The constructor only validates the onset threshold; ``merge_chunks`` is
    a static helper that groups consecutive speech segments into chunks.
    """

    def __init__(self, vad_onset):
        """Validate the VAD onset threshold.

        NOTE: the value is only checked here, it is not stored on the
        instance.

        Args:
            vad_onset: Onset threshold; must lie strictly between 0 and 1.

        Raises:
            ValueError: If ``vad_onset`` is not strictly between 0 and 1.
        """
        if not (0 < vad_onset < 1):
            raise ValueError(
                "vad_onset is a decimal value between 0 and 1."
            )

    @staticmethod
    def preprocess_audio(audio):
        """Placeholder hook: does nothing with ``audio`` and returns ``None``."""
        pass

    # keep merge_chunks as static so it can be also used by manually assigned vad_model (see 'load_model')
    @staticmethod
    def merge_chunks(segments,
                     chunk_size,
                     onset: float,
                     offset: Optional[float]):
        """Merge consecutive segments into chunks (merge operation described in paper).

        Segments are walked in the given order and accumulated into the
        current chunk. The current chunk is closed as soon as including the
        next segment would stretch it beyond ``chunk_size`` (measured from
        the chunk's start to that segment's end), provided the chunk already
        covers a positive duration. A single segment longer than
        ``chunk_size`` is therefore kept whole rather than split.

        Args:
            segments: Indexable sequence of objects exposing ``start`` and
                ``end`` attributes.
            chunk_size: Maximum chunk span, in the same units as the
                segments' ``start``/``end`` values.
            onset: Not used by this implementation; kept for interface
                compatibility.
            offset: Not used by this implementation; kept for interface
                compatibility.

        Returns:
            A list of dicts with keys ``"start"``, ``"end"`` and
            ``"segments"`` (the ``(start, end)`` tuples of the segments
            merged into that chunk). Empty when ``segments`` is empty.
        """
        # Nothing to merge: avoid indexing into an empty sequence below.
        if len(segments) == 0:
            return []

        curr_end = 0
        merged_segments = []
        seg_idxs: list[tuple] = []

        curr_start = segments[0].start
        for seg in segments:
            # Close the running chunk only if it is non-empty (positive
            # duration); otherwise an over-long first segment would emit a
            # zero-length chunk.
            if seg.end - curr_start > chunk_size and curr_end - curr_start > 0:
                merged_segments.append({
                    "start": curr_start,
                    "end": curr_end,
                    "segments": seg_idxs,
                })
                curr_start = seg.start
                seg_idxs = []
            curr_end = seg.end
            seg_idxs.append((seg.start, seg.end))

        # add final
        merged_segments.append({
            "start": curr_start,
            "end": curr_end,
            "segments": seg_idxs,
        })

        return merged_segments
|
|
|