feat: Add merge_chunks chunk_size as an argument.

Suggested in https://github.com/m-bain/whisperX/issues/200#issuecomment-1666507780
Author: 陳鈞
Date:   2023-08-29 23:09:02 +08:00
parent  ef965a03ed
commit  eb771cf56d

2 changed files with 6 additions and 3 deletions


@@ -247,7 +247,7 @@ class FasterWhisperPipeline(Pipeline):
         return final_iterator

     def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None
+        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0, language=None, task=None, chunk_size=30
     ) -> TranscriptionResult:
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -260,7 +260,7 @@ class FasterWhisperPipeline(Pipeline):
                 yield {'inputs': audio[f1:f2]}

         vad_segments = self.vad_model({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": SAMPLE_RATE})
-        vad_segments = merge_chunks(vad_segments, 30)
+        vad_segments = merge_chunks(vad_segments, chunk_size)
         if self.tokenizer is None:
             language = language or self.detect_language(audio)
             task = task or "transcribe"
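For context, a minimal usage sketch of the new parameter from the Python API (model name, audio path, batch size, and chunk value are illustrative; it assumes the whisperx package API at this commit, where transcribe() forwards chunk_size to merge_chunks):

    import whisperx

    device = "cuda"                                    # or "cpu"
    audio = whisperx.load_audio("audio.mp3")           # illustrative input file

    model = whisperx.load_model("large-v2", device, compute_type="float16")

    # chunk_size is the maximum merged VAD segment length in seconds;
    # the default of 30 matches the previously hard-coded value.
    result = model.transcribe(audio, batch_size=16, chunk_size=20)
    print(result["segments"])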


@@ -41,6 +41,7 @@ def cli():
     # vad params
     parser.add_argument("--vad_onset", type=float, default=0.500, help="Onset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected")
     parser.add_argument("--vad_offset", type=float, default=0.363, help="Offset threshold for VAD (see pyannote.audio), reduce this if speech is not being detected.")
+    parser.add_argument("--chunk_size", type=int, default=30, help="Chunk size for merging VAD segments. Default is 30, reduce this if the chunk is too long.")

     # diarization params
     parser.add_argument("--diarize", action="store_true", help="Apply diarization to assign speaker labels to each segment/word")
@@ -101,6 +102,8 @@ def cli():
     vad_onset: float = args.pop("vad_onset")
     vad_offset: float = args.pop("vad_offset")

+    chunk_size: int = args.pop("chunk_size")
+
     diarize: bool = args.pop("diarize")
     min_speakers: int = args.pop("min_speakers")
     max_speakers: int = args.pop("max_speakers")
@@ -156,7 +159,7 @@ def cli():
         audio = load_audio(audio_path)
         # >> VAD & ASR
         print(">>Performing transcription...")
-        result = model.transcribe(audio, batch_size=batch_size)
+        result = model.transcribe(audio, batch_size=batch_size, chunk_size=chunk_size)
         results.append((result, audio_path))

         # Unload Whisper and VAD
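On the CLI side, the value is popped from the parsed arguments and forwarded straight to model.transcribe(). An illustrative invocation (audio file and model choice are placeholders):

    whisperx audio.mp3 --model large-v2 --chunk_size 20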