diff --git a/whisperx/asr.py b/whisperx/asr.py index ac16459..e78d77c 100644 --- a/whisperx/asr.py +++ b/whisperx/asr.py @@ -269,6 +269,10 @@ def transcribe( end_timestamp_pos = ( sliced_tokens[-1].item() - tokenizer.timestamp_begin ) + + # clamp end position to at least 1 timestamp step after start; positions are in units of time_precision, so one frame is +1 (not +time_precision) + end_timestamp_pos = max(end_timestamp_pos, start_timestamp_pos + 1) + current_segments.append( new_segment( start=time_offset + start_timestamp_pos * time_precision, @@ -426,4 +430,4 @@ def transcribe_with_vad( output["language"] = output["segments"][0]["language"] - return output \ No newline at end of file + return output