From eec6d1f8d8bd46cde0eb094707ab0e989bf4896f Mon Sep 17 00:00:00 2001 From: Max Bain Date: Tue, 24 Jan 2023 16:37:19 +0000 Subject: [PATCH] missing word timestamps --- whisperx/transcribe.py | 21 ++++++++++----------- whisperx/utils.py | 32 ++++++++++++++++---------------- 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index f9382dd..5cfc6aa 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -330,8 +330,8 @@ def align( aligned_segments = [] prev_t2 = 0 - sdx = 0 for segment in transcript: + aligned_subsegments = [] while True: segment_align_success = False @@ -505,7 +505,7 @@ def align( else: word_level = None - aligned_segments.append( + aligned_subsegments.append( { "text": segment["seg-text"][sub_seg_idx], "start": seg_start_actual, @@ -515,10 +515,7 @@ def align( } ) if "language" in segment: - aligned_segments[-1]["language"] = segment["language"] - - print(f"[{format_timestamp(aligned_segments[-1]['start'])} --> {format_timestamp(aligned_segments[-1]['end'])}] {aligned_segments[-1]['text']}") - + aligned_subsegments[-1]["language"] = segment["language"] char_level = { "start": [], @@ -555,12 +552,14 @@ def align( # reset prev_t2 due to drifting issues if not segment_align_success: prev_t2 = 0 + + start = interpolate_nans(pd.DataFrame(aligned_subsegments)["start"], method=interpolate_method) + end = interpolate_nans(pd.DataFrame(aligned_subsegments)["end"], method=interpolate_method) + for idx, seg in enumerate(aligned_subsegments): + seg['start'] = start.iloc[idx] + seg['end'] = end.iloc[idx] - # shift segment index by amount of sub-segments - if "seg-text" in segment: - sdx += len(segment["seg-text"]) - else: - sdx += 1 + aligned_segments += aligned_subsegments # create word level segments for .srt word_seg = [] diff --git a/whisperx/utils.py b/whisperx/utils.py index 79aba53..77243d6 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -4,7 +4,7 @@ from typing import Callable, TextIO, Iterator, Tuple import pandas as pd def exact_div(x, y): - ***ert x % y == 0 + assert x % y == 0 return x // y @@ -30,7 +30,7 @@ def compression_ratio(text) -> float: def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): - ***ert seconds >= 0, "non-negative timestamp expected" + assert seconds >= 0, "non-negative timestamp expected" milliseconds = round(seconds * 1000.0) hours = milliseconds // 3_600_000 @@ -105,11 +105,11 @@ def write_ass(transcript: Iterator[dict], strip=True, **kwargs): """ Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py - Generate Advanced SubStation Alpha (***) file from results to + Generate Advanced SubStation Alpha (ass) file from results to display both phrase-level & word-level timestamp simultaneously by: -using segment-level timestamps display phrases as usual -using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment - Note: *** file is used in the same way as srt, vtt, etc. + Note: ass file is used in the same way as srt, vtt, etc. Parameters ---------- transcript: dict @@ -125,14 +125,14 @@ def write_ass(transcript: Iterator[dict], whether to underline a word at its corresponding timestamp prefmt: str used to specify format for word-level timestamps (must be use with 'suffmt' and overrides 'color'&'underline') - appears as such in the .*** file: + appears as such in the .ass file: Hi, {}how{} are you? - reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm + reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm suffmt: str used to specify format for word-level timestamps (must be use with 'prefmt' and overrides 'color'&'underline') - appears as such in the .*** file: + appears as such in the .ass file: Hi, {}how{} are you? - reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm + reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm font: str word font (default: Arial) font_size: int @@ -165,13 +165,13 @@ def write_ass(transcript: Iterator[dict], styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}' - ***_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \ + ass_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \ f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \ f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n' if prefmt or suffmt: if suffmt: - ***ert prefmt, 'prefmt must be used along with suffmt' + assert prefmt, 'prefmt must be used along with suffmt' else: suffmt = r'\r' else: @@ -201,9 +201,9 @@ def write_ass(transcript: Iterator[dict], elif resolution == "char": resolution_key = "char-segments" else: - raise ValueError(".*** resolution should be 'word' or 'char', not ", resolution) + raise ValueError(".ass resolution should be 'word' or 'char', not ", resolution) - ***_arr = [] + ass_arr = [] for segment in transcript: if resolution_key in segment: @@ -231,7 +231,7 @@ def write_ass(transcript: Iterator[dict], "idx_1": -1 } - ***_arr.append(filler_ts) + ass_arr.append(filler_ts) # highlight current word f_word_ts = { "chars": speaker_str + segment['text'], @@ -240,12 +240,12 @@ def write_ass(transcript: Iterator[dict], "idx_0": idx_0 + len(speaker_str), "idx_1": idx_1 + len(speaker_str) } - ***_arr.append(f_word_ts) + ass_arr.append(f_word_ts) prev = crow['end'] - ***_str += '\n'.join(map(lambda x: dialogue(**x), ***_arr)) + ass_str += '\n'.join(map(lambda x: dialogue(**x), ass_arr)) - file.write(***_str) + file.write(ass_str) def interpolate_nans(x, method='nearest'): if x.notnull().sum() > 1: