missing word timestamps

2025-07-01 18:17:27 -04:00 · 2023-01-24 16:37:19 +00:00
parent d1600e5b0f
commit eec6d1f8d8
2 changed files with 26 additions and 27 deletions
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -330,8 +330,8 @@ def align(
    aligned_segments = []

    prev_t2 = 0
-    sdx = 0
    for segment in transcript:
+        aligned_subsegments = []
        while True:
            segment_align_success = False

@@ -505,7 +505,7 @@ def align(
                    else:
                        word_level = None

-                    aligned_segments.append(
+                    aligned_subsegments.append(
                        {
                            "text": segment["seg-text"][sub_seg_idx],
                            "start": seg_start_actual,
@@ -515,10 +515,7 @@ def align(
                        }
                    )
                    if "language" in segment:
-                        aligned_segments[-1]["language"] = segment["language"]
-
-                    print(f"[{format_timestamp(aligned_segments[-1]['start'])} --> {format_timestamp(aligned_segments[-1]['end'])}] {aligned_segments[-1]['text']}")
-
+                        aligned_subsegments[-1]["language"] = segment["language"]

                    char_level = {
                        "start": [],
@@ -555,12 +552,14 @@ def align(
        # reset prev_t2 due to drifting issues
        if not segment_align_success:
            prev_t2 = 0
+        
+        start = interpolate_nans(pd.DataFrame(aligned_subsegments)["start"], method=interpolate_method) 
+        end = interpolate_nans(pd.DataFrame(aligned_subsegments)["end"], method=interpolate_method)
+        for idx, seg in enumerate(aligned_subsegments):
+            seg['start'] = start.iloc[idx]
+            seg['end'] = end.iloc[idx]

-        # shift segment index by amount of sub-segments
-        if "seg-text" in segment:
-            sdx += len(segment["seg-text"])
-        else:
-            sdx += 1
+        aligned_segments += aligned_subsegments

    # create word level segments for .srt
    word_seg = []
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -4,7 +4,7 @@ from typing import Callable, TextIO, Iterator, Tuple
 import pandas as pd

 def exact_div(x, y):
-    ***ert x % y == 0
+    assert x % y == 0
    return x // y


@@ -30,7 +30,7 @@ def compression_ratio(text) -> float:


 def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
-    ***ert seconds >= 0, "non-negative timestamp expected"
+    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
@@ -105,11 +105,11 @@ def write_ass(transcript: Iterator[dict],
            strip=True, **kwargs):
    """
    Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py
-        Generate Advanced SubStation Alpha (***) file from results to
+        Generate Advanced SubStation Alpha (ass) file from results to
    display both phrase-level & word-level timestamp simultaneously by:
     -using segment-level timestamps display phrases as usual
     -using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment
-    Note: *** file is used in the same way as srt, vtt, etc.
+    Note: ass file is used in the same way as srt, vtt, etc.
    Parameters
    ----------
    transcript: dict
@@ -125,14 +125,14 @@ def write_ass(transcript: Iterator[dict],
        whether to underline a word at its corresponding timestamp
    prefmt: str
        used to specify format for word-level timestamps (must be use with 'suffmt' and overrides 'color'&'underline')
-        appears as such in the .*** file:
+        appears as such in the .ass file:
            Hi, {<prefmt>}how{<suffmt>} are you?
-        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm
+        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
    suffmt: str
        used to specify format for word-level timestamps (must be use with 'prefmt' and overrides 'color'&'underline')
-        appears as such in the .*** file:
+        appears as such in the .ass file:
            Hi, {<prefmt>}how{<suffmt>} are you?
-        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm
+        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
    font: str
        word font (default: Arial)
    font_size: int
@@ -165,13 +165,13 @@ def write_ass(transcript: Iterator[dict],

    styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'

-    ***_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
+    ass_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
            f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
            f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'

    if prefmt or suffmt:
        if suffmt:
-            ***ert prefmt, 'prefmt must be used along with suffmt'
+            assert prefmt, 'prefmt must be used along with suffmt'
        else:
            suffmt = r'\r'
    else:
@@ -201,9 +201,9 @@ def write_ass(transcript: Iterator[dict],
    elif resolution == "char":
        resolution_key = "char-segments"
    else:
-        raise ValueError(".*** resolution should be 'word' or 'char', not ", resolution)
+        raise ValueError(".ass resolution should be 'word' or 'char', not ", resolution)
    
-    ***_arr = []
+    ass_arr = []

    for segment in transcript:
        if resolution_key in segment:
@@ -231,7 +231,7 @@ def write_ass(transcript: Iterator[dict],
                            "idx_1": -1
                        }

-                        ***_arr.append(filler_ts)
+                        ass_arr.append(filler_ts)
                    # highlight current word
                    f_word_ts = {
                        "chars": speaker_str + segment['text'],
@@ -240,12 +240,12 @@ def write_ass(transcript: Iterator[dict],
                        "idx_0": idx_0 + len(speaker_str),
                        "idx_1": idx_1 + len(speaker_str)
                    }
-                    ***_arr.append(f_word_ts)
+                    ass_arr.append(f_word_ts)
                    prev = crow['end']

-    ***_str += '\n'.join(map(lambda x: dialogue(**x), ***_arr))
+    ass_str += '\n'.join(map(lambda x: dialogue(**x), ass_arr))

-    file.write(***_str)
+    file.write(ass_str)

 def interpolate_nans(x, method='nearest'):
    if x.notnull().sum() > 1: