From eec6d1f8d8bd46cde0eb094707ab0e989bf4896f Mon Sep 17 00:00:00 2001
From: Max Bain <maxbain@robots.ox.ac.uk>
Date: Tue, 24 Jan 2023 16:37:19 +0000
Subject: [PATCH] missing word timestamps

---
 whisperx/transcribe.py | 21 ++++++++++-----------
 whisperx/utils.py      | 32 ++++++++++++++++----------------
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index f9382dd..5cfc6aa 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -330,8 +330,8 @@ def align(
     aligned_segments = []
 
     prev_t2 = 0
-    sdx = 0
     for segment in transcript:
+        aligned_subsegments = []
         while True:
             segment_align_success = False
 
@@ -505,7 +505,7 @@ def align(
                     else:
                         word_level = None
 
-                    aligned_segments.append(
+                    aligned_subsegments.append(
                         {
                             "text": segment["seg-text"][sub_seg_idx],
                             "start": seg_start_actual,
@@ -515,10 +515,7 @@ def align(
                         }
                     )
                     if "language" in segment:
-                        aligned_segments[-1]["language"] = segment["language"]
-
-                    print(f"[{format_timestamp(aligned_segments[-1]['start'])} --> {format_timestamp(aligned_segments[-1]['end'])}] {aligned_segments[-1]['text']}")
-
+                        aligned_subsegments[-1]["language"] = segment["language"]
 
                     char_level = {
                         "start": [],
@@ -555,12 +552,14 @@ def align(
         # reset prev_t2 due to drifting issues
         if not segment_align_success:
             prev_t2 = 0
+        
+        start = interpolate_nans(pd.DataFrame(aligned_subsegments)["start"], method=interpolate_method) 
+        end = interpolate_nans(pd.DataFrame(aligned_subsegments)["end"], method=interpolate_method)
+        for idx, seg in enumerate(aligned_subsegments):
+            seg['start'] = start.iloc[idx]
+            seg['end'] = end.iloc[idx]
 
-        # shift segment index by amount of sub-segments
-        if "seg-text" in segment:
-            sdx += len(segment["seg-text"])
-        else:
-            sdx += 1
+        aligned_segments += aligned_subsegments
 
     # create word level segments for .srt
     word_seg = []
diff --git a/whisperx/utils.py b/whisperx/utils.py
index 79aba53..77243d6 100644
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -4,7 +4,7 @@ from typing import Callable, TextIO, Iterator, Tuple
 import pandas as pd
 
 def exact_div(x, y):
-    ***ert x % y == 0
+    assert x % y == 0
     return x // y
 
 
@@ -30,7 +30,7 @@ def compression_ratio(text) -> float:
 
 
 def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
-    ***ert seconds >= 0, "non-negative timestamp expected"
+    assert seconds >= 0, "non-negative timestamp expected"
     milliseconds = round(seconds * 1000.0)
 
     hours = milliseconds // 3_600_000
@@ -105,11 +105,11 @@ def write_ass(transcript: Iterator[dict],
             strip=True, **kwargs):
     """
     Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py
-        Generate Advanced SubStation Alpha (***) file from results to
+        Generate Advanced SubStation Alpha (ass) file from results to
     display both phrase-level & word-level timestamp simultaneously by:
      -using segment-level timestamps display phrases as usual
      -using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment
-    Note: *** file is used in the same way as srt, vtt, etc.
+    Note: ass file is used in the same way as srt, vtt, etc.
     Parameters
     ----------
     transcript: dict
@@ -125,14 +125,14 @@ def write_ass(transcript: Iterator[dict],
         whether to underline a word at its corresponding timestamp
     prefmt: str
         used to specify format for word-level timestamps (must be use with 'suffmt' and overrides 'color'&'underline')
-        appears as such in the .*** file:
+        appears as such in the .ass file:
             Hi, {<prefmt>}how{<suffmt>} are you?
-        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm
+        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
     suffmt: str
         used to specify format for word-level timestamps (must be use with 'prefmt' and overrides 'color'&'underline')
-        appears as such in the .*** file:
+        appears as such in the .ass file:
             Hi, {<prefmt>}how{<suffmt>} are you?
-        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm
+        reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
     font: str
         word font (default: Arial)
     font_size: int
@@ -165,13 +165,13 @@ def write_ass(transcript: Iterator[dict],
 
     styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'
 
-    ***_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
+    ass_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
             f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
             f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'
 
     if prefmt or suffmt:
         if suffmt:
-            ***ert prefmt, 'prefmt must be used along with suffmt'
+            assert prefmt, 'prefmt must be used along with suffmt'
         else:
             suffmt = r'\r'
     else:
@@ -201,9 +201,9 @@ def write_ass(transcript: Iterator[dict],
     elif resolution == "char":
         resolution_key = "char-segments"
     else:
-        raise ValueError(".*** resolution should be 'word' or 'char', not ", resolution)
+        raise ValueError(".ass resolution should be 'word' or 'char', not ", resolution)
     
-    ***_arr = []
+    ass_arr = []
 
     for segment in transcript:
         if resolution_key in segment:
@@ -231,7 +231,7 @@ def write_ass(transcript: Iterator[dict],
                             "idx_1": -1
                         }
 
-                        ***_arr.append(filler_ts)
+                        ass_arr.append(filler_ts)
                     # highlight current word
                     f_word_ts = {
                         "chars": speaker_str + segment['text'],
@@ -240,12 +240,12 @@ def write_ass(transcript: Iterator[dict],
                         "idx_0": idx_0 + len(speaker_str),
                         "idx_1": idx_1 + len(speaker_str)
                     }
-                    ***_arr.append(f_word_ts)
+                    ass_arr.append(f_word_ts)
                     prev = crow['end']
 
-    ***_str += '\n'.join(map(lambda x: dialogue(**x), ***_arr))
+    ass_str += '\n'.join(map(lambda x: dialogue(**x), ass_arr))
 
-    file.write(***_str)
+    file.write(ass_str)
 
 def interpolate_nans(x, method='nearest'):
     if x.notnull().sum() > 1: