missing word timestamps

This commit is contained in:
Max Bain
2023-01-24 16:37:19 +00:00
parent d1600e5b0f
commit eec6d1f8d8
2 changed files with 26 additions and 27 deletions

View File

@ -330,8 +330,8 @@ def align(
aligned_segments = [] aligned_segments = []
prev_t2 = 0 prev_t2 = 0
sdx = 0
for segment in transcript: for segment in transcript:
aligned_subsegments = []
while True: while True:
segment_align_success = False segment_align_success = False
@ -505,7 +505,7 @@ def align(
else: else:
word_level = None word_level = None
aligned_segments.append( aligned_subsegments.append(
{ {
"text": segment["seg-text"][sub_seg_idx], "text": segment["seg-text"][sub_seg_idx],
"start": seg_start_actual, "start": seg_start_actual,
@ -515,10 +515,7 @@ def align(
} }
) )
if "language" in segment: if "language" in segment:
aligned_segments[-1]["language"] = segment["language"] aligned_subsegments[-1]["language"] = segment["language"]
print(f"[{format_timestamp(aligned_segments[-1]['start'])} --> {format_timestamp(aligned_segments[-1]['end'])}] {aligned_segments[-1]['text']}")
char_level = { char_level = {
"start": [], "start": [],
@ -556,11 +553,13 @@ def align(
if not segment_align_success: if not segment_align_success:
prev_t2 = 0 prev_t2 = 0
# shift segment index by amount of sub-segments start = interpolate_nans(pd.DataFrame(aligned_subsegments)["start"], method=interpolate_method)
if "seg-text" in segment: end = interpolate_nans(pd.DataFrame(aligned_subsegments)["end"], method=interpolate_method)
sdx += len(segment["seg-text"]) for idx, seg in enumerate(aligned_subsegments):
else: seg['start'] = start.iloc[idx]
sdx += 1 seg['end'] = end.iloc[idx]
aligned_segments += aligned_subsegments
# create word level segments for .srt # create word level segments for .srt
word_seg = [] word_seg = []

View File

@ -4,7 +4,7 @@ from typing import Callable, TextIO, Iterator, Tuple
import pandas as pd import pandas as pd
def exact_div(x, y): def exact_div(x, y):
***ert x % y == 0 assert x % y == 0
return x // y return x // y
@ -30,7 +30,7 @@ def compression_ratio(text) -> float:
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'): def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
***ert seconds >= 0, "non-negative timestamp expected" assert seconds >= 0, "non-negative timestamp expected"
milliseconds = round(seconds * 1000.0) milliseconds = round(seconds * 1000.0)
hours = milliseconds // 3_600_000 hours = milliseconds // 3_600_000
@ -105,11 +105,11 @@ def write_ass(transcript: Iterator[dict],
strip=True, **kwargs): strip=True, **kwargs):
""" """
Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py
Generate Advanced SubStation Alpha (***) file from results to Generate Advanced SubStation Alpha (ass) file from results to
display both phrase-level & word-level timestamp simultaneously by: display both phrase-level & word-level timestamp simultaneously by:
-using segment-level timestamps display phrases as usual -using segment-level timestamps display phrases as usual
-using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment -using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment
Note: *** file is used in the same way as srt, vtt, etc. Note: ass file is used in the same way as srt, vtt, etc.
Parameters Parameters
---------- ----------
transcript: dict transcript: dict
@ -125,14 +125,14 @@ def write_ass(transcript: Iterator[dict],
whether to underline a word at its corresponding timestamp whether to underline a word at its corresponding timestamp
prefmt: str prefmt: str
used to specify format for word-level timestamps (must be used with 'suffmt' and overrides 'color'&'underline') used to specify format for word-level timestamps (must be used with 'suffmt' and overrides 'color'&'underline')
appears as such in the .*** file: appears as such in the .ass file:
Hi, {<prefmt>}how{<suffmt>} are you? Hi, {<prefmt>}how{<suffmt>} are you?
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
suffmt: str suffmt: str
used to specify format for word-level timestamps (must be used with 'prefmt' and overrides 'color'&'underline') used to specify format for word-level timestamps (must be used with 'prefmt' and overrides 'color'&'underline')
appears as such in the .*** file: appears as such in the .ass file:
Hi, {<prefmt>}how{<suffmt>} are you? Hi, {<prefmt>}how{<suffmt>} are you?
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
font: str font: str
word font (default: Arial) word font (default: Arial)
font_size: int font_size: int
@ -165,13 +165,13 @@ def write_ass(transcript: Iterator[dict],
styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}' styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'
***_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \ ass_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \ f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n' f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'
if prefmt or suffmt: if prefmt or suffmt:
if suffmt: if suffmt:
***ert prefmt, 'prefmt must be used along with suffmt' assert prefmt, 'prefmt must be used along with suffmt'
else: else:
suffmt = r'\r' suffmt = r'\r'
else: else:
@ -201,9 +201,9 @@ def write_ass(transcript: Iterator[dict],
elif resolution == "char": elif resolution == "char":
resolution_key = "char-segments" resolution_key = "char-segments"
else: else:
raise ValueError(".*** resolution should be 'word' or 'char', not ", resolution) raise ValueError(".ass resolution should be 'word' or 'char', not ", resolution)
***_arr = [] ass_arr = []
for segment in transcript: for segment in transcript:
if resolution_key in segment: if resolution_key in segment:
@ -231,7 +231,7 @@ def write_ass(transcript: Iterator[dict],
"idx_1": -1 "idx_1": -1
} }
***_arr.append(filler_ts) ass_arr.append(filler_ts)
# highlight current word # highlight current word
f_word_ts = { f_word_ts = {
"chars": speaker_str + segment['text'], "chars": speaker_str + segment['text'],
@ -240,12 +240,12 @@ def write_ass(transcript: Iterator[dict],
"idx_0": idx_0 + len(speaker_str), "idx_0": idx_0 + len(speaker_str),
"idx_1": idx_1 + len(speaker_str) "idx_1": idx_1 + len(speaker_str)
} }
***_arr.append(f_word_ts) ass_arr.append(f_word_ts)
prev = crow['end'] prev = crow['end']
***_str += '\n'.join(map(lambda x: dialogue(**x), ***_arr)) ass_str += '\n'.join(map(lambda x: dialogue(**x), ass_arr))
file.write(***_str) file.write(ass_str)
def interpolate_nans(x, method='nearest'): def interpolate_nans(x, method='nearest'):
if x.notnull().sum() > 1: if x.notnull().sum() > 1: