mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
missing word timestamps
This commit is contained in:
@ -330,8 +330,8 @@ def align(
|
|||||||
aligned_segments = []
|
aligned_segments = []
|
||||||
|
|
||||||
prev_t2 = 0
|
prev_t2 = 0
|
||||||
sdx = 0
|
|
||||||
for segment in transcript:
|
for segment in transcript:
|
||||||
|
aligned_subsegments = []
|
||||||
while True:
|
while True:
|
||||||
segment_align_success = False
|
segment_align_success = False
|
||||||
|
|
||||||
@ -505,7 +505,7 @@ def align(
|
|||||||
else:
|
else:
|
||||||
word_level = None
|
word_level = None
|
||||||
|
|
||||||
aligned_segments.append(
|
aligned_subsegments.append(
|
||||||
{
|
{
|
||||||
"text": segment["seg-text"][sub_seg_idx],
|
"text": segment["seg-text"][sub_seg_idx],
|
||||||
"start": seg_start_actual,
|
"start": seg_start_actual,
|
||||||
@ -515,10 +515,7 @@ def align(
|
|||||||
}
|
}
|
||||||
)
|
)
|
||||||
if "language" in segment:
|
if "language" in segment:
|
||||||
aligned_segments[-1]["language"] = segment["language"]
|
aligned_subsegments[-1]["language"] = segment["language"]
|
||||||
|
|
||||||
print(f"[{format_timestamp(aligned_segments[-1]['start'])} --> {format_timestamp(aligned_segments[-1]['end'])}] {aligned_segments[-1]['text']}")
|
|
||||||
|
|
||||||
|
|
||||||
char_level = {
|
char_level = {
|
||||||
"start": [],
|
"start": [],
|
||||||
@ -556,11 +553,13 @@ def align(
|
|||||||
if not segment_align_success:
|
if not segment_align_success:
|
||||||
prev_t2 = 0
|
prev_t2 = 0
|
||||||
|
|
||||||
# shift segment index by amount of sub-segments
|
start = interpolate_nans(pd.DataFrame(aligned_subsegments)["start"], method=interpolate_method)
|
||||||
if "seg-text" in segment:
|
end = interpolate_nans(pd.DataFrame(aligned_subsegments)["end"], method=interpolate_method)
|
||||||
sdx += len(segment["seg-text"])
|
for idx, seg in enumerate(aligned_subsegments):
|
||||||
else:
|
seg['start'] = start.iloc[idx]
|
||||||
sdx += 1
|
seg['end'] = end.iloc[idx]
|
||||||
|
|
||||||
|
aligned_segments += aligned_subsegments
|
||||||
|
|
||||||
# create word level segments for .srt
|
# create word level segments for .srt
|
||||||
word_seg = []
|
word_seg = []
|
||||||
|
@ -4,7 +4,7 @@ from typing import Callable, TextIO, Iterator, Tuple
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
def exact_div(x, y):
|
def exact_div(x, y):
|
||||||
***ert x % y == 0
|
assert x % y == 0
|
||||||
return x // y
|
return x // y
|
||||||
|
|
||||||
|
|
||||||
@ -30,7 +30,7 @@ def compression_ratio(text) -> float:
|
|||||||
|
|
||||||
|
|
||||||
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
|
def format_timestamp(seconds: float, always_include_hours: bool = False, decimal_marker: str = '.'):
|
||||||
***ert seconds >= 0, "non-negative timestamp expected"
|
assert seconds >= 0, "non-negative timestamp expected"
|
||||||
milliseconds = round(seconds * 1000.0)
|
milliseconds = round(seconds * 1000.0)
|
||||||
|
|
||||||
hours = milliseconds // 3_600_000
|
hours = milliseconds // 3_600_000
|
||||||
@ -105,11 +105,11 @@ def write_ass(transcript: Iterator[dict],
|
|||||||
strip=True, **kwargs):
|
strip=True, **kwargs):
|
||||||
"""
|
"""
|
||||||
Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py
|
Credit: https://github.com/jianfch/stable-ts/blob/ff79549bd01f764427879f07ecd626c46a9a430a/stable_whisper/text_output.py
|
||||||
Generate Advanced SubStation Alpha (***) file from results to
|
Generate Advanced SubStation Alpha (ass) file from results to
|
||||||
display both phrase-level & word-level timestamp simultaneously by:
|
display both phrase-level & word-level timestamp simultaneously by:
|
||||||
-using segment-level timestamps display phrases as usual
|
-using segment-level timestamps display phrases as usual
|
||||||
-using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment
|
-using word-level timestamps change formats (e.g. color/underline) of the word in the displayed segment
|
||||||
Note: *** file is used in the same way as srt, vtt, etc.
|
Note: ass file is used in the same way as srt, vtt, etc.
|
||||||
Parameters
|
Parameters
|
||||||
----------
|
----------
|
||||||
transcript: dict
|
transcript: dict
|
||||||
@ -125,14 +125,14 @@ def write_ass(transcript: Iterator[dict],
|
|||||||
whether to underline a word at its corresponding timestamp
|
whether to underline a word at its corresponding timestamp
|
||||||
prefmt: str
|
prefmt: str
|
||||||
used to specify format for word-level timestamps (must be use with 'suffmt' and overrides 'color'&'underline')
|
used to specify format for word-level timestamps (must be use with 'suffmt' and overrides 'color'&'underline')
|
||||||
appears as such in the .*** file:
|
appears as such in the .ass file:
|
||||||
Hi, {<prefmt>}how{<suffmt>} are you?
|
Hi, {<prefmt>}how{<suffmt>} are you?
|
||||||
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm
|
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
|
||||||
suffmt: str
|
suffmt: str
|
||||||
used to specify format for word-level timestamps (must be use with 'prefmt' and overrides 'color'&'underline')
|
used to specify format for word-level timestamps (must be use with 'prefmt' and overrides 'color'&'underline')
|
||||||
appears as such in the .*** file:
|
appears as such in the .ass file:
|
||||||
Hi, {<prefmt>}how{<suffmt>} are you?
|
Hi, {<prefmt>}how{<suffmt>} are you?
|
||||||
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/***-specs.htm
|
reference [Appendix A: Style override codes] in http://www.tcax.org/docs/ass-specs.htm
|
||||||
font: str
|
font: str
|
||||||
word font (default: Arial)
|
word font (default: Arial)
|
||||||
font_size: int
|
font_size: int
|
||||||
@ -165,13 +165,13 @@ def write_ass(transcript: Iterator[dict],
|
|||||||
|
|
||||||
styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'
|
styles = f'Style: {",".join(map(str, fmt_style_dict.values()))}'
|
||||||
|
|
||||||
***_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
|
ass_str = f'[Script Info]\nScriptType: v4.00+\nPlayResX: 384\nPlayResY: 288\nScaledBorderAndShadow: yes\n\n' \
|
||||||
f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
|
f'[V4+ Styles]\n{fmts}\n{styles}\n\n' \
|
||||||
f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'
|
f'[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n\n'
|
||||||
|
|
||||||
if prefmt or suffmt:
|
if prefmt or suffmt:
|
||||||
if suffmt:
|
if suffmt:
|
||||||
***ert prefmt, 'prefmt must be used along with suffmt'
|
assert prefmt, 'prefmt must be used along with suffmt'
|
||||||
else:
|
else:
|
||||||
suffmt = r'\r'
|
suffmt = r'\r'
|
||||||
else:
|
else:
|
||||||
@ -201,9 +201,9 @@ def write_ass(transcript: Iterator[dict],
|
|||||||
elif resolution == "char":
|
elif resolution == "char":
|
||||||
resolution_key = "char-segments"
|
resolution_key = "char-segments"
|
||||||
else:
|
else:
|
||||||
raise ValueError(".*** resolution should be 'word' or 'char', not ", resolution)
|
raise ValueError(".ass resolution should be 'word' or 'char', not ", resolution)
|
||||||
|
|
||||||
***_arr = []
|
ass_arr = []
|
||||||
|
|
||||||
for segment in transcript:
|
for segment in transcript:
|
||||||
if resolution_key in segment:
|
if resolution_key in segment:
|
||||||
@ -231,7 +231,7 @@ def write_ass(transcript: Iterator[dict],
|
|||||||
"idx_1": -1
|
"idx_1": -1
|
||||||
}
|
}
|
||||||
|
|
||||||
***_arr.append(filler_ts)
|
ass_arr.append(filler_ts)
|
||||||
# highlight current word
|
# highlight current word
|
||||||
f_word_ts = {
|
f_word_ts = {
|
||||||
"chars": speaker_str + segment['text'],
|
"chars": speaker_str + segment['text'],
|
||||||
@ -240,12 +240,12 @@ def write_ass(transcript: Iterator[dict],
|
|||||||
"idx_0": idx_0 + len(speaker_str),
|
"idx_0": idx_0 + len(speaker_str),
|
||||||
"idx_1": idx_1 + len(speaker_str)
|
"idx_1": idx_1 + len(speaker_str)
|
||||||
}
|
}
|
||||||
***_arr.append(f_word_ts)
|
ass_arr.append(f_word_ts)
|
||||||
prev = crow['end']
|
prev = crow['end']
|
||||||
|
|
||||||
***_str += '\n'.join(map(lambda x: dialogue(**x), ***_arr))
|
ass_str += '\n'.join(map(lambda x: dialogue(**x), ass_arr))
|
||||||
|
|
||||||
file.write(***_str)
|
file.write(ass_str)
|
||||||
|
|
||||||
def interpolate_nans(x, method='nearest'):
|
def interpolate_nans(x, method='nearest'):
|
||||||
if x.notnull().sum() > 1:
|
if x.notnull().sum() > 1:
|
||||||
|
Reference in New Issue
Block a user