mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
fix starting timestamp when multiple leading words fail to align
This commit is contained in:
@ -95,7 +95,7 @@ https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-
|
|||||||
The phoneme ASR alignment model is *language-specific*; for tested languages, these models are [automatically picked from torchaudio pipelines or huggingface](https://github.com/m-bain/whisperX/blob/e909f2f766b23b2000f2d95df41f9b844ac53e49/whisperx/transcribe.py#L22).
|
The phoneme ASR alignment model is *language-specific*; for tested languages, these models are [automatically picked from torchaudio pipelines or huggingface](https://github.com/m-bain/whisperX/blob/e909f2f766b23b2000f2d95df41f9b844ac53e49/whisperx/transcribe.py#L22).
|
||||||
Just pass in the `--language` code, and use the whisper `--model large`.
|
Just pass in the `--language` code, and use the whisper `--model large`.
|
||||||
|
|
||||||
Currently, default models are provided for `{en, fr, de, es, it, ja, zh, nl}`. If the detected language is not in this list, you need to find a phoneme-based ASR model from the [huggingface model hub](https://huggingface.co/models) and test it on your data.
|
Currently, default models are provided for `{en, fr, de, es, it, ja, zh, nl, uk}`. If the detected language is not in this list, you need to find a phoneme-based ASR model from the [huggingface model hub](https://huggingface.co/models) and test it on your data.
|
||||||
|
|
||||||
|
|
||||||
#### E.g. German
|
#### E.g. German
|
||||||
@ -103,6 +103,9 @@ Currently default models provided for `{en, fr, de, es, it, ja, zh, nl}`. If the
|
|||||||
|
|
||||||
https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov
|
https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov
|
||||||
|
|
||||||
|
|
||||||
|
See more exac
|
||||||
|
|
||||||
## Python usage 🐍
|
## Python usage 🐍
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
@ -369,14 +369,16 @@ def align(
|
|||||||
|
|
||||||
# for per-word .srt output
|
# for per-word .srt output
|
||||||
# merge missing words to previous, or merge with next word ahead if idx == 0
|
# merge missing words to previous, or merge with next word ahead if idx == 0
|
||||||
|
found_first_ts = False
|
||||||
for x in range(len(t_local)):
|
for x in range(len(t_local)):
|
||||||
curr_word = t_words[x]
|
curr_word = t_words[x]
|
||||||
curr_timestamp = t_local[x]
|
curr_timestamp = t_local[x]
|
||||||
if curr_timestamp is not None:
|
if curr_timestamp is not None:
|
||||||
word_segments_list.append({"text": curr_word, "start": curr_timestamp[0], "end": curr_timestamp[1]})
|
word_segments_list.append({"text": curr_word, "start": curr_timestamp[0], "end": curr_timestamp[1]})
|
||||||
|
found_first_ts = True
|
||||||
elif not drop_non_aligned_words:
|
elif not drop_non_aligned_words:
|
||||||
# then we merge
|
# then we merge
|
||||||
if x == 0:
|
if not found_first_ts:
|
||||||
t_words[x+1] = " ".join([curr_word, t_words[x+1]])
|
t_words[x+1] = " ".join([curr_word, t_words[x+1]])
|
||||||
else:
|
else:
|
||||||
word_segments_list[-1]['text'] += ' ' + curr_word
|
word_segments_list[-1]['text'] += ' ' + curr_word
|
||||||
|
Reference in New Issue
Block a user