diff --git a/README.md b/README.md index 32ab80b..93fa776 100644 --- a/README.md +++ b/README.md @@ -95,7 +95,7 @@ https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2- The phoneme ASR alignment model is *language-specific*, for tested languages these models are [automatically picked from torchaudio pipelines or huggingface](https://github.com/m-bain/whisperX/blob/e909f2f766b23b2000f2d95df41f9b844ac53e49/whisperx/transcribe.py#L22). Just pass in the `--language` code, and use the whisper `--model large`. -Currently default models provided for `{en, fr, de, es, it, ja, zh, nl}`. If the detected language is not in this list, you need to find a phoneme-based ASR model from [huggingface model hub](https://huggingface.co/models) and test it on your data. +Currently, default models are provided for `{en, fr, de, es, it, ja, zh, nl, uk}`. If the detected language is not in this list, you need to find a phoneme-based ASR model from [huggingface model hub](https://huggingface.co/models) and test it on your data. #### E.g. German @@ -103,6 +103,9 @@ Currently default models provided for `{en, fr, de, es, it, ja, zh, nl}`. 
If the https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov + +See more exac + ## Python usage 🐍 ```python diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index 7375b9b..ed0c2e9 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -369,14 +369,16 @@ def align( # for per-word .srt ouput # merge missing words to previous, or merge with next word ahead if idx == 0 + found_first_ts = False for x in range(len(t_local)): curr_word = t_words[x] curr_timestamp = t_local[x] if curr_timestamp is not None: word_segments_list.append({"text": curr_word, "start": curr_timestamp[0], "end": curr_timestamp[1]}) + found_first_ts = True elif not drop_non_aligned_words: # then we merge - if x == 0: + if not found_first_ts: t_words[x+1] = " ".join([curr_word, t_words[x+1]]) else: word_segments_list[-1]['text'] += ' ' + curr_word