From 228b857597488ab7c705bd5ad680818dee769dbd Mon Sep 17 00:00:00 2001 From: Max Bain Date: Mon, 19 Dec 2022 19:12:50 +0000 Subject: [PATCH] add back word .srt, update readme --- README.md | 41 ++++-- examples/whisperx/sample01.wav.word.srt | 176 ++++++++++++------------ whisperx/transcribe.py | 36 ++++- 3 files changed, 147 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index 195dac7..482288c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,24 @@

WhisperX

-
Made by Max Bain • :globe_with_meridians: https://www.maxbain.com/
+

+ + GitHub stars + + + GitHub issues + + + GitHub license + + + Twitter + +

+ + +
Made by Max Bain • :globe_with_meridians: https://www.maxbain.com

Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy using forced alignment. @@ -28,22 +47,22 @@ Install this package using You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.

Examples💬

- ### English Run whisper on example segment (using default params) -`whisperx examples/sample01.wav --model medium.en --output examples/whisperx --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --align_extend 2` +`whisperx examples/sample01.wav --model medium.en --output examples/whisperx --align_model WAV2VEC2_ASR_BASE_960H --align_extend 2` -If low gpu memory is required, use a smaller align model e.g. `WAV2VEC2_ASR_BASE_LV60K_960H` +For increased timestamp accuracy, at the cost of higher gpu mem, use a bigger alignment model e.g. -Using normal whisper out of the box, many transcriptions are out of sync: +`WAV2VEC2_ASR_LARGE_LV60K_960H` or `HUBERT_ASR_XLARGE` -https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov - -Now, using *WhisperX* with forced alignment to wav2vec2.0: +Result using *WhisperX* with forced alignment to wav2vec2.0 large: https://user-images.githubusercontent.com/36994049/208253969-7e35fe2a-7541-434a-ae91-8e919540555d.mp4 +Compare this to original whisper out the box, where many transcriptions are out of sync: + +https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov ## Other Languages @@ -78,7 +97,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-

Limitations ⚠️

-- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know its results on your data +- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know the results on your data - Whisper normalises spoken numbers e.g. "fifty seven" to arabic numerals "57". Need to perform this normalization after alignment, so the phonemes can be aligned. Currently just ignores numbers. - Assumes the initial whisper timestamps are accurate to some degree (within margin of 2 seconds, adjust if needed -- bigger margins more prone to alignment errors) - Hacked this up quite quickly, there might be some errors, please raise an issue if you encounter any. @@ -91,7 +110,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54- [ ] Automatic align model selection based on language detection -[ ] Reduce GPU (clear cache etc.) +[ ] Option to minimise gpu load (chunk wav2vec) [ ] Incorporating word-level speaker diarization @@ -99,7 +118,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-

Contact

-Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk if using this for commerical purposes. +Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk if using this commercially.

Acknowledgements 🙏

diff --git a/examples/whisperx/sample01.wav.word.srt b/examples/whisperx/sample01.wav.word.srt index c07d0e5..d7ec8fb 100644 --- a/examples/whisperx/sample01.wav.word.srt +++ b/examples/whisperx/sample01.wav.word.srt @@ -431,194 +431,194 @@ green case. 109 -00:00:38,095 --> 00:00:38,256 +00:00:38,135 --> 00:00:38,255 Do 110 -00:00:38,276 --> 00:00:38,356 +00:00:38,275 --> 00:00:38,355 you 111 -00:00:38,376 --> 00:00:38,516 +00:00:38,375 --> 00:00:38,535 want 112 -00:00:38,556 --> 00:00:38,736 +00:00:38,555 --> 00:00:38,736 your 113 -00:00:38,877 --> 00:00:39,297 +00:00:38,876 --> 00:00:39,296 PJs? 114 -00:00:39,862 --> 00:00:40,185 +00:00:39,879 --> 00:00:40,181 Yeah. 115 -00:00:42,394 --> 00:00:42,474 -Yeah. - -116 -00:00:42,474 --> 00:00:42,694 +00:00:42,388 --> 00:00:42,689 Lifting -117 -00:00:42,714 --> 00:00:42,754 +116 +00:00:42,729 --> 00:00:42,749 a -118 -00:00:42,794 --> 00:00:43,095 +117 +00:00:42,809 --> 00:00:43,110 bundle -119 -00:00:43,135 --> 00:00:43,195 +118 +00:00:43,131 --> 00:00:43,191 of -120 -00:00:43,235 --> 00:00:43,776 +119 +00:00:43,251 --> 00:00:43,773 pajamas, -121 -00:00:44,076 --> 00:00:44,316 +120 +00:00:44,073 --> 00:00:44,314 Peter -122 -00:00:44,376 --> 00:00:44,637 +121 +00:00:44,374 --> 00:00:44,634 finds -123 -00:00:44,677 --> 00:00:44,697 +122 +00:00:44,674 --> 00:00:44,694 a -124 -00:00:44,757 --> 00:00:44,957 +123 +00:00:44,754 --> 00:00:44,955 sheet -125 -00:00:44,997 --> 00:00:45,057 +124 +00:00:44,995 --> 00:00:45,055 of -126 -00:00:45,117 --> 00:00:45,418 +125 +00:00:45,115 --> 00:00:45,456 paper -127 -00:00:45,538 --> 00:00:45,899 +126 +00:00:45,536 --> 00:00:45,876 labeled -128 -00:00:46,341 --> 00:00:47,043 +127 +00:00:46,338 --> 00:00:47,041 Lancaster -129 -00:00:47,124 --> 00:00:47,384 +128 +00:00:47,121 --> 00:00:47,382 North -130 -00:00:47,445 --> 00:00:47,946 +129 +00:00:47,442 --> 00:00:47,944 Hospital -131 -00:00:48,267 --> 00:00:48,930 +130 +00:00:48,266 --> 00:00:48,928 discharge -132 -00:00:49,030 --> 
00:00:49,251 +131 +00:00:49,029 --> 00:00:49,249 sheet. -133 -00:00:50,293 --> 00:00:50,373 +132 +00:00:50,291 --> 00:00:50,371 He -134 -00:00:50,413 --> 00:00:50,774 +133 +00:00:50,412 --> 00:00:50,772 closes -135 -00:00:50,814 --> 00:00:50,914 +134 +00:00:50,812 --> 00:00:50,912 the -136 -00:00:50,954 --> 00:00:51,395 +135 +00:00:50,953 --> 00:00:51,393 suitcase -137 -00:00:51,435 --> 00:00:51,515 +136 +00:00:51,433 --> 00:00:51,514 and -138 -00:00:51,535 --> 00:00:51,796 +137 +00:00:51,534 --> 00:00:51,794 brings -139 -00:00:51,836 --> 00:00:52,217 +138 +00:00:51,834 --> 00:00:52,235 Gloria -140 -00:00:52,257 --> 00:00:52,317 +139 +00:00:52,255 --> 00:00:52,315 the -141 -00:00:52,357 --> 00:00:52,858 +140 +00:00:52,355 --> 00:00:52,856 pajamas. -142 -00:00:54,187 --> 00:00:54,489 +141 +00:00:54,186 --> 00:00:54,488 There -143 -00:00:54,550 --> 00:00:54,771 +142 +00:00:54,549 --> 00:00:54,771 you -144 -00:00:54,791 --> 00:00:54,832 +143 +00:00:54,791 --> 00:00:54,831 go. -145 -00:00:55,655 --> 00:00:55,755 +144 +00:00:55,654 --> 00:00:55,775 Thank -146 -00:00:55,775 --> 00:00:55,896 +145 +00:00:55,795 --> 00:00:55,895 you. -147 -00:00:55,916 --> 00:00:55,956 +146 +00:00:55,895 --> 00:00:55,936 He -148 -00:00:55,976 --> 00:00:56,077 +147 +00:00:55,956 --> 00:00:56,097 picks -149 -00:00:56,097 --> 00:00:56,198 +148 +00:00:56,117 --> 00:00:56,198 up -150 +149 00:00:56,218 --> 00:00:56,319 the -151 +150 00:00:56,359 --> 00:00:56,742 locket. -152 +151 00:00:57,124 --> 00:00:57,225 -He +You -153 +152 00:00:57,265 --> 00:00:57,466 kept -154 +153 00:00:57,547 --> 00:00:57,627 it. -155 -00:00:58,874 --> 00:00:58,995 +154 +00:00:58,874 --> 00:00:58,994 Oh, -156 -00:00:59,678 --> 00:00:59,899 -cool. +155 +00:00:59,276 --> 00:00:59,578 +of + +156 +00:00:59,678 --> 00:00:59,960 +course. 
diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index c915aca..174cdbd 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -255,6 +255,7 @@ def align( device: str, extend_duration: float = 0.0, start_from_previous: bool = True, + drop_non_aligned_words: bool = False, ): print("Performing alignment...") if not torch.is_tensor(audio): @@ -267,6 +268,7 @@ def align( MAX_DURATION = audio.shape[1] / SAMPLE_RATE prev_t2 = 0 + word_segments_list = [] for idx, segment in enumerate(transcript): t1 = max(segment['start'] - extend_duration, 0) t2 = min(segment['end'] + extend_duration, MAX_DURATION) @@ -313,8 +315,7 @@ def align( segment['end'] = t2_actual prev_t2 = segment['end'] - - # merge missing words to previous, or merge with next word ahead if idx == 0 + # for the .ass output for x in range(len(t_local)): curr_word = t_words[x] curr_timestamp = t_local[x] @@ -323,15 +324,29 @@ def align( else: segment['word-level'].append({"text": curr_word, "start": None, "end": None}) + # for per-word .srt ouput + # merge missing words to previous, or merge with next word ahead if idx == 0 + for x in range(len(t_local)): + curr_word = t_words[x] + curr_timestamp = t_local[x] + if curr_timestamp is not None: + word_segments_list.append({"text": curr_word, "start": curr_timestamp[0], "end": curr_timestamp[1]}) + elif not drop_non_aligned_words: + # then we merge + if x == 0: + t_words[x+1] = " ".join([curr_word, t_words[x+1]]) + else: + word_segments_list[-1]['text'] += ' ' + curr_word else: # then we resort back to original whisper timestamps # segment['start] and segment['end'] are unchanged prev_t2 = 0 segment['word-level'].append({"text": segment['text'], "start": segment['start'], "end":segment['end']}) + word_segments_list.append({"text": segment['text'], "start": segment['start'], "end":segment['end']}) print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {segment['text']}") - return {"segments": transcript} + return 
{"segments": transcript, "word_segments": word_segments_list} def cli(): from . import available_models @@ -342,9 +357,10 @@ def cli(): parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default") parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") # alignment params - parser.add_argument("--align_model", default="WAV2VEC2_ASR_LARGE_LV60K_960H", help="Name of phoneme-level ASR model to do alignment") + parser.add_argument("--align_model", default="WAV2VEC2_ASR_BASE_960H", help="Name of phoneme-level ASR model to do alignment") parser.add_argument("--align_extend", default=2, type=float, help="Seconds before and after to extend the whisper segments for alignment") parser.add_argument("--align_from_prev", default=True, type=bool, help="Whether to clip the alignment start time of current segment to the end time of the last aligned word of the previous segment") + parser.add_argument("--drop_non_aligned", action="store_true", help="For word .srt, whether to drop non aliged words, or merge them into neighbouring.") parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs") parser.add_argument("--output_type", default="srt", choices=['all', 'srt', 'vtt', 'txt'], help="directory to save the outputs") @@ -381,7 +397,7 @@ def cli(): align_model: str = args.pop("align_model") align_extend: float = args.pop("align_extend") align_from_prev: bool = args.pop("align_from_prev") - # align_interpolate_missing: bool = args.pop("align_interpolate_missing") + drop_non_aligned: bool = args.pop("drop_non_aligned") os.makedirs(output_dir, exist_ok=True) @@ -409,12 +425,14 @@ def cli(): labels = bundle.get_labels() align_dictionary = {c.lower(): i for i, c in enumerate(labels)} else: - print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n 
{torchaudio.pipelines.__all__}') + print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n\ + {torchaudio.pipelines.__all__}\n\ + See details here https://pytorch.org/audio/stable/pipelines.html#id14') raise ValueError(f'Align model "{align_model}" not found in torchaudio.pipelines') for audio_path in args.pop("audio"): result = transcribe(model, audio_path, temperature=temperature, **args) result_aligned = align(result["segments"], align_model, align_dictionary, audio_path, device, - extend_duration=align_extend, start_from_previous=align_from_prev) + extend_duration=align_extend, start_from_previous=align_from_prev, drop_non_aligned_words=drop_non_aligned) audio_basename = os.path.basename(audio_path) # save TXT @@ -432,6 +450,10 @@ def cli(): with open(os.path.join(output_dir, audio_basename + ".srt"), "w", encoding="utf-8") as srt: write_srt(result_aligned["segments"], file=srt) + # save per-word SRT + with open(os.path.join(output_dir, audio_basename + ".word.srt"), "w", encoding="utf-8") as srt: + write_srt(result_aligned["word_segments"], file=srt) + # save ASS with open(os.path.join(output_dir, audio_basename + ".ass"), "w", encoding="utf-8") as srt: write_ass(result_aligned["segments"], file=srt)