From 228b857597488ab7c705bd5ad680818dee769dbd Mon Sep 17 00:00:00 2001
From: Max Bain Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy using forced alignment.
@@ -28,22 +47,22 @@ Install this package using
You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
WhisperX
-Made by Max Bain • :globe_with_meridians: https://www.maxbain.com/
+
+
+
+Made by Max Bain • :globe_with_meridians: https://www.maxbain.com
Examples💬
-
### English
Run whisper on example segment (using default params)
-`whisperx examples/sample01.wav --model medium.en --output examples/whisperx --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --align_extend 2`
+`whisperx examples/sample01.wav --model medium.en --output examples/whisperx --align_model WAV2VEC2_ASR_BASE_960H --align_extend 2`
-If low gpu memory is required, use a smaller align model e.g. `WAV2VEC2_ASR_BASE_LV60K_960H`
+For increased timestamp accuracy, at the cost of higher gpu mem, use a bigger alignment model e.g.
-Using normal whisper out of the box, many transcriptions are out of sync:
+`WAV2VEC2_ASR_LARGE_LV60K_960H` or `HUBERT_ASR_XLARGE`
-https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov
-
-Now, using *WhisperX* with forced alignment to wav2vec2.0:
+Result using *WhisperX* with forced alignment to wav2vec2.0 large:
https://user-images.githubusercontent.com/36994049/208253969-7e35fe2a-7541-434a-ae91-8e919540555d.mp4
+Compare this to original whisper out the box, where many transcriptions are out of sync:
+
+https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov
## Other Languages
@@ -78,7 +97,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-
Limitations ⚠️
-- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know its results on your data
+- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know the results on your data
- Whisper normalises spoken numbers e.g. "fifty seven" to arabic numerals "57". Need to perform this normalization after alignment, so the phonemes can be aligned. Currently just ignores numbers.
- Assumes the initial whisper timestamps are accurate to some degree (within margin of 2 seconds, adjust if needed -- bigger margins more prone to alignment errors)
- Hacked this up quite quickly, there might be some errors, please raise an issue if you encounter any.
@@ -91,7 +110,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-
[ ] Automatic align model selection based on language detection
-[ ] Reduce GPU (clear cache etc.)
+[ ] Option to minimise gpu load (chunk wav2vec)
[ ] Incorporating word-level speaker diarization
@@ -99,7 +118,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-
Contact
-Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk if using this for commerical purposes.
+Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk if using this commercially.
Acknowledgements 🙏
diff --git a/examples/whisperx/sample01.wav.word.srt b/examples/whisperx/sample01.wav.word.srt
index c07d0e5..d7ec8fb 100644
--- a/examples/whisperx/sample01.wav.word.srt
+++ b/examples/whisperx/sample01.wav.word.srt
@@ -431,194 +431,194 @@ green
case.
109
-00:00:38,095 --> 00:00:38,256
+00:00:38,135 --> 00:00:38,255
Do
110
-00:00:38,276 --> 00:00:38,356
+00:00:38,275 --> 00:00:38,355
you
111
-00:00:38,376 --> 00:00:38,516
+00:00:38,375 --> 00:00:38,535
want
112
-00:00:38,556 --> 00:00:38,736
+00:00:38,555 --> 00:00:38,736
your
113
-00:00:38,877 --> 00:00:39,297
+00:00:38,876 --> 00:00:39,296
PJs?
114
-00:00:39,862 --> 00:00:40,185
+00:00:39,879 --> 00:00:40,181
Yeah.
115
-00:00:42,394 --> 00:00:42,474
-Yeah.
-
-116
-00:00:42,474 --> 00:00:42,694
+00:00:42,388 --> 00:00:42,689
Lifting
-117
-00:00:42,714 --> 00:00:42,754
+116
+00:00:42,729 --> 00:00:42,749
a
-118
-00:00:42,794 --> 00:00:43,095
+117
+00:00:42,809 --> 00:00:43,110
bundle
-119
-00:00:43,135 --> 00:00:43,195
+118
+00:00:43,131 --> 00:00:43,191
of
-120
-00:00:43,235 --> 00:00:43,776
+119
+00:00:43,251 --> 00:00:43,773
pajamas,
-121
-00:00:44,076 --> 00:00:44,316
+120
+00:00:44,073 --> 00:00:44,314
Peter
-122
-00:00:44,376 --> 00:00:44,637
+121
+00:00:44,374 --> 00:00:44,634
finds
-123
-00:00:44,677 --> 00:00:44,697
+122
+00:00:44,674 --> 00:00:44,694
a
-124
-00:00:44,757 --> 00:00:44,957
+123
+00:00:44,754 --> 00:00:44,955
sheet
-125
-00:00:44,997 --> 00:00:45,057
+124
+00:00:44,995 --> 00:00:45,055
of
-126
-00:00:45,117 --> 00:00:45,418
+125
+00:00:45,115 --> 00:00:45,456
paper
-127
-00:00:45,538 --> 00:00:45,899
+126
+00:00:45,536 --> 00:00:45,876
labeled
-128
-00:00:46,341 --> 00:00:47,043
+127
+00:00:46,338 --> 00:00:47,041
Lancaster
-129
-00:00:47,124 --> 00:00:47,384
+128
+00:00:47,121 --> 00:00:47,382
North
-130
-00:00:47,445 --> 00:00:47,946
+129
+00:00:47,442 --> 00:00:47,944
Hospital
-131
-00:00:48,267 --> 00:00:48,930
+130
+00:00:48,266 --> 00:00:48,928
discharge
-132
-00:00:49,030 --> 00:00:49,251
+131
+00:00:49,029 --> 00:00:49,249
sheet.
-133
-00:00:50,293 --> 00:00:50,373
+132
+00:00:50,291 --> 00:00:50,371
He
-134
-00:00:50,413 --> 00:00:50,774
+133
+00:00:50,412 --> 00:00:50,772
closes
-135
-00:00:50,814 --> 00:00:50,914
+134
+00:00:50,812 --> 00:00:50,912
the
-136
-00:00:50,954 --> 00:00:51,395
+135
+00:00:50,953 --> 00:00:51,393
suitcase
-137
-00:00:51,435 --> 00:00:51,515
+136
+00:00:51,433 --> 00:00:51,514
and
-138
-00:00:51,535 --> 00:00:51,796
+137
+00:00:51,534 --> 00:00:51,794
brings
-139
-00:00:51,836 --> 00:00:52,217
+138
+00:00:51,834 --> 00:00:52,235
Gloria
-140
-00:00:52,257 --> 00:00:52,317
+139
+00:00:52,255 --> 00:00:52,315
the
-141
-00:00:52,357 --> 00:00:52,858
+140
+00:00:52,355 --> 00:00:52,856
pajamas.
-142
-00:00:54,187 --> 00:00:54,489
+141
+00:00:54,186 --> 00:00:54,488
There
-143
-00:00:54,550 --> 00:00:54,771
+142
+00:00:54,549 --> 00:00:54,771
you
-144
-00:00:54,791 --> 00:00:54,832
+143
+00:00:54,791 --> 00:00:54,831
go.
-145
-00:00:55,655 --> 00:00:55,755
+144
+00:00:55,654 --> 00:00:55,775
Thank
-146
-00:00:55,775 --> 00:00:55,896
+145
+00:00:55,795 --> 00:00:55,895
you.
-147
-00:00:55,916 --> 00:00:55,956
+146
+00:00:55,895 --> 00:00:55,936
He
-148
-00:00:55,976 --> 00:00:56,077
+147
+00:00:55,956 --> 00:00:56,097
picks
-149
-00:00:56,097 --> 00:00:56,198
+148
+00:00:56,117 --> 00:00:56,198
up
-150
+149
00:00:56,218 --> 00:00:56,319
the
-151
+150
00:00:56,359 --> 00:00:56,742
locket.
-152
+151
00:00:57,124 --> 00:00:57,225
-He
+You
-153
+152
00:00:57,265 --> 00:00:57,466
kept
-154
+153
00:00:57,547 --> 00:00:57,627
it.
-155
-00:00:58,874 --> 00:00:58,995
+154
+00:00:58,874 --> 00:00:58,994
Oh,
-156
-00:00:59,678 --> 00:00:59,899
-cool.
+155
+00:00:59,276 --> 00:00:59,578
+of
+
+156
+00:00:59,678 --> 00:00:59,960
+course.
diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index c915aca..174cdbd 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -255,6 +255,7 @@ def align(
device: str,
extend_duration: float = 0.0,
start_from_previous: bool = True,
+ drop_non_aligned_words: bool = False,
):
print("Performing alignment...")
if not torch.is_tensor(audio):
@@ -267,6 +268,7 @@ def align(
MAX_DURATION = audio.shape[1] / SAMPLE_RATE
prev_t2 = 0
+ word_segments_list = []
for idx, segment in enumerate(transcript):
t1 = max(segment['start'] - extend_duration, 0)
t2 = min(segment['end'] + extend_duration, MAX_DURATION)
@@ -313,8 +315,7 @@ def align(
segment['end'] = t2_actual
prev_t2 = segment['end']
-
- # merge missing words to previous, or merge with next word ahead if idx == 0
+ # for the .ass output
for x in range(len(t_local)):
curr_word = t_words[x]
curr_timestamp = t_local[x]
@@ -323,15 +324,29 @@ def align(
else:
segment['word-level'].append({"text": curr_word, "start": None, "end": None})
+    # for per-word .srt output
+ # merge missing words to previous, or merge with next word ahead if idx == 0
+ for x in range(len(t_local)):
+ curr_word = t_words[x]
+ curr_timestamp = t_local[x]
+ if curr_timestamp is not None:
+ word_segments_list.append({"text": curr_word, "start": curr_timestamp[0], "end": curr_timestamp[1]})
+ elif not drop_non_aligned_words:
+ # then we merge
+ if x == 0:
+ t_words[x+1] = " ".join([curr_word, t_words[x+1]])
+ else:
+ word_segments_list[-1]['text'] += ' ' + curr_word
else:
# then we resort back to original whisper timestamps
# segment['start] and segment['end'] are unchanged
prev_t2 = 0
segment['word-level'].append({"text": segment['text'], "start": segment['start'], "end":segment['end']})
+ word_segments_list.append({"text": segment['text'], "start": segment['start'], "end":segment['end']})
print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {segment['text']}")
- return {"segments": transcript}
+ return {"segments": transcript, "word_segments": word_segments_list}
def cli():
from . import available_models
@@ -342,9 +357,10 @@ def cli():
parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default")
parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference")
# alignment params
- parser.add_argument("--align_model", default="WAV2VEC2_ASR_LARGE_LV60K_960H", help="Name of phoneme-level ASR model to do alignment")
+ parser.add_argument("--align_model", default="WAV2VEC2_ASR_BASE_960H", help="Name of phoneme-level ASR model to do alignment")
parser.add_argument("--align_extend", default=2, type=float, help="Seconds before and after to extend the whisper segments for alignment")
parser.add_argument("--align_from_prev", default=True, type=bool, help="Whether to clip the alignment start time of current segment to the end time of the last aligned word of the previous segment")
+    parser.add_argument("--drop_non_aligned", action="store_true", help="For word .srt, whether to drop non-aligned words, or merge them into neighbouring words.")
parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs")
parser.add_argument("--output_type", default="srt", choices=['all', 'srt', 'vtt', 'txt'], help="directory to save the outputs")
@@ -381,7 +397,7 @@ def cli():
align_model: str = args.pop("align_model")
align_extend: float = args.pop("align_extend")
align_from_prev: bool = args.pop("align_from_prev")
- # align_interpolate_missing: bool = args.pop("align_interpolate_missing")
+ drop_non_aligned: bool = args.pop("drop_non_aligned")
os.makedirs(output_dir, exist_ok=True)
@@ -409,12 +425,14 @@ def cli():
labels = bundle.get_labels()
align_dictionary = {c.lower(): i for i, c in enumerate(labels)}
else:
- print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n {torchaudio.pipelines.__all__}')
+ print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n\
+ {torchaudio.pipelines.__all__}\n\
+ See details here https://pytorch.org/audio/stable/pipelines.html#id14')
raise ValueError(f'Align model "{align_model}" not found in torchaudio.pipelines')
for audio_path in args.pop("audio"):
result = transcribe(model, audio_path, temperature=temperature, **args)
result_aligned = align(result["segments"], align_model, align_dictionary, audio_path, device,
- extend_duration=align_extend, start_from_previous=align_from_prev)
+ extend_duration=align_extend, start_from_previous=align_from_prev, drop_non_aligned_words=drop_non_aligned)
audio_basename = os.path.basename(audio_path)
# save TXT
@@ -432,6 +450,10 @@ def cli():
with open(os.path.join(output_dir, audio_basename + ".srt"), "w", encoding="utf-8") as srt:
write_srt(result_aligned["segments"], file=srt)
+ # save per-word SRT
+ with open(os.path.join(output_dir, audio_basename + ".word.srt"), "w", encoding="utf-8") as srt:
+ write_srt(result_aligned["word_segments"], file=srt)
+
# save ASS
with open(os.path.join(output_dir, audio_basename + ".ass"), "w", encoding="utf-8") as srt:
write_ass(result_aligned["segments"], file=srt)
From cbaeb85034d2391027a348c67a148761f6109ea3 Mon Sep 17 00:00:00 2001
From: Max Bain
-
-
+
@@ -17,6 +17,15 @@
+ What is it • + Setup • + Example usage +
+ +