From 228b857597488ab7c705bd5ad680818dee769dbd Mon Sep 17 00:00:00 2001 From: Max Bain Date: Mon, 19 Dec 2022 19:12:50 +0000 Subject: [PATCH] add back word .srt, update readme --- README.md | 41 ++++-- examples/whisperx/sample01.wav.word.srt | 176 ++++++++++++------------ whisperx/transcribe.py | 36 ++++- 3 files changed, 147 insertions(+), 106 deletions(-) diff --git a/README.md b/README.md index 195dac7..482288c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,24 @@

WhisperX

-
Made by Max Bain • :globe_with_meridians: https://www.maxbain.com/
+

+ + GitHub stars + + + GitHub issues + + + GitHub license + + + Twitter + +

+ + +
Made by Max Bain • :globe_with_meridians: https://www.maxbain.com

Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy using forced alignment. @@ -28,22 +47,22 @@ Install this package using You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.

Examples💬

- ### English Run whisper on example segment (using default params) -`whisperx examples/sample01.wav --model medium.en --output examples/whisperx --align_model WAV2VEC2_ASR_LARGE_LV60K_960H --align_extend 2` +`whisperx examples/sample01.wav --model medium.en --output examples/whisperx --align_model WAV2VEC2_ASR_BASE_960H --align_extend 2` -If low gpu memory is required, use a smaller align model e.g. `WAV2VEC2_ASR_BASE_LV60K_960H` +For increased timestamp accuracy, at the cost of higher gpu mem, use a bigger alignment model e.g. -Using normal whisper out of the box, many transcriptions are out of sync: +`WAV2VEC2_ASR_LARGE_LV60K_960H` or `HUBERT_ASR_XLARGE` -https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov - -Now, using *WhisperX* with forced alignment to wav2vec2.0: +Result using *WhisperX* with forced alignment to wav2vec2.0 large: https://user-images.githubusercontent.com/36994049/208253969-7e35fe2a-7541-434a-ae91-8e919540555d.mp4 +Compare this to original whisper out the box, where many transcriptions are out of sync: + +https://user-images.githubusercontent.com/36994049/207743923-b4f0d537-29ae-4be2-b404-bb941db73652.mov ## Other Languages @@ -78,7 +97,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-

Limitations ⚠️

-- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know its results on your data +- Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know the results on your data - Whisper normalises spoken numbers e.g. "fifty seven" to arabic numerals "57". Need to perform this normalization after alignment, so the phonemes can be aligned. Currently just ignores numbers. - Assumes the initial whisper timestamps are accurate to some degree (within margin of 2 seconds, adjust if needed -- bigger margins more prone to alignment errors) - Hacked this up quite quickly, there might be some errors, please raise an issue if you encounter any. @@ -91,7 +110,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54- [ ] Automatic align model selection based on language detection -[ ] Reduce GPU (clear cache etc.) +[ ] Option to minimise gpu load (chunk wav2vec) [ ] Incorporating word-level speaker diarization @@ -99,7 +118,7 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-

Contact

-Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk if using this for commerical purposes. +Contact maxbain[at]robots[dot]ox[dot]ac[dot]uk if using this commercially.

Acknowledgements 🙏

diff --git a/examples/whisperx/sample01.wav.word.srt b/examples/whisperx/sample01.wav.word.srt index c07d0e5..d7ec8fb 100644 --- a/examples/whisperx/sample01.wav.word.srt +++ b/examples/whisperx/sample01.wav.word.srt @@ -431,194 +431,194 @@ green case. 109 -00:00:38,095 --> 00:00:38,256 +00:00:38,135 --> 00:00:38,255 Do 110 -00:00:38,276 --> 00:00:38,356 +00:00:38,275 --> 00:00:38,355 you 111 -00:00:38,376 --> 00:00:38,516 +00:00:38,375 --> 00:00:38,535 want 112 -00:00:38,556 --> 00:00:38,736 +00:00:38,555 --> 00:00:38,736 your 113 -00:00:38,877 --> 00:00:39,297 +00:00:38,876 --> 00:00:39,296 PJs? 114 -00:00:39,862 --> 00:00:40,185 +00:00:39,879 --> 00:00:40,181 Yeah. 115 -00:00:42,394 --> 00:00:42,474 -Yeah. - -116 -00:00:42,474 --> 00:00:42,694 +00:00:42,388 --> 00:00:42,689 Lifting -117 -00:00:42,714 --> 00:00:42,754 +116 +00:00:42,729 --> 00:00:42,749 a -118 -00:00:42,794 --> 00:00:43,095 +117 +00:00:42,809 --> 00:00:43,110 bundle -119 -00:00:43,135 --> 00:00:43,195 +118 +00:00:43,131 --> 00:00:43,191 of -120 -00:00:43,235 --> 00:00:43,776 +119 +00:00:43,251 --> 00:00:43,773 pajamas, -121 -00:00:44,076 --> 00:00:44,316 +120 +00:00:44,073 --> 00:00:44,314 Peter -122 -00:00:44,376 --> 00:00:44,637 +121 +00:00:44,374 --> 00:00:44,634 finds -123 -00:00:44,677 --> 00:00:44,697 +122 +00:00:44,674 --> 00:00:44,694 a -124 -00:00:44,757 --> 00:00:44,957 +123 +00:00:44,754 --> 00:00:44,955 sheet -125 -00:00:44,997 --> 00:00:45,057 +124 +00:00:44,995 --> 00:00:45,055 of -126 -00:00:45,117 --> 00:00:45,418 +125 +00:00:45,115 --> 00:00:45,456 paper -127 -00:00:45,538 --> 00:00:45,899 +126 +00:00:45,536 --> 00:00:45,876 labeled -128 -00:00:46,341 --> 00:00:47,043 +127 +00:00:46,338 --> 00:00:47,041 Lancaster -129 -00:00:47,124 --> 00:00:47,384 +128 +00:00:47,121 --> 00:00:47,382 North -130 -00:00:47,445 --> 00:00:47,946 +129 +00:00:47,442 --> 00:00:47,944 Hospital -131 -00:00:48,267 --> 00:00:48,930 +130 +00:00:48,266 --> 00:00:48,928 discharge -132 -00:00:49,030 --> 
00:00:49,251 +131 +00:00:49,029 --> 00:00:49,249 sheet. -133 -00:00:50,293 --> 00:00:50,373 +132 +00:00:50,291 --> 00:00:50,371 He -134 -00:00:50,413 --> 00:00:50,774 +133 +00:00:50,412 --> 00:00:50,772 closes -135 -00:00:50,814 --> 00:00:50,914 +134 +00:00:50,812 --> 00:00:50,912 the -136 -00:00:50,954 --> 00:00:51,395 +135 +00:00:50,953 --> 00:00:51,393 suitcase -137 -00:00:51,435 --> 00:00:51,515 +136 +00:00:51,433 --> 00:00:51,514 and -138 -00:00:51,535 --> 00:00:51,796 +137 +00:00:51,534 --> 00:00:51,794 brings -139 -00:00:51,836 --> 00:00:52,217 +138 +00:00:51,834 --> 00:00:52,235 Gloria -140 -00:00:52,257 --> 00:00:52,317 +139 +00:00:52,255 --> 00:00:52,315 the -141 -00:00:52,357 --> 00:00:52,858 +140 +00:00:52,355 --> 00:00:52,856 pajamas. -142 -00:00:54,187 --> 00:00:54,489 +141 +00:00:54,186 --> 00:00:54,488 There -143 -00:00:54,550 --> 00:00:54,771 +142 +00:00:54,549 --> 00:00:54,771 you -144 -00:00:54,791 --> 00:00:54,832 +143 +00:00:54,791 --> 00:00:54,831 go. -145 -00:00:55,655 --> 00:00:55,755 +144 +00:00:55,654 --> 00:00:55,775 Thank -146 -00:00:55,775 --> 00:00:55,896 +145 +00:00:55,795 --> 00:00:55,895 you. -147 -00:00:55,916 --> 00:00:55,956 +146 +00:00:55,895 --> 00:00:55,936 He -148 -00:00:55,976 --> 00:00:56,077 +147 +00:00:55,956 --> 00:00:56,097 picks -149 -00:00:56,097 --> 00:00:56,198 +148 +00:00:56,117 --> 00:00:56,198 up -150 +149 00:00:56,218 --> 00:00:56,319 the -151 +150 00:00:56,359 --> 00:00:56,742 locket. -152 +151 00:00:57,124 --> 00:00:57,225 -He +You -153 +152 00:00:57,265 --> 00:00:57,466 kept -154 +153 00:00:57,547 --> 00:00:57,627 it. -155 -00:00:58,874 --> 00:00:58,995 +154 +00:00:58,874 --> 00:00:58,994 Oh, -156 -00:00:59,678 --> 00:00:59,899 -cool. +155 +00:00:59,276 --> 00:00:59,578 +of + +156 +00:00:59,678 --> 00:00:59,960 +course. 
diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index c915aca..174cdbd 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -255,6 +255,7 @@ def align( device: str, extend_duration: float = 0.0, start_from_previous: bool = True, + drop_non_aligned_words: bool = False, ): print("Performing alignment...") if not torch.is_tensor(audio): @@ -267,6 +268,7 @@ def align( MAX_DURATION = audio.shape[1] / SAMPLE_RATE prev_t2 = 0 + word_segments_list = [] for idx, segment in enumerate(transcript): t1 = max(segment['start'] - extend_duration, 0) t2 = min(segment['end'] + extend_duration, MAX_DURATION) @@ -313,8 +315,7 @@ def align( segment['end'] = t2_actual prev_t2 = segment['end'] - - # merge missing words to previous, or merge with next word ahead if idx == 0 + # for the .ass output for x in range(len(t_local)): curr_word = t_words[x] curr_timestamp = t_local[x] @@ -323,15 +324,29 @@ def align( else: segment['word-level'].append({"text": curr_word, "start": None, "end": None}) + # for per-word .srt ouput + # merge missing words to previous, or merge with next word ahead if idx == 0 + for x in range(len(t_local)): + curr_word = t_words[x] + curr_timestamp = t_local[x] + if curr_timestamp is not None: + word_segments_list.append({"text": curr_word, "start": curr_timestamp[0], "end": curr_timestamp[1]}) + elif not drop_non_aligned_words: + # then we merge + if x == 0: + t_words[x+1] = " ".join([curr_word, t_words[x+1]]) + else: + word_segments_list[-1]['text'] += ' ' + curr_word else: # then we resort back to original whisper timestamps # segment['start] and segment['end'] are unchanged prev_t2 = 0 segment['word-level'].append({"text": segment['text'], "start": segment['start'], "end":segment['end']}) + word_segments_list.append({"text": segment['text'], "start": segment['start'], "end":segment['end']}) print(f"[{format_timestamp(segment['start'])} --> {format_timestamp(segment['end'])}] {segment['text']}") - return {"segments": transcript} + return 
{"segments": transcript, "word_segments": word_segments_list} def cli(): from . import available_models @@ -342,9 +357,10 @@ def cli(): parser.add_argument("--model_dir", type=str, default=None, help="the path to save model files; uses ~/.cache/whisper by default") parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") # alignment params - parser.add_argument("--align_model", default="WAV2VEC2_ASR_LARGE_LV60K_960H", help="Name of phoneme-level ASR model to do alignment") + parser.add_argument("--align_model", default="WAV2VEC2_ASR_BASE_960H", help="Name of phoneme-level ASR model to do alignment") parser.add_argument("--align_extend", default=2, type=float, help="Seconds before and after to extend the whisper segments for alignment") parser.add_argument("--align_from_prev", default=True, type=bool, help="Whether to clip the alignment start time of current segment to the end time of the last aligned word of the previous segment") + parser.add_argument("--drop_non_aligned", action="store_true", help="For word .srt, whether to drop non aliged words, or merge them into neighbouring.") parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs") parser.add_argument("--output_type", default="srt", choices=['all', 'srt', 'vtt', 'txt'], help="directory to save the outputs") @@ -381,7 +397,7 @@ def cli(): align_model: str = args.pop("align_model") align_extend: float = args.pop("align_extend") align_from_prev: bool = args.pop("align_from_prev") - # align_interpolate_missing: bool = args.pop("align_interpolate_missing") + drop_non_aligned: bool = args.pop("drop_non_aligned") os.makedirs(output_dir, exist_ok=True) @@ -409,12 +425,14 @@ def cli(): labels = bundle.get_labels() align_dictionary = {c.lower(): i for i, c in enumerate(labels)} else: - print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n 
{torchaudio.pipelines.__all__}') + print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n\ + {torchaudio.pipelines.__all__}\n\ + See details here https://pytorch.org/audio/stable/pipelines.html#id14') raise ValueError(f'Align model "{align_model}" not found in torchaudio.pipelines') for audio_path in args.pop("audio"): result = transcribe(model, audio_path, temperature=temperature, **args) result_aligned = align(result["segments"], align_model, align_dictionary, audio_path, device, - extend_duration=align_extend, start_from_previous=align_from_prev) + extend_duration=align_extend, start_from_previous=align_from_prev, drop_non_aligned_words=drop_non_aligned) audio_basename = os.path.basename(audio_path) # save TXT @@ -432,6 +450,10 @@ def cli(): with open(os.path.join(output_dir, audio_basename + ".srt"), "w", encoding="utf-8") as srt: write_srt(result_aligned["segments"], file=srt) + # save per-word SRT + with open(os.path.join(output_dir, audio_basename + ".word.srt"), "w", encoding="utf-8") as srt: + write_srt(result_aligned["word_segments"], file=srt) + # save ASS with open(os.path.join(output_dir, audio_basename + ".ass"), "w", encoding="utf-8") as srt: write_ass(result_aligned["segments"], file=srt)