mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
handle negative / tiny duration segments, final
Changed files: README.md (15 changed lines)
````diff
@@ -29,7 +29,7 @@
 
 <h6 align="center">Made by Max Bain • :globe_with_meridians: <a href="https://www.maxbain.com">https://www.maxbain.com</a></h6>
 
-<img width="1216" align="center" alt="whisperx-arch" src="https://user-images.githubusercontent.com/36994049/208313881-903ab3ea-4932-45fd-b3dc-70876cddaaa2.png">
+<img width="1216" align="center" alt="whisperx-arch" src="https://user-images.githubusercontent.com/36994049/211200186-8b779e26-0bfd-4127-aee2-5a9238b95e1f.png">
 
 
 <p align="left">Whisper-Based Automatic Speech Recognition (ASR) with improved timestamp accuracy using forced alignment.
````
````diff
@@ -64,6 +64,7 @@ $ cd whisperX
 $ pip install -e .
 ```
 
+
 You may also need to install ffmpeg, rust etc. Follow openAI instructions here https://github.com/openai/whisper#setup.
 
 <h2 align="left" id="example">Usage 💬 (command line)</h2>
````
````diff
@@ -101,7 +102,7 @@ Currently default models provided for `{en, fr, de, es, it, ja, zh, nl, uk}`. If
 https://user-images.githubusercontent.com/36994049/208298811-e36002ba-3698-4731-97d4-0aebd07e0eb3.mov
 
 
-See more exac
+See more examples in other languages [here](EXAMPLES.md).
 
 ## Python usage 🐍
 
````
````diff
@@ -127,6 +128,16 @@ print(result_aligned["segments"]) # after alignment
 print(result_aligned["word_segments"]) # after alignment
 ```
 
+
+<h2 align="left" id="whisper-mod">Whisper Modifications</h2>
+
+In addition to forced alignment, the following two modifications have been made to the whisper transcription method:
+
+1. `--condition_on_prev_text` is set to `False` by default (reduces hallucination)
+
+2. Clamping segment `end_time` to be at least 0.02s (one time precision) later than `start_time` (prevents segments with negative duration)
+
+
 <h2 align="left" id="limitations">Limitations ⚠️</h2>
 
 - Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know the results on your data
````
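The README section this hunk extends documents a two-stage pipeline: transcribe with Whisper, then refine the timestamps by forced alignment. A minimal usage sketch of that flow, assuming package-level `whisperx.load_model`, `whisperx.load_align_model`, and `whisperx.align` entry points matching the functions visible in the hunks below (the model name and audio path are illustrative):

```python
import whisperx

device = "cuda"
audio_file = "examples/sample01.wav"  # illustrative path

# 1. transcribe with the modified Whisper (condition_on_prev_text defaults to False)
model = whisperx.load_model("medium.en", device)
result = model.transcribe(audio_file)

# 2. load an alignment model for the detected language
align_model, align_metadata = whisperx.load_align_model(result["language"], device)

# 3. refine segment timestamps with forced alignment
result_aligned = whisperx.align(result["segments"], align_model, align_metadata,
                                audio_file, device)

print(result_aligned["segments"])       # after alignment
print(result_aligned["word_segments"])  # after alignment
```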
````diff
@@ -223,6 +223,10 @@ def transcribe(
                     end_timestamp_position = (
                         sliced_tokens[-1].item() - tokenizer.timestamp_begin
                     )
+
+                    # clamp end-time to at least be 1 frame after start-time
+                    end_timestamp_position = max(end_timestamp_position, start_timestamp_position + time_precision)
+
                     add_segment(
                         start=timestamp_offset + start_timestamp_position * time_precision,
                         end=timestamp_offset + end_timestamp_position * time_precision,
````
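As the README hunk above puts it, the rule is simply that `end_time` must land at least 0.02s (one time precision) after `start_time`. A seconds-level sketch of that rule (the function name is illustrative; the commit itself applies the `max` to timestamp-token positions before they are scaled by `time_precision`):

```python
TIME_PRECISION = 0.02  # seconds, Whisper's timestamp-token resolution

def clamp_segment_end(start: float, end: float) -> float:
    """Return an end time at least one time-precision step after start."""
    return max(end, start + TIME_PRECISION)

# a decoder glitch that yields end <= start is repaired instead of
# producing a zero- or negative-duration segment
print(clamp_segment_end(3.48, 3.46))  # ~3.50 (repaired)
print(clamp_segment_end(3.48, 4.00))  # 4.0 (well-formed segments pass through)
```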
````diff
@@ -291,28 +295,27 @@ def align(
     prev_t2 = 0
     word_segments_list = []
     for idx, segment in enumerate(transcript):
-        if int(segment['start'] * SAMPLE_RATE) >= audio.shape[1]:
-            print("Failed to align segment: original start time longer than audio duration, skipping...")
-            continue
-
-        if int(segment['start']) >= int(segment['end']):
-            print("Failed to align segment: original end time is not after start time, skipping...")
-            continue
-
+        # first we pad
         t1 = max(segment['start'] - extend_duration, 0)
         t2 = min(segment['end'] + extend_duration, MAX_DURATION)
 
+        # use prev_t2 as current t1 if it's later
         if start_from_previous and t1 < prev_t2:
             t1 = prev_t2
 
+        # check if timestamp range is still valid
+        if t1 >= MAX_DURATION:
+            print("Failed to align segment: original start time longer than audio duration, skipping...")
+            continue
+        if t2 - t1 < 0.02:
+            print("Failed to align segment: duration smaller than 0.02s time precision")
+            continue
+
         f1 = int(t1 * SAMPLE_RATE)
         f2 = int(t2 * SAMPLE_RATE)
 
         waveform_segment = audio[:, f1:f2]
 
-        if waveform_segment.shape[1] < 10:
-            print("Failed to align segment: too short in duration, %.3f" % waveform_segment.shape[1]/SAMPLE_RATE)
-            continue
         with torch.inference_mode():
             if model_type == "torchaudio":
                 emissions, _ = model(waveform_segment.to(device))
````
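Note the reordering: padding and the `prev_t2` snap now run before any validity check, so the checks see the window that will actually be aligned rather than the raw segment times. The same logic condensed into a standalone helper (a sketch; the name, return convention, and `extend_duration` default are illustrative):

```python
def segment_window(start: float, end: float, max_duration: float, prev_t2: float,
                   extend_duration: float = 2.0, start_from_previous: bool = True):
    """Compute the padded alignment window for a segment, or None if it is unusable."""
    # first we pad
    t1 = max(start - extend_duration, 0)
    t2 = min(end + extend_duration, max_duration)

    # use prev_t2 as current t1 if it's later
    if start_from_previous and t1 < prev_t2:
        t1 = prev_t2

    # check if the timestamp range is still valid
    if t1 >= max_duration:  # window starts at or past the end of the audio
        return None
    if t2 - t1 < 0.02:      # shorter than one time-precision step
        return None
    return t1, t2

print(segment_window(5.0, 5.5, max_duration=30.0, prev_t2=0.0))   # (3.0, 7.5)
print(segment_window(40.0, 40.2, max_duration=30.0, prev_t2=0.0)) # None: past end of audio
```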
````diff
@@ -321,6 +324,7 @@ def align(
             else:
                 raise NotImplementedError(f"Align model of type {model_type} not supported.")
             emissions = torch.log_softmax(emissions, dim=-1)
+
         emission = emissions[0].cpu().detach()
         transcription = segment['text'].strip()
         if model_lang not in LANGUAGES_WITHOUT_SPACES:
````
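In the `model_type == "torchaudio"` branch shown here, the emissions come from a wav2vec2 model and are converted to per-frame log-probabilities for forced alignment. A self-contained sketch using a stock torchaudio pipeline (the bundle choice and file path are illustrative, not the repo's defaults):

```python
import torch
import torchaudio

bundle = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H  # illustrative bundle
model = bundle.get_model()

waveform, sr = torchaudio.load("audio.wav")  # assumed to match bundle.sample_rate (16 kHz)
with torch.inference_mode():
    emissions, _ = model(waveform)                    # (batch, frames, vocab)
    emissions = torch.log_softmax(emissions, dim=-1)  # per-frame log-probabilities
emission = emissions[0].cpu().detach()                # single-item batch -> (frames, vocab)
print(emission.shape)
```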
````diff
@@ -519,6 +523,7 @@ def cli():
             print(f"New language found ({result['language']})! Previous was ({align_metadata['language']}), loading new alignment model for new language...")
             align_model, align_metadata = load_align_model(result["language"], device)
 
+        print("Performing alignment...")
         result_aligned = align(result["segments"], align_model, align_metadata, audio_path, device,
                             extend_duration=align_extend, start_from_previous=align_from_prev, drop_non_aligned_words=drop_non_aligned)
         audio_basename = os.path.basename(audio_path)
````
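The keyword arguments forwarded from the CLI map onto the windowing behavior in the `def align(` hunks above. An annotated version of the same call (the default values shown are assumptions inferred from the flag names, not read from the argument parser):

```python
result_aligned = align(
    result["segments"],           # whisper segments to refine
    align_model, align_metadata,  # alignment model plus its language/dictionary metadata
    audio_path, device,
    extend_duration=2.0,          # seconds of padding added around each segment window
    start_from_previous=True,     # snap t1 forward to the previous segment's t2 when later
    drop_non_aligned_words=False, # whether to discard words that fail to align
)
print(result_aligned["word_segments"])  # word-level timestamps after alignment
```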