From 5d7c3b521ca09b4ed7e22fb00e54ec84a22177c6 Mon Sep 17 00:00:00 2001
From: Yasutaka Odo
Date: Tue, 20 Dec 2022 22:29:18 +0900
Subject: [PATCH] fix error message

---
 README.md              |  5 +++++
 whisperx/transcribe.py | 16 ++++++++--------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index f6fe6c7..80d7804 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,11 @@ https://user-images.githubusercontent.com/36994049/208298819-6f462b2c-8cae-4c54-
 
 `whisperx --model large --language ja examples/sample_ja_01.mp3 --align_model wav2vec2-large-xlsr-53-japanese --output_dir examples/whisperx --align_extend 2`
+
+
+https://user-images.githubusercontent.com/19920981/208448405-60f80c0e-2715-42d8-9437-e19e6362b638.mov
+
+
 
 Limitations ⚠️
 
 - Not thoroughly tested, especially for non-english, results may vary -- please post issue to let me know its results on your data
diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index 3e699a5..d02b3f7 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -20,7 +20,7 @@ from .utils import (exact_div, format_timestamp, optional_float, optional_int,
 if TYPE_CHECKING:
     from .model import Whisper
 
-wa2vec2_on_hugginface = ["wav2vec2-large-xlsr-53-japanese"]
+wa2vec2_models_on_hugginface = ["jonatasgrosman/wav2vec2-large-xlsr-53-japanese"]
 
 def transcribe(
     model: "Whisper",
@@ -320,7 +320,7 @@ def align(
             segment['start'] = t1_actual
             segment['end'] = t2_actual
-            prev_t2 = segment['end'] 
+            prev_t2 = segment['end']
 
 
 
             # merge missing words to previous, or merge with next word ahead if idx == 0
@@ -417,19 +417,19 @@ def cli():
         align_model = bundle.get_model().to(device)
         labels = bundle.get_labels()
         align_dictionary = {c.lower(): i for i, c in enumerate(labels)}
-    elif align_model == "wav2vec2-large-xlsr-53-japanese":
-        processor = AutoProcessor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-japanese")
-        align_model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-japanese")
+    elif align_model in wa2vec2_models_on_hugginface:
+        processor = AutoProcessor.from_pretrained(align_model)
+        align_model = Wav2Vec2ForCTC.from_pretrained(align_model).to(device)
         align_model.to(device)
         labels = processor.tokenizer.get_vocab()
         align_dictionary = processor.tokenizer.get_vocab()
     else:
-        print(f'Align model "{align_model}" not found in torchaudio.pipelines, choose from:\n {torchaudio.pipelines.__all__}')
-        raise ValueError(f'Align model "{align_model}" not found in torchaudio.pipelines')
+        print(f'Align model "{align_model}" is not supported, choose from:\n {torchaudio.pipelines.__all__ + wa2vec2_models_on_hugginface}')
+        raise ValueError(f'Align model "{align_model}" not supported')
 
     for audio_path in args.pop("audio"):
         result = transcribe(model, audio_path, temperature=temperature, **args)
         result_aligned = align(result["segments"], result["language"], align_model, align_dictionary, audio_path, device,
-                extend_duration=align_extend, start_from_previous=align_from_prev) 
+                extend_duration=align_extend, start_from_previous=align_from_prev)
         audio_basename = os.path.basename(audio_path)
         # save TXT
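
For review context, the model-resolution logic this patch settles on can be read as the standalone sketch below. The helper name `load_align_model` and its return shape are illustrative framing only, not part of the patch; the torchaudio and transformers calls are the same ones the diff uses.

```python
# Minimal sketch of the align-model resolution in cli(), assuming torchaudio
# and transformers are installed. `load_align_model` is a hypothetical helper
# used here for illustration; the patch keeps this logic inline in cli().
import torchaudio
from transformers import AutoProcessor, Wav2Vec2ForCTC

wa2vec2_models_on_hugginface = ["jonatasgrosman/wav2vec2-large-xlsr-53-japanese"]

def load_align_model(align_model: str, device: str = "cpu"):
    """Resolve an --align_model name to a CTC model and its label dictionary."""
    if align_model in torchaudio.pipelines.__all__:
        # torchaudio pipeline: labels come from the bundle itself
        bundle = getattr(torchaudio.pipelines, align_model)
        model = bundle.get_model().to(device)
        align_dictionary = {c.lower(): i for i, c in enumerate(bundle.get_labels())}
    elif align_model in wa2vec2_models_on_hugginface:
        # Hugging Face checkpoint: labels come from the tokenizer vocabulary
        processor = AutoProcessor.from_pretrained(align_model)
        model = Wav2Vec2ForCTC.from_pretrained(align_model).to(device)
        align_dictionary = processor.tokenizer.get_vocab()
    else:
        # The fixed error message lists every supported choice, not just
        # the torchaudio pipelines
        supported = torchaudio.pipelines.__all__ + wa2vec2_models_on_hugginface
        raise ValueError(f'Align model "{align_model}" not supported, choose from:\n {supported}')
    return model, align_dictionary
```

Called as, for example, `model, align_dictionary = load_align_model("jonatasgrosman/wav2vec2-large-xlsr-53-japanese")`. Keeping the supported Hugging Face checkpoints in the single `wa2vec2_models_on_hugginface` list is what lets the corrected error message enumerate every valid `--align_model` value rather than only the torchaudio pipelines, and it also replaces the hardcoded model strings in the `elif` branch with the user-supplied name.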