add translate, fix word_timestamp error

Max Bain
2023-05-13 12:14:06 +01:00
parent 4603f010a5
commit fd8f1003cf
5 changed files with 17 additions and 8 deletions

README.md

@@ -32,12 +32,12 @@
 <!-- <h2 align="left", id="what-is-it">What is it 🔎</h2> -->
-This repository provides fast automatic speaker recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.
+This repository provides fast automatic speech recognition (70x realtime with large-v2) with word-level timestamps and speaker diarization.
 - ⚡️ Batched inference for 70x realtime transcription using whisper large-v2
 - 🪶 [faster-whisper](https://github.com/guillaumekln/faster-whisper) backend, requires <8GB gpu memory for large-v2 with beam_size=5
 - 🎯 Accurate word-level timestamps using wav2vec2 alignment
-- 👯 Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (labels each segment/word with speaker ID)
+- 👯 Multispeaker ASR using speaker diarization from [pyannote-audio](https://github.com/pyannote/pyannote-audio) (speaker ID labels)
 - 🗣 VAD preprocessing, reduces hallucination & batching with no WER degradation
@@ -75,9 +75,9 @@ GPU execution requires the NVIDIA libraries cuBLAS 11.x and cuDNN 8.x to be inst
 ### 2. Install PyTorch2.0, e.g. for Linux and Windows CUDA11.7:
-`pip3 install torch torchvision torchaudio`
+`conda install pytorch==2.0.0 torchvision==0.15.0 torchaudio==2.0.0 pytorch-cuda=11.7 -c pytorch -c nvidia`
-See other methods [here.](https://pytorch.org/get-started/locally/)
+See other methods [here.](https://pytorch.org/get-started/previous-versions/#v200)
 ### 3. Install this repo
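A quick way to confirm the pinned install above before running WhisperX on GPU; a minimal check, assuming the conda command succeeded on a machine with a CUDA 11.7 driver:

```python
import torch

# The pinned build should report 2.0.0 and see the GPU.
print(torch.__version__)          # expected: 2.0.0
print(torch.cuda.is_available())  # expected: True on a working CUDA 11.7 setup
```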

setup.py

@@ -6,7 +6,7 @@ from setuptools import setup, find_packages
 setup(
     name="whisperx",
     py_modules=["whisperx"],
-    version="3.1.0",
+    version="3.1.1",
     description="Time-Accurate Automatic Speech Recognition using Whisper.",
     readme="README.md",
     python_requires=">=3.8",

whisperx/alignment.py

@@ -259,6 +259,10 @@ def align(
             word_text = "".join(word_chars["char"].tolist()).strip()
             if len(word_text) == 0:
                 continue
+            # dont use space character for alignment
+            word_chars = word_chars[word_chars["char"] != " "]
             word_start = word_chars["start"].min()
             word_end = word_chars["end"].max()
             word_score = round(word_chars["score"].mean(), 3)
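The added filter drops the whitespace row from the per-word character table before the word's start, end, and score are computed, so a space's placeholder timing and zero score no longer skew the word timestamp. A minimal sketch of the effect, with a hypothetical DataFrame that follows the layout implied by the hunk (one row per aligned character with char/start/end/score columns):

```python
import pandas as pd

# Hypothetical per-word character table; the leading space carries no real timing.
word_chars = pd.DataFrame({
    "char":  [" ", "h", "i"],
    "start": [0.00, 0.40, 0.55],
    "end":   [0.05, 0.50, 0.70],
    "score": [0.00, 0.95, 0.90],
})

word_text = "".join(word_chars["char"].tolist()).strip()
if len(word_text) > 0:
    # drop the space row, mirroring the lines added above
    word_chars = word_chars[word_chars["char"] != " "]
    word_start = word_chars["start"].min()              # 0.40 rather than 0.00
    word_end = word_chars["end"].max()                  # 0.70
    word_score = round(word_chars["score"].mean(), 3)   # 0.925, space score excluded
```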

whisperx/asr.py

@@ -14,7 +14,7 @@ from .vad import load_vad_model, merge_chunks
 def load_model(whisper_arch, device, compute_type="float16", asr_options=None, language=None,
-               vad_options=None, model=None):
+               vad_options=None, model=None, task="transcribe"):
     '''Load a Whisper model for inference.
     Args:
         whisper_arch: str - The name of the Whisper model to load.
@@ -31,7 +31,7 @@ def load_model(whisper_arch, device, compute_type="float16", asr_options=None, l
     model = WhisperModel(whisper_arch, device=device, compute_type=compute_type)
     if language is not None:
-        tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task="transcribe", language=language)
+        tokenizer = faster_whisper.tokenizer.Tokenizer(model.hf_tokenizer, model.model.is_multilingual, task=task, language=language)
     else:
         print("No language specified, language will be first be detected for each audio file (increases inference time).")
         tokenizer = None
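With `task` now threaded through to the faster-whisper tokenizer, translation can be requested when the model is loaded rather than being fixed to transcription. A hypothetical call, assuming `whisperx.load_model` re-exports the function shown above:

```python
import whisperx

# Builds the tokenizer with task="translate" instead of the previously hardcoded "transcribe".
model = whisperx.load_model("large-v2", device="cuda", compute_type="float16",
                            language="fr", task="translate")
```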

whisperx/transcribe.py

@@ -86,6 +86,11 @@ def cli():
     align_model: str = args.pop("align_model")
     interpolate_method: str = args.pop("interpolate_method")
     no_align: bool = args.pop("no_align")
+    task : str = args.pop("task")
+    if task == "translate":
+        # translation cannot be aligned
+        no_align = True
     return_char_alignments: bool = args.pop("return_char_alignments")
     hf_token: str = args.pop("hf_token")
@@ -139,7 +144,7 @@ def cli():
     results = []
     tmp_results = []
     # model = load_model(model_name, device=device, download_root=model_dir)
-    model = load_model(model_name, device=device, compute_type=compute_type, language=args['language'], asr_options=asr_options, vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset},)
+    model = load_model(model_name, device=device, compute_type=compute_type, language=args['language'], asr_options=asr_options, vad_options={"vad_onset": vad_onset, "vad_offset": vad_offset}, task=task)
     for audio_path in args.pop("audio"):
         audio = load_audio(audio_path)
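Taken together, the CLI changes amount to the flow sketched below: load the model with the requested task, run it over each audio file, and skip wav2vec2 alignment when translating, since the English output no longer matches the source-language audio. This is a rough sketch, assuming the package-level helpers `whisperx.load_audio`, `whisperx.load_align_model`, and `whisperx.align` keep their usual signatures:

```python
import whisperx

device = "cuda"
task = "translate"

# Load the ASR model with the requested task (see the asr.py change above).
model = whisperx.load_model("large-v2", device, compute_type="float16", task=task)

# Translation cannot be aligned, so the CLI forces alignment off for this task.
no_align = task == "translate"

audio = whisperx.load_audio("audio.mp3")
result = model.transcribe(audio)

if not no_align:
    align_model, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], align_model, metadata, audio, device)
```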