mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
Replace the regex substitution with a character filter based on the alignment model's dictionary
This allows the alignment method to be used for other languages, depending on which characters are present in the align_model dictionary.
This commit is contained in:
@ -7,7 +7,6 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
import torchaudio
|
import torchaudio
|
||||||
import tqdm
|
import tqdm
|
||||||
import re
|
|
||||||
from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram, load_audio
|
from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram, load_audio
|
||||||
from .alignment import get_trellis, backtrack, merge_repeats, merge_words
|
from .alignment import get_trellis, backtrack, merge_repeats, merge_words
|
||||||
from .decoding import DecodingOptions, DecodingResult
|
from .decoding import DecodingOptions, DecodingResult
|
||||||
@ -256,7 +255,6 @@ def align(
|
|||||||
device: str,
|
device: str,
|
||||||
extend_duration: float = 0.0,
|
extend_duration: float = 0.0,
|
||||||
start_from_previous: bool = True,
|
start_from_previous: bool = True,
|
||||||
resolution: str = "phrase",
|
|
||||||
):
|
):
|
||||||
print("Performing alignment...")
|
print("Performing alignment...")
|
||||||
if not torch.is_tensor(audio):
|
if not torch.is_tensor(audio):
|
||||||
@ -287,7 +285,7 @@ def align(
|
|||||||
|
|
||||||
transcription = segment['text'].strip()
|
transcription = segment['text'].strip()
|
||||||
t_words = transcription.split(' ')
|
t_words = transcription.split(' ')
|
||||||
t_words_clean = [re.sub(r"[^a-zA-Z' ]", "", x) for x in t_words]
|
t_words_clean = [''.join([w for w in word if w in model_dictionary.keys()]) for word in t_words]
|
||||||
t_words_nonempty = [x for x in t_words_clean if x != ""]
|
t_words_nonempty = [x for x in t_words_clean if x != ""]
|
||||||
t_words_nonempty_idx = [x for x in range(len(t_words_clean)) if t_words_clean[x] != ""]
|
t_words_nonempty_idx = [x for x in range(len(t_words_clean)) if t_words_clean[x] != ""]
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user