From b47bb769a071de8344f46e47c80f1ad11fa1ed4c Mon Sep 17 00:00:00 2001
From: m-bain <36994049+m-bain@users.noreply.github.com>
Date: Sat, 17 Dec 2022 15:04:08 +0000
Subject: [PATCH] change regex sub to align_dictionary sub

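Words are now cleaned by keeping only the characters present in the align_model dictionary, instead of stripping everything outside a hard-coded `[a-zA-Z' ]` regex. This allows the alignment method to be used for other languages, depending on which characters the align_model dictionary contains.
Also removes the `re` import and the `resolution` argument of `align()`.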
---
 whisperx/transcribe.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py
index e2dc76b..297d31e 100644
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -7,7 +7,6 @@ import numpy as np
 import torch
 import torchaudio
 import tqdm
-import re
 from .audio import SAMPLE_RATE, N_FRAMES, HOP_LENGTH, pad_or_trim, log_mel_spectrogram, load_audio
 from .alignment import get_trellis, backtrack, merge_repeats, merge_words
 from .decoding import DecodingOptions, DecodingResult
@@ -256,7 +255,6 @@ def align(
     device: str,
     extend_duration: float = 0.0,
     start_from_previous: bool = True,
-    resolution: str = "phrase",
 ):
     print("Performing alignment...")
     if not torch.is_tensor(audio):
@@ -287,7 +285,7 @@ def align(
 
         transcription = segment['text'].strip()
         t_words = transcription.split(' ')
-        t_words_clean = [re.sub(r"[^a-zA-Z' ]", "", x) for x in t_words]
+        t_words_clean = [''.join([w for w in word if w in model_dictionary.keys()]) for word in t_words]
         t_words_nonempty = [x for x in t_words_clean if x != ""]
         t_words_nonempty_idx = [x for x in range(len(t_words_clean)) if t_words_clean[x] != ""]