Do not split sentences on abbreviations such as Mr., Mrs., Dr., ...

2025-07-01 18:17:27 -04:00 · 2023-05-29 12:48:14 +01:00
parent 1c528d1a3c
commit 4cbd3030cc
1 changed file with 8 additions and 1 deletion
--- a/whisperx/alignment.py
+++ b/whisperx/alignment.py
@@ -15,6 +15,9 @@ from .audio import SAMPLE_RATE, load_audio
 from .utils import interpolate_nans
 from .types import AlignedTranscriptionResult, SingleSegment, SingleAlignedSegment, SingleWordSegment
 import nltk
+from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
+
+PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof']

 LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]

@@ -143,7 +146,11 @@ def align(
            if any([c in model_dictionary.keys() for c in wrd]):
                clean_wdx.append(wdx)

-        sentence_spans = list(nltk.tokenize.punkt.PunktSentenceTokenizer().span_tokenize(text))
+                
+        punkt_param = PunktParameters()
+        punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
+        sentence_splitter = PunktSentenceTokenizer(punkt_param)
+        sentence_spans = list(sentence_splitter.span_tokenize(text))

        segment["clean_char"] = clean_char
        segment["clean_cdx"] = clean_cdx