mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
Do not split sentences after abbreviations such as "Mr.", "Mrs.", "Dr.", etc.
This commit is contained in:
@ -15,6 +15,9 @@ from .audio import SAMPLE_RATE, load_audio
|
|||||||
from .utils import interpolate_nans
|
from .utils import interpolate_nans
|
||||||
from .types import AlignedTranscriptionResult, SingleSegment, SingleAlignedSegment, SingleWordSegment
|
from .types import AlignedTranscriptionResult, SingleSegment, SingleAlignedSegment, SingleWordSegment
|
||||||
import nltk
|
import nltk
|
||||||
|
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
|
||||||
|
|
||||||
|
PUNKT_ABBREVIATIONS = ['dr', 'vs', 'mr', 'mrs', 'prof']
|
||||||
|
|
||||||
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
|
LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]
|
||||||
|
|
||||||
@ -143,7 +146,11 @@ def align(
|
|||||||
if any([c in model_dictionary.keys() for c in wrd]):
|
if any([c in model_dictionary.keys() for c in wrd]):
|
||||||
clean_wdx.append(wdx)
|
clean_wdx.append(wdx)
|
||||||
|
|
||||||
sentence_spans = list(nltk.tokenize.punkt.PunktSentenceTokenizer().span_tokenize(text))
|
|
||||||
|
punkt_param = PunktParameters()
|
||||||
|
punkt_param.abbrev_types = set(PUNKT_ABBREVIATIONS)
|
||||||
|
sentence_splitter = PunktSentenceTokenizer(punkt_param)
|
||||||
|
sentence_spans = list(sentence_splitter.span_tokenize(text))
|
||||||
|
|
||||||
segment["clean_char"] = clean_char
|
segment["clean_char"] = clean_char
|
||||||
segment["clean_cdx"] = clean_cdx
|
segment["clean_cdx"] = clean_cdx
|
||||||
|
Reference in New Issue
Block a user