Fix issue by resetting the tokenizer after each transcription when no preset language is set

This commit is contained in:
briguetjo
2023-07-29 18:56:33 +02:00
parent 9d736dca1c
commit 864976af23

View File

@@ -258,10 +258,7 @@ class FasterWhisperPipeline(Pipeline):
vad_segments = self.vad_model({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": SAMPLE_RATE})
vad_segments = merge_chunks(vad_segments, 30)
if self.preset_language is None or self.preset_language != language:
if self.preset_language is not None and language is not None and self.preset_language != language:
print(f"Preset language '{self.preset_language}' is different from the language {language} passed to the transcribe method.")
print(f"Overriding preset language with {language}.")
if self.tokenizer is None:
language = language or self.detect_language(audio)
task = task or "transcribe"
self.tokenizer = faster_whisper.tokenizer.Tokenizer(self.model.hf_tokenizer,
@@ -289,6 +286,9 @@ class FasterWhisperPipeline(Pipeline):
}
)
if self.preset_language is None:
self.tokenizer = None
return {"segments": segments, "language": language}