6 Commits

3 changed files with 14 additions and 5 deletions

View File

@@ -450,8 +450,8 @@ def align(
                 "end": srow["end"],
                 "text": text,
                 "words": word_list,
-                # "word-segments": wseg,
-                # "char-segments": cseg
+                "word-segments": wseg,
+                "char-segments": cseg
             }
         )
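
This hunk re-enables the per-segment alignment tables, so each aligned segment now carries "word-segments" and "char-segments" alongside the plain "words" list. A rough sketch of how a caller might inspect the extra keys, assuming `result` is the dict returned by the alignment step and that the tables are pandas DataFrames (as the CLI change further down implies):

# Sketch only: dump the re-enabled alignment tables for each segment.
def show_alignment_tables(result: dict) -> None:
    for seg in result["segments"]:
        print(seg["text"])
        print(seg["word-segments"])   # per-word timing table (assumed pandas DataFrame)
        print(seg["char-segments"])   # per-character timing table (assumed pandas DataFrame)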

View File

@@ -207,7 +207,7 @@ class FasterWhisperPipeline(Pipeline):
         return final_iterator
     def transcribe(
-        self, audio: Union[str, np.ndarray], batch_size=None
+        self, audio: Union[str, np.ndarray], batch_size=None, num_workers=0
     ):
         if isinstance(audio, str):
             audio = load_audio(audio)
@@ -232,7 +232,7 @@ class FasterWhisperPipeline(Pipeline):
         segments = []
         batch_size = batch_size or self._batch_size
-        for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size)):
+        for idx, out in enumerate(self.__call__(data(audio, vad_segments), batch_size=batch_size, num_workers=num_workers)):
             text = out['text']
             if batch_size in [0, 1, None]:
                 text = text[0]
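
With these two hunks, transcribe() accepts num_workers and simply forwards it to the underlying pipeline __call__, so batched decoding can use parallel data-loading workers. A usage sketch; whisperx.load_model and whisperx.load_audio are assumed to behave as in the project README, and the model name, paths, and numbers are illustrative only:

import whisperx

# Assumed entry points; exact signatures may differ between versions.
model = whisperx.load_model("large-v2", device="cuda")
audio = whisperx.load_audio("audio.mp3")

# batch_size groups 30 s chunks per forward pass; num_workers is the new
# knob forwarded to the pipeline's data loader (0 keeps the old behaviour).
result = model.transcribe(audio, batch_size=16, num_workers=2)
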
@@ -251,7 +251,10 @@ class FasterWhisperPipeline(Pipeline):
     def detect_language(self, audio: np.ndarray):
-        segment = log_mel_spectrogram(audio[: N_SAMPLES], padding=0)
+        if audio.shape[0] < N_SAMPLES:
+            print("Warning: audio is shorter than 30s, language detection may be inaccurate.")
+        segment = log_mel_spectrogram(audio[: N_SAMPLES],
+                                      padding=0 if audio.shape[0] >= N_SAMPLES else N_SAMPLES - audio.shape[0])
         encoder_output = self.model.encode(segment)
         results = self.model.model.detect_language(encoder_output)
         language_token, language_probability = results[0][0]
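
detect_language() previously always called log_mel_spectrogram with padding=0, so clips shorter than the 30 s detection window produced a truncated mel segment; the change pads short clips out to N_SAMPLES and prints a warning. A small sketch of the padding arithmetic, assuming the usual Whisper constants (SAMPLE_RATE = 16000, N_SAMPLES = 30 * SAMPLE_RATE):

SAMPLE_RATE = 16_000
N_SAMPLES = 30 * SAMPLE_RATE                 # 480_000 samples = one 30 s window

def detection_padding(n_samples: int) -> int:
    # Samples of padding needed so the log-mel segment always spans 30 s.
    return 0 if n_samples >= N_SAMPLES else N_SAMPLES - n_samples

print(detection_padding(10 * SAMPLE_RATE))   # 320000: a 10 s clip is padded by 20 s
print(detection_padding(45 * SAMPLE_RATE))   # 0: long audio is only cropped to 30 s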

View File

@@ -203,6 +203,12 @@ def cli():
     # >> Write
     for result, audio_path in results:
+        # Remove pandas dataframes from result so that
+        # we can serialize the result with json
+        for seg in result["segments"]:
+            seg.pop("word-segments", None)
+            seg.pop("char-segments", None)
         writer(result, audio_path, writer_args)
 if __name__ == "__main__":
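
The keys popped here are the DataFrames re-enabled in the first file; json cannot serialize them, so the CLI drops them before handing the result to the writer. A minimal illustration of the failure and the fix (the json.dumps call is only a stand-in for whatever the writer does internally):

import json
import pandas as pd

seg = {"text": "hello world",
       "word-segments": pd.DataFrame({"word": ["hello", "world"]})}

try:
    json.dumps(seg)
except TypeError as err:
    print(err)                        # DataFrame is not JSON serializable

seg.pop("word-segments", None)
seg.pop("char-segments", None)        # missing key is fine thanks to the default
print(json.dumps(seg))                # serializes cleanly now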