diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index 6a2dcb6..162eb0d 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -118,6 +118,7 @@ def cli(): f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead." ) args["language"] = "en" + align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified temperature = args.pop("temperature") if (increment := args.pop("temperature_increment_on_fallback")) is not None: @@ -174,7 +175,6 @@ def cli(): if not no_align: tmp_results = results results = [] - align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified align_model, align_metadata = load_align_model(align_language, device, model_name=align_model) for result, audio_path in tmp_results: # >> Align @@ -213,6 +213,7 @@ def cli(): results.append((result, input_audio_path)) # >> Write for result, audio_path in results: + result["language"] = align_language writer(result, audio_path, writer_args) if __name__ == "__main__": diff --git a/whisperx/utils.py b/whisperx/utils.py index 86beea4..37792d3 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -123,6 +123,7 @@ TO_LANGUAGE_CODE = { "castilian": "es", } +LANGUAGES_WITHOUT_SPACES = ["ja", "zh"] system_encoding = sys.getdefaultencoding() @@ -283,7 +284,10 @@ class SubtitlesWriter(ResultWriter): sstart, ssend, speaker = _[0] subtitle_start = self.format_timestamp(sstart) subtitle_end = self.format_timestamp(ssend) - subtitle_text = " ".join([word["word"] for word in subtitle]) + if result["language"] in LANGUAGES_WITHOUT_SPACES: + subtitle_text = "".join([word["word"] for word in subtitle]) + else: + subtitle_text = " ".join([word["word"] for word in subtitle]) has_timing = any(["start" in word for word in subtitle]) # add [$SPEAKER_ID]: to each subtitle if speaker is available