Merge pull request #440 from jim60105/main

chore(writer): Join words without spaces for ja, zh
2025-07-01 18:17:27 -04:00 · 2023-08-29 11:22:30 -06:00
parent adf455a97c 5223de2a41
commit 8c4a21b66d
2 changed files with 7 additions and 2 deletions
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -118,6 +118,7 @@ def cli():
                f"{model_name} is an English-only model but receipted '{args['language']}'; using English instead."
            )
        args["language"] = "en"
+    align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified

    temperature = args.pop("temperature")
    if (increment := args.pop("temperature_increment_on_fallback")) is not None:
@@ -174,7 +175,6 @@ def cli():
    if not no_align:
        tmp_results = results
        results = []
-        align_language = args["language"] if args["language"] is not None else "en" # default to loading english if not specified
        align_model, align_metadata = load_align_model(align_language, device, model_name=align_model)
        for result, audio_path in tmp_results:
            # >> Align
@@ -213,6 +213,7 @@ def cli():
            results.append((result, input_audio_path))
    # >> Write
    for result, audio_path in results:
+        result["language"] = align_language
        writer(result, audio_path, writer_args)

 if __name__ == "__main__":
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -123,6 +123,7 @@ TO_LANGUAGE_CODE = {
    "castilian": "es",
 }

+LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]

 system_encoding = sys.getdefaultencoding()

@@ -283,7 +284,10 @@ class SubtitlesWriter(ResultWriter):
                sstart, ssend, speaker = _[0]
                subtitle_start = self.format_timestamp(sstart)
                subtitle_end = self.format_timestamp(ssend)
-                subtitle_text = " ".join([word["word"] for word in subtitle])
+                if result["language"] in LANGUAGES_WITHOUT_SPACES:
+                    subtitle_text = "".join([word["word"] for word in subtitle])
+                else:
+                    subtitle_text = " ".join([word["word"] for word in subtitle])
                has_timing = any(["start" in word for word in subtitle])

                # add [$SPEAKER_ID]: to each subtitle if speaker is available