chore(writer): Join words without spaces for ja, zh

fix #248, fix #310
2025-07-01 18:17:27 -04:00 · 2023-08-26 06:48:35 +08:00
parent adf455a97c
commit f505702dc7
2 changed files with 6 additions and 1 deletion
--- a/whisperx/transcribe.py
+++ b/whisperx/transcribe.py
@@ -213,6 +213,7 @@ def cli():
            results.append((result, input_audio_path))
    # >> Write
    for result, audio_path in results:
+        result["language"] = align_language
        writer(result, audio_path, writer_args)

 if __name__ == "__main__":
--- a/whisperx/utils.py
+++ b/whisperx/utils.py
@@ -123,6 +123,7 @@ TO_LANGUAGE_CODE = {
    "castilian": "es",
 }

+LANGUAGES_WITHOUT_SPACES = ["ja", "zh"]

 system_encoding = sys.getdefaultencoding()

@@ -283,7 +284,10 @@ class SubtitlesWriter(ResultWriter):
                sstart, ssend, speaker = _[0]
                subtitle_start = self.format_timestamp(sstart)
                subtitle_end = self.format_timestamp(ssend)
-                subtitle_text = " ".join([word["word"] for word in subtitle])
+                if result["language"] in LANGUAGES_WITHOUT_SPACES:
+                    subtitle_text = "".join([word["word"] for word in subtitle])
+                else:
+                    subtitle_text = " ".join([word["word"] for word in subtitle])
                has_timing = any(["start" in word for word in subtitle])

                # add [$SPEAKER_ID]: to each subtitle if speaker is available