From 2ca99ce90928b99c00e737a7afe8683b4ea39894 Mon Sep 17 00:00:00 2001
From: awerks <sustila24@gmail.com>
Date: Mon, 4 Sep 2023 21:49:34 +0200
Subject: [PATCH] A solution to long subtitles

Example usage:
subtitles_processor = SubtitlesProcessor(output["segments"], detected_language, max_line_length = 50, min_char_length_splitter = 35)
subtitles_processor.save("subtitles.srt", advanced_splitting = True)
---
 whisperx/SubtitlesProcessor.py | 227 +++++++++++++++++++++++++++++++++
 whisperx/conjunctions.py       |  43 +++++++
 2 files changed, 270 insertions(+)
 create mode 100644 whisperx/SubtitlesProcessor.py
 create mode 100644 whisperx/conjunctions.py

diff --git a/whisperx/SubtitlesProcessor.py b/whisperx/SubtitlesProcessor.py
new file mode 100644
index 0000000..5ffd1af
--- /dev/null
+++ b/whisperx/SubtitlesProcessor.py
@@ -0,0 +1,227 @@
+import math
+from conjunctions import get_conjunctions, get_comma
+from typing import TextIO
+
+def normal_round(n):
+    """Round *n* to the nearest integer, with ties (x.5) rounded up."""
+    lower = math.floor(n)
+    return lower if n - lower < 0.5 else math.ceil(n)
+
+
+def format_timestamp(seconds: float, is_vtt: bool = False):
+    """Render a non-negative time in seconds as ``HH:MM:SS,mmm`` (SRT style).
+
+    When *is_vtt* is true the millisecond separator is ``.`` instead of
+    ``,``. The hours field is always emitted, zero-padded to at least two
+    digits.
+    """
+    assert seconds >= 0, "non-negative timestamp expected"
+
+    # Work in whole milliseconds so rounding happens exactly once.
+    total_ms = round(seconds * 1000.0)
+    hours, remainder = divmod(total_ms, 3_600_000)
+    minutes, remainder = divmod(remainder, 60_000)
+    secs, millis = divmod(remainder, 1_000)
+
+    if is_vtt:
+        separator = '.'
+    else:
+        separator = ','
+    return f"{hours:02d}:{minutes:02d}:{secs:02d}{separator}{millis:03d}"
+
+
+
+class SubtitlesProcessor:
+    """Split transcription segments into subtitle cues and save them as SRT/VTT.
+
+    Args:
+        segments: dicts with 'start', 'end', 'text' and optionally 'words'
+            (word dicts holding 'word' and, when known, 'start'/'end').
+        lang: language code used to pick the comma and conjunction set.
+        max_line_length: character count at which a line is force-split.
+        min_char_length_splitter: minimum characters around a soft split.
+        is_vtt: emit WebVTT (header, '.' millisecond separator), not SRT.
+    """
+
+    def __init__(self, segments, lang, max_line_length = 45, min_char_length_splitter = 30, is_vtt = False):
+        self.comma = get_comma(lang)
+        self.conjunctions = set(get_conjunctions(lang))
+        self.segments = segments
+        self.lang = lang
+        self.max_line_length = max_line_length
+        self.min_char_length_splitter = min_char_length_splitter
+        self.is_vtt = is_vtt
+        complex_script_languages = ['th', 'lo', 'my', 'km', 'am', 'ko', 'ja', 'zh', 'ti', 'ta', 'te', 'kn', 'ml', 'hi', 'ne', 'mr', 'ar', 'fa', 'ur', 'ka']
+        # These languages get fixed limits (30/20), overriding the caller's values.
+        if self.lang in complex_script_languages:
+            self.max_line_length = 30
+            self.min_char_length_splitter = 20
+
+    @staticmethod
+    def _word_text(word):
+        """Return the text of *word*, which is a word dict or a plain string."""
+        return word['word'] if isinstance(word, dict) else word
+
+    @staticmethod
+    def _fragment_text(fragment_words, prefix):
+        """Join a non-empty run of words into the cue text."""
+        if isinstance(fragment_words[0], dict):
+            return prefix.join(word['word'] for word in fragment_words)
+        return prefix.join(fragment_words).strip()
+
+    def estimate_timestamp_for_word(self, words, i, next_segment_start_time=None):
+        """Fill in 'start' and 'end' of words[i] in place from its neighbours.
+
+        Uses the previous word's end and/or the next word's start if present,
+        else the next segment's start, ``k`` seconds per character, or 0.
+        """
+        k = 0.25
+        has_prev_end = i > 0 and 'end' in words[i - 1]
+        has_next_start = i < len(words) - 1 and 'start' in words[i + 1]
+        word = words[i]
+        if has_prev_end:
+            prev_end = words[i - 1]['end']
+            word['start'] = prev_end
+            if has_next_start:
+                word['end'] = words[i + 1]['start']
+            elif next_segment_start_time:
+                # Stop 0.5 s before the next segment unless it follows within 1 s.
+                close = next_segment_start_time - prev_end <= 1
+                word['end'] = next_segment_start_time if close else next_segment_start_time - 0.5
+            else:
+                word['end'] = word['start'] + len(word['word']) * k
+        elif has_next_start:
+            next_start = words[i + 1]['start']
+            word['start'] = next_start - len(word['word']) * k
+            word['end'] = next_start
+        elif next_segment_start_time:
+            word['start'] = next_segment_start_time - 1
+            word['end'] = next_segment_start_time - 0.5
+        else:
+            word['start'] = 0
+            word['end'] = 0
+
+    def process_segments(self, advanced_splitting=True):
+        """Return a list of {'start', 'end', 'text'} cues for all segments.
+
+        With *advanced_splitting* each segment is cut at its computed split
+        points; otherwise exactly one cue per segment is produced.
+        """
+        subtitles = []
+        for i, segment in enumerate(self.segments):
+            next_segment_start_time = self.segments[i + 1]['start'] if i + 1 < len(self.segments) else None
+            if advanced_splitting:
+                split_points = self.determine_advanced_split_points(segment, next_segment_start_time)
+                subtitles.extend(self.generate_subtitles_from_split_points(segment, split_points, next_segment_start_time))
+                continue
+            # A segment without 'words' is fine here; `j` avoids shadowing `i`.
+            words = segment.get('words', [])
+            for j, word in enumerate(words):
+                if 'start' not in word or 'end' not in word:
+                    self.estimate_timestamp_for_word(words, j, next_segment_start_time)
+            subtitles.append({'start': segment['start'], 'end': segment['end'], 'text': segment['text']})
+        return subtitles
+
+    def determine_advanced_split_points(self, segment, next_segment_start_time=None):
+        """Return the word indices after which *segment* should be split.
+
+        Splits go near the midpoint of a line that reached max_line_length,
+        after a word ending with the comma, or before a conjunction, subject
+        to min_char_length_splitter. Missing word timestamps are estimated.
+        """
+        split_points = []
+        last_split_point = 0
+        char_count = 0
+        words = segment.get('words', segment['text'].split())
+        add_space = 0 if self.lang in ['zh', 'ja'] else 1
+        # Dict and string words both cost text + separator, matching word_length.
+        lengths = [len(self._word_text(word)) + add_space for word in words]
+        char_count_after = sum(lengths)
+        for i, word in enumerate(words):
+            word_text = self._word_text(word)
+            word_length = lengths[i]
+            char_count_before = char_count
+            char_count += word_length
+            char_count_after -= word_length
+            enough_before = char_count_before >= self.min_char_length_splitter
+            enough_after = char_count_after >= self.min_char_length_splitter
+            if isinstance(word, dict) and ('start' not in word or 'end' not in word):
+                self.estimate_timestamp_for_word(words, i, next_segment_start_time)
+
+            if char_count >= self.max_line_length:
+                if enough_before:
+                    midpoint = normal_round((last_split_point + i) / 2)
+                    split_points.append(midpoint)
+                    last_split_point = midpoint + 1
+                    char_count = sum(lengths[last_split_point:i + 1])
+            elif word_text.endswith(self.comma) and enough_before and enough_after:
+                split_points.append(i)
+                last_split_point = i + 1
+                char_count = 0
+            elif word_text.lower() in self.conjunctions and enough_before and enough_after:
+                split_points.append(i - 1)
+                last_split_point = i
+                char_count = word_length
+        return split_points
+
+    def generate_subtitles_from_split_points(self, segment, split_points, next_start_time=None):
+        """Build the cues of *segment* by cutting its words after each split point.
+
+        Word dicts supply their own timestamps; plain-string words share the
+        segment duration in proportion to their word count. A cue's end is
+        moved to the following start time when the gap is at most 0.8 s.
+        """
+        subtitles = []
+        words = segment.get('words', segment['text'].split())
+        total_word_count = len(words)
+        total_time = segment['end'] - segment['start']
+        elapsed_time = segment['start']
+        prefix = ' ' if self.lang not in ['zh', 'ja'] else ''
+        start_idx = 0
+        for split_point in split_points:
+            if split_point < start_idx:  # empty fragment, e.g. a split index of -1
+                continue
+            fragment_words = words[start_idx:split_point + 1]
+            if isinstance(fragment_words[0], dict):
+                start_time = fragment_words[0]['start']
+                end_time = fragment_words[-1]['end']
+                next_start_time_for_word = words[split_point + 1]['start'] if split_point + 1 < len(words) else None
+                if next_start_time_for_word and (next_start_time_for_word - end_time) <= 0.8:
+                    end_time = next_start_time_for_word
+            else:
+                current_duration = (len(fragment_words) / total_word_count) * total_time
+                start_time = elapsed_time
+                end_time = elapsed_time + current_duration
+                elapsed_time += current_duration
+            subtitles.append({'start': start_time, 'end': end_time, 'text': self._fragment_text(fragment_words, prefix)})
+            start_idx = split_point + 1
+        # Whatever follows the last split point forms the final cue.
+        if start_idx < len(words):
+            fragment_words = words[start_idx:]
+            if isinstance(fragment_words[0], dict):
+                start_time = fragment_words[0]['start']
+                end_time = fragment_words[-1]['end']
+            else:
+                start_time = elapsed_time
+                end_time = elapsed_time + (len(fragment_words) / total_word_count) * total_time
+            if next_start_time and (next_start_time - end_time) <= 0.8:
+                end_time = next_start_time
+            subtitles.append({'start': start_time, 'end': end_time, 'text': self._fragment_text(fragment_words, prefix)})
+        return subtitles
+
+    def save(self, filename="subtitles.srt", advanced_splitting=True):
+        """Write all cues to *filename* and return how many were written.
+
+        The file is WebVTT when is_vtt was set, otherwise SRT. Cues are
+        numbered from 1 and written for both splitting modes.
+        """
+        subtitles = self.process_segments(advanced_splitting)
+        with open(filename, 'w', encoding='utf-8') as file:
+            if self.is_vtt:
+                file.write("WEBVTT\n\n")
+            for idx, subtitle in enumerate(subtitles, 1):
+                start_time = format_timestamp(subtitle['start'], self.is_vtt)
+                end_time = format_timestamp(subtitle['end'], self.is_vtt)
+                text = subtitle['text'].strip()
+                file.write(f"{idx}\n{start_time} --> {end_time}\n{text}\n\n")
+        return len(subtitles)
\ No newline at end of file
diff --git a/whisperx/conjunctions.py b/whisperx/conjunctions.py
new file mode 100644
index 0000000..a3d35ea
--- /dev/null
+++ b/whisperx/conjunctions.py
@@ -0,0 +1,43 @@
+# conjunctions.py: conjunction words per language code, used to choose subtitle split points.
+# NOTE(review): entries containing a space (e.g. 'pokud ne') need multi-word tokens to match; verify against the caller.
+conjunctions_by_language = {
+    'en': {'and', 'whether', 'or', 'as', 'but', 'so', 'for', 'nor', 'which', 'yet', 'although', 'since', 'unless', 'when', 'while', 'because', 'if', 'how', 'that', 'than', 'who', 'where', 'what', 'near', 'before', 'after', 'across', 'through', 'until', 'once', 'whereas', 'even', 'both', 'either', 'neither', 'though'},
+    'fr': {'et', 'ou', 'mais', 'parce', 'bien', 'pendant', 'quand', 'où', 'comme', 'si', 'que', 'avant', 'après', 'aussitôt', 'jusqu’à', 'à', 'malgré', 'donc', 'tant', 'puisque', 'ni', 'soit', 'bien', 'encore', 'dès', 'lorsque'},
+    'de': {'und', 'oder', 'aber', 'weil', 'obwohl', 'während', 'wenn', 'wo', 'wie', 'dass', 'bevor', 'nachdem', 'sobald', 'bis', 'außer', 'trotzdem', 'also', 'sowie', 'indem', 'weder', 'sowohl', 'zwar', 'jedoch'},
+    'es': {'y', 'o', 'pero', 'porque', 'aunque', 'sin', 'mientras', 'cuando', 'donde', 'como', 'si', 'que', 'antes', 'después', 'tan', 'hasta', 'a', 'a', 'por', 'ya', 'ni', 'sino'},
+    'it': {'e', 'o', 'ma', 'perché', 'anche', 'mentre', 'quando', 'dove', 'come', 'se', 'che', 'prima', 'dopo', 'appena', 'fino', 'a', 'nonostante', 'quindi', 'poiché', 'né', 'ossia', 'cioè'},
+    'ja': {'そして', 'または', 'しかし', 'なぜなら', 'もし', 'それとも', 'だから', 'それに', 'なのに', 'そのため', 'かつ', 'それゆえに', 'ならば', 'もしくは', 'ため'},
+    'zh': {'和', '或', '但是', '因为', '任何', '也', '虽然', '而且', '所以', '如果', '除非', '尽管', '既然', '即使', '只要', '直到', '然后', '因此', '不但', '而是', '不过'},
+    'nl': {'en', 'of', 'maar', 'omdat', 'hoewel', 'terwijl', 'wanneer', 'waar', 'zoals', 'als', 'dat', 'voordat', 'nadat', 'zodra', 'totdat', 'tenzij', 'ondanks', 'dus', 'zowel', 'noch', 'echter', 'toch'},
+    'uk': {'та', 'або', 'але', 'тому', 'хоча', 'поки', 'бо', 'коли', 'де', 'як', 'якщо', 'що', 'перш', 'після', 'доки', 'незважаючи', 'тому', 'ані'},
+    'pt': {'e', 'ou', 'mas', 'porque', 'embora', 'enquanto', 'quando', 'onde', 'como', 'se', 'que', 'antes', 'depois', 'assim', 'até', 'a', 'apesar', 'portanto', 'já', 'pois', 'nem', 'senão'},
+    'ar': {'و', 'أو', 'لكن', 'لأن', 'مع', 'بينما', 'عندما', 'حيث', 'كما', 'إذا', 'الذي', 'قبل', 'بعد', 'فور', 'حتى', 'إلا', 'رغم', 'لذلك', 'بما'},
+    'cs': {'a', 'nebo', 'ale', 'protože', 'ačkoli', 'zatímco', 'když', 'kde', 'jako', 'pokud', 'že', 'než', 'poté', 'jakmile', 'dokud', 'pokud ne', 'navzdory', 'tak', 'stejně', 'ani', 'tudíž'},
+    'ru': {'и', 'или', 'но', 'потому', 'хотя', 'пока', 'когда', 'где', 'как', 'если', 'что', 'перед', 'после', 'несмотря', 'таким', 'также', 'ни', 'зато'},
+    'pl': {'i', 'lub', 'ale', 'ponieważ', 'chociaż', 'podczas', 'kiedy', 'gdzie', 'jak', 'jeśli', 'że', 'zanim', 'po', 'jak tylko', 'dopóki', 'chyba', 'pomimo', 'więc', 'tak', 'ani', 'czyli'},
+    'hu': {'és', 'vagy', 'de', 'mert', 'habár', 'míg', 'amikor', 'ahol', 'ahogy', 'ha', 'hogy', 'mielőtt', 'miután', 'amint', 'amíg', 'hacsak', 'ellenére', 'tehát', 'úgy', 'sem', 'vagyis'},
+    'fi': {'ja', 'tai', 'mutta', 'koska', 'vaikka', 'kun', 'missä', 'kuten', 'jos', 'että', 'ennen', 'sen jälkeen', 'heti', 'kunnes', 'ellei', 'huolimatta', 'siis', 'sekä', 'eikä', 'vaan'},
+    'fa': {'و', 'یا', 'اما', 'چون', 'اگرچه', 'در حالی', 'وقتی', 'کجا', 'چگونه', 'اگر', 'که', 'قبل', 'پس', 'به محض', 'تا زمانی', 'مگر', 'با وجود', 'پس', 'همچنین', 'نه'},
+    'el': {'και', 'ή', 'αλλά', 'επειδή', 'αν', 'ενώ', 'όταν', 'όπου', 'όπως', 'αν', 'που', 'προτού', 'αφού', 'μόλις', 'μέχρι', 'εκτός', 'παρά', 'έτσι', 'όπως', 'ούτε', 'δηλαδή'},
+    'tr': {'ve', 'veya', 'ama', 'çünkü', 'her ne', 'iken', 'nerede', 'nasıl', 'eğer', 'ki', 'önce', 'sonra', 'hemen', 'kadar', 'rağmen', 'hem', 'ne', 'yani'},
+    'da': {'og', 'eller', 'men', 'fordi', 'selvom', 'mens', 'når', 'hvor', 'som', 'hvis', 'at', 'før', 'efter', 'indtil', 'medmindre', 'således', 'ligesom', 'hverken', 'altså'},
+    'he': {'ו', 'או', 'אבל', 'כי', 'אף', 'בזמן', 'כאשר', 'היכן', 'כיצד', 'אם', 'ש', 'לפני', 'אחרי', 'ברגע', 'עד', 'אלא', 'למרות', 'לכן', 'כמו', 'לא', 'אז'},
+    'vi': {'và', 'hoặc', 'nhưng', 'bởi', 'mặc', 'trong', 'khi', 'ở', 'như', 'nếu', 'rằng', 'trước', 'sau', 'ngay', 'cho', 'trừ', 'mặc', 'vì', 'giống', 'cũng', 'tức'},
+    'ko': {'그리고', '또는','그런데','그래도', '이나', '결국', '마지막으로', '마찬가지로', '반면에', '아니면', '거나', '또는', '그럼에도', '그렇기', '때문에', '덧붙이자면', '게다가', '그러나',  '고', '그래서', '랑', '한다면', '하지만', '무엇', '왜냐하면', '비록', '동안', '언제', '어디서', '어떻게', '만약', '그', '전에', '후에', '즉시', '까지', '아니라면', '불구하고', '따라서', '같은', '도'},
+    'ur': {'اور', 'یا', 'مگر', 'کیونکہ', 'اگرچہ', 'جبکہ', 'جب', 'کہاں', 'کس طرح', 'اگر', 'کہ', 'سے پہلے', 'کے بعد', 'جیسے ہی', 'تک', 'اگر نہیں تو', 'کے باوجود', 'اس لئے', 'جیسے', 'نہ'},
+    'hi': {'और', 'या', 'पर', 'तो', 'न', 'फिर', 'हालांकि', 'चूंकि', 'अगर', 'कैसे', 'वह', 'से', 'जो', 'जहां', 'क्या', 'नजदीक', 'पहले', 'बाद', 'के', 'पार', 'माध्यम', 'तक', 'एक', 'जबकि', 'यहां', 'तक', 'दोनों', 'या', 'न', 'हालांकि'}
+
+}
+
+commas_by_language = {  # languages not listed here use ',' (see get_comma)
+    'ja': '、',  # ideographic comma
+    'zh': '，',  # fullwidth comma
+    'fa': '،',  # Arabic comma
+    'ur': '،'  # Arabic comma
+}
+
+def get_conjunctions(lang_code):  # returns the stored set itself, or a new empty set for unknown codes
+    return conjunctions_by_language.get(lang_code, set())
+
+def get_comma(lang_code):  # falls back to the ASCII comma for languages without an entry
+    return commas_by_language.get(lang_code, ',')
\ No newline at end of file