"""Split long transcription segments into subtitle-sized cues (SRT / WebVTT).

Reconstructed from the patch "A solution to long subtitles" (awerks,
2023-09-04), which added ``whisperx/SubtitlesProcessor.py`` and
``whisperx/conjunctions.py``.  Both parts are kept here, in their original
order, as two clearly marked sections.

Example usage::

    subtitles_processor = SubtitlesProcessor(
        output["segments"], detected_language,
        max_line_length=50, min_char_length_splitter=35)
    subtitles_processor.save("subtitles.srt", advanced_splitting=True)
"""

import math
from typing import TextIO

# NOTE: the original patch imported the helpers below with
# ``from conjunctions import get_conjunctions, get_comma``.  They are defined
# in the "conjunctions" section of this module instead; if the sections are
# split back into a package, import them with
# ``from .conjunctions import get_conjunctions, get_comma``.


# ---------------------------------------------------------------------------
# Subtitle processing (originally whisperx/SubtitlesProcessor.py)
# ---------------------------------------------------------------------------

def normal_round(n):
    """Round *n* to the nearest integer, with a fractional part of .5 going up.

    The built-in ``round`` rounds halves to the nearest even integer; this
    helper always rounds them towards ``math.ceil(n)``.
    """
    if n - math.floor(n) < 0.5:
        return math.floor(n)
    return math.ceil(n)


def format_timestamp(seconds: float, is_vtt: bool = False):
    """Format *seconds* as ``HH:MM:SS,mmm`` (or ``HH:MM:SS.mmm`` when *is_vtt*).

    Raises:
        AssertionError: if *seconds* is negative.
    """
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours, milliseconds = divmod(milliseconds, 3_600_000)
    minutes, milliseconds = divmod(milliseconds, 60_000)
    secs, milliseconds = divmod(milliseconds, 1_000)

    separator = '.' if is_vtt else ','
    return f"{hours:02d}:{minutes:02d}:{secs:02d}{separator}{milliseconds:03d}"


class SubtitlesProcessor:
    """Turn transcription segments into subtitle cues and write them to disk.

    Each segment is a dict with ``'start'``, ``'end'`` and ``'text'`` and,
    optionally, ``'words'``: a list of dicts carrying ``'word'`` and (when
    known) ``'start'`` / ``'end'``.  Missing word timestamps are estimated
    in place, i.e. the word dicts of the given segments are mutated.
    """

    # Languages for which the constructor forces shorter line limits.
    COMPLEX_SCRIPT_LANGUAGES = frozenset({
        'th', 'lo', 'my', 'km', 'am', 'ko', 'ja', 'zh', 'ti', 'ta',
        'te', 'kn', 'ml', 'hi', 'ne', 'mr', 'ar', 'fa', 'ur', 'ka',
    })

    def __init__(self, segments, lang, max_line_length = 45, min_char_length_splitter = 30, is_vtt = False):
        """Store the segments and the splitting configuration.

        Args:
            segments: list of segment dicts (see the class docstring).
            lang: language code used to pick the comma and conjunction set.
            max_line_length: character count at which a line is force-split.
            min_char_length_splitter: minimum characters required on both
                sides of a comma/conjunction split.
            is_vtt: emit WebVTT (``.`` millisecond separator and a ``WEBVTT``
                header) instead of SRT.

        For languages in ``COMPLEX_SCRIPT_LANGUAGES`` the two length limits
        are replaced by 30 and 20 regardless of the values passed in.
        """
        self.comma = get_comma(lang)
        self.conjunctions = set(get_conjunctions(lang))
        self.segments = segments
        self.lang = lang
        self.max_line_length = max_line_length
        self.min_char_length_splitter = min_char_length_splitter
        self.is_vtt = is_vtt
        if self.lang in self.COMPLEX_SCRIPT_LANGUAGES:
            self.max_line_length = 30
            self.min_char_length_splitter = 20

    @staticmethod
    def _word_text(word):
        """Return the text of *word*, which is either a dict or a plain string."""
        return word['word'] if isinstance(word, dict) else word

    @staticmethod
    def _segment_words(segment):
        """Return the segment's word list, falling back to splitting its text."""
        words = segment.get('words')
        if words is None:
            # Only touch 'text' when there is no word list, so segments that
            # carry words but no text do not raise KeyError.
            words = segment.get('text', '').split()
        return words

    def estimate_timestamp_for_word(self, words, i, next_segment_start_time=None):
        """Fill in ``words[i]['start']`` / ``['end']`` from neighbouring words.

        The estimate uses, in order of preference, the previous word's end,
        the next word's start, *next_segment_start_time*, and finally a
        guess of 0.25 seconds per character.  With no information at all
        both timestamps are set to 0.  Estimated times never go below zero.
        """
        seconds_per_char = 0.25
        word = words[i]
        guessed_duration = len(word['word']) * seconds_per_char
        has_prev_end = i > 0 and 'end' in words[i - 1]
        has_next_start = i < len(words) - 1 and 'start' in words[i + 1]

        if has_prev_end:
            prev_end = words[i - 1]['end']
            word['start'] = prev_end
            if has_next_start:
                word['end'] = words[i + 1]['start']
            elif next_segment_start_time:
                if next_segment_start_time - prev_end <= 1:
                    word['end'] = next_segment_start_time
                else:
                    word['end'] = next_segment_start_time - 0.5
            else:
                word['end'] = prev_end + guessed_duration

        elif has_next_start:
            next_start = words[i + 1]['start']
            # Clamp: a long first word right at the beginning of the audio
            # would otherwise get a negative start time.
            word['start'] = max(0.0, next_start - guessed_duration)
            word['end'] = next_start

        elif next_segment_start_time:
            word['start'] = max(0.0, next_segment_start_time - 1)
            word['end'] = max(0.0, next_segment_start_time - 0.5)

        else:
            word['start'] = 0
            word['end'] = 0

    def process_segments(self, advanced_splitting=True):
        """Return the list of subtitle dicts (``start``, ``end``, ``text``).

        With *advanced_splitting* each segment is split at the points chosen
        by ``determine_advanced_split_points``; otherwise every segment
        becomes exactly one subtitle.  In both modes missing word timestamps
        are estimated in place.
        """
        subtitles = []
        segment_count = len(self.segments)
        for seg_idx, segment in enumerate(self.segments):
            if seg_idx + 1 < segment_count:
                next_segment_start_time = self.segments[seg_idx + 1]['start']
            else:
                next_segment_start_time = None

            if advanced_splitting:
                split_points = self.determine_advanced_split_points(segment, next_segment_start_time)
                subtitles.extend(
                    self.generate_subtitles_from_split_points(segment, split_points, next_segment_start_time)
                )
                continue

            words = segment.get('words') or []
            for word_idx, word in enumerate(words):
                if isinstance(word, dict) and ('start' not in word or 'end' not in word):
                    self.estimate_timestamp_for_word(words, word_idx, next_segment_start_time)

            subtitles.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text']
            })

        return subtitles

    def determine_advanced_split_points(self, segment, next_segment_start_time=None):
        """Return the word indices after which the segment should be split.

        A split is made
        * in the middle of the current line once it reaches
          ``max_line_length`` characters,
        * after a word ending with the language's comma, or
        * before a conjunction,
        the last two only when at least ``min_char_length_splitter``
        characters lie on both sides.  Word dicts lacking timestamps get
        estimated ones as a side effect.
        """
        split_points = []
        last_split_point = 0
        char_count = 0

        words = self._segment_words(segment)
        add_space = 0 if self.lang in ('zh', 'ja') else 1

        def counted_length(word):
            # Parenthesised on purpose: the separating space must be counted
            # for dict words as well as for plain-string words.
            return len(self._word_text(word)) + add_space

        char_count_after = sum(counted_length(word) for word in words)

        for i, word in enumerate(words):
            word_text = self._word_text(word)
            word_length = len(word_text) + add_space
            char_count += word_length
            char_count_after -= word_length
            char_count_before = char_count - word_length

            if isinstance(word, dict) and ('start' not in word or 'end' not in word):
                self.estimate_timestamp_for_word(words, i, next_segment_start_time)

            enough_on_both_sides = (
                char_count_before >= self.min_char_length_splitter
                and char_count_after >= self.min_char_length_splitter
            )

            if char_count >= self.max_line_length:
                if char_count_before >= self.min_char_length_splitter:
                    midpoint = normal_round((last_split_point + i) / 2)
                    split_points.append(midpoint)
                    last_split_point = midpoint + 1
                    # Characters already sitting on the new line.
                    char_count = sum(counted_length(words[j]) for j in range(last_split_point, i + 1))

            elif word_text.endswith(self.comma) and enough_on_both_sides:
                split_points.append(i)
                last_split_point = i + 1
                char_count = 0

            elif i > 0 and word_text.lower() in self.conjunctions and enough_on_both_sides:
                # Break before the conjunction so that it opens the next line.
                split_points.append(i - 1)
                last_split_point = i
                char_count = word_length

        return split_points

    def generate_subtitles_from_split_points(self, segment, split_points, next_start_time=None):
        """Build subtitle dicts for *segment*, cut after each index in *split_points*.

        For word dicts the timing comes from the words' own timestamps; a
        fragment's end is stretched to the start of whatever follows when
        the gap is at most 0.8 seconds.  For plain-string words (segments
        without a ``'words'`` list) the segment duration is distributed
        proportionally to the number of words in each fragment.
        """
        subtitles = []

        words = self._segment_words(segment)
        total_word_count = len(words)
        total_time = segment['end'] - segment['start']
        elapsed_time = segment['start']
        prefix = ' ' if self.lang not in ('zh', 'ja') else ''

        start_idx = 0
        for split_point in split_points:
            fragment_words = words[start_idx:split_point + 1]
            if not fragment_words:
                # Out-of-order or duplicate split point: nothing to emit.
                continue

            if isinstance(fragment_words[0], dict):
                start_time = fragment_words[0]['start']
                end_time = fragment_words[-1]['end']
                next_word_start = None
                if split_point + 1 < total_word_count:
                    next_word_start = words[split_point + 1].get('start')
                # A falsy start (missing, or the 0 fallback of the estimator)
                # is deliberately ignored here.
                if next_word_start and (next_word_start - end_time) <= 0.8:
                    end_time = next_word_start
                text = prefix.join(word['word'] for word in fragment_words)
            else:
                text = prefix.join(fragment_words).strip()
                current_duration = (len(fragment_words) / total_word_count) * total_time
                start_time = elapsed_time
                end_time = elapsed_time + current_duration
                elapsed_time = end_time

            subtitles.append({'start': start_time, 'end': end_time, 'text': text})
            start_idx = split_point + 1

        # Handle the last fragment
        if start_idx < total_word_count:
            fragment_words = words[start_idx:]

            if isinstance(fragment_words[0], dict):
                start_time = fragment_words[0]['start']
                end_time = fragment_words[-1]['end']
                text = prefix.join(word['word'] for word in fragment_words)
            else:
                text = prefix.join(fragment_words).strip()
                current_duration = (len(fragment_words) / total_word_count) * total_time
                start_time = elapsed_time
                end_time = elapsed_time + current_duration

            if next_start_time and (next_start_time - end_time) <= 0.8:
                end_time = next_start_time

            subtitles.append({
                'start': start_time,
                'end': end_time if end_time is not None else segment['end'],
                'text': text
            })

        return subtitles

    def save(self, filename="subtitles.srt", advanced_splitting=True):
        """Write the subtitles to *filename* and return how many were written.

        The file is UTF-8 encoded.  Every cue consists of its 1-based index,
        a ``start --> end`` line and the text; WebVTT output additionally
        starts with a ``WEBVTT`` header.  Cues are written in both splitting
        modes.
        """
        subtitles = self.process_segments(advanced_splitting)

        with open(filename, 'w', encoding='utf-8') as file:
            if self.is_vtt:
                file.write("WEBVTT\n\n")

            for idx, subtitle in enumerate(subtitles, 1):
                start_time = format_timestamp(subtitle['start'], self.is_vtt)
                end_time = format_timestamp(subtitle['end'], self.is_vtt)
                text = subtitle['text'].strip()
                file.write(f"{idx}\n")
                file.write(f"{start_time} --> {end_time}\n")
                file.write(text + "\n\n")

        return len(subtitles)


# ---------------------------------------------------------------------------
# Conjunctions and commas per language (originally whisperx/conjunctions.py)
# ---------------------------------------------------------------------------

# Words before which a subtitle line may be broken, keyed by language code.
conjunctions_by_language = {
    'en': {'and', 'whether', 'or', 'as', 'but', 'so', 'for', 'nor', 'which', 'yet', 'although', 'since', 'unless', 'when', 'while', 'because', 'if', 'how', 'that', 'than', 'who', 'where', 'what', 'near', 'before', 'after', 'across', 'through', 'until', 'once', 'whereas', 'even', 'both', 'either', 'neither', 'though'},
    'fr': {'et', 'ou', 'mais', 'parce', 'bien', 'pendant', 'quand', 'où', 'comme', 'si', 'que', 'avant', 'après', 'aussitôt', 'jusqu’à', 'à', 'malgré', 'donc', 'tant', 'puisque', 'ni', 'soit', 'bien', 'encore', 'dès', 'lorsque'},
    'de': {'und', 'oder', 'aber', 'weil', 'obwohl', 'während', 'wenn', 'wo', 'wie', 'dass', 'bevor', 'nachdem', 'sobald', 'bis', 'außer', 'trotzdem', 'also', 'sowie', 'indem', 'weder', 'sowohl', 'zwar', 'jedoch'},
    'es': {'y', 'o', 'pero', 'porque', 'aunque', 'sin', 'mientras', 'cuando', 'donde', 'como', 'si', 'que', 'antes', 'después', 'tan', 'hasta', 'a', 'a', 'por', 'ya', 'ni', 'sino'},
    'it': {'e', 'o', 'ma', 'perché', 'anche', 'mentre', 'quando', 'dove', 'come', 'se', 'che', 'prima', 'dopo', 'appena', 'fino', 'a', 'nonostante', 'quindi', 'poiché', 'né', 'ossia', 'cioè'},
    'ja': {'そして', 'または', 'しかし', 'なぜなら', 'もし', 'それとも', 'だから', 'それに', 'なのに', 'そのため', 'かつ', 'それゆえに', 'ならば', 'もしくは', 'ため'},
    'zh': {'和', '或', '但是', '因为', '任何', '也', '虽然', '而且', '所以', '如果', '除非', '尽管', '既然', '即使', '只要', '直到', '然后', '因此', '不但', '而是', '不过'},
    'nl': {'en', 'of', 'maar', 'omdat', 'hoewel', 'terwijl', 'wanneer', 'waar', 'zoals', 'als', 'dat', 'voordat', 'nadat', 'zodra', 'totdat', 'tenzij', 'ondanks', 'dus', 'zowel', 'noch', 'echter', 'toch'},
    'uk': {'та', 'або', 'але', 'тому', 'хоча', 'поки', 'бо', 'коли', 'де', 'як', 'якщо', 'що', 'перш', 'після', 'доки', 'незважаючи', 'тому', 'ані'},
    'pt': {'e', 'ou', 'mas', 'porque', 'embora', 'enquanto', 'quando', 'onde', 'como', 'se', 'que', 'antes', 'depois', 'assim', 'até', 'a', 'apesar', 'portanto', 'já', 'pois', 'nem', 'senão'},
    'ar': {'و', 'أو', 'لكن', 'لأن', 'مع', 'بينما', 'عندما', 'حيث', 'كما', 'إذا', 'الذي', 'قبل', 'بعد', 'فور', 'حتى', 'إلا', 'رغم', 'لذلك', 'بما'},
    'cs': {'a', 'nebo', 'ale', 'protože', 'ačkoli', 'zatímco', 'když', 'kde', 'jako', 'pokud', 'že', 'než', 'poté', 'jakmile', 'dokud', 'pokud ne', 'navzdory', 'tak', 'stejně', 'ani', 'tudíž'},
    'ru': {'и', 'или', 'но', 'потому', 'хотя', 'пока', 'когда', 'где', 'как', 'если', 'что', 'перед', 'после', 'несмотря', 'таким', 'также', 'ни', 'зато'},
    'pl': {'i', 'lub', 'ale', 'ponieważ', 'chociaż', 'podczas', 'kiedy', 'gdzie', 'jak', 'jeśli', 'że', 'zanim', 'po', 'jak tylko', 'dopóki', 'chyba', 'pomimo', 'więc', 'tak', 'ani', 'czyli'},
    'hu': {'és', 'vagy', 'de', 'mert', 'habár', 'míg', 'amikor', 'ahol', 'ahogy', 'ha', 'hogy', 'mielőtt', 'miután', 'amint', 'amíg', 'hacsak', 'ellenére', 'tehát', 'úgy', 'sem', 'vagyis'},
    'fi': {'ja', 'tai', 'mutta', 'koska', 'vaikka', 'kun', 'missä', 'kuten', 'jos', 'että', 'ennen', 'sen jälkeen', 'heti', 'kunnes', 'ellei', 'huolimatta', 'siis', 'sekä', 'eikä', 'vaan'},
    'fa': {'و', 'یا', 'اما', 'چون', 'اگرچه', 'در حالی', 'وقتی', 'کجا', 'چگونه', 'اگر', 'که', 'قبل', 'پس', 'به محض', 'تا زمانی', 'مگر', 'با وجود', 'پس', 'همچنین', 'نه'},
    'el': {'και', 'ή', 'αλλά', 'επειδή', 'αν', 'ενώ', 'όταν', 'όπου', 'όπως', 'αν', 'που', 'προτού', 'αφού', 'μόλις', 'μέχρι', 'εκτός', 'παρά', 'έτσι', 'όπως', 'ούτε', 'δηλαδή'},
    'tr': {'ve', 'veya', 'ama', 'çünkü', 'her ne', 'iken', 'nerede', 'nasıl', 'eğer', 'ki', 'önce', 'sonra', 'hemen', 'kadar', 'rağmen', 'hem', 'ne', 'yani'},
    'da': {'og', 'eller', 'men', 'fordi', 'selvom', 'mens', 'når', 'hvor', 'som', 'hvis', 'at', 'før', 'efter', 'indtil', 'medmindre', 'således', 'ligesom', 'hverken', 'altså'},
    'he': {'ו', 'או', 'אבל', 'כי', 'אף', 'בזמן', 'כאשר', 'היכן', 'כיצד', 'אם', 'ש', 'לפני', 'אחרי', 'ברגע', 'עד', 'אלא', 'למרות', 'לכן', 'כמו', 'לא', 'אז'},
    'vi': {'và', 'hoặc', 'nhưng', 'bởi', 'mặc', 'trong', 'khi', 'ở', 'như', 'nếu', 'rằng', 'trước', 'sau', 'ngay', 'cho', 'trừ', 'mặc', 'vì', 'giống', 'cũng', 'tức'},
    'ko': {'그리고', '또는','그런데','그래도', '이나', '결국', '마지막으로', '마찬가지로', '반면에', '아니면', '거나', '또는', '그럼에도', '그렇기', '때문에', '덧붙이자면', '게다가', '그러나', '고', '그래서', '랑', '한다면', '하지만', '무엇', '왜냐하면', '비록', '동안', '언제', '어디서', '어떻게', '만약', '그', '전에', '후에', '즉시', '까지', '아니라면', '불구하고', '따라서', '같은', '도'},
    'ur': {'اور', 'یا', 'مگر', 'کیونکہ', 'اگرچہ', 'جبکہ', 'جب', 'کہاں', 'کس طرح', 'اگر', 'کہ', 'سے پہلے', 'کے بعد', 'جیسے ہی', 'تک', 'اگر نہیں تو', 'کے باوجود', 'اس لئے', 'جیسے', 'نہ'},
    'hi': {'और', 'या', 'पर', 'तो', 'न', 'फिर', 'हालांकि', 'चूंकि', 'अगर', 'कैसे', 'वह', 'से', 'जो', 'जहां', 'क्या', 'नजदीक', 'पहले', 'बाद', 'के', 'पार', 'माध्यम', 'तक', 'एक', 'जबकि', 'यहां', 'तक', 'दोनों', 'या', 'न', 'हालांकि'}

}

# Comma character per language; languages not listed use the ASCII comma.
commas_by_language = {
    'ja': '、',
    'zh': ',',
    'fa': '،',
    'ur': '،'
}


def get_conjunctions(lang_code):
    """Return the conjunction set for *lang_code*, or an empty set if unknown."""
    return conjunctions_by_language.get(lang_code, set())


def get_comma(lang_code):
    """Return the comma character for *lang_code*, defaulting to ``','``."""
    return commas_by_language.get(lang_code, ',')