From 076ff96eb20f560f95a22eca7e5f4bcd01747070 Mon Sep 17 00:00:00 2001 From: CaraDuf <91517923+Ca-ressemble-a-du-fake@users.noreply.github.com> Date: Wed, 7 Jun 2023 05:49:49 +0200 Subject: [PATCH 1/3] Add Audacity export This exports the transcript to a text file that can be directly imported into Audacity as a label file. This is useful to quickly check the transcript-audio alignment. --- whisperx/utils.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/whisperx/utils.py b/whisperx/utils.py index d042bb7..ff17cce 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -365,6 +365,28 @@ class WriteTSV(ResultWriter): print(round(1000 * segment["end"]), file=file, end="\t") print(segment["text"].strip().replace("\t", " "), file=file, flush=True) +class WriteAudacity(ResultWriter): + """ + Write a transcript to a text file that audacity can import as labels. + The extension used is "aud" to distinguish it from the txt file produced by WriteTXT. + Yet this is not an audacity project but only a label file! + + Please note : Audacity uses seconds in timestamps not ms! + Also there is no header expected. + + If speaker is provided it is prepended to the text between double square brackets [[]].
+ """ + + extension: str = "aud" + + def write_result(self, result: dict, file: TextIO, options: dict): + ARROW = " " + for segment in result["segments"]: + print(segment["start"], file=file, end=ARROW) + print(segment["end"], file=file, end=ARROW) + print( ( ("[[" + segment["speaker"] + "]]") if "speaker" in segment else "") + segment["text"].strip().replace("\t", " "), file=file, flush=True) + + class WriteJSON(ResultWriter): extension: str = "json" @@ -377,6 +399,7 @@ def get_writer( output_format: str, output_dir: str ) -> Callable[[dict, TextIO, dict], None]: writers = { + "aud": WriteAudacity, "txt": WriteTXT, "vtt": WriteVTT, "srt": WriteSRT, @@ -399,4 +422,4 @@ def interpolate_nans(x, method='nearest'): if x.notnull().sum() > 1: return x.interpolate(method=method).ffill().bfill() else: - return x.ffill().bfill() \ No newline at end of file + return x.ffill().bfill() From b13778fefd71c955eb93bd9c4d3bb9c850acd5db Mon Sep 17 00:00:00 2001 From: Max Bain <36994049+m-bain@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:47:49 +0100 Subject: [PATCH 2/3] make aud optional --- whisperx/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/whisperx/utils.py b/whisperx/utils.py index ff17cce..36c7543 100644 --- a/whisperx/utils.py +++ b/whisperx/utils.py @@ -399,13 +399,15 @@ def get_writer( output_format: str, output_dir: str ) -> Callable[[dict, TextIO, dict], None]: writers = { - "aud": WriteAudacity, "txt": WriteTXT, "vtt": WriteVTT, "srt": WriteSRT, "tsv": WriteTSV, "json": WriteJSON, } + optional_writers = { + "aud": WriteAudacity, + } if output_format == "all": all_writers = [writer(output_dir) for writer in writers.values()] @@ -416,6 +418,8 @@ def get_writer( return write_all + if output_format in optional_writers: + return optional_writers[output_format](output_dir) return writers[output_format](output_dir) def interpolate_nans(x, method='nearest'): From d39c1b2319a8140911be7f37b8176f300a89e3da Mon Sep 17 00:00:00 2001 From: Max 
Bain <36994049+m-bain@users.noreply.github.com> Date: Wed, 7 Jun 2023 11:48:49 +0100 Subject: [PATCH 3/3] add "aud" to output_format --- whisperx/transcribe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/whisperx/transcribe.py b/whisperx/transcribe.py index 3bb1a36..1855178 100644 --- a/whisperx/transcribe.py +++ b/whisperx/transcribe.py @@ -26,7 +26,7 @@ def cli(): parser.add_argument("--compute_type", default="float16", type=str, choices=["float16", "float32", "int8"], help="compute type for computation") parser.add_argument("--output_dir", "-o", type=str, default=".", help="directory to save the outputs") - parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json"], help="format of the output file; if not specified, all available formats will be produced") + parser.add_argument("--output_format", "-f", type=str, default="all", choices=["all", "srt", "vtt", "txt", "tsv", "json", "aud"], help="format of the output file; if not specified, all available formats will be produced") parser.add_argument("--verbose", type=str2bool, default=True, help="whether to print out the progress and debug messages") parser.add_argument("--task", type=str, default="transcribe", choices=["transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')") @@ -210,4 +210,4 @@ def cli(): writer(result, audio_path, writer_args) if __name__ == "__main__": - cli() \ No newline at end of file + cli()