feat: enhance diarization with optional output of speaker embeddings

- Updated DiarizationPipeline to include a return_embeddings parameter for optional speaker embeddings. - Modified assign_word_speakers to accept and process speaker embeddings. - Updated CLI to support --speaker_embeddings flag for JSON output. - Ensured backward compatibility for existing functionality.
2025-07-01 18:17:27 -04:00 · 2025-03-21 13:57:47 +00:00
parent d700b56c9c
commit 1631c3040f
3 changed files with 79 additions and 11 deletions
--- a/whisperx/main.py
+++ b/whisperx/main.py
@ -44,6 +44,7 @@ def cli():
    parser.add_argument("--min_speakers", default=None, type=int, help="Minimum number of speakers to in audio file")
    parser.add_argument("--max_speakers", default=None, type=int, help="Maximum number of speakers to in audio file")
    parser.add_argument("--diarize_model", default="pyannote/speaker-diarization-3.1", type=str, help="Name of the speaker diarization model to use")
+    parser.add_argument("--speaker_embeddings", action="store_true", help="Include speaker embeddings in JSON output (only works with --diarize)")

    parser.add_argument("--temperature", type=float, default=0, help="temperature to use for sampling")
    parser.add_argument("--best_of", type=optional_int, default=5, help="number of candidates when sampling with non-zero temperature")