mirror of
https://github.com/m-bain/whisperX.git
synced 2025-07-01 18:17:27 -04:00
fix long segments, break into sentences using nltk, improve align logic, improve diarize (sentence-based)
This commit is contained in:
@ -11,7 +11,6 @@ class DiarizationPipeline:
|
||||
use_auth_token=None,
|
||||
device: Optional[Union[str, torch.device]] = "cpu",
|
||||
):
|
||||
self.model = Pipeline.from_pretrained(model_name, use_auth_token=use_auth_token)
|
||||
if isinstance(device, str):
|
||||
device = torch.device(device)
|
||||
self.model = Pipeline.from_pretrained(model_name, use_auth_token=use_auth_token).to(device)
|
||||
@ -21,59 +20,44 @@ class DiarizationPipeline:
|
||||
diarize_df = pd.DataFrame(segments.itertracks(yield_label=True))
|
||||
diarize_df['start'] = diarize_df[0].apply(lambda x: x.start)
|
||||
diarize_df['end'] = diarize_df[0].apply(lambda x: x.end)
|
||||
diarize_df.rename(columns={2: "speaker"}, inplace=True)
|
||||
return diarize_df
|
||||
|
||||
def assign_word_speakers(diarize_df, result_segments, fill_nearest=False):
|
||||
for seg in result_segments:
|
||||
wdf = seg['word-segments']
|
||||
if len(wdf['start'].dropna()) == 0:
|
||||
wdf['start'] = seg['start']
|
||||
wdf['end'] = seg['end']
|
||||
speakers = []
|
||||
for wdx, wrow in wdf.iterrows():
|
||||
if not np.isnan(wrow['start']):
|
||||
diarize_df['intersection'] = np.minimum(diarize_df['end'], wrow['end']) - np.maximum(diarize_df['start'], wrow['start'])
|
||||
diarize_df['union'] = np.maximum(diarize_df['end'], wrow['end']) - np.minimum(diarize_df['start'], wrow['start'])
|
||||
# remove no hit
|
||||
if not fill_nearest:
|
||||
dia_tmp = diarize_df[diarize_df['intersection'] > 0]
|
||||
else:
|
||||
dia_tmp = diarize_df
|
||||
if len(dia_tmp) == 0:
|
||||
speaker = None
|
||||
else:
|
||||
speaker = dia_tmp.sort_values("intersection", ascending=False).iloc[0][2]
|
||||
else:
|
||||
speaker = None
|
||||
speakers.append(speaker)
|
||||
seg['word-segments']['speaker'] = speakers
|
||||
|
||||
speaker_count = pd.Series(speakers).value_counts()
|
||||
if len(speaker_count) == 0:
|
||||
seg["speaker"]= "UNKNOWN"
|
||||
def assign_word_speakers(diarize_df, transcript_result, fill_nearest=False):
|
||||
transcript_segments = transcript_result["segments"]
|
||||
for seg in transcript_segments:
|
||||
# assign speaker to segment (if any)
|
||||
diarize_df['intersection'] = np.minimum(diarize_df['end'], seg['end']) - np.maximum(diarize_df['start'], seg['start'])
|
||||
diarize_df['union'] = np.maximum(diarize_df['end'], seg['end']) - np.minimum(diarize_df['start'], seg['start'])
|
||||
# remove no hit, otherwise we look for closest (even negative intersection...)
|
||||
if not fill_nearest:
|
||||
dia_tmp = diarize_df[diarize_df['intersection'] > 0]
|
||||
else:
|
||||
seg["speaker"] = speaker_count.index[0]
|
||||
dia_tmp = diarize_df
|
||||
if len(dia_tmp) > 0:
|
||||
# sum over speakers
|
||||
speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
|
||||
seg["speaker"] = speaker
|
||||
|
||||
# assign speaker to words
|
||||
if 'words' in seg:
|
||||
for word in seg['words']:
|
||||
if 'start' in word:
|
||||
diarize_df['intersection'] = np.minimum(diarize_df['end'], word['end']) - np.maximum(diarize_df['start'], word['start'])
|
||||
diarize_df['union'] = np.maximum(diarize_df['end'], word['end']) - np.minimum(diarize_df['start'], word['start'])
|
||||
# remove no hit
|
||||
if not fill_nearest:
|
||||
dia_tmp = diarize_df[diarize_df['intersection'] > 0]
|
||||
else:
|
||||
dia_tmp = diarize_df
|
||||
if len(dia_tmp) > 0:
|
||||
# sum over speakers
|
||||
speaker = dia_tmp.groupby("speaker")["intersection"].sum().sort_values(ascending=False).index[0]
|
||||
word["speaker"] = speaker
|
||||
|
||||
return transcript_result
|
||||
|
||||
# create word level segments for .srt
|
||||
word_seg = []
|
||||
for seg in result_segments:
|
||||
wseg = pd.DataFrame(seg["word-segments"])
|
||||
for wdx, wrow in wseg.iterrows():
|
||||
if wrow["start"] is not None:
|
||||
speaker = wrow['speaker']
|
||||
if speaker is None or speaker == np.nan:
|
||||
speaker = "UNKNOWN"
|
||||
word_seg.append(
|
||||
{
|
||||
"start": wrow["start"],
|
||||
"end": wrow["end"],
|
||||
"text": f"[{speaker}]: " + seg["text"][int(wrow["segment-text-start"]):int(wrow["segment-text-end"])]
|
||||
}
|
||||
)
|
||||
|
||||
# TODO: create segments but split words on new speaker
|
||||
|
||||
return result_segments, word_seg
|
||||
|
||||
class Segment:
|
||||
def __init__(self, start, end, speaker=None):
|
||||
|
Reference in New Issue
Block a user