Source code for multivoice.lib.stt_speaker_mapping

# src/multivoice/lib/stt_speaker_mapping.py

import nltk


def get_word_ts_anchor(s, e, option="start"):
    """
    Determine the anchor time for a word based on the specified option.

    Args:
        s (int): Start time of the word in milliseconds.
        e (int): End time of the word in milliseconds.
        option (str): The option to determine the anchor time ('start', 'mid', or 'end').

    Returns:
        int or float: The anchor time for the word based on the option.
    """
    if option == "end":
        return e
    elif option == "mid":
        return (s + e) / 2
    return s
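A quick illustration of the three anchor options, using made-up millisecond values (not part of the module):

# Hypothetical word spanning 1000-1400 ms
get_word_ts_anchor(1000, 1400)           # -> 1000 (default 'start')
get_word_ts_anchor(1000, 1400, "mid")    # -> 1200.0
get_word_ts_anchor(1000, 1400, "end")    # -> 1400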
def get_words_speaker_mapping(wrd_ts, spk_ts, word_anchor_option="start"):
    """
    Map each word to its corresponding speaker based on time segments.

    Args:
        wrd_ts (list): A list of dictionaries containing word timestamps and text.
        spk_ts (list): A list of lists containing speaker start and end times and speaker IDs.
        word_anchor_option (str): The option to determine the anchor time for words ('start', 'mid', or 'end').

    Returns:
        list: A list of dictionaries mapping each word to its speaker, with timestamps and text.
    """
    s, e, sp = spk_ts[0]
    wrd_pos, turn_idx = 0, 0
    wrd_spk_mapping = []
    for wrd_dict in wrd_ts:
        ws, we, wrd = (
            int(wrd_dict["start"] * 1000),
            int(wrd_dict["end"] * 1000),
            wrd_dict["text"],
        )
        wrd_pos = get_word_ts_anchor(ws, we, word_anchor_option)
        # Advance to the speaker turn that contains the word's anchor time
        while wrd_pos > float(e):
            turn_idx += 1
            turn_idx = min(turn_idx, len(spk_ts) - 1)
            s, e, sp = spk_ts[turn_idx]
            # On the final turn, extend its end so any remaining words are absorbed
            if turn_idx == len(spk_ts) - 1:
                e = get_word_ts_anchor(ws, we, option="end")
        wrd_spk_mapping.append(
            {"word": wrd, "start_time": ws, "end_time": we, "speaker": sp}
        )
    return wrd_spk_mapping
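A minimal usage sketch, assuming word timestamps in seconds (as returned by a typical STT model) and speaker turns in milliseconds; the values below are illustrative only:

word_timestamps = [
    {"start": 0.0, "end": 0.4, "text": "Hello"},
    {"start": 0.5, "end": 0.9, "text": "there."},
    {"start": 1.2, "end": 1.6, "text": "Hi."},
]
speaker_turns = [
    [0, 1000, 0],     # Speaker 0: 0-1000 ms
    [1000, 2000, 1],  # Speaker 1: 1000-2000 ms
]
mapping = get_words_speaker_mapping(word_timestamps, speaker_turns)
# [{'word': 'Hello',  'start_time': 0,    'end_time': 400,  'speaker': 0},
#  {'word': 'there.', 'start_time': 500,  'end_time': 900,  'speaker': 0},
#  {'word': 'Hi.',    'start_time': 1200, 'end_time': 1600, 'speaker': 1}]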
def get_sentences_speaker_mapping(word_speaker_mapping, spk_ts):
    """
    Group words into sentences and map each sentence to its speaker.

    Args:
        word_speaker_mapping (list): A list of dictionaries mapping words to speakers with timestamps.
        spk_ts (list): A list of lists containing speaker start and end times and speaker IDs.

    Returns:
        list: A list of dictionaries, each representing a sentence with the speaker, start time, end time, and text.
    """
    # Detects whether appending the next word would cross a sentence boundary
    sentence_checker = nltk.tokenize.PunktSentenceTokenizer().text_contains_sentbreak
    s, e, spk = spk_ts[0]
    prev_spk = spk
    snts = []
    snt = {"speaker": f"Speaker {spk}", "start_time": s, "end_time": e, "text": ""}
    for wrd_dict in word_speaker_mapping:
        wrd, spk = wrd_dict["word"], wrd_dict["speaker"]
        s, e = wrd_dict["start_time"], wrd_dict["end_time"]
        # Start a new sentence when the speaker changes or a sentence break is detected
        if spk != prev_spk or sentence_checker(snt["text"] + " " + wrd):
            snts.append(snt)
            snt = {
                "speaker": f"Speaker {spk}",
                "start_time": s,
                "end_time": e,
                "text": "",
            }
        else:
            snt["end_time"] = e
        snt["text"] += wrd + " "
        prev_spk = spk
    snts.append(snt)
    return snts
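Continuing the sketch above, grouping the word-level mapping into speaker-attributed sentences (the exact output depends on the Punkt tokenizer's punctuation heuristics; what follows is illustrative):

sentences = get_sentences_speaker_mapping(mapping, speaker_turns)
# [{'speaker': 'Speaker 0', 'start_time': 0,    'end_time': 900,  'text': 'Hello there. '},
#  {'speaker': 'Speaker 1', 'start_time': 1200, 'end_time': 1600, 'text': 'Hi. '}]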
def get_speaker_aware_transcript(sentences_speaker_mapping, f):
    """
    Write a speaker-aware transcript to the provided file object.

    Args:
        sentences_speaker_mapping (list): A list of dictionaries representing sentences with speakers and timestamps.
        f (file object): The file object where the transcript will be written.
    """
    previous_speaker = sentences_speaker_mapping[0]["speaker"]
    f.write(f"{previous_speaker}: ")

    for sentence_dict in sentences_speaker_mapping:
        speaker = sentence_dict["speaker"]
        sentence = sentence_dict["text"]

        # If this speaker doesn't match the previous one, start a new paragraph
        if speaker != previous_speaker:
            f.write(f"\n\n{speaker}: ")
            previous_speaker = speaker

        # No matter what, write the current sentence
        f.write(sentence + " ")
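Finally, a sketch of writing the transcript, here to an in-memory buffer rather than a file on disk:

import io

buf = io.StringIO()
get_speaker_aware_transcript(sentences, buf)
print(buf.getvalue())
# Speaker 0: Hello there.
#
# Speaker 1: Hi.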