# src/multivoice/lib/stt_speaker_mapping.py
import nltk


def get_word_ts_anchor(s, e, option="start"):
    """
    Determine the anchor time for a word based on the specified option.

    Args:
        s (int): Start time of the word in milliseconds.
        e (int): End time of the word in milliseconds.
        option (str): The option to determine the anchor time ('start', 'mid', or 'end').

    Returns:
        int or float: The anchor time for the word based on the option.
    """
    if option == "end":
        return e
    elif option == "mid":
        return (s + e) / 2
    return s
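
# Illustrative anchor choices for a hypothetical word spanning 1000-2000 ms:
#
#     get_word_ts_anchor(1000, 2000)          # 1000   ("start", the default)
#     get_word_ts_anchor(1000, 2000, "mid")   # 1500.0
#     get_word_ts_anchor(1000, 2000, "end")   # 2000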


def get_words_speaker_mapping(wrd_ts, spk_ts, word_anchor_option="start"):
    """
    Map each word to its corresponding speaker based on time segments.

    Args:
        wrd_ts (list): A list of dictionaries containing word timestamps (in seconds) and text.
        spk_ts (list): A list of lists containing speaker start and end times (in milliseconds) and speaker IDs.
        word_anchor_option (str): The option to determine the anchor time for words ('start', 'mid', or 'end').

    Returns:
        list: A list of dictionaries mapping each word to its speaker, with timestamps (in milliseconds) and text.
    """
    s, e, sp = spk_ts[0]
    wrd_pos, turn_idx = 0, 0
    wrd_spk_mapping = []
    for wrd_dict in wrd_ts:
        ws, we, wrd = (
            int(wrd_dict["start"] * 1000),
            int(wrd_dict["end"] * 1000),
            wrd_dict["text"],
        )
        wrd_pos = get_word_ts_anchor(ws, we, word_anchor_option)
        # Advance through speaker turns until the word anchor falls inside the
        # current turn; clamp to the last turn so trailing words are not dropped.
        while wrd_pos > float(e):
            turn_idx += 1
            turn_idx = min(turn_idx, len(spk_ts) - 1)
            s, e, sp = spk_ts[turn_idx]
            if turn_idx == len(spk_ts) - 1:
                e = get_word_ts_anchor(ws, we, option="end")
        wrd_spk_mapping.append(
            {"word": wrd, "start_time": ws, "end_time": we, "speaker": sp}
        )
    return wrd_spk_mapping
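
# Illustrative call (hypothetical values): word timestamps arrive in seconds,
# speaker turns in milliseconds, and the returned mapping is in milliseconds.
#
#     get_words_speaker_mapping(
#         [{"start": 0.10, "end": 0.45, "text": "hello"}],
#         [[0, 1500, 0]],
#     )
#     # -> [{"word": "hello", "start_time": 100, "end_time": 450, "speaker": 0}]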


def get_sentences_speaker_mapping(word_speaker_mapping, spk_ts):
    """
    Group words into sentences and map each sentence to its speaker.

    Args:
        word_speaker_mapping (list): A list of dictionaries mapping words to speakers with timestamps.
        spk_ts (list): A list of lists containing speaker start and end times and speaker IDs.

    Returns:
        list: A list of dictionaries, each representing a sentence with the speaker, start time, end time, and text.
    """
    sentence_checker = nltk.tokenize.PunktSentenceTokenizer().text_contains_sentbreak
    s, e, spk = spk_ts[0]
    prev_spk = spk
    snts = []
    snt = {"speaker": f"Speaker {spk}", "start_time": s, "end_time": e, "text": ""}
    for wrd_dict in word_speaker_mapping:
        wrd, spk = wrd_dict["word"], wrd_dict["speaker"]
        s, e = wrd_dict["start_time"], wrd_dict["end_time"]
        # Start a new sentence when the speaker changes or when appending the word
        # would cross a sentence break according to the Punkt tokenizer.
        if spk != prev_spk or sentence_checker(snt["text"] + " " + wrd):
            snts.append(snt)
            snt = {
                "speaker": f"Speaker {spk}",
                "start_time": s,
                "end_time": e,
                "text": "",
            }
        else:
            snt["end_time"] = e
        snt["text"] += wrd + " "
        prev_spk = spk
    snts.append(snt)
    return snts
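
# Each returned entry is one sentence attributed to one speaker, e.g.
# (hypothetical values; note the trailing space left by word concatenation):
#
#     {"speaker": "Speaker 0", "start_time": 0, "end_time": 900, "text": "Hello there. "}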


def get_speaker_aware_transcript(sentences_speaker_mapping, f):
    """
    Write a speaker-aware transcript to the provided file object.

    Args:
        sentences_speaker_mapping (list): A list of dictionaries representing sentences with speakers and timestamps.
        f (file object): The file object where the transcript will be written.
    """
    previous_speaker = sentences_speaker_mapping[0]["speaker"]
    f.write(f"{previous_speaker}: ")

    for sentence_dict in sentences_speaker_mapping:
        speaker = sentence_dict["speaker"]
        sentence = sentence_dict["text"]

        # If this speaker doesn't match the previous one, start a new paragraph
        if speaker != previous_speaker:
            f.write(f"\n\n{speaker}: ")
            previous_speaker = speaker

        # No matter what, write the current sentence
        f.write(sentence + " ")
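

if __name__ == "__main__":
    # Minimal end-to-end sketch with invented data: two hypothetical speaker
    # turns and a handful of hypothetical word timestamps. Word times are in
    # seconds (as produced by the STT stage); speaker turns are in milliseconds.
    import sys

    demo_word_ts = [
        {"start": 0.10, "end": 0.45, "text": "Hello"},
        {"start": 0.50, "end": 0.90, "text": "there."},
        {"start": 1.60, "end": 1.95, "text": "Hi,"},
        {"start": 2.00, "end": 2.40, "text": "how"},
        {"start": 2.45, "end": 2.80, "text": "are"},
        {"start": 2.85, "end": 3.20, "text": "you?"},
    ]
    demo_spk_ts = [[0, 1500, 0], [1500, 3500, 1]]

    words = get_words_speaker_mapping(demo_word_ts, demo_spk_ts)
    sentences = get_sentences_speaker_mapping(words, demo_spk_ts)
    get_speaker_aware_transcript(sentences, sys.stdout)
    sys.stdout.write("\n")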