Source code for multivoice.lib.stt_diarize_audio

# src/multivoice/lib/stt_diarize_audio.py

import logging
import os
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
from multivoice.lib.stt_config import create_config


def _parse_rttm_line(line):
    """Convert one RTTM ``SPEAKER`` record into ``[start_ms, end_ms, speaker_id]``.

    Args:
        line (str): A single non-blank line from an RTTM file.

    Returns:
        list: ``[start_ms, end_ms, speaker_id]`` with times as integer
            milliseconds and the speaker ID as an integer.

    Raises:
        IndexError: If the line has fewer than eight whitespace-separated fields.
        ValueError: If the time fields are not numeric or the speaker label
            does not end in ``_<integer>``.
    """
    # Split on any run of whitespace so the column positions do not depend on
    # how many spaces the writer used to pad the time fields. With that split
    # the standard RTTM layout puts onset at index 3, duration at index 4 and
    # the speaker name at index 7.
    fields = line.split()
    start_ms = int(float(fields[3]) * 1000)  # Onset: seconds -> milliseconds
    # Duration is truncated to whole milliseconds separately and then added to
    # the already-truncated onset.
    end_ms = start_ms + int(float(fields[4]) * 1000)
    # Speaker labels look like "<prefix>_<n>"; keep only the trailing integer.
    speaker_id = int(fields[7].split("_")[-1])
    return [start_ms, end_ms, speaker_id]


def diarize_audio(temp_path, device):
    """
    Diarizes the audio to identify speaker turns.

    Runs the NeMo MSDD diarizer configured from ``temp_path`` and then parses
    the RTTM file it is expected to leave at
    ``<temp_path>/pred_rttms/mono_file.rttm``.

    Args:
        temp_path (str): The path where temporary files will be stored.
        device (str): The device to use for processing ('cpu' or 'cuda').

    Returns:
        list: A list of ``[start_ms, end_ms, speaker_id]`` lists, one per
            segment, in the order the segments appear in the RTTM file.

    Raises:
        FileNotFoundError: If the RTTM output file does not exist after
            diarization.
        IndexError, ValueError: If a non-blank RTTM line is malformed.
    """
    logging.debug("Diarizing the audio...")
    logging.debug("temp_path: %s", temp_path)

    # Initialize NeMo MSDD diarization model with configuration created from
    # the temporary path
    msdd_model = NeuralDiarizer(cfg=create_config(temp_path)).to(device)
    msdd_model.diarize()  # Perform diarization

    # Read the output RTTM file to extract speaker segments
    rttm_path = os.path.join(temp_path, "pred_rttms", "mono_file.rttm")
    with open(rttm_path, "r", encoding="utf-8") as f:
        # Stream the file line by line and ignore blank lines (e.g. a trailing
        # newline at the end of the file) instead of failing on them.
        speaker_ts = [_parse_rttm_line(line) for line in f if line.strip()]

    return speaker_ts