# src/multivoice/lib/stt_config.py
import json
import os
from omegaconf import OmegaConf
def create_config(output_dir):
    """Create the configuration used for speaker diarization.

    Loads ``nemo_msdd_configs/diar_infer_telephonic.yaml``, creates
    ``<output_dir>/data``, writes a one-line input manifest describing
    ``<output_dir>/mono_file.wav`` into it, and overrides the loaded
    configuration with the manifest path, the output directory, and the
    pretrained VAD, speaker-embedding and MSDD model names.

    Note:
        The YAML path is relative, so it is resolved against the current
        working directory. The file must already exist; this function does
        not create or download it.

    Args:
        output_dir (str): The directory where configuration files and
            processed data will be stored. The mono audio file is expected
            at ``<output_dir>/mono_file.wav``.

    Returns:
        OmegaConf.DictConfig: A configuration object containing all
        necessary settings for speaker diarization.
    """
    domain_type = "telephonic"
    config_local_directory = "nemo_msdd_configs"
    config_file_name = f"diar_infer_{domain_type}.yaml"
    model_config_path = os.path.join(config_local_directory, config_file_name)

    # Load the base configuration file.
    config = OmegaConf.load(model_config_path)

    # Create the data directory if it does not exist.
    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    # Build the manifest path once so that the file written below and the
    # path stored in the configuration can never drift apart.
    manifest_path = os.path.join(data_dir, "input_manifest.json")

    meta = {
        "audio_filepath": os.path.join(output_dir, "mono_file.wav"),
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }

    # Write the metadata to the input manifest file: a single JSON object
    # followed by a newline.
    with open(manifest_path, "w", encoding="utf-8") as fp:
        json.dump(meta, fp)
        fp.write("\n")

    pretrained_vad = "vad_multilingual_marblenet"
    pretrained_speaker_model = "titanet_large"

    # Set the number of workers to 0 for single-threaded processing.
    config.num_workers = 0

    # Point the diarizer at the input manifest and at the directory used to
    # store intermediate files and prediction outputs.
    config.diarizer.manifest_filepath = manifest_path
    config.diarizer.out_dir = output_dir

    # Set the pretrained speaker model for embeddings extraction.
    config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model

    # Disable oracle VAD (which would use RTTM files from the manifest to get
    # speech activity timestamps); VAD is computed with the model below.
    config.diarizer.oracle_vad = False

    # Disable oracle_num_speakers, which specifies whether to use a fixed
    # number of speakers.
    config.diarizer.clustering.parameters.oracle_num_speakers = False

    # Set the pretrained NeMo VAD model and its parameters for speech
    # activity detection.
    config.diarizer.vad.model_path = pretrained_vad
    config.diarizer.vad.parameters.onset = 0.8
    config.diarizer.vad.parameters.offset = 0.6
    config.diarizer.vad.parameters.pad_offset = -0.05

    # Set the telephonic speaker diarization (MSDD) model.
    config.diarizer.msdd_model.model_path = "diar_msdd_telephonic"

    return config