Source code for multivoice.lib.stt_config

# src/multivoice/lib/stt_config.py

import json
import os
from omegaconf import OmegaConf


def create_config(output_dir):
    """
    Build the speaker-diarization configuration used for inference.

    The base settings are read from ``nemo_msdd_configs/diar_infer_telephonic.yaml``
    (a relative path). A one-entry input manifest describing
    ``<output_dir>/mono_file.wav`` is then written to
    ``<output_dir>/data/input_manifest.json``, and the loaded configuration is
    overridden to point at that manifest, at ``output_dir``, and at the
    pretrained VAD / speaker-embedding / MSDD models.

    Args:
        output_dir (str): Directory that holds ``mono_file.wav`` and receives the
            ``data`` sub-directory, the manifest and the diarizer outputs.

    Returns:
        The configuration object produced by ``OmegaConf.load`` with the
        overrides described above applied.
    """
    domain = "telephonic"
    yaml_path = os.path.join("nemo_msdd_configs", f"diar_infer_{domain}.yaml")

    # Load the base configuration first: if the YAML is missing, nothing is
    # created on disk.
    config = OmegaConf.load(yaml_path)

    # Make sure <output_dir>/data exists; the manifest lives inside it.
    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)
    manifest_path = os.path.join(data_dir, "input_manifest.json")

    # Single manifest record describing the audio file to diarize.
    manifest_entry = {
        "audio_filepath": os.path.join(output_dir, "mono_file.wav"),
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }
    with open(manifest_path, "w") as manifest_file:
        manifest_file.write(json.dumps(manifest_entry))
        manifest_file.write("\n")

    vad_model_name = "vad_multilingual_marblenet"
    speaker_model_name = "titanet_large"

    # Zero workers, as in the original setup (single-threaded processing).
    config.num_workers = 0

    diarizer = config.diarizer

    # Input manifest, and the directory for intermediate files and predictions.
    diarizer.manifest_filepath = manifest_path
    diarizer.out_dir = output_dir

    # Pretrained model used to extract speaker embeddings.
    diarizer.speaker_embeddings.model_path = speaker_model_name

    # Oracle VAD would take speech timestamps from RTTM files in the manifest;
    # turn it off so VAD is computed with the model configured below.
    diarizer.oracle_vad = False

    # Do not use a fixed (oracle) number of speakers for clustering.
    diarizer.clustering.parameters.oracle_num_speakers = False

    # Pretrained NeMo VAD model and its speech-activity-detection parameters.
    vad = diarizer.vad
    vad.model_path = vad_model_name
    vad.parameters.onset = 0.8
    vad.parameters.offset = 0.6
    vad.parameters.pad_offset = -0.05

    # Telephonic speaker diarization (MSDD) model.
    diarizer.msdd_model.model_path = "diar_msdd_telephonic"

    return config