# Source code for multivoice.lib.stt_config

# src/multivoice/lib/stt_config.py

import json
import os
from omegaconf import OmegaConf



# [docs]
def create_config(output_dir):
    """
    Build the speaker-diarization configuration and its input manifest.

    The telephonic inference YAML is loaded from the local
    ``nemo_msdd_configs`` directory (resolved relative to the current working
    directory). A single-entry manifest pointing at ``mono_file.wav`` inside
    ``output_dir`` is written to ``<output_dir>/data/input_manifest.json``,
    and the loaded configuration is then overridden with the manifest path,
    output directory, model names and VAD thresholds used for inference.

    Args:
        output_dir (str): Directory that holds ``mono_file.wav`` and that
            receives the ``data`` sub-directory, the manifest and the
            diarizer outputs.

    Returns:
        The configuration object returned by ``OmegaConf.load`` with the
        overrides below applied.
    """
    domain = "telephonic"
    yaml_path = os.path.join("nemo_msdd_configs", f"diar_infer_{domain}.yaml")

    # The YAML is read before anything is written, so a missing config file
    # raises without touching output_dir.
    cfg = OmegaConf.load(yaml_path)

    # The manifest lives in <output_dir>/data; create that directory on demand.
    manifest_dir = os.path.join(output_dir, "data")
    os.makedirs(manifest_dir, exist_ok=True)
    manifest_path = os.path.join(manifest_dir, "input_manifest.json")

    # One manifest record describing the audio file to process.
    entry = dict(
        audio_filepath=os.path.join(output_dir, "mono_file.wav"),
        offset=0,
        duration=None,
        label="infer",
        text="-",
        rttm_filepath=None,
        uem_filepath=None,
    )

    # The manifest is a single JSON object followed by a newline.
    with open(manifest_path, "w") as manifest:
        manifest.write(json.dumps(entry) + "\n")

    # Zero workers: data loading is not spread over worker processes.
    cfg.num_workers = 0

    diarizer = cfg.diarizer

    # Where the diarizer reads its input and where it stores intermediate
    # files and prediction outputs.
    diarizer.manifest_filepath = manifest_path
    diarizer.out_dir = output_dir

    # Pretrained speaker model used for embedding extraction.
    diarizer.speaker_embeddings.model_path = "titanet_large"

    # Oracle VAD would take speech timestamps from RTTM files listed in the
    # manifest; it is off, so the VAD model configured below is used instead.
    diarizer.oracle_vad = False

    # Do not assume a fixed, externally supplied number of speakers.
    diarizer.clustering.parameters.oracle_num_speakers = False

    # Pretrained VAD model and its speech-activity thresholds.
    diarizer.vad.model_path = "vad_multilingual_marblenet"
    vad_params = diarizer.vad.parameters
    vad_params.onset = 0.8
    vad_params.offset = 0.6
    vad_params.pad_offset = -0.05

    # Telephonic speaker diarization (MSDD) model.
    diarizer.msdd_model.model_path = "diar_msdd_telephonic"

    return cfg