Source code for multivoice.identify_voice

# src/multivoice/identify_voice.py

import argparse
import logging
import os
import pickle
import torch
import torchaudio
from sklearn.metrics.pairwise import cosine_similarity
from speechbrain.inference.speaker import SpeakerRecognition

# Path of the pickle file holding the enrolled speaker embeddings. It is a
# relative path, so it is resolved against the current working directory.
EMBED_FILE = "speaker_vectors.pkl"



[docs]
class SpeakerVerifierCLI:
    """Identify a speaker by comparing a voice embedding to enrolled ones.

    Embeddings are produced by the SpeechBrain model
    ``speechbrain/spkrec-ecapa-voxceleb`` and compared to the stored
    reference embeddings with cosine similarity.
    """

    # Sample rate (Hz) the audio is resampled to before it is encoded.
    TARGET_SAMPLE_RATE = 16000
    # Shortest audio duration (seconds) accepted for embedding extraction.
    MIN_DURATION_SECONDS = 2

    def __init__(self):
        """
        Initializes the SpeakerVerifierCLI class with a device setting, loaded speaker embeddings,
        a similarity threshold, and a speaker recognition verifier.
        """
        # Prefer the GPU when one is available, otherwise run on the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.speakers = self.load_embeddings()
        # Minimum cosine similarity for a voice to count as a match.
        self.threshold = 0.50
        self.verifier = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
            run_opts={"device": str(self.device)},
        )

    def load_embeddings(self):
        """
        Loads speaker embeddings from the pickle file ``EMBED_FILE``.

        Returns:
            dict: The unpickled content of the file, expected to map speaker IDs
                  to their embeddings. Returns an empty dictionary if the file
                  does not exist.
        """
        # Open first and handle the missing-file case, instead of checking for
        # existence beforehand: this avoids a race between check and open.
        try:
            handle = open(EMBED_FILE, "rb")
        except FileNotFoundError:
            logging.debug("Embeddings file does not exist.")
            return {}
        with handle:
            # NOTE: unpickling can execute arbitrary code; the embeddings file
            # must come from a trusted source.
            return pickle.load(handle)

    @classmethod
    def _has_min_duration(cls, num_samples, sample_rate):
        """
        Tells whether a recording is long enough to be processed.

        Args:
            num_samples (int): Number of samples per channel.
            sample_rate (int): Sample rate of the recording in Hz.

        Returns:
            bool: True if the recording lasts at least ``MIN_DURATION_SECONDS``.
        """
        return num_samples >= cls.MIN_DURATION_SECONDS * sample_rate

    def extract_embedding(self, audio_path):
        """
        Extracts a speaker embedding from an audio file.

        Args:
            audio_path (str): The path to the audio file.

        Returns:
            numpy.ndarray or None: A numpy array containing the speaker embedding,
                                 or None if the audio is too short or extraction fails.
        """
        try:
            signal, fs = torchaudio.load(audio_path)
            # The duration must be measured at the file's own sample rate: the
            # sample count only equals "2 seconds" at 32000 when fs is 16 kHz.
            if not self._has_min_duration(signal.shape[1], fs):
                logging.warning("Voice is too short, at least 2 seconds required.")
                return None
            if fs != self.TARGET_SAMPLE_RATE:
                logging.debug(f"Resampling audio from {fs} Hz to 16000 Hz")
                signal = torchaudio.functional.resample(
                    signal, fs, self.TARGET_SAMPLE_RATE
                )

            # NOTE(review): multi-channel audio is passed through unchanged;
            # presumably each channel is then encoded as a separate batch item.
            # Confirm whether a mono downmix is wanted here.
            signal = signal.to(self.device)

            emb = self.verifier.encode_batch(signal).squeeze(0).detach().cpu().numpy()
        except Exception as e:
            # Deliberate best effort: any failure is logged and reported to the
            # caller as "no embedding" rather than raised.
            logging.error(f"Error extracting embedding: {e}")
            emb = None

        return emb

    def identify_voice(self, file_path):
        """
        Identifies a speaker from an audio file by comparing its embedding to known speakers.

        Args:
            file_path (str): The path to the audio file for identification.

        Returns:
            str or None: The ID of the identified speaker if the score meets the threshold,
                         otherwise returns None. None is also returned when no embedding
                         could be extracted or no speakers are enrolled.
        """
        emb = self.extract_embedding(file_path)
        if emb is None or not self.speakers:
            logging.debug("No embedding or no speakers.")
            return None

        # Flatten once; the same query row is compared to every reference.
        query = emb.reshape(1, -1)
        sims = {
            user: cosine_similarity(query, ref.reshape(1, -1))[0][0]
            for user, ref in self.speakers.items()
        }
        best_user, score = max(sims.items(), key=lambda item: item[1])

        if score >= self.threshold:
            logging.debug(f"Speaker matched: {best_user} (Score: {score:.2f})")
            return best_user

        logging.debug(f"Unknown speaker (Best score: {score:.2f})")
        return None





[docs]
def main(argv=None):
    """
    Main function to parse command line arguments and initiate the speaker identification process.

    The outcome is printed to standard output: the ID of the matched speaker,
    or "Unknown speaker" when no enrolled speaker matches.

    Args:
        argv (list[str] or None): Command line arguments to parse. When None,
            argparse reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description="Identify a Speaker from a File")
    parser.add_argument("FILE_PATH", help="Path to the audio file for identification")

    args = parser.parse_args(argv)

    verifier_cli = SpeakerVerifierCLI()
    voice_id = verifier_cli.identify_voice(args.FILE_PATH)
    # The result is otherwise only logged at DEBUG level, which is hidden by
    # default, so report it explicitly to the user.
    print(voice_id if voice_id is not None else "Unknown speaker")



# Run the command line interface only when the module is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()