Source code for multivoice.identify_voice

# src/multivoice/identify_voice.py

import argparse
import logging
import os
import pickle
import torch
import torchaudio
from sklearn.metrics.pairwise import cosine_similarity
from speechbrain.inference.speaker import SpeakerRecognition

# Path of the pickle file holding the registered speaker embeddings.
# It is a relative path, so it is resolved against the current working directory.
EMBED_FILE = "speaker_vectors.pkl"


class SpeakerVerifierCLI:
    """
    Identifies a speaker by comparing the embedding of an audio file against
    the embeddings stored in ``EMBED_FILE``.

    Embeddings are produced by the ``speechbrain/spkrec-ecapa-voxceleb`` model
    and compared with cosine similarity; the best match is accepted only when
    its score reaches ``self.threshold``.
    """

    # Sample rate (Hz) every clip is resampled to before it is embedded.
    TARGET_SAMPLE_RATE = 16000
    # Minimum accepted clip length, in seconds.
    MIN_DURATION_SECONDS = 2

    def __init__(self, threshold=0.50):
        """
        Initializes the verifier with a device setting, loaded speaker
        embeddings, a similarity threshold, and a speaker recognition model.

        Args:
            threshold (float): Minimum cosine similarity for a match to be
                accepted. Defaults to 0.50.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.speakers = self.load_embeddings()
        self.threshold = threshold
        self.verifier = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
            run_opts={"device": str(self.device)},
        )

    def load_embeddings(self):
        """
        Loads speaker embeddings from a pickle file if it exists.

        Returns:
            dict: A dictionary containing speaker IDs and their corresponding
                embeddings. Returns an empty dictionary if the file does not
                exist.
        """
        if not os.path.exists(EMBED_FILE):
            logging.debug("Embeddings file does not exist.")
            return {}

        # SECURITY: pickle.load can execute arbitrary code while unpickling.
        # Only load an embeddings file that this application wrote itself.
        with open(EMBED_FILE, "rb") as f:
            return pickle.load(f)

    @classmethod
    def _meets_min_duration(cls, num_samples, sample_rate):
        """Returns True if num_samples at sample_rate spans the minimum clip length."""
        return num_samples >= cls.MIN_DURATION_SECONDS * sample_rate

    def extract_embedding(self, audio_path):
        """
        Extracts a speaker embedding from an audio file.

        Args:
            audio_path (str): The path to the audio file.

        Returns:
            numpy.ndarray or None: A numpy array containing the speaker
                embedding, or None if the clip is too short or extraction
                fails.
        """
        try:
            signal, fs = torchaudio.load(audio_path)

            # The duration check must use the file's own sample rate: a fixed
            # sample count only corresponds to 2 seconds at 16 kHz.
            if not self._meets_min_duration(signal.shape[1], fs):
                logging.warning("Voice is too short, at least 2 seconds required.")
                return None

            # Downmix multi-channel audio to mono. Otherwise each channel is
            # treated as a separate batch item and the result is several
            # embeddings instead of one.
            if signal.shape[0] > 1:
                signal = signal.mean(dim=0, keepdim=True)

            if fs != self.TARGET_SAMPLE_RATE:
                logging.debug(
                    f"Resampling audio from {fs} Hz to {self.TARGET_SAMPLE_RATE} Hz"
                )
                signal = torchaudio.functional.resample(
                    signal, fs, self.TARGET_SAMPLE_RATE
                )

            signal = signal.to(self.device)
            emb = self.verifier.encode_batch(signal).squeeze(0).detach().cpu().numpy()
        except Exception as e:
            # Best-effort boundary: any loading or model failure is reported
            # and turned into "no embedding" rather than crashing the caller.
            logging.error(f"Error extracting embedding: {e}")
            emb = None
        return emb

    def identify_voice(self, file_path):
        """
        Identifies a speaker from an audio file by comparing its embedding to
        known speakers.

        Args:
            file_path (str): The path to the audio file for identification.

        Returns:
            str or None: The ID of the identified speaker if the score meets
                the threshold, otherwise returns None.
        """
        # With no registered speakers there is nothing to compare against, so
        # skip the embedding extraction entirely.
        if not self.speakers:
            logging.debug("No embedding or no speakers.")
            return None

        emb = self.extract_embedding(file_path)
        if emb is None:
            logging.debug("No embedding or no speakers.")
            return None

        query = emb.reshape(1, -1)
        sims = {
            user: cosine_similarity(query, ref.reshape(1, -1))[0][0]
            for user, ref in self.speakers.items()
        }
        best_user = max(sims, key=sims.get)
        score = sims[best_user]

        if score >= self.threshold:
            logging.debug(f"Speaker matched: {best_user} (Score: {score:.2f})")
            return best_user

        logging.debug(f"Unknown speaker (Best score: {score:.2f})")
        return None
def main():
    """
    Main function to parse command line arguments, run the speaker
    identification and print the outcome.

    The result is written to standard output because identify_voice only
    reports it through DEBUG-level log records, which are not shown under the
    default logging configuration.
    """
    parser = argparse.ArgumentParser(description="Identify a Speaker from a File")
    parser.add_argument("FILE_PATH", help="Path to the audio file for identification")
    args = parser.parse_args()

    verifier_cli = SpeakerVerifierCLI()
    voice_id = verifier_cli.identify_voice(args.FILE_PATH)
    if voice_id is None:
        print("Unknown speaker")
    else:
        print(f"Identified speaker: {voice_id}")
# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()