# src/multivoice/identify_voice.py
import argparse
import logging
import os
import pickle
import torch
import torchaudio
from sklearn.metrics.pairwise import cosine_similarity
from speechbrain.inference.speaker import SpeakerRecognition
EMBED_FILE = "speaker_vectors.pkl"
# Speaker identification against the embeddings stored in EMBED_FILE.
class SpeakerVerifierCLI:
    """Identify which enrolled speaker, if any, is talking in an audio file.

    Enrolled speakers are read from the pickle file named by ``EMBED_FILE``;
    the embedding of the probe recording is compared against each of them with
    cosine similarity.
    """

    # The SpeechBrain ECAPA VoxCeleb model card states the model was trained on
    # 16 kHz single-channel audio and that callers of ``encode_batch`` must
    # supply audio at that rate themselves.
    SAMPLE_RATE = 16000

    def __init__(self, threshold=0.50):
        """
        Sets up the device, the enrolled speaker embeddings, the similarity
        threshold and the pretrained speaker recognition model.

        Args:
            threshold (float): Minimum cosine similarity for the best match to
                be accepted as a known speaker. Defaults to 0.50.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.speakers = self.load_embeddings()
        self.threshold = threshold
        self.verifier = SpeakerRecognition.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb",
            run_opts={"device": str(self.device)},
        )

    def load_embeddings(self):
        """
        Loads speaker embeddings from the pickle file ``EMBED_FILE``.

        Returns:
            dict: Whatever object the pickle file contains (expected to be a
                mapping of speaker ID to embedding), or an empty dictionary if
                the file does not exist.
        """
        # SECURITY NOTE: unpickling executes arbitrary code embedded in the
        # file. EMBED_FILE must only ever be a file this application wrote
        # itself; never point it at data received from an untrusted source.
        try:
            with open(EMBED_FILE, "rb") as f:
                return pickle.load(f)
        except FileNotFoundError:
            logging.debug("Embeddings file does not exist.")
            return {}

    def extract_embedding(self, file_path):
        """
        Computes a speaker embedding for an audio file.

        NOTE(review): ``identify_voice`` relied on this method but the class
        did not define it. The preprocessing below (mono downmix, resampling to
        ``SAMPLE_RATE``) follows the SpeechBrain model card; confirm it matches
        the code that produced the embeddings stored in ``EMBED_FILE``, since
        both sides must be computed the same way for the scores to be
        meaningful.

        Args:
            file_path (str): The path to the audio file.

        Returns:
            numpy.ndarray or None: The embedding with singleton dimensions
                removed, or None if the file could not be read or is empty.
        """
        try:
            signal, sample_rate = torchaudio.load(file_path)
        except (OSError, RuntimeError) as err:
            logging.warning("Could not load audio from %s: %s", file_path, err)
            return None

        if signal.numel() == 0:
            logging.warning("Audio file %s contains no samples.", file_path)
            return None

        # torchaudio.load returns (channels, time). Downmix so that the
        # channel axis is not mistaken for a batch of separate utterances.
        if signal.shape[0] > 1:
            signal = signal.mean(dim=0, keepdim=True)

        if sample_rate != self.SAMPLE_RATE:
            signal = torchaudio.functional.resample(
                signal, sample_rate, self.SAMPLE_RATE
            )

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            embedding = self.verifier.encode_batch(signal)

        return embedding.squeeze().cpu().numpy()

    def identify_voice(self, file_path):
        """
        Identifies a speaker from an audio file by comparing its embedding to
        known speakers.

        Args:
            file_path (str): The path to the audio file for identification.

        Returns:
            str or None: The ID of the best-matching speaker if its score meets
                the threshold, otherwise None (also when no speakers are
                enrolled or no embedding could be extracted).
        """
        # Check the cheap condition first: without enrolled speakers there is
        # nothing to compare against, so skip decoding the audio entirely.
        if not self.speakers:
            logging.debug("No embedding or no speakers.")
            return None

        emb = self.extract_embedding(file_path)
        if emb is None:
            logging.debug("No embedding or no speakers.")
            return None

        probe = emb.reshape(1, -1)
        sims = {
            user: cosine_similarity(probe, ref.reshape(1, -1))[0][0]
            for user, ref in self.speakers.items()
        }
        best_user = max(sims, key=sims.get)
        score = sims[best_user]

        if score >= self.threshold:
            logging.debug("Speaker matched: %s (Score: %.2f)", best_user, score)
            return best_user

        logging.debug("Unknown speaker (Best score: %.2f)", score)
        return None
# Command-line entry point.
def main():
    """
    Parses the command line arguments, runs speaker identification on the given
    file and prints the outcome.

    The outcome is printed because ``identify_voice`` only reports it through
    DEBUG-level log messages, which are not shown unless logging has been
    configured; without the print the command would produce no output.
    """
    parser = argparse.ArgumentParser(description="Identify a Speaker from a File")
    parser.add_argument("FILE_PATH", help="Path to the audio file for identification")
    args = parser.parse_args()

    verifier_cli = SpeakerVerifierCLI()
    voice_id = verifier_cli.identify_voice(args.FILE_PATH)

    if voice_id is None:
        print("Unknown speaker")
    else:
        print(f"Speaker: {voice_id}")
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()