# Source code for multivoice.stt_dir

# src/multivoice/stt_dir.py

import argparse
import logging
import natsort
import os
import subprocess
import sys
from pathlib import Path

# Progress bars stay silent unless a debug/verbose switch appears verbatim
# on the command line.
disable_tqdm = not any(
    flag in ("--debug", "-D", "--verbose", "-v") for flag in sys.argv
)

# Exported this early so the setting is already in the environment when the
# imports further down are loaded.
if disable_tqdm:
    os.environ["TQDM_DISABLE"] = "1"

# Crufty workaround because disabling tqdm needs to be before these imports
from torch import cuda
from multivoice._version import __version__
from multivoice.lib.stt_langs import whisper_langs



[docs]
def parse_arguments(argv=None):
    """Parse command line arguments for directory processing.

    Builds the argument parser for the directory-processing script and
    parses either the supplied argument list or the process command line.

    Args:
        argv (list[str] | None): Argument strings to parse, without the
            program name. When None (the default) argparse reads
            ``sys.argv[1:]``, which is what the script entry point relies on.

    Returns:
        argparse.Namespace: An object containing all parsed arguments as attributes.
    """
    parser = argparse.ArgumentParser(description="Process audio files in a directory.")

    parser.add_argument("--dir", help="Top-level directory to process.", required=True)

    parser.add_argument(
        "--batch-size",
        type=int,
        dest="batch_size",
        default=8,
        help="Batch size for batched inference, reduce if you run out of memory, "
        "set to 0 for original whisper longform inference (default: 8)",
    )

    parser.add_argument(
        "-D",
        "--debug",
        help="Debugging",
        action="store_true",
    )

    parser.add_argument(
        "--dest",
        dest="dest_dir",
        default="processed_output",
        type=str,
        help="Destination output directory for the final .txt and .srt files (default: processed_output)",
    )

    # The default is resolved when the parser is built, so the CUDA
    # availability check runs on every invocation.
    parser.add_argument(
        "--device",
        dest="device",
        default="cuda" if cuda.is_available() else "cpu",
        help="if you have a GPU use 'cuda', otherwise 'cpu' (default: cuda, if available)",
    )

    # NOTE(review): unless whisper_langs contains the literal "None",
    # choices= rejects it, so the only way to get the None default is to omit
    # the flag. process_directory() then substitutes "en" for that default,
    # which contradicts the help text - confirm the intended behaviour.
    parser.add_argument(
        "--language",
        type=str,
        default=None,
        choices=whisper_langs,
        help="Language spoken in the audio, specify None to perform language detection (default: None)",
    )

    parser.add_argument(
        "--model",
        dest="model_name",
        default="large-v3",
        help="Name of the Whisper model to use (default: large-v3)",
    )

    # store_false: args.stemming is True unless --no-stem is given.
    parser.add_argument(
        "--no-stem",
        action="store_false",
        dest="stemming",
        default=True,
        help="Disables source separation."
        " This helps with long files that don't contain a lot of music.",
    )

    parser.add_argument(
        "--suppress_numerals",
        action="store_true",
        dest="suppress_numerals",
        default=False,
        help="Suppresses Numerical Digits."
        " This helps the diarization accuracy but converts all digits into written text.",
    )

    # action="count": each -v / --verbose occurrence increments args.verbose.
    parser.add_argument(
        "-v",
        "--verbose",
        help="Increase output verbosity",
        action="count",
        default=0,
    )

    parser.add_argument(
        "-V",
        "--version",
        help="Show version",
        action="version",
        version=f"{__version__}",
    )

    return parser.parse_args(argv)




[docs]
def setup_logging(args):
    """
    Set up logging based on command line arguments.

    The root logger level is DEBUG when ``args.debug`` is set, INFO when
    ``args.verbose`` is greater than zero, and ERROR otherwise.

    Args:
        args (argparse.Namespace): The parsed command line arguments. Only the
            ``debug`` and ``verbose`` attributes are read.
    """
    if args.debug:
        log_level = logging.DEBUG
    elif args.verbose > 0:
        log_level = logging.INFO
    else:
        log_level = logging.ERROR

    logging.basicConfig(
        level=log_level, format="%(asctime)s - %(levelname)s - %(message)s"
    )
    # basicConfig() does nothing when the root logger already has handlers
    # (for example if an imported library configured logging first), so the
    # level is applied explicitly as well.
    logging.getLogger().setLevel(log_level)
    # Deliberately no logging.disable(log_level) here: disable() suppresses
    # every record at or below the given level, which would silence exactly
    # the messages this function is meant to enable.




[docs]
def find_deepest_directories(root_directory):
    """Collect every leaf directory underneath a root directory.

    A leaf directory is one for which ``os.walk`` reports no subdirectories.
    The root itself is returned when it has no subdirectories.

    Args:
        root_directory (str): The path to the root directory.

    Returns:
        list: Paths of the leaf directories, in ``os.walk`` order.
    """
    return [
        current
        for current, subdirs, _files in os.walk(root_directory)
        if len(subdirs) == 0
    ]




[docs]
def check_media_file(file_path):
    """Check the media file type using ffprobe.

    Runs ffprobe on the file and collects the ``codec_type`` value of every
    stream it reports (e.g. audio, video).

    Args:
        file_path (str): The path to the media file.

    Returns:
        set or None: The distinct stream types found in the file (an empty
        set when ffprobe reports no streams), or None when ffprobe could not
        be started or exited with a non-zero status. Failures are logged at
        ERROR level.
    """
    probe_command = [
        "ffprobe",
        "-v",
        "error",
        "-show_entries",
        "stream=codec_type",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        file_path,
    ]
    try:
        # stderr is captured separately: ffprobe can emit diagnostics while
        # still exiting 0, and those lines must not end up in the set of
        # stream types.
        result = subprocess.run(
            probe_command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        detail = (e.stderr or "").strip()
        suffix = f" ({detail})" if detail else ""
        logging.error(f"Failed to probe {file_path}: {e}{suffix}")
        return None
    except OSError as e:
        # Raised when the ffprobe executable is missing or cannot be run.
        logging.error(f"Failed to probe {file_path}: {e}")
        return None

    # Skip blank lines so empty output yields an empty set instead of {""}.
    return {line.strip() for line in result.stdout.splitlines() if line.strip()}




[docs]
def find_best_audio_file(directory):
    """Find the best audio file in a directory.

    Candidate files are those whose name ends in one of the recognised media
    extensions (compared case-insensitively). Each candidate is probed with
    ``check_media_file``; files containing only audio streams are preferred,
    and any other file that contains an audio stream is used as a fallback.
    Candidates are visited in sorted name order so the choice is reproducible.

    Args:
        directory (str): The path to the directory to search.

    Returns:
        str or None: The path to the best audio file found, or None if none were found.
    """
    media_extensions = (".mp4", ".mkv", ".webm", ".wav")
    audio_only_files = []
    audio_plus_other_files = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # selected file deterministic across runs and platforms.
    for filename in sorted(os.listdir(directory)):
        if not filename.lower().endswith(media_extensions):
            continue
        file_path = os.path.join(directory, filename)
        stream_types = check_media_file(file_path)
        # None means the probe failed; no "audio" entry means nothing to
        # transcribe.
        if not stream_types or "audio" not in stream_types:
            continue
        if stream_types == {"audio"}:
            audio_only_files.append(file_path)
        else:
            # Audio alongside video, subtitle, data, ... streams.
            audio_plus_other_files.append(file_path)

    if audio_only_files:
        return audio_only_files[0]
    if audio_plus_other_files:
        return audio_plus_other_files[0]
    return None




[docs]
def process_directory(dir_path, dest_dir, args):
    """Process a directory containing media files.

    Finds the best audio file in the given directory and runs the
    `multivoice` command-line tool on it. Output goes to a subdirectory of
    ``dest_dir`` that mirrors the audio file's location relative to
    ``args.dir``; that subdirectory is created if needed. A failed
    transcription is logged and does not raise.

    Args:
        dir_path (str): The path to the directory to process.
        dest_dir (str): The destination output directory for processed files.
        args (argparse.Namespace): The parsed command line arguments. Reads
            ``dir``, ``model_name``, ``language``, ``device``, ``debug`` and
            ``verbose``.
    """
    best_audio_file = find_best_audio_file(dir_path)
    if not best_audio_file:
        logging.warning(f"No suitable audio file found in {dir_path}")
        return
    logging.debug("dir_path: %s", dir_path)
    logging.debug("dest_dir: %s", dest_dir)

    dir_name = os.path.basename(dir_path)
    logging.debug("dir_name: %s", dir_name)
    # Mirror the source layout under dest_dir.
    relative_path = os.path.relpath(os.path.dirname(best_audio_file), start=args.dir)
    logging.debug("relative_path: %s", relative_path)
    dest_subdir = Path(dest_dir) / Path(relative_path)
    logging.debug("dest_subdir: %s", dest_subdir)
    dest_subdir.mkdir(parents=True, exist_ok=True)

    output_base = dest_subdir
    logging.debug("output_base: %s", output_base)

    # NOTE(review): an unset --language is replaced by "en" here, although the
    # option's help text promises language detection - confirm intent.
    # NOTE(review): args.batch_size, args.stemming and args.suppress_numerals
    # are parsed but never forwarded to multivoice - confirm whether the child
    # CLI accepts them.
    command = [
        "multivoice",
        "--model",
        args.model_name,
        "--language",
        args.language or "en",
        "--device",
        args.device,
        "-a",
        best_audio_file,
        "--output",
        str(output_base),
    ]

    # Forward each flag exactly as often as it was given. Passing both the
    # short and the long spelling would double the verbosity count.
    if args.debug:
        command.append("--debug")
    if args.verbose > 0:
        command.extend(["--verbose"] * args.verbose)

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        logging.error(f"Failed to transcribe {best_audio_file}: {e}")




[docs]
def main():
    """Entry point: transcribe the best audio file of every leaf directory.

    Parses the command line, configures logging, then walks the leaf
    directories below ``args.dir`` in natural sort order and hands each one
    to ``process_directory``.
    """
    args = parse_arguments()
    setup_logging(args)

    for leaf_dir in natsort.natsorted(find_deepest_directories(args.dir)):
        process_directory(leaf_dir, args.dest_dir, args)



# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()