Source code for multivoice.lib.stt_timestamps

# src/multivoice/lib/stt_timestamps.py


def format_timestamp(
    milliseconds: float, always_include_hours: bool = False, decimal_marker: str = "."
) -> str:
    """
    Formats a given time in milliseconds into a human-readable timestamp string.

    The input is rounded to the nearest whole millisecond before it is split
    into hours, minutes, seconds and milliseconds, so both ``int`` and
    ``float`` inputs are accepted.

    Args:
        milliseconds (float): The time in milliseconds to be formatted. Must be non-negative.
        always_include_hours (bool, optional): Whether to include hours even if they are zero. Defaults to False.
        decimal_marker (str, optional): The character used as the decimal marker. Defaults to '.'.

    Returns:
        str: A formatted timestamp string in the form of 'HH:MM:SS.mmm' or 'MM:SS.mmm'.

    Raises:
        AssertionError: If ``milliseconds`` is negative.
    """
    # Kept as an assert (rather than a raised ValueError) so the exception
    # type seen by existing callers does not change.
    assert milliseconds >= 0, "non-negative timestamp expected"

    # The ``d`` format code below only accepts integers; a float input would
    # otherwise raise ValueError when the f-string is rendered.
    total_milliseconds = int(round(milliseconds))

    hours, remainder = divmod(total_milliseconds, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1_000)

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{millis:03d}"
def _get_next_start_timestamp(word_timestamps, current_word_index, final_timestamp):
    """
    Retrieves the start timestamp of the next word in a list of word timestamps.

    Words that directly follow the current word and have no ``"start"`` value
    are merged into the current word: their text is appended to the current
    word's ``"word"`` (separated by a space) and their own ``"word"`` is set to
    ``None``. The list entries are modified in place.

    Args:
        word_timestamps (list): A list of dictionaries containing word and its associated timestamps.
        current_word_index (int): The index of the current word in the word_timestamps list.
        final_timestamp (float): The final timestamp to be used if there is no next word with a
            start timestamp. May be None.

    Returns:
        float: The start timestamp of the next word that has one. If there is no such word,
        ``final_timestamp`` is returned; for the last word of the list, the word's own start is
        returned instead when ``final_timestamp`` is None.
    """
    word_count = len(word_timestamps)
    current_word = word_timestamps[current_word_index]

    if current_word_index == word_count - 1:
        # Nothing follows the last word: close it at the final timestamp when
        # one is known, otherwise fall back to its own start.
        if final_timestamp is not None:
            return final_timestamp
        return current_word["start"]

    for next_word_index in range(current_word_index + 1, word_count):
        next_word = word_timestamps[next_word_index]
        next_start = next_word.get("start")
        if next_start is not None:
            return next_start

        # The next word has no start timestamp: fold its text into the current
        # word and mark it as consumed so callers can drop it.
        current_word["word"] += " " + next_word["word"]
        next_word["word"] = None

    # Every remaining word was merged into the current one.
    return final_timestamp
def filter_missing_timestamps(
    word_timestamps, initial_timestamp=0, final_timestamp=None
):
    """
    Filters and fills in missing start timestamps for words in a list of word timestamps.

    Words without a ``"start"`` value receive the previous word's ``"end"`` as
    their start and the value returned by ``_get_next_start_timestamp`` as
    their end. Entries whose ``"word"`` is None are left out of the result
    (the first entry is always kept). The dictionaries in ``word_timestamps``
    are modified in place.

    Args:
        word_timestamps (list): A list of dictionaries containing word and its associated timestamps.
        initial_timestamp (float, optional): The initial timestamp to be used if the first word lacks
            a start timestamp. Defaults to 0; None is treated as 0.
        final_timestamp (float, optional): The final timestamp passed on to
            ``_get_next_start_timestamp``. Defaults to None.

    Returns:
        list: A filtered list of dictionaries containing words and their associated timestamps with
        missing start times filled in. An empty input yields an empty list.
    """
    # Nothing to fill in; also avoids indexing into an empty list below.
    if not word_timestamps:
        return []

    # handle the first word: it has no predecessor to borrow an end time from
    first_word = word_timestamps[0]
    if first_word.get("start") is None:
        first_word["start"] = initial_timestamp if initial_timestamp is not None else 0
        first_word["end"] = _get_next_start_timestamp(
            word_timestamps, 0, final_timestamp
        )

    result = [first_word]

    for i, ws in enumerate(word_timestamps[1:], start=1):
        # if ws doesn't have a start and end
        # use the previous end as start and next start as end
        if ws.get("start") is None and ws.get("word") is not None:
            ws["start"] = word_timestamps[i - 1]["end"]
            ws["end"] = _get_next_start_timestamp(word_timestamps, i, final_timestamp)

        # entries with a None word are dropped from the result
        if ws["word"] is not None:
            result.append(ws)

    return result