# src/multivoice/lib/stt_timestamps.py
def _get_next_start_timestamp(word_timestamps, current_word_index, final_timestamp):
    """
    Find the timestamp at which the word at ``current_word_index`` should end.

    Scans forward from the current word for the first entry that carries a
    "start" value. Every entry passed over on the way (one without a start)
    has its text appended to the current word, separated by a space, and its
    own "word" set to None. ``word_timestamps`` is modified in place.

    Args:
        word_timestamps (list): Word dictionaries with a "word" key and an optional "start" key.
        current_word_index (int): Position of the word whose successor is looked up.
        final_timestamp (float): Value returned when every following word lacks a start timestamp.

    Returns:
        float: The start of the next word that has one; ``final_timestamp`` when
        no following word has a start; or the current word's own start when it
        is the last entry of the list.
    """
    last_index = len(word_timestamps) - 1

    # The last word has nothing after it, so its own start is handed back.
    if current_word_index == last_index:
        return word_timestamps[current_word_index]["start"]

    # An index past the end has no successors to scan.
    if current_word_index > last_index:
        return None

    for probe in range(current_word_index + 1, last_index + 1):
        candidate = word_timestamps[probe]
        if candidate.get("start") is not None:
            return candidate["start"]

        # No start on this entry: fold its text into the current word and
        # blank out its own text.
        word_timestamps[current_word_index]["word"] += " " + candidate["word"]
        candidate["word"] = None

    # Ran off the end without meeting a word that has a start.
    return final_timestamp
[docs]
def filter_missing_timestamps(
    word_timestamps, initial_timestamp=0, final_timestamp=None
):
    """
    Fill in missing start/end timestamps and drop entries whose word is None.

    A word without a "start" gets the previous word's "end" as its start
    (or ``initial_timestamp`` for the first word) and the value returned by
    ``_get_next_start_timestamp`` as its end. That helper also merges any
    following words that lack a start into the current word and sets their
    "word" to None; such entries are left out of the result.

    The dictionaries in ``word_timestamps`` are modified in place, and the
    returned list holds those same dictionary objects.

    Args:
        word_timestamps (list): A list of dictionaries containing a word and its associated timestamps.
        initial_timestamp (float, optional): Start assigned to the first word when it has none. None is treated as 0. Defaults to 0.
        final_timestamp (float, optional): Passed to ``_get_next_start_timestamp`` as the fallback end timestamp. Defaults to None.

    Returns:
        list: The word dictionaries with missing timestamps filled in. The
        first entry is always kept; later entries are kept only when their
        "word" is not None. An empty input yields an empty list.
    """
    # Nothing to process; without this guard the first-word lookup below
    # would raise IndexError on an empty transcript.
    if not word_timestamps:
        return []

    # handle the first word, which has no predecessor to borrow an end from
    first_word = word_timestamps[0]
    if first_word.get("start") is None:
        first_word["start"] = (
            initial_timestamp if initial_timestamp is not None else 0
        )
        first_word["end"] = _get_next_start_timestamp(
            word_timestamps, 0, final_timestamp
        )

    result = [first_word]

    for i, ws in enumerate(word_timestamps[1:], start=1):
        # if ws doesn't have a start (and was not already merged away),
        # use the previous end as start and the next start as end
        if ws.get("start") is None and ws.get("word") is not None:
            ws["start"] = word_timestamps[i - 1]["end"]
            ws["end"] = _get_next_start_timestamp(word_timestamps, i, final_timestamp)

        # entries whose word is None are not carried into the result
        if ws["word"] is not None:
            result.append(ws)

    return result