# Standard library imports
import os
from typing import List, Dict, Annotated


class SpeakerTimestampReader:
    """
    A class to read and parse speaker timestamps from an RTTM file.

    Attributes
    ----------
    rttm_path : str
        Path to the RTTM file containing speaker timestamps.

    Methods
    -------
    read_speaker_timestamps()
        Reads the RTTM file and extracts speaker timestamps.

    Parameters
    ----------
    rttm_path : str
        Path to the RTTM file containing speaker timestamps.

    Raises
    ------
    FileNotFoundError
        If the RTTM file does not exist at the specified path.
    """

    def __init__(self, rttm_path: str):
        """
        Initializes the SpeakerTimestampReader with the path to an RTTM file.

        Parameters
        ----------
        rttm_path : str
            Path to the RTTM file containing speaker timestamps.

        Raises
        ------
        FileNotFoundError
            If the RTTM file does not exist at the specified path.
        """
        if not os.path.isfile(rttm_path):
            raise FileNotFoundError(f"RTTM file not found at: {rttm_path}")
        self.rttm_path = rttm_path

    def read_speaker_timestamps(self) -> List[List[float]]:
        """
        Reads the RTTM file and extracts speaker timestamps.

        Returns
        -------
        List[List[float]]
            A list where each sublist contains
            [start_time, end_time, speaker_label]. The two times are floats;
            the speaker label is an int taken from the text after the last
            underscore of the 8th whitespace-separated field.

        Notes
        -----
        - Field 4 (start) and field 5 (duration) are multiplied by 1000;
          the end time is start + duration.
        - Lines with fewer than 8 fields, or whose start, duration or
          speaker label cannot be parsed as numbers, are reported with
          ``print`` and skipped.
        - The collected list is printed before it is returned.

        Examples
        --------
        >>> reader = SpeakerTimestampReader("path/to/rttm_file.rttm")
        >>> timestamps = reader.read_speaker_timestamps()
        Speaker_Timestamps: [[0.0, 2000.0, 1], [2100.0, 4000.0, 2]]
        """
        speaker_ts = []

        # Explicit encoding keeps parsing independent of the platform's
        # default locale, matching the UTF-8 used by the writers below.
        with open(self.rttm_path, encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                fields = line.split()

                if len(fields) < 8:
                    print(f"Skipping line due to unexpected format: {line}")
                    continue

                # Only the numeric conversions can fail once the field
                # count has been checked, so the try covers just those.
                try:
                    start_time = float(fields[3]) * 1000
                    duration = float(fields[4]) * 1000
                    speaker_label = int(fields[7].split("_")[-1])
                except ValueError as e:
                    print(f"Skipping line due to parsing error: {line} - {e}")
                    continue

                speaker_ts.append([start_time, start_time + duration, speaker_label])

        print(f"Speaker_Timestamps: {speaker_ts}")
        return speaker_ts


class TranscriptWriter:
    """
    A class to write speaker-aware transcripts in plain text or SRT formats.

    Methods
    -------
    write_transcript(sentences_speaker_mapping, file_path)
        Writes the speaker-aware transcript to a text file.
    write_srt(sentences_speaker_mapping, file_path)
        Writes the speaker-aware transcript to an SRT file format.
    """

    def __init__(self):
        """
        Initializes the TranscriptWriter.
        """
        pass

    @staticmethod
    def write_transcript(sentences_speaker_mapping: List[Dict], file_path: str):
        """
        Writes the speaker-aware transcript to a text file.

        Consecutive sentences from the same speaker are written on one
        paragraph prefixed with ``"<speaker>: "``; a change of speaker starts
        a new paragraph after a blank line. Every sentence is stripped and
        followed by a single space.

        Parameters
        ----------
        sentences_speaker_mapping : List[Dict]
            List of sentences with speaker labels, where each dictionary contains:
            - "speaker": Speaker label (e.g., Speaker 1, Speaker 2).
            - "text": Text of the spoken sentence.
            An empty list produces an empty output file.
        file_path : str
            Path to the output text file.

        Examples
        --------
        >>> sentences_speaker_mapping = [{"speaker": "Speaker 1", "text": "Hello."},
        ...                              {"speaker": "Speaker 2", "text": "Hi there."}]
        >>> TranscriptWriter.write_transcript(sentences_speaker_mapping, "output.txt")
        """
        with open(file_path, "w", encoding="utf-8") as f:
            # Nothing to write: leave an empty file instead of failing on
            # the first-element lookup below.
            if not sentences_speaker_mapping:
                return

            previous_speaker = sentences_speaker_mapping[0]["speaker"]
            f.write(f"{previous_speaker}: ")

            for sentence_dict in sentences_speaker_mapping:
                speaker = sentence_dict["speaker"]
                sentence = sentence_dict["text"].strip()

                if speaker != previous_speaker:
                    f.write(f"\n\n{speaker}: ")
                    previous_speaker = speaker

                f.write(sentence + " ")

    @staticmethod
    def write_srt(sentences_speaker_mapping: List[Dict], file_path: str):
        """
        Writes the speaker-aware transcript to an SRT file format.

        Parameters
        ----------
        sentences_speaker_mapping : List[Dict]
            List of sentences with speaker labels and timestamps, where each
            dictionary contains:
            - "start_time": Start time of the sentence in milliseconds.
            - "end_time": End time of the sentence in milliseconds.
            - "speaker": Speaker label.
            - "text": Text of the spoken sentence.
        file_path : str
            Path to the output SRT file.

        Raises
        ------
        ValueError
            If a start or end time is negative.

        Notes
        -----
        The function formats timestamps in the HH:MM:SS,mmm format for SRT.
        Cues are numbered from 1. Any ``-->`` inside the text is replaced by
        ``->`` so that it cannot be confused with the timing-line separator.

        Examples
        --------
        >>> sentences_speaker_mapping = [{"start_time": 0, "end_time": 2000,
        ...                               "speaker": "Speaker 1", "text": "Hello."}]
        >>> TranscriptWriter.write_srt(sentences_speaker_mapping, "output.srt")
        """

        def format_timestamp(milliseconds: Annotated[float, "Time in milliseconds"]) -> Annotated[
                str, "Formatted timestamp in HH:MM:SS,mmm"]:
            """
            Converts a time value in milliseconds to an SRT timestamp format.

            Parameters
            ----------
            milliseconds : float
                Time value in milliseconds to be converted.

            Returns
            -------
            str
                A string representing the time in `HH:MM:SS,mmm` format.
                Fractions of a millisecond are truncated, not rounded.

            Raises
            ------
            ValueError
                If the input time is negative.

            Examples
            --------
            >>> format_timestamp(3723001)
            '01:02:03,001'
            >>> format_timestamp(0)
            '00:00:00,000'
            >>> format_timestamp(59_999.9)
            '00:00:59,999'
            """
            if milliseconds < 0:
                raise ValueError("Time in milliseconds cannot be negative.")

            hours = int(milliseconds // 3_600_000)
            minutes = int((milliseconds % 3_600_000) // 60_000)
            seconds = int((milliseconds % 60_000) // 1_000)
            millis = int(milliseconds % 1_000)

            return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

        with open(file_path, "w", encoding="utf-8") as f:
            for i, segment in enumerate(sentences_speaker_mapping, start=1):
                start_time = format_timestamp(segment['start_time'])
                end_time = format_timestamp(segment['end_time'])
                speaker = segment['speaker']
                text = segment['text'].strip().replace('-->', '->')

                f.write(f"{i}\n")
                f.write(f"{start_time} --> {end_time}\n")
                f.write(f"{speaker}: {text}\n\n")


if __name__ == "__main__":
    example_rttm_path = "example.rttm"

    # The demo keeps going when the example RTTM file is absent; the writer
    # calls below do not depend on the reader's output.
    try:
        timestamp_reader = SpeakerTimestampReader(example_rttm_path)
        extracted_speaker_timestamps = timestamp_reader.read_speaker_timestamps()
    except FileNotFoundError as file_error:
        print(file_error)

    example_sentences_mapping = [
        {"speaker": "Speaker 1", "text": "Hello there.", "start_time": 0, "end_time": 2000},
        {"speaker": "Speaker 2", "text": "How are you?", "start_time": 2100, "end_time": 4000},
    ]

    transcript_writer = TranscriptWriter()
    transcript_writer.write_transcript(example_sentences_mapping, "output.txt")
    transcript_writer.write_srt(example_sentences_mapping, "output.srt")