Spaces:

lucas-ventura
/

chapter-llama

Running on Zero

App Files Files Community

lucas-ventura committed on Apr 2

Commit

7a4927a

verified ·

1 Parent(s): 2e23f3d

Upload 3 files

Browse files

Files changed (3) hide show

chapters.py +341 -0
prompt.py +93 -0
single_video.py +70 -0

chapters.py ADDED Viewed

	@@ -0,0 +1,341 @@

+import random
+from pathlib import Path
+from lutils import openf, writef
class Chapters:
    """Accessor for per-video chapter metadata stored under a dataset directory.

    The metadata is a mapping ``video_id -> info dict`` read from JSON via
    ``openf``. The methods below read the ``"chapters"``, ``"duration"``,
    ``"title"``, ``"description"``, ``"channel_id"`` and ``"view_count"``
    keys of each info dict.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, vidc_dir: str = "dataset/", subset="", videos_dir="videos"):
        """Load the metadata for ``subset`` (or everything when ``subset == ""``).

        Args:
            vidc_dir: Root directory of the dataset; must exist.
            subset: Subset name, or ``""`` to load ``docs/chapters.json``.
            videos_dir: Directory (relative to ``vidc_dir``) holding the videos.

        Raises:
            AssertionError: If ``vidc_dir`` does not exist.
        """
        # Root of the dataset on disk.
        self.vidc_dir = Path(vidc_dir)
        assert self.vidc_dir.exists(), f"Directory {vidc_dir} does not exist."
        self.subset = subset
        # video_id -> info dict.
        self.data = self.load_subset_data(subset=subset)
        # Video ids in the iteration order of ``self.data``.
        self.video_ids = list(self.data.keys())
        self.videos_dir = videos_dir

    def get_subset_ids(self, subset: str):
        """Return the content of ``docs/subset_data/<subset>.json``."""
        return openf(self.vidc_dir / f"docs/subset_data/{subset}.json")

    def load_subset_data(self, subset=""):
        """Return the metadata mapping for ``subset``.

        With an empty ``subset`` the full ``docs/chapters.json`` is returned.
        Otherwise the per-subset file
        ``docs/subset_data/chapters/chapters_<subset>.json`` is read if it
        exists; if not, it is built by filtering the full metadata with the
        ids from :meth:`get_subset_ids` and written to that path.

        Raises:
            AssertionError: If ``subset`` is empty and ``docs/chapters.json``
                is missing.
            KeyError: If a subset id is absent from the full metadata.
        """
        if subset == "":
            data_path = self.vidc_dir / "docs/chapters.json"
            assert data_path.exists(), f"Data file {data_path} does not exist."
            return openf(data_path)

        data_path = self.vidc_dir / f"docs/subset_data/chapters/chapters_{subset}.json"
        if data_path.exists():
            return openf(data_path)

        video_ids = self.get_subset_ids(subset)
        all_data = openf(self.vidc_dir / "docs/chapters.json")
        data = {video_id: all_data[video_id] for video_id in video_ids}
        # parents=True so a missing intermediate directory cannot abort caching.
        data_path.parent.mkdir(parents=True, exist_ok=True)
        # NOTE(review): other call sites of ``writef`` in this project pass
        # the path first (``writef(path, data)``) -- confirm the expected
        # argument order against ``lutils.writef``.
        writef(data, data_path)
        return data

    def __len__(self):
        """Return the number of videos."""
        return len(self.video_ids)

    def __iter__(self):
        """Iterate over video ids."""
        return iter(self.video_ids)

    def __contains__(self, vid_id):
        """Return whether ``vid_id`` has metadata."""
        return vid_id in self.data

    def __getitem__(self, idx):
        """Return the info dict for a positional index or a video id.

        With an ``int`` index the stored info dict is updated in place with a
        ``"video_id"`` key before being returned; with a ``str`` id the stored
        dict is returned untouched.

        Raises:
            ValueError: If ``idx`` is neither ``int`` nor ``str``.
        """
        if isinstance(idx, int):
            video_id = self.video_ids[idx]
            video_info = self.get_video_info(video_id)
            video_info["video_id"] = video_id
            return video_info
        if isinstance(idx, str):
            return self.get_video_info(idx)
        raise ValueError(f"Invalid index type {type(idx)}.")

    def get_video_info(self, video_id):
        """Return the stored info dict of ``video_id``.

        Raises:
            AssertionError: If ``video_id`` is unknown.
        """
        assert video_id in self.data, f"Video ID {video_id} not found in data."
        return self.data[video_id]

    def get_chapters(self, video_id, hms=False, segments=False):
        """Retrieve chapters for a specific video ID.

        Args:
            video_id: Id of the video.
            hms: If True, keys are converted with ``sec_to_hms``; otherwise
                with ``hms_to_sec``.
            segments: If True, keys are ``(start, end)`` tuples where each end
                is the next chapter's start and the last end is the video
                duration. Requires ``hms=False``.

        Returns:
            Dict mapping time (or segment) to chapter label; empty when the
            video has no ``"chapters"`` entry.

        Raises:
            AssertionError: If ``segments`` and ``hms`` are both True.
        """
        video_info = self.get_video_info(video_id)
        vid_chapters = video_info.get("chapters", {})

        chapter_timestamps = {}
        for time, label in vid_chapters.items():
            key = sec_to_hms(time) if hms else hms_to_sec(time)
            chapter_timestamps[key] = label

        if not segments:
            return chapter_timestamps

        assert not hms, "hms must be False if segments is True."
        start_times = list(chapter_timestamps.keys())
        end_times = start_times[1:] + [self.get_duration(video_id)]
        return {
            (start_time, end_time): chapter_timestamps[start_time]
            for start_time, end_time in zip(start_times, end_times)
        }

    def get_labels(self, video_id):
        """Retrieve a list of chapter labels for a specific video ID."""
        return list(self.get_chapters(video_id).values())

    def get_timestamps(
        self, video_id, zero_handling="default", duration_handling="default"
    ):
        """Retrieve a list of chapter timestamps for a specific video ID.

        Args:
            video_id: Id of the video.
            zero_handling: ``"add"`` prepends 0 when the first timestamp is
                not 0, ``"remove"`` drops every 0, ``"default"`` leaves the
                list as is.
            duration_handling: ``"add"`` appends the video duration when the
                last timestamp differs from it, ``"remove"`` drops a trailing
                timestamp equal to the duration, ``"default"`` leaves it.

        Returns:
            List of integer timestamps. A video without chapters yields an
            empty list for every combination of options.

        Raises:
            AssertionError: On an unknown ``zero_handling`` or
                ``duration_handling`` value.
        """
        assert zero_handling in [
            "default",
            "add",
            "remove",
        ], f"Invalid zero handling {zero_handling}."
        assert duration_handling in [
            "default",
            "add",
            "remove",
        ], f"Invalid duration handling {duration_handling}."

        chapters = self.get_chapters(video_id)
        timestamps = [int(time) for time in chapters]

        if zero_handling == "add":
            if timestamps and timestamps[0] != 0:
                timestamps = [0] + timestamps
        elif zero_handling == "remove":
            timestamps = [time for time in timestamps if time != 0]

        # Guard on emptiness: indexing [-1] on an empty list would raise.
        if duration_handling == "add":
            duration = self.get_duration(video_id)
            if timestamps and timestamps[-1] != duration:
                timestamps = timestamps + [duration]
        elif duration_handling == "remove":
            duration = self.get_duration(video_id)
            if timestamps and timestamps[-1] == duration:
                timestamps = timestamps[:-1]
        return timestamps

    def get_n_timestamps(self, video_id, zero_handling="default"):
        """Retrieve the number of chapter timestamps for a specific video ID."""
        return len(self.get_timestamps(video_id, zero_handling=zero_handling))

    def get_n_chapters(self, video_id):
        """Return the number of ground-truth segments of ``video_id``."""
        return len(self.get_gt_segments(video_id))

    def get_n_labels(self, video_id):
        """Return the number of chapter labels of ``video_id``."""
        return len(self.get_labels(video_id))

    def get_duration(self, video_id, hms=False):
        """Retrieve the duration of a specific video ID.

        Returns the stored ``"duration"`` value, or its ``sec_to_hms`` string
        form when ``hms`` is True.
        """
        duration = self.get_video_info(video_id).get("duration")
        if hms:
            return sec_to_hms(duration)
        return duration

    def get_hms_duration(self, video_id, string=True):
        """Retrieve the duration of a specific video ID in hours, minutes, and seconds.

        Returns an ``"hh:mm:ss"`` string when ``string`` is True, otherwise an
        ``(h, m, s)`` tuple.
        """
        # get_duration() returns a single number, so it has to be split into
        # components before it can be unpacked.
        h, m, s = sec_to_hms(self.get_duration(video_id), string=False)
        if string:
            return f"{h:02d}:{m:02d}:{s:02d}"
        return h, m, s

    def get_title(self, video_id):
        """Retrieve the title of a specific video ID."""
        return self.get_video_info(video_id).get("title")

    def get_description(self, video_id):
        """Retrieve the description of a specific video ID."""
        return self.get_video_info(video_id).get("description")

    def get_channel_id(self, video_id):
        """Retrieve the channel ID of a specific video ID."""
        return self.get_video_info(video_id).get("channel_id")

    def get_view_count(self, video_id):
        """Retrieve the view count of a specific video ID."""
        return self.get_video_info(video_id).get("view_count")

    def get_video_path(self, video_id):
        """Retrieve the path to the video file for a specific video ID.

        The file is looked up at
        ``<vidc_dir>/<videos_dir>/<first two chars of id>/<id>.mp4``.

        Raises:
            AssertionError: If that file does not exist.
        """
        video_pth = (
            self.vidc_dir / self.videos_dir / f"{video_id[:2]}" / f"{video_id}.mp4"
        )
        assert video_pth.exists(), f"Video file {video_pth} does not exist."
        return str(video_pth)

    def sample(self, n=1):
        """Sample n video IDs; a single id (not a list) is returned for n == 1."""
        sample = random.sample(self.video_ids, n)
        if n == 1:
            return sample[0]
        return sample

    def get_gt_segments(self, video_id, zero_handling="add"):
        """Generate ground truth segments based on video ID with options to adjust zero timestamps.

        Returns a list of ``(start, end)`` float tuples built by
        ``boundary2seg``; a video without chapter timestamps yields ``[]``.
        """
        timestamps = self.get_timestamps(video_id, zero_handling=zero_handling)
        if not timestamps:
            # No boundaries means no segments; also avoids indexing an empty list.
            return []
        return boundary2seg(
            timestamps, self.get_duration(video_id), zero_handling=zero_handling
        )

    def get_segments(self, video_id, zero_handling="add"):
        """Alias of :meth:`get_gt_segments`."""
        return self.get_gt_segments(
            video_id,
            zero_handling=zero_handling,
        )

    def get_all_gt_segments(self, zero_handling="add"):
        """Generate ground truth segments for all video IDs."""
        return {
            video_id: self.get_gt_segments(video_id, zero_handling=zero_handling)
            for video_id in self.video_ids
        }

    def get_pred_segments(self, vid_id, vid_preds, zero_handling="add"):
        """Convert the predictions of one video into segments.

        Args:
            vid_id: Id of the video (used to look up its duration).
            vid_preds: Either a list of timestamps (numbers, or time strings
                if the first element is a string) or a dict keyed by chapter
                start time.
            zero_handling: Forwarded to ``boundary2seg`` for list input.

        Returns:
            For a list: the list of segments from ``boundary2seg`` (``[]`` for
            an empty list). For a dict: a dict mapping ``(start, end)`` in
            seconds to the original value. ``None`` for any other type.
        """
        duration = self.get_duration(vid_id)
        if isinstance(vid_preds, list):
            if not vid_preds:
                # Nothing predicted; peeking at vid_preds[0] would raise.
                return []
            if isinstance(vid_preds[0], str):
                vid_preds = [hms_to_sec(hms) for hms in vid_preds]
            return boundary2seg(vid_preds, duration, zero_handling=zero_handling)
        if isinstance(vid_preds, dict):
            start_times = list(vid_preds.keys())
            end_times = start_times[1:] + [duration]
            return {
                (hms_to_sec(start_time), hms_to_sec(end_time)): vid_preds[start_time]
                for start_time, end_time in zip(start_times, end_times)
            }
        return None

    def convert_predictions_to_segments(self, preds):
        """Apply :meth:`get_pred_segments` to every ``video_id -> preds`` entry."""
        return {
            video_id: self.get_pred_segments(video_id, vid_preds)
            for video_id, vid_preds in preds.items()
        }

    def get_link(self, video_id):
        """Return the YouTube watch URL of ``video_id``."""
        return f"https://www.youtube.com/watch?v={video_id}"

    def get_url(self, video_id):
        """Return the YouTube watch URL of ``video_id`` (same as get_link)."""
        return self.get_link(video_id)

    @staticmethod
    def sec_to_hms(seconds, string=True, short=False):
        """Forward to the module-level ``sec_to_hms`` with the given options."""
        return sec_to_hms(seconds, string=string, short=short)

    @staticmethod
    def hms_to_sec(time_str, enable_single_part=False):
        """Forward to the module-level ``hms_to_sec``."""
        return hms_to_sec(time_str, enable_single_part=enable_single_part)

    @staticmethod
    def clean_segment(segment, zero_handling="add"):
        """Forward to the module-level ``clean_segment``."""
        return clean_segment(segment, zero_handling=zero_handling)

    @staticmethod
    def clean_timestamps(timestamps, zero_handling="remove"):
        """Forward to the module-level ``clean_tiemstamps`` (sic)."""
        return clean_tiemstamps(timestamps, zero_handling=zero_handling)
def boundary2seg(boundaries, duration, zero_handling="add"):
    """Turn chapter boundaries into consecutive ``(start, end)`` segments.

    Args:
        boundaries: Sequence of boundary times (numbers), in order.
        duration: End time used to close the last segment.
        zero_handling: When ``"add"``, a leading 0 boundary is inserted if the
            first boundary is not already 0. Any other value leaves the
            boundaries untouched.

    Returns:
        List of ``(float, float)`` tuples, one per pair of neighbouring
        boundaries, plus a final ``(last_boundary, duration)`` segment unless
        the last boundary already equals ``duration``. An empty ``boundaries``
        yields an empty list. The input sequence is never modified.
    """
    # Work on a copy so tuples are accepted and the caller's list is untouched.
    boundaries = list(boundaries)
    if not boundaries:
        return []
    if zero_handling == "add" and boundaries[0] != 0:
        boundaries.insert(0, 0)

    segments = [
        (float(start), float(end))
        for start, end in zip(boundaries, boundaries[1:])
    ]
    # Close the last chapter at the video end unless a boundary already sits there.
    if boundaries[-1] != duration:
        segments.append((float(boundaries[-1]), float(duration)))
    return segments
def sec_to_hms(seconds, string=True, short=False):
    """Convert seconds to hours, minutes, and seconds.

    Args:
        seconds: Number of seconds as an ``int`` or ``float``, a numeric
            string (``"75"``, ``"75.9"``), or a colon-separated time string,
            which is first converted with ``hms_to_sec``. Fractional seconds
            are truncated.
        string: If True, return a formatted string; otherwise an
            ``(h, m, s)`` tuple.
        short: With ``string=True``, drop the hour field when it is zero
            (``"mm:ss"`` instead of ``"hh:mm:ss"``).

    Returns:
        ``"hh:mm:ss"`` / ``"mm:ss"`` string, or an ``(h, m, s)`` tuple.

    Raises:
        ValueError: If ``seconds`` is a string that is neither colon-separated
            nor numeric.
    """
    if isinstance(seconds, str):
        if ":" in seconds:
            return sec_to_hms(hms_to_sec(seconds), string=string, short=short)
        # Decimal strings such as "75.9" go through float() so they do not
        # reach divmod() as a str.
        seconds = int(seconds) if seconds.isdigit() else float(seconds)
    if isinstance(seconds, float):
        seconds = int(seconds)

    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    if not string:
        return h, m, s
    if short and h == 0:
        return f"{m:02d}:{s:02d}"
    return f"{h:02d}:{m:02d}:{s:02d}"
def hms_to_sec(time_str, enable_single_part=False):
    """Convert hours, minutes, and seconds to total seconds.

    Args:
        time_str: A number (returned unchanged), an all-digit string
            (returned as ``int``), or a colon-separated string in
            ``h:m:s`` or ``m:s`` form. The seconds field may carry a
            decimal part.
        enable_single_part: Also accept a string without any colon (for
            example ``"4.5"``) and interpret it as seconds.

    Returns:
        Total seconds as ``int`` or ``float``, or ``False`` when the minutes
        field (three-part form) or the seconds field is 60 or more.

    Raises:
        ValueError: If the string has an unsupported number of fields or a
            field is not numeric.
    """

    def _as_number(text):
        # Keep integers as int; only go to float when a decimal point is present.
        return float(text) if "." in text else int(text)

    if isinstance(time_str, (int, float)):
        return time_str
    if isinstance(time_str, str) and time_str.isdigit():
        return int(time_str)

    fields = time_str.split(":")
    n_fields = len(fields)

    if n_fields == 3:
        secs = _as_number(fields[2])
        mins = int(fields[1])
        if mins >= 60 or secs >= 60:
            return False
        return int(fields[0]) * 3600 + mins * 60 + secs

    if n_fields == 2:
        secs = _as_number(fields[1])
        mins = int(fields[0])
        if secs >= 60:
            return False
        return mins * 60 + secs

    if n_fields == 1 and enable_single_part:
        return _as_number(fields[0])

    raise ValueError("Invalid time format")
def clean_segment(segment, zero_handling="add"):
    """Add or remove the leading zero-start segment, in place.

    Args:
        segment: List of ``[start, end]`` pairs. The list is modified in
            place and also returned.
        zero_handling: ``"add"`` inserts ``[0.0, first_start]`` at the front
            when the first segment does not start at 0; ``"remove"`` drops
            the first segment when it starts at 0; any other value is a
            no-op.

    Returns:
        The same list object. An empty list is returned unchanged.
    """
    if not segment:
        # Nothing to inspect; segment[0] would raise IndexError.
        return segment

    first_start = segment[0][0]
    if zero_handling == "add" and first_start != 0.0:
        segment.insert(0, [0.0, first_start])
    elif zero_handling == "remove" and first_start == 0.0:
        segment.pop(0)
    return segment
def clean_tiemstamps(timestamps, zero_handling="remove"):
    """Add or remove zero timestamps.

    The misspelled name is kept because other code calls it under this name.

    Args:
        timestamps: List of timestamps.
        zero_handling: ``"remove"`` returns a new list without any 0 entries;
            ``"add"`` returns a new list with a leading 0 when the first
            entry is not 0; any other value returns the input unchanged.

    Returns:
        The cleaned list. An empty input is returned unchanged for
        ``"add"``, matching how ``Chapters.get_timestamps`` treats videos
        without chapters.
    """
    if zero_handling == "remove":
        return [time for time in timestamps if time != 0]
    if zero_handling == "add":
        # Check emptiness first: timestamps[0] on an empty list would raise.
        if timestamps and timestamps[0] != 0:
            return [0] + timestamps
        return timestamps
    return timestamps

prompt.py ADDED Viewed

	@@ -0,0 +1,93 @@

+from src.data.chapters import Chapters, sec_to_hms
class Prompt:
    """Base class that assembles chaptering prompts and training dialogs.

    The prompt is the concatenation of several fixed instruction fragments
    (see :meth:`get_base_prompt`). Subclasses must provide ``__contains__``,
    :meth:`get_task_prompt` and :meth:`get_transcript`.
    """

    def __init__(
        self,
        chapters: "Chapters",
    ):
        """Store the chapters accessor used for durations and ground truth."""
        self.chapters = chapters

    def __contains__(self, vid_id):
        """Return whether ``vid_id`` is usable; must be overridden."""
        raise NotImplementedError(
            "Subclasses must implement the '__contains__' method."
        )

    def get_duration_prompt(self, vid_id: str) -> str:
        """Return the opening fragment stating the video duration."""
        duration = self.chapters.get_duration(vid_id, hms=True)
        return f"Given the complete transcript of a video of duration {duration}, "

    def get_task_prompt(self) -> str:
        """Return the task description fragment; must be overridden."""
        raise NotImplementedError(
            "Subclasses must implement the 'get_task_prompt' method."
        )

    def get_format_instruction(self) -> str:
        """Return the fragment describing the expected output format."""
        return "Identify the approximate start time of each chapter in the format 'hh:mm:ss - Title'. "

    def get_new_line_instruction(self) -> str:
        """Return the fragment asking for one chapter per line."""
        return "Ensure each chapter entry is on a new line. "

    def get_focus_instruction(self) -> str:
        """Return the fragment asking to focus on significant topic changes."""
        return "Focus on significant topic changes that would merit a new chapter in a video, "

    def get_no_summaries_instruction(self) -> str:
        """Return the fragment forbidding chapter summaries."""
        return "but do not provide summaries of the chapters.\n"

    def get_transcript_introduction(self) -> str:
        """Return the line that introduces the transcript."""
        return "Here is the transcript to analyze:\n"

    def get_transcript(self, vid_id: str) -> str:
        """Return the transcript of ``vid_id``; must be overridden.

        By default the same transcript is used for train and test.
        """
        raise NotImplementedError(
            "Subclasses must implement the 'get_transcript' method."
        )

    def get_transcript_train(self, vid_id: str) -> str:
        """Return the transcript used for training."""
        return self.get_transcript(vid_id)

    def get_transcript_test(self, vid_id: str) -> str:
        """Return the transcript used for testing."""
        return self.get_transcript(vid_id)

    def get_base_prompt(self, vid_id: str) -> str:
        """Return all instruction fragments joined, without the transcript."""
        prompt_parts = [
            self.get_duration_prompt(vid_id),
            self.get_task_prompt(),
            self.get_format_instruction(),
            self.get_new_line_instruction(),
            self.get_focus_instruction(),
            self.get_no_summaries_instruction(),
            self.get_transcript_introduction(),
        ]
        return "".join(prompt_parts)

    def get_prompt_train(self, vid_id: str) -> str:
        """Return the prompt used for training."""
        return self.get_base_prompt(vid_id)

    def get_prompt_test(self, vid_id: str) -> str:
        """Return the prompt used for testing."""
        return self.get_base_prompt(vid_id)

    def get_output(self, vid_id: str) -> str:
        """Return the target answer: one ``"<time> - <title>"`` line per chapter.

        Times come from ``self.chapters.get_chapters`` and are formatted with
        ``sec_to_hms``.
        """
        vid_chapters = self.chapters.get_chapters(vid_id)
        return "\n".join(
            f"{sec_to_hms(chp_time)} - {chp_title}"
            for chp_time, chp_title in vid_chapters.items()
        )

    def get_dialog(self, vid_id: str) -> list[dict[str, str]]:
        """Return the training dialog for ``vid_id``.

        Returns:
            A two-message list: a ``"user"`` message whose content is the
            training prompt followed by the training transcript, and an
            ``"assistant"`` message holding :meth:`get_output`.
        """
        prompt = self.get_prompt_train(vid_id)
        transcript = self.get_transcript_train(vid_id)
        output = self.get_output(vid_id)
        return [
            {
                "role": "user",
                "content": prompt + transcript,
            },
            {
                "role": "assistant",
                "content": output,
            },
        ]

single_video.py ADDED Viewed

	@@ -0,0 +1,70 @@

+from pathlib import Path
+from lutils import openf, writef
+from src.data.chapters import sec_to_hms
+from tools.extract.asr import ASRProcessor
class SingleVideo:
    """
    A simplified implementation of the src.data.chapters.Chapters interface for single video inference.

    This class mimics the behavior of the ChaptersASR class but is designed to work with
    a single video file rather than a dataset. It provides the necessary methods
    required by the PromptASR class for generating chapter timestamps and titles.

    Note: This class is intended for inference only and should not be used for
    training or evaluation purposes.
    """

    def __init__(self, video_path: Path):
        """Run ASR on ``video_path`` and keep its transcript and duration.

        Args:
            video_path: Path to the video file. A plain string is accepted
                as well and converted to ``Path``.

        Raises:
            AssertionError: If the file does not exist.
        """
        # Coerce first so string paths work with .exists() / .stem below.
        video_path = Path(video_path)
        assert video_path.exists(), f"Video file {video_path} not found"
        self.video_path = video_path
        # The only video id is the file name without its extension.
        self.video_ids = [video_path.stem]
        # overwrite=True: ASR is always recomputed, any cached output is ignored.
        self.asr, self.duration = get_asr(video_path, overwrite=True)

    def __len__(self):
        """Return the number of videos (the length of ``video_ids``)."""
        return len(self.video_ids)

    def __iter__(self):
        """Iterate over the video ids."""
        return iter(self.video_ids)

    def __contains__(self, vid_id):
        """Return whether ``vid_id`` is the id of the wrapped video."""
        return vid_id in self.video_ids

    def get_duration(self, vid_id, hms=False):
        """Return the video duration, formatted with ``sec_to_hms`` if ``hms``.

        Raises:
            AssertionError: If ``vid_id`` is not the wrapped video's id.
        """
        assert vid_id == self.video_ids[0], f"Invalid video ID: {vid_id}"
        if hms:
            return sec_to_hms(self.duration)
        return self.duration

    def get_asr(self, vid_id):
        """Return the ASR transcript of the wrapped video.

        Raises:
            AssertionError: If ``vid_id`` is not the wrapped video's id.
        """
        assert vid_id == self.video_ids[0], f"Invalid video ID: {vid_id}"
        return self.asr
def get_asr(video_path: Path, overwrite=False):
    """Return ``(asr, duration)`` for a video, using a cache on disk.

    Results are stored under ``outputs/inference/<video stem>/`` as
    ``asr.txt`` and ``duration.txt``. When both files exist and
    ``overwrite`` is False they are read back; otherwise ``ASRProcessor``
    is run and its results are written to those files.

    Args:
        video_path: Path to the video file.
        overwrite: If True, ignore cached files and recompute.

    Returns:
        Tuple of the transcript and the duration. On the cached path the
        transcript is the cached entries joined with newlines plus a trailing
        newline, and the duration is a positive ``float``.

    Raises:
        AssertionError: If the cached duration is not a one-element list or
            is not positive.
    """
    cache_dir = Path(f"outputs/inference/{video_path.stem}")
    asr_output = cache_dir / "asr.txt"
    duration_output = cache_dir / "duration.txt"

    cache_usable = asr_output.exists() and duration_output.exists() and not overwrite
    if not cache_usable:
        print(f"\n=== 🎙️ Processing ASR for {video_path} ===")
        asr, duration = ASRProcessor().get_asr(video_path)
        print(f"=== ✅ ASR processing complete for {video_path} ===\n")

        cache_dir.mkdir(parents=True, exist_ok=True)
        writef(asr_output, asr)
        writef(duration_output, str(duration))
        return asr, duration

    # presumably openf returns a list of lines for .txt files -- the join and
    # the list assertion below rely on that; verify against lutils.
    cached_lines = openf(asr_output)
    asr = "\n".join(cached_lines) + "\n"

    duration = openf(duration_output)
    assert isinstance(duration, list) and len(duration) == 1, (
        f"Duration is not a list of length 1: {duration}"
    )
    duration = float(duration[0])
    assert duration > 0, f"Duration is not positive: {duration}"
    return asr, duration