Spaces:

lucas-ventura
/

chapter-llama

Running on Zero

App Files Files Community

lucas-ventura committed on Apr 2

Commit

0ca274b

verified ·

1 Parent(s): 6303c5d

Upload utils_chapters.py

Browse files

Files changed (1) hide show

src/test/utils_chapters.py +79 -0

src/test/utils_chapters.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import re
+def extract_chapters(output: str | list[str]):
+    """
+    Extract chapters from the given output string or list of strings.
+    Args:
+        output (str | list[str]): The input text containing chapter information.
+        vid_duration (str | None): The video duration in hh:mm:ss format. Default is None.
+    Returns:
+        dict: A dictionary of extracted chapters with timestamps as keys and titles as values.
+    """
+    # Only capture the first timestamp (hh:mm:ss) and ignore the second.
+    pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b"
+    chapters = {}
+    if isinstance(output, str):
+        output = output.split("\n")
+    for line in output:
+        if len(line) == 0:
+            continue
+        match = re.search(pattern, line)
+        if match:
+            time = match.group(1)
+            # Strip any additional timestamp or text following it
+            title = re.sub(pattern, "", line).strip()
+            title = title.lstrip(" -:")  # Remove leading dash, colon, or space
+            title = title.strip()
+            if len(title) > 0:
+                chapters[time] = title
+    return chapters
+def filter_chapters(chapters: dict, vid_duration: str | None = None):
+    if vid_duration:
+        filter_chapters = {}
+        for k, v in sorted(chapters.items()):
+            if k > vid_duration:
+                break
+            filter_chapters[k] = v
+        chapters = filter_chapters
+    # Check if chapters are in ordered by time
+    times = list(chapters.keys())
+    for i in range(1, len(times)):
+        if times[i] < times[i - 1]:
+            return {}
+    # remove empty chapters
+    chapters = {k: v for k, v in chapters.items() if len(v) > 0}
+    # if only one chapter at 00:00:00, return empty dict
+    if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00":
+        return {}
+    return chapters
if __name__ == "__main__":
    # Demo: parse a sample listing that mixes valid chapters, an invalid
    # timestamp (minutes of 90) and entries beyond the video duration.
    sample_output = """
    00:00:00 Introduction - good
    00:05:30 - 00:05:33: Second Chapter
    00:05:33: Another Chapter
    00:90:00 - Wrong time
    00:42:00 - After video duration
    00:39:00 - What is this?
    01:04:00 - Outside of video duration
    """
    # Extract the chapters, then keep only those within a 40-minute video.
    demo_chapters = filter_chapters(
        extract_chapters(sample_output), vid_duration="00:40:00"
    )
    for time, title in demo_chapters.items():
        print(f"Time: {time}, Title: {title}")