Spaces:
Running
on
Zero
Running
on
Zero
Upload utils_chapters.py
Browse files- src/test/utils_chapters.py +79 -0
src/test/utils_chapters.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
|
4 |
+
def extract_chapters(output: str | list[str]):
|
5 |
+
"""
|
6 |
+
Extract chapters from the given output string or list of strings.
|
7 |
+
|
8 |
+
Args:
|
9 |
+
output (str | list[str]): The input text containing chapter information.
|
10 |
+
vid_duration (str | None): The video duration in hh:mm:ss format. Default is None.
|
11 |
+
|
12 |
+
Returns:
|
13 |
+
dict: A dictionary of extracted chapters with timestamps as keys and titles as values.
|
14 |
+
"""
|
15 |
+
|
16 |
+
# Only capture the first timestamp (hh:mm:ss) and ignore the second.
|
17 |
+
pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b"
|
18 |
+
chapters = {}
|
19 |
+
|
20 |
+
if isinstance(output, str):
|
21 |
+
output = output.split("\n")
|
22 |
+
|
23 |
+
for line in output:
|
24 |
+
if len(line) == 0:
|
25 |
+
continue
|
26 |
+
|
27 |
+
match = re.search(pattern, line)
|
28 |
+
if match:
|
29 |
+
time = match.group(1)
|
30 |
+
# Strip any additional timestamp or text following it
|
31 |
+
title = re.sub(pattern, "", line).strip()
|
32 |
+
title = title.lstrip(" -:") # Remove leading dash, colon, or space
|
33 |
+
title = title.strip()
|
34 |
+
if len(title) > 0:
|
35 |
+
chapters[time] = title
|
36 |
+
|
37 |
+
return chapters
|
38 |
+
|
39 |
+
|
40 |
+
def filter_chapters(chapters: dict, vid_duration: str | None = None):
|
41 |
+
if vid_duration:
|
42 |
+
filter_chapters = {}
|
43 |
+
for k, v in sorted(chapters.items()):
|
44 |
+
if k > vid_duration:
|
45 |
+
break
|
46 |
+
filter_chapters[k] = v
|
47 |
+
chapters = filter_chapters
|
48 |
+
|
49 |
+
# Check if chapters are in ordered by time
|
50 |
+
times = list(chapters.keys())
|
51 |
+
for i in range(1, len(times)):
|
52 |
+
if times[i] < times[i - 1]:
|
53 |
+
return {}
|
54 |
+
|
55 |
+
# remove empty chapters
|
56 |
+
chapters = {k: v for k, v in chapters.items() if len(v) > 0}
|
57 |
+
|
58 |
+
# if only one chapter at 00:00:00, return empty dict
|
59 |
+
if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00":
|
60 |
+
return {}
|
61 |
+
|
62 |
+
return chapters
|
63 |
+
|
64 |
+
|
65 |
+
if __name__ == "__main__":
    # Demo: parse a sample chapter listing, then keep only the chapters that
    # fit inside a 40-minute video and print them.
    sample_output = """
00:00:00 Introduction - good
00:05:30 - 00:05:33: Second Chapter
00:05:33: Another Chapter
00:90:00 - Wrong time
00:42:00 - After video duration
00:39:00 - What is this?
01:04:00 - Outside of video duration
"""
    parsed = extract_chapters(sample_output)
    kept = filter_chapters(parsed, vid_duration="00:40:00")
    for timestamp, heading in kept.items():
        print(f"Time: {timestamp}, Title: {heading}")
|