lucas-ventura committed on
Commit
0ca274b
·
verified ·
1 Parent(s): 6303c5d

Upload utils_chapters.py

Browse files
Files changed (1) hide show
  1. src/test/utils_chapters.py +79 -0
src/test/utils_chapters.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+
4
+ def extract_chapters(output: str | list[str]):
5
+ """
6
+ Extract chapters from the given output string or list of strings.
7
+
8
+ Args:
9
+ output (str | list[str]): The input text containing chapter information.
10
+ vid_duration (str | None): The video duration in hh:mm:ss format. Default is None.
11
+
12
+ Returns:
13
+ dict: A dictionary of extracted chapters with timestamps as keys and titles as values.
14
+ """
15
+
16
+ # Only capture the first timestamp (hh:mm:ss) and ignore the second.
17
+ pattern = r"(\d{2}:[0-5]\d:[0-5]\d)\b"
18
+ chapters = {}
19
+
20
+ if isinstance(output, str):
21
+ output = output.split("\n")
22
+
23
+ for line in output:
24
+ if len(line) == 0:
25
+ continue
26
+
27
+ match = re.search(pattern, line)
28
+ if match:
29
+ time = match.group(1)
30
+ # Strip any additional timestamp or text following it
31
+ title = re.sub(pattern, "", line).strip()
32
+ title = title.lstrip(" -:") # Remove leading dash, colon, or space
33
+ title = title.strip()
34
+ if len(title) > 0:
35
+ chapters[time] = title
36
+
37
+ return chapters
38
+
39
+
40
+ def filter_chapters(chapters: dict, vid_duration: str | None = None):
41
+ if vid_duration:
42
+ filter_chapters = {}
43
+ for k, v in sorted(chapters.items()):
44
+ if k > vid_duration:
45
+ break
46
+ filter_chapters[k] = v
47
+ chapters = filter_chapters
48
+
49
+ # Check if chapters are in ordered by time
50
+ times = list(chapters.keys())
51
+ for i in range(1, len(times)):
52
+ if times[i] < times[i - 1]:
53
+ return {}
54
+
55
+ # remove empty chapters
56
+ chapters = {k: v for k, v in chapters.items() if len(v) > 0}
57
+
58
+ # if only one chapter at 00:00:00, return empty dict
59
+ if len(chapters) == 1 and list(chapters.keys())[0] == "00:00:00":
60
+ return {}
61
+
62
+ return chapters
63
+
64
+
65
if __name__ == "__main__":
    # Demo: parse a small sample and keep only chapters that fit a
    # 40-minute video.
    sample = """
    00:00:00 Introduction - good
    00:05:30 - 00:05:33: Second Chapter
    00:05:33: Another Chapter
    00:90:00 - Wrong time
    00:42:00 - After video duration
    00:39:00 - What is this?
    01:04:00 - Outside of video duration
    """
    parsed = extract_chapters(sample)
    kept = filter_chapters(parsed, vid_duration="00:40:00")
    for stamp, heading in kept.items():
        print(f"Time: {stamp}, Title: {heading}")