Update app.py
app.py (CHANGED)
@@ -30,7 +30,7 @@ def call_gemini(video_file: str, prompt: str) -> str:
         model=MODEL_NAME,
         contents=[
             Part(file_data=file_bytes, mime_type="video/mp4"),
-            prompt
+            prompt
         ]
     )
     return response.text
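For context, a minimal sketch of this kind of Gemini call with inline video bytes, assuming the google-genai SDK; the API key handling, model name, and file path below are illustrative placeholders, not taken from this repo:

import pathlib

from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # assumed: an AI Studio API key

video_bytes = pathlib.Path("sample.mp4").read_bytes()  # hypothetical local file
response = client.models.generate_content(
    model="gemini-2.0-flash",  # assumed model name
    contents=[
        types.Part.from_bytes(data=video_bytes, mime_type="video/mp4"),
        "Give a detailed summary of the video.",
    ],
)
print(response.text)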
@@ -50,26 +50,23 @@ def hhmmss_to_seconds(time_str: str) -> float:
 
 def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
     """
-    …
-    The …
-    …
-    …
-    We then parse these lines and extract the corresponding frames from the video.
+    Ask Gemini to list key timestamps and descriptions for the video.
+    The model is instructed to output one line per event in the format:
+        HH:MM:SS - description
+    We then parse these lines and extract the corresponding frames using OpenCV.
 
     Returns a list of tuples: (image_array, caption)
     """
     prompt = (
-        "…
-        "…
-        "in the following format: HH:MM:SS - description. Do not include any extra text."
+        "List the key timestamps in the video and a brief description of the important event at that time. "
+        "Output one line per event in the following format: HH:MM:SS - description. Do not include any extra text."
     )
-    prompt += f" Video …
+    prompt += f" Video Summary: {analysis}"
     if user_query:
         prompt += f" Additional focus: {user_query}"
 
     try:
         key_frames_response = call_gemini(video_file, prompt)
-        # Parse plain text output: each line should be "HH:MM:SS - description"
         lines = key_frames_response.strip().split("\n")
         key_frames = []
         for line in lines:
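The line-by-line parsing that follows this prompt is not fully shown in the hunk; a minimal, self-contained sketch of one way to turn "HH:MM:SS - description" lines into seconds plus captions (the regex, helper body, and sample text are illustrative assumptions):

import re

def hhmmss_to_seconds(time_str: str) -> float:
    # "01:02:03" -> 3723.0
    hours, minutes, seconds = (int(part) for part in time_str.split(":"))
    return float(hours * 3600 + minutes * 60 + seconds)

sample_response = "00:00:05 - person enters the frame\n00:01:10 - a door closes"
for line in sample_response.strip().split("\n"):
    match = re.match(r"^(\d{2}:\d{2}:\d{2})\s*-\s*(.+)$", line.strip())
    if not match:
        continue  # skip lines that do not follow "HH:MM:SS - description"
    ts, description = match.groups()
    print(hhmmss_to_seconds(ts), description)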
@@ -95,11 +92,9 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
             seconds = hhmmss_to_seconds(ts)
         except Exception:
             continue
-        # Set video position (in milliseconds)
         cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
         ret, frame = cap.read()
         if ret:
-            # Convert BGR to RGB for proper display
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             caption = f"{ts}: {description}"
             extracted_frames.append((frame_rgb, caption))
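A minimal standalone sketch of the OpenCV seek-and-grab used above; the video path and timestamp are placeholders:

import cv2

cap = cv2.VideoCapture("sample.mp4")  # hypothetical input file
seconds = 5.0
cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)  # seek to the timestamp, in milliseconds
ret, frame = cap.read()
if ret:
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes frames as BGR
    print(frame_rgb.shape)
cap.release()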
@@ -108,33 +103,28 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
 
 def analyze_video(video_file: str, user_query: str) -> (str, list):
     """
-    Perform iterative …
-    …
-    Then, prompt the model to provide key timestamp information.
+    Perform iterative video analysis on the uploaded file.
+    Iteratively refine the summary with simpler prompts, then ask for key timestamps.
 
     Returns:
-    - A Markdown report …
+    - A Markdown report (string) summarizing the video.
     - A gallery list of key frames (each as a tuple of (image, caption)).
     """
     analysis = ""
     num_iterations = 3
 
     for i in range(num_iterations):
-        …
-        "…
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        )
-        if user_query and i > 0:
-            prompt += f" Remember to focus on: {user_query}"
-
+        if i == 0:
+            prompt = "Give a detailed summary of the video."
+            if user_query:
+                prompt += f" Also focus on: {user_query}"
+        elif i == 1:
+            prompt = f"Based on the summary: \"{analysis}\", provide additional details about important events and anomalies in the video."
+            if user_query:
+                prompt += f" Also focus on: {user_query}"
+        else:
+            prompt = f"Refine and consolidate the analysis: \"{analysis}\" into a final summary."
+
         try:
             analysis = call_gemini(video_file, prompt)
         except Exception as e:
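The three-pass refinement added above can be exercised in isolation with a stubbed model call; call_model below is a stand-in for the real call_gemini, and the printed output is illustrative only:

def call_model(prompt: str) -> str:
    # Stand-in for the real Gemini call so the loop can run offline.
    return f"[model output for: {prompt[:60]}...]"

def iterative_analysis(user_query: str = "", num_iterations: int = 3) -> str:
    analysis = ""
    for i in range(num_iterations):
        if i == 0:
            prompt = "Give a detailed summary of the video."
        elif i == 1:
            prompt = f'Based on the summary: "{analysis}", provide additional details about important events and anomalies in the video.'
        else:
            prompt = f'Refine and consolidate the analysis: "{analysis}" into a final summary.'
        if user_query and i < 2:
            # Mirrors the diff: the user query is appended on the first two passes only.
            prompt += f" Also focus on: {user_query}"
        analysis = call_model(prompt)
    return analysis

print(iterative_analysis("watch for anomalies"))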
@@ -155,7 +145,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
 def gradio_interface(video_file, user_query: str) -> (str, list):
     """
     Gradio interface function that accepts an uploaded video file and an optional query,
-    then returns a Markdown report and a gallery of key …
+    then returns a Markdown report and a gallery of extracted key frames with captions.
     """
     if not video_file:
         return "Please upload a valid video file.", []
@@ -174,9 +164,8 @@ iface = gr.Interface(
     title="AI Video Analysis and Summariser Agent",
     description=(
         "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
-        "to iteratively analyze an uploaded video for …
-        "…
-        "Markdown report along with a gallery of key frame images."
+        "to iteratively analyze an uploaded video for insights. Provide a video file and, optionally, "
+        "a query to guide the analysis. The tool returns a Markdown report along with a gallery of key frame images."
     )
 )
 
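The hunk above only touches the interface description; a minimal sketch of how such a gr.Interface is typically wired, where the input/output components and the placeholder function are assumptions about the rest of the file rather than lines shown in this diff:

import gradio as gr

def analyze(video_file, user_query):
    # Placeholder for the real analysis pipeline: returns a Markdown report and a gallery list.
    if not video_file:
        return "Please upload a valid video file.", []
    return "## Demo report", []

iface = gr.Interface(
    fn=analyze,
    inputs=[gr.Video(label="Video"), gr.Textbox(label="Optional query")],
    outputs=[gr.Markdown(label="Report"), gr.Gallery(label="Key frames")],
    title="AI Video Analysis and Summariser Agent",
)

if __name__ == "__main__":
    iface.launch()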