codelion committed on
Commit
4938676
·
verified ·
1 Parent(s): c43728b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -36
app.py CHANGED
@@ -30,7 +30,7 @@ def call_gemini(video_file: str, prompt: str) -> str:
30
  model=MODEL_NAME,
31
  contents=[
32
  Part(file_data=file_bytes, mime_type="video/mp4"),
33
- prompt # prompt is passed as a plain string
34
  ]
35
  )
36
  return response.text
@@ -50,26 +50,23 @@ def hhmmss_to_seconds(time_str: str) -> float:
50
 
51
  def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
52
  """
53
- Prompt Gemini to output key frame information in plain text.
54
- The prompt instructs the model to list key timestamps (in HH:MM:SS format)
55
- and a brief description for each important event, one per line in the format:
56
- HH:MM:SS - description.
57
- We then parse these lines and extract the corresponding frames from the video.
58
 
59
  Returns a list of tuples: (image_array, caption)
60
  """
61
  prompt = (
62
- "Based on the following video analysis, list the key timestamps (in HH:MM:SS format) "
63
- "and a brief description of each important event or anomaly. For each event, output a separate line "
64
- "in the following format: HH:MM:SS - description. Do not include any extra text."
65
  )
66
- prompt += f" Video Analysis: {analysis}"
67
  if user_query:
68
  prompt += f" Additional focus: {user_query}"
69
 
70
  try:
71
  key_frames_response = call_gemini(video_file, prompt)
72
- # Parse plain text output: each line should be "HH:MM:SS - description"
73
  lines = key_frames_response.strip().split("\n")
74
  key_frames = []
75
  for line in lines:
@@ -95,11 +92,9 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
95
  seconds = hhmmss_to_seconds(ts)
96
  except Exception:
97
  continue
98
- # Set video position (in milliseconds)
99
  cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
100
  ret, frame = cap.read()
101
  if ret:
102
- # Convert BGR to RGB for proper display
103
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
104
  caption = f"{ts}: {description}"
105
  extracted_frames.append((frame_rgb, caption))
@@ -108,33 +103,28 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
108
 
109
  def analyze_video(video_file: str, user_query: str) -> (str, list):
110
  """
111
- Perform iterative, agentic video analysis on the uploaded file.
112
- First, refine the video analysis over several iterations.
113
- Then, prompt the model to provide key timestamp information.
114
 
115
  Returns:
116
- - A Markdown report as a string.
117
  - A gallery list of key frames (each as a tuple of (image, caption)).
118
  """
119
  analysis = ""
120
  num_iterations = 3
121
 
122
  for i in range(num_iterations):
123
- base_prompt = (
124
- "You are a video analysis agent focusing on security and surveillance. "
125
- "Provide a detailed summary of the video, highlighting key events, suspicious activities, or anomalies."
126
- )
127
- if user_query:
128
- base_prompt += f" Also, focus on the following query: {user_query}"
129
-
130
- prompt = base_prompt if i == 0 else (
131
- f"Based on the previous analysis: \"{analysis}\". "
132
- "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
133
- "and details that would help a security team understand the situation better."
134
- )
135
- if user_query and i > 0:
136
- prompt += f" Remember to focus on: {user_query}"
137
-
138
  try:
139
  analysis = call_gemini(video_file, prompt)
140
  except Exception as e:
@@ -155,7 +145,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
155
  def gradio_interface(video_file, user_query: str) -> (str, list):
156
  """
157
  Gradio interface function that accepts an uploaded video file and an optional query,
158
- then returns a Markdown report and a gallery of key frame images with captions.
159
  """
160
  if not video_file:
161
  return "Please upload a valid video file.", []
@@ -174,9 +164,8 @@ iface = gr.Interface(
174
  title="AI Video Analysis and Summariser Agent",
175
  description=(
176
  "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
177
- "to iteratively analyze an uploaded video for security and surveillance insights. "
178
- "Provide a video file and, optionally, a query to guide the analysis. The tool returns a detailed "
179
- "Markdown report along with a gallery of key frame images."
180
  )
181
  )
182
 
 
30
  model=MODEL_NAME,
31
  contents=[
32
  Part(file_data=file_bytes, mime_type="video/mp4"),
33
+ prompt
34
  ]
35
  )
36
  return response.text
 
50
 
51
  def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
52
  """
53
+ Ask Gemini to list key timestamps and descriptions for the video.
54
+ The model is instructed to output one line per event in the format:
55
+ HH:MM:SS - description
56
+ We then parse these lines and extract the corresponding frames using OpenCV.
 
57
 
58
  Returns a list of tuples: (image_array, caption)
59
  """
60
  prompt = (
61
+ "List the key timestamps in the video and a brief description of the important event at that time. "
62
+ "Output one line per event in the following format: HH:MM:SS - description. Do not include any extra text."
 
63
  )
64
+ prompt += f" Video Summary: {analysis}"
65
  if user_query:
66
  prompt += f" Additional focus: {user_query}"
67
 
68
  try:
69
  key_frames_response = call_gemini(video_file, prompt)
 
70
  lines = key_frames_response.strip().split("\n")
71
  key_frames = []
72
  for line in lines:
 
92
  seconds = hhmmss_to_seconds(ts)
93
  except Exception:
94
  continue
 
95
  cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
96
  ret, frame = cap.read()
97
  if ret:
 
98
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
99
  caption = f"{ts}: {description}"
100
  extracted_frames.append((frame_rgb, caption))
 
103
 
104
  def analyze_video(video_file: str, user_query: str) -> (str, list):
105
  """
106
+ Perform iterative video analysis on the uploaded file.
107
+ Iteratively refine the summary with simpler prompts, then ask for key timestamps.
 
108
 
109
  Returns:
110
+ - A Markdown report (string) summarizing the video.
111
  - A gallery list of key frames (each as a tuple of (image, caption)).
112
  """
113
  analysis = ""
114
  num_iterations = 3
115
 
116
  for i in range(num_iterations):
117
+ if i == 0:
118
+ prompt = "Give a detailed summary of the video."
119
+ if user_query:
120
+ prompt += f" Also focus on: {user_query}"
121
+ elif i == 1:
122
+ prompt = f"Based on the summary: \"{analysis}\", provide additional details about important events and anomalies in the video."
123
+ if user_query:
124
+ prompt += f" Also focus on: {user_query}"
125
+ else:
126
+ prompt = f"Refine and consolidate the analysis: \"{analysis}\" into a final summary."
127
+
 
 
 
 
128
  try:
129
  analysis = call_gemini(video_file, prompt)
130
  except Exception as e:
 
145
  def gradio_interface(video_file, user_query: str) -> (str, list):
146
  """
147
  Gradio interface function that accepts an uploaded video file and an optional query,
148
+ then returns a Markdown report and a gallery of extracted key frames with captions.
149
  """
150
  if not video_file:
151
  return "Please upload a valid video file.", []
 
164
  title="AI Video Analysis and Summariser Agent",
165
  description=(
166
  "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
167
+ "to iteratively analyze an uploaded video for insights. Provide a video file and, optionally, "
168
+ "a query to guide the analysis. The tool returns a Markdown report along with a gallery of key frame images."
 
169
  )
170
  )
171