codelion committed on
Commit
c43728b
·
verified ·
1 Parent(s): 5e2d98d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -44
app.py CHANGED
@@ -3,7 +3,7 @@ import json
3
  import gradio as gr
4
  import cv2
5
  from google import genai
6
- from google.genai.types import Part, GenerateContentConfig
7
  from tenacity import retry, stop_after_attempt, wait_random_exponential
8
 
9
  # Retrieve API key from environment variables.
@@ -18,12 +18,11 @@ client = genai.Client(api_key=GOOGLE_API_KEY)
18
  MODEL_NAME = "gemini-2.0-flash-001"
19
 
20
  @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
21
- def call_gemini(video_file: str, prompt: str, config: GenerateContentConfig = None) -> str:
22
  """
23
  Call the Gemini model with the provided video file and prompt.
24
  The video file is read as bytes and passed with MIME type "video/mp4".
25
  The prompt is passed as a plain string.
26
- Optionally accepts a config (e.g. response_schema) for structured output.
27
  """
28
  with open(video_file, "rb") as f:
29
  file_bytes = f.read()
@@ -31,9 +30,8 @@ def call_gemini(video_file: str, prompt: str, config: GenerateContentConfig = No
31
  model=MODEL_NAME,
32
  contents=[
33
  Part(file_data=file_bytes, mime_type="video/mp4"),
34
- prompt, # Pass prompt as a plain string
35
- ],
36
- config=config
37
  )
38
  return response.text
39
 
@@ -52,43 +50,34 @@ def hhmmss_to_seconds(time_str: str) -> float:
52
 
53
  def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
54
  """
55
- Prompt Gemini to return key frame timestamps (in HH:MM:SS) with descriptions,
56
- then extract those frames from the uploaded video file using OpenCV.
 
 
 
57
 
58
  Returns a list of tuples: (image_array, caption)
59
  """
60
- # Define a response schema for key frames.
61
- response_schema = {
62
- "type": "ARRAY",
63
- "items": {
64
- "type": "OBJECT",
65
- "properties": {
66
- "timestamp": {"type": "string"},
67
- "description": {"type": "string"}
68
- },
69
- "required": ["timestamp", "description"]
70
- }
71
- }
72
- config = GenerateContentConfig(
73
- temperature=0.0,
74
- max_output_tokens=1024,
75
- response_mime_type="application/json",
76
- response_schema=response_schema
77
- )
78
  prompt = (
79
- "From the following video analysis, list key frames with their timestamps (in HH:MM:SS format) "
80
- "and a brief description of the important event at that timestamp. "
81
- "Return the result as a JSON array of objects with keys 'timestamp' and 'description'."
82
  )
83
  prompt += f" Video Analysis: {analysis}"
84
  if user_query:
85
  prompt += f" Additional focus: {user_query}"
86
 
87
  try:
88
- key_frames_response = call_gemini(video_file, prompt, config=config)
89
- key_frames = json.loads(key_frames_response)
90
- if not isinstance(key_frames, list):
91
- key_frames = []
 
 
 
 
 
 
92
  except Exception as e:
93
  print("Error in key frame extraction:", e)
94
  key_frames = []
@@ -110,6 +99,7 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
110
  cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
111
  ret, frame = cap.read()
112
  if ret:
 
113
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
114
  caption = f"{ts}: {description}"
115
  extracted_frames.append((frame_rgb, caption))
@@ -120,7 +110,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
120
  """
121
  Perform iterative, agentic video analysis on the uploaded file.
122
  First, refine the video analysis over several iterations.
123
- Then, prompt the model to identify key frames.
124
 
125
  Returns:
126
  - A Markdown report as a string.
@@ -137,16 +127,13 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
137
  if user_query:
138
  base_prompt += f" Also, focus on the following query: {user_query}"
139
 
140
- if i == 0:
141
- prompt = base_prompt
142
- else:
143
- prompt = (
144
- f"Based on the previous analysis: \"{analysis}\". "
145
- "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
146
- "and details that would help a security team understand the situation better."
147
- )
148
- if user_query:
149
- prompt += f" Remember to focus on: {user_query}"
150
 
151
  try:
152
  analysis = call_gemini(video_file, prompt)
 
3
  import gradio as gr
4
  import cv2
5
  from google import genai
6
+ from google.genai.types import Part
7
  from tenacity import retry, stop_after_attempt, wait_random_exponential
8
 
9
  # Retrieve API key from environment variables.
 
18
  MODEL_NAME = "gemini-2.0-flash-001"
19
 
20
  @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
21
+ def call_gemini(video_file: str, prompt: str) -> str:
22
  """
23
  Call the Gemini model with the provided video file and prompt.
24
  The video file is read as bytes and passed with MIME type "video/mp4".
25
  The prompt is passed as a plain string.
 
26
  """
27
  with open(video_file, "rb") as f:
28
  file_bytes = f.read()
 
30
  model=MODEL_NAME,
31
  contents=[
32
  Part(file_data=file_bytes, mime_type="video/mp4"),
33
+ prompt # prompt is passed as a plain string
34
+ ]
 
35
  )
36
  return response.text
37
 
 
50
 
51
  def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
52
  """
53
+ Prompt Gemini to output key frame information in plain text.
54
+ The prompt instructs the model to list key timestamps (in HH:MM:SS format)
55
+ and a brief description for each important event, one per line in the format:
56
+ HH:MM:SS - description.
57
+ We then parse these lines and extract the corresponding frames from the video.
58
 
59
  Returns a list of tuples: (image_array, caption)
60
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  prompt = (
62
+ "Based on the following video analysis, list the key timestamps (in HH:MM:SS format) "
63
+ "and a brief description of each important event or anomaly. For each event, output a separate line "
64
+ "in the following format: HH:MM:SS - description. Do not include any extra text."
65
  )
66
  prompt += f" Video Analysis: {analysis}"
67
  if user_query:
68
  prompt += f" Additional focus: {user_query}"
69
 
70
  try:
71
+ key_frames_response = call_gemini(video_file, prompt)
72
+ # Parse plain text output: each line should be "HH:MM:SS - description"
73
+ lines = key_frames_response.strip().split("\n")
74
+ key_frames = []
75
+ for line in lines:
76
+ if " - " in line:
77
+ parts = line.split(" - ", 1)
78
+ timestamp = parts[0].strip()
79
+ description = parts[1].strip()
80
+ key_frames.append({"timestamp": timestamp, "description": description})
81
  except Exception as e:
82
  print("Error in key frame extraction:", e)
83
  key_frames = []
 
99
  cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
100
  ret, frame = cap.read()
101
  if ret:
102
+ # Convert BGR to RGB for proper display
103
  frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
104
  caption = f"{ts}: {description}"
105
  extracted_frames.append((frame_rgb, caption))
 
110
  """
111
  Perform iterative, agentic video analysis on the uploaded file.
112
  First, refine the video analysis over several iterations.
113
+ Then, prompt the model to provide key timestamp information.
114
 
115
  Returns:
116
  - A Markdown report as a string.
 
127
  if user_query:
128
  base_prompt += f" Also, focus on the following query: {user_query}"
129
 
130
+ prompt = base_prompt if i == 0 else (
131
+ f"Based on the previous analysis: \"{analysis}\". "
132
+ "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
133
+ "and details that would help a security team understand the situation better."
134
+ )
135
+ if user_query and i > 0:
136
+ prompt += f" Remember to focus on: {user_query}"
 
 
 
137
 
138
  try:
139
  analysis = call_gemini(video_file, prompt)