Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -3,7 +3,7 @@ import json
|
|
3 |
import gradio as gr
|
4 |
import cv2
|
5 |
from google import genai
|
6 |
-
from google.genai.types import Part
|
7 |
from tenacity import retry, stop_after_attempt, wait_random_exponential
|
8 |
|
9 |
# Retrieve API key from environment variables.
|
@@ -18,12 +18,11 @@ client = genai.Client(api_key=GOOGLE_API_KEY)
|
|
18 |
MODEL_NAME = "gemini-2.0-flash-001"
|
19 |
|
20 |
@retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
|
21 |
-
def call_gemini(video_file: str, prompt: str
|
22 |
"""
|
23 |
Call the Gemini model with the provided video file and prompt.
|
24 |
The video file is read as bytes and passed with MIME type "video/mp4".
|
25 |
The prompt is passed as a plain string.
|
26 |
-
Optionally accepts a config (e.g. response_schema) for structured output.
|
27 |
"""
|
28 |
with open(video_file, "rb") as f:
|
29 |
file_bytes = f.read()
|
@@ -31,9 +30,8 @@ def call_gemini(video_file: str, prompt: str, config: GenerateContentConfig = No
|
|
31 |
model=MODEL_NAME,
|
32 |
contents=[
|
33 |
Part(file_data=file_bytes, mime_type="video/mp4"),
|
34 |
-
prompt
|
35 |
-
]
|
36 |
-
config=config
|
37 |
)
|
38 |
return response.text
|
39 |
|
@@ -52,43 +50,34 @@ def hhmmss_to_seconds(time_str: str) -> float:
|
|
52 |
|
53 |
def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
|
54 |
"""
|
55 |
-
Prompt Gemini to
|
56 |
-
|
|
|
|
|
|
|
57 |
|
58 |
Returns a list of tuples: (image_array, caption)
|
59 |
"""
|
60 |
-
# Define a response schema for key frames.
|
61 |
-
response_schema = {
|
62 |
-
"type": "ARRAY",
|
63 |
-
"items": {
|
64 |
-
"type": "OBJECT",
|
65 |
-
"properties": {
|
66 |
-
"timestamp": {"type": "string"},
|
67 |
-
"description": {"type": "string"}
|
68 |
-
},
|
69 |
-
"required": ["timestamp", "description"]
|
70 |
-
}
|
71 |
-
}
|
72 |
-
config = GenerateContentConfig(
|
73 |
-
temperature=0.0,
|
74 |
-
max_output_tokens=1024,
|
75 |
-
response_mime_type="application/json",
|
76 |
-
response_schema=response_schema
|
77 |
-
)
|
78 |
prompt = (
|
79 |
-
"
|
80 |
-
"and a brief description of
|
81 |
-
"
|
82 |
)
|
83 |
prompt += f" Video Analysis: {analysis}"
|
84 |
if user_query:
|
85 |
prompt += f" Additional focus: {user_query}"
|
86 |
|
87 |
try:
|
88 |
-
key_frames_response = call_gemini(video_file, prompt
|
89 |
-
|
90 |
-
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
except Exception as e:
|
93 |
print("Error in key frame extraction:", e)
|
94 |
key_frames = []
|
@@ -110,6 +99,7 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
|
|
110 |
cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
|
111 |
ret, frame = cap.read()
|
112 |
if ret:
|
|
|
113 |
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
114 |
caption = f"{ts}: {description}"
|
115 |
extracted_frames.append((frame_rgb, caption))
|
@@ -120,7 +110,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
|
|
120 |
"""
|
121 |
Perform iterative, agentic video analysis on the uploaded file.
|
122 |
First, refine the video analysis over several iterations.
|
123 |
-
Then, prompt the model to
|
124 |
|
125 |
Returns:
|
126 |
- A Markdown report as a string.
|
@@ -137,16 +127,13 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
|
|
137 |
if user_query:
|
138 |
base_prompt += f" Also, focus on the following query: {user_query}"
|
139 |
|
140 |
-
if i == 0
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
)
|
148 |
-
if user_query:
|
149 |
-
prompt += f" Remember to focus on: {user_query}"
|
150 |
|
151 |
try:
|
152 |
analysis = call_gemini(video_file, prompt)
|
|
|
3 |
import gradio as gr
|
4 |
import cv2
|
5 |
from google import genai
|
6 |
+
from google.genai.types import Part
|
7 |
from tenacity import retry, stop_after_attempt, wait_random_exponential
|
8 |
|
9 |
# Retrieve API key from environment variables.
|
|
|
18 |
MODEL_NAME = "gemini-2.0-flash-001"
|
19 |
|
20 |
@retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
def call_gemini(video_file: str, prompt: str) -> str:
    """Send a video file and a text prompt to Gemini and return the reply text.

    The whole function body runs under the ``retry`` decorator, so any
    exception raised here (including a failure to read the file) triggers
    another attempt, with randomized exponential back-off capped at 60
    seconds, for at most 3 attempts in total.

    Args:
        video_file: Path of the video on disk. It is read fully into memory
            and sent inline with MIME type ``video/mp4``.
        prompt: Instruction text, passed to the model as a plain string.

    Returns:
        The ``text`` attribute of the model response.
    """
    with open(video_file, "rb") as f:
        file_bytes = f.read()

    # NOTE(review): the line opening this call is not visible in the diff
    # hunk; it is reconstructed from the module-level
    # ``client = genai.Client(...)`` -- confirm against the real file.
    response = client.models.generate_content(
        model=MODEL_NAME,
        contents=[
            # Inline bytes must be wrapped with Part.from_bytes: Part has no
            # ``mime_type`` field and ``file_data`` expects a FileData URI
            # reference, not raw bytes.
            Part.from_bytes(data=file_bytes, mime_type="video/mp4"),
            prompt,
        ],
    )
    return response.text
|
37 |
|
|
|
50 |
|
51 |
def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
|
52 |
"""
|
53 |
+
Prompt Gemini to output key frame information in plain text.
|
54 |
+
The prompt instructs the model to list key timestamps (in HH:MM:SS format)
|
55 |
+
and a brief description for each important event, one per line in the format:
|
56 |
+
HH:MM:SS - description.
|
57 |
+
We then parse these lines and extract the corresponding frames from the video.
|
58 |
|
59 |
Returns a list of tuples: (image_array, caption)
|
60 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
prompt = (
|
62 |
+
"Based on the following video analysis, list the key timestamps (in HH:MM:SS format) "
|
63 |
+
"and a brief description of each important event or anomaly. For each event, output a separate line "
|
64 |
+
"in the following format: HH:MM:SS - description. Do not include any extra text."
|
65 |
)
|
66 |
prompt += f" Video Analysis: {analysis}"
|
67 |
if user_query:
|
68 |
prompt += f" Additional focus: {user_query}"
|
69 |
|
70 |
try:
|
71 |
+
key_frames_response = call_gemini(video_file, prompt)
|
72 |
+
# Parse plain text output: each line should be "HH:MM:SS - description"
|
73 |
+
lines = key_frames_response.strip().split("\n")
|
74 |
+
key_frames = []
|
75 |
+
for line in lines:
|
76 |
+
if " - " in line:
|
77 |
+
parts = line.split(" - ", 1)
|
78 |
+
timestamp = parts[0].strip()
|
79 |
+
description = parts[1].strip()
|
80 |
+
key_frames.append({"timestamp": timestamp, "description": description})
|
81 |
except Exception as e:
|
82 |
print("Error in key frame extraction:", e)
|
83 |
key_frames = []
|
|
|
99 |
cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
|
100 |
ret, frame = cap.read()
|
101 |
if ret:
|
102 |
+
# Convert BGR to RGB for proper display
|
103 |
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
104 |
caption = f"{ts}: {description}"
|
105 |
extracted_frames.append((frame_rgb, caption))
|
|
|
110 |
"""
|
111 |
Perform iterative, agentic video analysis on the uploaded file.
|
112 |
First, refine the video analysis over several iterations.
|
113 |
+
Then, prompt the model to provide key timestamp information.
|
114 |
|
115 |
Returns:
|
116 |
- A Markdown report as a string.
|
|
|
127 |
if user_query:
|
128 |
base_prompt += f" Also, focus on the following query: {user_query}"
|
129 |
|
130 |
+
prompt = base_prompt if i == 0 else (
|
131 |
+
f"Based on the previous analysis: \"{analysis}\". "
|
132 |
+
"Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
|
133 |
+
"and details that would help a security team understand the situation better."
|
134 |
+
)
|
135 |
+
if user_query and i > 0:
|
136 |
+
prompt += f" Remember to focus on: {user_query}"
|
|
|
|
|
|
|
137 |
|
138 |
try:
|
139 |
analysis = call_gemini(video_file, prompt)
|