Spaces:

codelion
/

videoanalysis

Sleeping

App Files Files Community

codelion committed on Apr 2

Commit

c43728b

verified ·

1 Parent(s): 5e2d98d

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -44

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import json
 import gradio as gr
 import cv2
 from google import genai
-from google.genai.types import Part, GenerateContentConfig
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 # Retrieve API key from environment variables.
@@ -18,12 +18,11 @@ client = genai.Client(api_key=GOOGLE_API_KEY)
 MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
-def call_gemini(video_file: str, prompt: str, config: GenerateContentConfig = None) -> str:
     """
     Call the Gemini model with the provided video file and prompt.
     The video file is read as bytes and passed with MIME type "video/mp4".
     The prompt is passed as a plain string.
-    Optionally accepts a config (e.g. response_schema) for structured output.
     """
     with open(video_file, "rb") as f:
         file_bytes = f.read()
@@ -31,9 +30,8 @@ def call_gemini(video_file: str, prompt: str, config: GenerateContentConfig = No
         model=MODEL_NAME,
         contents=[
             Part(file_data=file_bytes, mime_type="video/mp4"),
-            prompt,  # Pass prompt as a plain string
-        ],
-        config=config
     )
     return response.text
@@ -52,43 +50,34 @@ def hhmmss_to_seconds(time_str: str) -> float:
 def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
     """
-    Prompt Gemini to return key frame timestamps (in HH:MM:SS) with descriptions,
-    then extract those frames from the uploaded video file using OpenCV.
     Returns a list of tuples: (image_array, caption)
     """
-    # Define a response schema for key frames.
-    response_schema = {
-        "type": "ARRAY",
-        "items": {
-            "type": "OBJECT",
-            "properties": {
-                "timestamp": {"type": "string"},
-                "description": {"type": "string"}
-            },
-            "required": ["timestamp", "description"]
-        }
-    }
-    config = GenerateContentConfig(
-        temperature=0.0,
-        max_output_tokens=1024,
-        response_mime_type="application/json",
-        response_schema=response_schema
-    )
     prompt = (
-        "From the following video analysis, list key frames with their timestamps (in HH:MM:SS format) "
-        "and a brief description of the important event at that timestamp. "
-        "Return the result as a JSON array of objects with keys 'timestamp' and 'description'."
     )
     prompt += f" Video Analysis: {analysis}"
     if user_query:
         prompt += f" Additional focus: {user_query}"
     try:
-        key_frames_response = call_gemini(video_file, prompt, config=config)
-        key_frames = json.loads(key_frames_response)
-        if not isinstance(key_frames, list):
-            key_frames = []
     except Exception as e:
         print("Error in key frame extraction:", e)
         key_frames = []
@@ -110,6 +99,7 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
         cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
         ret, frame = cap.read()
         if ret:
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             caption = f"{ts}: {description}"
             extracted_frames.append((frame_rgb, caption))
@@ -120,7 +110,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
     """
     Perform iterative, agentic video analysis on the uploaded file.
     First, refine the video analysis over several iterations.
-    Then, prompt the model to identify key frames.
     Returns:
       - A Markdown report as a string.
@@ -137,16 +127,13 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
         if user_query:
             base_prompt += f" Also, focus on the following query: {user_query}"
-        if i == 0:
-            prompt = base_prompt
-        else:
-            prompt = (
-                f"Based on the previous analysis: \"{analysis}\". "
-                "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
-                "and details that would help a security team understand the situation better."
-            )
-            if user_query:
-                prompt += f" Remember to focus on: {user_query}"
         try:
             analysis = call_gemini(video_file, prompt)

 import gradio as gr
 import cv2
 from google import genai
+from google.genai.types import Part
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 # Retrieve API key from environment variables.
 MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
+def call_gemini(video_file: str, prompt: str) -> str:
     """
     Call the Gemini model with the provided video file and prompt.
     The video file is read as bytes and passed with MIME type "video/mp4".
     The prompt is passed as a plain string.
     """
     with open(video_file, "rb") as f:
         file_bytes = f.read()
         model=MODEL_NAME,
         contents=[
             Part(file_data=file_bytes, mime_type="video/mp4"),
+            prompt  # prompt is passed as a plain string
+        ]
     )
     return response.text
 def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
     """
+    Prompt Gemini to output key frame information in plain text.
+    The prompt instructs the model to list key timestamps (in HH:MM:SS format)
+    and a brief description for each important event, one per line in the format:
+    HH:MM:SS - description.
+    We then parse these lines and extract the corresponding frames from the video.
     Returns a list of tuples: (image_array, caption)
     """
     prompt = (
+        "Based on the following video analysis, list the key timestamps (in HH:MM:SS format) "
+        "and a brief description of each important event or anomaly. For each event, output a separate line "
+        "in the following format: HH:MM:SS - description. Do not include any extra text."
     )
     prompt += f" Video Analysis: {analysis}"
     if user_query:
         prompt += f" Additional focus: {user_query}"
     try:
+        key_frames_response = call_gemini(video_file, prompt)
+        # Parse plain text output: each line should be "HH:MM:SS - description"
+        lines = key_frames_response.strip().split("\n")
+        key_frames = []
+        for line in lines:
+            if " - " in line:
+                parts = line.split(" - ", 1)
+                timestamp = parts[0].strip()
+                description = parts[1].strip()
+                key_frames.append({"timestamp": timestamp, "description": description})
     except Exception as e:
         print("Error in key frame extraction:", e)
         key_frames = []
         cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
         ret, frame = cap.read()
         if ret:
+            # Convert BGR to RGB for proper display
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             caption = f"{ts}: {description}"
             extracted_frames.append((frame_rgb, caption))
     """
     Perform iterative, agentic video analysis on the uploaded file.
     First, refine the video analysis over several iterations.
+    Then, prompt the model to provide key timestamp information.
     Returns:
       - A Markdown report as a string.
         if user_query:
             base_prompt += f" Also, focus on the following query: {user_query}"
+        prompt = base_prompt if i == 0 else (
+            f"Based on the previous analysis: \"{analysis}\". "
+            "Provide further elaboration and refined insights, focusing on potential security threats, anomalous events, "
+            "and details that would help a security team understand the situation better."
+        )
+        if user_query and i > 0:
+            prompt += f" Remember to focus on: {user_query}"
         try:
             analysis = call_gemini(video_file, prompt)