Spaces:

codelion
/

videoanalysis

Sleeping

App Files Community

codelion committed on Apr 2

Commit

cba459f

verified ·

1 Parent(s): 03c6357

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -11

app.py CHANGED Viewed

@@ -3,7 +3,7 @@ import json
 import gradio as gr
 import cv2
 from google import genai
-from google.genai.types import Part
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 # Retrieve API key from environment variables.
@@ -18,11 +18,11 @@ client = genai.Client(api_key=GOOGLE_API_KEY)
 MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
-def call_gemini(video_file: str, prompt: str) -> str:
     """
     Call the Gemini model with the provided video file and prompt.
-    The video file is read as bytes and passed with MIME type "video/mp4",
-    and the prompt is wrapped as text.
     """
     with open(video_file, "rb") as f:
         file_bytes = f.read()
@@ -32,6 +32,7 @@ def call_gemini(video_file: str, prompt: str) -> str:
             Part(file_data=file_bytes, mime_type="video/mp4"),
             Part(text=prompt)
         ],
     )
     return response.text
@@ -42,9 +43,9 @@ def hhmmss_to_seconds(time_str: str) -> float:
     parts = time_str.strip().split(":")
     parts = [float(p) for p in parts]
     if len(parts) == 3:
-        return parts[0]*3600 + parts[1]*60 + parts[2]
     elif len(parts) == 2:
-        return parts[0]*60 + parts[1]
     else:
         return parts[0]
@@ -55,18 +56,35 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
     Returns a list of tuples: (image_array, caption)
     """
     prompt = (
-        "Based on the following video analysis, identify key frames that best illustrate "
-        "the important events or anomalies. Return a JSON array where each element is an object "
-        "with two keys: 'timestamp' (in HH:MM:SS format) and 'description' (a brief explanation of why "
-        "this frame is important)."
     )
     prompt += f" Video Analysis: {analysis}"
     if user_query:
         prompt += f" Additional focus: {user_query}"
     try:
-        key_frames_response = call_gemini(video_file, prompt)
         key_frames = json.loads(key_frames_response)
         if not isinstance(key_frames, list):
             key_frames = []
@@ -87,6 +105,7 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
             seconds = hhmmss_to_seconds(ts)
         except Exception:
             continue
         cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
         ret, frame = cap.read()
         if ret:

 import gradio as gr
 import cv2
 from google import genai
+from google.genai.types import Part, GenerateContentConfig
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 # Retrieve API key from environment variables.
 MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
+def call_gemini(video_file: str, prompt: str, config: GenerateContentConfig = None) -> str:
     """
     Call the Gemini model with the provided video file and prompt.
+    The video file is read as bytes and passed with MIME type "video/mp4".
+    Optionally accepts a config (e.g. response_schema) for structured output.
     """
     with open(video_file, "rb") as f:
         file_bytes = f.read()
             Part(file_data=file_bytes, mime_type="video/mp4"),
             Part(text=prompt)
         ],
+        config=config
     )
     return response.text
     parts = time_str.strip().split(":")
     parts = [float(p) for p in parts]
     if len(parts) == 3:
+        return parts[0] * 3600 + parts[1] * 60 + parts[2]
     elif len(parts) == 2:
+        return parts[0] * 60 + parts[1]
     else:
         return parts[0]
     Returns a list of tuples: (image_array, caption)
     """
+    # Define a response schema for key frames.
+    response_schema = {
+        "type": "ARRAY",
+        "items": {
+            "type": "OBJECT",
+            "properties": {
+                "timestamp": {"type": "string"},
+                "description": {"type": "string"}
+            },
+            "required": ["timestamp", "description"]
+        }
+    }
+    config = GenerateContentConfig(
+        temperature=0.0,
+        max_output_tokens=1024,
+        response_mime_type="application/json",
+        response_schema=response_schema
+    )
     prompt = (
+        "From the following video analysis, list key frames with their timestamps (in HH:MM:SS format) "
+        "and a brief description of the important event at that timestamp. "
+        "Return the result as a JSON array of objects with keys 'timestamp' and 'description'."
     )
     prompt += f" Video Analysis: {analysis}"
     if user_query:
         prompt += f" Additional focus: {user_query}"
     try:
+        key_frames_response = call_gemini(video_file, prompt, config=config)
         key_frames = json.loads(key_frames_response)
         if not isinstance(key_frames, list):
             key_frames = []
             seconds = hhmmss_to_seconds(ts)
         except Exception:
             continue
+        # Set video position (in milliseconds)
         cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
         ret, frame = cap.read()
         if ret: