Update app.py
app.py (CHANGED)
@@ -30,7 +30,7 @@ def call_gemini(video_file: str, prompt: str) -> str:
         model=MODEL_NAME,
         contents=[
             Part(file_data=file_bytes, mime_type="video/mp4"),
-            prompt
+            prompt
         ]
     )
     return response.text
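For context, a minimal sketch of this kind of Gemini call with inline video bytes, assuming the google-genai SDK; the API key handling, model name, and file path below are illustrative placeholders, not taken from this repo:

import pathlib

from google import genai
from google.genai import types

client = genai.Client(api_key="YOUR_API_KEY")  # assumed: an AI Studio API key

video_bytes = pathlib.Path("sample.mp4").read_bytes()  # hypothetical local file
response = client.models.generate_content(
    model="gemini-2.0-flash",  # assumed model name
    contents=[
        types.Part.from_bytes(data=video_bytes, mime_type="video/mp4"),
        "Give a detailed summary of the video.",
    ],
)
print(response.text)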
@@ -50,26 +50,23 @@ def hhmmss_to_seconds(time_str: str) -> float:
 
 def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
     """
-    …
-    The …
-    …
-    …
-    We then parse these lines and extract the corresponding frames from the video.
+    Ask Gemini to list key timestamps and descriptions for the video.
+    The model is instructed to output one line per event in the format:
+        HH:MM:SS - description
+    We then parse these lines and extract the corresponding frames using OpenCV.
 
     Returns a list of tuples: (image_array, caption)
     """
     prompt = (
-        "…
-        "…
-        "in the following format: HH:MM:SS - description. Do not include any extra text."
+        "List the key timestamps in the video and a brief description of the important event at that time. "
+        "Output one line per event in the following format: HH:MM:SS - description. Do not include any extra text."
     )
-    prompt += f" Video …
+    prompt += f" Video Summary: {analysis}"
     if user_query:
         prompt += f" Additional focus: {user_query}"
 
     try:
         key_frames_response = call_gemini(video_file, prompt)
-        # Parse plain text output: each line should be "HH:MM:SS - description"
         lines = key_frames_response.strip().split("\n")
         key_frames = []
         for line in lines:
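The line-by-line parsing that follows this prompt is not fully shown in the hunk; a minimal, self-contained sketch of one way to turn "HH:MM:SS - description" lines into seconds plus captions (the regex, helper body, and sample text are illustrative assumptions):

import re

def hhmmss_to_seconds(time_str: str) -> float:
    # "01:02:03" -> 3723.0
    hours, minutes, seconds = (int(part) for part in time_str.split(":"))
    return float(hours * 3600 + minutes * 60 + seconds)

sample_response = "00:00:05 - person enters the frame\n00:01:10 - a door closes"
for line in sample_response.strip().split("\n"):
    match = re.match(r"^(\d{2}:\d{2}:\d{2})\s*-\s*(.+)$", line.strip())
    if not match:
        continue  # skip lines that do not follow "HH:MM:SS - description"
    ts, description = match.groups()
    print(hhmmss_to_seconds(ts), description)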
@@ -95,11 +92,9 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
             seconds = hhmmss_to_seconds(ts)
         except Exception:
             continue
-        # Set video position (in milliseconds)
         cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
         ret, frame = cap.read()
         if ret:
-            # Convert BGR to RGB for proper display
             frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
             caption = f"{ts}: {description}"
             extracted_frames.append((frame_rgb, caption))
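A minimal standalone sketch of the OpenCV seek-and-grab used above; the video path and timestamp are placeholders:

import cv2

cap = cv2.VideoCapture("sample.mp4")  # hypothetical input file
seconds = 5.0
cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)  # seek to the timestamp, in milliseconds
ret, frame = cap.read()
if ret:
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes frames as BGR
    print(frame_rgb.shape)
cap.release()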
@@ -108,33 +103,28 @@ def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
 
 def analyze_video(video_file: str, user_query: str) -> (str, list):
     """
-    Perform iterative …
-    …
-    Then, prompt the model to provide key timestamp information.
+    Perform iterative video analysis on the uploaded file.
+    Iteratively refine the summary with simpler prompts, then ask for key timestamps.
 
     Returns:
-    - A Markdown report …
+    - A Markdown report (string) summarizing the video.
     - A gallery list of key frames (each as a tuple of (image, caption)).
     """
     analysis = ""
     num_iterations = 3
 
     for i in range(num_iterations):
-        …
-        "…
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        )
-        if user_query and i > 0:
-            prompt += f" Remember to focus on: {user_query}"
-
+        if i == 0:
+            prompt = "Give a detailed summary of the video."
+            if user_query:
+                prompt += f" Also focus on: {user_query}"
+        elif i == 1:
+            prompt = f"Based on the summary: \"{analysis}\", provide additional details about important events and anomalies in the video."
+            if user_query:
+                prompt += f" Also focus on: {user_query}"
+        else:
+            prompt = f"Refine and consolidate the analysis: \"{analysis}\" into a final summary."
+
         try:
             analysis = call_gemini(video_file, prompt)
         except Exception as e:
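The three-pass refinement added above can be exercised in isolation with a stubbed model call; call_model below is a stand-in for the real call_gemini, and the printed output is illustrative only:

def call_model(prompt: str) -> str:
    # Stand-in for the real Gemini call so the loop can run offline.
    return f"[model output for: {prompt[:60]}...]"

def iterative_analysis(user_query: str = "", num_iterations: int = 3) -> str:
    analysis = ""
    for i in range(num_iterations):
        if i == 0:
            prompt = "Give a detailed summary of the video."
        elif i == 1:
            prompt = f'Based on the summary: "{analysis}", provide additional details about important events and anomalies in the video.'
        else:
            prompt = f'Refine and consolidate the analysis: "{analysis}" into a final summary.'
        if user_query and i < 2:
            # Mirrors the diff: the user query is appended on the first two passes only.
            prompt += f" Also focus on: {user_query}"
        analysis = call_model(prompt)
    return analysis

print(iterative_analysis("watch for anomalies"))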
@@ -155,7 +145,7 @@ def analyze_video(video_file: str, user_query: str) -> (str, list):
 def gradio_interface(video_file, user_query: str) -> (str, list):
     """
     Gradio interface function that accepts an uploaded video file and an optional query,
-    then returns a Markdown report and a gallery of key …
+    then returns a Markdown report and a gallery of extracted key frames with captions.
     """
     if not video_file:
         return "Please upload a valid video file.", []
@@ -174,9 +164,8 @@ iface = gr.Interface(
     title="AI Video Analysis and Summariser Agent",
     description=(
         "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
-        "to iteratively analyze an uploaded video for …
-        "…
-        "Markdown report along with a gallery of key frame images."
+        "to iteratively analyze an uploaded video for insights. Provide a video file and, optionally, "
+        "a query to guide the analysis. The tool returns a Markdown report along with a gallery of key frame images."
     )
 )
 
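The hunk above only touches the interface description; a minimal sketch of how such a gr.Interface is typically wired, where the input/output components and the placeholder function are assumptions about the rest of the file rather than lines shown in this diff:

import gradio as gr

def analyze(video_file, user_query):
    # Placeholder for the real analysis pipeline: returns a Markdown report and a gallery list.
    if not video_file:
        return "Please upload a valid video file.", []
    return "## Demo report", []

iface = gr.Interface(
    fn=analyze,
    inputs=[gr.Video(label="Video"), gr.Textbox(label="Optional query")],
    outputs=[gr.Markdown(label="Report"), gr.Gallery(label="Key frames")],
    title="AI Video Analysis and Summariser Agent",
)

if __name__ == "__main__":
    iface.launch()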