Spaces:

codelion
/

videoanalysis

Sleeping

App Files Files Community

codelion committed on Apr 2

Commit

001b623

verified ·

1 Parent(s): 0ef8445

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -81

app.py CHANGED Viewed

@@ -1,14 +1,11 @@
 import os
 import json
-import tempfile
-import requests
 import gradio as gr
 import cv2
 from google import genai
 from google.genai import types
 from google.genai.types import Part
 from tenacity import retry, stop_after_attempt, wait_random_exponential
-import yt_dlp  # Use yt-dlp for robust YouTube downloading
 # Retrieve API key from environment variables.
 GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
@@ -22,15 +19,17 @@ client = genai.Client(api_key=GOOGLE_API_KEY)
 MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
-def call_gemini(video_url: str, prompt: str) -> str:
     """
-    Call the Gemini model with the provided video URL and prompt.
-    The video is passed as a URI part with MIME type "video/webm".
     """
     response = client.models.generate_content(
         model=MODEL_NAME,
         contents=[
-            Part.from_uri(file_uri=video_url, mime_type="video/webm"),
             prompt,
         ],
     )
@@ -49,42 +48,10 @@ def hhmmss_to_seconds(time_str: str) -> float:
     else:
         return parts[0]
-def download_video(video_url: str) -> str:
-    """
-    Download the video from a URL. If it's a YouTube URL, use yt-dlp;
-    otherwise, use requests for direct links.
-    Returns the local file path.
-    """
-    local_file = None
-    if "youtube.com" in video_url or "youtu.be" in video_url:
-        ydl_opts = {
-            'format': 'mp4',
-            'outtmpl': '%(id)s.%(ext)s',
-            'noplaylist': True,
-            'quiet': True,
-        }
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(video_url, download=True)
-            local_file = ydl.prepare_filename(info)
-    else:
-        # Assume it's a direct link to a video file.
-        response = requests.get(video_url, stream=True)
-        if response.status_code == 200:
-            temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
-            for chunk in response.iter_content(chunk_size=8192):
-                if chunk:
-                    temp_file.write(chunk)
-            temp_file.flush()
-            local_file = temp_file.name
-            temp_file.close()
-        else:
-            raise ValueError("Failed to download video, status code: " + str(response.status_code))
-    return local_file
-def get_key_frames(video_url: str, analysis: str, user_query: str) -> list:
     """
     Prompt Gemini to return key frame timestamps (in HH:MM:SS) with descriptions,
-    then extract those frames from the downloaded video file using OpenCV.
     Returns a list of tuples: (image_array, caption)
     """
@@ -99,7 +66,7 @@ def get_key_frames(video_url: str, analysis: str, user_query: str) -> list:
         prompt += f" Additional focus: {user_query}"
     try:
-        key_frames_response = call_gemini(video_url, prompt)
         # Attempt to parse the output as JSON.
         key_frames = json.loads(key_frames_response)
         if not isinstance(key_frames, list):
@@ -108,38 +75,32 @@ def get_key_frames(video_url: str, analysis: str, user_query: str) -> list:
         key_frames = []
     extracted_frames = []
-    local_path = None
-    try:
-        local_path = download_video(video_url)
-        cap = cv2.VideoCapture(local_path)
-        if not cap.isOpened():
-            print("Error: Could not open video from local file.")
-            return extracted_frames
-        for frame_obj in key_frames:
-            ts = frame_obj.get("timestamp")
-            description = frame_obj.get("description", "")
-            try:
-                seconds = hhmmss_to_seconds(ts)
-            except Exception:
-                continue
-            # Set video position (in milliseconds)
-            cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
-            ret, frame = cap.read()
-            if ret:
-                # Convert BGR to RGB
-                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-                caption = f"{ts}: {description}"
-                extracted_frames.append((frame_rgb, caption))
-        cap.release()
-    finally:
-        if local_path and os.path.exists(local_path):
-            os.remove(local_path)
     return extracted_frames
-def analyze_video(video_url: str, user_query: str) -> (str, list):
     """
-    Perform iterative, agentic video analysis.
     First, refine the video analysis over several iterations.
     Then, prompt the model to identify key frames.
@@ -170,7 +131,7 @@ def analyze_video(video_url: str, user_query: str) -> (str, list):
                 prompt += f" Remember to focus on: {user_query}"
         try:
-            analysis = call_gemini(video_url, prompt)
         except Exception as e:
             analysis += f"\n[Error during iteration {i+1}: {e}]"
             break
@@ -179,7 +140,7 @@ def analyze_video(video_url: str, user_query: str) -> (str, list):
     markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{analysis}\n"
     # Get key frames based on the analysis and optional query.
-    key_frames_gallery = get_key_frames(video_url, analysis, user_query)
     if not key_frames_gallery:
         markdown_report += "\n*No key frames were extracted.*\n"
     else:
@@ -189,19 +150,19 @@ def analyze_video(video_url: str, user_query: str) -> (str, list):
     return markdown_report, key_frames_gallery
-def gradio_interface(video_url: str, user_query: str) -> (str, list):
     """
-    Gradio interface function that accepts a video URL and an optional query,
     then returns a Markdown report and a gallery of key frame images with captions.
     """
-    if not video_url:
-        return "Please provide a valid video URL.", []
-    return analyze_video(video_url, user_query)
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.Textbox(label="Video URL (publicly accessible, e.g., YouTube link or direct video file URL)"),
         gr.Textbox(label="Analysis Query (optional): guide the focus of the analysis", placeholder="e.g., focus on unusual movements near the entrance")
     ],
     outputs=[
@@ -211,8 +172,9 @@ iface = gr.Interface(
     title="AI Video Analysis and Summariser Agent",
     description=(
         "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
-        "to iteratively analyze a video for security and surveillance insights. Provide a video URL and, optionally, "
-        "a query to guide the analysis. The tool returns a detailed Markdown report along with a gallery of key frame images."
     )
 )

 import os
 import json
 import gradio as gr
 import cv2
 from google import genai
 from google.genai import types
 from google.genai.types import Part
 from tenacity import retry, stop_after_attempt, wait_random_exponential
 # Retrieve API key from environment variables.
 GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
 MODEL_NAME = "gemini-2.0-flash-001"
 @retry(wait=wait_random_exponential(multiplier=1, max=60), stop=stop_after_attempt(3))
+def call_gemini(video_file: str, prompt: str) -> str:
     """
+    Call the Gemini model with the provided video file and prompt.
+    The video file is read as bytes and passed with MIME type "video/mp4".
     """
+    with open(video_file, "rb") as f:
+        file_bytes = f.read()
     response = client.models.generate_content(
         model=MODEL_NAME,
         contents=[
+            Part(file_data=file_bytes, mime_type="video/mp4"),
             prompt,
         ],
     )
     else:
         return parts[0]
+def get_key_frames(video_file: str, analysis: str, user_query: str) -> list:
     """
     Prompt Gemini to return key frame timestamps (in HH:MM:SS) with descriptions,
+    then extract those frames from the uploaded video file using OpenCV.
     Returns a list of tuples: (image_array, caption)
     """
         prompt += f" Additional focus: {user_query}"
     try:
+        key_frames_response = call_gemini(video_file, prompt)
         # Attempt to parse the output as JSON.
         key_frames = json.loads(key_frames_response)
         if not isinstance(key_frames, list):
         key_frames = []
     extracted_frames = []
+    cap = cv2.VideoCapture(video_file)
+    if not cap.isOpened():
+        print("Error: Could not open the uploaded video file.")
+        return extracted_frames
+    for frame_obj in key_frames:
+        ts = frame_obj.get("timestamp")
+        description = frame_obj.get("description", "")
+        try:
+            seconds = hhmmss_to_seconds(ts)
+        except Exception:
+            continue
+        # Set video position (in milliseconds)
+        cap.set(cv2.CAP_PROP_POS_MSEC, seconds * 1000)
+        ret, frame = cap.read()
+        if ret:
+            # Convert BGR to RGB for proper display
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            caption = f"{ts}: {description}"
+            extracted_frames.append((frame_rgb, caption))
+    cap.release()
     return extracted_frames
+def analyze_video(video_file: str, user_query: str) -> (str, list):
     """
+    Perform iterative, agentic video analysis on the uploaded file.
     First, refine the video analysis over several iterations.
     Then, prompt the model to identify key frames.
                 prompt += f" Remember to focus on: {user_query}"
         try:
+            analysis = call_gemini(video_file, prompt)
         except Exception as e:
             analysis += f"\n[Error during iteration {i+1}: {e}]"
             break
     markdown_report = f"## Video Analysis Report\n\n**Summary:**\n\n{analysis}\n"
     # Get key frames based on the analysis and optional query.
+    key_frames_gallery = get_key_frames(video_file, analysis, user_query)
     if not key_frames_gallery:
         markdown_report += "\n*No key frames were extracted.*\n"
     else:
     return markdown_report, key_frames_gallery
+def gradio_interface(video_file, user_query: str) -> (str, list):
     """
+    Gradio interface function that accepts an uploaded video file and an optional query,
     then returns a Markdown report and a gallery of key frame images with captions.
     """
+    if not video_file:
+        return "Please upload a valid video file.", []
+    return analyze_video(video_file, user_query)
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
+        gr.Video(label="Upload Video File", source="upload", type="filepath"),
         gr.Textbox(label="Analysis Query (optional): guide the focus of the analysis", placeholder="e.g., focus on unusual movements near the entrance")
     ],
     outputs=[
     title="AI Video Analysis and Summariser Agent",
     description=(
         "This agentic video analysis tool uses Google's Gemini 2.0 Flash model via AI Studio "
+        "to iteratively analyze an uploaded video for security and surveillance insights. "
+        "Provide a video file and, optionally, a query to guide the analysis. The tool returns a detailed "
+        "Markdown report along with a gallery of key frame images."
     )
 )