PierreBrunelle committed on
Commit
06e7491
Β·
verified Β·
1 Parent(s): a2e8547

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +396 -0
app.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pixeltable as pxt
3
+ from pixeltable.iterators import FrameIterator
4
+ from datetime import datetime
5
+ import PIL.Image
6
+ from pixeltable.functions import openai, image
7
+ import os
8
+ import getpass
9
+ import requests
10
+ import tempfile
11
+ import json
12
+ import math
13
+ from typing import Dict, Optional
14
+
15
# Constants
# Upload ceiling checked in process_video before any frame extraction begins.
MAX_VIDEO_SIZE_MB = 35
# Upper bound on frames sent to GPT-4 Vision per request (bounds token cost).
MAX_FRAMES = 5
18
+
19
# Prompt templates
# Maps a template key (the value shown in the "Analysis Style" dropdown) to:
#   name          - human-readable label
#   system_prompt - system message sent to the chat-completions API
#   description   - short blurb describing what the style produces
PROMPT_TEMPLATES = {
    "descriptive": {
        "name": "Descriptive Analysis",
        "system_prompt": """You are a video content analyzer. Please generate a short and concise compelling description
        that summarizes the overall action and content of this video sequence. Focus on describing
        the key events, changes, and movements you observe across all frames.""",
        "description": "Generates a clear, factual description of the video content"
    },
    "cinematic": {
        "name": "Cinematic Analysis (Christopher Nolan style)",
        "system_prompt": """You are Christopher Nolan, the acclaimed filmmaker. Describe this visual sequence
        as one continuous, flowing narrative moment, as you would when discussing a pivotal
        scene from one of your films. Focus on psychological undercurrents, visual symbolism,
        and the deeper thematic implications of what unfolds.""",
        "description": "Analyzes the video from a filmmaker's perspective with artistic interpretation"
    },
    "documentary": {
        "name": "Documentary Style (David Attenborough)",
        "system_prompt": """You are David Attenborough, the renowned naturalist and documentarian. Narrate this sequence
        with your characteristic blend of scientific insight and storytelling prowess. Focus on the
        compelling details that bring the subject matter to life, while maintaining your signature
        warm, authoritative tone.""",
        "description": "Creates a nature documentary style narration"
    },
    "technical": {
        "name": "Technical Analysis",
        "system_prompt": """You are a technical video analyst. Break down this sequence with precise attention to
        technical details including movement patterns, visual composition, lighting conditions,
        and any notable technical aspects of the footage.""",
        "description": "Provides detailed technical analysis of the video"
    },
    "labelling": {
        "name": "Labelling and Annotation",
        "system_prompt": """You are a high-precision video labeling system designed to replace human labelers.
        Analyze this sequence with extreme attention to detail, focusing on:
        1. Object identification and tracking
        2. Precise descriptions of movements and actions
        3. Spatial relationships between objects
        4. Changes in object positions and behaviors
        Your goal is to provide detailed, accurate annotations that could be used for
        training computer vision models or validating automated systems.""",
        "description": "Provides detailed object and action annotations for machine learning purposes"
    }
}
64
+
65
# Voice options
# OpenAI TTS voice id -> display label shown in the "Voice Selection" dropdown.
VOICE_OPTIONS = dict(
    alloy="Alloy (Balanced)",
    echo="Echo (Smooth)",
    fable="Fable (Expressive)",
    onyx="Onyx (Authoritative)",
    nova="Nova (Friendly)",
    shimmer="Shimmer (Warm)",
)
74
+
75
def process_video(video_file: gr.Video, api_key: str, prompt_template: str, voice_choice: str, progress: Optional[gr.Progress] = None) -> tuple[str, Optional[str]]:
    """Analyze a video with GPT-4 Vision and synthesize a TTS voiceover.

    Creates an isolated Pixeltable directory per request, extracts frames at
    1 fps, sends up to MAX_FRAMES evenly spaced frames to the chat-completions
    API with the selected prompt template, then narrates the result with
    OpenAI's TTS endpoint.

    Args:
        video_file: Uploaded video (Gradio value; file path or object with .name).
        api_key: OpenAI API key, exported to the environment for downstream calls.
        prompt_template: Key into PROMPT_TEMPLATES selecting the analysis style.
        voice_choice: Key into VOICE_OPTIONS selecting the TTS voice.
        progress: Optional Gradio progress callback.

    Returns:
        (generated_text, audio_file_path). On failure the first element is an
        error message and the second is None.
    """
    try:
        if not video_file or not api_key:
            return "Please provide both video file and API key.", None

        # Make the key visible to pixeltable's openai functions and to the
        # TTS UDF defined below.
        os.environ['OPENAI_API_KEY'] = api_key

        video_path = video_file.name if hasattr(video_file, 'name') else str(video_file)

        # Reject oversized uploads before doing any expensive work.
        file_size = os.path.getsize(video_path) / (1024 * 1024)
        if file_size > MAX_VIDEO_SIZE_MB:
            return f"Error: Video file size ({file_size:.1f}MB) exceeds limit of {MAX_VIDEO_SIZE_MB}MB", None

        if progress:
            progress(0.1, desc="Initializing...")

        # Unique per-request directory so concurrent sessions don't collide.
        session_id = datetime.now().strftime('%Y%m%d_%H%M%S')
        dir_name = f'video_processor_{session_id}'

        # Initialize Pixeltable workspace for this session.
        pxt.drop_dir(dir_name, force=True)
        pxt.create_dir(dir_name)

        # From here on the directory exists; the finally-block below
        # guarantees cleanup even if any later step raises (the original
        # implementation leaked the directory on error).
        try:
            # Create main video table.
            video_table = pxt.create_table(
                f'{dir_name}.videos',
                {
                    "video": pxt.VideoType(nullable=True),
                    "timestamp": pxt.TimestampType(),
                }
            )

            # Frames view: one row per extracted frame (1 frame per second).
            frames_view = pxt.create_view(
                f'{dir_name}.frames',
                video_table,
                iterator=FrameIterator.create(video=video_table.video, fps=1)
            )

            # Computed column: base64-encoded frame for the vision API.
            frames_view['encoded_frame'] = image.b64_encode(frames_view.frame)

            if progress:
                progress(0.2, desc="Processing video...")

            # Insert the video; frame extraction runs as part of the view.
            video_table.insert([{
                "video": video_path,
                "timestamp": datetime.now(),
            }])

            if progress:
                progress(0.4, desc="Extracting frames...")

            # Collect the encoded frames into plain Python strings.
            frames = frames_view.select(frames_view.encoded_frame).collect()
            frame_list = [f["encoded_frame"] for f in frames]

            def select_representative_frames(frames: list, num_frames: int = MAX_FRAMES) -> list:
                # Evenly sample num_frames frames across the whole video.
                total_frames = len(frames)
                if total_frames <= num_frames:
                    return frames

                interval = total_frames / num_frames
                selected_indices = [math.floor(i * interval) for i in range(num_frames)]
                return [frames[i] for i in selected_indices]

            selected_frames = select_representative_frames(frame_list)

            if progress:
                progress(0.6, desc="Analyzing with GPT-4 Vision...")

            def create_frame_content(frames: list) -> list:
                # Build the multimodal user-message content: an intro text
                # part followed by alternating "Frame N:" labels and images.
                content = [
                    {
                        "type": "text",
                        "text": "This is a sequence of frames from a video. Please analyze the overall action and content across all frames:"
                    }
                ]

                for i, frame in enumerate(frames, 1):
                    content.extend([
                        {
                            "type": "text",
                            "text": f"Frame {i}:"
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{frame}"
                            }
                        }
                    ])

                return content

            # Assemble the chat request from the chosen template.
            frame_content = create_frame_content(selected_frames)
            template = PROMPT_TEMPLATES[prompt_template]

            messages = [
                {
                    'role': 'system',
                    'content': template["system_prompt"]
                },
                {
                    'role': 'user',
                    'content': frame_content
                }
            ]

            # Computed column holding the raw chat-completions response.
            video_table['response'] = openai.chat_completions(
                messages=messages,
                model='gpt-4o',
                max_tokens=500
            )

            # Extract the generated text from the response payload.
            video_table['content'] = video_table.response.choices[0].message.content.astype(pxt.StringType())

            if progress:
                progress(0.8, desc="Generating audio...")

            # UDF: synthesize a voiceover MP3 via OpenAI TTS; returns the
            # temp-file path, or None on failure (best-effort by design).
            @pxt.udf
            def generate_voiceover(script: str, voice: str) -> str:
                try:
                    response = requests.post(
                        "https://api.openai.com/v1/audio/speech",
                        headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
                        json={
                            "model": "tts-1",
                            "input": script,
                            "voice": voice,
                        }
                    )
                    if response.status_code != 200:
                        raise Exception(f"TTS API error: {response.status_code} - {response.text}")

                    # Create temp file in system temp directory
                    temp_dir = tempfile.gettempdir()
                    temp_audio_path = os.path.join(temp_dir, f"voiceover_{session_id}.mp3")

                    with open(temp_audio_path, 'wb') as f:
                        f.write(response.content)

                    return temp_audio_path
                except Exception as e:
                    print(f"Error generating audio: {e}")
                    return None

            # Generate audio and materialize the final row.
            video_table['audio_path'] = generate_voiceover(video_table.content, voice_choice)
            results = video_table.select(
                video_table.content,
                video_table.audio_path
            ).tail(1)

            if progress:
                progress(1.0, desc="Processing complete!")

            return (
                results['content'][0],    # Generated text content
                results['audio_path'][0]  # Audio file path
            )
        finally:
            # Always clean up the per-session directory, success or failure.
            try:
                pxt.drop_dir(dir_name, force=True)
            except Exception as e:
                print(f"Warning: Could not clean up directory {dir_name}: {e}")

    except Exception as e:
        print(f"Error processing video: {e}")
        return f"Error processing video: {str(e)}", None
252
+
253
# Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the video analyzer.

    Lays out the header, cost/usage disclaimer, configuration inputs
    (API key, video upload, analysis style, TTS voice), the results tabs,
    and wires the "Process Video" button to process_video.

    Returns:
        gr.Blocks: the assembled (not yet launched) demo.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header
        gr.Markdown(
            """
            <div style="text-align: left; margin-bottom: 2rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 1rem;" />
                <h1>🎥 AI Video Analyzer: Custom GPT-4 Analysis & TTS Narration</h1>
                <p>Convert videos into rich narratives with 5 analysis styles - from Christopher Nolan-style cinematic breakdowns to David Attenborough documentary narrations.</p>
            </div>
            """
        )

        # Disclaimer with Whisper reference
        gr.HTML(
            """
            <div style="background-color: #FFF3CD; border: 1px solid #FF7D04; padding: 1rem; margin: 1rem 0; border-radius: 4px;">
                <p style="margin: 0; color: #013056;">
                    ⚠️ <strong>Notice:</strong> This application requires an OpenAI API key and uses the following services:
                    <ul style="margin-top: 0.5rem;">
                        <li>GPT-4 Vision API for video analysis</li>
                        <li>TTS API for audio generation</li>
                    </ul>
                    Please be aware of associated API costs. For pricing information, visit
                    <a href="https://openai.com/pricing" target="_blank" style="color: #856404; text-decoration: underline;">OpenAI's pricing page</a>.
                    <br><br>
                    This application does not process audio/transcripts. If you need audio transcription and analysis, check out our
                    <a href="https://huggingface.co/spaces/Pixeltable/Call-Analysis-AI-Tool" target="_blank" style="color: #856404; text-decoration: underline;">
                    Call Analysis AI Tool</a> which uses Whisper for audio processing.
                </p>
            </div>
            """
        )

        # Information sections side by side
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown("""
                    - 🎥 Analyze video content using GPT-4 Vision
                    - 📝 Generate detailed descriptions and narrations
                    - 🎧 Create professional voiceovers using OpenAI's TTS
                    - 🔄 Process up to 5 key frames from your video
                    """)

            with gr.Column():
                with gr.Accordion("How to use", open=True):
                    # Fixed step numbering: the list previously jumped 3 -> 5.
                    gr.Markdown("""
                    1. Enter your OpenAI API key
                    2. Upload a video file (max 35MB)
                    3. Choose your preferred analysis style and voice
                    4. Click "Process Video" and wait for results
                    """)

        # Main interface
        with gr.Row():
            with gr.Column():
                # Configuration controls - side by side
                with gr.Row():
                    with gr.Column(scale=1):
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password"
                        )

                # Video upload below configuration
                video_input = gr.Video(
                    label=f"Upload Video (max {MAX_VIDEO_SIZE_MB}MB)",
                    interactive=True
                )

                process_btn = gr.Button("🎬 Process Video", variant="primary")

            # Results column
            with gr.Column():

                prompt_template = gr.Dropdown(
                    choices=list(PROMPT_TEMPLATES.keys()),
                    value="descriptive",
                    label="Analysis Style",
                    info="Choose analysis style"
                )

                voice_choice = gr.Dropdown(
                    choices=list(VOICE_OPTIONS.keys()),
                    value="onyx",
                    label="Voice Selection",
                    info="Select the voice for your narration"
                )

                with gr.Tabs():
                    with gr.TabItem("📝 Analysis"):
                        content_output = gr.Textbox(
                            label="Generated Content",
                            lines=10
                        )

                    with gr.TabItem("🎧 Audio"):
                        audio_output = gr.Audio(
                            label="Generated Voiceover",
                            type="filepath"
                        )

        # Footer
        gr.HTML(
            """
            <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e5e7eb;">
                <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🚀 Built with Pixeltable</h4>
                        <p style="margin: 0.5rem 0; color: #6b7280;">
                            Open Source AI infrastructure for intelligent applications
                        </p>
                    </div>
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🔗 Resources</h4>
                        <div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
                            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                GitHub
                            </a>
                            <a href="https://docs.pixeltable.com" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                Documentation
                            </a>
                        </div>
                    </div>
                </div>
            </div>
            """
        )

        # Connect the process button
        process_btn.click(
            fn=process_video,
            inputs=[video_input, api_key, prompt_template, voice_choice],
            outputs=[content_output, audio_output]
        )

    return demo
393
+
394
if __name__ == "__main__":
    # Build the Gradio UI and serve it with default launch settings.
    create_interface().launch()