PierreBrunelle committed on
Commit
06e7491
Β·
verified Β·
1 Parent(s): a2e8547

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +396 -0
app.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pixeltable as pxt
3
+ from pixeltable.iterators import FrameIterator
4
+ from datetime import datetime
5
+ import PIL.Image
6
+ from pixeltable.functions import openai, image
7
+ import os
8
+ import getpass
9
+ import requests
10
+ import tempfile
11
+ import json
12
+ import math
13
+ from typing import Dict, Optional
14
+
15
# Constants
# Upload ceiling checked in process_video before any frame extraction begins.
MAX_VIDEO_SIZE_MB = 35
# Upper bound on frames sent to GPT-4 Vision per request (bounds token cost).
MAX_FRAMES = 5
18
+
19
# Prompt templates
# Maps a template key (the value shown in the "Analysis Style" dropdown) to:
#   name          - human-readable label
#   system_prompt - system message sent to the chat-completions API
#   description   - short blurb describing what the style produces
PROMPT_TEMPLATES = {
    "descriptive": {
        "name": "Descriptive Analysis",
        "system_prompt": """You are a video content analyzer. Please generate a short and concise compelling description
        that summarizes the overall action and content of this video sequence. Focus on describing
        the key events, changes, and movements you observe across all frames.""",
        "description": "Generates a clear, factual description of the video content"
    },
    "cinematic": {
        "name": "Cinematic Analysis (Christopher Nolan style)",
        "system_prompt": """You are Christopher Nolan, the acclaimed filmmaker. Describe this visual sequence
        as one continuous, flowing narrative moment, as you would when discussing a pivotal
        scene from one of your films. Focus on psychological undercurrents, visual symbolism,
        and the deeper thematic implications of what unfolds.""",
        "description": "Analyzes the video from a filmmaker's perspective with artistic interpretation"
    },
    "documentary": {
        "name": "Documentary Style (David Attenborough)",
        "system_prompt": """You are David Attenborough, the renowned naturalist and documentarian. Narrate this sequence
        with your characteristic blend of scientific insight and storytelling prowess. Focus on the
        compelling details that bring the subject matter to life, while maintaining your signature
        warm, authoritative tone.""",
        "description": "Creates a nature documentary style narration"
    },
    "technical": {
        "name": "Technical Analysis",
        "system_prompt": """You are a technical video analyst. Break down this sequence with precise attention to
        technical details including movement patterns, visual composition, lighting conditions,
        and any notable technical aspects of the footage.""",
        "description": "Provides detailed technical analysis of the video"
    },
    "labelling": {
        "name": "Labelling and Annotation",
        "system_prompt": """You are a high-precision video labeling system designed to replace human labelers.
        Analyze this sequence with extreme attention to detail, focusing on:
        1. Object identification and tracking
        2. Precise descriptions of movements and actions
        3. Spatial relationships between objects
        4. Changes in object positions and behaviors
        Your goal is to provide detailed, accurate annotations that could be used for
        training computer vision models or validating automated systems.""",
        "description": "Provides detailed object and action annotations for machine learning purposes"
    }
}
64
+
65
# Voice options
# OpenAI TTS voice id -> display label shown in the "Voice Selection" dropdown.
VOICE_OPTIONS = dict(
    alloy="Alloy (Balanced)",
    echo="Echo (Smooth)",
    fable="Fable (Expressive)",
    onyx="Onyx (Authoritative)",
    nova="Nova (Friendly)",
    shimmer="Shimmer (Warm)",
)
74
+
75
def process_video(video_file: gr.Video, api_key: str, prompt_template: str, voice_choice: str, progress: Optional[gr.Progress] = None) -> tuple[str, Optional[str]]:
    """Analyze a video with GPT-4 Vision and synthesize a TTS voiceover.

    Creates an isolated Pixeltable directory per request, extracts frames at
    1 fps, sends up to MAX_FRAMES evenly spaced frames to the chat-completions
    API with the selected prompt template, then narrates the result with
    OpenAI's TTS endpoint.

    Args:
        video_file: Uploaded video (Gradio value; file path or object with .name).
        api_key: OpenAI API key, exported to the environment for downstream calls.
        prompt_template: Key into PROMPT_TEMPLATES selecting the analysis style.
        voice_choice: Key into VOICE_OPTIONS selecting the TTS voice.
        progress: Optional Gradio progress callback.

    Returns:
        (generated_text, audio_file_path). On failure the first element is an
        error message and the second is None.
    """
    try:
        if not video_file or not api_key:
            return "Please provide both video file and API key.", None

        # Make the key visible to pixeltable's openai functions and to the
        # TTS UDF defined below.
        os.environ['OPENAI_API_KEY'] = api_key

        video_path = video_file.name if hasattr(video_file, 'name') else str(video_file)

        # Reject oversized uploads before doing any expensive work.
        file_size = os.path.getsize(video_path) / (1024 * 1024)
        if file_size > MAX_VIDEO_SIZE_MB:
            return f"Error: Video file size ({file_size:.1f}MB) exceeds limit of {MAX_VIDEO_SIZE_MB}MB", None

        if progress:
            progress(0.1, desc="Initializing...")

        # Unique per-request directory so concurrent sessions don't collide.
        session_id = datetime.now().strftime('%Y%m%d_%H%M%S')
        dir_name = f'video_processor_{session_id}'

        # Initialize Pixeltable workspace for this session.
        pxt.drop_dir(dir_name, force=True)
        pxt.create_dir(dir_name)

        # From here on the directory exists; the finally-block below
        # guarantees cleanup even if any later step raises (the original
        # implementation leaked the directory on error).
        try:
            # Create main video table.
            video_table = pxt.create_table(
                f'{dir_name}.videos',
                {
                    "video": pxt.VideoType(nullable=True),
                    "timestamp": pxt.TimestampType(),
                }
            )

            # Frames view: one row per extracted frame (1 frame per second).
            frames_view = pxt.create_view(
                f'{dir_name}.frames',
                video_table,
                iterator=FrameIterator.create(video=video_table.video, fps=1)
            )

            # Computed column: base64-encoded frame for the vision API.
            frames_view['encoded_frame'] = image.b64_encode(frames_view.frame)

            if progress:
                progress(0.2, desc="Processing video...")

            # Insert the video; frame extraction runs as part of the view.
            video_table.insert([{
                "video": video_path,
                "timestamp": datetime.now(),
            }])

            if progress:
                progress(0.4, desc="Extracting frames...")

            # Collect the encoded frames into plain Python strings.
            frames = frames_view.select(frames_view.encoded_frame).collect()
            frame_list = [f["encoded_frame"] for f in frames]

            def select_representative_frames(frames: list, num_frames: int = MAX_FRAMES) -> list:
                # Evenly sample num_frames frames across the whole video.
                total_frames = len(frames)
                if total_frames <= num_frames:
                    return frames

                interval = total_frames / num_frames
                selected_indices = [math.floor(i * interval) for i in range(num_frames)]
                return [frames[i] for i in selected_indices]

            selected_frames = select_representative_frames(frame_list)

            if progress:
                progress(0.6, desc="Analyzing with GPT-4 Vision...")

            def create_frame_content(frames: list) -> list:
                # Build the multimodal user-message content: an intro text
                # part followed by alternating "Frame N:" labels and images.
                content = [
                    {
                        "type": "text",
                        "text": "This is a sequence of frames from a video. Please analyze the overall action and content across all frames:"
                    }
                ]

                for i, frame in enumerate(frames, 1):
                    content.extend([
                        {
                            "type": "text",
                            "text": f"Frame {i}:"
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{frame}"
                            }
                        }
                    ])

                return content

            # Assemble the chat request from the chosen template.
            frame_content = create_frame_content(selected_frames)
            template = PROMPT_TEMPLATES[prompt_template]

            messages = [
                {
                    'role': 'system',
                    'content': template["system_prompt"]
                },
                {
                    'role': 'user',
                    'content': frame_content
                }
            ]

            # Computed column holding the raw chat-completions response.
            video_table['response'] = openai.chat_completions(
                messages=messages,
                model='gpt-4o',
                max_tokens=500
            )

            # Extract the generated text from the response payload.
            video_table['content'] = video_table.response.choices[0].message.content.astype(pxt.StringType())

            if progress:
                progress(0.8, desc="Generating audio...")

            # UDF: synthesize a voiceover MP3 via OpenAI TTS; returns the
            # temp-file path, or None on failure (best-effort by design).
            @pxt.udf
            def generate_voiceover(script: str, voice: str) -> str:
                try:
                    response = requests.post(
                        "https://api.openai.com/v1/audio/speech",
                        headers={"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"},
                        json={
                            "model": "tts-1",
                            "input": script,
                            "voice": voice,
                        }
                    )
                    if response.status_code != 200:
                        raise Exception(f"TTS API error: {response.status_code} - {response.text}")

                    # Create temp file in system temp directory
                    temp_dir = tempfile.gettempdir()
                    temp_audio_path = os.path.join(temp_dir, f"voiceover_{session_id}.mp3")

                    with open(temp_audio_path, 'wb') as f:
                        f.write(response.content)

                    return temp_audio_path
                except Exception as e:
                    print(f"Error generating audio: {e}")
                    return None

            # Generate audio and materialize the final row.
            video_table['audio_path'] = generate_voiceover(video_table.content, voice_choice)
            results = video_table.select(
                video_table.content,
                video_table.audio_path
            ).tail(1)

            if progress:
                progress(1.0, desc="Processing complete!")

            return (
                results['content'][0],    # Generated text content
                results['audio_path'][0]  # Audio file path
            )
        finally:
            # Always clean up the per-session directory, success or failure.
            try:
                pxt.drop_dir(dir_name, force=True)
            except Exception as e:
                print(f"Warning: Could not clean up directory {dir_name}: {e}")

    except Exception as e:
        print(f"Error processing video: {e}")
        return f"Error processing video: {str(e)}", None
252
+
253
# Gradio interface
def create_interface():
    """Build and return the Gradio Blocks UI for the video analyzer.

    Lays out the header, cost/usage disclaimer, configuration inputs
    (API key, video upload, analysis style, TTS voice), the results tabs,
    and wires the "Process Video" button to process_video.

    Returns:
        gr.Blocks: the assembled (not yet launched) demo.
    """
    with gr.Blocks(theme=gr.themes.Base()) as demo:
        # Header
        gr.Markdown(
            """
            <div style="text-align: left; margin-bottom: 2rem;">
                <img src="https://raw.githubusercontent.com/pixeltable/pixeltable/main/docs/source/data/pixeltable-logo-large.png" alt="Pixeltable" style="max-width: 200px; margin-bottom: 1rem;" />
                <h1>🎥 AI Video Analyzer: Custom GPT-4 Analysis & TTS Narration</h1>
                <p>Convert videos into rich narratives with 5 analysis styles - from Christopher Nolan-style cinematic breakdowns to David Attenborough documentary narrations.</p>
            </div>
            """
        )

        # Disclaimer with Whisper reference
        gr.HTML(
            """
            <div style="background-color: #FFF3CD; border: 1px solid #FF7D04; padding: 1rem; margin: 1rem 0; border-radius: 4px;">
                <p style="margin: 0; color: #013056;">
                    ⚠️ <strong>Notice:</strong> This application requires an OpenAI API key and uses the following services:
                    <ul style="margin-top: 0.5rem;">
                        <li>GPT-4 Vision API for video analysis</li>
                        <li>TTS API for audio generation</li>
                    </ul>
                    Please be aware of associated API costs. For pricing information, visit
                    <a href="https://openai.com/pricing" target="_blank" style="color: #856404; text-decoration: underline;">OpenAI's pricing page</a>.
                    <br><br>
                    This application does not process audio/transcripts. If you need audio transcription and analysis, check out our
                    <a href="https://huggingface.co/spaces/Pixeltable/Call-Analysis-AI-Tool" target="_blank" style="color: #856404; text-decoration: underline;">
                    Call Analysis AI Tool</a> which uses Whisper for audio processing.
                </p>
            </div>
            """
        )

        # Information sections side by side
        with gr.Row():
            with gr.Column():
                with gr.Accordion("What does it do?", open=True):
                    gr.Markdown("""
                    - 🎥 Analyze video content using GPT-4 Vision
                    - 📝 Generate detailed descriptions and narrations
                    - 🎧 Create professional voiceovers using OpenAI's TTS
                    - 🔄 Process up to 5 key frames from your video
                    """)

            with gr.Column():
                with gr.Accordion("How to use", open=True):
                    # Fixed step numbering: the list previously jumped 3 -> 5.
                    gr.Markdown("""
                    1. Enter your OpenAI API key
                    2. Upload a video file (max 35MB)
                    3. Choose your preferred analysis style and voice
                    4. Click "Process Video" and wait for results
                    """)

        # Main interface
        with gr.Row():
            with gr.Column():
                # Configuration controls - side by side
                with gr.Row():
                    with gr.Column(scale=1):
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password"
                        )

                # Video upload below configuration
                video_input = gr.Video(
                    label=f"Upload Video (max {MAX_VIDEO_SIZE_MB}MB)",
                    interactive=True
                )

                process_btn = gr.Button("🎬 Process Video", variant="primary")

            # Results column
            with gr.Column():

                prompt_template = gr.Dropdown(
                    choices=list(PROMPT_TEMPLATES.keys()),
                    value="descriptive",
                    label="Analysis Style",
                    info="Choose analysis style"
                )

                voice_choice = gr.Dropdown(
                    choices=list(VOICE_OPTIONS.keys()),
                    value="onyx",
                    label="Voice Selection",
                    info="Select the voice for your narration"
                )

                with gr.Tabs():
                    with gr.TabItem("📝 Analysis"):
                        content_output = gr.Textbox(
                            label="Generated Content",
                            lines=10
                        )

                    with gr.TabItem("🎧 Audio"):
                        audio_output = gr.Audio(
                            label="Generated Voiceover",
                            type="filepath"
                        )

        # Footer
        gr.HTML(
            """
            <div style="margin-top: 2rem; padding-top: 1rem; border-top: 1px solid #e5e7eb;">
                <div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap; gap: 1rem;">
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🚀 Built with Pixeltable</h4>
                        <p style="margin: 0.5rem 0; color: #6b7280;">
                            Open Source AI infrastructure for intelligent applications
                        </p>
                    </div>
                    <div style="flex: 1;">
                        <h4 style="margin: 0; color: #374151;">🔗 Resources</h4>
                        <div style="display: flex; gap: 1.5rem; margin-top: 0.5rem;">
                            <a href="https://github.com/pixeltable/pixeltable" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                GitHub
                            </a>
                            <a href="https://docs.pixeltable.com" target="_blank" style="color: #4F46E5; text-decoration: none;">
                                Documentation
                            </a>
                        </div>
                    </div>
                </div>
            </div>
            """
        )

        # Connect the process button
        process_btn.click(
            fn=process_video,
            inputs=[video_input, api_key, prompt_template, voice_choice],
            outputs=[content_output, audio_output]
        )

    return demo
393
+
394
if __name__ == "__main__":
    # Build the Gradio UI and serve it with default launch settings.
    create_interface().launch()