Commit fd13285 · First commit
Parent(s): 4e022c1

Files changed:
- app.py +171 -0
- requirements.txt +20 -0
app.py
ADDED
@@ -0,0 +1,171 @@
import os
import hashlib
import requests
import numpy as np
from PIL import Image
import decord
from decord import VideoReader, cpu
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import gradio as gr

# ----------------------------------------
# 1. Initialize the Qwen 2.5 VL Model (7B)
# ----------------------------------------
# We load the official 7B version, using flash attention optimization and bfloat16 for efficiency.
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto"  # Automatically places the model on available GPU
)
processor = AutoProcessor.from_pretrained(model_path)

# -------------------------------------------------
# 2. Define Utility Functions for Video Processing
# -------------------------------------------------

def download_video(url, dest_path):
    """
    Download the video from the given URL and save it to a destination path.
    """
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8096):
            f.write(chunk)
    print(f"Video downloaded to {dest_path}")

def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
    """
    Download (if needed) and extract frames and timestamps from the video.
    - Uses caching to avoid repeated processing.
    - Utilizes decord to read video frames.
    """
    os.makedirs(cache_dir, exist_ok=True)
    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()

    # If the video is a URL, download it locally
    if video_path.startswith('http://') or video_path.startswith('https://'):
        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
        if not os.path.exists(video_file_path):
            download_video(video_path, video_file_path)
    else:
        video_file_path = video_path

    # Check if frames have been cached already
    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
        frames = np.load(frames_cache_file)
        timestamps = np.load(timestamps_cache_file)
        return video_file_path, frames, timestamps

    # Read video using decord
    vr = VideoReader(video_file_path, ctx=cpu(0))
    total_frames = len(vr)
    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
    frames = vr.get_batch(indices).asnumpy()
    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])

    # Save the results to cache for later re-use
    np.save(frames_cache_file, frames)
    np.save(timestamps_cache_file, timestamps)

    return video_file_path, frames, timestamps

# --------------------------------------------------------
# 3. Inference Function Using Qwen 2.5 VL to Process the Video
# --------------------------------------------------------
def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
    """
    Prepare the input messages with the prompt and video metadata,
    process the video inputs, and run inference through the model.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
        ]},
    ]
    # Prepare the text with the chat template from the processor.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Process the video information into the proper inputs.
    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
    fps_inputs = video_kwargs['fps']
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        fps=fps_inputs,
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to('cuda')

    # Generate the response using the model
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Post-process the output tokens to text.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

# -------------------------------------------------
# 4. Define Sample Prompts for Users
# -------------------------------------------------
sample_prompts = [
    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
]

# -------------------------------------------------
# 5. Main Processing Function for the Gradio Interface
# -------------------------------------------------
def process_video(video_url, custom_prompt, sample_prompt):
    """
    This function is called when a user clicks the 'Process Video' button.
    - It uses the custom prompt if provided; otherwise, it falls back to the selected sample prompt.
    - It then downloads and processes the video and calls the inference function.
    """
    # Choose the prompt: use the custom prompt if not empty, else use the sample
    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
    try:
        video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
    except Exception as e:
        return f"Error processing video: {str(e)}"

    try:
        output = inference(video_path, final_prompt)
    except Exception as e:
        return f"Error during inference: {str(e)}"

    return output

# -------------------------------------------------
# 6. Build the Gradio Interface
# -------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B)")
    gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")

    with gr.Row():
        video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL...", lines=1)
    with gr.Row():
        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
    with gr.Row():
        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])

    output_text = gr.Textbox(label="Output", lines=10)
    run_button = gr.Button("Process Video")

    # When the button is clicked, run the process_video function.
    run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)

# -------------------------------------------------
# 7. Launch the App
# -------------------------------------------------
if __name__ == "__main__":
    demo.launch()
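
For reference, the pipeline above can be exercised without the Gradio UI. The snippet below is a minimal sketch, not part of the commit: it assumes app.py is importable from the repo root (importing it loads the 7B model at module level, so a CUDA GPU with enough memory is required) and uses a placeholder video URL.

# Hypothetical smoke test; assumes app.py is on the import path and a CUDA GPU is available.
from app import get_video_frames, inference

video_url = "https://example.com/sample.mp4"  # placeholder direct media URL, not from this commit
video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
print(inference(video_path, "Segment the video into chapters and output 'mm:ss Title' lines."))

Note that get_video_frames expects a direct media URL or a local file path; a YouTube watch-page URL would need to be resolved to a downloadable stream first, since download_video simply saves whatever requests.get returns.
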
requirements.txt
ADDED
@@ -0,0 +1,20 @@
# Core dependencies for Qwen 2.5 VL
gradio==5.4.0
gradio_client==1.4.2
qwen-vl-utils==0.0.10
transformers-stream-generator==0.0.4
torch==2.4.0
torchvision==0.19.0
git+https://github.com/huggingface/transformers.git
accelerate
av
qwen-vl-utils

# Flash attention (required: app.py loads the model with attn_implementation="flash_attention_2")
flash-attn==2.6.1

# Additional dependencies for video processing and utilities
decord
numpy
Pillow
requests
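
A possible local install sequence (a sketch, not from the commit; on Hugging Face Spaces the requirements file is installed automatically): flash-attn generally needs torch present at build time, so installing torch first and disabling build isolation tends to be the safer order.

pip install torch==2.4.0 torchvision==0.19.0
pip install -r requirements.txt --no-build-isolation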