BahadirGLCK committed
Commit fd13285 · 1 Parent(s): 4e022c1

First commit

Files changed (2):
  1. app.py +171 -0
  2. requirements.txt +20 -0
app.py ADDED
@@ -0,0 +1,171 @@
+import os
+import hashlib
+import requests
+import numpy as np
+from PIL import Image
+import decord
+from decord import VideoReader, cpu
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import gradio as gr
+
+# ----------------------------------------
+# 1. Initialize the Qwen 2.5 VL Model (7B)
+# ----------------------------------------
+# We load the official 7B version, using flash attention optimization and bfloat16 for efficiency.
+model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map="auto"  # Automatically places the model on available GPU
+)
+processor = AutoProcessor.from_pretrained(model_path)
+
+# -------------------------------------------------
+# 2. Define Utility Functions for Video Processing
+# -------------------------------------------------
+
+def download_video(url, dest_path):
+    """
+    Download the video from the given URL and save it to a destination path.
+    """
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8096):
+            f.write(chunk)
+    print(f"Video downloaded to {dest_path}")
+
+def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
+    """
+    Download (if needed) and extract frames and timestamps from the video.
+    - Uses caching to avoid repeated processing.
+    - Utilizes decord to read video frames.
+    """
+    os.makedirs(cache_dir, exist_ok=True)
+    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+    # If the video is a URL, download it locally
+    if video_path.startswith('http://') or video_path.startswith('https://'):
+        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
+        if not os.path.exists(video_file_path):
+            download_video(video_path, video_file_path)
+    else:
+        video_file_path = video_path
+
+    # Check if frames have been cached already
+    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
+    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
+    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+        frames = np.load(frames_cache_file)
+        timestamps = np.load(timestamps_cache_file)
+        return video_file_path, frames, timestamps
+
+    # Read video using decord
+    vr = VideoReader(video_file_path, ctx=cpu(0))
+    total_frames = len(vr)
+    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+    frames = vr.get_batch(indices).asnumpy()
+    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+    # Save the results to cache for later re-use
+    np.save(frames_cache_file, frames)
+    np.save(timestamps_cache_file, timestamps)
+
+    return video_file_path, frames, timestamps
+
+# --------------------------------------------------------
+# 3. Inference Function Using Qwen 2.5 VL to Process the Video
+# --------------------------------------------------------
+def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+    """
+    Prepare the input messages with the prompt and video metadata,
+    process the video inputs, and run inference through the model.
+    """
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {"type": "text", "text": prompt},
+            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+        ]},
+    ]
+    # Prepare the text with the chat template from the processor.
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Process the video information into the proper inputs.
+    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+    fps_inputs = video_kwargs['fps']
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        fps=fps_inputs,
+        padding=True,
+        return_tensors="pt"
+    )
+    inputs = inputs.to('cuda')
+
+    # Generate the response using the model
+    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    # Post-process the output tokens to text.
+    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return output_text[0]
+
+# -------------------------------------------------
+# 4. Define Sample Prompts for Users
+# -------------------------------------------------
+sample_prompts = [
+    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+]
+
+# -------------------------------------------------
+# 5. Main Processing Function for the Gradio Interface
+# -------------------------------------------------
+def process_video(video_url, custom_prompt, sample_prompt):
+    """
+    This function is called when a user clicks the 'Process Video' button.
+    - It uses the custom prompt if provided; otherwise, it falls back to the selected sample prompt.
+    - It then downloads and processes the video and calls the inference function.
+    """
+    # Choose the prompt: use custom prompt if not empty, else use the sample
+    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+    try:
+        video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
+    except Exception as e:
+        return f"Error processing video: {str(e)}"
+
+    try:
+        output = inference(video_path, final_prompt)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
+
+    return output
+
+# -------------------------------------------------
+# 6. Build the Gradio Interface
+# -------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B)")
+    gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+
+    with gr.Row():
+        video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL...", lines=1)
+    with gr.Row():
+        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+    with gr.Row():
+        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+
+    output_text = gr.Textbox(label="Output", lines=10)
+    run_button = gr.Button("Process Video")
+
+    # When the button is clicked, run the process_video function.
+    run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+# -------------------------------------------------
+# 7. Launch the App
+# -------------------------------------------------
+if __name__ == "__main__":
+    demo.launch()
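
For reference, a minimal sketch of how the helpers above could be exercised outside the Gradio UI, assuming a locally available MP4 at a hypothetical path such as sample.mp4 (not part of this commit):

    video_path, frames, timestamps = get_video_frames("sample.mp4", num_frames=64)
    print(inference(video_path, sample_prompts[0]))
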
requirements.txt ADDED
@@ -0,0 +1,20 @@
+# Core dependencies for Qwen 2.5 VL
+gradio==5.4.0
+gradio_client==1.4.2
+qwen-vl-utils==0.0.10
+transformers-stream-generator==0.0.4
+torch==2.4.0
+torchvision==0.19.0
+git+https://github.com/huggingface/transformers.git
+accelerate
+av
+# qwen-vl-utils  (already pinned above)
+
+# Flash attention (required: app.py loads the model with attn_implementation="flash_attention_2")
+flash-attn==2.6.1
+
+# Additional dependencies for video processing and utilities
+decord
+numpy
+Pillow
+requests
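
Because app.py moves inputs to 'cuda' unconditionally and loads the model with flash_attention_2, both a CUDA GPU and a working flash-attn build are effectively hard requirements. A small sanity check along these lines (a sketch, not part of the commit) can surface a broken environment before the Gradio app starts:

    import torch

    assert torch.cuda.is_available(), "app.py expects a CUDA-capable GPU"
    try:
        import flash_attn  # needed because the model is loaded with attn_implementation="flash_attention_2"
    except ImportError as exc:
        raise SystemExit("flash-attn is missing or failed to build") from exc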