Commit 6621c82
Parent(s): 9d2876b
Change application flow.

Files changed:
- app.py +173 -145
- app_video_understant.py +165 -0
- local_video_understant_app.py +166 -0
- requirements.txt +6 -19
- requirements_vu.txt +20 -0
app.py CHANGED
@@ -1,165 +1,193 @@

Old version (removed lines are prefixed with "-"; "- [...]" marks removed lines whose text is not shown in the diff view):

import os
import hashlib
import requests
import numpy as np
-from PIL import Image
-import decord
-from decord import VideoReader, cpu
-import torch
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
import gradio as gr
- [...]
-# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
-# ----------------------------------------
-model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    model_path,
-    torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
-    # Removed attn_implementation and device_map for CPU-only deployment
-)
-processor = AutoProcessor.from_pretrained(model_path)
-
-# -------------------------------------------------
-# 2. Define Utility Functions for Video Processing
-# -------------------------------------------------
-def download_video(url, dest_path):
-    """
-    Download a non-YouTube video using requests.
-    (This function is retained if you need it later.)
-    """
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        for chunk in response.iter_content(chunk_size=8096):
-            f.write(chunk)
-    print(f"Video downloaded to {dest_path}")
-
-def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
-    """
-    Extract frames and timestamps from a video file.
-    If the video_path is a URL, it will download it.
-    For local files (including uploaded videos), it processes directly.
-    Uses caching to avoid repeated processing.
-    """
-    os.makedirs(cache_dir, exist_ok=True)
-    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
-
-    # If video_path starts with 'http', attempt to download
-    if video_path.startswith('http'):
-        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
-        if not os.path.exists(video_file_path):
-            print("Downloading video using requests...")
-            download_video(video_path, video_file_path)
-    else:
-        # For local files (uploaded videos), use the provided path directly.
-        video_file_path = video_path

- [...]
-    return [...]
- [...]
-        {"role": "user", "content": [
-            {"type": "text", "text": prompt},
-            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
-        ]},
-    ]
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
-    fps_inputs = video_kwargs['fps']
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        fps=fps_inputs,
-        padding=True,
-        return_tensors="pt"
-    )
-    # In CPU-only mode, we use the default device (no .to('cuda'))
-    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
-    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-    return output_text[0]

- [...]
-# 5. Main Processing Function for the Gradio Interface
-# -------------------------------------------------
-def process_video(video_file, custom_prompt, sample_prompt):
    """
- [...]
-    Processes the uploaded video file and runs inference.
    """
- [...]
    try:
- [...]
    except Exception as e:
-        return f"Error during [...]
- [...]

if __name__ == "__main__":
- [...]

New version (added lines are prefixed with "+"):

import os
+import datetime
import hashlib
import requests
import numpy as np
import gradio as gr
+import whisper
+import srt
+import torch

+LANGUAGE_OPTIONS = {
+    "Afrikaans": "af",
+    "Arabic": "ar",
+    "Azerbaijani": "az",
+    "Belarusian": "be",
+    "Bulgarian": "bg",
+    "Bengali": "bn",
+    "Catalan": "ca",
+    "Czech": "cs",
+    "Welsh": "cy",
+    "Danish": "da",
+    "German": "de",
+    "Greek": "el",
+    "English": "en",
+    "Spanish": "es",
+    "Estonian": "et",
+    "Persian": "fa",
+    "Finnish": "fi",
+    "French": "fr",
+    "Irish": "ga",
+    "Galician": "gl",
+    "Gujarati": "gu",
+    "Hebrew": "he",
+    "Hindi": "hi",
+    "Croatian": "hr",
+    "Hungarian": "hu",
+    "Armenian": "hy",
+    "Indonesian": "id",
+    "Icelandic": "is",
+    "Italian": "it",
+    "Japanese": "ja",
+    "Georgian": "ka",
+    "Kazakh": "kk",
+    "Khmer": "km",
+    "Kannada": "kn",
+    "Korean": "ko",
+    "Lithuanian": "lt",
+    "Latvian": "lv",
+    "Macedonian": "mk",
+    "Malayalam": "ml",
+    "Mongolian": "mn",
+    "Marathi": "mr",
+    "Malay": "ms",
+    "Maltese": "mt",
+    "Nepali": "ne",
+    "Dutch": "nl",
+    "Norwegian": "no",
+    "Odia": "or",
+    "Punjabi": "pa",
+    "Polish": "pl",
+    "Portuguese": "pt",
+    "Romanian": "ro",
+    "Russian": "ru",
+    "Sinhala": "si",
+    "Slovak": "sk",
+    "Slovenian": "sl",
+    "Albanian": "sq",
+    "Serbian": "sr",
+    "Swedish": "sv",
+    "Swahili": "sw",
+    "Tamil": "ta",
+    "Telugu": "te",
+    "Thai": "th",
+    "Turkish": "tr",
+    "Ukrainian": "uk",
+    "Urdu": "ur",
+    "Vietnamese": "vi",
+    "Chinese": "zh"
+}

+def transcribe_audio(audio_file_path, model_size='base', language="en"):
+    model = whisper.load_model(model_size)
+    model.to("cpu")
+    result = model.transcribe(audio_file_path, language=language)
+    transcription = result["text"]
+    segments = result["segments"]

+    try:
+        from whisper.utils import format_srt
+        srt_text = format_srt(segments)
+    except Exception:
+        srt_text = generate_srt(segments)

+    return transcription, srt_text, segments

+def generate_srt(segments):
+    import datetime
+    import srt
+    subtitles = []
+    for i, seg in enumerate(segments):
+        start_td = datetime.timedelta(seconds=seg["start"])
+        end_td = datetime.timedelta(seconds=seg["end"])
+        subtitle = srt.Subtitle(index=i+1, start=start_td, end=end_td, content=seg["text"])
+        subtitles.append(subtitle)
+    return srt.compose(subtitles)

+def prepare_chapter_prompt(srt_text):
+    system_prompt = (
+        "You are a highly skilled video content segmentation and optimization expert. "
+        "Your task is to analyze a transcript of a YouTube video provided in SRT format and produce engaging and concise chapter headers. "
+        "Each chapter header must be on its own line in the exact format: 'mm:ss Chapter Title'.\n\n"
+        "- 'mm:ss' represents the starting time of the chapter (minutes and seconds).\n"
+        "- 'Chapter Title' must be a catchy, audience-friendly title that summarizes the key idea or transition at that point in the video.\n\n"
+        "IMPORTANT: Although these instructions are in English, please ensure that your output is in the same language as the provided SRT transcript."
+    )
+    user_prompt = (
+        "Below is the transcript of a YouTube video in SRT format:\n\n"
+        "```\n"
+        f"{srt_text}\n"
+        "```\n\n"
+        "Please generate only the chapter breakdown using the guidelines above. "
+        "Each chapter header should be formatted as:\n"
+        "mm:ss Chapter Title"
+    )
+    return system_prompt + "\n\n" + user_prompt

+def format_prompt_html(prompt):
    """
+    Displays the prompt in a read-only textarea using Gradio's color variables for background and text.
+    Includes a 'Copy Prompt' button (blue) and a short 'Prompt Copied!' confirmation message.
    """
+    html_content = f"""
+    <div style="display: flex; flex-direction: column; gap: 10px; margin-top: 10px;">
+      <textarea id="prompt_text" rows="10"
+                style="width: 100%; resize: vertical;
+                       background-color: var(--block-background-fill);
+                       color: var(--block-text-color);
+                       border: 1px solid var(--block-border-color);
+                       border-radius: 4px;"
+                readonly>{prompt}</textarea>
+      <button
+        style="width: 150px; padding: 8px;
+               background-color: #007bff;
+               color: white;
+               border: none;
+               border-radius: 4px;
+               cursor: pointer;"
+        onclick="
+          navigator.clipboard.writeText(document.getElementById('prompt_text').value);
+          const copiedMsg = document.getElementById('copied_msg');
+          copiedMsg.style.display = 'inline';
+          setTimeout(() => copiedMsg.style.display = 'none', 2000);
+        ">
+        Copy Prompt
+      </button>
+      <span id="copied_msg" style="display: none; color: var(--primary-text-color); font-weight: bold;">Prompt Copied!</span>
+    </div>
+    """
+    return html_content
+
+def process_audio(audio, language_name):
+    lang_code = LANGUAGE_OPTIONS.get(language_name, "en")
    try:
+        transcription, srt_text, segments = transcribe_audio(audio, model_size='base', language=lang_code)
    except Exception as e:
+        return f"Error during transcription: {str(e)}", "", ""

+    chapter_prompt = prepare_chapter_prompt(srt_text)
+    prompt_html = format_prompt_html(chapter_prompt)
+    return transcription, srt_text, prompt_html

+iface = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Audio"),
+        gr.Dropdown(choices=list(LANGUAGE_OPTIONS.keys()), label="Audio Language", value="English")
+    ],
+    outputs=[
+        gr.Textbox(label="Full Transcription", lines=10),
+        gr.Textbox(label="SRT File Content", lines=10),
+        gr.HTML(label="Prepared Chapter Prompt (Copy & Paste into ChatGPT)")
+    ],
+    title="Video Chapter Splitter from Audio (MP3)",
+    description=(
+        "Upload an audio file (e.g., MP3) of your YouTube video and select the audio language. "
+        "The app will transcribe the audio using Whisper, generate subtitles in SRT format, "
+        "and prepare a single, complete prompt that instructs ChatGPT/GPT-4 to generate a chapter breakdown in the format 'mm:ss Chapter Title'.\n\n"
+        "Click the 'Copy Prompt' button to copy the entire prompt, and a brief 'Prompt Copied!' message will appear."
+    )
+)

if __name__ == "__main__":
+    iface.launch()
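The fallback path in the new app.py is worth a closer look: whisper.utils may not provide a format_srt helper in a given openai-whisper release (hence the try/except in transcribe_audio), in which case generate_srt builds the SRT text with the srt package. Below is a minimal, self-contained sketch (not part of the commit) of that path using made-up segment values rather than real transcription output, plus the mm:ss conversion the chapter prompt asks the LLM to perform:

import datetime
import srt

# Hypothetical Whisper-style segments, for illustration only.
fake_segments = [
    {"start": 0.0, "end": 4.2, "text": "Welcome to the channel."},
    {"start": 75.5, "end": 80.0, "text": "Now the main topic."},
]

subtitles = [
    srt.Subtitle(index=i + 1,
                 start=datetime.timedelta(seconds=seg["start"]),
                 end=datetime.timedelta(seconds=seg["end"]),
                 content=seg["text"])
    for i, seg in enumerate(fake_segments)
]
print(srt.compose(subtitles))  # numbered blocks with "00:00:00,000 --> 00:00:04,200" style timings

# A chapter anchored at the second segment would start at "01:15" in the 'mm:ss Chapter Title' format.
start = fake_segments[1]["start"]
print(f"{int(start // 60):02d}:{int(start % 60):02d} Main Topic")  # -> 01:15 Main Topic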
app_video_understant.py ADDED
@@ -0,0 +1,165 @@

+import os
+import hashlib
+import requests
+import numpy as np
+from PIL import Image
+import decord
+from decord import VideoReader, cpu
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import gradio as gr
+# Removed pytube since we no longer download from YouTube
+
+# ----------------------------------------
+# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
+# ----------------------------------------
+model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
+    # Removed attn_implementation and device_map for CPU-only deployment
+)
+processor = AutoProcessor.from_pretrained(model_path)
+
+# -------------------------------------------------
+# 2. Define Utility Functions for Video Processing
+# -------------------------------------------------
+def download_video(url, dest_path):
+    """
+    Download a non-YouTube video using requests.
+    (This function is retained if you need it later.)
+    """
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8096):
+            f.write(chunk)
+    print(f"Video downloaded to {dest_path}")
+
+def get_video_frames(video_path, num_frames=16, cache_dir='.cache'):
+    """
+    Extract frames and timestamps from a video file.
+    If the video_path is a URL, it will download it.
+    For local files (including uploaded videos), it processes directly.
+    Uses caching to avoid repeated processing.
+    """
+    os.makedirs(cache_dir, exist_ok=True)
+    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+    # If video_path starts with 'http', attempt to download
+    if video_path.startswith('http'):
+        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
+        if not os.path.exists(video_file_path):
+            print("Downloading video using requests...")
+            download_video(video_path, video_file_path)
+    else:
+        # For local files (uploaded videos), use the provided path directly.
+        video_file_path = video_path
+
+    # Check for cached frames
+    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
+    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
+    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+        frames = np.load(frames_cache_file)
+        timestamps = np.load(timestamps_cache_file)
+        return video_file_path, frames, timestamps
+
+    # Read video using decord
+    vr = VideoReader(video_file_path, ctx=cpu(0))
+    total_frames = len(vr)
+    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+    frames = vr.get_batch(indices).asnumpy()
+    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+    # Save to cache
+    np.save(frames_cache_file, frames)
+    np.save(timestamps_cache_file, timestamps)
+
+    return video_file_path, frames, timestamps
+
+# --------------------------------------------------------
+# 3. Inference Function Using Qwen 2.5 VL to Process the Video
+# --------------------------------------------------------
+def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+    """
+    Prepares the input messages with the prompt and video metadata,
+    processes the video inputs, and runs inference through the model.
+    """
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {"type": "text", "text": prompt},
+            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+        ]},
+    ]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+    fps_inputs = video_kwargs['fps']
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        fps=fps_inputs,
+        padding=True,
+        return_tensors="pt"
+    )
+    # In CPU-only mode, we use the default device (no .to('cuda'))
+    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return output_text[0]
+
+# -------------------------------------------------
+# 4. Define Sample Prompts for Users
+# -------------------------------------------------
+sample_prompts = [
+    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+]
+
+# -------------------------------------------------
+# 5. Main Processing Function for the Gradio Interface
+# -------------------------------------------------
+def process_video(video_file, custom_prompt, sample_prompt):
+    """
+    Called when the user clicks 'Process Video'.
+    Uses the custom prompt if provided; otherwise, uses the sample prompt.
+    Processes the uploaded video file and runs inference.
+    """
+    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+    try:
+        # video_file is expected to be a local file path from the uploader.
+        video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
+    except Exception as e:
+        return f"Error processing video: {str(e)}"
+
+    try:
+        output = inference(video_path, final_prompt)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
+
+    return output
+
+# -------------------------------------------------
+# 6. Build the Gradio Interface
+# -------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
+    gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+
+    with gr.Row():
+        # Removed the source parameter here
+        video_input = gr.Video(label="Upload Video")
+    with gr.Row():
+        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+    with gr.Row():
+        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+
+    output_text = gr.Textbox(label="Output", lines=10)
+    run_button = gr.Button("Process Video")
+
+    run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+if __name__ == "__main__":
+    demo.launch()
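For reference, a small usage sketch (not part of the commit) of the frame-sampling helper above; it assumes the imports and functions from app_video_understant.py and a placeholder file name. decord's get_frame_timestamp returns a (start, end) pair in seconds for each sampled frame, which is what the cached timestamps array holds. Note that process_video calls this helper with num_frames=64, overriding the default of 16.

# "sample_clip.mp4" is a hypothetical path; any local video readable by decord works.
path, frames, timestamps = get_video_frames("sample_clip.mp4", num_frames=16)
print(frames.shape)                # (16, height, width, 3) uint8 frames sampled evenly across the clip
for start_s, end_s in timestamps:  # one (start, end) second pair per sampled frame
    print(f"sampled frame near {int(start_s // 60):02d}:{int(start_s % 60):02d}")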
local_video_understant_app.py ADDED
@@ -0,0 +1,166 @@

+import os
+import hashlib
+import requests
+import numpy as np
+from PIL import Image
+import decord
+from decord import VideoReader, cpu
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import gradio as gr
+
+# ---------------------------------------------------
+# 1. Set Up Device: Use Apple's MPS if available, else CPU
+# ---------------------------------------------------
+device = "mps" if torch.backends.mps.is_available() else "cpu"
+print(f"Using device: {device}")
+# For MPS, we can try using float16 to reduce memory usage.
+torch_dtype = torch.float16 if device == "mps" else torch.float32
+
+# ---------------------------------------------------
+# 2. Initialize the Qwen 2.5 VL Model (3B) for Local Use
+# ---------------------------------------------------
+model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch_dtype
+)
+model.to(device)
+processor = AutoProcessor.from_pretrained(model_path)
+
+# ---------------------------------------------------
+# 3. Utility Functions for Video Processing
+# ---------------------------------------------------
+def download_video(url, dest_path):
+    """
+    Downloads a video from a URL.
+    (This function is kept here if you ever need to download via URL.)
+    """
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8096):
+            f.write(chunk)
+    print(f"Video downloaded to {dest_path}")
+
+def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
+    """
+    Extract frames and timestamps from a video file.
+    If video_path is a URL, it downloads it; otherwise it assumes a local file.
+    Caching is used to avoid re-processing.
+    """
+    os.makedirs(cache_dir, exist_ok=True)
+    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+    # If the path starts with 'http', download the file.
+    if video_path.startswith("http"):
+        video_file_path = os.path.join(cache_dir, f"{video_hash}.mp4")
+        if not os.path.exists(video_file_path):
+            print("Downloading video using requests...")
+            download_video(video_path, video_file_path)
+    else:
+        video_file_path = video_path
+
+    frames_cache_file = os.path.join(cache_dir, f"{video_hash}_{num_frames}_frames.npy")
+    timestamps_cache_file = os.path.join(cache_dir, f"{video_hash}_{num_frames}_timestamps.npy")
+    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+        frames = np.load(frames_cache_file)
+        timestamps = np.load(timestamps_cache_file)
+        return video_file_path, frames, timestamps
+
+    # Load video using decord
+    vr = VideoReader(video_file_path, ctx=cpu(0))
+    total_frames = len(vr)
+    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+    frames = vr.get_batch(indices).asnumpy()
+    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+    # Cache the frames and timestamps
+    np.save(frames_cache_file, frames)
+    np.save(timestamps_cache_file, timestamps)
+
+    return video_file_path, frames, timestamps
+
+# ---------------------------------------------------
+# 4. Inference Function Using Qwen 2.5 VL (3B)
+# ---------------------------------------------------
+def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+    """
+    Prepares the input with the prompt and video metadata,
+    processes the video inputs, and runs inference through the model.
+    """
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {"type": "text", "text": prompt},
+            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+        ]},
+    ]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+    fps_inputs = video_kwargs["fps"]
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        fps=fps_inputs,
+        padding=True,
+        return_tensors="pt"
+    )
+    # Move inputs to our chosen device (MPS or CPU)
+    inputs = inputs.to(device)
+    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return output_text[0]
+
+# ---------------------------------------------------
+# 5. Define Sample Prompts
+# ---------------------------------------------------
+sample_prompts = [
+    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+]
+
+# ---------------------------------------------------
+# 6. Main Processing Function for the Gradio Interface
+# ---------------------------------------------------
+def process_video(video_file, custom_prompt, sample_prompt):
+    """
+    Called when the user clicks 'Process Video'.
+    Uses a custom prompt (if provided) or the sample prompt.
+    Processes the uploaded video and runs inference.
+    """
+    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+    try:
+        # Here, video_file is the local file path from the uploader.
+        video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
+    except Exception as e:
+        return f"Error processing video: {str(e)}"
+
+    try:
+        output = inference(video_path, final_prompt)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
+
+    return output
+
+# ---------------------------------------------------
+# 7. Build the Gradio Interface for Local Use
+# ---------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on Mac")
+    gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+    with gr.Row():
+        video_input = gr.Video(label="Upload Video")
+    with gr.Row():
+        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+    with gr.Row():
+        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+    output_text = gr.Textbox(label="Output", lines=10)
+    run_button = gr.Button("Process Video")
+    run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+if __name__ == "__main__":
+    demo.launch()
requirements.txt CHANGED
@@ -1,20 +1,7 @@

Old version (removed lines prefixed with "-"; "- [...]" marks elided removed lines):

- [...]
-torch==2.4.0
-torchvision==0.19.0
-git+https://github.com/huggingface/transformers.git
-accelerate
-av
-
-# Optional dependency (uncomment if flash attention is needed)
-# flash-attn==2.6.1
-
-# Additional dependencies for video processing and utilities
-decord #use decord for linux or other OS
-numpy
-Pillow
requests
-

New version (added lines prefixed with "+"):

+gradio>=3.0
+openai-whisper
+srt
+transformers
+torch>=2.0.0
requests
+numpy
requirements_vu.txt ADDED
@@ -0,0 +1,20 @@

+# Core dependencies from Qwen 2.5 VL
+gradio
+gradio_client
+qwen-vl-utils
+transformers-stream-generator==0.0.4
+torch==2.4.0
+torchvision==0.19.0
+git+https://github.com/huggingface/transformers.git
+accelerate
+av
+
+# Optional dependency (uncomment if flash attention is needed)
+# flash-attn==2.6.1
+
+# Additional dependencies for video processing and utilities
+decord #use decord for linux or other OS
+numpy
+Pillow
+requests
+pytube