BahadirGLCK committed
Commit 6621c82 · Parent(s): 9d2876b

Change application flow: app.py now transcribes an uploaded audio file with Whisper, builds an SRT transcript, and prepares a copy-and-paste chapter prompt for ChatGPT; the previous Qwen 2.5 VL video apps move to app_video_understant.py and local_video_understant_app.py, and the Qwen dependencies move to requirements_vu.txt.
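For orientation, an editor's sketch (not part of the commit) of how the new app.py flow fits together, assuming its functions are importable, ffmpeg and the Whisper "base" model are available, and "talk.mp3" is a hypothetical local audio file:

    # Sketch of the new flow: audio -> transcript + SRT -> chapter prompt for ChatGPT.
    from app import transcribe_audio, prepare_chapter_prompt

    # Transcribe the audio and build SRT subtitles (language codes as in LANGUAGE_OPTIONS).
    transcription, srt_text, segments = transcribe_audio("talk.mp3", model_size="base", language="en")

    # Build the single prompt that asks for 'mm:ss Chapter Title' lines.
    prompt = prepare_chapter_prompt(srt_text)
    print(prompt)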

app.py CHANGED
@@ -1,165 +1,193 @@
  import os
  import hashlib
  import requests
  import numpy as np
- from PIL import Image
- import decord
- from decord import VideoReader, cpu
- import torch
- from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
- from qwen_vl_utils import process_vision_info
  import gradio as gr
- # Removed pytube since we no longer download from YouTube
-
- # ----------------------------------------
- # 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
- # ----------------------------------------
- model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     model_path,
-     torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
-     # Removed attn_implementation and device_map for CPU-only deployment
- )
- processor = AutoProcessor.from_pretrained(model_path)
-
- # -------------------------------------------------
- # 2. Define Utility Functions for Video Processing
- # -------------------------------------------------
- def download_video(url, dest_path):
-     """
-     Download a non-YouTube video using requests.
-     (This function is retained if you need it later.)
-     """
-     response = requests.get(url, stream=True)
-     with open(dest_path, 'wb') as f:
-         for chunk in response.iter_content(chunk_size=8096):
-             f.write(chunk)
-     print(f"Video downloaded to {dest_path}")
-
- def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
-     """
-     Extract frames and timestamps from a video file.
-     If the video_path is a URL, it will download it.
-     For local files (including uploaded videos), it processes directly.
-     Uses caching to avoid repeated processing.
-     """
-     os.makedirs(cache_dir, exist_ok=True)
-     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
-
-     # If video_path starts with 'http', attempt to download
-     if video_path.startswith('http'):
-         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
-         if not os.path.exists(video_file_path):
-             print("Downloading video using requests...")
-             download_video(video_path, video_file_path)
-     else:
-         # For local files (uploaded videos), use the provided path directly.
-         video_file_path = video_path

-     # Check for cached frames
-     frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
-     timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
-     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
-         frames = np.load(frames_cache_file)
-         timestamps = np.load(timestamps_cache_file)
-         return video_file_path, frames, timestamps
-
-     # Read video using decord
-     vr = VideoReader(video_file_path, ctx=cpu(0))
-     total_frames = len(vr)
-     indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
-     frames = vr.get_batch(indices).asnumpy()
-     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
-
-     # Save to cache
-     np.save(frames_cache_file, frames)
-     np.save(timestamps_cache_file, timestamps)
-
-     return video_file_path, frames, timestamps
-
- # --------------------------------------------------------
- # 3. Inference Function Using Qwen 2.5 VL to Process the Video
- # --------------------------------------------------------
- def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
-     """
-     Prepares the input messages with the prompt and video metadata,
-     processes the video inputs, and runs inference through the model.
-     """
-     messages = [
-         {"role": "system", "content": "You are a helpful assistant."},
-         {"role": "user", "content": [
-             {"type": "text", "text": prompt},
-             {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
-         ]},
-     ]
-     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-     image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
-     fps_inputs = video_kwargs['fps']
-     inputs = processor(
-         text=[text],
-         images=image_inputs,
-         videos=video_inputs,
-         fps=fps_inputs,
-         padding=True,
-         return_tensors="pt"
-     )
-     # In CPU-only mode, we use the default device (no .to('cuda'))
-     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
-     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-     return output_text[0]
-
- # -------------------------------------------------
- # 4. Define Sample Prompts for Users
- # -------------------------------------------------
- sample_prompts = [
-     "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
-     "Provide a breakdown of the video's content by segment, including starting times and summaries.",
-     "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
- ]
-
- # -------------------------------------------------
- # 5. Main Processing Function for the Gradio Interface
- # -------------------------------------------------
- def process_video(video_file, custom_prompt, sample_prompt):
      """
-     Called when the user clicks 'Process Video'.
-     Uses the custom prompt if provided; otherwise, uses the sample prompt.
-     Processes the uploaded video file and runs inference.
      """
-     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
-     try:
-         # video_file is expected to be a local file path from the uploader.
-         video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
-     except Exception as e:
-         return f"Error processing video: {str(e)}"
-
      try:
-         output = inference(video_path, final_prompt)
      except Exception as e:
-         return f"Error during inference: {str(e)}"
-
-     return output
-
- # -------------------------------------------------
- # 6. Build the Gradio Interface
- # -------------------------------------------------
- with gr.Blocks() as demo:
-     gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
-     gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
-
-     with gr.Row():
-         # Removed the source parameter here
-         video_input = gr.Video(label="Upload Video")
-     with gr.Row():
-         custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
-     with gr.Row():
-         sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
-
-     output_text = gr.Textbox(label="Output", lines=10)
-     run_button = gr.Button("Process Video")
-
-     run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)

  if __name__ == "__main__":
-     demo.launch()
 
  import os
+ import datetime
  import hashlib
  import requests
  import numpy as np
  import gradio as gr
+ import whisper
+ import srt
+ import torch

+ LANGUAGE_OPTIONS = {
+     "Afrikaans": "af",
+     "Arabic": "ar",
+     "Azerbaijani": "az",
+     "Belarusian": "be",
+     "Bulgarian": "bg",
+     "Bengali": "bn",
+     "Catalan": "ca",
+     "Czech": "cs",
+     "Welsh": "cy",
+     "Danish": "da",
+     "German": "de",
+     "Greek": "el",
+     "English": "en",
+     "Spanish": "es",
+     "Estonian": "et",
+     "Persian": "fa",
+     "Finnish": "fi",
+     "French": "fr",
+     "Irish": "ga",
+     "Galician": "gl",
+     "Gujarati": "gu",
+     "Hebrew": "he",
+     "Hindi": "hi",
+     "Croatian": "hr",
+     "Hungarian": "hu",
+     "Armenian": "hy",
+     "Indonesian": "id",
+     "Icelandic": "is",
+     "Italian": "it",
+     "Japanese": "ja",
+     "Georgian": "ka",
+     "Kazakh": "kk",
+     "Khmer": "km",
+     "Kannada": "kn",
+     "Korean": "ko",
+     "Lithuanian": "lt",
+     "Latvian": "lv",
+     "Macedonian": "mk",
+     "Malayalam": "ml",
+     "Mongolian": "mn",
+     "Marathi": "mr",
+     "Malay": "ms",
+     "Maltese": "mt",
+     "Nepali": "ne",
+     "Dutch": "nl",
+     "Norwegian": "no",
+     "Odia": "or",
+     "Punjabi": "pa",
+     "Polish": "pl",
+     "Portuguese": "pt",
+     "Romanian": "ro",
+     "Russian": "ru",
+     "Sinhala": "si",
+     "Slovak": "sk",
+     "Slovenian": "sl",
+     "Albanian": "sq",
+     "Serbian": "sr",
+     "Swedish": "sv",
+     "Swahili": "sw",
+     "Tamil": "ta",
+     "Telugu": "te",
+     "Thai": "th",
+     "Turkish": "tr",
+     "Ukrainian": "uk",
+     "Urdu": "ur",
+     "Vietnamese": "vi",
+     "Chinese": "zh"
+ }
+
+ def transcribe_audio(audio_file_path, model_size='base', language="en"):
+     model = whisper.load_model(model_size)
+     model.to("cpu")
+     result = model.transcribe(audio_file_path, language=language)
+     transcription = result["text"]
+     segments = result["segments"]
+
+     try:
+         from whisper.utils import format_srt
+         srt_text = format_srt(segments)
+     except Exception:
+         srt_text = generate_srt(segments)
+
+     return transcription, srt_text, segments
+
+ def generate_srt(segments):
+     import datetime
+     import srt
+     subtitles = []
+     for i, seg in enumerate(segments):
+         start_td = datetime.timedelta(seconds=seg["start"])
+         end_td = datetime.timedelta(seconds=seg["end"])
+         subtitle = srt.Subtitle(index=i+1, start=start_td, end=end_td, content=seg["text"])
+         subtitles.append(subtitle)
+     return srt.compose(subtitles)
+
+ def prepare_chapter_prompt(srt_text):
+     system_prompt = (
+         "You are a highly skilled video content segmentation and optimization expert. "
+         "Your task is to analyze a transcript of a YouTube video provided in SRT format and produce engaging and concise chapter headers. "
+         "Each chapter header must be on its own line in the exact format: 'mm:ss Chapter Title'.\n\n"
+         "- 'mm:ss' represents the starting time of the chapter (minutes and seconds).\n"
+         "- 'Chapter Title' must be a catchy, audience-friendly title that summarizes the key idea or transition at that point in the video.\n\n"
+         "IMPORTANT: Although these instructions are in English, please ensure that your output is in the same language as the provided SRT transcript."
+     )
+     user_prompt = (
+         "Below is the transcript of a YouTube video in SRT format:\n\n"
+         "```\n"
+         f"{srt_text}\n"
+         "```\n\n"
+         "Please generate only the chapter breakdown using the guidelines above. "
+         "Each chapter header should be formatted as:\n"
+         "mm:ss Chapter Title"
+     )
+     return system_prompt + "\n\n" + user_prompt
+
+ def format_prompt_html(prompt):
      """
+     Displays the prompt in a read-only textarea using Gradio's color variables for background and text.
+     Includes a 'Copy Prompt' button (blue) and a short 'Prompt Copied!' confirmation message.
      """
+     html_content = f"""
+     <div style="display: flex; flex-direction: column; gap: 10px; margin-top: 10px;">
+         <textarea id="prompt_text" rows="10"
+             style="width: 100%; resize: vertical;
+                    background-color: var(--block-background-fill);
+                    color: var(--block-text-color);
+                    border: 1px solid var(--block-border-color);
+                    border-radius: 4px;"
+             readonly>{prompt}</textarea>
+         <button
+             style="width: 150px; padding: 8px;
+                    background-color: #007bff;
+                    color: white;
+                    border: none;
+                    border-radius: 4px;
+                    cursor: pointer;"
+             onclick="
+                 navigator.clipboard.writeText(document.getElementById('prompt_text').value);
+                 const copiedMsg = document.getElementById('copied_msg');
+                 copiedMsg.style.display = 'inline';
+                 setTimeout(() => copiedMsg.style.display = 'none', 2000);
+             ">
+             Copy Prompt
+         </button>
+         <span id="copied_msg" style="display: none; color: var(--primary-text-color); font-weight: bold;">Prompt Copied!</span>
+     </div>
+     """
+     return html_content
+
+ def process_audio(audio, language_name):
+     lang_code = LANGUAGE_OPTIONS.get(language_name, "en")
      try:
+         transcription, srt_text, segments = transcribe_audio(audio, model_size='base', language=lang_code)
      except Exception as e:
+         return f"Error during transcription: {str(e)}", "", ""
+
+     chapter_prompt = prepare_chapter_prompt(srt_text)
+     prompt_html = format_prompt_html(chapter_prompt)
+     return transcription, srt_text, prompt_html
+
+ iface = gr.Interface(
+     fn=process_audio,
+     inputs=[
+         gr.Audio(type="filepath", label="Upload Audio"),
+         gr.Dropdown(choices=list(LANGUAGE_OPTIONS.keys()), label="Audio Language", value="English")
+     ],
+     outputs=[
+         gr.Textbox(label="Full Transcription", lines=10),
+         gr.Textbox(label="SRT File Content", lines=10),
+         gr.HTML(label="Prepared Chapter Prompt (Copy & Paste into ChatGPT)")
+     ],
+     title="Video Chapter Splitter from Audio (MP3)",
+     description=(
+         "Upload an audio file (e.g., MP3) of your YouTube video and select the audio language. "
+         "The app will transcribe the audio using Whisper, generate subtitles in SRT format, "
+         "and prepare a single, complete prompt that instructs ChatGPT/GPT-4 to generate a chapter breakdown in the format 'mm:ss Chapter Title'.\n\n"
+         "Click the 'Copy Prompt' button to copy the entire prompt, and a brief 'Prompt Copied!' message will appear."
+     )
+ )

  if __name__ == "__main__":
+     iface.launch()
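The chapter lines that ChatGPT returns for this prompt follow the 'mm:ss Chapter Title' convention described above. As an illustration only (this helper is not part of the commit), they could be parsed like so:

    import re

    def parse_chapters(text):
        # Turn lines such as '02:15 Main Topic' into (seconds, title) pairs.
        chapters = []
        for line in text.splitlines():
            m = re.match(r"^(\d{1,2}):(\d{2})\s+(.+)$", line.strip())
            if m:
                minutes, seconds, title = int(m.group(1)), int(m.group(2)), m.group(3)
                chapters.append((minutes * 60 + seconds, title))
        return chapters

    print(parse_chapters("00:00 Intro\n02:15 Main Topic"))  # [(0, 'Intro'), (135, 'Main Topic')]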
app_video_understant.py ADDED
@@ -0,0 +1,165 @@
+ import os
+ import hashlib
+ import requests
+ import numpy as np
+ from PIL import Image
+ import decord
+ from decord import VideoReader, cpu
+ import torch
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import gradio as gr
+ # Removed pytube since we no longer download from YouTube
+
+ # ----------------------------------------
+ # 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
+ # ----------------------------------------
+ model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     model_path,
+     torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
+     # Removed attn_implementation and device_map for CPU-only deployment
+ )
+ processor = AutoProcessor.from_pretrained(model_path)
+
+ # -------------------------------------------------
+ # 2. Define Utility Functions for Video Processing
+ # -------------------------------------------------
+ def download_video(url, dest_path):
+     """
+     Download a non-YouTube video using requests.
+     (This function is retained if you need it later.)
+     """
+     response = requests.get(url, stream=True)
+     with open(dest_path, 'wb') as f:
+         for chunk in response.iter_content(chunk_size=8096):
+             f.write(chunk)
+     print(f"Video downloaded to {dest_path}")
+
+ def get_video_frames(video_path, num_frames=16, cache_dir='.cache'):
+     """
+     Extract frames and timestamps from a video file.
+     If the video_path is a URL, it will download it.
+     For local files (including uploaded videos), it processes directly.
+     Uses caching to avoid repeated processing.
+     """
+     os.makedirs(cache_dir, exist_ok=True)
+     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+     # If video_path starts with 'http', attempt to download
+     if video_path.startswith('http'):
+         video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
+         if not os.path.exists(video_file_path):
+             print("Downloading video using requests...")
+             download_video(video_path, video_file_path)
+     else:
+         # For local files (uploaded videos), use the provided path directly.
+         video_file_path = video_path
+
+     # Check for cached frames
+     frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
+     timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
+     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+         frames = np.load(frames_cache_file)
+         timestamps = np.load(timestamps_cache_file)
+         return video_file_path, frames, timestamps
+
+     # Read video using decord
+     vr = VideoReader(video_file_path, ctx=cpu(0))
+     total_frames = len(vr)
+     indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+     frames = vr.get_batch(indices).asnumpy()
+     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+     # Save to cache
+     np.save(frames_cache_file, frames)
+     np.save(timestamps_cache_file, timestamps)
+
+     return video_file_path, frames, timestamps
+
+ # --------------------------------------------------------
+ # 3. Inference Function Using Qwen 2.5 VL to Process the Video
+ # --------------------------------------------------------
+ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+     """
+     Prepares the input messages with the prompt and video metadata,
+     processes the video inputs, and runs inference through the model.
+     """
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": [
+             {"type": "text", "text": prompt},
+             {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+         ]},
+     ]
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+     fps_inputs = video_kwargs['fps']
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         fps=fps_inputs,
+         padding=True,
+         return_tensors="pt"
+     )
+     # In CPU-only mode, we use the default device (no .to('cuda'))
+     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+     return output_text[0]
+
+ # -------------------------------------------------
+ # 4. Define Sample Prompts for Users
+ # -------------------------------------------------
+ sample_prompts = [
+     "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+     "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+     "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+ ]
+
+ # -------------------------------------------------
+ # 5. Main Processing Function for the Gradio Interface
+ # -------------------------------------------------
+ def process_video(video_file, custom_prompt, sample_prompt):
+     """
+     Called when the user clicks 'Process Video'.
+     Uses the custom prompt if provided; otherwise, uses the sample prompt.
+     Processes the uploaded video file and runs inference.
+     """
+     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+     try:
+         # video_file is expected to be a local file path from the uploader.
+         video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
+     except Exception as e:
+         return f"Error processing video: {str(e)}"
+
+     try:
+         output = inference(video_path, final_prompt)
+     except Exception as e:
+         return f"Error during inference: {str(e)}"
+
+     return output
+
+ # -------------------------------------------------
+ # 6. Build the Gradio Interface
+ # -------------------------------------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
+     gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+
+     with gr.Row():
+         # Removed the source parameter here
+         video_input = gr.Video(label="Upload Video")
+     with gr.Row():
+         custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+     with gr.Row():
+         sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+
+     output_text = gr.Textbox(label="Output", lines=10)
+     run_button = gr.Button("Process Video")
+
+     run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+ if __name__ == "__main__":
+     demo.launch()
local_video_understant_app.py ADDED
@@ -0,0 +1,166 @@
+ import os
+ import hashlib
+ import requests
+ import numpy as np
+ from PIL import Image
+ import decord
+ from decord import VideoReader, cpu
+ import torch
+ from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+ from qwen_vl_utils import process_vision_info
+ import gradio as gr
+
+ # ---------------------------------------------------
+ # 1. Set Up Device: Use Apple's MPS if available, else CPU
+ # ---------------------------------------------------
+ device = "mps" if torch.backends.mps.is_available() else "cpu"
+ print(f"Using device: {device}")
+ # For MPS, we can try using float16 to reduce memory usage.
+ torch_dtype = torch.float16 if device == "mps" else torch.float32
+
+ # ---------------------------------------------------
+ # 2. Initialize the Qwen 2.5 VL Model (3B) for Local Use
+ # ---------------------------------------------------
+ model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     model_path,
+     torch_dtype=torch_dtype
+ )
+ model.to(device)
+ processor = AutoProcessor.from_pretrained(model_path)
+
+ # ---------------------------------------------------
+ # 3. Utility Functions for Video Processing
+ # ---------------------------------------------------
+ def download_video(url, dest_path):
+     """
+     Downloads a video from a URL.
+     (This function is kept here if you ever need to download via URL.)
+     """
+     response = requests.get(url, stream=True)
+     with open(dest_path, 'wb') as f:
+         for chunk in response.iter_content(chunk_size=8096):
+             f.write(chunk)
+     print(f"Video downloaded to {dest_path}")
+
+ def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
+     """
+     Extract frames and timestamps from a video file.
+     If video_path is a URL, it downloads it; otherwise it assumes a local file.
+     Caching is used to avoid re-processing.
+     """
+     os.makedirs(cache_dir, exist_ok=True)
+     video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+     # If the path starts with 'http', download the file.
+     if video_path.startswith("http"):
+         video_file_path = os.path.join(cache_dir, f"{video_hash}.mp4")
+         if not os.path.exists(video_file_path):
+             print("Downloading video using requests...")
+             download_video(video_path, video_file_path)
+     else:
+         video_file_path = video_path
+
+     frames_cache_file = os.path.join(cache_dir, f"{video_hash}_{num_frames}_frames.npy")
+     timestamps_cache_file = os.path.join(cache_dir, f"{video_hash}_{num_frames}_timestamps.npy")
+     if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+         frames = np.load(frames_cache_file)
+         timestamps = np.load(timestamps_cache_file)
+         return video_file_path, frames, timestamps
+
+     # Load video using decord
+     vr = VideoReader(video_file_path, ctx=cpu(0))
+     total_frames = len(vr)
+     indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+     frames = vr.get_batch(indices).asnumpy()
+     timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+     # Cache the frames and timestamps
+     np.save(frames_cache_file, frames)
+     np.save(timestamps_cache_file, timestamps)
+
+     return video_file_path, frames, timestamps
+
+ # ---------------------------------------------------
+ # 4. Inference Function Using Qwen 2.5 VL (3B)
+ # ---------------------------------------------------
+ def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+     """
+     Prepares the input with the prompt and video metadata,
+     processes the video inputs, and runs inference through the model.
+     """
+     messages = [
+         {"role": "system", "content": "You are a helpful assistant."},
+         {"role": "user", "content": [
+             {"type": "text", "text": prompt},
+             {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+         ]},
+     ]
+     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+     fps_inputs = video_kwargs["fps"]
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         fps=fps_inputs,
+         padding=True,
+         return_tensors="pt"
+     )
+     # Move inputs to our chosen device (MPS or CPU)
+     inputs = inputs.to(device)
+     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+     return output_text[0]
+
+ # ---------------------------------------------------
+ # 5. Define Sample Prompts
+ # ---------------------------------------------------
+ sample_prompts = [
+     "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+     "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+     "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+ ]
+
+ # ---------------------------------------------------
+ # 6. Main Processing Function for the Gradio Interface
+ # ---------------------------------------------------
+ def process_video(video_file, custom_prompt, sample_prompt):
+     """
+     Called when the user clicks 'Process Video'.
+     Uses a custom prompt (if provided) or the sample prompt.
+     Processes the uploaded video and runs inference.
+     """
+     final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+     try:
+         # Here, video_file is the local file path from the uploader.
+         video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
+     except Exception as e:
+         return f"Error processing video: {str(e)}"
+
+     try:
+         output = inference(video_path, final_prompt)
+     except Exception as e:
+         return f"Error during inference: {str(e)}"
+
+     return output
+
+ # ---------------------------------------------------
+ # 7. Build the Gradio Interface for Local Use
+ # ---------------------------------------------------
+ with gr.Blocks() as demo:
+     gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on Mac")
+     gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+     with gr.Row():
+         video_input = gr.Video(label="Upload Video")
+     with gr.Row():
+         custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+     with gr.Row():
+         sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+     output_text = gr.Textbox(label="Output", lines=10)
+     run_button = gr.Button("Process Video")
+     run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+ if __name__ == "__main__":
+     demo.launch()
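Since local_video_understant_app.py picks MPS only when PyTorch reports it as available, a quick way to check what the script will select on a given machine (an illustrative snippet, not part of the file) is:

    import torch

    # Mirrors the device selection at the top of local_video_understant_app.py.
    print("mps" if torch.backends.mps.is_available() else "cpu")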
requirements.txt CHANGED
@@ -1,20 +1,7 @@
- # Core dependencies from Qwen 2.5 VL
- gradio
- gradio_client
- qwen-vl-utils
- transformers-stream-generator==0.0.4
- torch==2.4.0
- torchvision==0.19.0
- git+https://github.com/huggingface/transformers.git
- accelerate
- av
-
- # Optional dependency (uncomment if flash attention is needed)
- # flash-attn==2.6.1
-
- # Additional dependencies for video processing and utilities
- decord #use decord for linux or other OS
- numpy
- Pillow
  requests
- pytube

+ gradio>=3.0
+ openai-whisper
+ srt
+ transformers
+ torch>=2.0.0
  requests
+ numpy
requirements_vu.txt ADDED
@@ -0,0 +1,20 @@
+ # Core dependencies from Qwen 2.5 VL
+ gradio
+ gradio_client
+ qwen-vl-utils
+ transformers-stream-generator==0.0.4
+ torch==2.4.0
+ torchvision==0.19.0
+ git+https://github.com/huggingface/transformers.git
+ accelerate
+ av
+
+ # Optional dependency (uncomment if flash attention is needed)
+ # flash-attn==2.6.1
+
+ # Additional dependencies for video processing and utilities
+ decord #use decord for linux or other OS
+ numpy
+ Pillow
+ requests
+ pytube