Commit 6621c82
Parent(s): 9d2876b
Change application flow.

Files changed:
- app.py +173 -145
- app_video_understant.py +165 -0
- local_video_understant_app.py +166 -0
- requirements.txt +6 -19
- requirements_vu.txt +20 -0
app.py CHANGED
@@ -1,165 +1,193 @@

Old version (removed lines are prefixed with "-"; "- [...]" marks removed lines whose text is not shown in the diff view):

import os
import hashlib
import requests
import numpy as np
-from PIL import Image
-import decord
-from decord import VideoReader, cpu
-import torch
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
-from qwen_vl_utils import process_vision_info
import gradio as gr
- [...]
-# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
-# ----------------------------------------
-model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
-model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    model_path,
-    torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
-    # Removed attn_implementation and device_map for CPU-only deployment
-)
-processor = AutoProcessor.from_pretrained(model_path)
-
-# -------------------------------------------------
-# 2. Define Utility Functions for Video Processing
-# -------------------------------------------------
-def download_video(url, dest_path):
-    """
-    Download a non-YouTube video using requests.
-    (This function is retained if you need it later.)
-    """
-    response = requests.get(url, stream=True)
-    with open(dest_path, 'wb') as f:
-        for chunk in response.iter_content(chunk_size=8096):
-            f.write(chunk)
-    print(f"Video downloaded to {dest_path}")
-
-def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
-    """
-    Extract frames and timestamps from a video file.
-    If the video_path is a URL, it will download it.
-    For local files (including uploaded videos), it processes directly.
-    Uses caching to avoid repeated processing.
-    """
-    os.makedirs(cache_dir, exist_ok=True)
-    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
-
-    # If video_path starts with 'http', attempt to download
-    if video_path.startswith('http'):
-        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
-        if not os.path.exists(video_file_path):
-            print("Downloading video using requests...")
-            download_video(video_path, video_file_path)
-    else:
-        # For local files (uploaded videos), use the provided path directly.
-        video_file_path = video_path

- [...]
-    return [...]
- [...]
-        {"role": "user", "content": [
-            {"type": "text", "text": prompt},
-            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
-        ]},
-    ]
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
-    fps_inputs = video_kwargs['fps']
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        fps=fps_inputs,
-        padding=True,
-        return_tensors="pt"
-    )
-    # In CPU-only mode, we use the default device (no .to('cuda'))
-    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
-    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
-    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
-    return output_text[0]

- [...]
-# 5. Main Processing Function for the Gradio Interface
-# -------------------------------------------------
-def process_video(video_file, custom_prompt, sample_prompt):
    """
- [...]
-    Processes the uploaded video file and runs inference.
    """
- [...]
    try:
- [...]
    except Exception as e:
-        return f"Error during [...]
- [...]

if __name__ == "__main__":
- [...]

New version (added lines are prefixed with "+"):

import os
+import datetime
import hashlib
import requests
import numpy as np
import gradio as gr
+import whisper
+import srt
+import torch

+LANGUAGE_OPTIONS = {
+    "Afrikaans": "af",
+    "Arabic": "ar",
+    "Azerbaijani": "az",
+    "Belarusian": "be",
+    "Bulgarian": "bg",
+    "Bengali": "bn",
+    "Catalan": "ca",
+    "Czech": "cs",
+    "Welsh": "cy",
+    "Danish": "da",
+    "German": "de",
+    "Greek": "el",
+    "English": "en",
+    "Spanish": "es",
+    "Estonian": "et",
+    "Persian": "fa",
+    "Finnish": "fi",
+    "French": "fr",
+    "Irish": "ga",
+    "Galician": "gl",
+    "Gujarati": "gu",
+    "Hebrew": "he",
+    "Hindi": "hi",
+    "Croatian": "hr",
+    "Hungarian": "hu",
+    "Armenian": "hy",
+    "Indonesian": "id",
+    "Icelandic": "is",
+    "Italian": "it",
+    "Japanese": "ja",
+    "Georgian": "ka",
+    "Kazakh": "kk",
+    "Khmer": "km",
+    "Kannada": "kn",
+    "Korean": "ko",
+    "Lithuanian": "lt",
+    "Latvian": "lv",
+    "Macedonian": "mk",
+    "Malayalam": "ml",
+    "Mongolian": "mn",
+    "Marathi": "mr",
+    "Malay": "ms",
+    "Maltese": "mt",
+    "Nepali": "ne",
+    "Dutch": "nl",
+    "Norwegian": "no",
+    "Odia": "or",
+    "Punjabi": "pa",
+    "Polish": "pl",
+    "Portuguese": "pt",
+    "Romanian": "ro",
+    "Russian": "ru",
+    "Sinhala": "si",
+    "Slovak": "sk",
+    "Slovenian": "sl",
+    "Albanian": "sq",
+    "Serbian": "sr",
+    "Swedish": "sv",
+    "Swahili": "sw",
+    "Tamil": "ta",
+    "Telugu": "te",
+    "Thai": "th",
+    "Turkish": "tr",
+    "Ukrainian": "uk",
+    "Urdu": "ur",
+    "Vietnamese": "vi",
+    "Chinese": "zh"
+}

+def transcribe_audio(audio_file_path, model_size='base', language="en"):
+    model = whisper.load_model(model_size)
+    model.to("cpu")
+    result = model.transcribe(audio_file_path, language=language)
+    transcription = result["text"]
+    segments = result["segments"]

+    try:
+        from whisper.utils import format_srt
+        srt_text = format_srt(segments)
+    except Exception:
+        srt_text = generate_srt(segments)

+    return transcription, srt_text, segments

+def generate_srt(segments):
+    import datetime
+    import srt
+    subtitles = []
+    for i, seg in enumerate(segments):
+        start_td = datetime.timedelta(seconds=seg["start"])
+        end_td = datetime.timedelta(seconds=seg["end"])
+        subtitle = srt.Subtitle(index=i+1, start=start_td, end=end_td, content=seg["text"])
+        subtitles.append(subtitle)
+    return srt.compose(subtitles)

+def prepare_chapter_prompt(srt_text):
+    system_prompt = (
+        "You are a highly skilled video content segmentation and optimization expert. "
+        "Your task is to analyze a transcript of a YouTube video provided in SRT format and produce engaging and concise chapter headers. "
+        "Each chapter header must be on its own line in the exact format: 'mm:ss Chapter Title'.\n\n"
+        "- 'mm:ss' represents the starting time of the chapter (minutes and seconds).\n"
+        "- 'Chapter Title' must be a catchy, audience-friendly title that summarizes the key idea or transition at that point in the video.\n\n"
+        "IMPORTANT: Although these instructions are in English, please ensure that your output is in the same language as the provided SRT transcript."
+    )
+    user_prompt = (
+        "Below is the transcript of a YouTube video in SRT format:\n\n"
+        "```\n"
+        f"{srt_text}\n"
+        "```\n\n"
+        "Please generate only the chapter breakdown using the guidelines above. "
+        "Each chapter header should be formatted as:\n"
+        "mm:ss Chapter Title"
+    )
+    return system_prompt + "\n\n" + user_prompt

+def format_prompt_html(prompt):
    """
+    Displays the prompt in a read-only textarea using Gradio's color variables for background and text.
+    Includes a 'Copy Prompt' button (blue) and a short 'Prompt Copied!' confirmation message.
    """
+    html_content = f"""
+    <div style="display: flex; flex-direction: column; gap: 10px; margin-top: 10px;">
+      <textarea id="prompt_text" rows="10"
+                style="width: 100%; resize: vertical;
+                       background-color: var(--block-background-fill);
+                       color: var(--block-text-color);
+                       border: 1px solid var(--block-border-color);
+                       border-radius: 4px;"
+                readonly>{prompt}</textarea>
+      <button
+        style="width: 150px; padding: 8px;
+               background-color: #007bff;
+               color: white;
+               border: none;
+               border-radius: 4px;
+               cursor: pointer;"
+        onclick="
+          navigator.clipboard.writeText(document.getElementById('prompt_text').value);
+          const copiedMsg = document.getElementById('copied_msg');
+          copiedMsg.style.display = 'inline';
+          setTimeout(() => copiedMsg.style.display = 'none', 2000);
+        ">
+        Copy Prompt
+      </button>
+      <span id="copied_msg" style="display: none; color: var(--primary-text-color); font-weight: bold;">Prompt Copied!</span>
+    </div>
+    """
+    return html_content
+
+def process_audio(audio, language_name):
+    lang_code = LANGUAGE_OPTIONS.get(language_name, "en")
    try:
+        transcription, srt_text, segments = transcribe_audio(audio, model_size='base', language=lang_code)
    except Exception as e:
+        return f"Error during transcription: {str(e)}", "", ""

+    chapter_prompt = prepare_chapter_prompt(srt_text)
+    prompt_html = format_prompt_html(chapter_prompt)
+    return transcription, srt_text, prompt_html

+iface = gr.Interface(
+    fn=process_audio,
+    inputs=[
+        gr.Audio(type="filepath", label="Upload Audio"),
+        gr.Dropdown(choices=list(LANGUAGE_OPTIONS.keys()), label="Audio Language", value="English")
+    ],
+    outputs=[
+        gr.Textbox(label="Full Transcription", lines=10),
+        gr.Textbox(label="SRT File Content", lines=10),
+        gr.HTML(label="Prepared Chapter Prompt (Copy & Paste into ChatGPT)")
+    ],
+    title="Video Chapter Splitter from Audio (MP3)",
+    description=(
+        "Upload an audio file (e.g., MP3) of your YouTube video and select the audio language. "
+        "The app will transcribe the audio using Whisper, generate subtitles in SRT format, "
+        "and prepare a single, complete prompt that instructs ChatGPT/GPT-4 to generate a chapter breakdown in the format 'mm:ss Chapter Title'.\n\n"
+        "Click the 'Copy Prompt' button to copy the entire prompt, and a brief 'Prompt Copied!' message will appear."
+    )
+)

if __name__ == "__main__":
+    iface.launch()
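The fallback path in the new app.py is worth a closer look: whisper.utils may not provide a format_srt helper in a given openai-whisper release (hence the try/except in transcribe_audio), in which case generate_srt builds the SRT text with the srt package. Below is a minimal, self-contained sketch (not part of the commit) of that path using made-up segment values rather than real transcription output, plus the mm:ss conversion the chapter prompt asks the LLM to perform:

import datetime
import srt

# Hypothetical Whisper-style segments, for illustration only.
fake_segments = [
    {"start": 0.0, "end": 4.2, "text": "Welcome to the channel."},
    {"start": 75.5, "end": 80.0, "text": "Now the main topic."},
]

subtitles = [
    srt.Subtitle(index=i + 1,
                 start=datetime.timedelta(seconds=seg["start"]),
                 end=datetime.timedelta(seconds=seg["end"]),
                 content=seg["text"])
    for i, seg in enumerate(fake_segments)
]
print(srt.compose(subtitles))  # numbered blocks with "00:00:00,000 --> 00:00:04,200" style timings

# A chapter anchored at the second segment would start at "01:15" in the 'mm:ss Chapter Title' format.
start = fake_segments[1]["start"]
print(f"{int(start // 60):02d}:{int(start % 60):02d} Main Topic")  # -> 01:15 Main Topic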
app_video_understant.py ADDED
@@ -0,0 +1,165 @@

+import os
+import hashlib
+import requests
+import numpy as np
+from PIL import Image
+import decord
+from decord import VideoReader, cpu
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import gradio as gr
+# Removed pytube since we no longer download from YouTube
+
+# ----------------------------------------
+# 1. Initialize the Qwen 2.5 VL Model (3B) for CPU-only
+# ----------------------------------------
+model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch.float16  # use float16 on CPU if desired, else use float32
+    # Removed attn_implementation and device_map for CPU-only deployment
+)
+processor = AutoProcessor.from_pretrained(model_path)
+
+# -------------------------------------------------
+# 2. Define Utility Functions for Video Processing
+# -------------------------------------------------
+def download_video(url, dest_path):
+    """
+    Download a non-YouTube video using requests.
+    (This function is retained if you need it later.)
+    """
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8096):
+            f.write(chunk)
+    print(f"Video downloaded to {dest_path}")
+
+def get_video_frames(video_path, num_frames=16, cache_dir='.cache'):
+    """
+    Extract frames and timestamps from a video file.
+    If the video_path is a URL, it will download it.
+    For local files (including uploaded videos), it processes directly.
+    Uses caching to avoid repeated processing.
+    """
+    os.makedirs(cache_dir, exist_ok=True)
+    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+    # If video_path starts with 'http', attempt to download
+    if video_path.startswith('http'):
+        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
+        if not os.path.exists(video_file_path):
+            print("Downloading video using requests...")
+            download_video(video_path, video_file_path)
+    else:
+        # For local files (uploaded videos), use the provided path directly.
+        video_file_path = video_path
+
+    # Check for cached frames
+    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
+    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
+    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+        frames = np.load(frames_cache_file)
+        timestamps = np.load(timestamps_cache_file)
+        return video_file_path, frames, timestamps
+
+    # Read video using decord
+    vr = VideoReader(video_file_path, ctx=cpu(0))
+    total_frames = len(vr)
+    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+    frames = vr.get_batch(indices).asnumpy()
+    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+    # Save to cache
+    np.save(frames_cache_file, frames)
+    np.save(timestamps_cache_file, timestamps)
+
+    return video_file_path, frames, timestamps
+
+# --------------------------------------------------------
+# 3. Inference Function Using Qwen 2.5 VL to Process the Video
+# --------------------------------------------------------
+def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+    """
+    Prepares the input messages with the prompt and video metadata,
+    processes the video inputs, and runs inference through the model.
+    """
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {"type": "text", "text": prompt},
+            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+        ]},
+    ]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+    fps_inputs = video_kwargs['fps']
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        fps=fps_inputs,
+        padding=True,
+        return_tensors="pt"
+    )
+    # In CPU-only mode, we use the default device (no .to('cuda'))
+    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return output_text[0]
+
+# -------------------------------------------------
+# 4. Define Sample Prompts for Users
+# -------------------------------------------------
+sample_prompts = [
+    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+]
+
+# -------------------------------------------------
+# 5. Main Processing Function for the Gradio Interface
+# -------------------------------------------------
+def process_video(video_file, custom_prompt, sample_prompt):
+    """
+    Called when the user clicks 'Process Video'.
+    Uses the custom prompt if provided; otherwise, uses the sample prompt.
+    Processes the uploaded video file and runs inference.
+    """
+    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+    try:
+        # video_file is expected to be a local file path from the uploader.
+        video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
+    except Exception as e:
+        return f"Error processing video: {str(e)}"
+
+    try:
+        output = inference(video_path, final_prompt)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
+
+    return output
+
+# -------------------------------------------------
+# 6. Build the Gradio Interface
+# -------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on CPU")
+    gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+
+    with gr.Row():
+        # Removed the source parameter here
+        video_input = gr.Video(label="Upload Video")
+    with gr.Row():
+        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+    with gr.Row():
+        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+
+    output_text = gr.Textbox(label="Output", lines=10)
+    run_button = gr.Button("Process Video")
+
+    run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+if __name__ == "__main__":
+    demo.launch()
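For reference, a small usage sketch (not part of the commit) of the frame-sampling helper above; it assumes the imports and functions from app_video_understant.py and a placeholder file name. decord's get_frame_timestamp returns a (start, end) pair in seconds for each sampled frame, which is what the cached timestamps array holds. Note that process_video calls this helper with num_frames=64, overriding the default of 16.

# "sample_clip.mp4" is a hypothetical path; any local video readable by decord works.
path, frames, timestamps = get_video_frames("sample_clip.mp4", num_frames=16)
print(frames.shape)                # (16, height, width, 3) uint8 frames sampled evenly across the clip
for start_s, end_s in timestamps:  # one (start, end) second pair per sampled frame
    print(f"sampled frame near {int(start_s // 60):02d}:{int(start_s % 60):02d}")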
local_video_understant_app.py ADDED
@@ -0,0 +1,166 @@

+import os
+import hashlib
+import requests
+import numpy as np
+from PIL import Image
+import decord
+from decord import VideoReader, cpu
+import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import gradio as gr
+
+# ---------------------------------------------------
+# 1. Set Up Device: Use Apple's MPS if available, else CPU
+# ---------------------------------------------------
+device = "mps" if torch.backends.mps.is_available() else "cpu"
+print(f"Using device: {device}")
+# For MPS, we can try using float16 to reduce memory usage.
+torch_dtype = torch.float16 if device == "mps" else torch.float32
+
+# ---------------------------------------------------
+# 2. Initialize the Qwen 2.5 VL Model (3B) for Local Use
+# ---------------------------------------------------
+model_path = "Qwen/Qwen2.5-VL-3B-Instruct"
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model_path,
+    torch_dtype=torch_dtype
+)
+model.to(device)
+processor = AutoProcessor.from_pretrained(model_path)
+
+# ---------------------------------------------------
+# 3. Utility Functions for Video Processing
+# ---------------------------------------------------
+def download_video(url, dest_path):
+    """
+    Downloads a video from a URL.
+    (This function is kept here if you ever need to download via URL.)
+    """
+    response = requests.get(url, stream=True)
+    with open(dest_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8096):
+            f.write(chunk)
+    print(f"Video downloaded to {dest_path}")
+
+def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
+    """
+    Extract frames and timestamps from a video file.
+    If video_path is a URL, it downloads it; otherwise it assumes a local file.
+    Caching is used to avoid re-processing.
+    """
+    os.makedirs(cache_dir, exist_ok=True)
+    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()
+
+    # If the path starts with 'http', download the file.
+    if video_path.startswith("http"):
+        video_file_path = os.path.join(cache_dir, f"{video_hash}.mp4")
+        if not os.path.exists(video_file_path):
+            print("Downloading video using requests...")
+            download_video(video_path, video_file_path)
+    else:
+        video_file_path = video_path
+
+    frames_cache_file = os.path.join(cache_dir, f"{video_hash}_{num_frames}_frames.npy")
+    timestamps_cache_file = os.path.join(cache_dir, f"{video_hash}_{num_frames}_timestamps.npy")
+    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
+        frames = np.load(frames_cache_file)
+        timestamps = np.load(timestamps_cache_file)
+        return video_file_path, frames, timestamps
+
+    # Load video using decord
+    vr = VideoReader(video_file_path, ctx=cpu(0))
+    total_frames = len(vr)
+    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
+    frames = vr.get_batch(indices).asnumpy()
+    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])
+
+    # Cache the frames and timestamps
+    np.save(frames_cache_file, frames)
+    np.save(timestamps_cache_file, timestamps)
+
+    return video_file_path, frames, timestamps
+
+# ---------------------------------------------------
+# 4. Inference Function Using Qwen 2.5 VL (3B)
+# ---------------------------------------------------
+def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
+    """
+    Prepares the input with the prompt and video metadata,
+    processes the video inputs, and runs inference through the model.
+    """
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": [
+            {"type": "text", "text": prompt},
+            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
+        ]},
+    ]
+    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
+    fps_inputs = video_kwargs["fps"]
+    inputs = processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        fps=fps_inputs,
+        padding=True,
+        return_tensors="pt"
+    )
+    # Move inputs to our chosen device (MPS or CPU)
+    inputs = inputs.to(device)
+    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
+    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
+    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+    return output_text[0]
+
+# ---------------------------------------------------
+# 5. Define Sample Prompts
+# ---------------------------------------------------
+sample_prompts = [
+    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
+    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
+    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
+]
+
+# ---------------------------------------------------
+# 6. Main Processing Function for the Gradio Interface
+# ---------------------------------------------------
+def process_video(video_file, custom_prompt, sample_prompt):
+    """
+    Called when the user clicks 'Process Video'.
+    Uses a custom prompt (if provided) or the sample prompt.
+    Processes the uploaded video and runs inference.
+    """
+    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
+    try:
+        # Here, video_file is the local file path from the uploader.
+        video_path, frames, timestamps = get_video_frames(video_file, num_frames=64)
+    except Exception as e:
+        return f"Error processing video: {str(e)}"
+
+    try:
+        output = inference(video_path, final_prompt)
+    except Exception as e:
+        return f"Error during inference: {str(e)}"
+
+    return output
+
+# ---------------------------------------------------
+# 7. Build the Gradio Interface for Local Use
+# ---------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# Video Chapter Splitter using Qwen 2.5 VL (3B) on Mac")
+    gr.Markdown("Upload a video file and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")
+    with gr.Row():
+        video_input = gr.Video(label="Upload Video")
+    with gr.Row():
+        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
+    with gr.Row():
+        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])
+    output_text = gr.Textbox(label="Output", lines=10)
+    run_button = gr.Button("Process Video")
+    run_button.click(fn=process_video, inputs=[video_input, custom_prompt_input, sample_prompt_input], outputs=output_text)
+
+if __name__ == "__main__":
+    demo.launch()
requirements.txt CHANGED
@@ -1,20 +1,7 @@

Old version (removed lines prefixed with "-"; "- [...]" marks elided removed lines):

- [...]
-torch==2.4.0
-torchvision==0.19.0
-git+https://github.com/huggingface/transformers.git
-accelerate
-av
-
-# Optional dependency (uncomment if flash attention is needed)
-# flash-attn==2.6.1
-
-# Additional dependencies for video processing and utilities
-decord #use decord for linux or other OS
-numpy
-Pillow
requests
-

New version (added lines prefixed with "+"):

+gradio>=3.0
+openai-whisper
+srt
+transformers
+torch>=2.0.0
requests
+numpy
requirements_vu.txt ADDED
@@ -0,0 +1,20 @@

+# Core dependencies from Qwen 2.5 VL
+gradio
+gradio_client
+qwen-vl-utils
+transformers-stream-generator==0.0.4
+torch==2.4.0
+torchvision==0.19.0
+git+https://github.com/huggingface/transformers.git
+accelerate
+av
+
+# Optional dependency (uncomment if flash attention is needed)
+# flash-attn==2.6.1
+
+# Additional dependencies for video processing and utilities
+decord #use decord for linux or other OS
+numpy
+Pillow
+requests
+pytube