Commit fd13285 · First commit
Parent(s): 4e022c1

Files changed:
- app.py +171 -0
- requirements.txt +20 -0
app.py
ADDED
@@ -0,0 +1,171 @@
import os
import hashlib
import requests
import numpy as np
from PIL import Image
import decord
from decord import VideoReader, cpu
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import gradio as gr

# ----------------------------------------
# 1. Initialize the Qwen 2.5 VL Model (7B)
# ----------------------------------------
# We load the official 7B version, using flash attention optimization and bfloat16 for efficiency.
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto"  # Automatically places the model on available GPU
)
processor = AutoProcessor.from_pretrained(model_path)

# -------------------------------------------------
# 2. Define Utility Functions for Video Processing
# -------------------------------------------------

def download_video(url, dest_path):
    """
    Download the video from the given URL and save it to a destination path.
    """
    response = requests.get(url, stream=True)
    with open(dest_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8096):
            f.write(chunk)
    print(f"Video downloaded to {dest_path}")

def get_video_frames(video_path, num_frames=64, cache_dir='.cache'):
    """
    Download (if needed) and extract frames and timestamps from the video.
    - Uses caching to avoid repeated processing.
    - Utilizes decord to read video frames.
    """
    os.makedirs(cache_dir, exist_ok=True)
    video_hash = hashlib.md5(video_path.encode('utf-8')).hexdigest()

    # If the video is a URL, download it locally
    if video_path.startswith('http://') or video_path.startswith('https://'):
        video_file_path = os.path.join(cache_dir, f'{video_hash}.mp4')
        if not os.path.exists(video_file_path):
            download_video(video_path, video_file_path)
    else:
        video_file_path = video_path

    # Check if frames have been cached already
    frames_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_frames.npy')
    timestamps_cache_file = os.path.join(cache_dir, f'{video_hash}_{num_frames}_timestamps.npy')
    if os.path.exists(frames_cache_file) and os.path.exists(timestamps_cache_file):
        frames = np.load(frames_cache_file)
        timestamps = np.load(timestamps_cache_file)
        return video_file_path, frames, timestamps

    # Read video using decord
    vr = VideoReader(video_file_path, ctx=cpu(0))
    total_frames = len(vr)
    indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
    frames = vr.get_batch(indices).asnumpy()
    timestamps = np.array([vr.get_frame_timestamp(idx) for idx in indices])

    # Save the results to cache for later re-use
    np.save(frames_cache_file, frames)
    np.save(timestamps_cache_file, timestamps)

    return video_file_path, frames, timestamps

# --------------------------------------------------------
# 3. Inference Function Using Qwen 2.5 VL to Process the Video
# --------------------------------------------------------
def inference(video_path, prompt, max_new_tokens=2048, total_pixels=20480 * 28 * 28, min_pixels=16 * 28 * 28):
    """
    Prepare the input messages with the prompt and video metadata,
    process the video inputs, and run inference through the model.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": [
            {"type": "text", "text": prompt},
            {"video": video_path, "total_pixels": total_pixels, "min_pixels": min_pixels},
        ]},
    ]
    # Prepare the text with the chat template from the processor.
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Process the video information into the proper inputs.
    image_inputs, video_inputs, video_kwargs = process_vision_info([messages], return_video_kwargs=True)
    fps_inputs = video_kwargs['fps']
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        fps=fps_inputs,
        padding=True,
        return_tensors="pt"
    )
    inputs = inputs.to('cuda')

    # Generate the response using the model
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Post-process the output tokens to text.
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return output_text[0]

# -------------------------------------------------
# 4. Define Sample Prompts for Users
# -------------------------------------------------
sample_prompts = [
    "Please analyze the video and split it into chapters with timestamps and descriptive titles in the format 'mm:ss Title'.",
    "Provide a breakdown of the video's content by segment, including starting times and summaries.",
    "Segment the video into logical chapters and output the start time and a brief description for each chapter.",
]

# -------------------------------------------------
# 5. Main Processing Function for the Gradio Interface
# -------------------------------------------------
def process_video(video_url, custom_prompt, sample_prompt):
    """
    This function is called when a user clicks the 'Process Video' button.
    - It uses the custom prompt if provided; otherwise, it falls back to the selected sample prompt.
    - It then downloads and processes the video and calls the inference function.
    """
    # Choose the prompt: use the custom prompt if not empty, else use the sample
    final_prompt = custom_prompt.strip() if custom_prompt.strip() != "" else sample_prompt
    try:
        video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
    except Exception as e:
        return f"Error processing video: {str(e)}"

    try:
        output = inference(video_path, final_prompt)
    except Exception as e:
        return f"Error during inference: {str(e)}"

    return output

# -------------------------------------------------
# 6. Build the Gradio Interface
# -------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# YouTube Video Chapter Splitter using Qwen 2.5 VL (7B)")
    gr.Markdown("Enter a YouTube video URL and either type a custom prompt or select one of the sample prompts. Then click **Process Video** to generate the chapter breakdown.")

    with gr.Row():
        video_url_input = gr.Textbox(label="YouTube Video URL", placeholder="Enter YouTube video URL...", lines=1)
    with gr.Row():
        custom_prompt_input = gr.Textbox(label="Custom Prompt", placeholder="Enter custom prompt (optional)...", lines=2)
    with gr.Row():
        sample_prompt_input = gr.Dropdown(label="Sample Prompts", choices=sample_prompts, value=sample_prompts[0])

    output_text = gr.Textbox(label="Output", lines=10)
    run_button = gr.Button("Process Video")

    # When the button is clicked, run the process_video function.
    run_button.click(fn=process_video, inputs=[video_url_input, custom_prompt_input, sample_prompt_input], outputs=output_text)

# -------------------------------------------------
# 7. Launch the App
# -------------------------------------------------
if __name__ == "__main__":
    demo.launch()
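
For reference, the pipeline above can be exercised without the Gradio UI. The snippet below is a minimal sketch, not part of the commit: it assumes app.py is importable from the repo root (importing it loads the 7B model at module level, so a CUDA GPU with enough memory is required) and uses a placeholder video URL.

# Hypothetical smoke test; assumes app.py is on the import path and a CUDA GPU is available.
from app import get_video_frames, inference

video_url = "https://example.com/sample.mp4"  # placeholder direct media URL, not from this commit
video_path, frames, timestamps = get_video_frames(video_url, num_frames=64)
print(inference(video_path, "Segment the video into chapters and output 'mm:ss Title' lines."))

Note that get_video_frames expects a direct media URL or a local file path; a YouTube watch-page URL would need to be resolved to a downloadable stream first, since download_video simply saves whatever requests.get returns.
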
requirements.txt
ADDED
@@ -0,0 +1,20 @@
# Core dependencies for Qwen 2.5 VL
gradio==5.4.0
gradio_client==1.4.2
qwen-vl-utils==0.0.10
transformers-stream-generator==0.0.4
torch==2.4.0
torchvision==0.19.0
git+https://github.com/huggingface/transformers.git
accelerate
av
qwen-vl-utils

# Flash attention (required: app.py loads the model with attn_implementation="flash_attention_2")
flash-attn==2.6.1

# Additional dependencies for video processing and utilities
decord
numpy
Pillow
requests
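
A possible local install sequence (a sketch, not from the commit; on Hugging Face Spaces the requirements file is installed automatically): flash-attn generally needs torch present at build time, so installing torch first and disabling build isolation tends to be the safer order.

pip install torch==2.4.0 torchvision==0.19.0
pip install -r requirements.txt --no-build-isolation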