Spaces:

whyumesh
/

fusion

Configuration error

File size: 7,152 Bytes

import torch
from transformers import (
    Qwen2VLForConditionalGeneration, 
    AutoProcessor,
    AutoModelForCausalLM, 
    AutoTokenizer
)
from qwen_vl_utils import process_vision_info
from PIL import Image
import cv2
import numpy as np
import gradio as gr
import spaces

# Load both models and their processors/tokenizers
def load_models(
    vision_model_id="Qwen/Qwen2-VL-2B-Instruct",
    code_model_id="Qwen/Qwen2.5-Coder-1.5B-Instruct",
    torch_dtype=torch.float16,
):
    """Load the vision model and the code model with their pre-processors.

    Each checkpoint id is used for both the model and its matching
    processor/tokenizer, so the pair can never drift apart.

    Args:
        vision_model_id: Checkpoint id or path passed to
            ``Qwen2VLForConditionalGeneration`` and ``AutoProcessor``.
        code_model_id: Checkpoint id or path passed to
            ``AutoModelForCausalLM`` and ``AutoTokenizer``.
        torch_dtype: dtype forwarded to both ``from_pretrained`` calls.

    Returns:
        Tuple ``(vision_model, vision_processor, code_model, code_tokenizer)``.
    """
    # Vision model
    vision_model = Qwen2VLForConditionalGeneration.from_pretrained(
        vision_model_id,
        torch_dtype=torch_dtype,
        device_map="auto",
    )
    vision_processor = AutoProcessor.from_pretrained(vision_model_id)

    # Code model
    code_model = AutoModelForCausalLM.from_pretrained(
        code_model_id,
        torch_dtype=torch_dtype,
        device_map="auto",
    )
    code_tokenizer = AutoTokenizer.from_pretrained(code_model_id)

    return vision_model, vision_processor, code_model, code_tokenizer

# Load both models once at import time; the processing functions below use
# these module-level objects directly.
vision_model, vision_processor, code_model, code_tokenizer = load_models()

# Instruction text for the vision model. It is prepended to the text part of
# the user message (not sent as a separate system message). The opening words
# of the refusal sentence in step 1 are what the image pipeline searches for
# to detect flagged content, so keep that wording in sync with the check.
VISION_SYSTEM_PROMPT = """You are an AI assistant specialized in analyzing images and videos of code editors. Your primary task is to:

1. FIRST AND MOST IMPORTANTLY: Check if the image contains any inappropriate content such as:
   - Harassment or bullying
   - Hate speech or discriminatory content
   - Sexually explicit material
   - Dangerous or harmful content
   If any such content is detected, respond ONLY with: "I apologize, but I cannot process this content as it appears to contain [type of inappropriate content]. Please provide only appropriate code-related images."

2. If the content is appropriate, then:
   - Extract and describe any code snippets visible in the image
   - Identify any error messages, warnings, or highlighting that indicates bugs
   - Describe the programming language and context if visible

Be thorough and accurate in your description of appropriate content, as this will be used to fix the code."""

# System message for the code model; it is sent with role "system" together
# with the vision model's description when asking for a fix.
CODE_SYSTEM_PROMPT = """You are an expert code debugging assistant. Your tasks in order are:

1. Check if the input description contains any flags for inappropriate content.
   If it does, respond ONLY with: "I apologize, but I cannot process this request as the original content was flagged as inappropriate."

2. If the content is appropriate, then based on the description of code and errors provided:
   - Identify the bugs and issues in the code
   - Provide a corrected version of the code
   - Explain the fixes made and why they resolve the issues
   - Provide the output in a well-structured format removing all unnecessary information

Be thorough in your explanation and ensure the corrected code is complete and functional."""

# Opening words of the refusal sentence that VISION_SYSTEM_PROMPT asks the
# vision model to emit for inappropriate content.
_VISION_REFUSAL_MARKER = "I apologize, but I cannot process this content"


def _describe_image(image):
    """Ask the vision model to describe the code and errors shown in *image*."""
    vision_messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": f"{VISION_SYSTEM_PROMPT}\n\nDescribe the code and any errors you see in this image."},
            ],
        }
    ]

    vision_text = vision_processor.apply_chat_template(
        vision_messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    image_inputs, video_inputs = process_vision_info(vision_messages)

    vision_inputs = vision_processor(
        text=[vision_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(vision_model.device)

    with torch.no_grad():
        vision_output_ids = vision_model.generate(**vision_inputs, max_new_tokens=512)

    # generate() returns prompt + completion; drop the prompt tokens so only
    # the newly generated text is decoded.
    vision_output_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(vision_inputs.input_ids, vision_output_ids)
    ]
    return vision_processor.batch_decode(
        vision_output_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]


def _suggest_fix(vision_description):
    """Ask the code model to analyse and fix the code described in *vision_description*."""
    code_messages = [
        {"role": "system", "content": CODE_SYSTEM_PROMPT},
        {"role": "user", "content": f"Here's a description of code with errors:\n\n{vision_description}\n\nPlease analyze and fix the code."},
    ]

    code_text = code_tokenizer.apply_chat_template(
        code_messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    code_inputs = code_tokenizer([code_text], return_tensors="pt").to(code_model.device)

    with torch.no_grad():
        code_output_ids = code_model.generate(
            **code_inputs,
            max_new_tokens=1024,
            # temperature/top_p only apply in sampling mode; request it
            # explicitly instead of relying on the checkpoint's default
            # generation config.
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
        )

    # Same prompt-stripping as for the vision model output.
    code_output_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(code_inputs.input_ids, code_output_ids)
    ]
    return code_tokenizer.batch_decode(
        code_output_trimmed,
        skip_special_tokens=True,
    )[0]


def process_image_for_code(image):
    """Describe the code in an image, then ask the code model to fix it.

    Args:
        image: Image passed to the vision model as the ``"image"`` entry of
            the chat message (the callers in this file pass PIL images).

    Returns:
        Tuple ``(vision_description, fixed_code_response)`` of strings. When
        the vision model's answer contains the refusal marker, the second
        element is a fixed notice and the code model is not run.
    """
    # First, process with vision model
    vision_description = _describe_image(image)

    # Check if vision model flagged inappropriate content
    if _VISION_REFUSAL_MARKER in vision_description:
        return vision_description, "No code analysis provided due to inappropriate content."

    # Then, use code model to fix the code
    return vision_description, _suggest_fix(vision_description)

def process_video_for_code(video_path, max_frames=16, frame_interval=30):
    """Analyse the first sampled frame of a video with the image pipeline.

    Frames are sampled every ``frame_interval`` decoded frames, starting with
    the very first one. Only the first sampled frame is handed to
    ``process_image_for_code``.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        max_frames: Upper bound on sampled frames. Values below 1 mean no
            frame is sampled and the "no frames" message is returned.
        frame_interval: Sample one frame out of every ``frame_interval``
            decoded frames; must be at least 1.

    Returns:
        Tuple of two strings: the result of ``process_image_for_code`` for
        the first sampled frame, or a pair of explanatory messages when no
        frame could be read.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    if frame_interval < 1:
        # Previously 0 surfaced as a bare ZeroDivisionError from the modulo.
        raise ValueError("frame_interval must be a positive integer")

    # Only frames[0] is analysed below, so stop decoding as soon as one frame
    # has been sampled instead of reading and converting up to max_frames.
    frames_needed = min(max_frames, 1)

    frames = []
    frame_count = 0
    cap = cv2.VideoCapture(video_path)
    try:
        while len(frames) < frames_needed:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # OpenCV yields BGR arrays; convert to RGB before wrapping in PIL.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(rgb_frame))

            frame_count += 1
    finally:
        # Release the capture handle even if a conversion above raises.
        cap.release()

    if not frames:
        return "No frames could be extracted from the video.", "No code could be analyzed."
    return process_image_for_code(frames[0])

@spaces.GPU
def process_content(content):
    """Dispatch an uploaded file to the image or video pipeline.

    Args:
        content: The uploaded file, or ``None`` when nothing was uploaded.
            Either a path string or an object exposing the path as ``.name``
            is accepted.

    Returns:
        Tuple ``(vision_output, code_output)`` of strings for the two output
        panels. On missing input, unsupported file type or any processing
        error, the first element is a user-facing message and the second is
        an empty string.
    """
    if content is None:
        return "Please upload an image or video file of code with errors.", ""

    try:
        # Uploads may arrive as a plain path or as a file-like wrapper whose
        # path is stored in `.name`; support both instead of assuming `.name`.
        file_path = getattr(content, "name", content)
        lowered_path = str(file_path).lower()

        if lowered_path.endswith(('.png', '.jpg', '.jpeg')):
            # The context manager closes the underlying file once the image
            # has been processed.
            with Image.open(file_path) as image:
                vision_output, code_output = process_image_for_code(image)
        elif lowered_path.endswith(('.mp4', '.avi', '.mov')):
            vision_output, code_output = process_video_for_code(file_path)
        else:
            return "Unsupported file type. Please provide an image or video file.", ""
    except Exception as e:
        # UI boundary: report the failure in the output panel instead of
        # letting the request crash.
        return f"An error occurred while processing the file: {str(e)}", ""

    return vision_output, code_output

# Gradio interface: a single file upload feeding two result panels.
_upload_input = gr.File(label="Upload Image or Video of Code with Errors")
_result_panels = [
    gr.Textbox(label="Vision Model Output (Code Description)"),
    gr.Code(label="Fixed Code", language="python"),
]

iface = gr.Interface(
    fn=process_content,
    inputs=_upload_input,
    outputs=_result_panels,
    title="Vision Code Debugger",
    description="Upload an image or video of code with errors for AI analysis and fixes. Note: Only appropriate code-related content will be processed.",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()