import contextlib
import subprocess
import time
from typing import Iterator, Callable

import gradio as gr
import spaces
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Mapping user-friendly names to HF model IDs
MODEL_NAMES = {
    "Qwen2.5-VL-7B-Instruct": "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen2.5-VL-3B-Instruct": "Qwen/Qwen2.5-VL-3B-Instruct",
}


@contextlib.contextmanager
def measure_time() -> Iterator[Callable[[], float]]:
    """
    A context manager for measuring execution time (in seconds) within its code block.

    usage:
        with code_timer() as timer:
            # Code snippet to be timed
        print(f"Code took: {timer()} seconds")
    """
    start_time = end_time = time.perf_counter()
    yield lambda: end_time - start_time
    end_time = time.perf_counter()


@spaces.GPU(duration=300)
def run_inference(model_key, input_type, text, image, video, fps, system_prompt, add_vision_id):
    """
    Load the selected Qwen2.5-VL model and run inference on text, image, or video.
    """
    model_id = MODEL_NAMES[model_key]
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_id)

    # Text-only inference
    if input_type == "text":
        inputs = processor(
            text=text,
            return_tensors="pt",
            padding=True
        )
        inputs = inputs.to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=512)
        return processor.batch_decode(outputs, skip_special_tokens=True)[0]

    # Multimodal inference (image or video)
    content = []
    if input_type == "image" and image:
        for img_path in image:
            content.append({"type": "image", "image": img_path})
    elif input_type == "video" and video:
        # Ensure file URI for local files
        video_src = video if str(video).startswith("file://") else f"file://{video}"
        content.append({"type": "video", "video": video_src, "fps": fps})
    content.append({"type": "text", "text": text or ""})
    msg = [
        {"role": "system", "content": system_prompt},
        {"role": "user",   "content": content}
    ]

    # Prepare inputs for model with video kwargs
    text_prompt = processor.apply_chat_template(
        msg,
        tokenize=False,
        add_generation_prompt=True,
        add_vision_id=add_vision_id
    )
    image_inputs, video_inputs, video_kwargs = process_vision_info(msg, return_video_kwargs=True)
    inputs = processor(
        text=[text_prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
        **video_kwargs
    )
    inputs = inputs.to(model.device)

    with measure_time() as timer:
        gen_ids = model.generate(**inputs, max_new_tokens=512)
        # Trim the prompt tokens
        trimmed = [out_ids[len(inp_ids):] for inp_ids, out_ids in zip(inputs.input_ids, gen_ids)]
        result = processor.batch_decode(trimmed, skip_special_tokens=True)[0]

    gr.Info(f"Finished in {timer():.2f}s", title="Success", duration=5)  # green-style info toast :contentReference[oaicite:0]{index=0}
    return result


# Build Gradio interface
demo = gr.Blocks()
with demo:
    gr.Markdown("# Qwen2.5-VL Multimodal Demo")
    model_select = gr.Dropdown(list(MODEL_NAMES.keys()), label="Select Model")
    input_type = gr.Radio(["text", "image", "video"], label="Input Type")
    system_prompt_input = gr.Textbox(
        lines=2,
        placeholder="System prompt…",
        value="You are a helpful assistant.",
        label="System Prompt"
   )
    vision_id_checkbox = gr.Checkbox(
        label="Add vision ID",
        value=False
   )
    text_input = gr.Textbox(
        lines=3,
        placeholder="Enter text ...",
        visible=True
    )
    image_input = gr.File(
        file_count="multiple",
        file_types=["image"],
        label="Upload Images",
        visible=False
    )
    video_input = gr.Video(visible=False)
    fps_input = gr.Number(
      value=2.0,
      label="FPS",
      visible=False
    )
    output = gr.Textbox(label="Output")

    # Show/hide inputs based on selection
    def update_inputs(choice):
        return (
            gr.update(visible=True),
            gr.update(visible=(choice == "image")),
            gr.update(visible=(choice == "video")),
            gr.update(visible=(choice == "video"))
        )

    input_type.change(update_inputs, input_type, [text_input, image_input, video_input, fps_input])
    run_btn = gr.Button("Generate")
    run_btn.click(
        run_inference,
        [
                model_select,
                input_type,
                text_input,
                image_input,
                video_input,
                fps_input,
                system_prompt_input,
                vision_id_checkbox
            ],
        output
    )

# Launch the app
if __name__ == "__main__":
    demo.launch()