import os

import gradio as gr
import torch
import ftfy  # noqa: F401 -- intentionally unused import kept from the original requirements
import spaces
from diffusers import DiffusionPipeline
from diffusers.utils import export_to_video

# Read token and optional model override from environment
token = os.environ.get("HUGGINGFACE_TOKEN")
if not token:
    raise ValueError("Environment variable HUGGINGFACE_TOKEN is not set.")

# Use the Diffusers-ready model repository by default
model_id = os.environ.get("WAN_MODEL_ID", "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers")


@spaces.GPU  # GPU is only allocated while this function runs
def generate_video(image, prompt, num_frames=16, steps=25, guidance_scale=7.5):
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load the pipeline inside the GPU-allocated function
    pipe = DiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        token=token,
    ).to(device)
    pipe.enable_attention_slicing()

    # Generate the video frames
    output = pipe(
        prompt=prompt,
        image=image,
        num_inference_steps=steps,
        guidance_scale=guidance_scale,
        num_frames=num_frames,
    )

    # Export the first (and only) generated clip to an MP4 file so gr.Video can display it
    return export_to_video(output.frames[0], "output.mp4", fps=16)


# Gradio UI
def main():
    with gr.Blocks() as demo:
        gr.Markdown("# Wan2.1 Image-to-Video Demo (ZeroGPU Edition)")
        with gr.Row():
            img_in = gr.Image(type="pil", label="Input Image")
            txt_p = gr.Textbox(label="Prompt")
        btn = gr.Button("Generate Video")
        out = gr.Video(label="Generated Video")
        btn.click(fn=generate_video, inputs=[img_in, txt_p], outputs=out)
    return demo


if __name__ == "__main__":
    main().launch()
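
# --- Illustrative usage sketch (not part of the app) ----------------------
# A minimal, hedged example of calling generate_video() directly, e.g. for a
# quick smoke test outside the Gradio UI. The image path "example.jpg" is a
# placeholder assumption; any RGB image opened with PIL should work.
#
#   from PIL import Image
#   source = Image.open("example.jpg").convert("RGB")
#   video_path = generate_video(source, "a cat walking through tall grass")
#   print(video_path)  # path to the exported MP4, e.g. "output.mp4"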