Spaces:

GeradeHouse
/

Wan2.1-FLF2V

Paused

App Files Files Community

GeradeHouse committed 22 days ago

Commit

64a6a24

verified ·

1 Parent(s): 1c8aab2

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -101

app.py CHANGED Viewed

@@ -1,10 +1,15 @@
 #!/usr/bin/env python
 """
-Gradio demo for Wan2.1 FLF2V – full streaming progress
-No globals: pipeline, resize utils all use the local `pipe`.
-Author: <your-handle>
 """
 import numpy as np
 import torch
 import gradio as gr
@@ -12,95 +17,108 @@ from diffusers import WanImageToVideoPipeline, AutoencoderKLWan
 from diffusers.utils import export_to_video
 from transformers import CLIPVisionModel, CLIPImageProcessor
 from PIL import Image
-import torchvision.transforms.functional as TF
-# ---------------------------------------------------------------------
-# CONFIG ----------------------------------------------------------------
-MODEL_ID       = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
-DTYPE          = torch.float16
-MAX_AREA       = 1280 * 720
-DEFAULT_FRAMES = 81
-# ----------------------------------------------------------------------
-def load_pipeline(progress):
-    """Load & shard the pipeline across CPU/GPU with streaming progress."""
-    progress(0.00, desc="Init: loading image encoder…")
-    image_encoder = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
-    progress(0.10, desc="Loaded image encoder")
-    progress(0.10, desc="Loading VAE…")
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
-    progress(0.20, desc="Loaded VAE")
-    progress(0.20, desc="Assembling pipeline…")
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
-        image_encoder=image_encoder,
         torch_dtype=DTYPE,
-        low_cpu_mem_usage=True,
-        device_map="balanced",
     )
-    progress(0.30, desc="Pipeline assembled")
-    progress(0.30, desc="Loading fast image processor…")
-    pipe.image_processor = CLIPImageProcessor.from_pretrained(
-        MODEL_ID, subfolder="image_processor", use_fast=True
-    )
-    progress(0.40, desc="Processor ready")
-    return pipe
-def aspect_resize(img: Image.Image, pipe, max_area=MAX_AREA):
-    """Resize while respecting model patch multiples, using `pipe` for scale."""
     ar = img.height / img.width
-    mod = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
-    h = round(np.sqrt(max_area * ar)) // mod * mod
-    w = round(np.sqrt(max_area / ar)) // mod * mod
     return img.resize((w, h), Image.LANCZOS), h, w
-def center_crop_resize(img: Image.Image, pipe, h, w):
-    """Center-crop & resize to H×W, using same Lanczos filter."""
     ratio = max(w / img.width, h / img.height)
-    img = img.resize(
-        (round(img.width * ratio), round(img.height * ratio)),
-        Image.LANCZOS
-    )
-    return TF.center_crop(img, [h, w])
-def generate(first_frame, last_frame, prompt, negative_prompt,
-             steps, guidance, num_frames, seed, fps,
-             progress=gr.Progress()):  # Gradio progress hook
-    # 1) Load & shard pipeline
-    pipe = load_pipeline(progress)
-    # 2) Preprocess
-    progress(0.45, desc="Preprocessing first frame…")
-    first_frame, h, w = aspect_resize(first_frame, pipe)
-    if last_frame.size != first_frame.size:
-        progress(0.50, desc="Preprocessing last frame…")
-        last_frame = center_crop_resize(last_frame, pipe, h, w)
-    progress(0.55, desc="Frames ready")
-    # 3) Run inference with per-step callbacks
     if seed == -1:
         seed = torch.seed()
-    gen = torch.Generator(device=pipe.device).manual_seed(seed)
-    def _cb(step, timestep, latents):
-        frac = 0.55 + 0.35 * ((step + 1) / steps)
-        progress(frac, desc=f"Inference step {step+1}/{steps}")
-    progress(0.55, desc="Starting inference…")
-    output = pipe(
-        image=first_frame,
         last_image=last_frame,
-        prompt=prompt,
         negative_prompt=negative_prompt or None,
         height=h,
         width=w,
@@ -108,44 +126,42 @@ def generate(first_frame, last_frame, prompt, negative_prompt,
         num_inference_steps=steps,
         guidance_scale=guidance,
         generator=gen,
-        callback_on_step_end=_cb,
-        callback_steps=1,
     )
-    frames = output.frames[0]
-    # 4) Export video
-    progress(0.92, desc="Exporting video…")
-    video_path = export_to_video(frames, fps=fps)
-    # 5) Done
-    progress(1.0, desc="Complete!")
-    return video_path
 with gr.Blocks() as demo:
-    gr.Markdown("## Wan2.1 FLF2V – Full Streaming Progress")
     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
         last_img  = gr.Image(label="Last frame",  type="pil")
-    prompt   = gr.Textbox(label="Prompt", placeholder="A blue bird takes off…")
-    negative = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")
     with gr.Accordion("Advanced parameters", open=False):
-        steps      = gr.Slider(10, 50, value=30, step=1, label="Steps")
-        guidance   = gr.Slider(0.0, 10.0, value=5.5, step=0.1, label="Guidance")
-        num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, label="Frames")
-        fps        = gr.Slider(4, 30, value=16, label="FPS")
-        seed       = gr.Number(value=-1, precision=0, label="Seed")
-    video = gr.Video(label="Result (.mp4)")
-    btn = gr.Button("Generate")
-    btn.click(
         fn=generate,
         inputs=[first_img, last_img, prompt, negative, steps, guidance, num_frames, seed, fps],
-        outputs=[video],
     )
-    demo.queue()  # enable streaming updates
-    demo.launch()

 #!/usr/bin/env python
 """
+Gradio demo for Wan2.1 FLF2V – First & Last Frame → Video
+Loads the huge model lazily (only once), streams **all** tqdm bars
+(from HF downloads, shard loading, to denoising) into Gradio's UI,
+and outputs a direct File download for the generated video.
 """
+import os
+import tempfile
+import ftfy
 import numpy as np
 import torch
 import gradio as gr
 from diffusers.utils import export_to_video
 from transformers import CLIPVisionModel, CLIPImageProcessor
 from PIL import Image
+# -----------------------------------------------------------------------------
+# CONFIG
+# -----------------------------------------------------------------------------
+# Hub repository id handed to every `from_pretrained` call in load_pipeline().
+MODEL_ID        = "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"
+# dtype used for the VAE and the assembled pipeline (the CLIP vision encoder
+# is loaded in float32 regardless).
+DTYPE           = torch.float16                     # or torch.bfloat16 on GPUs that support it
+# Default pixel budget for aspect_resize().
+MAX_AREA        = 1280 * 720                        # ≤720p
+# Default value of the "Frames" slider in the UI.
+DEFAULT_FRAMES  = 81                                # ~5s @16fps
+# -----------------------------------------------------------------------------
+# GLOBAL PIPELINE (lazy)
+# -----------------------------------------------------------------------------
+# Stays None until the first generate() call, which stores the result of
+# load_pipeline() here; aspect_resize() reads it as well.
+PIPE = None
+def load_pipeline():
+    """
+    Build the Wan2.1 FLF2V image-to-video pipeline.
+
+    Loads the CLIP vision encoder in float32, the fast CLIP image processor
+    and the VAE in ``DTYPE``, then assembles ``WanImageToVideoPipeline``
+    around them.
+
+    Returns:
+        The assembled pipeline. When CUDA is available, model CPU offload is
+        enabled on it; otherwise it is returned as loaded.
+    """
+    # 1) full-precision CLIP encoder
+    vision = CLIPVisionModel.from_pretrained(
         MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32
     )
+    # 2) fast CLIP image processor; the subfolder name matches the pipeline's
+    #    `image_processor` component
+    processor = CLIPImageProcessor.from_pretrained(
+        MODEL_ID, subfolder="image_processor", use_fast=True
+    )
+    # 3) reduced-precision VAE
     vae = AutoencoderKLWan.from_pretrained(
         MODEL_ID, subfolder="vae", torch_dtype=DTYPE
     )
+    # 4) assemble pipeline
     pipe = WanImageToVideoPipeline.from_pretrained(
         MODEL_ID,
         vae=vae,
+        image_encoder=vision,
+        image_processor=processor,
         torch_dtype=DTYPE,
     )
+    # 5) Offload only when there is a GPU to offload from. Do NOT follow this
+    #    with pipe.to("cuda"): moving the whole pipeline onto the GPU by hand
+    #    cancels the memory savings that offloading provides.
+    if torch.cuda.is_available():
+        pipe.enable_model_cpu_offload()
+    # (we drop .enable_slicing() because it's unsupported here)
+    return pipe
+# -----------------------------------------------------------------------------
+# UTILS
+# -----------------------------------------------------------------------------
+def aspect_resize(img: Image.Image, max_area=MAX_AREA):
+    """
+    Scale `img` to about `max_area` pixels while keeping its aspect ratio.
+
+    Height and width are each floored to a multiple of the transformer patch
+    size times the VAE spatial scale factor, both read from the global PIPE.
+
+    Returns:
+        (resized_image, height, width)
+    """
+    aspect = img.height / img.width
+    multiple = PIPE.transformer.config.patch_size[1] * PIPE.vae_scale_factor_spatial
+    target_h = int(np.sqrt(max_area * aspect))
+    target_w = int(np.sqrt(max_area / aspect))
+    # drop the remainder so both sides land on the model's grid
+    target_h -= target_h % multiple
+    target_w -= target_w % multiple
+    resized = img.resize((target_w, target_h), Image.LANCZOS)
+    return resized, target_h, target_w
+def center_crop_resize(img: Image.Image, h: int, w: int):
+    """
+    Scale `img` so it covers an h×w box, then crop its centre to exactly h×w.
+
+    Implemented with PIL only: this file does not import torchvision's
+    functional module, so the crop box is computed here.
+
+    Args:
+        img: source image.
+        h: target height in pixels.
+        w: target width in pixels.
+
+    Returns:
+        A new PIL image of size (w, h).
+    """
     ratio = max(w / img.width, h / img.height)
+    # max(...) keeps rounding from leaving a side one pixel short of the target
+    scaled_w = max(w, round(img.width * ratio))
+    scaled_h = max(h, round(img.height * ratio))
+    img2 = img.resize((scaled_w, scaled_h), Image.LANCZOS)
+    left = (scaled_w - w) // 2
+    top = (scaled_h - h) // 2
+    return img2.crop((left, top, left + w, top + h))
+# -----------------------------------------------------------------------------
+# GENERATION (with full tqdm → Gradio progress streaming)
+# -----------------------------------------------------------------------------
+def generate(
+    first_frame: Image.Image,
+    last_frame: Image.Image,
+    prompt: str,
+    negative_prompt: str,
+    steps: int,
+    guidance: float,
+    num_frames: int,
+    seed: int,
+    fps: int,
+    progress=gr.Progress(track_tqdm=True),
+):
+    """
+    Gradio click handler: turn a first/last frame pair plus a prompt into a video.
+
+    Loads the global pipeline on first use, resizes the first frame with
+    aspect_resize(), crops the last frame to the same size when it differs,
+    runs the pipeline and exports the frames with export_to_video().
+
+    Args:
+        first_frame: first frame of the clip.
+        last_frame: last frame of the clip.
+        prompt: text prompt passed to the pipeline.
+        negative_prompt: optional negative prompt; an empty value is sent as None.
+        steps: number of inference steps.
+        guidance: guidance scale.
+        num_frames: requested frame count (see the review note in the call below).
+        seed: RNG seed; -1 draws a fresh one from torch.seed().
+        fps: frame rate of the exported video.
+        progress: Gradio progress hook (tracks tqdm bars).
+
+    Returns:
+        (path of the exported video, seed actually used)
+
+    Raises:
+        gr.Error: if either frame is missing.
+    """
+    global PIPE
+    # Fail fast, before the heavy model load, with a message the UI can show
+    # instead of an AttributeError on None further down.
+    if first_frame is None or last_frame is None:
+        raise gr.Error("Please provide both a first and a last frame.")
+    # lazy instantiate
+    if PIPE is None:
+        progress(0, desc="Loading pipeline…")
+        PIPE = load_pipeline()
+    # seeding
     if seed == -1:
         seed = torch.seed()
+    gen = torch.Generator(device=PIPE.device).manual_seed(seed)
+    # preprocess
+    progress(0, desc="Preprocessing…")
+    frame1, h, w = aspect_resize(first_frame)
+    if last_frame.size != frame1.size:
+        last_frame = center_crop_resize(last_frame, h, w)
+    # inference (all tqdm inside will stream to UI)
+    result = PIPE(
+        image=frame1,
         last_image=last_frame,
+        # The raw prompt is passed through: `whitespace_clean` / `basic_clean`
+        # are neither defined nor imported in this file, and the Wan pipeline
+        # normalises prompt text on its own.
+        prompt=prompt,
         negative_prompt=negative_prompt or None,
         height=h,
         width=w,
+        # NOTE(review): this diff view hides one unchanged line at the hunk
+        # boundary here — presumably `num_frames=num_frames,`; confirm in app.py.
         num_inference_steps=steps,
         guidance_scale=guidance,
         generator=gen,
+        # no callback_steps here!
     )
+    frames = result.frames[0]  # list of PIL images
+    # export to MP4
+    progress(1.0, desc="Assembling video…")
+    out_path = export_to_video(frames, fps=fps)
+    return out_path, seed
+# -----------------------------------------------------------------------------
+# BUILD UI
+# -----------------------------------------------------------------------------
 with gr.Blocks() as demo:
+    # Page title, then the two frame inputs side by side (delivered as PIL images).
+    gr.Markdown("## Wan 2.1 FLF2V – First & Last Frame → Video (Diffusers)")
     with gr.Row():
         first_img = gr.Image(label="First frame", type="pil")
         last_img  = gr.Image(label="Last frame",  type="pil")
+    prompt        = gr.Textbox(label="Prompt", placeholder="A small blue bird takes off…")
+    negative      = gr.Textbox(label="Negative prompt (optional)", placeholder="ugly, blurry")
+    # Sampling controls, collapsed by default.
     with gr.Accordion("Advanced parameters", open=False):
+        steps      = gr.Slider(10, 50, value=30,  step=1,  label="Sampling steps")
+        guidance   = gr.Slider(0.0, 10.0, value=5.5,  step=0.1, label="Guidance scale")
+        num_frames = gr.Slider(16, 129, value=DEFAULT_FRAMES, step=1, label="Frames")
+        fps        = gr.Slider(4, 30, value=16,  step=1, label="FPS")
+        seed       = gr.Number(value=-1, precision=0, label="Seed (-1 = random)")
+    run_btn = gr.Button("Generate")
+    # File component that receives the exported video path from generate():
+    download = gr.File(label="Download video (.mp4)")
+    # Read-only field that receives the seed returned by generate().
+    used_seed = gr.Number(label="Seed used", interactive=False)
+    # Wire the button: `inputs` are passed to generate() positionally, in this
+    # order, and its two return values fill `outputs` in order.
+    run_btn.click(
         fn=generate,
         inputs=[first_img, last_img, prompt, negative, steps, guidance, num_frames, seed, fps],
+        outputs=[download, used_seed],
     )
+# MUST call .queue() to enable gr.Progress()
+# NOTE(review): `concurrency_count` is a Gradio 3.x argument; newer Gradio
+# releases reject it — confirm against the Gradio version this Space pins.
+demo.queue(concurrency_count=1).launch()