import spaces
import gradio as gr
import torch
from diffusers import LTXPipeline
import uuid
import time

pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.1-diffusers", torch_dtype=torch.bfloat16)
# pipe = LTXPipeline.from_pretrained("Lightricks/LTX-Video", torch_dtype=torch.bfloat16)
# Sequential CPU offload moves each submodule to the GPU only while it runs,
# keeping peak VRAM low at the cost of slower inference.
pipe.enable_sequential_cpu_offload()

HEIGHT = 512       # output height in pixels
WIDTH = 768        # output width in pixels
N_FRAME = 161      # frame count; LTX expects a multiple of 8 plus 1
N_AVG_FRAME = 2    # tail frames to blend toward the head (loop experiment below)
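# With LTX-Video's 8x temporal and 32x spatial VAE compression, these settings
# give a latent grid of (N_FRAME - 1) // 8 + 1 = 21 frames of 16 x 24
# (HEIGHT // 32 x WIDTH // 32), which is what the _unpack_latents calls below assume.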

negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

# Kept for the commented-out prepare_latents_loop experiment below.
prepare_latents_original = pipe.prepare_latents

# _unpack_latents returns shape (B, C, F, H, W), with F, H, W in latent dimensions.

# def prepare_latents_loop(*args, **kwargs):
#     packed_latents = prepare_latents_original(*args, **kwargs)
#     unpacked_latents = pipe._unpack_latents(packed_latents, (N_FRAME-1)//8+1, HEIGHT//32, WIDTH//32, 1, 1)
#     # Blend each of the last N_AVG_FRAME frames with the frame that follows it,
#     # wrapping around to frame 0. E.g. for frames 0,1,2,3,4 with N_AVG_FRAME=2:
#     # roll(-1) gives 1,2,3,4,0, so last_n = [3, 4] pairs with last_next_n = [4, 0],
#     # making frame 3 -> 0.75*3 + 0.25*4 and frame 4 -> 0.75*4 + 0.25*0.
#     last_n = unpacked_latents[:, :, -N_AVG_FRAME:, :, :]
#     last_next_n = torch.roll(unpacked_latents, shifts=-1, dims=2)[:, :, -N_AVG_FRAME:, :, :]
#     avg_n = last_n * 0.75 + last_next_n * 0.25
#     unpacked_latents[:, :, -N_AVG_FRAME:, :, :] = avg_n
#     # Pack the latents back into the transformer's flat token layout.
#     packed_latents = pipe._pack_latents(unpacked_latents)
#     return packed_latents

# pipe.prepare_latents = prepare_latents_loop
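
# Minimal sanity check for the blend above (toy tensor; hypothetical helper,
# not wired into the app). For frames 0..4 with N_AVG_FRAME = 2, frame 3
# becomes 0.75*3 + 0.25*4 = 3.25 and frame 4 becomes 0.75*4 + 0.25*0 = 3.0,
# pulling the tail of the clip toward its head.
def _demo_tail_blend():
    toy = torch.arange(5, dtype=torch.float32).view(1, 1, 5, 1, 1)  # B, C, F, H, W
    tail = toy[:, :, -N_AVG_FRAME:, :, :]
    tail_next = torch.roll(toy, shifts=-1, dims=2)[:, :, -N_AVG_FRAME:, :, :]
    blended = tail * 0.75 + tail_next * 0.25
    assert blended.flatten().tolist() == [3.25, 3.0]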

def modify_latents_callback(pipeline, step, timestep, callback_kwargs):
    """Roll the latents one frame along the temporal axis after each denoising
    step, so the last-to-first frame transition is repeatedly denoised like an
    interior transition and the finished video loops seamlessly."""
    print("Rolling latents on step", step)
    latents = callback_kwargs.get("latents")
    unpacked_latents = pipeline._unpack_latents(latents, (N_FRAME - 1) // 8 + 1, HEIGHT // 32, WIDTH // 32, 1, 1)
    # the frame order after each denoising step will be 0,1,2 -> 2,0,1 -> 1,2,0 -> 0,1,2 ...
    modified_latents = torch.roll(unpacked_latents, shifts=1, dims=2)
    modified_latents = pipeline._pack_latents(modified_latents)
    return {"latents": modified_latents}

@spaces.GPU(duration=140)
def generate_gif(prompt, use_fixed_seed):
    seed = 0 if use_fixed_seed else torch.seed()
    generator = torch.Generator(device="cuda").manual_seed(seed)

    output = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        width=WIDTH,
        height=HEIGHT,
        num_frames=N_FRAME,
        num_inference_steps=50,
        decode_timestep=0.03,
        decode_noise_scale=0.025,
        generator=generator,
        callback_on_step_end=modify_latents_callback,
    ).frames[0]

    gif_path = f"/tmp/{uuid.uuid4().hex}.gif"

    bef = time.time()
    # imageio.mimsave(gif_path, output, format="GIF", fps=24, loop=0)
    gif_path = f"/tmp/{uuid.uuid4().hex}.webp"
    output[0].save(gif_path, format="WebP", save_all=True, append_images=output[1:], duration=1000/24, loop=0)
    print("GIF creation time:", time.time() - bef)
    return gif_path

with gr.Blocks() as demo:
    gr.Markdown("## LTX Video → Looping GIF Generator")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(label="Prompt", lines=4)
            use_fixed_seed = gr.Checkbox(label="Use Fixed Seed", value=True)
            generate_btn = gr.Button("Generate")
        with gr.Column():
            gif_output = gr.Image(label="Looping GIF Result", type="filepath")

    generate_btn.click(
        fn=generate_gif,
        inputs=[prompt_input, use_fixed_seed],
        outputs=gif_output,
        concurrency_limit=1
    )

    gr.Dataset(
        components=[prompt_input, use_fixed_seed, gif_output],
        samples=[
            ["A woman with light skin, wearing a blue jacket and a black hat with a veil, looks down and to her right, then back up as she speaks; she has brown hair styled in an updo, light brown eyebrows, and is wearing a white collared shirt under her jacket; the camera remains stationary on her face as she speaks; the background is out of focus, but shows trees and people in period clothing; the scene is captured in real-life footage.", False, "examples/woman.webp"],
            ["A sleek white car skids into a narrow alley on wet pavement, its rear tires releasing a thin cloud of smoky exhaust as it accelerates past flickering neon signs. Rain cascades from the eaves. The camera swoops low and follows behind, capturing towering skyscraper reflections in puddles and the car’s headlights. Lightning flashes overhead, intensifying the gritty atmosphere", True, "examples/car.webp"]
        ],
        label="Example Inputs and Outputs",
        type="index",
    )

demo.queue(max_size=5)
demo.launch(share=True)