wangmengchao committed · Commit 282b272 · 1 parent: a893799
app.py ADDED
@@ -0,0 +1,312 @@
1
+ import gradio as gr
2
+ from pathlib import Path
3
+ import argparse
4
+ from datetime import datetime
5
+ import librosa
6
+ from infer import load_models,main
7
+
8
+
9
+ pipe,fantasytalking,wav2vec_processor,wav2vec = None,None,None,None
10
+
11
+ def generate_video(
12
+ image_path,
13
+ audio_path,
14
+ prompt,
15
+ prompt_cfg_scale,
16
+ audio_cfg_scale,
17
+ audio_weight,
18
+ image_size,
19
+ max_num_frames,
20
+ inference_steps,
21
+ seed,
22
+ ):
23
+ # Create the temp directory if it doesn't exist
24
+ output_dir = Path("./output")
25
+ output_dir.mkdir(parents=True, exist_ok=True)
26
+
27
+ # Convert paths to absolute Path objects and normalize them
28
+ print(image_path)
29
+ image_path = Path(image_path).absolute().as_posix()
30
+ audio_path = Path(audio_path).absolute().as_posix()
31
+
32
+ # Parse the arguments
33
+
34
+ args = create_args(
35
+ image_path=image_path,
36
+ audio_path=audio_path,
37
+ prompt=prompt,
38
+ output_dir=str(output_dir),
39
+ audio_weight=audio_weight,
40
+ prompt_cfg_scale=prompt_cfg_scale,
41
+ audio_cfg_scale=audio_cfg_scale,
42
+ image_size=image_size,
43
+ max_num_frames=max_num_frames,
44
+ inference_steps=inference_steps,
45
+ seed=seed,
46
+ )
47
+
48
+ try:
49
+ global pipe, fantasytalking, wav2vec_processor, wav2vec
50
+ if pipe is None:
51
+ pipe,fantasytalking,wav2vec_processor,wav2vec = load_models(args)
52
+ output_path=main(
53
+ args,pipe,fantasytalking,wav2vec_processor,wav2vec
54
+ )
55
+ return output_path # Ensure the output path is returned
56
+ except Exception as e:
57
+ print(f"Error during processing: {str(e)}")
58
+ raise gr.Error(f"Error during processing: {str(e)}")
59
+
60
+
61
+ def create_args(
62
+ image_path: str,
63
+ audio_path: str,
64
+ prompt: str,
65
+ output_dir: str,
66
+ audio_weight: float,
67
+ prompt_cfg_scale: float,
68
+ audio_cfg_scale: float,
69
+ image_size: int,
70
+ max_num_frames: int,
71
+ inference_steps: int,
72
+ seed: int,
73
+ ) -> argparse.Namespace:
74
+ parser = argparse.ArgumentParser()
75
+ parser.add_argument(
76
+ "--wan_model_dir",
77
+ type=str,
78
+ default="./models/Wan2.1-I2V-14B-720P",
79
+ required=False,
80
+ help="The dir of the Wan I2V 14B model.",
81
+ )
82
+ parser.add_argument(
83
+ "--fantasytalking_model_path",
84
+ type=str,
85
+ default="./models/fantasytalking_model.ckpt",
86
+ required=False,
87
+ help="The .ckpt path of fantasytalking model.",
88
+ )
89
+ parser.add_argument(
90
+ "--wav2vec_model_dir",
91
+ type=str,
92
+ default="./models/wav2vec2-base-960h",
93
+ required=False,
94
+ help="The dir of wav2vec model.",
95
+ )
96
+ parser.add_argument(
97
+ "--image_path",
98
+ type=str,
99
+ default="./assets/images/woman.png",
100
+ required=False,
101
+ help="The path of the image.",
102
+ )
103
+ parser.add_argument(
104
+ "--audio_path",
105
+ type=str,
106
+ default="./assets/audios/woman.wav",
107
+ required=False,
108
+ help="The path of the audio.",
109
+ )
110
+ parser.add_argument(
111
+ "--prompt",
112
+ type=str,
113
+ default="A woman is talking.",
114
+ required=False,
115
+ help="prompt.",
116
+ )
117
+ parser.add_argument(
118
+ "--output_dir",
119
+ type=str,
120
+ default="./output",
121
+ help="Dir to save the video.",
122
+ )
123
+ parser.add_argument(
124
+ "--image_size",
125
+ type=int,
126
+ default=512,
127
+ help="The image will be resized proportionally to this size.",
128
+ )
129
+ parser.add_argument(
130
+ "--audio_scale",
131
+ type=float,
132
+ default=1.0,
133
+ help="Image width.",
134
+ )
135
+ parser.add_argument(
136
+ "--prompt_cfg_scale",
137
+ type=float,
138
+ default=5.0,
139
+ required=False,
140
+ help="prompt cfg scale",
141
+ )
142
+ parser.add_argument(
143
+ "--audio_cfg_scale",
144
+ type=float,
145
+ default=5.0,
146
+ required=False,
147
+ help="audio cfg scale",
148
+ )
149
+ parser.add_argument(
150
+ "--max_num_frames",
151
+ type=int,
152
+ default=81,
153
+ required=False,
154
+ help="The maximum frames for generating videos, the audio part exceeding max_num_frames/fps will be truncated.",
155
+ )
156
+ parser.add_argument(
157
+ "--inference_steps",
158
+ type=int,
159
+ default=20,
160
+ required=False,
161
+ )
162
+ parser.add_argument(
163
+ "--fps",
164
+ type=int,
165
+ default=23,
166
+ required=False,
167
+ )
168
+ parser.add_argument(
169
+ "--num_persistent_param_in_dit",
170
+ type=int,
171
+ default=None,
172
+ required=False,
173
+ help="Maximum number of parameters kept resident in GPU memory; use a smaller value to reduce the VRAM required."
174
+ )
175
+ parser.add_argument(
176
+ "--seed",
177
+ type=int,
178
+ default=1111,
179
+ required=False,
180
+ )
181
+ args = parser.parse_args(
182
+ [
183
+ "--image_path",
184
+ image_path,
185
+ "--audio_path",
186
+ audio_path,
187
+ "--prompt",
188
+ prompt,
189
+ "--output_dir",
190
+ output_dir,
191
+ "--image_size",
192
+ str(image_size),
193
+ "--audio_scale",
194
+ str(audio_weight),
195
+ "--prompt_cfg_scale",
196
+ str(prompt_cfg_scale),
197
+ "--audio_cfg_scale",
198
+ str(audio_cfg_scale),
199
+ "--max_num_frames",
200
+ str(max_num_frames),
201
+ "--inference_steps",
202
+ str(inference_steps),
203
+ "--seed",
204
+ str(seed),
205
+ ]
206
+ )
207
+ print(args)
208
+ return args
209
+
210
+
211
+ # Create Gradio interface
212
+ with gr.Blocks(title="FantasyTalking Video Generation") as demo:
213
+ gr.Markdown(
214
+ """
215
+ # FantasyTalking: Realistic Talking Portrait Generation via Coherent Motion Synthesis
216
+
217
+ <div align="center">
218
+ <strong> Mengchao Wang1* Qiang Wang1* Fan Jiang1†
219
+ Yaqi Fan2 Yunpeng Zhang1,2 YongGang Qi2‡
220
+ Kun Zhao1. Mu Xu1 </strong>
221
+ </div>
222
+
223
+ <div align="center">
224
+ <strong>1AMAP,Alibaba Group 2Beijing University of Posts and Telecommunications</strong>
225
+ </div>
226
+
227
+ <div style="display:flex;justify-content:center;column-gap:4px;">
228
+ <a href="https://github.com/Fantasy-AMAP/fantasy-talking">
229
+ <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
230
+ </a>
231
+ <a href="https://arxiv.org/abs/2504.04842">
232
+ <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
233
+ </a>
234
+ </div>
235
+ """
236
+ )
237
+
238
+ with gr.Row():
239
+ with gr.Column():
240
+ image_input = gr.Image(label="Input Image", type="filepath")
241
+ audio_input = gr.Audio(label="Input Audio", type="filepath")
242
+ prompt_input = gr.Text(label="Input Prompt")
243
+ with gr.Row():
244
+ prompt_cfg_scale = gr.Slider(
245
+ minimum=1.0,
246
+ maximum=9.0,
247
+ value=5.0,
248
+ step=0.5,
249
+ label="Prompt CFG Scale",
250
+ )
251
+ audio_cfg_scale = gr.Slider(
252
+ minimum=1.0,
253
+ maximum=9.0,
254
+ value=5.0,
255
+ step=0.5,
256
+ label="Audio CFG Scale",
257
+ )
258
+ audio_weight = gr.Slider(
259
+ minimum=0.1,
260
+ maximum=3.0,
261
+ value=1.0,
262
+ step=0.1,
263
+ label="Audio Weight",
264
+ )
265
+ with gr.Row():
266
+ image_size = gr.Number(
267
+ value=512, label="Max Width/Height", precision=0
268
+ )
269
+ max_num_frames = gr.Number(
270
+ value=81, label="Maximum Frames", precision=0
271
+ )
272
+ inference_steps = gr.Slider(
273
+ minimum=1, maximum=50, value=20, step=1, label="Inference Steps"
274
+ )
275
+
276
+ with gr.Row():
277
+ seed = gr.Number(value=1247, label="Random Seed", precision=0)
278
+
279
+ process_btn = gr.Button("Generate Video")
280
+
281
+ with gr.Column():
282
+ video_output = gr.Video(label="Output Video")
283
+
284
+ gr.Examples(
285
+ examples=[
286
+ [
287
+ "./assets/images/woman.png",
288
+ "./assets/audios/woman.wav",
289
+ ],
290
+ ],
291
+ inputs=[image_input, audio_input],
292
+ )
293
+
294
+ process_btn.click(
295
+ fn=generate_video,
296
+ inputs=[
297
+ image_input,
298
+ audio_input,
299
+ prompt_input,
300
+ prompt_cfg_scale,
301
+ audio_cfg_scale,
302
+ audio_weight,
303
+ image_size,
304
+ max_num_frames,
305
+ inference_steps,
306
+ seed,
307
+ ],
308
+ outputs=video_output,
309
+ )
310
+
311
+ if __name__ == "__main__":
312
+ demo.launch(inbrowser=True, share=True)
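For reference, the same pipeline can be driven without the Gradio UI by reusing create_args together with infer.load_models/main, exactly as generate_video does above. A minimal headless sketch, assuming the infer module and the checkpoints referenced by the argparse defaults are present under ./models:

```python
# Headless sketch: build args, load the models once, then render a clip.
from app import create_args            # reuses the helper defined above
from infer import load_models, main    # same entry points generate_video calls

args = create_args(
    image_path="./assets/images/woman.png",
    audio_path="./assets/audios/woman.wav",
    prompt="A woman is talking.",
    output_dir="./output",
    audio_weight=1.0,
    prompt_cfg_scale=5.0,
    audio_cfg_scale=5.0,
    image_size=512,
    max_num_frames=81,
    inference_steps=20,
    seed=1111,
)
pipe, fantasytalking, wav2vec_processor, wav2vec = load_models(args)  # load once, reuse across calls
output_path = main(args, pipe, fantasytalking, wav2vec_processor, wav2vec)
print(f"Video written to {output_path}")
```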
diffsynth/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ from .data import *
2
+ from .models import *
3
+ from .prompters import *
4
+ from .schedulers import *
5
+ from .pipelines import *
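Because of the wildcard re-exports above, downstream code can import the public helpers straight from the package root. For example, using names added elsewhere in this commit:

```python
# Names re-exported by the wildcard imports are available at the package root.
from diffsynth import VideoData, save_video, save_frames, WanVideoPipeline
```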
diffsynth/configs/__init__.py ADDED
File without changes
diffsynth/configs/model_config.py ADDED
@@ -0,0 +1,650 @@
1
+ from typing_extensions import Literal, TypeAlias
2
+
3
+ from ..models.wan_video_dit import WanModel
4
+ from ..models.wan_video_text_encoder import WanTextEncoder
5
+ from ..models.wan_video_image_encoder import WanImageEncoder
6
+ from ..models.wan_video_vae import WanVideoVAE
7
+
8
+
9
+ model_loader_configs = [
10
+ # These configs are provided for detecting model type automatically.
11
+ # The format is (state_dict_keys_hash, state_dict_keys_hash_with_shape, model_names, model_classes, model_resource)
12
+ (None, "9269f8db9040a9d860eaca435be61814", ["wan_video_dit"], [WanModel], "civitai"),
13
+ (None, "aafcfd9672c3a2456dc46e1cb6e52c70", ["wan_video_dit"], [WanModel], "civitai"),
14
+ (None, "6bfcfb3b342cb286ce886889d519a77e", ["wan_video_dit"], [WanModel], "civitai"),
15
+ (None, "9c8818c2cbea55eca56c7b447df170da", ["wan_video_text_encoder"], [WanTextEncoder], "civitai"),
16
+ (None, "5941c53e207d62f20f9025686193c40b", ["wan_video_image_encoder"], [WanImageEncoder], "civitai"),
17
+ (None, "1378ea763357eea97acdef78e65d6d96", ["wan_video_vae"], [WanVideoVAE], "civitai"),
18
+ (None, "ccc42284ea13e1ad04693284c7a09be6", ["wan_video_vae"], [WanVideoVAE], "civitai"),
19
+ ]
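The comment above documents the tuple layout, but the detection code itself is not part of this commit. As an illustration only (the function names and hashing scheme below are assumptions, not DiffSynth's actual loader), such a table could be consulted by hashing the sorted state-dict keys and matching the digest:

```python
# Hypothetical sketch: match a loaded state dict against model_loader_configs.
import hashlib

def hash_state_dict_keys(state_dict):
    keys_text = ",".join(sorted(state_dict.keys()))
    return hashlib.md5(keys_text.encode("utf-8")).hexdigest()

def detect_model(state_dict, configs=model_loader_configs):
    digest = hash_state_dict_keys(state_dict)
    for keys_hash, keys_hash_with_shape, model_names, model_classes, resource in configs:
        if digest in (keys_hash, keys_hash_with_shape):
            return model_names, model_classes, resource
    return None  # unknown model
```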
20
+ huggingface_model_loader_configs = [
21
+ # These configs are provided for detecting model type automatically.
22
+ # The format is (architecture_in_huggingface_config, huggingface_lib, model_name, redirected_architecture)
23
+ ("ChatGLMModel", "diffsynth.models.kolors_text_encoder", "kolors_text_encoder", None),
24
+ ("MarianMTModel", "transformers.models.marian.modeling_marian", "translator", None),
25
+ ("BloomForCausalLM", "transformers.models.bloom.modeling_bloom", "beautiful_prompt", None),
26
+ ("Qwen2ForCausalLM", "transformers.models.qwen2.modeling_qwen2", "qwen_prompt", None),
27
+ # ("LlamaForCausalLM", "transformers.models.llama.modeling_llama", "omost_prompt", None),
28
+ ("T5EncoderModel", "diffsynth.models.flux_text_encoder", "flux_text_encoder_2", "FluxTextEncoder2"),
29
+ ("CogVideoXTransformer3DModel", "diffsynth.models.cog_dit", "cog_dit", "CogDiT"),
30
+ ("SiglipModel", "transformers.models.siglip.modeling_siglip", "siglip_vision_model", "SiglipVisionModel"),
31
+ ("LlamaForCausalLM", "diffsynth.models.hunyuan_video_text_encoder", "hunyuan_video_text_encoder_2", "HunyuanVideoLLMEncoder"),
32
+ ("Step1Model", "diffsynth.models.stepvideo_text_encoder", "stepvideo_text_encoder_2", "STEP1TextEncoder"),
33
+ ]
34
+ patch_model_loader_configs = [
35
+ # These configs are provided for detecting model type automatically.
36
+ # The format is (state_dict_keys_hash_with_shape, model_name, model_class, extra_kwargs)
37
+ # ("9a4ab6869ac9b7d6e31f9854e397c867", ["svd_unet"], [SVDUNet], {"add_positional_conv": 128}),
38
+ ]
39
+
40
+ preset_models_on_huggingface = {
41
+ "HunyuanDiT": [
42
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
43
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
44
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
45
+ ("Tencent-Hunyuan/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
46
+ ],
47
+ "stable-video-diffusion-img2vid-xt": [
48
+ ("stabilityai/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
49
+ ],
50
+ "ExVideo-SVD-128f-v1": [
51
+ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
52
+ ],
53
+ # Stable Diffusion
54
+ "StableDiffusion_v15": [
55
+ ("benjamin-paine/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
56
+ ],
57
+ "DreamShaper_8": [
58
+ ("Yntec/Dreamshaper8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
59
+ ],
60
+ # Textual Inversion
61
+ "TextualInversion_VeryBadImageNegative_v1.3": [
62
+ ("gemasai/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
63
+ ],
64
+ # Stable Diffusion XL
65
+ "StableDiffusionXL_v1": [
66
+ ("stabilityai/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
67
+ ],
68
+ "BluePencilXL_v200": [
69
+ ("frankjoshua/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
70
+ ],
71
+ "StableDiffusionXL_Turbo": [
72
+ ("stabilityai/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
73
+ ],
74
+ # Stable Diffusion 3
75
+ "StableDiffusion3": [
76
+ ("stabilityai/stable-diffusion-3-medium", "sd3_medium_incl_clips_t5xxlfp16.safetensors", "models/stable_diffusion_3"),
77
+ ],
78
+ "StableDiffusion3_without_T5": [
79
+ ("stabilityai/stable-diffusion-3-medium", "sd3_medium_incl_clips.safetensors", "models/stable_diffusion_3"),
80
+ ],
81
+ # ControlNet
82
+ "ControlNet_v11f1p_sd15_depth": [
83
+ ("lllyasviel/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
84
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
85
+ ],
86
+ "ControlNet_v11p_sd15_softedge": [
87
+ ("lllyasviel/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
88
+ ("lllyasviel/Annotators", "ControlNetHED.pth", "models/Annotators")
89
+ ],
90
+ "ControlNet_v11f1e_sd15_tile": [
91
+ ("lllyasviel/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
92
+ ],
93
+ "ControlNet_v11p_sd15_lineart": [
94
+ ("lllyasviel/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
95
+ ("lllyasviel/Annotators", "sk_model.pth", "models/Annotators"),
96
+ ("lllyasviel/Annotators", "sk_model2.pth", "models/Annotators")
97
+ ],
98
+ "ControlNet_union_sdxl_promax": [
99
+ ("xinsir/controlnet-union-sdxl-1.0", "diffusion_pytorch_model_promax.safetensors", "models/ControlNet/controlnet_union"),
100
+ ("lllyasviel/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
101
+ ],
102
+ # AnimateDiff
103
+ "AnimateDiff_v2": [
104
+ ("guoyww/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
105
+ ],
106
+ "AnimateDiff_xl_beta": [
107
+ ("guoyww/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
108
+ ],
109
+
110
+ # Qwen Prompt
111
+ "QwenPrompt": [
112
+ ("Qwen/Qwen2-1.5B-Instruct", "config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
113
+ ("Qwen/Qwen2-1.5B-Instruct", "generation_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
114
+ ("Qwen/Qwen2-1.5B-Instruct", "model.safetensors", "models/QwenPrompt/qwen2-1.5b-instruct"),
115
+ ("Qwen/Qwen2-1.5B-Instruct", "special_tokens_map.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
116
+ ("Qwen/Qwen2-1.5B-Instruct", "tokenizer.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
117
+ ("Qwen/Qwen2-1.5B-Instruct", "tokenizer_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
118
+ ("Qwen/Qwen2-1.5B-Instruct", "merges.txt", "models/QwenPrompt/qwen2-1.5b-instruct"),
119
+ ("Qwen/Qwen2-1.5B-Instruct", "vocab.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
120
+ ],
121
+ # Beautiful Prompt
122
+ "BeautifulPrompt": [
123
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
124
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
125
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
126
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
127
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
128
+ ("alibaba-pai/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
129
+ ],
130
+ # Omost prompt
131
+ "OmostPrompt":[
132
+ ("lllyasviel/omost-llama-3-8b-4bits", "model-00001-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
133
+ ("lllyasviel/omost-llama-3-8b-4bits", "model-00002-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
134
+ ("lllyasviel/omost-llama-3-8b-4bits", "tokenizer.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
135
+ ("lllyasviel/omost-llama-3-8b-4bits", "tokenizer_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
136
+ ("lllyasviel/omost-llama-3-8b-4bits", "config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
137
+ ("lllyasviel/omost-llama-3-8b-4bits", "generation_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
138
+ ("lllyasviel/omost-llama-3-8b-4bits", "model.safetensors.index.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
139
+ ("lllyasviel/omost-llama-3-8b-4bits", "special_tokens_map.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
140
+ ],
141
+ # Translator
142
+ "opus-mt-zh-en": [
143
+ ("Helsinki-NLP/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
144
+ ("Helsinki-NLP/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
145
+ ("Helsinki-NLP/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
146
+ ("Helsinki-NLP/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
147
+ ("Helsinki-NLP/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
148
+ ("Helsinki-NLP/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
149
+ ("Helsinki-NLP/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
150
+ ("Helsinki-NLP/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
151
+ ],
152
+ # IP-Adapter
153
+ "IP-Adapter-SD": [
154
+ ("h94/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
155
+ ("h94/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
156
+ ],
157
+ "IP-Adapter-SDXL": [
158
+ ("h94/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
159
+ ("h94/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
160
+ ],
161
+ "SDXL-vae-fp16-fix": [
162
+ ("madebyollin/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors", "models/sdxl-vae-fp16-fix")
163
+ ],
164
+ # Kolors
165
+ "Kolors": [
166
+ ("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
167
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
168
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
169
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
170
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
171
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
172
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
173
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
174
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
175
+ ("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
176
+ ("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
177
+ ],
178
+ # FLUX
179
+ "FLUX.1-dev": [
180
+ ("black-forest-labs/FLUX.1-dev", "text_encoder/model.safetensors", "models/FLUX/FLUX.1-dev/text_encoder"),
181
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/config.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
182
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/model-00001-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
183
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/model-00002-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
184
+ ("black-forest-labs/FLUX.1-dev", "text_encoder_2/model.safetensors.index.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
185
+ ("black-forest-labs/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
186
+ ("black-forest-labs/FLUX.1-dev", "flux1-dev.safetensors", "models/FLUX/FLUX.1-dev"),
187
+ ],
188
+ "InstantX/FLUX.1-dev-IP-Adapter": {
189
+ "file_list": [
190
+ ("InstantX/FLUX.1-dev-IP-Adapter", "ip-adapter.bin", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter"),
191
+ ("google/siglip-so400m-patch14-384", "model.safetensors", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
192
+ ("google/siglip-so400m-patch14-384", "config.json", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
193
+ ],
194
+ "load_path": [
195
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
196
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
197
+ ],
198
+ },
199
+ # RIFE
200
+ "RIFE": [
201
+ ("AlexWortega/RIFE", "flownet.pkl", "models/RIFE"),
202
+ ],
203
+ # CogVideo
204
+ "CogVideoX-5B": [
205
+ ("THUDM/CogVideoX-5b", "text_encoder/config.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
206
+ ("THUDM/CogVideoX-5b", "text_encoder/model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
207
+ ("THUDM/CogVideoX-5b", "text_encoder/model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
208
+ ("THUDM/CogVideoX-5b", "text_encoder/model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
209
+ ("THUDM/CogVideoX-5b", "transformer/config.json", "models/CogVideo/CogVideoX-5b/transformer"),
210
+ ("THUDM/CogVideoX-5b", "transformer/diffusion_pytorch_model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/transformer"),
211
+ ("THUDM/CogVideoX-5b", "transformer/diffusion_pytorch_model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
212
+ ("THUDM/CogVideoX-5b", "transformer/diffusion_pytorch_model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
213
+ ("THUDM/CogVideoX-5b", "vae/diffusion_pytorch_model.safetensors", "models/CogVideo/CogVideoX-5b/vae"),
214
+ ],
215
+ # Stable Diffusion 3.5
216
+ "StableDiffusion3.5-large": [
217
+ ("stabilityai/stable-diffusion-3.5-large", "sd3.5_large.safetensors", "models/stable_diffusion_3"),
218
+ ("stabilityai/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
219
+ ("stabilityai/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
220
+ ("stabilityai/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
221
+ ],
222
+ }
223
+ preset_models_on_modelscope = {
224
+ # Hunyuan DiT
225
+ "HunyuanDiT": [
226
+ ("modelscope/HunyuanDiT", "t2i/clip_text_encoder/pytorch_model.bin", "models/HunyuanDiT/t2i/clip_text_encoder"),
227
+ ("modelscope/HunyuanDiT", "t2i/mt5/pytorch_model.bin", "models/HunyuanDiT/t2i/mt5"),
228
+ ("modelscope/HunyuanDiT", "t2i/model/pytorch_model_ema.pt", "models/HunyuanDiT/t2i/model"),
229
+ ("modelscope/HunyuanDiT", "t2i/sdxl-vae-fp16-fix/diffusion_pytorch_model.bin", "models/HunyuanDiT/t2i/sdxl-vae-fp16-fix"),
230
+ ],
231
+ # Stable Video Diffusion
232
+ "stable-video-diffusion-img2vid-xt": [
233
+ ("AI-ModelScope/stable-video-diffusion-img2vid-xt", "svd_xt.safetensors", "models/stable_video_diffusion"),
234
+ ],
235
+ # ExVideo
236
+ "ExVideo-SVD-128f-v1": [
237
+ ("ECNU-CILab/ExVideo-SVD-128f-v1", "model.fp16.safetensors", "models/stable_video_diffusion"),
238
+ ],
239
+ "ExVideo-CogVideoX-LoRA-129f-v1": [
240
+ ("ECNU-CILab/ExVideo-CogVideoX-LoRA-129f-v1", "ExVideo-CogVideoX-LoRA-129f-v1.safetensors", "models/lora"),
241
+ ],
242
+ # Stable Diffusion
243
+ "StableDiffusion_v15": [
244
+ ("AI-ModelScope/stable-diffusion-v1-5", "v1-5-pruned-emaonly.safetensors", "models/stable_diffusion"),
245
+ ],
246
+ "DreamShaper_8": [
247
+ ("sd_lora/dreamshaper_8", "dreamshaper_8.safetensors", "models/stable_diffusion"),
248
+ ],
249
+ "AingDiffusion_v12": [
250
+ ("sd_lora/aingdiffusion_v12", "aingdiffusion_v12.safetensors", "models/stable_diffusion"),
251
+ ],
252
+ "Flat2DAnimerge_v45Sharp": [
253
+ ("sd_lora/Flat-2D-Animerge", "flat2DAnimerge_v45Sharp.safetensors", "models/stable_diffusion"),
254
+ ],
255
+ # Textual Inversion
256
+ "TextualInversion_VeryBadImageNegative_v1.3": [
257
+ ("sd_lora/verybadimagenegative_v1.3", "verybadimagenegative_v1.3.pt", "models/textual_inversion"),
258
+ ],
259
+ # Stable Diffusion XL
260
+ "StableDiffusionXL_v1": [
261
+ ("AI-ModelScope/stable-diffusion-xl-base-1.0", "sd_xl_base_1.0.safetensors", "models/stable_diffusion_xl"),
262
+ ],
263
+ "BluePencilXL_v200": [
264
+ ("sd_lora/bluePencilXL_v200", "bluePencilXL_v200.safetensors", "models/stable_diffusion_xl"),
265
+ ],
266
+ "StableDiffusionXL_Turbo": [
267
+ ("AI-ModelScope/sdxl-turbo", "sd_xl_turbo_1.0_fp16.safetensors", "models/stable_diffusion_xl_turbo"),
268
+ ],
269
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0": [
270
+ ("sd_lora/zyd232_ChineseInkStyle_SDXL_v1_0", "zyd232_ChineseInkStyle_SDXL_v1_0.safetensors", "models/lora"),
271
+ ],
272
+ # Stable Diffusion 3
273
+ "StableDiffusion3": [
274
+ ("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips_t5xxlfp16.safetensors", "models/stable_diffusion_3"),
275
+ ],
276
+ "StableDiffusion3_without_T5": [
277
+ ("AI-ModelScope/stable-diffusion-3-medium", "sd3_medium_incl_clips.safetensors", "models/stable_diffusion_3"),
278
+ ],
279
+ # ControlNet
280
+ "ControlNet_v11f1p_sd15_depth": [
281
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11f1p_sd15_depth.pth", "models/ControlNet"),
282
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
283
+ ],
284
+ "ControlNet_v11p_sd15_softedge": [
285
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_softedge.pth", "models/ControlNet"),
286
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators")
287
+ ],
288
+ "ControlNet_v11f1e_sd15_tile": [
289
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11f1e_sd15_tile.pth", "models/ControlNet")
290
+ ],
291
+ "ControlNet_v11p_sd15_lineart": [
292
+ ("AI-ModelScope/ControlNet-v1-1", "control_v11p_sd15_lineart.pth", "models/ControlNet"),
293
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
294
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators")
295
+ ],
296
+ "ControlNet_union_sdxl_promax": [
297
+ ("AI-ModelScope/controlnet-union-sdxl-1.0", "diffusion_pytorch_model_promax.safetensors", "models/ControlNet/controlnet_union"),
298
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators")
299
+ ],
300
+ "Annotators:Depth": [
301
+ ("sd_lora/Annotators", "dpt_hybrid-midas-501f0c75.pt", "models/Annotators"),
302
+ ],
303
+ "Annotators:Softedge": [
304
+ ("sd_lora/Annotators", "ControlNetHED.pth", "models/Annotators"),
305
+ ],
306
+ "Annotators:Lineart": [
307
+ ("sd_lora/Annotators", "sk_model.pth", "models/Annotators"),
308
+ ("sd_lora/Annotators", "sk_model2.pth", "models/Annotators"),
309
+ ],
310
+ "Annotators:Normal": [
311
+ ("sd_lora/Annotators", "scannet.pt", "models/Annotators"),
312
+ ],
313
+ "Annotators:Openpose": [
314
+ ("sd_lora/Annotators", "body_pose_model.pth", "models/Annotators"),
315
+ ("sd_lora/Annotators", "facenet.pth", "models/Annotators"),
316
+ ("sd_lora/Annotators", "hand_pose_model.pth", "models/Annotators"),
317
+ ],
318
+ # AnimateDiff
319
+ "AnimateDiff_v2": [
320
+ ("Shanghai_AI_Laboratory/animatediff", "mm_sd_v15_v2.ckpt", "models/AnimateDiff"),
321
+ ],
322
+ "AnimateDiff_xl_beta": [
323
+ ("Shanghai_AI_Laboratory/animatediff", "mm_sdxl_v10_beta.ckpt", "models/AnimateDiff"),
324
+ ],
325
+ # RIFE
326
+ "RIFE": [
327
+ ("Damo_XR_Lab/cv_rife_video-frame-interpolation", "flownet.pkl", "models/RIFE"),
328
+ ],
329
+ # Qwen Prompt
330
+ "QwenPrompt": {
331
+ "file_list": [
332
+ ("qwen/Qwen2-1.5B-Instruct", "config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
333
+ ("qwen/Qwen2-1.5B-Instruct", "generation_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
334
+ ("qwen/Qwen2-1.5B-Instruct", "model.safetensors", "models/QwenPrompt/qwen2-1.5b-instruct"),
335
+ ("qwen/Qwen2-1.5B-Instruct", "special_tokens_map.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
336
+ ("qwen/Qwen2-1.5B-Instruct", "tokenizer.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
337
+ ("qwen/Qwen2-1.5B-Instruct", "tokenizer_config.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
338
+ ("qwen/Qwen2-1.5B-Instruct", "merges.txt", "models/QwenPrompt/qwen2-1.5b-instruct"),
339
+ ("qwen/Qwen2-1.5B-Instruct", "vocab.json", "models/QwenPrompt/qwen2-1.5b-instruct"),
340
+ ],
341
+ "load_path": [
342
+ "models/QwenPrompt/qwen2-1.5b-instruct",
343
+ ],
344
+ },
345
+ # Beautiful Prompt
346
+ "BeautifulPrompt": {
347
+ "file_list": [
348
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
349
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "generation_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
350
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "model.safetensors", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
351
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "special_tokens_map.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
352
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
353
+ ("AI-ModelScope/pai-bloom-1b1-text2prompt-sd", "tokenizer_config.json", "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd"),
354
+ ],
355
+ "load_path": [
356
+ "models/BeautifulPrompt/pai-bloom-1b1-text2prompt-sd",
357
+ ],
358
+ },
359
+ # Omost prompt
360
+ "OmostPrompt": {
361
+ "file_list": [
362
+ ("Omost/omost-llama-3-8b-4bits", "model-00001-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
363
+ ("Omost/omost-llama-3-8b-4bits", "model-00002-of-00002.safetensors", "models/OmostPrompt/omost-llama-3-8b-4bits"),
364
+ ("Omost/omost-llama-3-8b-4bits", "tokenizer.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
365
+ ("Omost/omost-llama-3-8b-4bits", "tokenizer_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
366
+ ("Omost/omost-llama-3-8b-4bits", "config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
367
+ ("Omost/omost-llama-3-8b-4bits", "generation_config.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
368
+ ("Omost/omost-llama-3-8b-4bits", "model.safetensors.index.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
369
+ ("Omost/omost-llama-3-8b-4bits", "special_tokens_map.json", "models/OmostPrompt/omost-llama-3-8b-4bits"),
370
+ ],
371
+ "load_path": [
372
+ "models/OmostPrompt/omost-llama-3-8b-4bits",
373
+ ],
374
+ },
375
+ # Translator
376
+ "opus-mt-zh-en": {
377
+ "file_list": [
378
+ ("moxying/opus-mt-zh-en", "config.json", "models/translator/opus-mt-zh-en"),
379
+ ("moxying/opus-mt-zh-en", "generation_config.json", "models/translator/opus-mt-zh-en"),
380
+ ("moxying/opus-mt-zh-en", "metadata.json", "models/translator/opus-mt-zh-en"),
381
+ ("moxying/opus-mt-zh-en", "pytorch_model.bin", "models/translator/opus-mt-zh-en"),
382
+ ("moxying/opus-mt-zh-en", "source.spm", "models/translator/opus-mt-zh-en"),
383
+ ("moxying/opus-mt-zh-en", "target.spm", "models/translator/opus-mt-zh-en"),
384
+ ("moxying/opus-mt-zh-en", "tokenizer_config.json", "models/translator/opus-mt-zh-en"),
385
+ ("moxying/opus-mt-zh-en", "vocab.json", "models/translator/opus-mt-zh-en"),
386
+ ],
387
+ "load_path": [
388
+ "models/translator/opus-mt-zh-en",
389
+ ],
390
+ },
391
+ # IP-Adapter
392
+ "IP-Adapter-SD": [
393
+ ("AI-ModelScope/IP-Adapter", "models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion/image_encoder"),
394
+ ("AI-ModelScope/IP-Adapter", "models/ip-adapter_sd15.bin", "models/IpAdapter/stable_diffusion"),
395
+ ],
396
+ "IP-Adapter-SDXL": [
397
+ ("AI-ModelScope/IP-Adapter", "sdxl_models/image_encoder/model.safetensors", "models/IpAdapter/stable_diffusion_xl/image_encoder"),
398
+ ("AI-ModelScope/IP-Adapter", "sdxl_models/ip-adapter_sdxl.bin", "models/IpAdapter/stable_diffusion_xl"),
399
+ ],
400
+ # Kolors
401
+ "Kolors": {
402
+ "file_list": [
403
+ ("Kwai-Kolors/Kolors", "text_encoder/config.json", "models/kolors/Kolors/text_encoder"),
404
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model.bin.index.json", "models/kolors/Kolors/text_encoder"),
405
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00001-of-00007.bin", "models/kolors/Kolors/text_encoder"),
406
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00002-of-00007.bin", "models/kolors/Kolors/text_encoder"),
407
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00003-of-00007.bin", "models/kolors/Kolors/text_encoder"),
408
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00004-of-00007.bin", "models/kolors/Kolors/text_encoder"),
409
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00005-of-00007.bin", "models/kolors/Kolors/text_encoder"),
410
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00006-of-00007.bin", "models/kolors/Kolors/text_encoder"),
411
+ ("Kwai-Kolors/Kolors", "text_encoder/pytorch_model-00007-of-00007.bin", "models/kolors/Kolors/text_encoder"),
412
+ ("Kwai-Kolors/Kolors", "unet/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/unet"),
413
+ ("Kwai-Kolors/Kolors", "vae/diffusion_pytorch_model.safetensors", "models/kolors/Kolors/vae"),
414
+ ],
415
+ "load_path": [
416
+ "models/kolors/Kolors/text_encoder",
417
+ "models/kolors/Kolors/unet/diffusion_pytorch_model.safetensors",
418
+ "models/kolors/Kolors/vae/diffusion_pytorch_model.safetensors",
419
+ ],
420
+ },
421
+ "SDXL-vae-fp16-fix": [
422
+ ("AI-ModelScope/sdxl-vae-fp16-fix", "diffusion_pytorch_model.safetensors", "models/sdxl-vae-fp16-fix")
423
+ ],
424
+ # FLUX
425
+ "FLUX.1-dev": {
426
+ "file_list": [
427
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder/model.safetensors", "models/FLUX/FLUX.1-dev/text_encoder"),
428
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/config.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
429
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00001-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
430
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00002-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
431
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model.safetensors.index.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
432
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
433
+ ("AI-ModelScope/FLUX.1-dev", "flux1-dev.safetensors", "models/FLUX/FLUX.1-dev"),
434
+ ],
435
+ "load_path": [
436
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
437
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
438
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
439
+ "models/FLUX/FLUX.1-dev/flux1-dev.safetensors"
440
+ ],
441
+ },
442
+ "FLUX.1-schnell": {
443
+ "file_list": [
444
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder/model.safetensors", "models/FLUX/FLUX.1-dev/text_encoder"),
445
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/config.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
446
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00001-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
447
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model-00002-of-00002.safetensors", "models/FLUX/FLUX.1-dev/text_encoder_2"),
448
+ ("AI-ModelScope/FLUX.1-dev", "text_encoder_2/model.safetensors.index.json", "models/FLUX/FLUX.1-dev/text_encoder_2"),
449
+ ("AI-ModelScope/FLUX.1-dev", "ae.safetensors", "models/FLUX/FLUX.1-dev"),
450
+ ("AI-ModelScope/FLUX.1-schnell", "flux1-schnell.safetensors", "models/FLUX/FLUX.1-schnell"),
451
+ ],
452
+ "load_path": [
453
+ "models/FLUX/FLUX.1-dev/text_encoder/model.safetensors",
454
+ "models/FLUX/FLUX.1-dev/text_encoder_2",
455
+ "models/FLUX/FLUX.1-dev/ae.safetensors",
456
+ "models/FLUX/FLUX.1-schnell/flux1-schnell.safetensors"
457
+ ],
458
+ },
459
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha": [
460
+ ("InstantX/FLUX.1-dev-Controlnet-Union-alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/InstantX/FLUX.1-dev-Controlnet-Union-alpha"),
461
+ ],
462
+ "jasperai/Flux.1-dev-Controlnet-Depth": [
463
+ ("jasperai/Flux.1-dev-Controlnet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Depth"),
464
+ ],
465
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals": [
466
+ ("jasperai/Flux.1-dev-Controlnet-Surface-Normals", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Surface-Normals"),
467
+ ],
468
+ "jasperai/Flux.1-dev-Controlnet-Upscaler": [
469
+ ("jasperai/Flux.1-dev-Controlnet-Upscaler", "diffusion_pytorch_model.safetensors", "models/ControlNet/jasperai/Flux.1-dev-Controlnet-Upscaler"),
470
+ ],
471
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha": [
472
+ ("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha"),
473
+ ],
474
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta": [
475
+ ("alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta", "diffusion_pytorch_model.safetensors", "models/ControlNet/alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta"),
476
+ ],
477
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth": [
478
+ ("Shakker-Labs/FLUX.1-dev-ControlNet-Depth", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Depth"),
479
+ ],
480
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro": [
481
+ ("Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro", "diffusion_pytorch_model.safetensors", "models/ControlNet/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro"),
482
+ ],
483
+ "InstantX/FLUX.1-dev-IP-Adapter": {
484
+ "file_list": [
485
+ ("InstantX/FLUX.1-dev-IP-Adapter", "ip-adapter.bin", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter"),
486
+ ("AI-ModelScope/siglip-so400m-patch14-384", "model.safetensors", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
487
+ ("AI-ModelScope/siglip-so400m-patch14-384", "config.json", "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder"),
488
+ ],
489
+ "load_path": [
490
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/ip-adapter.bin",
491
+ "models/IpAdapter/InstantX/FLUX.1-dev-IP-Adapter/image_encoder",
492
+ ],
493
+ },
494
+ # ESRGAN
495
+ "ESRGAN_x4": [
496
+ ("AI-ModelScope/Real-ESRGAN", "RealESRGAN_x4.pth", "models/ESRGAN"),
497
+ ],
498
+ # RIFE
499
+ "RIFE": [
500
+ ("AI-ModelScope/RIFE", "flownet.pkl", "models/RIFE"),
501
+ ],
502
+ # Omnigen
503
+ "OmniGen-v1": {
504
+ "file_list": [
505
+ ("BAAI/OmniGen-v1", "vae/diffusion_pytorch_model.safetensors", "models/OmniGen/OmniGen-v1/vae"),
506
+ ("BAAI/OmniGen-v1", "model.safetensors", "models/OmniGen/OmniGen-v1"),
507
+ ("BAAI/OmniGen-v1", "config.json", "models/OmniGen/OmniGen-v1"),
508
+ ("BAAI/OmniGen-v1", "special_tokens_map.json", "models/OmniGen/OmniGen-v1"),
509
+ ("BAAI/OmniGen-v1", "tokenizer_config.json", "models/OmniGen/OmniGen-v1"),
510
+ ("BAAI/OmniGen-v1", "tokenizer.json", "models/OmniGen/OmniGen-v1"),
511
+ ],
512
+ "load_path": [
513
+ "models/OmniGen/OmniGen-v1/vae/diffusion_pytorch_model.safetensors",
514
+ "models/OmniGen/OmniGen-v1/model.safetensors",
515
+ ]
516
+ },
517
+ # CogVideo
518
+ "CogVideoX-5B": {
519
+ "file_list": [
520
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/config.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
521
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/text_encoder"),
522
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
523
+ ("ZhipuAI/CogVideoX-5b", "text_encoder/model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/text_encoder"),
524
+ ("ZhipuAI/CogVideoX-5b", "transformer/config.json", "models/CogVideo/CogVideoX-5b/transformer"),
525
+ ("ZhipuAI/CogVideoX-5b", "transformer/diffusion_pytorch_model.safetensors.index.json", "models/CogVideo/CogVideoX-5b/transformer"),
526
+ ("ZhipuAI/CogVideoX-5b", "transformer/diffusion_pytorch_model-00001-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
527
+ ("ZhipuAI/CogVideoX-5b", "transformer/diffusion_pytorch_model-00002-of-00002.safetensors", "models/CogVideo/CogVideoX-5b/transformer"),
528
+ ("ZhipuAI/CogVideoX-5b", "vae/diffusion_pytorch_model.safetensors", "models/CogVideo/CogVideoX-5b/vae"),
529
+ ],
530
+ "load_path": [
531
+ "models/CogVideo/CogVideoX-5b/text_encoder",
532
+ "models/CogVideo/CogVideoX-5b/transformer",
533
+ "models/CogVideo/CogVideoX-5b/vae/diffusion_pytorch_model.safetensors",
534
+ ],
535
+ },
536
+ # Stable Diffusion 3.5
537
+ "StableDiffusion3.5-large": [
538
+ ("AI-ModelScope/stable-diffusion-3.5-large", "sd3.5_large.safetensors", "models/stable_diffusion_3"),
539
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
540
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
541
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
542
+ ],
543
+ "StableDiffusion3.5-medium": [
544
+ ("AI-ModelScope/stable-diffusion-3.5-medium", "sd3.5_medium.safetensors", "models/stable_diffusion_3"),
545
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
546
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
547
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
548
+ ],
549
+ "StableDiffusion3.5-large-turbo": [
550
+ ("AI-ModelScope/stable-diffusion-3.5-large-turbo", "sd3.5_large_turbo.safetensors", "models/stable_diffusion_3"),
551
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_l.safetensors", "models/stable_diffusion_3/text_encoders"),
552
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/clip_g.safetensors", "models/stable_diffusion_3/text_encoders"),
553
+ ("AI-ModelScope/stable-diffusion-3.5-large", "text_encoders/t5xxl_fp16.safetensors", "models/stable_diffusion_3/text_encoders"),
554
+ ],
555
+ "HunyuanVideo":{
556
+ "file_list": [
557
+ ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
558
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00001-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
559
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00002-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
560
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00003-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
561
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00004-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
562
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "config.json", "models/HunyuanVideo/text_encoder_2"),
563
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model.safetensors.index.json", "models/HunyuanVideo/text_encoder_2"),
564
+ ("AI-ModelScope/HunyuanVideo", "hunyuan-video-t2v-720p/vae/pytorch_model.pt", "models/HunyuanVideo/vae"),
565
+ ("AI-ModelScope/HunyuanVideo", "hunyuan-video-t2v-720p/transformers/mp_rank_00_model_states.pt", "models/HunyuanVideo/transformers")
566
+ ],
567
+ "load_path": [
568
+ "models/HunyuanVideo/text_encoder/model.safetensors",
569
+ "models/HunyuanVideo/text_encoder_2",
570
+ "models/HunyuanVideo/vae/pytorch_model.pt",
571
+ "models/HunyuanVideo/transformers/mp_rank_00_model_states.pt"
572
+ ],
573
+ },
574
+ "HunyuanVideo-fp8":{
575
+ "file_list": [
576
+ ("AI-ModelScope/clip-vit-large-patch14", "model.safetensors", "models/HunyuanVideo/text_encoder"),
577
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00001-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
578
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00002-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
579
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00003-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
580
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model-00004-of-00004.safetensors", "models/HunyuanVideo/text_encoder_2"),
581
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "config.json", "models/HunyuanVideo/text_encoder_2"),
582
+ ("DiffSynth-Studio/HunyuanVideo_MLLM_text_encoder", "model.safetensors.index.json", "models/HunyuanVideo/text_encoder_2"),
583
+ ("AI-ModelScope/HunyuanVideo", "hunyuan-video-t2v-720p/vae/pytorch_model.pt", "models/HunyuanVideo/vae"),
584
+ ("DiffSynth-Studio/HunyuanVideo-safetensors", "model.fp8.safetensors", "models/HunyuanVideo/transformers")
585
+ ],
586
+ "load_path": [
587
+ "models/HunyuanVideo/text_encoder/model.safetensors",
588
+ "models/HunyuanVideo/text_encoder_2",
589
+ "models/HunyuanVideo/vae/pytorch_model.pt",
590
+ "models/HunyuanVideo/transformers/model.fp8.safetensors"
591
+ ],
592
+ },
593
+ }
594
+ Preset_model_id: TypeAlias = Literal[
595
+ "HunyuanDiT",
596
+ "stable-video-diffusion-img2vid-xt",
597
+ "ExVideo-SVD-128f-v1",
598
+ "ExVideo-CogVideoX-LoRA-129f-v1",
599
+ "StableDiffusion_v15",
600
+ "DreamShaper_8",
601
+ "AingDiffusion_v12",
602
+ "Flat2DAnimerge_v45Sharp",
603
+ "TextualInversion_VeryBadImageNegative_v1.3",
604
+ "StableDiffusionXL_v1",
605
+ "BluePencilXL_v200",
606
+ "StableDiffusionXL_Turbo",
607
+ "ControlNet_v11f1p_sd15_depth",
608
+ "ControlNet_v11p_sd15_softedge",
609
+ "ControlNet_v11f1e_sd15_tile",
610
+ "ControlNet_v11p_sd15_lineart",
611
+ "AnimateDiff_v2",
612
+ "AnimateDiff_xl_beta",
613
+ "RIFE",
614
+ "BeautifulPrompt",
615
+ "opus-mt-zh-en",
616
+ "IP-Adapter-SD",
617
+ "IP-Adapter-SDXL",
618
+ "StableDiffusion3",
619
+ "StableDiffusion3_without_T5",
620
+ "Kolors",
621
+ "SDXL-vae-fp16-fix",
622
+ "ControlNet_union_sdxl_promax",
623
+ "FLUX.1-dev",
624
+ "FLUX.1-schnell",
625
+ "InstantX/FLUX.1-dev-Controlnet-Union-alpha",
626
+ "jasperai/Flux.1-dev-Controlnet-Depth",
627
+ "jasperai/Flux.1-dev-Controlnet-Surface-Normals",
628
+ "jasperai/Flux.1-dev-Controlnet-Upscaler",
629
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Alpha",
630
+ "alimama-creative/FLUX.1-dev-Controlnet-Inpainting-Beta",
631
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Depth",
632
+ "Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro",
633
+ "InstantX/FLUX.1-dev-IP-Adapter",
634
+ "SDXL_lora_zyd232_ChineseInkStyle_SDXL_v1_0",
635
+ "QwenPrompt",
636
+ "OmostPrompt",
637
+ "ESRGAN_x4",
638
+ "RIFE",
639
+ "OmniGen-v1",
640
+ "CogVideoX-5B",
641
+ "Annotators:Depth",
642
+ "Annotators:Softedge",
643
+ "Annotators:Lineart",
644
+ "Annotators:Normal",
645
+ "Annotators:Openpose",
646
+ "StableDiffusion3.5-large",
647
+ "StableDiffusion3.5-medium",
648
+ "HunyuanVideo",
649
+ "HunyuanVideo-fp8",
650
+ ]
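The preset tables above use two shapes per entry: either a plain list of (repo_id, path_in_repo, local_dir) tuples, or a dict carrying that list under "file_list" plus explicit "load_path" entries. The downloader that consumes them is not part of this diff; a small hedged helper shows how either shape could be normalized:

```python
# Sketch only: flatten a preset entry into (repo_id, path_in_repo, local_dir) triples.
def iter_preset_files(preset_id, presets=preset_models_on_modelscope):
    entry = presets[preset_id]
    file_list = entry["file_list"] if isinstance(entry, dict) else entry
    for repo_id, path_in_repo, local_dir in file_list:
        yield repo_id, path_in_repo, local_dir

for repo_id, path_in_repo, local_dir in iter_preset_files("HunyuanVideo"):
    print(f"{repo_id}: {path_in_repo} -> {local_dir}")
```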
diffsynth/data/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .video import VideoData, save_video, save_frames
diffsynth/data/video.py ADDED
@@ -0,0 +1,173 @@
1
+ import imageio, os
2
+ import numpy as np
3
+ from PIL import Image
4
+ from tqdm import tqdm
5
+
6
+
7
+ class LowMemoryVideo:
8
+ def __init__(self, file_name):
9
+ self.reader = imageio.get_reader(file_name)
10
+
11
+ def __len__(self):
12
+ return self.reader.count_frames()
13
+
14
+ def __getitem__(self, item):
15
+ return Image.fromarray(np.array(self.reader.get_data(item))).convert("RGB")
16
+
17
+ def __del__(self):
18
+ self.reader.close()
19
+
20
+
21
+ def split_file_name(file_name):
22
+ result = []
23
+ number = -1
24
+ for i in file_name:
25
+ if ord(i)>=ord("0") and ord(i)<=ord("9"):
26
+ if number == -1:
27
+ number = 0
28
+ number = number*10 + ord(i) - ord("0")
29
+ else:
30
+ if number != -1:
31
+ result.append(number)
32
+ number = -1
33
+ result.append(i)
34
+ if number != -1:
35
+ result.append(number)
36
+ result = tuple(result)
37
+ return result
38
+
39
+
40
+ def search_for_images(folder):
41
+ file_list = [i for i in os.listdir(folder) if i.endswith(".jpg") or i.endswith(".png")]
42
+ file_list = [(split_file_name(file_name), file_name) for file_name in file_list]
43
+ file_list = [i[1] for i in sorted(file_list)]
44
+ file_list = [os.path.join(folder, i) for i in file_list]
45
+ return file_list
46
+
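split_file_name converts a filename into a tuple in which runs of digits become integers, so sorting by that key yields natural ordering; search_for_images relies on this to keep frame files in sequence. A quick illustration, run in the same module context:

```python
# Natural ordering: digit runs compare as integers, so "frame2" sorts before "frame10".
names = ["frame10.png", "frame2.png", "frame1.png"]
ordered = [name for _, name in sorted((split_file_name(n), n) for n in names)]
assert ordered == ["frame1.png", "frame2.png", "frame10.png"]
```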
47
+
48
+ class LowMemoryImageFolder:
49
+ def __init__(self, folder, file_list=None):
50
+ if file_list is None:
51
+ self.file_list = search_for_images(folder)
52
+ else:
53
+ self.file_list = [os.path.join(folder, file_name) for file_name in file_list]
54
+
55
+ def __len__(self):
56
+ return len(self.file_list)
57
+
58
+ def __getitem__(self, item):
59
+ return Image.open(self.file_list[item]).convert("RGB")
60
+
61
+ def __del__(self):
62
+ pass
63
+
64
+
65
+ def crop_and_resize(image, height, width):
66
+ image = np.array(image)
67
+ image_height, image_width, _ = image.shape
68
+ if image_height / image_width < height / width:
69
+ cropped_width = int(image_height / height * width)
70
+ left = (image_width - cropped_width) // 2
71
+ image = image[:, left: left+cropped_width]
72
+ image = Image.fromarray(image).resize((width, height))
73
+ else:
74
+ cropped_height = int(image_width / width * height)
75
+ top = (image_height - cropped_height) // 2
76
+ image = image[top: top+cropped_height, :]
77
+ image = Image.fromarray(image).resize((width, height))
78
+ return image
79
+
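crop_and_resize center-crops the frame to the target aspect ratio before resizing, so nothing is stretched. A small worked example in the same module context (1920x1080 input, 512x512 target):

```python
# 1080/1920 < 512/512, so the width is center-cropped to 1080 pixels first,
# then the resulting 1080x1080 square is resized down to 512x512.
frame = Image.fromarray(np.zeros((1080, 1920, 3), dtype=np.uint8))
square = crop_and_resize(frame, height=512, width=512)
assert square.size == (512, 512)   # PIL reports (width, height)
```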
80
+
81
+ class VideoData:
82
+ def __init__(self, video_file=None, image_folder=None, height=None, width=None, **kwargs):
83
+ if video_file is not None:
84
+ self.data_type = "video"
85
+ self.data = LowMemoryVideo(video_file, **kwargs)
86
+ elif image_folder is not None:
87
+ self.data_type = "images"
88
+ self.data = LowMemoryImageFolder(image_folder, **kwargs)
89
+ else:
90
+ raise ValueError("Cannot open video or image folder")
91
+ self.length = None
92
+ self.set_shape(height, width)
93
+
94
+ def raw_data(self):
95
+ frames = []
96
+ for i in range(self.__len__()):
97
+ frames.append(self.__getitem__(i))
98
+ return frames
99
+
100
+ def set_length(self, length):
101
+ self.length = length
102
+
103
+ def set_shape(self, height, width):
104
+ self.height = height
105
+ self.width = width
106
+
107
+ def __len__(self):
108
+ if self.length is None:
109
+ return len(self.data)
110
+ else:
111
+ return self.length
112
+
113
+ def shape(self):
114
+ if self.height is not None and self.width is not None:
115
+ return self.height, self.width
116
+ else:
117
+ height, width, _ = self.__getitem__(0).shape
118
+ return height, width
119
+
120
+ def __getitem__(self, item):
121
+ frame = self.data.__getitem__(item)
122
+ width, height = frame.size
123
+ if self.height is not None and self.width is not None:
124
+ if self.height != height or self.width != width:
125
+ frame = crop_and_resize(frame, self.height, self.width)
126
+ return frame
127
+
128
+ def __del__(self):
129
+ pass
130
+
131
+ def save_images(self, folder):
132
+ os.makedirs(folder, exist_ok=True)
133
+ for i in tqdm(range(self.__len__()), desc="Saving images"):
134
+ frame = self.__getitem__(i)
135
+ frame.save(os.path.join(folder, f"{i}.png"))
136
+
137
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
138
+ writer = imageio.get_writer(save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params)
139
+ for frame in tqdm(frames, desc="Saving video"):
140
+ frame = np.array(frame)
141
+ writer.append_data(frame)
142
+ writer.close()
143
+ # def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
144
+ # writer = imageio.get_writer(save_path, fps=fps, quality=quality, ffmpeg_params=["-crf", "0", "-preset", "veryslow"])
145
+ # for frame in tqdm(frames, desc="Saving video"):
146
+ # frame = np.array(frame)
147
+ # writer.append_data(frame)
148
+ # writer.close()
149
+
150
+ # def save_video_h264(frames, save_path, fps, ffmpeg_params=None):
151
+ # import imageio.v3 as iio
152
+ # from tqdm import tqdm
153
+ # import numpy as np
154
+
155
+ # if ffmpeg_params is None:
156
+ # ffmpeg_params = ["-crf", "0", "-preset", "ultrafast"]  # lossless H.264
157
+
158
+ # writer = iio.get_writer(save_path, fps=fps, codec="libx264", ffmpeg_params=ffmpeg_params)
159
+ # for frame in tqdm(frames, desc="Saving video"):
160
+ # writer.append_data(np.array(frame))
161
+ # writer.close()
162
+
163
+
164
+
165
+ def save_frames(frames, save_path):
166
+ os.makedirs(save_path, exist_ok=True)
167
+ for i, frame in enumerate(tqdm(frames, desc="Saving images")):
168
+ frame.save(os.path.join(save_path, f"{i}.png"))
169
+
170
+
171
+ if __name__=='__main__':
172
+ frames = [Image.fromarray(np.random.randint(0, 256, (512, 512, 3), dtype=np.uint8)) for i in range(81)]
173
+ save_video(frames,"haha.mp4",23,5)
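Putting the pieces together: VideoData handles reading and per-frame crop/resize, and save_video re-encodes the result. A short sketch assuming an ./input.mp4 exists (the paths and fps here are placeholders):

```python
# Re-encode an existing clip at 512x512 using the helpers defined above.
from diffsynth.data.video import VideoData, save_video

video = VideoData(video_file="./input.mp4", height=512, width=512)
frames = video.raw_data()                    # list of PIL images, cropped and resized
save_video(frames, "./resized.mp4", fps=23, quality=9)
```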
diffsynth/pipelines/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .wan_video import WanVideoPipeline
diffsynth/pipelines/base.py ADDED
@@ -0,0 +1,127 @@
1
+ import torch
2
+ import numpy as np
3
+ from PIL import Image
4
+ from torchvision.transforms import GaussianBlur
5
+
6
+
7
+
8
+ class BasePipeline(torch.nn.Module):
9
+
10
+ def __init__(self, device="cuda", torch_dtype=torch.float16, height_division_factor=64, width_division_factor=64):
11
+ super().__init__()
12
+ self.device = device
13
+ self.torch_dtype = torch_dtype
14
+ self.height_division_factor = height_division_factor
15
+ self.width_division_factor = width_division_factor
16
+ self.cpu_offload = False
17
+ self.model_names = []
18
+
19
+
20
+ def check_resize_height_width(self, height, width):
21
+ if height % self.height_division_factor != 0:
22
+ height = (height + self.height_division_factor - 1) // self.height_division_factor * self.height_division_factor
23
+ print(f"The height cannot be evenly divided by {self.height_division_factor}. We round it up to {height}.")
24
+ if width % self.width_division_factor != 0:
25
+ width = (width + self.width_division_factor - 1) // self.width_division_factor * self.width_division_factor
26
+ print(f"The width cannot be evenly divided by {self.width_division_factor}. We round it up to {width}.")
27
+ return height, width
28
+
29
+
30
+ def preprocess_image(self, image):
31
+ image = torch.Tensor(np.array(image, dtype=np.float32) * (2 / 255) - 1).permute(2, 0, 1).unsqueeze(0)
32
+ return image
33
+
34
+
35
+ def preprocess_images(self, images):
36
+ return [self.preprocess_image(image) for image in images]
37
+
38
+
39
+ def vae_output_to_image(self, vae_output):
40
+ image = vae_output[0].cpu().float().permute(1, 2, 0).numpy()
41
+ image = Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8"))
42
+ return image
43
+
44
+
45
+ def vae_output_to_video(self, vae_output):
46
+ video = vae_output.cpu().permute(1, 2, 0).numpy()
47
+ video = [Image.fromarray(((image / 2 + 0.5).clip(0, 1) * 255).astype("uint8")) for image in video]
48
+ return video
49
+
50
+
51
+ def merge_latents(self, value, latents, masks, scales, blur_kernel_size=33, blur_sigma=10.0):
52
+ if len(latents) > 0:
53
+ blur = GaussianBlur(kernel_size=blur_kernel_size, sigma=blur_sigma)
54
+ height, width = value.shape[-2:]
55
+ weight = torch.ones_like(value)
56
+ for latent, mask, scale in zip(latents, masks, scales):
57
+ mask = self.preprocess_image(mask.resize((width, height))).mean(dim=1, keepdim=True) > 0
58
+ mask = mask.repeat(1, latent.shape[1], 1, 1).to(dtype=latent.dtype, device=latent.device)
59
+ mask = blur(mask)
60
+ value += latent * mask * scale
61
+ weight += mask * scale
62
+ value /= weight
63
+ return value
64
+
65
+
66
+ def control_noise_via_local_prompts(self, prompt_emb_global, prompt_emb_locals, masks, mask_scales, inference_callback, special_kwargs=None, special_local_kwargs_list=None):
67
+ if special_kwargs is None:
68
+ noise_pred_global = inference_callback(prompt_emb_global)
69
+ else:
70
+ noise_pred_global = inference_callback(prompt_emb_global, special_kwargs)
71
+ if special_local_kwargs_list is None:
72
+ noise_pred_locals = [inference_callback(prompt_emb_local) for prompt_emb_local in prompt_emb_locals]
73
+ else:
74
+ noise_pred_locals = [inference_callback(prompt_emb_local, special_kwargs) for prompt_emb_local, special_kwargs in zip(prompt_emb_locals, special_local_kwargs_list)]
75
+ noise_pred = self.merge_latents(noise_pred_global, noise_pred_locals, masks, mask_scales)
76
+ return noise_pred
77
+
78
+
79
+ def extend_prompt(self, prompt, local_prompts, masks, mask_scales):
80
+ local_prompts = local_prompts or []
81
+ masks = masks or []
82
+ mask_scales = mask_scales or []
83
+ extended_prompt_dict = self.prompter.extend_prompt(prompt)
84
+ prompt = extended_prompt_dict.get("prompt", prompt)
85
+ local_prompts += extended_prompt_dict.get("prompts", [])
86
+ masks += extended_prompt_dict.get("masks", [])
87
+ mask_scales += [100.0] * len(extended_prompt_dict.get("masks", []))
88
+ return prompt, local_prompts, masks, mask_scales
89
+
90
+
91
+ def enable_cpu_offload(self):
92
+ self.cpu_offload = True
93
+
94
+
95
+ def load_models_to_device(self, loadmodel_names=[]):
96
+ # only load models to device if cpu_offload is enabled
97
+ if not self.cpu_offload:
98
+ return
99
+ # offload the unneeded models to cpu
100
+ for model_name in self.model_names:
101
+ if model_name not in loadmodel_names:
102
+ model = getattr(self, model_name)
103
+ if model is not None:
104
+ if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
105
+ for module in model.modules():
106
+ if hasattr(module, "offload"):
107
+ module.offload()
108
+ else:
109
+ model.cpu()
110
+ # load the needed models to device
111
+ for model_name in loadmodel_names:
112
+ model = getattr(self, model_name)
113
+ if model is not None:
114
+ if hasattr(model, "vram_management_enabled") and model.vram_management_enabled:
115
+ for module in model.modules():
116
+ if hasattr(module, "onload"):
117
+ module.onload()
118
+ else:
119
+ model.to(self.device)
120
+ # fresh the cuda cache
121
+ torch.cuda.empty_cache()
122
+
123
+
124
+ def generate_noise(self, shape, seed=None, device="cpu", dtype=torch.float16):
125
+ generator = None if seed is None else torch.Generator(device).manual_seed(seed)
126
+ noise = torch.randn(shape, generator=generator, device=device, dtype=dtype)
127
+ return noise
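The rounding in check_resize_height_width above is plain ceiling arithmetic: the requested size is bumped up to the next multiple of the division factor. A standalone sketch (the 720/64 and 480/16 values are illustrative, not taken from the repo):

def round_up(value, factor):
    # Smallest multiple of `factor` that is >= `value`.
    return (value + factor - 1) // factor * factor

assert round_up(720, 64) == 768   # 720 is not divisible by 64, so it is bumped up
assert round_up(480, 16) == 480   # already a multiple, left unchanged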
diffsynth/pipelines/wan_video.py ADDED
@@ -0,0 +1,290 @@
1
+ from ..models import ModelManager
2
+ from ..models.wan_video_dit import WanModel
3
+ from ..models.wan_video_text_encoder import WanTextEncoder
4
+ from ..models.wan_video_vae import WanVideoVAE
5
+ from ..models.wan_video_image_encoder import WanImageEncoder
6
+ from ..schedulers.flow_match import FlowMatchScheduler
7
+ from .base import BasePipeline
8
+ from ..prompters import WanPrompter
9
+ import torch, os
10
+ from einops import rearrange
11
+ import numpy as np
12
+ from PIL import Image
13
+ from tqdm import tqdm
14
+
15
+ from ..vram_management import enable_vram_management, AutoWrappedModule, AutoWrappedLinear
16
+ from ..models.wan_video_text_encoder import T5RelativeEmbedding, T5LayerNorm
17
+ from ..models.wan_video_dit import WanLayerNorm, WanRMSNorm
18
+ from ..models.wan_video_vae import RMS_norm, CausalConv3d, Upsample
19
+
20
+
21
+ class WanVideoPipeline(BasePipeline):
22
+
23
+ def __init__(self, device="cuda", torch_dtype=torch.float16, tokenizer_path=None):
24
+ super().__init__(device=device, torch_dtype=torch_dtype)
25
+ self.scheduler = FlowMatchScheduler(shift=5, sigma_min=0.0, extra_one_step=True)
26
+ self.prompter = WanPrompter(tokenizer_path=tokenizer_path)
27
+ self.text_encoder: WanTextEncoder = None
28
+ self.image_encoder: WanImageEncoder = None
29
+ self.dit: WanModel = None
30
+ self.vae: WanVideoVAE = None
31
+ self.model_names = ['text_encoder', 'dit', 'vae']
32
+ self.height_division_factor = 16
33
+ self.width_division_factor = 16
34
+
35
+
36
+ def enable_vram_management(self, num_persistent_param_in_dit=None):
37
+ dtype = next(iter(self.text_encoder.parameters())).dtype
38
+ enable_vram_management(
39
+ self.text_encoder,
40
+ module_map = {
41
+ torch.nn.Linear: AutoWrappedLinear,
42
+ torch.nn.Embedding: AutoWrappedModule,
43
+ T5RelativeEmbedding: AutoWrappedModule,
44
+ T5LayerNorm: AutoWrappedModule,
45
+ },
46
+ module_config = dict(
47
+ offload_dtype=dtype,
48
+ offload_device="cpu",
49
+ onload_dtype=dtype,
50
+ onload_device="cpu",
51
+ computation_dtype=self.torch_dtype,
52
+ computation_device=self.device,
53
+ ),
54
+ )
55
+ dtype = next(iter(self.dit.parameters())).dtype
56
+ enable_vram_management(
57
+ self.dit,
58
+ module_map = {
59
+ torch.nn.Linear: AutoWrappedLinear,
60
+ torch.nn.Conv3d: AutoWrappedModule,
61
+ torch.nn.LayerNorm: AutoWrappedModule,
62
+ WanLayerNorm: AutoWrappedModule,
63
+ WanRMSNorm: AutoWrappedModule,
64
+ },
65
+ module_config = dict(
66
+ offload_dtype=dtype,
67
+ offload_device="cpu",
68
+ onload_dtype=dtype,
69
+ onload_device=self.device,
70
+ computation_dtype=self.torch_dtype,
71
+ computation_device=self.device,
72
+ ),
73
+ max_num_param=num_persistent_param_in_dit,
74
+ overflow_module_config = dict(
75
+ offload_dtype=dtype,
76
+ offload_device="cpu",
77
+ onload_dtype=dtype,
78
+ onload_device="cpu",
79
+ computation_dtype=self.torch_dtype,
80
+ computation_device=self.device,
81
+ ),
82
+ )
83
+ dtype = next(iter(self.vae.parameters())).dtype
84
+ enable_vram_management(
85
+ self.vae,
86
+ module_map = {
87
+ torch.nn.Linear: AutoWrappedLinear,
88
+ torch.nn.Conv2d: AutoWrappedModule,
89
+ RMS_norm: AutoWrappedModule,
90
+ CausalConv3d: AutoWrappedModule,
91
+ Upsample: AutoWrappedModule,
92
+ torch.nn.SiLU: AutoWrappedModule,
93
+ torch.nn.Dropout: AutoWrappedModule,
94
+ },
95
+ module_config = dict(
96
+ offload_dtype=dtype,
97
+ offload_device="cpu",
98
+ onload_dtype=dtype,
99
+ onload_device=self.device,
100
+ computation_dtype=self.torch_dtype,
101
+ computation_device=self.device,
102
+ ),
103
+ )
104
+ if self.image_encoder is not None:
105
+ dtype = next(iter(self.image_encoder.parameters())).dtype
106
+ enable_vram_management(
107
+ self.image_encoder,
108
+ module_map = {
109
+ torch.nn.Linear: AutoWrappedLinear,
110
+ torch.nn.Conv2d: AutoWrappedModule,
111
+ torch.nn.LayerNorm: AutoWrappedModule,
112
+ },
113
+ module_config = dict(
114
+ offload_dtype=dtype,
115
+ offload_device="cpu",
116
+ onload_dtype=dtype,
117
+ onload_device="cpu",
118
+ computation_dtype=self.torch_dtype,
119
+ computation_device=self.device,
120
+ ),
121
+ )
122
+ self.enable_cpu_offload()
123
+
124
+
125
+ def fetch_models(self, model_manager: ModelManager):
126
+ text_encoder_model_and_path = model_manager.fetch_model("wan_video_text_encoder", require_model_path=True)
127
+ if text_encoder_model_and_path is not None:
128
+ self.text_encoder, tokenizer_path = text_encoder_model_and_path
129
+ self.prompter.fetch_models(self.text_encoder)
130
+ self.prompter.fetch_tokenizer(os.path.join(os.path.dirname(tokenizer_path), "google/umt5-xxl"))
131
+ self.dit = model_manager.fetch_model("wan_video_dit")
132
+ self.vae = model_manager.fetch_model("wan_video_vae")
133
+ self.image_encoder = model_manager.fetch_model("wan_video_image_encoder")
134
+
135
+
136
+ @staticmethod
137
+ def from_model_manager(model_manager: ModelManager, torch_dtype=None, device=None):
138
+ if device is None: device = model_manager.device
139
+ if torch_dtype is None: torch_dtype = model_manager.torch_dtype
140
+ pipe = WanVideoPipeline(device=device, torch_dtype=torch_dtype)
141
+ pipe.fetch_models(model_manager)
142
+ return pipe
143
+
144
+
145
+ def denoising_model(self):
146
+ return self.dit
147
+
148
+
149
+ def encode_prompt(self, prompt, positive=True):
150
+ prompt_emb = self.prompter.encode_prompt(prompt, positive=positive)
151
+ return {"context": prompt_emb}
152
+
153
+
154
+ def encode_image(self, image, num_frames, height, width):
155
+ with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
156
+ image = self.preprocess_image(image.resize((width, height))).to(self.device)
157
+ clip_context = self.image_encoder.encode_image([image])
158
+ msk = torch.ones(1, num_frames, height//8, width//8, device=self.device)
159
+ msk[:, 1:] = 0
160
+ msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
161
+ msk = msk.view(1, msk.shape[1] // 4, 4, height//8, width//8)
162
+ msk = msk.transpose(1, 2)[0]
163
+ y = self.vae.encode([torch.concat([image.transpose(0, 1), torch.zeros(3, num_frames-1, height, width).to(image.device)], dim=1)], device=self.device)[0]
164
+ y = torch.concat([msk, y])
165
+ return {"clip_fea": clip_context, "y": [y]}
166
+
167
+
168
+ def tensor2video(self, frames):
169
+ frames = rearrange(frames, "C T H W -> T H W C")
170
+ frames = ((frames.float() + 1) * 127.5).clip(0, 255).cpu().numpy().astype(np.uint8)
171
+ frames = [Image.fromarray(frame) for frame in frames]
172
+ return frames
173
+
174
+
175
+ def prepare_extra_input(self, latents=None):
176
+ return {"seq_len": latents.shape[2] * latents.shape[3] * latents.shape[4] // 4}
177
+
178
+
179
+ def encode_video(self, input_video, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
180
+ with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
181
+ latents = self.vae.encode(input_video, device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
182
+ return latents
183
+
184
+
185
+ def decode_video(self, latents, tiled=True, tile_size=(34, 34), tile_stride=(18, 16)):
186
+ with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
187
+ frames = self.vae.decode(latents, device=self.device, tiled=tiled, tile_size=tile_size, tile_stride=tile_stride)
188
+ return frames
189
+
190
+ def set_ip(self, local_path):
191
+ pass
192
+ @torch.no_grad()
193
+ def __call__(
194
+ self,
195
+ prompt,
196
+ negative_prompt="",
197
+ input_image=None,
198
+ input_video=None,
199
+ denoising_strength=1.0,
200
+ seed=None,
201
+ rand_device="cpu",
202
+ height=480,
203
+ width=832,
204
+ num_frames=81,
205
+ cfg_scale=5.0,
206
+ audio_cfg_scale=None,
207
+ num_inference_steps=50,
208
+ sigma_shift=5.0,
209
+ tiled=True,
210
+ tile_size=(30, 52),
211
+ tile_stride=(15, 26),
212
+ progress_bar_cmd=tqdm,
213
+ progress_bar_st=None,
214
+ **kwargs,
215
+ ):
216
+ # Parameter check
217
+ height, width = self.check_resize_height_width(height, width)
218
+ if num_frames % 4 != 1:
219
+ num_frames = (num_frames + 2) // 4 * 4 + 1
220
+ print(f"Only `num_frames % 4 == 1` is acceptable. We round it up to {num_frames}.")
221
+
222
+ # Tiler parameters
223
+ tiler_kwargs = {"tiled": tiled, "tile_size": tile_size, "tile_stride": tile_stride}
224
+
225
+ # Scheduler
226
+ self.scheduler.set_timesteps(num_inference_steps, denoising_strength, shift=sigma_shift)
227
+
228
+ # Initialize noise
229
+ noise = self.generate_noise((1, 16, (num_frames - 1) // 4 + 1, height//8, width//8), seed=seed, device=rand_device, dtype=torch.float32).to(self.device)
230
+ if input_video is not None:
231
+ self.load_models_to_device(['vae'])
232
+ input_video = self.preprocess_images(input_video)
233
+ input_video = torch.stack(input_video, dim=2)
234
+ latents = self.encode_video(input_video, **tiler_kwargs).to(dtype=noise.dtype, device=noise.device)
235
+ latents = self.scheduler.add_noise(latents, noise, timestep=self.scheduler.timesteps[0])
236
+ else:
237
+ latents = noise
238
+
239
+ # Encode prompts
240
+ self.load_models_to_device(["text_encoder"])
241
+ prompt_emb_posi = self.encode_prompt(prompt, positive=True)
242
+ if cfg_scale != 1.0:
243
+ prompt_emb_nega = self.encode_prompt(negative_prompt, positive=False)
244
+
245
+ # Encode image
246
+ if input_image is not None and self.image_encoder is not None:
247
+ self.load_models_to_device(["image_encoder", "vae"])
248
+ image_emb = self.encode_image(input_image, num_frames, height, width)
249
+ else:
250
+ image_emb = {}
251
+
252
+ # Extra input
253
+ extra_input = self.prepare_extra_input(latents)
254
+
255
+ # Denoise
256
+ self.load_models_to_device(["dit"])
257
+ with torch.amp.autocast(dtype=torch.bfloat16, device_type=torch.device(self.device).type):
258
+ for progress_id, timestep in enumerate(progress_bar_cmd(self.scheduler.timesteps)):
259
+ timestep = timestep.unsqueeze(0).to(dtype=torch.float32, device=self.device)
260
+
261
+ # Inference
262
+ noise_pred_posi = self.dit(latents, timestep=timestep, **prompt_emb_posi, **image_emb, **extra_input, **kwargs) # (zt,audio,prompt)
263
+ if audio_cfg_scale is not None:
264
+ audio_scale = kwargs['audio_scale']
265
+ kwargs['audio_scale'] = 0.0
266
+ noise_pred_noaudio = self.dit(latents, timestep=timestep, **prompt_emb_posi, **image_emb, **extra_input, **kwargs) #(zt,0,prompt)
267
+ # kwargs['ip_scale'] = ip_scale
268
+ if cfg_scale != 1.0: #prompt cfg
269
+ noise_pred_no_cond = self.dit(latents, timestep=timestep, **prompt_emb_nega, **image_emb, **extra_input, **kwargs) # (zt,0,0)
270
+ noise_pred = noise_pred_no_cond + cfg_scale * (noise_pred_noaudio - noise_pred_no_cond) + audio_cfg_scale * (noise_pred_posi - noise_pred_noaudio)
271
+ else:
272
+ noise_pred = noise_pred_noaudio + audio_cfg_scale * (noise_pred_posi - noise_pred_noaudio)
273
+ kwargs['audio_scale'] = audio_scale
274
+ else:
275
+ if cfg_scale != 1.0:
276
+ noise_pred_nega = self.dit(latents, timestep=timestep, **prompt_emb_nega, **image_emb, **extra_input, **kwargs) #(zt,audio,0)
277
+ noise_pred = noise_pred_nega + cfg_scale * (noise_pred_posi - noise_pred_nega)
278
+ else:
279
+ noise_pred = noise_pred_posi
280
+
281
+ # Scheduler
282
+ latents = self.scheduler.step(noise_pred, self.scheduler.timesteps[progress_id], latents)
283
+
284
+ # Decode
285
+ self.load_models_to_device(['vae'])
286
+ frames = self.decode_video(latents, **tiler_kwargs)
287
+ self.load_models_to_device([])
288
+ frames = self.tensor2video(frames[0])
289
+
290
+ return frames
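When audio_cfg_scale is set, the denoising loop above combines three DiT predictions with two nested classifier-free-guidance terms: prompt guidance scaled by cfg_scale and audio guidance scaled by audio_cfg_scale. A minimal sketch of that combination on dummy tensors standing in for the three predictions:

import torch

# Stand-ins for the predictions computed in the loop: (zt, 0, 0), (zt, 0, prompt), (zt, audio, prompt).
noise_pred_no_cond = torch.zeros(2, 4)
noise_pred_noaudio = torch.ones(2, 4)
noise_pred_posi = 2 * torch.ones(2, 4)
cfg_scale, audio_cfg_scale = 5.0, 5.0

noise_pred = (noise_pred_no_cond
              + cfg_scale * (noise_pred_noaudio - noise_pred_no_cond)
              + audio_cfg_scale * (noise_pred_posi - noise_pred_noaudio))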
diffsynth/prompters/__init__.py ADDED
@@ -0,0 +1 @@
+ from .wan_prompter import WanPrompter
diffsynth/prompters/base_prompter.py ADDED
@@ -0,0 +1,70 @@
1
+ from ..models.model_manager import ModelManager
2
+ import torch
3
+
4
+
5
+
6
+ def tokenize_long_prompt(tokenizer, prompt, max_length=None):
7
+ # Get model_max_length from self.tokenizer
8
+ length = tokenizer.model_max_length if max_length is None else max_length
9
+
10
+ # To avoid the truncation warning, temporarily set tokenizer.model_max_length to a huge value.
11
+ tokenizer.model_max_length = 99999999
12
+
13
+ # Tokenize it!
14
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
15
+
16
+ # Determine the real length.
17
+ max_length = (input_ids.shape[1] + length - 1) // length * length
18
+
19
+ # Restore tokenizer.model_max_length
20
+ tokenizer.model_max_length = length
21
+
22
+ # Tokenize it again with fixed length.
23
+ input_ids = tokenizer(
24
+ prompt,
25
+ return_tensors="pt",
26
+ padding="max_length",
27
+ max_length=max_length,
28
+ truncation=True
29
+ ).input_ids
30
+
31
+ # Reshape input_ids to fit the text encoder.
32
+ num_sentence = input_ids.shape[1] // length
33
+ input_ids = input_ids.reshape((num_sentence, length))
34
+
35
+ return input_ids
36
+
37
+
38
+
39
+ class BasePrompter:
40
+ def __init__(self):
41
+ self.refiners = []
42
+ self.extenders = []
43
+
44
+
45
+ def load_prompt_refiners(self, model_manager: ModelManager, refiner_classes=[]):
46
+ for refiner_class in refiner_classes:
47
+ refiner = refiner_class.from_model_manager(model_manager)
48
+ self.refiners.append(refiner)
49
+
50
+ def load_prompt_extenders(self,model_manager:ModelManager,extender_classes=[]):
51
+ for extender_class in extender_classes:
52
+ extender = extender_class.from_model_manager(model_manager)
53
+ self.extenders.append(extender)
54
+
55
+
56
+ @torch.no_grad()
57
+ def process_prompt(self, prompt, positive=True):
58
+ if isinstance(prompt, list):
59
+ prompt = [self.process_prompt(prompt_, positive=positive) for prompt_ in prompt]
60
+ else:
61
+ for refiner in self.refiners:
62
+ prompt = refiner(prompt, positive=positive)
63
+ return prompt
64
+
65
+ @torch.no_grad()
66
+ def extend_prompt(self, prompt:str, positive=True):
67
+ extended_prompt = dict(prompt=prompt)
68
+ for extender in self.extenders:
69
+ extended_prompt = extender(extended_prompt)
70
+ return extended_prompt
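tokenize_long_prompt pads the token ids up to the next multiple of model_max_length and reshapes them into one row per chunk, so an arbitrarily long prompt becomes several fixed-length sentences for the text encoder. A worked example of the length arithmetic (300 ids and a 128-token limit are illustrative values):

length = 128                                             # tokenizer.model_max_length
num_ids = 300                                            # ids produced by the first, unpadded pass
max_length = (num_ids + length - 1) // length * length   # 384
num_sentence = max_length // length                      # 3 chunks of 128 ids each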
diffsynth/prompters/wan_prompter.py ADDED
@@ -0,0 +1,108 @@
1
+ from .base_prompter import BasePrompter
2
+ from ..models.wan_video_text_encoder import WanTextEncoder
3
+ from transformers import AutoTokenizer
4
+ import os, torch
5
+ import ftfy
6
+ import html
7
+ import string
8
+ import regex as re
9
+
10
+
11
+ def basic_clean(text):
12
+ text = ftfy.fix_text(text)
13
+ text = html.unescape(html.unescape(text))
14
+ return text.strip()
15
+
16
+
17
+ def whitespace_clean(text):
18
+ text = re.sub(r'\s+', ' ', text)
19
+ text = text.strip()
20
+ return text
21
+
22
+
23
+ def canonicalize(text, keep_punctuation_exact_string=None):
24
+ text = text.replace('_', ' ')
25
+ if keep_punctuation_exact_string:
26
+ text = keep_punctuation_exact_string.join(
27
+ part.translate(str.maketrans('', '', string.punctuation))
28
+ for part in text.split(keep_punctuation_exact_string))
29
+ else:
30
+ text = text.translate(str.maketrans('', '', string.punctuation))
31
+ text = text.lower()
32
+ text = re.sub(r'\s+', ' ', text)
33
+ return text.strip()
34
+
35
+
36
+ class HuggingfaceTokenizer:
37
+
38
+ def __init__(self, name, seq_len=None, clean=None, **kwargs):
39
+ assert clean in (None, 'whitespace', 'lower', 'canonicalize')
40
+ self.name = name
41
+ self.seq_len = seq_len
42
+ self.clean = clean
43
+
44
+ # init tokenizer
45
+ self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
46
+ self.vocab_size = self.tokenizer.vocab_size
47
+
48
+ def __call__(self, sequence, **kwargs):
49
+ return_mask = kwargs.pop('return_mask', False)
50
+
51
+ # arguments
52
+ _kwargs = {'return_tensors': 'pt'}
53
+ if self.seq_len is not None:
54
+ _kwargs.update({
55
+ 'padding': 'max_length',
56
+ 'truncation': True,
57
+ 'max_length': self.seq_len
58
+ })
59
+ _kwargs.update(**kwargs)
60
+
61
+ # tokenization
62
+ if isinstance(sequence, str):
63
+ sequence = [sequence]
64
+ if self.clean:
65
+ sequence = [self._clean(u) for u in sequence]
66
+ ids = self.tokenizer(sequence, **_kwargs)
67
+
68
+ # output
69
+ if return_mask:
70
+ return ids.input_ids, ids.attention_mask
71
+ else:
72
+ return ids.input_ids
73
+
74
+ def _clean(self, text):
75
+ if self.clean == 'whitespace':
76
+ text = whitespace_clean(basic_clean(text))
77
+ elif self.clean == 'lower':
78
+ text = whitespace_clean(basic_clean(text)).lower()
79
+ elif self.clean == 'canonicalize':
80
+ text = canonicalize(basic_clean(text))
81
+ return text
82
+
83
+
84
+ class WanPrompter(BasePrompter):
85
+
86
+ def __init__(self, tokenizer_path=None, text_len=512):
87
+ super().__init__()
88
+ self.text_len = text_len
89
+ self.text_encoder = None
90
+ self.fetch_tokenizer(tokenizer_path)
91
+
92
+ def fetch_tokenizer(self, tokenizer_path=None):
93
+ if tokenizer_path is not None:
94
+ self.tokenizer = HuggingfaceTokenizer(name=tokenizer_path, seq_len=self.text_len, clean='whitespace')
95
+
96
+ def fetch_models(self, text_encoder: WanTextEncoder = None):
97
+ self.text_encoder = text_encoder
98
+
99
+ def encode_prompt(self, prompt, positive=True, device="cuda"):
100
+ prompt = self.process_prompt(prompt, positive=positive)
101
+
102
+ ids, mask = self.tokenizer(prompt, return_mask=True, add_special_tokens=True)
103
+ ids = ids.to(device)
104
+ mask = mask.to(device)
105
+ seq_lens = mask.gt(0).sum(dim=1).long()
106
+ prompt_emb = self.text_encoder(ids, mask)
107
+ prompt_emb = [u[:v] for u, v in zip(prompt_emb, seq_lens)]
108
+ return prompt_emb
diffsynth/schedulers/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .ddim import EnhancedDDIMScheduler
+ from .continuous_ode import ContinuousODEScheduler
+ from .flow_match import FlowMatchScheduler
diffsynth/schedulers/continuous_ode.py ADDED
@@ -0,0 +1,59 @@
+ import torch
+ 
+ 
+ class ContinuousODEScheduler():
+ 
+     def __init__(self, num_inference_steps=100, sigma_max=700.0, sigma_min=0.002, rho=7.0):
+         self.sigma_max = sigma_max
+         self.sigma_min = sigma_min
+         self.rho = rho
+         self.set_timesteps(num_inference_steps)
+ 
+     def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, **kwargs):
+         ramp = torch.linspace(1 - denoising_strength, 1, num_inference_steps)
+         min_inv_rho = torch.pow(torch.tensor((self.sigma_min,)), (1 / self.rho))
+         max_inv_rho = torch.pow(torch.tensor((self.sigma_max,)), (1 / self.rho))
+         self.sigmas = torch.pow(max_inv_rho + ramp * (min_inv_rho - max_inv_rho), self.rho)
+         self.timesteps = torch.log(self.sigmas) * 0.25
+ 
+     def step(self, model_output, timestep, sample, to_final=False):
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         sample *= (sigma*sigma + 1).sqrt()
+         estimated_sample = -sigma / (sigma*sigma + 1).sqrt() * model_output + 1 / (sigma*sigma + 1) * sample
+         if to_final or timestep_id + 1 >= len(self.timesteps):
+             prev_sample = estimated_sample
+         else:
+             sigma_ = self.sigmas[timestep_id + 1]
+             derivative = 1 / sigma * (sample - estimated_sample)
+             prev_sample = sample + derivative * (sigma_ - sigma)
+             prev_sample /= (sigma_*sigma_ + 1).sqrt()
+         return prev_sample
+ 
+     def return_to_timestep(self, timestep, sample, sample_stablized):
+         # This scheduler doesn't support this function.
+         pass
+ 
+     def add_noise(self, original_samples, noise, timestep):
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         sample = (original_samples + noise * sigma) / (sigma*sigma + 1).sqrt()
+         return sample
+ 
+     def training_target(self, sample, noise, timestep):
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         target = (-(sigma*sigma + 1).sqrt() / sigma + 1 / (sigma*sigma + 1).sqrt() / sigma) * sample + 1 / (sigma*sigma + 1).sqrt() * noise
+         return target
+ 
+     def training_weight(self, timestep):
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         weight = (1 + sigma*sigma).sqrt() / sigma
+         return weight
diffsynth/schedulers/ddim.py ADDED
@@ -0,0 +1,105 @@
1
+ import torch, math
2
+
3
+
4
+ class EnhancedDDIMScheduler():
5
+
6
+ def __init__(self, num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", prediction_type="epsilon", rescale_zero_terminal_snr=False):
7
+ self.num_train_timesteps = num_train_timesteps
8
+ if beta_schedule == "scaled_linear":
9
+ betas = torch.square(torch.linspace(math.sqrt(beta_start), math.sqrt(beta_end), num_train_timesteps, dtype=torch.float32))
10
+ elif beta_schedule == "linear":
11
+ betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32)
12
+ else:
13
+ raise NotImplementedError(f"{beta_schedule} is not implemented")
14
+ self.alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
15
+ if rescale_zero_terminal_snr:
16
+ self.alphas_cumprod = self.rescale_zero_terminal_snr(self.alphas_cumprod)
17
+ self.alphas_cumprod = self.alphas_cumprod.tolist()
18
+ self.set_timesteps(10)
19
+ self.prediction_type = prediction_type
20
+
21
+
22
+ def rescale_zero_terminal_snr(self, alphas_cumprod):
23
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
24
+
25
+ # Store old values.
26
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
27
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
28
+
29
+ # Shift so the last timestep is zero.
30
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
31
+
32
+ # Scale so the first timestep is back to the old value.
33
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
34
+
35
+ # Square to recover the rescaled alphas_cumprod
36
+ alphas_bar = alphas_bar_sqrt.square() # Revert sqrt
37
+
38
+ return alphas_bar
39
+
40
+
41
+ def set_timesteps(self, num_inference_steps, denoising_strength=1.0, **kwargs):
42
+ # The timesteps are aligned to 999...0, which is different from other implementations,
43
+ # but I think this implementation is more reasonable in theory.
44
+ max_timestep = max(round(self.num_train_timesteps * denoising_strength) - 1, 0)
45
+ num_inference_steps = min(num_inference_steps, max_timestep + 1)
46
+ if num_inference_steps == 1:
47
+ self.timesteps = torch.Tensor([max_timestep])
48
+ else:
49
+ step_length = max_timestep / (num_inference_steps - 1)
50
+ self.timesteps = torch.Tensor([round(max_timestep - i*step_length) for i in range(num_inference_steps)])
51
+
52
+
53
+ def denoise(self, model_output, sample, alpha_prod_t, alpha_prod_t_prev):
54
+ if self.prediction_type == "epsilon":
55
+ weight_e = math.sqrt(1 - alpha_prod_t_prev) - math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t) / alpha_prod_t)
56
+ weight_x = math.sqrt(alpha_prod_t_prev / alpha_prod_t)
57
+ prev_sample = sample * weight_x + model_output * weight_e
58
+ elif self.prediction_type == "v_prediction":
59
+ weight_e = -math.sqrt(alpha_prod_t_prev * (1 - alpha_prod_t)) + math.sqrt(alpha_prod_t * (1 - alpha_prod_t_prev))
60
+ weight_x = math.sqrt(alpha_prod_t * alpha_prod_t_prev) + math.sqrt((1 - alpha_prod_t) * (1 - alpha_prod_t_prev))
61
+ prev_sample = sample * weight_x + model_output * weight_e
62
+ else:
63
+ raise NotImplementedError(f"{self.prediction_type} is not implemented")
64
+ return prev_sample
65
+
66
+
67
+ def step(self, model_output, timestep, sample, to_final=False):
68
+ alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
69
+ if isinstance(timestep, torch.Tensor):
70
+ timestep = timestep.cpu()
71
+ timestep_id = torch.argmin((self.timesteps - timestep).abs())
72
+ if to_final or timestep_id + 1 >= len(self.timesteps):
73
+ alpha_prod_t_prev = 1.0
74
+ else:
75
+ timestep_prev = int(self.timesteps[timestep_id + 1])
76
+ alpha_prod_t_prev = self.alphas_cumprod[timestep_prev]
77
+
78
+ return self.denoise(model_output, sample, alpha_prod_t, alpha_prod_t_prev)
79
+
80
+
81
+ def return_to_timestep(self, timestep, sample, sample_stablized):
82
+ alpha_prod_t = self.alphas_cumprod[int(timestep.flatten().tolist()[0])]
83
+ noise_pred = (sample - math.sqrt(alpha_prod_t) * sample_stablized) / math.sqrt(1 - alpha_prod_t)
84
+ return noise_pred
85
+
86
+
87
+ def add_noise(self, original_samples, noise, timestep):
88
+ sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
89
+ sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
90
+ noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise
91
+ return noisy_samples
92
+
93
+
94
+ def training_target(self, sample, noise, timestep):
95
+ if self.prediction_type == "epsilon":
96
+ return noise
97
+ else:
98
+ sqrt_alpha_prod = math.sqrt(self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
99
+ sqrt_one_minus_alpha_prod = math.sqrt(1 - self.alphas_cumprod[int(timestep.flatten().tolist()[0])])
100
+ target = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
101
+ return target
102
+
103
+
104
+ def training_weight(self, timestep):
105
+ return 1.0
diffsynth/schedulers/flow_match.py ADDED
@@ -0,0 +1,79 @@
+ import torch
+ 
+ 
+ class FlowMatchScheduler():
+ 
+     def __init__(self, num_inference_steps=100, num_train_timesteps=1000, shift=3.0, sigma_max=1.0, sigma_min=0.003/1.002, inverse_timesteps=False, extra_one_step=False, reverse_sigmas=False):
+         self.num_train_timesteps = num_train_timesteps
+         self.shift = shift
+         self.sigma_max = sigma_max
+         self.sigma_min = sigma_min
+         self.inverse_timesteps = inverse_timesteps
+         self.extra_one_step = extra_one_step
+         self.reverse_sigmas = reverse_sigmas
+         self.set_timesteps(num_inference_steps)
+ 
+     def set_timesteps(self, num_inference_steps=100, denoising_strength=1.0, training=False, shift=None):
+         if shift is not None:
+             self.shift = shift
+         sigma_start = self.sigma_min + (self.sigma_max - self.sigma_min) * denoising_strength
+         if self.extra_one_step:
+             self.sigmas = torch.linspace(sigma_start, self.sigma_min, num_inference_steps + 1)[:-1]
+         else:
+             self.sigmas = torch.linspace(sigma_start, self.sigma_min, num_inference_steps)
+         if self.inverse_timesteps:
+             self.sigmas = torch.flip(self.sigmas, dims=[0])
+         self.sigmas = self.shift * self.sigmas / (1 + (self.shift - 1) * self.sigmas)
+         if self.reverse_sigmas:
+             self.sigmas = 1 - self.sigmas
+         self.timesteps = self.sigmas * self.num_train_timesteps
+         if training:
+             x = self.timesteps
+             y = torch.exp(-2 * ((x - num_inference_steps / 2) / num_inference_steps) ** 2)
+             y_shifted = y - y.min()
+             bsmntw_weighing = y_shifted * (num_inference_steps / y_shifted.sum())
+             self.linear_timesteps_weights = bsmntw_weighing
+ 
+     def step(self, model_output, timestep, sample, to_final=False):
+         if isinstance(timestep, torch.Tensor):
+             timestep = timestep.cpu()
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         if to_final or timestep_id + 1 >= len(self.timesteps):
+             sigma_ = 1 if (self.inverse_timesteps or self.reverse_sigmas) else 0
+         else:
+             sigma_ = self.sigmas[timestep_id + 1]
+         prev_sample = sample + model_output * (sigma_ - sigma)
+         return prev_sample
+ 
+     def return_to_timestep(self, timestep, sample, sample_stablized):
+         if isinstance(timestep, torch.Tensor):
+             timestep = timestep.cpu()
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         model_output = (sample - sample_stablized) / sigma
+         return model_output
+ 
+     def add_noise(self, original_samples, noise, timestep):
+         if isinstance(timestep, torch.Tensor):
+             timestep = timestep.cpu()
+         timestep_id = torch.argmin((self.timesteps - timestep).abs())
+         sigma = self.sigmas[timestep_id]
+         sample = (1 - sigma) * original_samples + sigma * noise
+         return sample
+ 
+     def training_target(self, sample, noise, timestep):
+         target = noise - sample
+         return target
+ 
+     def training_weight(self, timestep):
+         timestep_id = torch.argmin((self.timesteps - timestep.to(self.timesteps.device)).abs())
+         weights = self.linear_timesteps_weights[timestep_id]
+         return weights
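The shift applied in set_timesteps warps the noise schedule as sigma' = shift * sigma / (1 + (shift - 1) * sigma), which concentrates sampling steps at high noise levels (WanVideoPipeline constructs this scheduler with shift=5). A small sketch of how the sigmas and timesteps come out, assuming the class defaults for sigma_min and an extra_one_step grid:

import torch

shift, num_inference_steps = 5.0, 30
sigmas = torch.linspace(1.0, 0.003 / 1.002, num_inference_steps + 1)[:-1]
sigmas = shift * sigmas / (1 + (shift - 1) * sigmas)   # warp toward sigma = 1
timesteps = sigmas * 1000                              # num_train_timesteps = 1000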
diffsynth/vram_management/__init__.py ADDED
@@ -0,0 +1 @@
+ from .layers import *
diffsynth/vram_management/layers.py ADDED
@@ -0,0 +1,95 @@
1
+ import torch, copy
2
+ from ..models.utils import init_weights_on_device
3
+
4
+
5
+ def cast_to(weight, dtype, device):
6
+ r = torch.empty_like(weight, dtype=dtype, device=device)
7
+ r.copy_(weight)
8
+ return r
9
+
10
+
11
+ class AutoWrappedModule(torch.nn.Module):
12
+ def __init__(self, module: torch.nn.Module, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device):
13
+ super().__init__()
14
+ self.module = module.to(dtype=offload_dtype, device=offload_device)
15
+ self.offload_dtype = offload_dtype
16
+ self.offload_device = offload_device
17
+ self.onload_dtype = onload_dtype
18
+ self.onload_device = onload_device
19
+ self.computation_dtype = computation_dtype
20
+ self.computation_device = computation_device
21
+ self.state = 0
22
+
23
+ def offload(self):
24
+ if self.state == 1 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
25
+ self.module.to(dtype=self.offload_dtype, device=self.offload_device)
26
+ self.state = 0
27
+
28
+ def onload(self):
29
+ if self.state == 0 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
30
+ self.module.to(dtype=self.onload_dtype, device=self.onload_device)
31
+ self.state = 1
32
+
33
+ def forward(self, *args, **kwargs):
34
+ if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
35
+ module = self.module
36
+ else:
37
+ module = copy.deepcopy(self.module).to(dtype=self.computation_dtype, device=self.computation_device)
38
+ return module(*args, **kwargs)
39
+
40
+
41
+ class AutoWrappedLinear(torch.nn.Linear):
42
+ def __init__(self, module: torch.nn.Linear, offload_dtype, offload_device, onload_dtype, onload_device, computation_dtype, computation_device):
43
+ with init_weights_on_device(device=torch.device("meta")):
44
+ super().__init__(in_features=module.in_features, out_features=module.out_features, bias=module.bias is not None, dtype=offload_dtype, device=offload_device)
45
+ self.weight = module.weight
46
+ self.bias = module.bias
47
+ self.offload_dtype = offload_dtype
48
+ self.offload_device = offload_device
49
+ self.onload_dtype = onload_dtype
50
+ self.onload_device = onload_device
51
+ self.computation_dtype = computation_dtype
52
+ self.computation_device = computation_device
53
+ self.state = 0
54
+
55
+ def offload(self):
56
+ if self.state == 1 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
57
+ self.to(dtype=self.offload_dtype, device=self.offload_device)
58
+ self.state = 0
59
+
60
+ def onload(self):
61
+ if self.state == 0 and (self.offload_dtype != self.onload_dtype or self.offload_device != self.onload_device):
62
+ self.to(dtype=self.onload_dtype, device=self.onload_device)
63
+ self.state = 1
64
+
65
+ def forward(self, x, *args, **kwargs):
66
+ if self.onload_dtype == self.computation_dtype and self.onload_device == self.computation_device:
67
+ weight, bias = self.weight, self.bias
68
+ else:
69
+ weight = cast_to(self.weight, self.computation_dtype, self.computation_device)
70
+ bias = None if self.bias is None else cast_to(self.bias, self.computation_dtype, self.computation_device)
71
+ return torch.nn.functional.linear(x, weight, bias)
72
+
73
+
74
+ def enable_vram_management_recursively(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None, total_num_param=0):
75
+ for name, module in model.named_children():
76
+ for source_module, target_module in module_map.items():
77
+ if isinstance(module, source_module):
78
+ num_param = sum(p.numel() for p in module.parameters())
79
+ if max_num_param is not None and total_num_param + num_param > max_num_param:
80
+ module_config_ = overflow_module_config
81
+ else:
82
+ module_config_ = module_config
83
+ module_ = target_module(module, **module_config_)
84
+ setattr(model, name, module_)
85
+ total_num_param += num_param
86
+ break
87
+ else:
88
+ total_num_param = enable_vram_management_recursively(module, module_map, module_config, max_num_param, overflow_module_config, total_num_param)
89
+ return total_num_param
90
+
91
+
92
+ def enable_vram_management(model: torch.nn.Module, module_map: dict, module_config: dict, max_num_param=None, overflow_module_config: dict = None):
93
+ enable_vram_management_recursively(model, module_map, module_config, max_num_param, overflow_module_config, total_num_param=0)
94
+ model.vram_management_enabled = True
95
+
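enable_vram_management walks the module tree and swaps every layer whose type appears in module_map for its wrapper, which then casts weights to the computation dtype and device on the fly inside forward. A toy, CPU-only sketch of how it might be called (the small Sequential model is illustrative, not from the repo):

import torch

toy = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
enable_vram_management(
    toy,
    module_map={torch.nn.Linear: AutoWrappedLinear},
    module_config=dict(
        offload_dtype=torch.float32, offload_device="cpu",
        onload_dtype=torch.float32, onload_device="cpu",
        computation_dtype=torch.float32, computation_device="cpu",
    ),
)
print(type(toy[0]).__name__)   # AutoWrappedLinear; toy.vram_management_enabled is now True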
infer.py ADDED
@@ -0,0 +1,214 @@
1
+ import torch
2
+ from diffsynth import ModelManager, WanVideoPipeline
3
+ from PIL import Image
4
+ import argparse
5
+ from transformers import Wav2Vec2Processor, Wav2Vec2Model
6
+ import librosa
7
+ import os
8
+ import subprocess
9
+ import cv2
10
+ from model import FantasyTalkingAudioConditionModel
11
+ from utils import save_video, get_audio_features, resize_image_by_longest_edge
12
+ from pathlib import Path
13
+ from datetime import datetime
14
+
15
+ def parse_args():
16
+ parser = argparse.ArgumentParser(description="Inference script for FantasyTalking audio-driven video generation.")
17
+
18
+ parser.add_argument(
19
+ "--wan_model_dir",
20
+ type=str,
21
+ default="./models/Wan2.1-I2V-14B-720P",
22
+ required=False,
23
+ help="The dir of the Wan I2V 14B model.",
24
+ )
25
+ parser.add_argument(
26
+ "--fantasytalking_model_path",
27
+ type=str,
28
+ default="./models/fantasytalking_model.ckpt",
29
+ required=False,
30
+ help="The .ckpt path of fantasytalking model.",
31
+ )
32
+ parser.add_argument(
33
+ "--wav2vec_model_dir",
34
+ type=str,
35
+ default="./models/wav2vec2-base-960h",
36
+ required=False,
37
+ help="The dir of wav2vec model.",
38
+ )
39
+
40
+ parser.add_argument(
41
+ "--image_path",
42
+ type=str,
43
+ default="./assets/images/woman.png",
44
+ required=False,
45
+ help="The path of the image.",
46
+ )
47
+
48
+ parser.add_argument(
49
+ "--audio_path",
50
+ type=str,
51
+ default="./assets/audios/woman.wav",
52
+ required=False,
53
+ help="The path of the audio.",
54
+ )
55
+ parser.add_argument(
56
+ "--prompt",
57
+ type=str,
58
+ default="A woman is talking.",
59
+ required=False,
60
+ help="prompt.",
61
+ )
62
+ parser.add_argument(
63
+ "--output_dir",
64
+ type=str,
65
+ default="./output",
66
+ help="Dir to save the model.",
67
+ )
68
+ parser.add_argument(
69
+ "--image_size",
70
+ type=int,
71
+ default=512,
72
+ help="The image will be resized proportionally to this size.",
73
+ )
74
+ parser.add_argument(
75
+ "--audio_scale",
76
+ type=float,
77
+ default=1.0,
78
+ help="Audio condition injection weight",
79
+ )
80
+ parser.add_argument(
81
+ "--prompt_cfg_scale",
82
+ type=float,
83
+ default=5.0,
84
+ required=False,
85
+ help="Prompt cfg scale",
86
+ )
87
+ parser.add_argument(
88
+ "--audio_cfg_scale",
89
+ type=float,
90
+ default=5.0,
91
+ required=False,
92
+ help="Audio cfg scale",
93
+ )
94
+ parser.add_argument(
95
+ "--max_num_frames",
96
+ type=int,
97
+ default=81,
98
+ required=False,
99
+ help="The maximum frames for generating videos, the audio part exceeding max_num_frames/fps will be truncated."
100
+ )
101
+ parser.add_argument(
102
+ "--fps",
103
+ type=int,
104
+ default=23,
105
+ required=False,
106
+ )
107
+ parser.add_argument(
108
+ "--num_persistent_param_in_dit",
109
+ type=int,
110
+ default=None,
111
+ required=False,
112
+ help="Maximum parameter quantity retained in video memory, small number to reduce VRAM required"
113
+ )
114
+ parser.add_argument(
115
+ "--seed",
116
+ type=int,
117
+ default=1111,
118
+ required=False,
119
+ )
120
+ args = parser.parse_args()
121
+ return args
122
+
123
+ def load_models(args):
124
+ # Load Wan I2V models
125
+ model_manager = ModelManager(device="cpu")
126
+ model_manager.load_models(
127
+ [
128
+ [
129
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00001-of-00007.safetensors",
130
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00002-of-00007.safetensors",
131
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00003-of-00007.safetensors",
132
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00004-of-00007.safetensors",
133
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00005-of-00007.safetensors",
134
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00006-of-00007.safetensors",
135
+ f"{args.wan_model_dir}/diffusion_pytorch_model-00007-of-00007.safetensors",
136
+ ],
137
+ f"{args.wan_model_dir}/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
138
+ f"{args.wan_model_dir}/models_t5_umt5-xxl-enc-bf16.pth",
139
+ f"{args.wan_model_dir}/Wan2.1_VAE.pth",
140
+ ],
141
+ # torch_dtype=torch.float8_e4m3fn, # You can set `torch_dtype=torch.bfloat16` to disable FP8 quantization.
142
+ torch_dtype=torch.bfloat16, # You can set `torch_dtype=torch.bfloat16` to disable FP8 quantization.
143
+ )
144
+ pipe = WanVideoPipeline.from_model_manager(model_manager, torch_dtype=torch.bfloat16, device="cuda")
145
+
146
+ # Load FantasyTalking weights
147
+ fantasytalking = FantasyTalkingAudioConditionModel(pipe.dit, 768, 2048).to("cuda")
148
+ fantasytalking.load_audio_processor(args.fantasytalking_model_path, pipe.dit)
149
+
150
+ # You can set `num_persistent_param_in_dit` to a small number to reduce VRAM required.
151
+ pipe.enable_vram_management(num_persistent_param_in_dit=args.num_persistent_param_in_dit)
152
+
153
+ # Load wav2vec models
154
+ wav2vec_processor = Wav2Vec2Processor.from_pretrained(args.wav2vec_model_dir)
155
+ wav2vec = Wav2Vec2Model.from_pretrained(args.wav2vec_model_dir).to("cuda")
156
+
157
+ return pipe,fantasytalking,wav2vec_processor,wav2vec
158
+
159
+
160
+
161
+ def main(args,pipe,fantasytalking,wav2vec_processor,wav2vec):
162
+ os.makedirs(args.output_dir,exist_ok=True)
163
+
164
+ duration = librosa.get_duration(filename=args.audio_path)
165
+ num_frames = min(int(args.fps*duration//4)*4+5,args.max_num_frames)
166
+
167
+ audio_wav2vec_fea = get_audio_features(wav2vec,wav2vec_processor,args.audio_path,args.fps,num_frames)
168
+ image = resize_image_by_longest_edge(args.image_path,args.image_size)
169
+ width, height = image.size
170
+
171
+ audio_proj_fea = fantasytalking.get_proj_fea(audio_wav2vec_fea)
172
+ pos_idx_ranges = fantasytalking.split_audio_sequence(audio_proj_fea.size(1),num_frames=num_frames)
173
+ audio_proj_split,audio_context_lens = fantasytalking.split_tensor_with_padding(audio_proj_fea,pos_idx_ranges,expand_length=4) # [b,21,9+8,768]
174
+
175
+ # Image-to-video
176
+ video_audio = pipe(
177
+ prompt=args.prompt,
178
+ negative_prompt="人物静止不动,静止,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
179
+ input_image=image,
180
+ width=width,
181
+ height=height,
182
+ num_frames=num_frames,
183
+ num_inference_steps=30,
184
+ seed=args.seed, tiled=True,
185
+ audio_scale=args.audio_scale,
186
+ cfg_scale = args.prompt_cfg_scale,
187
+ audio_cfg_scale=args.audio_cfg_scale,
188
+ audio_proj=audio_proj_split,
189
+ audio_context_lens=audio_context_lens,
190
+ latents_num_frames=(num_frames-1)//4+1
191
+ )
192
+ current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
193
+ save_path_tmp = f"{args.output_dir}/tmp_{Path(args.image_path).stem}_{Path(args.audio_path).stem}_{current_time}.mp4"
194
+ save_video(video_audio, save_path_tmp, fps=args.fps, quality=5)
195
+
196
+ save_path = f"{args.output_dir}/{Path(args.image_path).stem}_{Path(args.audio_path).stem}_{current_time}.mp4"
197
+ final_command = [
198
+ "ffmpeg", "-y",
199
+ "-i", save_path_tmp,
200
+ "-i", args.audio_path,
201
+ "-c:v", "libx264",
202
+ "-c:a", "aac",
203
+ "-shortest",
204
+ save_path
205
+ ]
206
+ subprocess.run(final_command, check=True)
207
+ os.remove(save_path_tmp)
208
+ return save_path
209
+
210
+ if __name__ == "__main__":
211
+ args = parse_args()
212
+ pipe,fantasytalking,wav2vec_processor,wav2vec = load_models(args)
213
+
214
+ main(args,pipe,fantasytalking,wav2vec_processor,wav2vec)
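main() derives the frame count from the audio duration and then clamps it: int(fps * duration // 4) * 4 + 5 always lands on a 4k+1 value, which is what the pipeline's latent packing expects, and anything longer than max_num_frames is truncated. A worked example with illustrative numbers (2 s of audio at 23 fps):

fps, duration, max_num_frames = 23, 2.0, 81
num_frames = min(int(fps * duration // 4) * 4 + 5, max_num_frames)
assert num_frames == 49 and num_frames % 4 == 1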
model.py ADDED
@@ -0,0 +1,229 @@
1
+ from diffsynth.models.wan_video_dit import flash_attention, WanModel
2
+ import torch.nn.functional as F
3
+ import torch.nn as nn
4
+ import torch
5
+ import os
6
+ from safetensors import safe_open
7
+
8
+
9
+ class AudioProjModel(nn.Module):
10
+ def __init__(self, audio_in_dim=1024, cross_attention_dim=1024):
11
+ super().__init__()
12
+ self.cross_attention_dim = cross_attention_dim
13
+ self.proj = torch.nn.Linear(audio_in_dim, cross_attention_dim, bias=False)
14
+ self.norm = torch.nn.LayerNorm(cross_attention_dim)
15
+
16
+ def forward(self, audio_embeds):
17
+ context_tokens = self.proj(audio_embeds)
18
+ context_tokens = self.norm(context_tokens)
19
+ return context_tokens # [B,L,C]
20
+
21
+
22
+ class WanCrossAttentionProcessor(nn.Module):
23
+ def __init__(self, context_dim, hidden_dim):
24
+ super().__init__()
25
+
26
+ self.context_dim = context_dim
27
+ self.hidden_dim = hidden_dim
28
+
29
+ self.k_proj = nn.Linear(context_dim, hidden_dim, bias=False)
30
+ self.v_proj = nn.Linear(context_dim, hidden_dim, bias=False)
31
+
32
+ nn.init.zeros_(self.k_proj.weight)
33
+ nn.init.zeros_(self.v_proj.weight)
34
+
35
+ def __call__(
36
+ self,
37
+ attn: nn.Module,
38
+ x: torch.Tensor,
39
+ context: torch.Tensor,
40
+ context_lens: torch.Tensor,
41
+ audio_proj: torch.Tensor,
42
+ audio_context_lens: torch.Tensor,
43
+ latents_num_frames: int = 21,
44
+ audio_scale: float = 1.0,
45
+ ) -> torch.Tensor:
46
+ """
47
+ x: [B, L1, C].
48
+ context: [B, L2, C].
49
+ context_lens: [B].
50
+ audio_proj: [B, 21, L3, C]
51
+ audio_context_lens: [B*21].
52
+ """
53
+ context_img = context[:, :257]
54
+ context = context[:, 257:]
55
+ b, n, d = x.size(0), attn.num_heads, attn.head_dim
56
+
57
+ # compute query, key, value
58
+ q = attn.norm_q(attn.q(x)).view(b, -1, n, d)
59
+ k = attn.norm_k(attn.k(context)).view(b, -1, n, d)
60
+ v = attn.v(context).view(b, -1, n, d)
61
+ k_img = attn.norm_k_img(attn.k_img(context_img)).view(b, -1, n, d)
62
+ v_img = attn.v_img(context_img).view(b, -1, n, d)
63
+ img_x = flash_attention(q, k_img, v_img, k_lens=None)
64
+ # compute attention
65
+ x = flash_attention(q, k, v, k_lens=context_lens)
66
+ x = x.flatten(2)
67
+ img_x = img_x.flatten(2)
68
+
69
+ if len(audio_proj.shape) == 4:
70
+ audio_q = q.view(b * latents_num_frames, -1, n, d) # [b, 21, l1, n, d]
71
+ ip_key = self.k_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
72
+ ip_value = self.v_proj(audio_proj).view(b * latents_num_frames, -1, n, d)
73
+ audio_x = flash_attention(
74
+ audio_q, ip_key, ip_value, k_lens=audio_context_lens
75
+ )
76
+ audio_x = audio_x.view(b, q.size(1), n, d)
77
+ audio_x = audio_x.flatten(2)
78
+ elif len(audio_proj.shape) == 3:
79
+ ip_key = self.k_proj(audio_proj).view(b, -1, n, d)
80
+ ip_value = self.v_proj(audio_proj).view(b, -1, n, d)
81
+ audio_x = flash_attention(q, ip_key, ip_value, k_lens=audio_context_lens)
82
+ audio_x = audio_x.flatten(2)
83
+ # output
84
+ x = x + img_x + audio_x * audio_scale
85
+ x = attn.o(x)
86
+ return x
87
+
88
+
89
+ class FantasyTalkingAudioConditionModel(nn.Module):
90
+ def __init__(self, wan_dit: WanModel, audio_in_dim: int, audio_proj_dim: int):
91
+ super().__init__()
92
+
93
+ self.audio_in_dim = audio_in_dim
94
+ self.audio_proj_dim = audio_proj_dim
95
+
96
+ # audio proj model
97
+ self.proj_model = self.init_proj(self.audio_proj_dim)
98
+ self.set_audio_processor(wan_dit)
99
+
100
+ def init_proj(self, cross_attention_dim=5120):
101
+ proj_model = AudioProjModel(
102
+ audio_in_dim=self.audio_in_dim, cross_attention_dim=cross_attention_dim
103
+ )
104
+ return proj_model
105
+
106
+ def set_audio_processor(self, wan_dit):
107
+ attn_procs = {}
108
+ for name in wan_dit.attn_processors.keys():
109
+ attn_procs[name] = WanCrossAttentionProcessor(
110
+ context_dim=self.audio_proj_dim, hidden_dim=wan_dit.dim
111
+ )
112
+ wan_dit.set_attn_processor(attn_procs)
113
+
114
+ def load_audio_processor(self, ip_ckpt: str, wan_dit):
115
+ if os.path.splitext(ip_ckpt)[-1] == ".safetensors":
116
+ state_dict = {"proj_model": {}, "audio_processor": {}}
117
+ with safe_open(ip_ckpt, framework="pt", device="cpu") as f:
118
+ for key in f.keys():
119
+ if key.startswith("proj_model."):
120
+ state_dict["proj_model"][key.replace("proj_model.", "")] = (
121
+ f.get_tensor(key)
122
+ )
123
+ elif key.startswith("audio_processor."):
124
+ state_dict["audio_processor"][
125
+ key.replace("audio_processor.", "")
126
+ ] = f.get_tensor(key)
127
+ else:
128
+ state_dict = torch.load(ip_ckpt, map_location="cpu")
129
+ self.proj_model.load_state_dict(state_dict["proj_model"])
130
+ wan_dit.load_state_dict(state_dict["audio_processor"], strict=False)
131
+
132
+ def get_proj_fea(self, audio_fea=None):
133
+
134
+ return self.proj_model(audio_fea) if audio_fea is not None else None
135
+
136
+ def split_audio_sequence(self, audio_proj_length, num_frames=81):
137
+ """
138
+ Map the audio feature sequence to corresponding latent frame slices.
139
+
140
+ Args:
141
+ audio_proj_length (int): The total length of the audio feature sequence
142
+ (e.g., 173 in audio_proj[1, 173, 768]).
143
+ num_frames (int): The number of video frames in the training data (default: 81).
144
+
145
+ Returns:
146
+ list: A list of [start_idx, end_idx] pairs. Each pair represents the index range
147
+ (within the audio feature sequence) corresponding to a latent frame.
148
+ """
149
+ # Average number of tokens per original video frame
150
+ tokens_per_frame = audio_proj_length / num_frames
151
+
152
+ # Each latent frame covers 4 video frames, and we want the center
153
+ tokens_per_latent_frame = tokens_per_frame * 4
154
+ half_tokens = int(tokens_per_latent_frame / 2)
155
+
156
+ pos_indices = []
157
+ for i in range(int((num_frames - 1) / 4) + 1):
158
+ if i == 0:
159
+ pos_indices.append(0)
160
+ else:
161
+ start_token = tokens_per_frame * ((i - 1) * 4 + 1)
162
+ end_token = tokens_per_frame * (i * 4 + 1)
163
+ center_token = int((start_token + end_token) / 2) - 1
164
+ pos_indices.append(center_token)
165
+
166
+ # Build index ranges centered around each position
167
+ pos_idx_ranges = [[idx - half_tokens, idx + half_tokens] for idx in pos_indices]
168
+
169
+ # Adjust the first range to avoid negative start index
170
+ pos_idx_ranges[0] = [
171
+ -(half_tokens * 2 - pos_idx_ranges[1][0]),
172
+ pos_idx_ranges[1][0],
173
+ ]
174
+
175
+ return pos_idx_ranges
176
+
177
+ def split_tensor_with_padding(self, input_tensor, pos_idx_ranges, expand_length=0):
178
+ """
179
+ Split the input tensor into subsequences based on index ranges, and apply right-side zero-padding
180
+ if the range exceeds the input boundaries.
181
+
182
+ Args:
183
+ input_tensor (Tensor): Input audio tensor of shape [1, L, 768].
184
+ pos_idx_ranges (list): A list of index ranges, e.g. [[-7, 1], [1, 9], ..., [165, 173]].
185
+ expand_length (int): Number of tokens to expand on both sides of each subsequence.
186
+
187
+ Returns:
188
+ sub_sequences (Tensor): A tensor of shape [1, F, L, 768], where L is the length after padding.
189
+ Each element is a padded subsequence.
190
+ k_lens (Tensor): A tensor of shape [F], representing the actual (unpadded) length of each subsequence.
191
+ Useful for ignoring padding tokens in attention masks.
192
+ """
193
+ pos_idx_ranges = [
194
+ [idx[0] - expand_length, idx[1] + expand_length] for idx in pos_idx_ranges
195
+ ]
196
+ sub_sequences = []
197
+ seq_len = input_tensor.size(1) # 173
198
+ max_valid_idx = seq_len - 1 # 172
199
+ k_lens_list = []
200
+ for start, end in pos_idx_ranges:
201
+ # Calculate the fill amount
202
+ pad_front = max(-start, 0)
203
+ pad_back = max(end - max_valid_idx, 0)
204
+
205
+ # Calculate the start and end indices of the valid part
206
+ valid_start = max(start, 0)
207
+ valid_end = min(end, max_valid_idx)
208
+
209
+ # Extract the valid part
210
+ if valid_start <= valid_end:
211
+ valid_part = input_tensor[:, valid_start : valid_end + 1, :]
212
+ else:
213
+ valid_part = input_tensor.new_zeros(
214
+ (1, 0, input_tensor.size(2))
215
+ )
216
+
217
+ # In the sequence dimension (the 1st dimension) perform padding
218
+ padded_subseq = F.pad(
219
+ valid_part,
220
+ (0, 0, 0, pad_back + pad_front, 0, 0),
221
+ mode="constant",
222
+ value=0,
223
+ )
224
+ k_lens_list.append(padded_subseq.size(-2) - pad_back - pad_front)
225
+
226
+ sub_sequences.append(padded_subseq)
227
+ return torch.stack(sub_sequences, dim=1), torch.tensor(
228
+ k_lens_list, dtype=torch.long
229
+ )
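split_audio_sequence spreads the audio tokens evenly over the latent frames and cuts a fixed-width window around the center of each one. Reproducing the docstring's numbers (173 audio tokens, 81 video frames, 21 latent frames):

audio_proj_length, num_frames = 173, 81
tokens_per_frame = audio_proj_length / num_frames   # ~2.14 audio tokens per video frame
half_tokens = int(tokens_per_frame * 4 / 2)         # 4 tokens on either side of each center
num_latent_frames = (num_frames - 1) // 4 + 1       # 21
# The first ranges come out as [-7, 1], [1, 9], ...; negative starts are later
# zero-padded by split_tensor_with_padding.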
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ torch>=2.0.0
+ torchvision
+ cupy-cuda12x
+ transformers==4.46.2
+ controlnet-aux==0.0.7
+ imageio
+ imageio[ffmpeg]
+ safetensors
+ einops
+ sentencepiece
+ protobuf
+ modelscope
+ ftfy
+ librosa
utils.py ADDED
@@ -0,0 +1,49 @@
+ import imageio, librosa
+ import torch
+ from PIL import Image
+ from tqdm import tqdm
+ import numpy as np
+ 
+ 
+ def resize_image_by_longest_edge(image_path, target_size):
+     image = Image.open(image_path).convert("RGB")
+     width, height = image.size
+     scale = target_size / max(width, height)
+     new_size = (int(width * scale), int(height * scale))
+     return image.resize(new_size, Image.LANCZOS)
+ 
+ 
+ def save_video(frames, save_path, fps, quality=9, ffmpeg_params=None):
+     writer = imageio.get_writer(
+         save_path, fps=fps, quality=quality, ffmpeg_params=ffmpeg_params
+     )
+     for frame in tqdm(frames, desc="Saving video"):
+         frame = np.array(frame)
+         writer.append_data(frame)
+     writer.close()
+ 
+ 
+ def get_audio_features(wav2vec, audio_processor, audio_path, fps, num_frames):
+     sr = 16000
+     audio_input, sample_rate = librosa.load(audio_path, sr=sr)  # resample the audio to 16 kHz
+ 
+     start_time = 0
+     # end_time = (0 + (num_frames - 1) * 1) / fps
+     end_time = num_frames / fps
+ 
+     start_sample = int(start_time * sr)
+     end_sample = int(end_time * sr)
+ 
+     try:
+         audio_segment = audio_input[start_sample:end_sample]
+     except:
+         audio_segment = audio_input
+ 
+     input_values = audio_processor(
+         audio_segment, sampling_rate=sample_rate, return_tensors="pt"
+     ).input_values.to("cuda")
+ 
+     with torch.no_grad():
+         fea = wav2vec(input_values).last_hidden_state
+ 
+     return fea
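For reference, wav2vec2-base emits roughly one feature vector per 20 ms of 16 kHz audio, so the clip kept here (num_frames / fps seconds) maps to on the order of 170-180 feature frames for an 81-frame, 23 fps video, which is the ~173-token sequence the docstrings in model.py work with. A rough sanity check, assuming that 20 ms hop:

num_frames, fps = 81, 23
clip_seconds = num_frames / fps                 # ~3.52 s of audio is kept
approx_feature_frames = int(clip_seconds * 50)  # ~176 frames at a 20 ms hop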