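"""Gradio demo for MMAudio: generates a synchronized soundtrack for an uploaded video.

Project page: https://hkchengrex.com/MMAudio/
Code: https://github.com/hkchengrex/MMAudio
"""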
import gc
import logging
from datetime import datetime
from pathlib import Path

import gradio as gr
import torch

from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils

# Setup logging
setup_eval_logging()
log = logging.getLogger()

# Configure device and dtype
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cpu':
    log.warning('CUDA is not available, running on CPU')
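# bfloat16 halves activation memory relative to float32; switch to torch.float32
# if the target hardware lacks bfloat16 support.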
dtype = torch.bfloat16

# Configure model and paths
model: ModelConfig = all_model_cfg['large_44k_v2']
model.download_if_needed()
output_dir = Path('./output/gradio')
output_dir.mkdir(exist_ok=True, parents=True)

def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
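    """Load the MMAudio network and feature extractors onto the target device."""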
    seq_cfg = model.seq_cfg
    
    net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
    net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
    log.info(f'Loaded weights from {model.model_path}')
    
    feature_utils = FeaturesUtils(tod_vae_ckpt=model.vae_path,
                                  synchformer_ckpt=model.synchformer_ckpt,
                                  enable_conditions=True,
                                  mode=model.mode,
                                  bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
                                  need_vae_encoder=False)
    feature_utils = feature_utils.to(device, dtype).eval()
    
    return net, feature_utils, seq_cfg

# Load model once at startup
net, feature_utils, seq_cfg = get_model()

@torch.inference_mode()
def video_to_audio(video: str, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
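    """Generate audio for the video at path `video`, guided by the text prompts,
    and mux it into a new MP4. Gradio passes the uploaded video as a file path."""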
    try:
        rng = torch.Generator(device=device)
        if seed >= 0:
            rng.manual_seed(seed)
        else:
            rng.seed()
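        # Euler-integrated flow-matching sampler; more steps trade speed for quality.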
        fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

        video_info = load_video(video, duration)
        clip_frames = video_info.clip_frames.unsqueeze(0)
        sync_frames = video_info.sync_frames.unsqueeze(0)
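        # Use the effective duration reported by the loader, which may be shorter
        # than the requested duration if the clip runs out of frames.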
        duration = video_info.duration_sec
        
        seq_cfg.duration = duration
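        # Resize the network's internal sequence lengths to match the new duration.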
        net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

        audios = generate(clip_frames, sync_frames, [prompt],
                          negative_text=[negative_prompt],
                          feature_utils=feature_utils,
                          net=net,
                          fm=fm,
                          rng=rng,
                          cfg_strength=cfg_strength)
        audio = audios.float().cpu()[0]

        current_time_string = datetime.now().strftime('%Y%m%d_%H%M%S')
        video_save_path = output_dir / f'{current_time_string}.mp4'
        make_video(video_info, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate)
        
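        # Release transient tensors so repeated requests don't accumulate GPU memory.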
        gc.collect()
        torch.cuda.empty_cache()
        
        return video_save_path
    except Exception as e:
        log.exception('Error in video_to_audio')
        raise gr.Error(f'An error occurred: {e}') from e

# Create the Gradio interface
demo = gr.Interface(
    fn=video_to_audio,
    title="MMAudio — Video-to-Audio Synthesis",
    description="""
    Generate realistic audio for your videos using MMAudio!
    
    Project page: [MMAudio](https://hkchengrex.com/MMAudio/)
    Code: [GitHub](https://github.com/hkchengrex/MMAudio)
    
    Note: Processing high-resolution videos (>384px on shorter side) takes longer and doesn't improve results.
    """,
    inputs=[
        gr.Video(label="Upload Video"),
        gr.Text(label="Prompt", placeholder="Describe the audio you want to generate..."),
        gr.Text(label="Negative prompt", value="music", placeholder="What you don't want in the audio..."),
        gr.Number(label="Seed (-1: random)", value=-1, precision=0, minimum=-1),
        gr.Number(label="Number of steps", value=25, precision=0, minimum=1),
        gr.Slider(label="Guidance Strength", value=4.5, minimum=1, maximum=10, step=0.5),
        gr.Slider(label="Duration (seconds)", value=8, minimum=1, maximum=30, step=1),
    ],
    outputs=gr.Video(label="Generated Result"),
    examples=[
        ["https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_beach.mp4",
         "waves, seagulls", "", 0, 25, 4.5, 10],
        ["https://huggingface.co/hkchengrex/MMAudio/resolve/main/examples/sora_serpent.mp4",
         "", "music", 0, 25, 4.5, 10],
    ],
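    # cache_examples=True generates outputs for both examples once at startup;
    # set it to False to avoid the upfront cost.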
    cache_examples=True,
)

# Launch the app
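# server_name="0.0.0.0" exposes the app on all network interfaces;
# use "127.0.0.1" to keep it local-only.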
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)