Fabrice-TIERCELIN committed
Commit 1df3eda · verified · 1 parent: 11e7e3c

This PR makes this Space work


With this change, the code starts successfully.

Click on _Merge_ to add this feature.

Files changed (3)
  1. README.md +6 -6
  2. app.py +100 -32
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: Audio
- emoji: 🐢
- colorFrom: blue
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.35.0
+ title: Stable Audio Open Zero
+ emoji: 🔥
+ colorFrom: indigo
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 4.33.0
  app_file: app.py
  pinned: false
  ---
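
The functional core of this change is the front-matter switch from `sdk: streamlit` to `sdk: gradio`: the Space metadata must name the runtime that the rewritten app.py actually uses, and `sdk_version: 4.33.0` pins the Gradio build. A quick local sanity check, as a sketch that only assumes a `gradio` install matching that pin:

    import gradio as gr

    # README.md pins sdk_version: 4.33.0, so a 4.x install is expected here.
    assert gr.__version__.startswith("4."), f"unexpected gradio version: {gr.__version__}"
    print("gradio", gr.__version__)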
app.py CHANGED
@@ -1,42 +1,110 @@
+ import random
  import torch
  import torchaudio
  from einops import rearrange
+ import gradio as gr
+ import spaces
+ import os
+ import uuid
+
+ # Importing the model-related functions
  from stable_audio_tools import get_pretrained_model
  from stable_audio_tools.inference.generation import generate_diffusion_cond

- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Download model
- model, model_config = get_pretrained_model("stabilityai/stable-audio-open-1.0")
- sample_rate = model_config["sample_rate"]
- sample_size = model_config["sample_size"]
-
- model = model.to(device)
-
- # Set up text and timing conditioning
- conditioning = [{
-     "prompt": "128 BPM tech house drum loop",
-     "seconds_start": 0,
-     "seconds_total": 30
- }]
-
- # Generate stereo audio
- output = generate_diffusion_cond(
-     model,
-     steps=100,
-     cfg_scale=7,
-     conditioning=conditioning,
-     sample_size=sample_size,
-     sigma_min=0.3,
-     sigma_max=500,
-     sampler_type="dpmpp-3m-sde",
-     device=device
- )
-
- # Rearrange audio batch to a single sequence
- output = rearrange(output, "b d n -> d (b n)")
-
- # Peak normalize, clip, convert to int16, and save to file
- output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
- torchaudio.save("output.wav", output, sample_rate)
+ # Load the model outside of the GPU-decorated function
+ def load_model():
+     print("Loading model...")
+     model, model_config = get_pretrained_model("chaowenguo/stable-audio-open-1.0")
+     print("Model loaded successfully.")
+     return model, model_config
+
+ # Function to set up, generate, and process the audio
+ @spaces.GPU(duration=120)  # Allocate GPU only when this function is called
+ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
+     print(f"Prompt received: {prompt}")
+     print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
+
+     seed = random.randint(0, 2**63 - 1)
+     random.seed(seed)
+     torch.manual_seed(seed)
+     print(f"Using seed: {seed}")
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"Using device: {device}")
+
+     # Fetch the Hugging Face token from the environment variable
+     hf_token = os.getenv('HF_TOKEN')
+     print(f"Hugging Face token: {hf_token}")
+
+     # Use pre-loaded model and configuration
+     model, model_config = load_model()
+     sample_rate = model_config["sample_rate"]
+     sample_size = model_config["sample_size"]
+
+     print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
+
+     model = model.to(device)
+     print("Model moved to device.")
+
+     # Set up text and timing conditioning
+     conditioning = [{
+         "prompt": prompt,
+         "seconds_start": 0,
+         "seconds_total": seconds_total
+     }]
+     print(f"Conditioning: {conditioning}")
+
+     # Generate stereo audio
+     print("Generating audio...")
+     output = generate_diffusion_cond(
+         model,
+         steps=steps,
+         cfg_scale=cfg_scale,
+         conditioning=conditioning,
+         sample_size=sample_size,
+         sigma_min=0.3,
+         sigma_max=500,
+         sampler_type="dpmpp-3m-sde",
+         device=device
+     )
+     print("Audio generated.")
+
+     # Rearrange audio batch to a single sequence
+     output = rearrange(output, "b d n -> d (b n)")
+     print("Audio rearranged.")
+
+     # Peak normalize, clip, convert to int16
+     output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
+     print("Audio normalized and converted.")
+
+     # Generate a unique filename for the output
+     unique_filename = f"output_{uuid.uuid4().hex}.wav"
+     print(f"Saving audio to file: {unique_filename}")
+
+     # Save to file
+     torchaudio.save(unique_filename, output, sample_rate)
+     print(f"Audio saved: {unique_filename}")
+
+     # Return the path to the generated audio file
+     return unique_filename
+
+ # Setting up the Gradio Interface
+ interface = gr.Interface(
+     fn=generate_audio,
+     inputs=[
+         gr.Textbox(label="Prompt", placeholder="Enter your text prompt here"),
+         gr.Slider(0, 47, value=5, label="Duration in Seconds"),
+         gr.Slider(10, 150, value=10, step=10, label="Number of Diffusion Steps"),
+         gr.Slider(1, 15, value=7, step=0.1, label="CFG Scale")
+     ],
+     outputs=gr.Audio(type="filepath", label="Generated Audio"),
+     title="Stable Audio Generator",
+     description="Generate variable-length stereo audio at 44.1kHz from text prompts using Stable Audio Open 1.0."
+ )
+
+ # Pre-load the model to avoid multiprocessing issues
+ model, model_config = load_model()
+
+ # Launch the Interface
+ interface.launch()
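
The `@spaces.GPU(duration=120)` decorator is what makes this app runnable on ZeroGPU hardware: a GPU is attached only while the decorated function executes, which appears to be why the model is loaded at module level and only moved to the device inside `generate_audio`. A minimal sketch of the pattern in isolation (`gpu_probe` is a hypothetical name; outside a Space, the `spaces` package should degrade to a no-op decorator):

    import spaces
    import torch

    @spaces.GPU(duration=120)  # GPU is attached only for the duration of this call
    def gpu_probe():
        # Inside a ZeroGPU Space, CUDA becomes available here.
        return torch.cuda.is_available()

    print(gpu_probe())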
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ torch
+ torchaudio
+ stable-audio-tools
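
All three dependencies are unpinned, so each build of the Space resolves their latest versions. A quick post-install check, as a sketch that only verifies the declared packages import and reports the versions that were resolved:

    import torch
    import torchaudio
    from stable_audio_tools import get_pretrained_model  # the entry point app.py relies on

    print("torch", torch.__version__)
    print("torchaudio", torchaudio.__version__)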