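"""All-in-One Radio Promo Studio.

A Gradio app for Hugging Face Spaces (ZeroGPU) that generates a promo script
with Llama 3, background music with MusicGen, and a cloned voice-over with
Coqui XTTS-v2.
"""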
import gradio as gr
import os
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    AutoProcessor,
    MusicgenForConditionalGeneration,
)
from scipy.io.wavfile import write
from TTS.api import TTS
import tempfile
from dotenv import load_dotenv
import spaces
# Load environment variables
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
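# HF_TOKEN must grant access to the gated meta-llama/Meta-Llama-3-8B-Instruct weights.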
# ---------------------------------------------------------------------
# Load Llama 3 Pipeline with Zero GPU (Encapsulated)
# ---------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_script(user_prompt: str, duration: int, model_id: str, token: str):
    try:
        # Load the Llama 3 tokenizer and model on demand inside the GPU context.
        tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            use_auth_token=token,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        llama_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

        system_prompt = (
            "You are an expert radio imaging producer specializing in sound design and music. "
            f"Generate a concise, creative promo script for a {duration}-second ad, "
            "focusing on auditory elements and musical appeal."
        )
        combined_prompt = f"{system_prompt}\nUser concept: {user_prompt}\nRefined script:"
        result = llama_pipeline(combined_prompt, max_new_tokens=200, do_sample=True, temperature=0.9)
        # Keep only the text generated after the "Refined script:" marker.
        return result[0]["generated_text"].split("Refined script:")[-1].strip()
    except Exception as e:
        return f"Error generating script: {e}"
# ---------------------------------------------------------------------
# Load MusicGen Model (Encapsulated)
# ---------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_audio(prompt: str, audio_length: int):
    try:
        # Load the small MusicGen checkpoint and move it to the GPU when available.
        musicgen_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
        musicgen_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        musicgen_model.to(device)

        inputs = musicgen_processor(text=[prompt], padding=True, return_tensors="pt").to(device)
        # MusicGen produces roughly 50 tokens per second of audio, so 512 tokens is about 10 seconds.
        outputs = musicgen_model.generate(**inputs, max_new_tokens=int(audio_length))
        audio_data = outputs[0, 0].cpu().numpy()
        # Scale the float waveform to 16-bit PCM before writing the WAV file.
        normalized_audio = (audio_data / abs(audio_data).max() * 32767).astype("int16")
        output_path = f"{tempfile.gettempdir()}/generated_audio.wav"
        write(output_path, musicgen_model.config.audio_encoder.sampling_rate, normalized_audio)
        return output_path
    except Exception as e:
        return f"Error generating audio: {e}"
# ---------------------------------------------------------------------
# Generate Voice-Over with Coqui XTTS-v2
# ---------------------------------------------------------------------
@spaces.GPU(duration=300)
def generate_voice(script: str, reference_audio: str, language: str):
    try:
        # XTTS-v2 clones the voice from the reference clip and reads the script in the chosen language.
        tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=torch.cuda.is_available())
        output_path = f"{tempfile.gettempdir()}/voice_over.wav"
        tts.tts_to_file(
            text=script,
            file_path=output_path,
            speaker_wav=reference_audio,
            language=language,
        )
        return output_path
    except Exception as e:
        return f"Error generating voice-over: {e}"
# ---------------------------------------------------------------------
# Interface Functions
# ---------------------------------------------------------------------
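# Thin wrappers so the Gradio click handlers can supply the module-level
# hf_token without exposing it as a UI input.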
def interface_generate_script(user_prompt, duration, llama_model_id):
    return generate_script(user_prompt, duration, llama_model_id, hf_token)


def interface_generate_audio(script, audio_length):
    return generate_audio(script, audio_length)


def interface_generate_voice(script, reference_audio, language):
    return generate_voice(script, reference_audio, language)
# ---------------------------------------------------------------------
# Interface
# ---------------------------------------------------------------------
with gr.Blocks() as demo:
gr.Markdown("""
# π§ All-in-One Radio Promo Studio π
### Create professional scripts, soundscapes, and voice-overs in minutes!
π₯ Powered by **Llama 3**, **MusicGen**, and **XTTS-v2**
""")
# Script Generation Section
gr.Markdown("## βοΈ Step 1: Generate Your Promo Script")
with gr.Row():
user_prompt = gr.Textbox(
label="π€ Enter Promo Idea",
placeholder="E.g., A 15-second energetic jingle for a morning talk show.",
lines=2
)
duration = gr.Dropdown(
label="β³ Duration",
choices=["15", "30", "60"],
value="15",
info="Choose the duration of the promo (in seconds)."
)
llama_model_id = gr.Textbox(
label="ποΈ Llama 3 Model ID",
value="meta-llama/Meta-Llama-3-8B-Instruct"
)
generate_script_button = gr.Button("Generate Script β¨")
script_output = gr.Textbox(label="ποΈ Generated Promo Script", lines=4, interactive=False)
# Audio Generation Section
gr.Markdown("## π΅ Step 2: Generate Background Music")
with gr.Row():
audio_length = gr.Slider(
label="πΆ Audio Length (tokens)",
minimum=128,
maximum=1024,
step=64,
value=512
)
generate_audio_button = gr.Button("Generate Audio πΆ")
audio_output = gr.Audio(label="π΅ Generated Audio", type="filepath")
# Voice-Over Section
gr.Markdown("## ποΈ Step 3: Generate Voice-Over")
with gr.Row():
reference_audio = gr.Audio(
label="π€ Upload Reference Voice (6 seconds)",
type="filepath"
)
language = gr.Dropdown(
label="π Language",
choices=["en", "es", "fr", "de", "it"],
value="en"
)
generate_voice_button = gr.Button("Generate Voice-Over π€")
voice_output = gr.Audio(label="π Generated Voice-Over", type="filepath")
# Footer
gr.Markdown("""
<br><hr>
<p style="text-align: center; font-size: 0.9em;">
Created with β€οΈ by <a href="https://bilsimaging.com" target="_blank">bilsimaging.com</a>
</p>
""")
# Button Actions
generate_script_button.click(
fn=interface_generate_script,
inputs=[user_prompt, duration, llama_model_id],
outputs=script_output
)
generate_audio_button.click(
fn=interface_generate_audio,
inputs=[script_output, audio_length],
outputs=audio_output
)
generate_voice_button.click(
fn=interface_generate_voice,
inputs=[script_output, reference_audio, language],
outputs=voice_output
)
# ---------------------------------------------------------------------
# Launch App
# ---------------------------------------------------------------------
demo.launch(debug=True)