Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -36,7 +36,7 @@ print(f"Orpheus model loaded to {device}")
|
|
36 |
|
37 |
# Available voices and emotive tags
|
38 |
VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
|
39 |
-
EMOTIVE_TAGS = ["
|
40 |
|
41 |
# Initialize Dash app
|
42 |
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
@@ -126,589 +126,6 @@ def detect_silence(audio, threshold=0.01, min_silence_len=1000):
|
|
126 |
silent_regions.append((silent_start, len(audio)))
|
127 |
return silent_regions
|
128 |
|
129 |
-
import dash from dash import dcc, html, Input, Output, State, callback import dash_bootstrap_components as dbc import base64 import io import os from snac import SNAC import torch from transformers import AutoModelForCausalLM, AutoTokenizer import google.generativeai as genai import re import logging import numpy as np from pydub import AudioSegment from docx import Document import PyPDF2 import spaces
|
130 |
-
|
131 |
-
Initialize logging
|
132 |
-
logging.basicConfig(level=logging.INFO) logger = logging.getLogger(name)
|
133 |
-
|
134 |
-
Initialize device
|
135 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
136 |
-
|
137 |
-
Load models
|
138 |
-
print("Loading SNAC model...") snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz") snac_model = snac_model.to(device)
|
139 |
-
|
140 |
-
model_name = "canopylabs/orpheus-3b-0.1-ft" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) model.to(device) tokenizer = AutoTokenizer.from_pretrained(model_name) print(f"Orpheus model loaded to {device}")
|
141 |
-
|
142 |
-
Available voices and emotive tags
|
143 |
-
VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"] EMOTIVE_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
|
144 |
-
|
145 |
-
Initialize Dash app
|
146 |
-
app = dash.Dash(name, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
147 |
-
|
148 |
-
Layout
|
149 |
-
app.layout = dbc.Container([ dbc.Row([ dbc.Col([ html.H1("Orpheus Text-to-Speech", className="mb-4"), dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"), dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"), dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"), dbc.Input(id="podcast-topic", placeholder="Enter podcast topic", className="mb-2"), dbc.Textarea(id="prompt", placeholder="Enter your text here...", rows=5, className="mb-2"), dcc.Upload( id='upload-file', children=html.Div(['Drag and Drop or ', html.A('Select a File')]), style={ 'width': '100%', 'height': '60px', 'lineHeight': '60px', 'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px', 'textAlign': 'center', 'margin': '10px 0' }, ), dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"), dbc.RadioItems( id="num-hosts", options=[{"label": i, "value": i} for i in ["1", "2"]], value="1", inline=True, className="mb-2" ), dbc.Button("Generate Podcast Script", id="generate-script-btn", color="primary", className="mb-2"), ], width=6), dbc.Col([ dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"), dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"), dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"), dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"), html.Div(id="audio-output"), dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"), dbc.Collapse([ dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"), dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"), 
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"), dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"), ], id="advanced-settings", is_open=False), dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"), ], width=6), ]), dcc.Store(id='generated-script'), dcc.Store(id='generated-audio'), ])
|
150 |
-
|
151 |
-
Callbacks
|
152 |
-
@callback( Output("script-output", "value"), Input("generate-script-btn", "n_clicks"), State("host1-name", "value"), State("host2-name", "value"), State("podcast-name", "value"), State("podcast-topic", "value"), State("prompt", "value"), State("upload-file", "contents"), State("duration", "value"), State("num-hosts", "value"), prevent_initial_call=True ) def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts): if n_clicks is None: return ""
|
153 |
-
|
154 |
-
Copy
|
155 |
-
|
156 |
-
try:
|
157 |
-
# Get the Gemini API key from Hugging Face secrets
|
158 |
-
api_key = os.environ.get("GEMINI_API_KEY")
|
159 |
-
if not api_key:
|
160 |
-
raise ValueError("Gemini API key not found in environment variables")
|
161 |
-
|
162 |
-
genai.configure(api_key=api_key)
|
163 |
-
model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
|
164 |
-
|
165 |
-
combined_content = prompt or ""
|
166 |
-
|
167 |
-
if uploaded_file:
|
168 |
-
content_type, content_string = uploaded_file.split(',')
|
169 |
-
decoded = base64.b64decode(content_string)
|
170 |
-
file_bytes = io.BytesIO(decoded)
|
171 |
-
|
172 |
-
# Try to detect the file type based on content
|
173 |
-
file_bytes.seek(0)
|
174 |
-
if file_bytes.read(4) == b'%PDF':
|
175 |
-
# It's a PDF file
|
176 |
-
file_bytes.seek(0)
|
177 |
-
pdf_reader = PyPDF2.PdfReader(file_bytes)
|
178 |
-
file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
|
179 |
-
else:
|
180 |
-
# Try as text file first
|
181 |
-
file_bytes.seek(0)
|
182 |
-
try:
|
183 |
-
file_content = file_bytes.read().decode('utf-8')
|
184 |
-
except UnicodeDecodeError:
|
185 |
-
# If it's not a text file, try as a docx
|
186 |
-
file_bytes.seek(0)
|
187 |
-
try:
|
188 |
-
doc = Document(file_bytes)
|
189 |
-
file_content = "\n".join([para.text for para in doc.paragraphs])
|
190 |
-
except:
|
191 |
-
raise ValueError("Unsupported file type or corrupted file")
|
192 |
-
|
193 |
-
combined_content += "\n" + file_content if combined_content else file_content
|
194 |
-
|
195 |
-
num_hosts = int(num_hosts) if num_hosts else 1
|
196 |
-
|
197 |
-
prompt_template = f"""
|
198 |
-
Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
|
199 |
-
{combined_content}
|
200 |
-
|
201 |
-
Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
|
202 |
-
Use speech fillers like um, ah. Vary emotional tone.
|
203 |
-
|
204 |
-
Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
|
205 |
-
Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
|
206 |
-
If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
|
207 |
-
Only provide the dialog for text to speech.
|
208 |
-
Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
|
209 |
-
-Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
|
210 |
-
Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
|
211 |
-
Do not include speaker labels like "jane:" or "john:" before dialogue.
|
212 |
-
The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
|
213 |
-
The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
|
214 |
-
Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...". Its just dialog.
|
215 |
-
Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
|
216 |
-
Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
|
217 |
-
Maintain natural conversation flow and speech patterns within each monologue.
|
218 |
-
Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
|
219 |
-
Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
|
220 |
-
Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
|
221 |
-
Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
|
222 |
-
{'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
|
223 |
-
Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
|
224 |
-
Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
|
225 |
-
"""
|
226 |
-
|
227 |
-
response = model.generate_content(prompt_template)
|
228 |
-
return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
|
229 |
-
except Exception as e:
|
230 |
-
logger.error(f"Error generating podcast script: {str(e)}")
|
231 |
-
return f"Error: {str(e)}"
|
232 |
-
@callback( Output("audio-output", "children"), Input("generate-audio-btn", "n_clicks"), State("script-output", "value"), State("voice1", "value"), State("voice2", "value"), State("temperature", "value"), State("top-p", "value"), State("repetition-penalty", "value"), State("max-new-tokens", "value"), State("num-hosts", "value"), prevent_initial_call=True ) @spaces.GPU() def generate_speech(n_clicks, text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts): if n_clicks is None or not text.strip(): return html.Div("No audio generated yet.")
|
233 |
-
|
234 |
-
Copy
|
235 |
-
|
236 |
-
try:
|
237 |
-
paragraphs = text.split('\n\n') # Split by double newline
|
238 |
-
audio_samples = []
|
239 |
-
|
240 |
-
for i, paragraph in enumerate(paragraphs):
|
241 |
-
if not paragraph.strip():
|
242 |
-
continue
|
243 |
-
|
244 |
-
voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
|
245 |
-
|
246 |
-
input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
|
247 |
-
|
248 |
-
with torch.no_grad():
|
249 |
-
generated_ids = model.generate(
|
250 |
-
input_ids,
|
251 |
-
attention_mask=attention_mask,
|
252 |
-
do_sample=True,
|
253 |
-
temperature=temperature,
|
254 |
-
top_p=top_p,
|
255 |
-
repetition_penalty=repetition_penalty,
|
256 |
-
max_new_tokens=max_new_tokens,
|
257 |
-
num_return_sequences=1,
|
258 |
-
eos_token_id=128258,
|
259 |
-
)
|
260 |
-
|
261 |
-
code_list = parse_output(generated_ids)
|
262 |
-
paragraph_audio = redistribute_codes(code_list, snac_model)
|
263 |
-
|
264 |
-
silences = detect_silence(paragraph_audio)
|
265 |
-
if silences:
|
266 |
-
paragraph_audio = paragraph_audio[:silences[-1][1]]
|
267 |
-
|
268 |
-
audio_samples.append(paragraph_audio)
|
269 |
-
|
270 |
-
final_audio = np.concatenate(audio_samples)
|
271 |
-
final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
|
272 |
-
|
273 |
-
# Convert to base64 for audio playback
|
274 |
-
audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
|
275 |
-
src = f"data:audio/wav;base64,{audio_base64}"
|
276 |
-
|
277 |
-
return html.Audio(src=src, controls=True)
|
278 |
-
except Exception as e:
|
279 |
-
logger.error(f"Error generating speech: {str(e)}")
|
280 |
-
return html.Div(f"Error generating audio: {str(e)}")
|
281 |
-
@callback( Output("advanced-settings", "is_open"), Input("advanced-settings-toggle", "n_clicks"), State("advanced-settings", "is_open"), ) def toggle_advanced_settings(n_clicks, is_open): if n_clicks: return not is_open return is_open
|
282 |
-
|
283 |
-
@callback( Output("prompt", "value"), Output("script-output", "value"), Output("audio-output", "children"), Input("clear-btn", "n_clicks"), ) def clear_outputs(n_clicks): if n_clicks: return "", "", html.Div("No audio generated yet.") return dash.no_update, dash.no_update, dash.no_update
|
284 |
-
|
285 |
-
Run the app
|
286 |
-
if name == 'main': print("Starting the Dash application...") app.run(debug=True, host='0.0.0.0', port=7860) print("Dash application has finished running.")
|
287 |
-
|
288 |
-
import dash
|
289 |
-
from dash import dcc, html, Input, Output, State, callback
|
290 |
-
import dash_bootstrap_components as dbc
|
291 |
-
import base64
|
292 |
-
import io
|
293 |
-
import os
|
294 |
-
from snac import SNAC
|
295 |
-
import torch
|
296 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
297 |
-
import google.generativeai as genai
|
298 |
-
import re
|
299 |
-
import logging
|
300 |
-
import numpy as np
|
301 |
-
from pydub import AudioSegment
|
302 |
-
from docx import Document
|
303 |
-
import PyPDF2
|
304 |
-
import spaces
|
305 |
-
|
306 |
-
# Initialize logging
|
307 |
-
logging.basicConfig(level=logging.INFO)
|
308 |
-
logger = logging.getLogger(__name__)
|
309 |
-
|
310 |
-
# Initialize device
|
311 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
312 |
-
|
313 |
-
# Load models
|
314 |
-
print("Loading SNAC model...")
|
315 |
-
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
|
316 |
-
snac_model = snac_model.to(device)
|
317 |
-
|
318 |
-
model_name = "canopylabs/orpheus-3b-0.1-ft"
|
319 |
-
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
|
320 |
-
model.to(device)
|
321 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
322 |
-
print(f"Orpheus model loaded to {device}")
|
323 |
-
|
324 |
-
# Available voices and emotive tags
|
325 |
-
VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
|
326 |
-
EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
|
327 |
-
|
328 |
-
# Initialize Dash app
|
329 |
-
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
330 |
-
|
331 |
-
# Layout
|
332 |
-
app.layout = dbc.Container([
|
333 |
-
dbc.Row([
|
334 |
-
dbc.Col([
|
335 |
-
html.H1("Orpheus Text-to-Speech", className="mb-4"),
|
336 |
-
dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
|
337 |
-
dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
|
338 |
-
dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
|
339 |
-
dbc.Input(id="podcast-topic", placeholder="Enter podcast topic", className="mb-2"),
|
340 |
-
dbc.Textarea(id="prompt", placeholder="Enter your text here...", rows=5, className="mb-2"),
|
341 |
-
dcc.Upload(
|
342 |
-
id='upload-file',
|
343 |
-
children=html.Div(['Drag and Drop or ', html.A('Select a File')]),
|
344 |
-
style={
|
345 |
-
'width': '100%',
|
346 |
-
'height': '60px',
|
347 |
-
'lineHeight': '60px',
|
348 |
-
'borderWidth': '1px',
|
349 |
-
'borderStyle': 'dashed',
|
350 |
-
'borderRadius': '5px',
|
351 |
-
'textAlign': 'center',
|
352 |
-
'margin': '10px 0'
|
353 |
-
},
|
354 |
-
),
|
355 |
-
dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
|
356 |
-
dbc.RadioItems(
|
357 |
-
id="num-hosts",
|
358 |
-
options=[{"label": i, "value": i} for i in ["1", "2"]],
|
359 |
-
value="1",
|
360 |
-
inline=True,
|
361 |
-
className="mb-2"
|
362 |
-
),
|
363 |
-
dbc.Button("Generate Podcast Script", id="generate-script-btn", color="primary", className="mb-2"),
|
364 |
-
], width=6),
|
365 |
-
dbc.Col([
|
366 |
-
dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),
|
367 |
-
dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
|
368 |
-
dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
|
369 |
-
dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),
|
370 |
-
html.Div(id="audio-output"),
|
371 |
-
dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"),
|
372 |
-
dbc.Collapse([
|
373 |
-
dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
|
374 |
-
dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
|
375 |
-
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
|
376 |
-
dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
|
377 |
-
], id="advanced-settings", is_open=False),
|
378 |
-
dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),
|
379 |
-
], width=6),
|
380 |
-
]),
|
381 |
-
dcc.Store(id='generated-script'),
|
382 |
-
dcc.Store(id='generated-audio'),
|
383 |
-
])
|
384 |
-
|
385 |
-
# Combined callback
|
386 |
-
@callback(
|
387 |
-
Output("script-output", "value"),
|
388 |
-
Output("audio-output", "children"),
|
389 |
-
Output("advanced-settings", "is_open"),
|
390 |
-
Output("prompt", "value"),
|
391 |
-
Input("generate-script-btn", "n_clicks"),
|
392 |
-
Input("generate-audio-btn", "n_clicks"),
|
393 |
-
Input("advanced-settings-toggle", "n_clicks"),
|
394 |
-
Input("clear-btn", "n_clicks"),
|
395 |
-
State("host1-name", "value"),
|
396 |
-
State("host2-name", "value"),
|
397 |
-
State("podcast-name", "value"),
|
398 |
-
State("podcast-topic", "value"),
|
399 |
-
State("prompt", "value"),
|
400 |
-
State("upload-file", "contents"),
|
401 |
-
State("duration", "value"),
|
402 |
-
State("num-hosts", "value"),
|
403 |
-
State("script-output", "value"),
|
404 |
-
State("voice1", "value"),
|
405 |
-
State("voice2", "value"),
|
406 |
-
State("temperature", "value"),
|
407 |
-
State("top-p", "value"),
|
408 |
-
State("repetition-penalty", "value"),
|
409 |
-
State("max-new-tokens", "value"),
|
410 |
-
State("advanced-settings", "is_open"),
|
411 |
-
prevent_initial_call=True
|
412 |
-
)
|
413 |
-
def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
|
414 |
-
host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
|
415 |
-
script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
|
416 |
-
ctx = dash.callback_context
|
417 |
-
if not ctx.triggered:
|
418 |
-
return dash.no_update, dash.no_update, dash.no_update, dash.no_update
|
419 |
-
|
420 |
-
trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]
|
421 |
-
|
422 |
-
if trigger_id == "generate-script-btn":
|
423 |
-
try:
|
424 |
-
# Get the Gemini API key from Hugging Face secrets
|
425 |
-
api_key = os.environ.get("GEMINI_API_KEY")
|
426 |
-
if not api_key:
|
427 |
-
raise ValueError("Gemini API key not found in environment variables")
|
428 |
-
|
429 |
-
genai.configure(api_key=api_key)
|
430 |
-
model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
|
431 |
-
|
432 |
-
combined_content = prompt or ""
|
433 |
-
|
434 |
-
if uploaded_file:
|
435 |
-
content_type, content_string = uploaded_file.split(',')
|
436 |
-
decoded = base64.b64decode(content_string)
|
437 |
-
file_bytes = io.BytesIO(decoded)
|
438 |
-
|
439 |
-
# Try to detect the file type based on content
|
440 |
-
file_bytes.seek(0)
|
441 |
-
if file_bytes.read(4) == b'%PDF':
|
442 |
-
# It's a PDF file
|
443 |
-
file_bytes.seek(0)
|
444 |
-
pdf_reader = PyPDF2.PdfReader(file_bytes)
|
445 |
-
file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
|
446 |
-
else:
|
447 |
-
# Try as text file first
|
448 |
-
file_bytes.seek(0)
|
449 |
-
try:
|
450 |
-
file_content = file_bytes.read().decode('utf-8')
|
451 |
-
except UnicodeDecodeError:
|
452 |
-
# If it's not a text file, try as a docx
|
453 |
-
file_bytes.seek(0)
|
454 |
-
try:
|
455 |
-
doc = Document(file_bytes)
|
456 |
-
file_content = "\n".join([para.text for para in doc.paragraphs])
|
457 |
-
except:
|
458 |
-
raise ValueError("Unsupported file type or corrupted file")
|
459 |
-
|
460 |
-
combined_content += "\n" + file_content if combined_content else file_content
|
461 |
-
|
462 |
-
num_hosts = int(num_hosts) if num_hosts else 1
|
463 |
-
|
464 |
-
prompt_template = f"""
|
465 |
-
Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
|
466 |
-
{combined_content}
|
467 |
-
|
468 |
-
Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
|
469 |
-
Use speech fillers like um, ah. Vary emotional tone.
|
470 |
-
|
471 |
-
Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
|
472 |
-
Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
|
473 |
-
If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
|
474 |
-
Only provide the dialog for text to speech.
|
475 |
-
Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
|
476 |
-
-Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
|
477 |
-
Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
|
478 |
-
Do not include speaker labels like "jane:" or "john:" before dialogue.
|
479 |
-
The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
|
480 |
-
The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
|
481 |
-
Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...". Its just dialog.
|
482 |
-
Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
|
483 |
-
Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
|
484 |
-
Maintain natural conversation flow and speech patterns within each monologue.
|
485 |
-
Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
|
486 |
-
Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
|
487 |
-
Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
|
488 |
-
Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
|
489 |
-
{'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
|
490 |
-
Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
|
491 |
-
Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
|
492 |
-
"""
|
493 |
-
|
494 |
-
response = model.generate_content(prompt_template)
|
495 |
-
return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text), dash.no_update, dash.no_update, dash.no_update
|
496 |
-
except Exception as e:
|
497 |
-
logger.error(f"Error generating podcast script: {str(e)}")
|
498 |
-
return f"Error: {str(e)}", dash.no_update, dash.no_update, dash.no_update
|
499 |
-
|
500 |
-
elif trigger_id == "generate-audio-btn":
|
501 |
-
if not script_output.strip():
|
502 |
-
return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update
|
503 |
-
|
504 |
-
try:
|
505 |
-
paragraphs = script_output.split('\n\n') # Split by double newline
|
506 |
-
audio_samples = []
|
507 |
-
|
508 |
-
for i, paragraph in enumerate(paragraphs):
|
509 |
-
if not paragraph.strip():
|
510 |
-
continue
|
511 |
-
|
512 |
-
voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
|
513 |
-
|
514 |
-
input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
|
515 |
-
|
516 |
-
with torch.no_grad():
|
517 |
-
generated_ids = model.generate(
|
518 |
-
input_ids,
|
519 |
-
attention_mask=attention_mask,
|
520 |
-
do_sample=True,
|
521 |
-
temperature=temperature,
|
522 |
-
top_p=top_p,
|
523 |
-
repetition_penalty=repetition_penalty,
|
524 |
-
max_new_tokens=max_new_tokens,
|
525 |
-
num_return_sequences=1,
|
526 |
-
eos_token_id=128258,
|
527 |
-
)
|
528 |
-
|
529 |
-
code_list = parse_output(generated_ids)
|
530 |
-
paragraph_audio = redistribute_codes(code_list, snac_model)
|
531 |
-
|
532 |
-
silences = detect_silence(paragraph_audio)
|
533 |
-
if silences:
|
534 |
-
paragraph_audio = paragraph_audio[:silences[-1][1]]
|
535 |
-
|
536 |
-
audio_samples.append(paragraph_audio)
|
537 |
-
|
538 |
-
final_audio = np.concatenate(audio_samples)
|
539 |
-
final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
|
540 |
-
|
541 |
-
# Convert to base64 for audio playback
|
542 |
-
audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
|
543 |
-
src = f"data:audio/wav;base64,{audio_base64}"
|
544 |
-
|
545 |
-
return dash.no_update, html.Audio(src=src, controls=True), dash.no_update, dash.no_update
|
546 |
-
except Exception as e:
|
547 |
-
logger.error(f"Error generating speech: {str(e)}")
|
548 |
-
return dash.no_update, html.Div(f"Error generating audio: {str(e)}"), dash.no_update, dash.no_update
|
549 |
-
|
550 |
-
elif trigger_id == "advanced-settings-toggle":
|
551 |
-
return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update
|
552 |
-
|
553 |
-
elif trigger_id == "clear-btn":
|
554 |
-
return "", html.Div("No audio generated yet."), dash.no_update, ""
|
555 |
-
|
556 |
-
return dash.no_update, dash.no_update, dash.no_update, dash.no_update
|
557 |
-
|
558 |
-
# Run the app
|
559 |
-
if __name__ == '__main__':
|
560 |
-
print("Starting the Dash
|
561 |
-
Where do I put this? @spaces.GPU
|
562 |
-
|
563 |
-
To properly implement the @spaces.GPU() decorator in your Dash application, you should place it directly above the function that requires GPU acceleration. In this case, you would add it above the combined_callback function. Here's how you should modify your code:
|
564 |
-
|
565 |
-
python
|
566 |
-
Copy
|
567 |
-
|
568 |
-
# ... (previous code remains unchanged)
|
569 |
-
|
570 |
-
# Combined callback
|
571 |
-
@callback(
|
572 |
-
Output("script-output", "value"),
|
573 |
-
Output("audio-output", "children"),
|
574 |
-
Output("advanced-settings", "is_open"),
|
575 |
-
Output("prompt", "value"),
|
576 |
-
Input("generate-script-btn", "n_clicks"),
|
577 |
-
Input("generate-audio-btn", "n_clicks"),
|
578 |
-
Input("advanced-settings-toggle", "n_clicks"),
|
579 |
-
Input("clear-btn", "n_clicks"),
|
580 |
-
State("host1-name", "value"),
|
581 |
-
State("host2-name", "value"),
|
582 |
-
State("podcast-name", "value"),
|
583 |
-
State("podcast-topic", "value"),
|
584 |
-
State("prompt", "value"),
|
585 |
-
State("upload-file", "contents"),
|
586 |
-
State("duration", "value"),
|
587 |
-
State("num-hosts", "value"),
|
588 |
-
State("script-output", "value"),
|
589 |
-
State("voice1", "value"),
|
590 |
-
State("voice2", "value"),
|
591 |
-
State("temperature", "value"),
|
592 |
-
State("top-p", "value"),
|
593 |
-
State("repetition-penalty", "value"),
|
594 |
-
State("max-new-tokens", "value"),
|
595 |
-
State("advanced-settings", "is_open"),
|
596 |
-
prevent_initial_call=True
|
597 |
-
)
|
598 |
-
@spaces.GPU()
|
599 |
-
def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
|
600 |
-
host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
|
601 |
-
script_output, voice1, voice2, temperature, top
|
602 |
-
ERROR:main:Error generating speech: name 'process_prompt' is not defined
|
603 |
-
|
604 |
-
I apologize for the oversight. The error you're encountering is due to the process_prompt function not being defined in the code. This function is likely used for processing the input text before generating speech. Let's add the missing function and any other necessary functions to resolve this error. Here's how we can modify the code:
|
605 |
-
|
606 |
-
First, we need to add the process_prompt function.
|
607 |
-
We should also add the parse_output, redistribute_codes, and detect_silence functions, which are used in the audio generation process.
|
608 |
-
Add these functions just before the combined_callback function:
|
609 |
-
|
610 |
-
python
|
611 |
-
Copy
|
612 |
-
|
613 |
-
def process_prompt(text, voice, tokenizer, device):
|
614 |
-
prompt = f"{voice}: {text}"
|
615 |
-
inputs = tokenizer(prompt, return_tensors="pt")
|
616 |
-
input_ids = inputs["input_ids"].to(device)
|
617 |
-
attention_mask = inputs["attention_mask"].to(device)
|
618 |
-
return input_ids, attention_mask
|
619 |
-
|
620 |
-
def parse_output(generated_ids):
|
621 |
-
decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
|
622 |
-
code_list = [int(code) for code in decoded.split() if code.isdigit()]
|
623 |
-
return code_list
|
624 |
-
|
625 |
-
def redistribute_codes(code_list, snac_model):
|
626 |
-
audio = snac_model.codes_to_audio(torch.tensor(code_list).unsqueeze(0).to(device))
|
627 |
-
return audio.cpu().numpy().flatten()
|
628 |
-
|
629 |
-
def detect_silence(audio, threshold=0.01, min_silence_len=1000):
|
630 |
-
is_silent = np.abs(audio) < threshold
|
631 |
-
silent_regions = []
|
632 |
-
silent_start = None
|
633 |
-
for i, silent in enumerate(is_silent):
|
634 |
-
if silent and silent_start is None:
|
635 |
-
silent_start = i
|
636 |
-
elif not silent and silent_start is not None:
|
637 |
-
if i - silent_start >= min_silence_len:
|
638 |
-
silent_regions.append((silent_start, i))
|
639 |
-
silent_start = None
|
640 |
-
if silent_start is not None and len(audio) - silent_start >= min_silence_len:
|
641 |
-
silent_regions.append((silent_start, len(audio)))
|
642 |
-
return silent_regions
|
643 |
-
|
644 |
-
# Combined callback
|
645 |
-
@callback(
|
646 |
-
Output("script-output", "value"),
|
647 |
-
Output("audio-output", "children"),
|
648 |
-
Output("advanced-settings", "is_open"),
|
649 |
-
Output("prompt", "value"),
|
650 |
-
Input("generate-script-btn", "n_clicks"),
|
651 |
-
Input("generate-audio-btn", "n_clicks"),
|
652 |
-
Input("advanced-settings-toggle", "n_clicks"),
|
653 |
-
Input("clear-btn", "n_clicks"),
|
654 |
-
State("host1-name", "value"),
|
655 |
-
State("host2-name", "value"),
|
656 |
-
State("podcast-name", "value"),
|
657 |
-
State("podcast-topic", "value"),
|
658 |
-
State("prompt", "value"),
|
659 |
-
State("upload-file", "contents"),
|
660 |
-
State("duration", "value"),
|
661 |
-
State("num-hosts", "value"),
|
662 |
-
State("script-output", "value"),
|
663 |
-
State("voice1", "value"),
|
664 |
-
State("voice2", "value"),
|
665 |
-
State("temperature", "value"),
|
666 |
-
State("top-p", "value"),
|
667 |
-
State("repetition-penalty", "value"),
|
668 |
-
State("max-new-tokens", "value"),
|
669 |
-
State("advanced-settings", "is_open"),
|
670 |
-
prevent_initial_call=True
|
671 |
-
)
|
672 |
-
@spaces.GPU()
|
673 |
-
def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
|
674 |
-
host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
|
675 |
-
script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
|
676 |
-
# ... (rest of the function remains unchanged)
|
677 |
-
These additions should resolve the "name 'process_prompt' is not defined" error and provide the necessary functions for audio generation. Make sure to place these function definitions before the combined_callback function in your code.
|
678 |
-
|
679 |
-
It's stuck here
|
680 |
-
|
681 |
-
Using a ZeroGPU function outside of Gradio caching or request might block the app
|
682 |
-
|
683 |
-
INFO:httpx:HTTP Request: POST http://device-api.zero/allow?allowToken=765a43b283841841a6b484887de0c56c84ab4e8219e7ff2632ad6cf3e92d2012&pid=1580 "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://device-api.zero/release?allowToken=765a43b283841841a6b484887de0c56c84ab4e8219e7ff2632ad6cf3e92d2012&fail=false "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://device-api.zero/schedule?cgroupPath=%2Fkubepods.slice%2Fkubepods-burstable.slice%2Fkubepods-burstable-pod3cd9482c_d34a_484d_9434_ad8483b9ee4c.slice%2Fcri-containerd-b9a000519f1ed8024a1915b093625f1ac9c2cad6b0700f31850608b8c61a7aa5.scope&taskId=140413719708576&enableQueue=true&tokenVersion=2 "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://device-api.zero/release?allowToken=70d732fbc3156877cb6179fc0e7b1115e19291613f08e53f594be5b684eb4f9c&fail=false "HTTP/1.1 200 OK"
|
684 |
-
|
685 |
-
no audio download
|
686 |
-
|
687 |
-
I'll just attach a GPU to it, like a T4
|
688 |
-
|
689 |
-
Based on the context provided, it seems you're encountering issues with GPU allocation and audio generation in your Dash application. Let's address these problems step by step:
|
690 |
-
|
691 |
-
GPU Allocation Issue: The error message suggests that you're using a ZeroGPU function outside of the intended context, which could be blocking the app. This is likely related to the @spaces.GPU() decorator we added earlier.
|
692 |
-
|
693 |
-
Audio Download Problem: You mentioned there's no audio download, which could be related to how the audio is being generated or served.
|
694 |
-
|
695 |
-
To address these issues, let's make the following modifications:
|
696 |
-
|
697 |
-
Remove the @spaces.GPU() decorator from the combined_callback function. Instead, we'll use it more specifically within the audio generation part.
|
698 |
-
|
699 |
-
Split the audio generation into a separate function and apply the @spaces.GPU() decorator there.
|
700 |
-
|
701 |
-
Ensure the audio is properly generated and served for download.
|
702 |
-
|
703 |
-
Here's how you can modify your code:
|
704 |
-
|
705 |
-
python
|
706 |
-
Copy
|
707 |
-
|
708 |
-
import spaces
|
709 |
-
|
710 |
-
# ... (other imports and setup code)
|
711 |
-
|
712 |
@spaces.GPU()
|
713 |
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
714 |
try:
|
@@ -752,8 +169,7 @@ def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p,
|
|
752 |
except Exception as e:
|
753 |
logger.error(f"Error generating speech: {str(e)}")
|
754 |
return None
|
755 |
-
|
756 |
-
# Combined callback
|
757 |
@callback(
|
758 |
Output("script-output", "value"),
|
759 |
Output("audio-output", "children"),
|
@@ -792,7 +208,6 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
|
|
792 |
|
793 |
if trigger_id == "generate-script-btn":
|
794 |
try:
|
795 |
-
# Get the Gemini API key from Hugging Face secrets
|
796 |
api_key = os.environ.get("GEMINI_API_KEY")
|
797 |
if not api_key:
|
798 |
raise ValueError("Gemini API key not found in environment variables")
|
@@ -807,20 +222,16 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
|
|
807 |
decoded = base64.b64decode(content_string)
|
808 |
file_bytes = io.BytesIO(decoded)
|
809 |
|
810 |
-
# Try to detect the file type based on content
|
811 |
file_bytes.seek(0)
|
812 |
if file_bytes.read(4) == b'%PDF':
|
813 |
-
# It's a PDF file
|
814 |
file_bytes.seek(0)
|
815 |
pdf_reader = PyPDF2.PdfReader(file_bytes)
|
816 |
file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
|
817 |
else:
|
818 |
-
# Try as text file first
|
819 |
file_bytes.seek(0)
|
820 |
try:
|
821 |
file_content = file_bytes.read().decode('utf-8')
|
822 |
except UnicodeDecodeError:
|
823 |
-
# If it's not a text file, try as a docx
|
824 |
file_bytes.seek(0)
|
825 |
try:
|
826 |
doc = Document(file_bytes)
|
@@ -841,9 +252,9 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
|
|
841 |
|
842 |
Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
|
843 |
Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
|
844 |
-
If the number of {num_hosts
|
845 |
Only provide the dialog for text to speech.
|
846 |
-
Only use these emotion tags in angle brackets:
|
847 |
-Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
|
848 |
Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
|
849 |
Do not include speaker labels like "jane:" or "john:" before dialogue.
|
|
|
36 |
|
37 |
# Available voices and emotive tags
|
38 |
VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
|
39 |
+
EMOTIVE_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
|
40 |
|
41 |
# Initialize Dash app
|
42 |
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
|
|
126 |
silent_regions.append((silent_start, len(audio)))
|
127 |
return silent_regions
|
128 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
@spaces.GPU()
|
130 |
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
131 |
try:
|
|
|
169 |
except Exception as e:
|
170 |
logger.error(f"Error generating speech: {str(e)}")
|
171 |
return None
|
172 |
+
|
|
|
173 |
@callback(
|
174 |
Output("script-output", "value"),
|
175 |
Output("audio-output", "children"),
|
|
|
208 |
|
209 |
if trigger_id == "generate-script-btn":
|
210 |
try:
|
|
|
211 |
api_key = os.environ.get("GEMINI_API_KEY")
|
212 |
if not api_key:
|
213 |
raise ValueError("Gemini API key not found in environment variables")
|
|
|
222 |
decoded = base64.b64decode(content_string)
|
223 |
file_bytes = io.BytesIO(decoded)
|
224 |
|
|
|
225 |
file_bytes.seek(0)
|
226 |
if file_bytes.read(4) == b'%PDF':
|
|
|
227 |
file_bytes.seek(0)
|
228 |
pdf_reader = PyPDF2.PdfReader(file_bytes)
|
229 |
file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
|
230 |
else:
|
|
|
231 |
file_bytes.seek(0)
|
232 |
try:
|
233 |
file_content = file_bytes.read().decode('utf-8')
|
234 |
except UnicodeDecodeError:
|
|
|
235 |
file_bytes.seek(0)
|
236 |
try:
|
237 |
doc = Document(file_bytes)
|
|
|
252 |
|
253 |
Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
|
254 |
Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
|
255 |
+
If the number of {num_hosts} is 1 then each paragraph will be no more than 3 sentences each
|
256 |
Only provide the dialog for text to speech.
|
257 |
+
Only use these emotion tags in angle brackets: {', '.join(EMOTIVE_TAGS)}.
|
258 |
-Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
|
259 |
Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
|
260 |
Do not include speaker labels like "jane:" or "john:" before dialogue.
|