Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ import numpy as np
|
|
14 |
from pydub import AudioSegment
|
15 |
from docx import Document
|
16 |
import PyPDF2
|
17 |
-
import
|
18 |
|
19 |
# Initialize logging
|
20 |
logging.basicConfig(level=logging.INFO)
|
@@ -45,7 +45,11 @@ app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
|
|
45 |
app.layout = dbc.Container([
|
46 |
dbc.Row([
|
47 |
dbc.Col([
|
48 |
-
html.H1("Orpheus Text-to-Speech", className="mb-4"),
|
|
|
|
|
|
|
|
|
49 |
dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
|
50 |
dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
|
51 |
dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
|
@@ -65,7 +69,9 @@ app.layout = dbc.Container([
|
|
65 |
'margin': '10px 0'
|
66 |
},
|
67 |
),
|
|
|
68 |
dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
|
|
|
69 |
dbc.RadioItems(
|
70 |
id="num-hosts",
|
71 |
options=[{"label": i, "value": i} for i in ["1", "2"]],
|
@@ -77,18 +83,24 @@ app.layout = dbc.Container([
|
|
77 |
], width=6),
|
78 |
dbc.Col([
|
79 |
dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),
|
|
|
|
|
80 |
dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
|
|
|
81 |
dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
|
82 |
dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),
|
83 |
html.Div(id="audio-output"),
|
84 |
-
dbc.Button("
|
85 |
dbc.Collapse([
|
|
|
86 |
dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
|
|
|
87 |
dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
|
|
|
88 |
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
|
|
|
89 |
dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
|
90 |
], id="advanced-settings", is_open=False),
|
91 |
-
dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),
|
92 |
], width=6),
|
93 |
]),
|
94 |
dcc.Store(id='generated-script'),
|
@@ -126,13 +138,12 @@ def detect_silence(audio, threshold=0.01, min_silence_len=1000):
|
|
126 |
silent_regions.append((silent_start, len(audio)))
|
127 |
return silent_regions
|
128 |
|
129 |
-
@spaces.GPU()
|
130 |
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
131 |
try:
|
132 |
paragraphs = script_output.split('\n\n') # Split by double newline
|
133 |
audio_samples = []
|
134 |
|
135 |
-
for i, paragraph in enumerate(paragraphs):
|
136 |
if not paragraph.strip():
|
137 |
continue
|
138 |
|
@@ -151,6 +162,7 @@ def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p,
|
|
151 |
max_new_tokens=max_new_tokens,
|
152 |
num_return_sequences=1,
|
153 |
eos_token_id=128258,
|
|
|
154 |
)
|
155 |
|
156 |
code_list = parse_output(generated_ids)
|
@@ -265,52 +277,4 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
|
|
265 |
Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
|
266 |
Maintain natural conversation flow and speech patterns within each monologue.
|
267 |
Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
|
268 |
-
Use speaker names ({host1_name} and/or {
|
269 |
-
Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
|
270 |
-
Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
|
271 |
-
{'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
|
272 |
-
Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
|
273 |
-
Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
|
274 |
-
"""
|
275 |
-
|
276 |
-
response = model.generate_content(prompt_template)
|
277 |
-
return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text), dash.no_update, dash.no_update, dash.no_update
|
278 |
-
except Exception as e:
|
279 |
-
logger.error(f"Error generating podcast script: {str(e)}")
|
280 |
-
return f"Error: {str(e)}", dash.no_update, dash.no_update, dash.no_update
|
281 |
-
|
282 |
-
elif trigger_id == "generate-audio-btn":
|
283 |
-
if not script_output.strip():
|
284 |
-
return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update
|
285 |
-
|
286 |
-
final_audio = generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens)
|
287 |
-
|
288 |
-
if final_audio is not None:
|
289 |
-
# Convert to base64 for audio playback
|
290 |
-
audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
|
291 |
-
src = f"data:audio/wav;base64,{audio_base64}"
|
292 |
-
|
293 |
-
# Create a download link for the audio
|
294 |
-
download_link = html.A("Download Audio", href=src, download="generated_audio.wav")
|
295 |
-
|
296 |
-
return dash.no_update, html.Div([
|
297 |
-
html.Audio(src=src, controls=True),
|
298 |
-
html.Br(),
|
299 |
-
download_link
|
300 |
-
]), dash.no_update, dash.no_update
|
301 |
-
else:
|
302 |
-
return dash.no_update, html.Div("Error generating audio"), dash.no_update, dash.no_update
|
303 |
-
|
304 |
-
elif trigger_id == "advanced-settings-toggle":
|
305 |
-
return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update
|
306 |
-
|
307 |
-
elif trigger_id == "clear-btn":
|
308 |
-
return "", html.Div("No audio generated yet."), dash.no_update, ""
|
309 |
-
|
310 |
-
return dash.no_update, dash.no_update, dash.no_update, dash.no_update
|
311 |
-
|
312 |
-
# Run the app
|
313 |
-
if __name__ == '__main__':
|
314 |
-
print("Starting the Dash application...")
|
315 |
-
app.run(debug=True, host='0.0.0.0', port=7860)
|
316 |
-
print("Dash application has finished running.")
|
|
|
14 |
from pydub import AudioSegment
|
15 |
from docx import Document
|
16 |
import PyPDF2
|
17 |
+
from tqdm import tqdm
|
18 |
|
19 |
# Initialize logging
|
20 |
logging.basicConfig(level=logging.INFO)
|
|
|
45 |
app.layout = dbc.Container([
|
46 |
dbc.Row([
|
47 |
dbc.Col([
|
48 |
+
html.H1("Orpheus Text-to-Speech", className="text-center mb-4"),
|
49 |
+
], width=12),
|
50 |
+
]),
|
51 |
+
dbc.Row([
|
52 |
+
dbc.Col([
|
53 |
dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
|
54 |
dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
|
55 |
dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
|
|
|
69 |
'margin': '10px 0'
|
70 |
},
|
71 |
),
|
72 |
+
html.Label("Duration (minutes)", className="mt-2"),
|
73 |
dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
|
74 |
+
html.Label("Number of Hosts", className="mt-2"),
|
75 |
dbc.RadioItems(
|
76 |
id="num-hosts",
|
77 |
options=[{"label": i, "value": i} for i in ["1", "2"]],
|
|
|
83 |
], width=6),
|
84 |
dbc.Col([
|
85 |
dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),
|
86 |
+
dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"),
|
87 |
+
html.Label("Voice 1", className="mt-2"),
|
88 |
dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
|
89 |
+
html.Label("Voice 2", className="mt-2"),
|
90 |
dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
|
91 |
dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),
|
92 |
html.Div(id="audio-output"),
|
93 |
+
dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),
|
94 |
dbc.Collapse([
|
95 |
+
html.Label("Temperature", className="mt-2"),
|
96 |
dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
|
97 |
+
html.Label("Top P", className="mt-2"),
|
98 |
dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
|
99 |
+
html.Label("Repetition Penalty", className="mt-2"),
|
100 |
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
|
101 |
+
html.Label("Max New Tokens", className="mt-2"),
|
102 |
dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
|
103 |
], id="advanced-settings", is_open=False),
|
|
|
104 |
], width=6),
|
105 |
]),
|
106 |
dcc.Store(id='generated-script'),
|
|
|
138 |
silent_regions.append((silent_start, len(audio)))
|
139 |
return silent_regions
|
140 |
|
|
|
141 |
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
|
142 |
try:
|
143 |
paragraphs = script_output.split('\n\n') # Split by double newline
|
144 |
audio_samples = []
|
145 |
|
146 |
+
for i, paragraph in tqdm(enumerate(paragraphs), total=len(paragraphs), desc="Generating audio"):
|
147 |
if not paragraph.strip():
|
148 |
continue
|
149 |
|
|
|
162 |
max_new_tokens=max_new_tokens,
|
163 |
num_return_sequences=1,
|
164 |
eos_token_id=128258,
|
165 |
+
pad_token_id=128258,
|
166 |
)
|
167 |
|
168 |
code_list = parse_output(generated_ids)
|
|
|
277 |
Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
|
278 |
Maintain natural conversation flow and speech patterns within each monologue.
|
279 |
Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
|
280 |
+
Use speaker names ({host1_name} and/or {host2_
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|