podcast-generator

Paused

App Files Files Community

bluenevus committed 10 days ago

Commit

9901299

verified ·

1 Parent(s): 112f5f1

Update app.py

Browse files

Files changed (1) hide show

app.py +622 -1

app.py CHANGED Viewed

@@ -126,6 +126,285 @@ def detect_silence(audio, threshold=0.01, min_silence_len=1000):
         silent_regions.append((silent_start, len(audio)))
     return silent_regions
 # Combined callback
 @callback(
     Output("script-output", "value"),
@@ -154,7 +433,6 @@ def detect_silence(audio, threshold=0.01, min_silence_len=1000):
     State("advanced-settings", "is_open"),
     prevent_initial_call=True
 )
-@spaces.GPU()
 def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
                       host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
                       script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
@@ -300,6 +578,349 @@ def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_se
     return dash.no_update, dash.no_update, dash.no_update, dash.no_update
 # Run the app
 if __name__ == '__main__':
     print("Starting the Dash application...")

         silent_regions.append((silent_start, len(audio)))
     return silent_regions
+Welcome to Python Dash! I'm your specialized AI assistant for creating efficient Dash by Plotly applications on Hugging Face Spaces. I generate minimal, precise code for AI and ML projects.
+Getting Started:
+Describe your Dash by Plotly app's core functionality for huggingface spaces
+I will always provide you the entire script for app.py and requirements.txt
+When troubleshooting I will only fix the affected code and not make any other changes
+This is my code. Get rid of the duplicate callbacks by combining them into a single function, and return the entire app.py without any other modifications.
+import dash from dash import dcc, html, Input, Output, State, callback import dash_bootstrap_components as dbc import base64 import io import os from snac import SNAC import torch from transformers import AutoModelForCausalLM, AutoTokenizer import google.generativeai as genai import re import logging import numpy as np from pydub import AudioSegment from docx import Document import PyPDF2 import spaces
+Initialize logging
+logging.basicConfig(level=logging.INFO) logger = logging.getLogger(name)
+Initialize device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+Load models
+print("Loading SNAC model...") snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz") snac_model = snac_model.to(device)
+model_name = "canopylabs/orpheus-3b-0.1-ft" model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16) model.to(device) tokenizer = AutoTokenizer.from_pretrained(model_name) print(f"Orpheus model loaded to {device}")
+Available voices and emotive tags
+VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"] EMOTIVE_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
+Initialize Dash app
+app = dash.Dash(name, external_stylesheets=[dbc.themes.BOOTSTRAP])
+Layout
+app.layout = dbc.Container([ dbc.Row([ dbc.Col([ html.H1("Orpheus Text-to-Speech", className="mb-4"), dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"), dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"), dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"), dbc.Input(id="podcast-topic", placeholder="Enter podcast topic", className="mb-2"), dbc.Textarea(id="prompt", placeholder="Enter your text here...", rows=5, className="mb-2"), dcc.Upload( id='upload-file', children=html.Div(['Drag and Drop or ', html.A('Select a File')]), style={ 'width': '100%', 'height': '60px', 'lineHeight': '60px', 'borderWidth': '1px', 'borderStyle': 'dashed', 'borderRadius': '5px', 'textAlign': 'center', 'margin': '10px 0' }, ), dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"), dbc.RadioItems( id="num-hosts", options=[{"label": i, "value": i} for i in ["1", "2"]], value="1", inline=True, className="mb-2" ), dbc.Button("Generate Podcast Script", id="generate-script-btn", color="primary", className="mb-2"), ], width=6), dbc.Col([ dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"), dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"), dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"), dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"), html.Div(id="audio-output"), dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"), dbc.Collapse([ dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"), dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"), 
dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"), dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"), ], id="advanced-settings", is_open=False), dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"), ], width=6), ]), dcc.Store(id='generated-script'), dcc.Store(id='generated-audio'), ])
+Callbacks
+@callback( Output("script-output", "value"), Input("generate-script-btn", "n_clicks"), State("host1-name", "value"), State("host2-name", "value"), State("podcast-name", "value"), State("podcast-topic", "value"), State("prompt", "value"), State("upload-file", "contents"), State("duration", "value"), State("num-hosts", "value"), prevent_initial_call=True ) def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts): if n_clicks is None: return ""
+Copy
+try:
+    # Get the Gemini API key from Hugging Face secrets
+    api_key = os.environ.get("GEMINI_API_KEY")
+    if not api_key:
+        raise ValueError("Gemini API key not found in environment variables")
+    genai.configure(api_key=api_key)
+    model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
+    combined_content = prompt or ""
+    if uploaded_file:
+        content_type, content_string = uploaded_file.split(',')
+        decoded = base64.b64decode(content_string)
+        file_bytes = io.BytesIO(decoded)
+        # Try to detect the file type based on content
+        file_bytes.seek(0)
+        if file_bytes.read(4) == b'%PDF':
+            # It's a PDF file
+            file_bytes.seek(0)
+            pdf_reader = PyPDF2.PdfReader(file_bytes)
+            file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
+        else:
+            # Try as text file first
+            file_bytes.seek(0)
+            try:
+                file_content = file_bytes.read().decode('utf-8')
+            except UnicodeDecodeError:
+                # If it's not a text file, try as a docx
+                file_bytes.seek(0)
+                try:
+                    doc = Document(file_bytes)
+                    file_content = "\n".join([para.text for para in doc.paragraphs])
+                except:
+                    raise ValueError("Unsupported file type or corrupted file")
+        combined_content += "\n" + file_content if combined_content else file_content
+    num_hosts = int(num_hosts) if num_hosts else 1
+    prompt_template = f"""
+    Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
+    {combined_content}
+    Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
+    Use speech fillers like um, ah. Vary emotional tone.
+    Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
+    Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
+    If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
+    Only provide the dialog for text to speech.
+    Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
+    -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
+    Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
+    Do not include speaker labels like "jane:" or "john:" before dialogue.
+    The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
+    The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
+    Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...".  Its just dialog.
+    Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
+    Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
+    Maintain natural conversation flow and speech patterns within each monologue.
+    Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
+    Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
+    Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
+    Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
+    {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
+    Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
+    Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
+    """
+    response = model.generate_content(prompt_template)
+    return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
+except Exception as e:
+    logger.error(f"Error generating podcast script: {str(e)}")
+    return f"Error: {str(e)}"
+@callback( Output("audio-output", "children"), Input("generate-audio-btn", "n_clicks"), State("script-output", "value"), State("voice1", "value"), State("voice2", "value"), State("temperature", "value"), State("top-p", "value"), State("repetition-penalty", "value"), State("max-new-tokens", "value"), State("num-hosts", "value"), prevent_initial_call=True ) @spaces.GPU() def generate_speech(n_clicks, text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts): if n_clicks is None or not text.strip(): return html.Div("No audio generated yet.")
+Copy
+try:
+    paragraphs = text.split('\n\n')  # Split by double newline
+    audio_samples = []
+    for i, paragraph in enumerate(paragraphs):
+        if not paragraph.strip():
+            continue
+        voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
+        input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
+        with torch.no_grad():
+            generated_ids = model.generate(
+                input_ids,
+                attention_mask=attention_mask,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                repetition_penalty=repetition_penalty,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=1,
+                eos_token_id=128258,
+            )
+        code_list = parse_output(generated_ids)
+        paragraph_audio = redistribute_codes(code_list, snac_model)
+        silences = detect_silence(paragraph_audio)
+        if silences:
+            paragraph_audio = paragraph_audio[:silences[-1][1]]
+        audio_samples.append(paragraph_audio)
+    final_audio = np.concatenate(audio_samples)
+    final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
+    # Convert to base64 for audio playback
+    audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
+    src = f"data:audio/wav;base64,{audio_base64}"
+    return html.Audio(src=src, controls=True)
+except Exception as e:
+    logger.error(f"Error generating speech: {str(e)}")
+    return html.Div(f"Error generating audio: {str(e)}")
+@callback( Output("advanced-settings", "is_open"), Input("advanced-settings-toggle", "n_clicks"), State("advanced-settings", "is_open"), ) def toggle_advanced_settings(n_clicks, is_open): if n_clicks: return not is_open return is_open
+@callback( Output("prompt", "value"), Output("script-output", "value"), Output("audio-output", "children"), Input("clear-btn", "n_clicks"), ) def clear_outputs(n_clicks): if n_clicks: return "", "", html.Div("No audio generated yet.") return dash.no_update, dash.no_update, dash.no_update
+Run the app
+if name == 'main': print("Starting the Dash application...") app.run(debug=True, host='0.0.0.0', port=7860) print("Dash application has finished running.")
+Chain of thought:
+Analyze the current code structure
+Identify the duplicate callbacks
+Combine the callbacks into a single function
+Ensure all inputs and outputs are properly handled
+Maintain the existing functionality
+Update the layout to reflect the changes
+Verify that no other modifications are made
+Now, let's implement these changes:
+python
+Copy
+import dash
+from dash import dcc, html, Input, Output, State, callback
+import dash_bootstrap_components as dbc
+import base64
+import io
+import os
+from snac import SNAC
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import google.generativeai as genai
+import re
+import logging
+import numpy as np
+from pydub import AudioSegment
+from docx import Document
+import PyPDF2
+import spaces
+# Initialize logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Initialize device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load models
+print("Loading SNAC model...")
+snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
+snac_model = snac_model.to(device)
+model_name = "canopylabs/orpheus-3b-0.1-ft"
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
+model.to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+print(f"Orpheus model loaded to {device}")
# Available voices and emotive tags
VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
# Tags are plain "<name>" strings; the Markdown backticks that had been wrapped
# around each one were rendering residue and would never match a bare tag.
EMOTIVE_TAGS = ["<laugh>", "<chuckle>", "<sigh>", "<cough>", "<sniffle>", "<groan>", "<yawn>", "<gasp>"]
+# Initialize Dash app
+app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
+# Layout
+app.layout = dbc.Container([
+    dbc.Row([
+        dbc.Col([
+            html.H1("Orpheus Text-to-Speech", className="mb-4"),
+            dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
+            dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
+            dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
+            dbc.Input(id="podcast-topic", placeholder="Enter podcast topic", className="mb-2"),
+            dbc.Textarea(id="prompt", placeholder="Enter your text here...", rows=5, className="mb-2"),
+            dcc.Upload(
+                id='upload-file',
+                children=html.Div(['Drag and Drop or ', html.A('Select a File')]),
+                style={
+                    'width': '100%',
+                    'height': '60px',
+                    'lineHeight': '60px',
+                    'borderWidth': '1px',
+                    'borderStyle': 'dashed',
+                    'borderRadius': '5px',
+                    'textAlign': 'center',
+                    'margin': '10px 0'
+                },
+            ),
+            dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
+            dbc.RadioItems(
+                id="num-hosts",
+                options=[{"label": i, "value": i} for i in ["1", "2"]],
+                value="1",
+                inline=True,
+                className="mb-2"
+            ),
+            dbc.Button("Generate Podcast Script", id="generate-script-btn", color="primary", className="mb-2"),
+        ], width=6),
+        dbc.Col([
+            dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),
+            dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
+            dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
+            dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),
+            html.Div(id="audio-output"),
+            dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"),
+            dbc.Collapse([
+                dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
+                dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
+                dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
+                dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
+            ], id="advanced-settings", is_open=False),
+            dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),
+        ], width=6),
+    ]),
+    dcc.Store(id='generated-script'),
+    dcc.Store(id='generated-audio'),
+])
 # Combined callback
 @callback(
     Output("script-output", "value"),
     State("advanced-settings", "is_open"),
     prevent_initial_call=True
 )
 def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
                       host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
                       script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
     return dash.no_update, dash.no_update, dash.no_update, dash.no_update
+# Run the app
+if __name__ == '__main__':
+    print("Starting the Dash
+where do I put this? @spaces.GPU
+To properly implement the @spaces.GPU() decorator in your Dash application, you should place it directly above the function that requires GPU acceleration. In this case, you would add it above the combined_callback function. Here's how you should modify your code:
+python
+Copy
+# ... (previous code remains unchanged)
+# Combined callback
+@callback(
+    Output("script-output", "value"),
+    Output("audio-output", "children"),
+    Output("advanced-settings", "is_open"),
+    Output("prompt", "value"),
+    Input("generate-script-btn", "n_clicks"),
+    Input("generate-audio-btn", "n_clicks"),
+    Input("advanced-settings-toggle", "n_clicks"),
+    Input("clear-btn", "n_clicks"),
+    State("host1-name", "value"),
+    State("host2-name", "value"),
+    State("podcast-name", "value"),
+    State("podcast-topic", "value"),
+    State("prompt", "value"),
+    State("upload-file", "contents"),
+    State("duration", "value"),
+    State("num-hosts", "value"),
+    State("script-output", "value"),
+    State("voice1", "value"),
+    State("voice2", "value"),
+    State("temperature", "value"),
+    State("top-p", "value"),
+    State("repetition-penalty", "value"),
+    State("max-new-tokens", "value"),
+    State("advanced-settings", "is_open"),
+    prevent_initial_call=True
+)
+@spaces.GPU()
+def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
+                      host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
+                      script_output, voice1, voice2, temperature, top
+ERROR:main:Error generating speech: name 'process_prompt' is not defined
+I apologize for the oversight. The error you're encountering is due to the process_prompt function not being defined in the code. This function is likely used for processing the input text before generating speech. Let's add the missing function and any other necessary functions to resolve this error. Here's how we can modify the code:
+First, we need to add the process_prompt function.
+We should also add the parse_output, redistribute_codes, and detect_silence functions, which are used in the audio generation process.
+Add these functions just before the combined_callback function:
+python
+Copy
def process_prompt(text, voice, tokenizer, device):
    """Tokenize one paragraph for a voice and move the tensors to ``device``.

    The text is prefixed with the voice name (``"<voice>: <text>"``) before it
    is handed to the tokenizer.

    Args:
        text: Paragraph of script text to synthesize.
        voice: Voice name placed in front of the text.
        tokenizer: Callable invoked as ``tokenizer(prompt, return_tensors="pt")``;
            its result must expose ``"input_ids"`` and ``"attention_mask"``.
        device: Target passed to each tensor's ``.to()``.

    Returns:
        A ``(input_ids, attention_mask)`` tuple, both moved to ``device``.
    """
    # NOTE(review): the prompt is tokenized as-is, with no model-specific
    # start/end tokens added here — confirm this matches what the checkpoint
    # expects.
    encoded = tokenizer(f"{voice}: {text}", return_tensors="pt")
    return tuple(encoded[key].to(device) for key in ("input_ids", "attention_mask"))
def parse_output(generated_ids):
    """Decode the first generated sequence and collect its integer codes.

    The sequence ``generated_ids[0]`` is decoded with the module-level
    ``tokenizer`` (special tokens skipped), split on whitespace, and every
    piece made up solely of decimal digits is converted to ``int``.

    Args:
        generated_ids: Indexable batch of generated token ids; only the first
            entry is used.

    Returns:
        List of ints, in the order they appear in the decoded text.
    """
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    # str.isdigit() also accepts characters such as "²" that int() cannot
    # parse; isdecimal() admits only what int() can convert, so a stray
    # superscript in the decoded text is skipped instead of raising ValueError.
    return [int(piece) for piece in decoded.split() if piece.isdecimal()]
def redistribute_codes(code_list, snac_model):
    """Turn a flat list of codes into a flat NumPy waveform via ``snac_model``.

    Args:
        code_list: Sequence of integer codes.
        snac_model: Object whose ``codes_to_audio`` receives the codes as a
            tensor with a leading batch dimension of 1, on the module-level
            ``device``.

    Returns:
        The model output moved to CPU, converted to NumPy and flattened.
    """
    # NOTE(review): relies on snac_model exposing codes_to_audio(); confirm
    # this method exists in the installed SNAC version.
    codes = torch.tensor(code_list)
    batched = codes.unsqueeze(0).to(device)
    waveform = snac_model.codes_to_audio(batched)
    return waveform.cpu().numpy().flatten()
def detect_silence(audio, threshold=0.01, min_silence_len=1000):
    """Find runs of quiet samples in an audio signal.

    A sample is quiet when its absolute value is strictly below ``threshold``.
    Consecutive quiet samples form a run; runs shorter than
    ``min_silence_len`` samples are discarded.

    Args:
        audio: Array-like of samples. Multi-dimensional input is flattened
            before scanning.
        threshold: Amplitude below which a sample counts as quiet.
        min_silence_len: Minimum run length, in samples, to report.

    Returns:
        List of ``(start, end)`` tuples of plain ints, with ``end`` exclusive,
        ordered by position in the signal.
    """
    samples = np.asarray(audio).ravel()
    is_silent = np.abs(samples) < threshold

    # Pad with a non-silent sentinel on both sides so every run, including
    # ones touching the start or end of the signal, has both a rising and a
    # falling edge. Edges then alternate start, end, start, end, ...
    padded = np.concatenate(([False], is_silent, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    starts, ends = edges[::2], edges[1::2]

    return [
        (int(start), int(end))
        for start, end in zip(starts, ends)
        if end - start >= min_silence_len
    ]
+# Combined callback
+@callback(
+    Output("script-output", "value"),
+    Output("audio-output", "children"),
+    Output("advanced-settings", "is_open"),
+    Output("prompt", "value"),
+    Input("generate-script-btn", "n_clicks"),
+    Input("generate-audio-btn", "n_clicks"),
+    Input("advanced-settings-toggle", "n_clicks"),
+    Input("clear-btn", "n_clicks"),
+    State("host1-name", "value"),
+    State("host2-name", "value"),
+    State("podcast-name", "value"),
+    State("podcast-topic", "value"),
+    State("prompt", "value"),
+    State("upload-file", "contents"),
+    State("duration", "value"),
+    State("num-hosts", "value"),
+    State("script-output", "value"),
+    State("voice1", "value"),
+    State("voice2", "value"),
+    State("temperature", "value"),
+    State("top-p", "value"),
+    State("repetition-penalty", "value"),
+    State("max-new-tokens", "value"),
+    State("advanced-settings", "is_open"),
+    prevent_initial_call=True
+)
+@spaces.GPU()
+def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
+                      host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
+                      script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
+    # ... (rest of the function remains unchanged)
+These additions should resolve the "name 'process_prompt' is not defined" error and provide the necessary functions for audio generation. Make sure to place these function definitions before the combined_callback function in your code.
+It's stuck here:
+Using a ZeroGPU function outside of Gradio caching or request might block the app
+INFO:httpx:HTTP Request: POST http://device-api.zero/allow?allowToken=765a43b283841841a6b484887de0c56c84ab4e8219e7ff2632ad6cf3e92d2012&pid=1580 "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://device-api.zero/release?allowToken=765a43b283841841a6b484887de0c56c84ab4e8219e7ff2632ad6cf3e92d2012&fail=false "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://device-api.zero/schedule?cgroupPath=%2Fkubepods.slice%2Fkubepods-burstable.slice%2Fkubepods-burstable-pod3cd9482c_d34a_484d_9434_ad8483b9ee4c.slice%2Fcri-containerd-b9a000519f1ed8024a1915b093625f1ac9c2cad6b0700f31850608b8c61a7aa5.scope&taskId=140413719708576&enableQueue=true&tokenVersion=2 "HTTP/1.1 200 OK" INFO:httpx:HTTP Request: POST http://device-api.zero/release?allowToken=70d732fbc3156877cb6179fc0e7b1115e19291613f08e53f594be5b684eb4f9c&fail=false "HTTP/1.1 200 OK"
+No audio download.
+I'll just attach a GPU to it, like a T4.
+Based on the context provided, it seems you're encountering issues with GPU allocation and audio generation in your Dash application. Let's address these problems step by step:
+GPU Allocation Issue: The error message suggests that you're using a ZeroGPU function outside of the intended context, which could be blocking the app. This is likely related to the @spaces.GPU() decorator we added earlier.
+Audio Download Problem: You mentioned there's no audio download, which could be related to how the audio is being generated or served.
+To address these issues, let's make the following modifications:
+Remove the @spaces.GPU() decorator from the combined_callback function. Instead, we'll use it more specifically within the audio generation part.
+Split the audio generation into a separate function and apply the @spaces.GPU() decorator there.
+Ensure the audio is properly generated and served for download.
+Here's how you can modify your code:
+python
+Copy
+import spaces
+# ... (other imports and setup code)
@spaces.GPU()
def generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
    """Synthesize a script paragraph by paragraph into one int16 waveform.

    The script is split on blank lines. Blank paragraphs are skipped. With a
    single host every paragraph uses ``voice1``; otherwise paragraphs at even
    positions use ``voice1`` and odd positions use ``voice2``.

    Args:
        script_output: Full script text, paragraphs separated by blank lines.
        voice1: Voice name for the first (or only) speaker.
        voice2: Voice name for the second speaker.
        num_hosts: Number of hosts; ``"1"`` or ``1`` selects single-voice mode.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling value passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        max_new_tokens: Generation length cap passed to ``model.generate``.

    Returns:
        An int16 NumPy array scaled so its peak is 32767 (all zeros if the
        generated audio is completely silent), or ``None`` if anything fails;
        failures are logged, not raised.
    """
    try:
        # Accept the radio-button string as well as an already-converted int.
        single_host = str(num_hosts) == "1"
        audio_samples = []
        # The index counts skipped blank paragraphs too, so voice alternation
        # follows position in the split list.
        for i, paragraph in enumerate(script_output.split('\n\n')):
            if not paragraph.strip():
                continue
            voice = voice1 if single_host or i % 2 == 0 else voice2
            input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
            with torch.no_grad():
                generated_ids = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    repetition_penalty=repetition_penalty,
                    max_new_tokens=max_new_tokens,
                    num_return_sequences=1,
                    eos_token_id=128258,
                )
            code_list = parse_output(generated_ids)
            paragraph_audio = redistribute_codes(code_list, snac_model)
            silences = detect_silence(paragraph_audio)
            if silences:
                # NOTE(review): this keeps audio up to the END of the last
                # silent region, so a trailing silence is not removed and any
                # sound after the last long pause is dropped — confirm intent.
                paragraph_audio = paragraph_audio[:silences[-1][1]]
            audio_samples.append(paragraph_audio)

        if not audio_samples:
            logger.error("Error generating speech: script contains no non-empty paragraphs")
            return None

        final_audio = np.concatenate(audio_samples)
        peak = np.max(np.abs(final_audio))
        if peak == 0:
            # Completely silent output: dividing by the peak would yield NaNs.
            return np.zeros(final_audio.shape, dtype=np.int16)
        return np.int16(final_audio / peak * 32767)
    except Exception as e:
        # Boundary handler: keep the UI callback alive, but log the traceback.
        logger.exception("Error generating speech: %s", e)
        return None
+# Combined callback
+@callback(
+    Output("script-output", "value"),
+    Output("audio-output", "children"),
+    Output("advanced-settings", "is_open"),
+    Output("prompt", "value"),
+    Input("generate-script-btn", "n_clicks"),
+    Input("generate-audio-btn", "n_clicks"),
+    Input("advanced-settings-toggle", "n_clicks"),
+    Input("clear-btn", "n_clicks"),
+    State("host1-name", "value"),
+    State("host2-name", "value"),
+    State("podcast-name", "value"),
+    State("podcast-topic", "value"),
+    State("prompt", "value"),
+    State("upload-file", "contents"),
+    State("duration", "value"),
+    State("num-hosts", "value"),
+    State("script-output", "value"),
+    State("voice1", "value"),
+    State("voice2", "value"),
+    State("temperature", "value"),
+    State("top-p", "value"),
+    State("repetition-penalty", "value"),
+    State("max-new-tokens", "value"),
+    State("advanced-settings", "is_open"),
+    prevent_initial_call=True
+)
+def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
+                      host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
+                      script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
+    ctx = dash.callback_context
+    if not ctx.triggered:
+        return dash.no_update, dash.no_update, dash.no_update, dash.no_update
+    trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]
+    if trigger_id == "generate-script-btn":
+        try:
+            # Get the Gemini API key from Hugging Face secrets
+            api_key = os.environ.get("GEMINI_API_KEY")
+            if not api_key:
+                raise ValueError("Gemini API key not found in environment variables")
+            genai.configure(api_key=api_key)
+            model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
+            combined_content = prompt or ""
+            if uploaded_file:
+                content_type, content_string = uploaded_file.split(',')
+                decoded = base64.b64decode(content_string)
+                file_bytes = io.BytesIO(decoded)
+                # Try to detect the file type based on content
+                file_bytes.seek(0)
+                if file_bytes.read(4) == b'%PDF':
+                    # It's a PDF file
+                    file_bytes.seek(0)
+                    pdf_reader = PyPDF2.PdfReader(file_bytes)
+                    file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
+                else:
+                    # Try as text file first
+                    file_bytes.seek(0)
+                    try:
+                        file_content = file_bytes.read().decode('utf-8')
+                    except UnicodeDecodeError:
+                        # If it's not a text file, try as a docx
+                        file_bytes.seek(0)
+                        try:
+                            doc = Document(file_bytes)
+                            file_content = "\n".join([para.text for para in doc.paragraphs])
+                        except:
+                            raise ValueError("Unsupported file type or corrupted file")
+                combined_content += "\n" + file_content if combined_content else file_content
+            num_hosts = int(num_hosts) if num_hosts else 1
+            prompt_template = f"""
+            Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
+            {combined_content}
+            Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
+            Use speech fillers like um, ah. Vary emotional tone.
+            Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
+            Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
+            If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
+            Only provide the dialog for text to speech.
+            Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
+            -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
+            Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
+            Do not include speaker labels like "jane:" or "john:" before dialogue.
+            The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
+            The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
+            Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...".  Its just dialog.
+            Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
+            Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
+            Maintain natural conversation flow and speech patterns within each monologue.
+            Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
+            Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
+            Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
+            Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
+            {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
+            Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
+            Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
+            """
+            response = model.generate_content(prompt_template)
+            return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text), dash.no_update, dash.no_update, dash.no_update
+        except Exception as e:
+            logger.error(f"Error generating podcast script: {str(e)}")
+            return f"Error: {str(e)}", dash.no_update, dash.no_update, dash.no_update
+    elif trigger_id == "generate-audio-btn":
+        if not script_output.strip():
+            return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update
+        final_audio = generate_audio(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens)
+        if final_audio is not None:
+            # Convert to base64 for audio playback
+            audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
+            src = f"data:audio/wav;base64,{audio_base64}"
+            # Create a download link for the audio
+            download_link = html.A("Download Audio", href=src, download="generated_audio.wav")
+            return dash.no_update, html.Div([
+                html.Audio(src=src, controls=True),
+                html.Br(),
+                download_link
+            ]), dash.no_update, dash.no_update
+        else:
+            return dash.no_update, html.Div("Error generating audio"), dash.no_update, dash.no_update
+    elif trigger_id == "advanced-settings-toggle":
+        return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update
+    elif trigger_id == "clear-btn":
+        return "", html.Div("No audio generated yet."), dash.no_update, ""
+    return dash.no_update, dash.no_update, dash.no_update, dash.no_update
 # Run the app
 if __name__ == '__main__':
     print("Starting the Dash application...")