podcast-generator

Paused

App Files Files Community

bluenevus committed on Apr 25

Commit

c594756

verified ·

1 Parent(s): 1f13bd4

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -215

app.py CHANGED Viewed

@@ -1,50 +1,132 @@
-import spaces
 from snac import SNAC
 import torch
-import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from huggingface_hub import snapshot_download
 import google.generativeai as genai
 import re
 import logging
 import numpy as np
 from pydub import AudioSegment
-import io
 from docx import Document
 import PyPDF2
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when CUDA is available, otherwise CPU
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")  # passed to redistribute_codes, which calls its decode()
 snac_model = snac_model.to(device)
 model_name = "canopylabs/orpheus-3b-0.1-ft"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)  # NOTE: generate_podcast_script binds its own local `model` (the Gemini client); this global is the one generate_speech calls .generate() on
 model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)  # loaded from the same checkpoint name as `model`
 print(f"Orpheus model loaded to {device}")
-# Available voices
 VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
-# Available Emotive Tags
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
-@spaces.GPU()
-def generate_podcast_script(api_key, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
     try:
         genai.configure(api_key=api_key)
         model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
         combined_content = prompt or ""
-        if uploaded_file is not None:
-            file_bytes = io.BytesIO(uploaded_file)
             # Try to detect the file type based on content
             file_bytes.seek(0)
@@ -105,99 +187,26 @@ def generate_podcast_script(api_key, host1_name, host2_name, podcast_name, podca
         return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
     except Exception as e:
         logger.error(f"Error generating podcast script: {str(e)}")
-        raise
-def process_prompt(prompt, voice, tokenizer, device):  # Build model inputs for one paragraph; returns (input_ids, attention_mask), both moved to `device`
-    prompt = f"{voice}: {prompt}"  # prefix the text with the selected voice name
-    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
-    start_token = torch.tensor([[128259]], dtype=torch.int64)  # presumably a model-specific start-of-prompt marker id — TODO confirm against the model's documentation
-    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # presumably model-specific end-of-prompt marker ids — TODO confirm
-    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # [start] + tokenized text + [end markers], concatenated along dim=1
-    attention_mask = torch.ones_like(modified_input_ids)  # all ones: no padding is added here, so every position is attended to
-    return modified_input_ids.to(device), attention_mask.to(device)
-def parse_output(generated_ids):  # Convert generated token ids into a list of offset-corrected code values; only the first row is returned
-    token_to_find = 128257  # marker id: everything up to and including its last occurrence is discarded below
-    token_to_remove = 128258  # same value that generate_speech passes as eos_token_id to model.generate
-    token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)  # tuple of (row indices, column indices) of the matches
-    if len(token_indices[1]) > 0:
-        last_occurrence_idx = token_indices[1][-1].item()  # column of the last reported match (the last occurrence when there is a single row)
-        cropped_tensor = generated_ids[:, last_occurrence_idx+1:]  # keep only what follows the marker
-    else:
-        cropped_tensor = generated_ids  # marker absent: keep the whole sequence
-    processed_rows = []
-    for row in cropped_tensor:
-        masked_row = row[row != token_to_remove]  # drop every occurrence of token_to_remove from the row
-        processed_rows.append(masked_row)
-    code_lists = []
-    for row in processed_rows:
-        row_length = row.size(0)
-        new_length = (row_length // 7) * 7  # round down to a multiple of 7; redistribute_codes consumes codes in groups of 7
-        trimmed_row = row[:new_length]
-        trimmed_row = [t - 128266 for t in trimmed_row]  # subtract a fixed id offset; iterating a tensor yields 0-dim tensors, so this is a list of tensors, not ints
-        code_lists.append(trimmed_row)
-    return code_lists[0]  # rows beyond the first, if any, are ignored
-def redistribute_codes(code_list, snac_model):  # Split a flat code list into three layers, decode them with snac_model, and return the audio as a NumPy array
-    device = next(snac_model.parameters()).device  # Get the device of SNAC model
-    layer_1 = []
-    layer_2 = []
-    layer_3 = []
-    for i in range((len(code_list)+1)//7):  # one iteration per group of 7 codes; NOTE(review): the +1 would index past the end if len % 7 == 6, which parse_output's trimming to a multiple of 7 prevents for its own output
-        layer_1.append(code_list[7*i])  # position 0 of the group goes to layer 1 unshifted
-        layer_2.append(code_list[7*i+1]-4096)  # position k of the group is shifted down by k*4096
-        layer_3.append(code_list[7*i+2]-(2*4096))
-        layer_3.append(code_list[7*i+3]-(3*4096))
-        layer_2.append(code_list[7*i+4]-(4*4096))
-        layer_3.append(code_list[7*i+5]-(5*4096))
-        layer_3.append(code_list[7*i+6]-(6*4096))  # per group: 1 entry for layer 1, 2 for layer 2, 4 for layer 3
-    codes = [
-        torch.tensor(layer_1, device=device).unsqueeze(0),  # unsqueeze(0) adds a leading dimension of size 1
-        torch.tensor(layer_2, device=device).unsqueeze(0),
-        torch.tensor(layer_3, device=device).unsqueeze(0)
-    ]
-    audio_hat = snac_model.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()  # Always return CPU numpy array
-def detect_silence(audio, threshold=0.005, min_silence_duration=1.3):  # Return a list of (start, end) sample-index pairs for runs of quiet samples lasting at least min_silence_duration seconds
-    sample_rate = 24000  # Adjust if your sample rate is different
-    is_silent = np.abs(audio) < threshold  # a sample counts as silent when its absolute value is below threshold
-    silent_regions = np.where(is_silent)[0]  # indices of silent samples along the first axis — assumes audio is 1-D, TODO confirm
-    silence_starts = []
-    silence_ends = []
-    if len(silent_regions) > 0:
-        silence_starts.append(silent_regions[0])
-        for i in range(1, len(silent_regions)):
-            if silent_regions[i] - silent_regions[i-1] > 1:  # a gap in consecutive indices closes one run and opens the next
-                silence_ends.append(silent_regions[i-1])
-                silence_starts.append(silent_regions[i])
-        silence_ends.append(silent_regions[-1])  # close the final run
-    long_silences = [(start, end) for start, end in zip(silence_starts, silence_ends)
-                     if (end - start) / sample_rate >= min_silence_duration]  # end is the index of the last silent sample in the run (inclusive)
-    return long_silences
-@spaces.GPU()
-def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
-    if not text.strip():
-        return None
     try:
-        progress(0.1, "Processing text...")
         paragraphs = text.split('\n\n')  # Split by double newline
         audio_samples = []
@@ -209,7 +218,6 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
             input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
-            progress(0.3, f"Generating speech tokens for paragraph {i+1}...")
             with torch.no_grad():
                 generated_ids = model.generate(
                     input_ids,
@@ -223,130 +231,50 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
                     eos_token_id=128258,
                 )
-            progress(0.6, f"Processing speech tokens for paragraph {i+1}...")
             code_list = parse_output(generated_ids)
-            progress(0.8, f"Converting paragraph {i+1} to audio...")
             paragraph_audio = redistribute_codes(code_list, snac_model)
-            # Add silence detection here
             silences = detect_silence(paragraph_audio)
             if silences:
-                # Trim the audio at the last detected silence
                 paragraph_audio = paragraph_audio[:silences[-1][1]]
             audio_samples.append(paragraph_audio)
         final_audio = np.concatenate(audio_samples)
-        # Normalize the audio
         final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
-        return (24000, final_audio)
     except Exception as e:
-        print(f"Error generating speech: {e}")
-        return None
-with gr.Blocks(title="Orpheus Text-to-Speech") as demo:  # Gradio UI: script inputs, script generation controls, voice and audio controls
-    with gr.Row():
-        def get_field_value(field, default=""):  # not called anywhere in the lines visible here
-            return field.value if field.value and not field.value.isspace() else default
-        with gr.Column(scale=1):
-            gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
-            host1_name = gr.Textbox(label="Name of Podcast Host 1", placeholder="Enter name of first host")
-            host2_name = gr.Textbox(label="Name of Podcast Host 2", placeholder="Enter name of second host")
-            podcast_name = gr.Textbox(label="Name of Podcast", placeholder="Enter podcast name")
-            podcast_topic = gr.Textbox(label="Podcast Topic", placeholder="Enter podcast topic")
-            prompt = gr.Textbox(
-                label="Prompt",
-                placeholder="Enter your text here...",
-                lines=5,
-                max_lines=30,
-                show_label=True,
-                interactive=True,
-                container=True
-            )
-        with gr.Column(scale=2):
-            uploaded_file = gr.File(label="Upload File", type="binary")  # generate_podcast_script wraps this value in io.BytesIO
-            duration = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Duration (minutes)")
-            num_hosts = gr.Radio(["1", "2"], label="Number of Hosts", value="1")  # values are the strings "1"/"2", not ints
-            script_output = gr.Textbox(label="Generated Script", lines=10)
-            generate_script_btn = gr.Button("Generate Podcast Script")  # Add this line
-            generate_script_btn.click(  # first registration: 9 inputs, matching generate_podcast_script's 9 parameters
-                fn=generate_podcast_script,
-                inputs=[
-                    gemini_api_key,
-                    host1_name,
-                    host2_name,
-                    podcast_name,
-                    podcast_topic,
-                    prompt,
-                    uploaded_file,
-                    duration,
-                    num_hosts
-                ],
-                outputs=script_output
-            )
-        with gr.Column(scale=2):
-            voice1 = gr.Dropdown(
-                choices=VOICES,
-                value="tara",
-                label="Voice 1",
-                info="Select the first voice for speech generation"
-            )
-            voice2 = gr.Dropdown(
-                choices=VOICES,
-                value="zac",
-                label="Voice 2",
-                info="Select the second voice for speech generation"
-            )
-            with gr.Accordion("Advanced Settings", open=False):
-                temperature = gr.Slider(
-                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
-                    label="Temperature",
-                    info="Higher values (0.7-1.0) create more expressive but less stable speech"
-                )
-                top_p = gr.Slider(
-                    minimum=0.1, maximum=1.0, value=0.9, step=0.05,
-                    label="Top P",
-                    info="Higher values produce more diverse outputs"
-                )
-                repetition_penalty = gr.Slider(
-                    minimum=1.0, maximum=2.0, value=1.2, step=0.1,
-                    label="Repetition Penalty",
-                    info="Higher values discourage repetitive patterns"
-                )
-                max_new_tokens = gr.Slider(
-                    minimum=100, maximum=16384, value=4096, step=100,
-                    label="Max Length",
-                    info="Maximum length of generated audio (in tokens)"
-                )
-            audio_output = gr.Audio(label="Generated Audio", type="numpy")  # generate_speech returns (24000, int16 array) on success, None on failure
-            with gr.Row():
-                submit_btn = gr.Button("Generate Audio", variant="primary")
-                clear_btn = gr.Button("Clear")
-    generate_script_btn.click(  # NOTE(review): second registration on the same button, with only 5 inputs for a function that takes 9 positional parameters
-        fn=generate_podcast_script,
-        inputs=[gemini_api_key, prompt, uploaded_file, duration, num_hosts],
-        outputs=script_output
-    )
-    submit_btn.click(
-        fn=generate_speech,
-        inputs=[script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts],
-        outputs=audio_output
-    )
-    clear_btn.click(
-        fn=lambda: (None, None, None),  # one None per output: prompt, script_output, audio_output
-        inputs=[],
-        outputs=[prompt, script_output, audio_output]
-    )
-if __name__ == "__main__":
-    demo.queue().launch(share=False, ssr_mode=False)  # enable queuing, then launch with share and ssr_mode both switched off

+import dash
+from dash import dcc, html, Input, Output, State, callback
+import dash_bootstrap_components as dbc
+import base64
+import io
+import os
 from snac import SNAC
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import google.generativeai as genai
 import re
 import logging
 import numpy as np
 from pydub import AudioSegment
 from docx import Document
 import PyPDF2
+# Initialize logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Initialize device: use the GPU when CUDA is available, otherwise CPU
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Load models (runs at import time, before the Dash app is created)
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")  # passed to redistribute_codes by generate_speech
 snac_model = snac_model.to(device)
 model_name = "canopylabs/orpheus-3b-0.1-ft"
 model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)  # NOTE: generate_podcast_script binds its own local `model` (the Gemini client); this global is the one generate_speech calls .generate() on
 model.to(device)
 tokenizer = AutoTokenizer.from_pretrained(model_name)  # loaded from the same checkpoint name as `model`
 print(f"Orpheus model loaded to {device}")
+# Available voices and emotive tags
 VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]  # options for the "voice1" / "voice2" dropdowns in the layout
 EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]  # each entry includes literal backticks; not referenced in the lines visible here
+# Initialize Dash app
+app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])
+# Layout: the left column collects script inputs; the right column holds the script, voice pickers, audio output and advanced settings
+app.layout = dbc.Container([
+    dbc.Row([
+        dbc.Col([
+            html.H1("Orpheus Text-to-Speech", className="mb-4"),
+            dbc.Input(id="host1-name", placeholder="Enter name of first host", className="mb-2"),
+            dbc.Input(id="host2-name", placeholder="Enter name of second host", className="mb-2"),
+            dbc.Input(id="podcast-name", placeholder="Enter podcast name", className="mb-2"),
+            dbc.Input(id="podcast-topic", placeholder="Enter podcast topic", className="mb-2"),
+            dbc.Textarea(id="prompt", placeholder="Enter your text here...", rows=5, className="mb-2"),
+            dcc.Upload(  # its "contents" property is read as State by generate_podcast_script, which splits it on ',' and base64-decodes the second part
+                id='upload-file',
+                children=html.Div(['Drag and Drop or ', html.A('Select a File')]),
+                style={
+                    'width': '100%',
+                    'height': '60px',
+                    'lineHeight': '60px',
+                    'borderWidth': '1px',
+                    'borderStyle': 'dashed',
+                    'borderRadius': '5px',
+                    'textAlign': 'center',
+                    'margin': '10px 0'
+                },
+            ),
+            dcc.Slider(id="duration", min=1, max=60, value=5, step=1, marks={1: '1', 30: '30', 60: '60'}, className="mb-2"),
+            dbc.RadioItems(
+                id="num-hosts",
+                options=[{"label": i, "value": i} for i in ["1", "2"]],  # values are the strings "1"/"2", not ints
+                value="1",
+                inline=True,
+                className="mb-2"
+            ),
+            dbc.Button("Generate Podcast Script", id="generate-script-btn", color="primary", className="mb-2"),  # triggers generate_podcast_script
+        ], width=6),
+        dbc.Col([
+            dbc.Textarea(id="script-output", placeholder="Generated script will appear here...", rows=10, className="mb-2"),  # written by generate_podcast_script and clear_outputs; read by generate_speech
+            dcc.Dropdown(id="voice1", options=[{"label": v, "value": v} for v in VOICES], value="tara", className="mb-2"),
+            dcc.Dropdown(id="voice2", options=[{"label": v, "value": v} for v in VOICES], value="zac", className="mb-2"),
+            dbc.Button("Generate Audio", id="generate-audio-btn", color="success", className="mb-2"),  # triggers generate_speech
+            html.Div(id="audio-output"),  # children are replaced by generate_speech and clear_outputs
+            dbc.Button("Clear", id="clear-btn", color="secondary", className="mb-2"),  # triggers clear_outputs
+            dbc.Collapse([  # generation parameters, hidden until toggle_advanced_settings flips is_open
+                dcc.Slider(id="temperature", min=0.1, max=1.5, value=0.6, step=0.05, marks={0.1: '0.1', 0.8: '0.8', 1.5: '1.5'}, className="mb-2"),
+                dcc.Slider(id="top-p", min=0.1, max=1.0, value=0.9, step=0.05, marks={0.1: '0.1', 0.5: '0.5', 1.0: '1.0'}, className="mb-2"),
+                dcc.Slider(id="repetition-penalty", min=1.0, max=2.0, value=1.2, step=0.1, marks={1.0: '1.0', 1.5: '1.5', 2.0: '2.0'}, className="mb-2"),
+                dcc.Slider(id="max-new-tokens", min=100, max=16384, value=4096, step=100, marks={100: '100', 8192: '8192', 16384: '16384'}, className="mb-2"),
+            ], id="advanced-settings", is_open=False),
+            dbc.Button("Advanced Settings", id="advanced-settings-toggle", color="info", className="mb-2"),  # triggers toggle_advanced_settings
+        ], width=6),
+    ]),
+    dcc.Store(id='generated-script'),  # not referenced by any callback visible in this view
+    dcc.Store(id='generated-audio'),  # not referenced by any callback visible in this view
+])
+# Callbacks
+@callback(
+    Output("script-output", "value"),
+    Input("generate-script-btn", "n_clicks"),
+    State("host1-name", "value"),
+    State("host2-name", "value"),
+    State("podcast-name", "value"),
+    State("podcast-topic", "value"),
+    State("prompt", "value"),
+    State("upload-file", "contents"),
+    State("duration", "value"),
+    State("num-hosts", "value"),
+    prevent_initial_call=True
+)
+def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
+    if n_clicks is None:
+        return ""
     try:
+        # Get the Gemini API key from Hugging Face secrets
+        api_key = os.environ.get("GEMINI_API_KEY")
+        if not api_key:
+            raise ValueError("Gemini API key not found in environment variables")
         genai.configure(api_key=api_key)
         model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
         combined_content = prompt or ""
+        if uploaded_file:
+            content_type, content_string = uploaded_file.split(',')
+            decoded = base64.b64decode(content_string)
+            file_bytes = io.BytesIO(decoded)
             # Try to detect the file type based on content
             file_bytes.seek(0)
         return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
     except Exception as e:
         logger.error(f"Error generating podcast script: {str(e)}")
+        return f"Error: {str(e)}"
+@callback(
+    Output("audio-output", "children"),
+    Input("generate-audio-btn", "n_clicks"),
+    State("script-output", "value"),
+    State("voice1", "value"),
+    State("voice2", "value"),
+    State("temperature", "value"),
+    State("top-p", "value"),
+    State("repetition-penalty", "value"),
+    State("max-new-tokens", "value"),
+    State("num-hosts", "value"),
+    prevent_initial_call=True
+)
+def generate_speech(n_clicks, text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts):
+    if n_clicks is None or not text.strip():
+        return html.Div("No audio generated yet.")
     try:
         paragraphs = text.split('\n\n')  # Split by double newline
         audio_samples = []
             input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
             with torch.no_grad():
                 generated_ids = model.generate(
                     input_ids,
                     eos_token_id=128258,
                 )
             code_list = parse_output(generated_ids)
             paragraph_audio = redistribute_codes(code_list, snac_model)
             silences = detect_silence(paragraph_audio)
             if silences:
                 paragraph_audio = paragraph_audio[:silences[-1][1]]
             audio_samples.append(paragraph_audio)
         final_audio = np.concatenate(audio_samples)
         final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
+        # Convert to base64 for audio playback
+        audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
+        src = f"data:audio/wav;base64,{audio_base64}"
+        return html.Audio(src=src, controls=True)
     except Exception as e:
+        logger.error(f"Error generating speech: {str(e)}")
+        return html.Div(f"Error generating audio: {str(e)}")
+@callback(
+    Output("advanced-settings", "is_open"),
+    Input("advanced-settings-toggle", "n_clicks"),
+    State("advanced-settings", "is_open"),
+)
+def toggle_advanced_settings(n_clicks, is_open):
+    """Flip the "Advanced Settings" collapse when its toggle button is clicked.
+
+    Returns the negated ``is_open`` once ``n_clicks`` is truthy; while it is
+    still falsy (no click registered yet) the current state is returned as-is.
+    """
+    return (not is_open) if n_clicks else is_open
+@callback(
+    Output("prompt", "value"),
+    # "script-output.value" and "audio-output.children" are also written by
+    # generate_podcast_script and generate_speech. Dash rejects a second
+    # callback targeting the same output unless it is marked allow_duplicate.
+    Output("script-output", "value", allow_duplicate=True),
+    Output("audio-output", "children", allow_duplicate=True),
+    Input("clear-btn", "n_clicks"),
+    # Dash requires prevent_initial_call=True on callbacks that use allow_duplicate.
+    prevent_initial_call=True,
+)
+def clear_outputs(n_clicks):
+    """Reset the prompt, the generated script and the audio area.
+
+    Args:
+        n_clicks: Click count of the "Clear" button.
+
+    Returns:
+        A 3-tuple matching the declared outputs: empty strings for the prompt
+        and script text areas and a placeholder ``html.Div`` for the audio
+        area. When no click has been registered, ``dash.no_update`` is
+        returned for all three so nothing is overwritten.
+    """
+    if not n_clicks:
+        return dash.no_update, dash.no_update, dash.no_update
+    return "", "", html.Div("No audio generated yet.")
+# Run the app
+if __name__ == '__main__':
+    print("Starting the Dash application...")
+    # The server binds to every interface (0.0.0.0). With debug=True the
+    # Werkzeug interactive debugger would be reachable by anyone who can reach
+    # the port, and it allows arbitrary code execution. Debug mode also turns
+    # on the reloader, which re-imports this module (and reloads the models)
+    # in a child process. Keep it off unless explicitly requested.
+    debug_mode = os.environ.get("DASH_DEBUG", "").strip().lower() in ("1", "true", "yes")
+    app.run(debug=debug_mode, host='0.0.0.0', port=7860)
+    print("Dash application has finished running.")