bluenevus committed on
Commit
2cf25ca
·
verified ·
1 Parent(s): 624da7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -63
app.py CHANGED
@@ -4,8 +4,6 @@ import torch
4
  import gradio as gr
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from huggingface_hub import snapshot_download
7
- import google.generativeai as genai
8
- import re
9
  import logging
10
 
11
  logging.basicConfig(level=logging.INFO)
@@ -24,39 +22,11 @@ model.to(device)
24
  tokenizer = AutoTokenizer.from_pretrained(model_name)
25
  print(f"Orpheus model loaded to {device}")
26
 
27
- @spaces.GPU()
28
- def generate_podcast_script(api_key, content, uploaded_file, duration, num_hosts):
29
- try:
30
- genai.configure(api_key=api_key)
31
- model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
32
-
33
- combined_content = content or ""
34
- if uploaded_file:
35
- file_content = uploaded_file.read().decode('utf-8')
36
- combined_content += "\n" + file_content if combined_content else file_content
37
-
38
- prompt = f"""
39
- Create a podcast script for {'one person' if num_hosts == 1 else 'two people'} discussing:
40
- {combined_content}
41
-
42
- Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
43
- Use speech fillers like um, ah. Vary emotional tone.
44
-
45
- Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
46
- Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
47
-
48
- Use emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
49
-
50
- Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
51
-
52
- Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
53
- """
54
-
55
- response = model.generate_content(prompt)
56
- return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
57
- except Exception as e:
58
- logger.error(f"Error generating podcast script: {str(e)}")
59
- raise
60
 
61
  def process_prompt(prompt, voice, tokenizer, device):
62
  prompt = f"{voice}: {prompt}"
@@ -97,6 +67,26 @@ def parse_output(generated_ids):
97
 
98
  return code_lists[0]
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  @spaces.GPU()
101
  def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
102
  if not text.strip():
@@ -132,51 +122,64 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
132
  return None
133
 
134
  with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
135
- gr.Markdown("# 🎵 Orpheus Text-to-Speech with Gemini Script Generation")
 
 
 
 
 
 
 
136
 
137
  with gr.Row():
138
- with gr.Column(scale=1):
139
- gemini_api_key = gr.Textbox(label="Gemini API Key", type="password")
140
- content = gr.Textbox(label="Content", lines=8)
141
- uploaded_file = gr.File(label="Upload File")
142
- duration = gr.Slider(minimum=1, maximum=60, value=5, step=1, label="Duration (minutes)")
143
- num_hosts = gr.Radio(["1", "2"], label="Number of Hosts", value="1")
144
- generate_script_btn = gr.Button("Generate Podcast Script")
145
-
146
  with gr.Column(scale=2):
147
- script_output = gr.Textbox(label="Generated Script", lines=10)
148
- voice = gr.Dropdown(choices=["Narrator", "Male", "Female"], value="Narrator", label="Voice")
 
 
 
 
 
149
 
150
- with gr.Row():
151
- temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
152
- top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top P")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
  with gr.Row():
155
- repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.2, step=0.1, label="Repetition Penalty")
156
- max_new_tokens = gr.Slider(minimum=100, maximum=1000, value=500, step=50, label="Max New Tokens")
157
-
158
- submit_btn = gr.Button("Generate Speech")
159
- clear_btn = gr.Button("Clear")
160
 
161
  with gr.Column(scale=2):
162
  audio_output = gr.Audio(label="Generated Speech", type="numpy")
163
 
164
- generate_script_btn.click(
165
- fn=generate_podcast_script,
166
- inputs=[gemini_api_key, content, uploaded_file, duration, num_hosts],
167
- outputs=script_output
168
- )
169
-
170
  submit_btn.click(
171
  fn=generate_speech,
172
- inputs=[script_output, voice, temperature, top_p, repetition_penalty, max_new_tokens],
173
  outputs=audio_output
174
  )
175
 
176
  clear_btn.click(
177
- fn=lambda: (None, None, None),
178
  inputs=[],
179
- outputs=[content, script_output, audio_output]
180
  )
181
 
182
  if __name__ == "__main__":
 
4
  import gradio as gr
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
  from huggingface_hub import snapshot_download
 
 
7
  import logging
8
 
9
  logging.basicConfig(level=logging.INFO)
 
22
  tokenizer = AutoTokenizer.from_pretrained(model_name)
23
  print(f"Orpheus model loaded to {device}")
24
 
25
+ # Available voices
26
+ VOICES = ["tara", "leah", "jess", "leo", "dan", "mia", "zac", "zoe"]
27
+
28
+ # Available Emotive Tags
29
+ EMOTIVE_TAGS = ["`<laugh>`", "`<chuckle>`", "`<sigh>`", "`<cough>`", "`<sniffle>`", "`<groan>`", "`<yawn>`", "`<gasp>`"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  def process_prompt(prompt, voice, tokenizer, device):
32
  prompt = f"{voice}: {prompt}"
 
67
 
68
  return code_lists[0]
69
 
70
+ def redistribute_codes(code_list, snac_model):
71
+ layer_1, layer_2, layer_3 = [], [], []
72
+ for i in range(len(code_list) // 7):
73
+ layer_1.append(code_list[7*i])
74
+ layer_2.append(code_list[7*i+1]-(1*4096))
75
+ layer_3.append(code_list[7*i+2]-(2*4096))
76
+ layer_3.append(code_list[7*i+3]-(3*4096))
77
+ layer_2.append(code_list[7*i+4]-(4*4096))
78
+ layer_3.append(code_list[7*i+5]-(5*4096))
79
+ layer_3.append(code_list[7*i+6]-(6*4096))
80
+
81
+ codes = [
82
+ torch.tensor(layer_1, device=device).unsqueeze(0),
83
+ torch.tensor(layer_2, device=device).unsqueeze(0),
84
+ torch.tensor(layer_3, device=device).unsqueeze(0)
85
+ ]
86
+
87
+ audio = snac_model.decode(codes)
88
+ return audio.cpu().numpy().flatten()
89
+
90
  @spaces.GPU()
91
  def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
92
  if not text.strip():
 
122
  return None
123
 
124
  with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
125
+ gr.Markdown(f"""
126
+ # 🎵 [Orpheus Text-to-Speech](https://github.com/canopyai/Orpheus-TTS)
127
+ Enter your text below and hear it converted to natural-sounding speech with the Orpheus TTS model.
128
+
129
+ ## Tips for better prompts:
130
+ - Add paralinguistic elements like {", ".join(EMOTIVE_TAGS)} or `uhm` for more human-like speech.
131
+ - Longer text prompts generally work better than very short phrases
132
+ """)
133
 
134
  with gr.Row():
 
 
 
 
 
 
 
 
135
  with gr.Column(scale=2):
136
+ text_input = gr.Textbox(label="Text to speak", lines=5)
137
+ voice = gr.Dropdown(
138
+ choices=VOICES,
139
+ value="tara",
140
+ label="Voice",
141
+ info="Select the voice for speech generation"
142
+ )
143
 
144
+ with gr.Accordion("Advanced Settings", open=False):
145
+ temperature = gr.Slider(
146
+ minimum=0.1, maximum=1.5, value=0.6, step=0.05,
147
+ label="Temperature",
148
+ info="Higher values (0.7-1.0) create more expressive but less stable speech"
149
+ )
150
+ top_p = gr.Slider(
151
+ minimum=0.1, maximum=1.0, value=0.9, step=0.05,
152
+ label="Top P",
153
+ info="Higher values produce more diverse outputs"
154
+ )
155
+ repetition_penalty = gr.Slider(
156
+ minimum=1.0, maximum=2.0, value=1.2, step=0.1,
157
+ label="Repetition Penalty",
158
+ info="Higher values discourage repetitive patterns"
159
+ )
160
+ max_new_tokens = gr.Slider(
161
+ minimum=100, maximum=2000, value=1200, step=100,
162
+ label="Max Length",
163
+ info="Maximum length of generated audio (in tokens)"
164
+ )
165
 
166
  with gr.Row():
167
+ submit_btn = gr.Button("Generate Speech", variant="primary")
168
+ clear_btn = gr.Button("Clear")
 
 
 
169
 
170
  with gr.Column(scale=2):
171
  audio_output = gr.Audio(label="Generated Speech", type="numpy")
172
 
 
 
 
 
 
 
173
  submit_btn.click(
174
  fn=generate_speech,
175
+ inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
176
  outputs=audio_output
177
  )
178
 
179
  clear_btn.click(
180
+ fn=lambda: (None, None),
181
  inputs=[],
182
+ outputs=[text_input, audio_output]
183
  )
184
 
185
  if __name__ == "__main__":