Athspi committed on
Commit f564269 · verified · 1 Parent(s): bef7195

Update app.py

Files changed (1)
  1. app.py +125 -119
app.py CHANGED
@@ -1,40 +1,35 @@
-import os
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download
 from dotenv import load_dotenv
 
-# Load environment variables
 load_dotenv()
 
-# Set number of threads (adjust based on your CPU cores)
-torch.set_num_threads(4)
 
-# Device and torch dtype selection
 device = "cuda" if torch.cuda.is_available() else "cpu"
-torch_dtype = torch.bfloat16 if device == "cuda" else torch.float32
-
-# No-op decorator for CPU mode (if you had GPU-specific decorators)
-def gpu_decorator(func):
-    return func
-
-# Import SNAC after setting device
-from snac import SNAC
 
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
-snac_model.eval()  # Set SNAC to eval mode
 
 model_name = "canopylabs/orpheus-3b-0.1-ft"
 
-# Download only necessary files for the Orpheus model
 snapshot_download(
     repo_id=model_name,
     allow_patterns=[
         "config.json",
-        "*.safetensors",
         "model.safetensors.index.json",
     ],
     ignore_patterns=[
@@ -47,42 +42,50 @@ snapshot_download(
         "special_tokens_map.json",
         "vocab.json",
         "merges.txt",
-        "tokenizer.*"
     ]
 )
 
 print("Loading Orpheus model...")
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype)
-model.to(device)
-model.eval()  # Set the model to evaluation mode
 
-# Optionally compile the model for PyTorch 2.0+ on CPU (if available)
-if hasattr(torch, "compile") and device == "cpu":
-    try:
-        model = torch.compile(model)
-        print("Model compiled with torch.compile")
-    except Exception as e:
-        print("torch.compile not supported:", e)
 
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(f"Orpheus model loaded to {device}")
 
 def process_prompt(prompt, voice, tokenizer, device):
     prompt = f"{voice}: {prompt}"
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 
-    start_token = torch.tensor([[128259]], dtype=torch.int64)
-    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)
 
-    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
     attention_mask = torch.ones_like(modified_input_ids)
     return modified_input_ids.to(device), attention_mask.to(device)
 
 def parse_output(generated_ids):
-    token_to_find = 128257
-    token_to_remove = 128258
 
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
     if len(token_indices[1]) > 0:
         last_occurrence_idx = token_indices[1][-1].item()
         cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
@@ -97,42 +100,57 @@ def parse_output(generated_ids):
     code_lists = []
     for row in processed_rows:
         row_length = row.size(0)
-        new_length = (row_length // 7) * 7
         trimmed_row = row[:new_length]
-        trimmed_row = [t - 128266 for t in trimmed_row]
         code_lists.append(trimmed_row)
 
-    return code_lists[0]
 
 def redistribute_codes(code_list, snac_model):
-    snac_device = next(snac_model.parameters()).device
-    layer_1, layer_2, layer_3 = [], [], []
-    for i in range((len(code_list) + 1) // 7):
-        layer_1.append(code_list[7 * i])
-        layer_2.append(code_list[7 * i + 1] - 4096)
-        layer_3.append(code_list[7 * i + 2] - (2 * 4096))
-        layer_3.append(code_list[7 * i + 3] - (3 * 4096))
-        layer_2.append(code_list[7 * i + 4] - (4 * 4096))
-        layer_3.append(code_list[7 * i + 5] - (5 * 4096))
-        layer_3.append(code_list[7 * i + 6] - (6 * 4096))
 
     codes = [
-        torch.tensor(layer_1, device=snac_device).unsqueeze(0),
-        torch.tensor(layer_2, device=snac_device).unsqueeze(0),
-        torch.tensor(layer_3, device=snac_device).unsqueeze(0)
     ]
     audio_hat = snac_model.decode(codes)
-    return audio_hat.detach().squeeze().cpu().numpy()
 
-@gpu_decorator
 def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
     if not text.strip():
         return None
     try:
-        progress(0.05, "Processing text...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
-        progress(0.2, "Generating tokens...")
-        with torch.inference_mode():
             generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
@@ -144,90 +162,82 @@ def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new
                 num_return_sequences=1,
                 eos_token_id=128258,
             )
-        progress(0.4, "Parsing tokens...")
         code_list = parse_output(generated_ids)
-        progress(0.7, "Generating audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
-        progress(1.0, "Done")
-        return (24000, audio_samples)
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
 
-def convert_model_to_onnx():
-    """
-    Converts the Orpheus model to ONNX format using a dummy prompt.
-    The exported file will be saved as 'orpheus_model.onnx' in the working directory.
-    """
-    dummy_prompt = "tara: Hello"
-    dummy_input = tokenizer(dummy_prompt, return_tensors="pt").input_ids.to(device)
-    file_path = "orpheus_model.onnx"
-
-    # Ensure the model is in evaluation mode and not compiled
-    model.eval()
-
-    # Reset Torch Dynamo to avoid FX-tracing issues during export.
-    if hasattr(torch, "_dynamo"):
-        try:
-            torch._dynamo.reset()
-            print("Torch Dynamo reset before ONNX export")
-        except Exception as e:
-            print(f"Warning: Torch Dynamo reset failed - {e}")
-
-    try:
-        torch.onnx.export(
-            model,
-            dummy_input,
-            file_path,
-            export_params=True,
-            opset_version=14,
-            input_names=["input_ids"],
-            output_names=["logits"],
-            dynamic_axes={
-                "input_ids": {0: "batch_size", 1: "sequence_length"},
-                "logits": {0: "batch_size", 1: "sequence_length"}
-            },
-        )
-        return f"Model converted to ONNX and saved as '{file_path}'."
-    except Exception as e:
-        return f"Error during ONNX conversion: {e}"
 
-# UI examples and voice choices
 examples = [
     ["Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.", "tara", 0.6, 0.95, 1.1, 1200],
     ["I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!", "dan", 0.7, 0.95, 1.1, 1200],
-    ["I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, let's just say a lot of parameters.", "emma", 0.6, 0.9, 1.2, 1200]
 ]
 VOICES = ["tara", "dan", "josh", "emma"]
 
 with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     gr.Markdown("""
     # 🎵 Orpheus Text-to-Speech
-    Enter text to have it converted into natural-sounding speech.
-
-    **Tips:**
-    - Include paralinguistic cues like `<chuckle>` or `<sigh>`.
-    - Longer text can produce more natural results.
     """)
     with gr.Row():
         with gr.Column(scale=3):
-            text_input = gr.Textbox(label="Text to speak", placeholder="Enter your text...", lines=5)
-            voice = gr.Dropdown(choices=VOICES, value="tara", label="Voice")
             with gr.Accordion("Advanced Settings", open=False):
-                temperature = gr.Slider(minimum=0.1, maximum=1.5, value=0.6, step=0.05, label="Temperature",
-                                        info="Higher values produce more varied speech")
-                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P",
-                                  info="Nucleus sampling threshold")
-                repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty",
-                                               info="Discourage repetition")
-                max_new_tokens = gr.Slider(minimum=100, maximum=2000, value=1200, step=100, label="Max Length",
-                                           info="Maximum generated tokens")
             with gr.Row():
                 submit_btn = gr.Button("Generate Speech", variant="primary")
                 clear_btn = gr.Button("Clear")
         with gr.Column(scale=2):
            audio_output = gr.Audio(label="Generated Speech", type="numpy")
-
     gr.Examples(
         examples=examples,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
@@ -235,22 +245,18 @@ with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
         fn=generate_speech,
         cache_examples=True,
     )
-
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
    )
-
-    gr.Markdown("## ONNX Conversion")
-    onnx_btn = gr.Button("Convert Model to ONNX")
-    onnx_output = gr.Textbox(label="Conversion Output")
-    onnx_btn.click(fn=convert_model_to_onnx, inputs=[], outputs=onnx_output)
 
 if __name__ == "__main__":
-    demo.queue().launch(share=False, ssr_mode=False)
 
 
 import torch
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from huggingface_hub import snapshot_download
+from snac import SNAC
+import time  # Import the time module
 from dotenv import load_dotenv
+from optimum.bettertransformer import BetterTransformer
 
 load_dotenv()
 
 
+# Check if CUDA is available, otherwise use CPU
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Using device: {device}")
 
+# 1. Load SNAC Model (for audio decoding)
 print("Loading SNAC model...")
 snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
 snac_model = snac_model.to(device)
+snac_model.eval()  # Set SNAC to evaluation mode
 
+# 2. Load Orpheus Language Model (for text-to-token generation)
 model_name = "canopylabs/orpheus-3b-0.1-ft"
 
+# Download only necessary files (config and safetensors)
+print("Downloading Orpheus model files...")
 snapshot_download(
     repo_id=model_name,
     allow_patterns=[
         "config.json",
+        "*.safetensors",
         "model.safetensors.index.json",
     ],
     ignore_patterns=[
 
         "special_tokens_map.json",
         "vocab.json",
         "merges.txt",
+        "tokenizer.*"
     ]
 )
 
 print("Loading Orpheus model...")
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
 
+# --- Optimization 1: Convert to BetterTransformer ---
+try:
+    model = BetterTransformer.transform(model)
+    print("Model converted to BetterTransformer for faster inference.")
+except Exception as e:
+    print(f"BetterTransformer conversion failed: {e}. Proceeding without it.")
 
+model.to(device)
+model.eval()  # Set the Orpheus model to evaluation mode
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 print(f"Orpheus model loaded to {device}")
 
+
+# --- Function Definitions ---
+
 def process_prompt(prompt, voice, tokenizer, device):
+    """Processes the text prompt and converts it to input IDs."""
     prompt = f"{voice}: {prompt}"
     input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 
+    start_token = torch.tensor([[128259]], dtype=torch.int64)  # Start of human
+    end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64)  # End of text, End of human
 
+    modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)  # SOH SOT Text EOT EOH
+
+    # No padding needed for single input
     attention_mask = torch.ones_like(modified_input_ids)
+
     return modified_input_ids.to(device), attention_mask.to(device)
 
 def parse_output(generated_ids):
+    """Parses the generated token IDs to extract the audio codes."""
+    token_to_find = 128257  # SOT token
+    token_to_remove = 128258  # EOT token
 
     token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
+
     if len(token_indices[1]) > 0:
         last_occurrence_idx = token_indices[1][-1].item()
         cropped_tensor = generated_ids[:, last_occurrence_idx + 1:]
 
     code_lists = []
     for row in processed_rows:
         row_length = row.size(0)
+        new_length = (row_length // 7) * 7  # Ensure divisibility by 7
         trimmed_row = row[:new_length]
+        trimmed_row = [t - 128266 for t in trimmed_row]  # Adjust code values
         code_lists.append(trimmed_row)
 
+    return code_lists[0]  # Return codes for the first (and only) sequence
+
 
 def redistribute_codes(code_list, snac_model):
+    """Redistributes the audio codes into the format required by SNAC."""
+    device = next(snac_model.parameters()).device  # Get the device of SNAC model
+
+    layer_1 = []
+    layer_2 = []
+    layer_3 = []
+    for i in range(len(code_list) // 7):  # Corrected loop condition
+        layer_1.append(code_list[7*i])
+        layer_2.append(code_list[7*i+1]-4096)
+        layer_3.append(code_list[7*i+2]-(2*4096))
+        layer_3.append(code_list[7*i+3]-(3*4096))
+        layer_2.append(code_list[7*i+4]-(4*4096))
+        layer_3.append(code_list[7*i+5]-(5*4096))
+        layer_3.append(code_list[7*i+6]-(6*4096))
 
+    # Move tensors to the same device as the SNAC model
     codes = [
+        torch.tensor(layer_1, device=device).unsqueeze(0),
+        torch.tensor(layer_2, device=device).unsqueeze(0),
+        torch.tensor(layer_3, device=device).unsqueeze(0)
     ]
+
     audio_hat = snac_model.decode(codes)
+    return audio_hat.detach().squeeze().cpu().numpy()  # Return CPU numpy array
+
 
 def generate_speech(text, voice, temperature, top_p, repetition_penalty, max_new_tokens, progress=gr.Progress()):
+    """Generates speech from the given text using Orpheus and SNAC."""
     if not text.strip():
         return None
+
     try:
+        start_time = time.time()  # Start timing
+
+        progress(0.1, "Processing text...")
         input_ids, attention_mask = process_prompt(text, voice, tokenizer, device)
+        process_time = time.time() - start_time
+        print(f"Text processing time: {process_time:.2f} seconds")
+
+        start_time = time.time()  # Reset timer
+        progress(0.3, "Generating speech tokens...")
+        with torch.no_grad():
             generated_ids = model.generate(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
 
                 num_return_sequences=1,
                 eos_token_id=128258,
             )
+        generation_time = time.time() - start_time
+        print(f"Token generation time: {generation_time:.2f} seconds")
+
+        start_time = time.time()  # Reset timer
+        progress(0.6, "Processing speech tokens...")
         code_list = parse_output(generated_ids)
+        parse_time = time.time() - start_time
+        print(f"Token parsing time: {parse_time:.2f} seconds")
+
+
+        start_time = time.time()  # Reset timer
+        progress(0.8, "Converting to audio...")
         audio_samples = redistribute_codes(code_list, snac_model)
+        audio_time = time.time() - start_time
+        print(f"Audio conversion time: {audio_time:.2f} seconds")
+
+        return (24000, audio_samples)  # Return sample rate and audio
     except Exception as e:
         print(f"Error generating speech: {e}")
         return None
 
 
+
+# --- Gradio Interface Setup ---
+
 examples = [
     ["Hey there my name is Tara, <chuckle> and I'm a speech generation model that can sound like a person.", "tara", 0.6, 0.95, 1.1, 1200],
     ["I've also been taught to understand and produce paralinguistic things like sighing, or chuckling, or yawning!", "dan", 0.7, 0.95, 1.1, 1200],
+    ["I live in San Francisco, and have, uhm let's see, 3 billion 7 hundred ... well, let's just say a lot of parameters.", "emma", 0.6, 0.9, 1.2, 1200]
 ]
+
 VOICES = ["tara", "dan", "josh", "emma"]
 
 with gr.Blocks(title="Orpheus Text-to-Speech") as demo:
     gr.Markdown("""
     # 🎵 Orpheus Text-to-Speech
+    Enter text below to convert to speech.
     """)
     with gr.Row():
         with gr.Column(scale=3):
+            text_input = gr.Textbox(
+                label="Text to speak",
+                placeholder="Enter your text here...",
+                lines=5
+            )
+            voice = gr.Dropdown(
+                choices=VOICES,
+                value="tara",
+                label="Voice"
+            )
+
             with gr.Accordion("Advanced Settings", open=False):
+                temperature = gr.Slider(
+                    minimum=0.1, maximum=1.5, value=0.6, step=0.05,
+                    label="Temperature"
+                )
+                top_p = gr.Slider(
+                    minimum=0.1, maximum=1.0, value=0.95, step=0.05,
+                    label="Top P"
+                )
+                repetition_penalty = gr.Slider(
+                    minimum=1.0, maximum=2.0, value=1.1, step=0.05,
+                    label="Repetition Penalty"
+                )
+                max_new_tokens = gr.Slider(
+                    minimum=100, maximum=2000, value=1200, step=100,
+                    label="Max Length"
+                )
+
             with gr.Row():
                 submit_btn = gr.Button("Generate Speech", variant="primary")
                 clear_btn = gr.Button("Clear")
+
         with gr.Column(scale=2):
             audio_output = gr.Audio(label="Generated Speech", type="numpy")
+
     gr.Examples(
         examples=examples,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
 
         fn=generate_speech,
         cache_examples=True,
     )
+
     submit_btn.click(
         fn=generate_speech,
         inputs=[text_input, voice, temperature, top_p, repetition_penalty, max_new_tokens],
         outputs=audio_output
     )
+
     clear_btn.click(
         fn=lambda: (None, None),
         inputs=[],
         outputs=[text_input, audio_output]
     )
 
 if __name__ == "__main__":
+    demo.queue().launch(share=False)
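A minimal sketch (an editor's illustration, not part of the commit) of the frame layout that redistribute_codes unpacks: Orpheus emits a flat stream of SNAC codes in 7-token frames, and each slot k within a frame carries an offset of k * 4096 so the slots occupy disjoint ID ranges; slot 0 feeds the coarsest SNAC layer, slots 1 and 4 the middle layer, and slots 2, 3, 5, and 6 the finest layer. The code values below are made up for demonstration.

# Illustration only: unpack one hypothetical 7-token frame the way
# redistribute_codes does, removing the k * 4096 offset from slot k.
frame = [10, 4096 + 20, 2 * 4096 + 30, 3 * 4096 + 40, 4 * 4096 + 50, 5 * 4096 + 60, 6 * 4096 + 70]
layer_1, layer_2, layer_3 = [], [], []
for i in range(len(frame) // 7):
    layer_1.append(frame[7 * i])                 # slot 0 -> coarsest layer
    layer_2.append(frame[7 * i + 1] - 4096)      # slot 1 -> middle layer
    layer_3.append(frame[7 * i + 2] - 2 * 4096)  # slots 2-3 -> finest layer
    layer_3.append(frame[7 * i + 3] - 3 * 4096)
    layer_2.append(frame[7 * i + 4] - 4 * 4096)  # slot 4 -> middle layer
    layer_3.append(frame[7 * i + 5] - 5 * 4096)  # slots 5-6 -> finest layer
    layer_3.append(frame[7 * i + 6] - 6 * 4096)
print(layer_1, layer_2, layer_3)  # [10] [20, 50] [30, 40, 60, 70]

The three lists are then wrapped in batched tensors and passed to snac_model.decode, as in the updated app.py above.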