qsense committed
Commit f852f45 · verified · 1 Parent(s): 2547c62

Update app.py

Files changed (1):
  1. app.py +23 -71
app.py CHANGED
@@ -1,50 +1,40 @@
 import io
 from threading import Thread
-import random
-import os
+import time
 
 import numpy as np
-import spaces
 import gradio as gr
 import torch
-
-from parler_tts import ParlerTTSForConditionalGeneration
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
-from huggingface_hub import InferenceClient
+from parler_tts import ParlerTTSForConditionalGeneration
 from streamer import ParlerTTSStreamer
-import time
-
 
+# Device and model setup
 device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 torch_dtype = torch.float16 if device != "cpu" else torch.float32
 
 repo_id = "parler-tts/parler_tts_mini_v0.1"
-
 jenny_repo_id = "ylacombe/parler-tts-mini-jenny-30H"
 
 model = ParlerTTSForConditionalGeneration.from_pretrained(
     jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
 ).to(device)
 
-# client = InferenceClient(token=os.getenv("HF_TOKEN"))
-
-
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
-
 SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 
+frame_rate = model.audio_encoder.config.frame_rate
 
+# Helper to convert audio to MP3
 def numpy_to_mp3(audio_array, sampling_rate):
-    # Normalize audio_array if it's floating-point
     if np.issubdtype(audio_array.dtype, np.floating):
         max_val = np.max(np.abs(audio_array))
-        audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
+        audio_array = (audio_array / max_val) * 32767
         audio_array = audio_array.astype(np.int16)
 
-    # Create an audio segment from the numpy array
     audio_segment = AudioSegment(
         audio_array.tobytes(),
         frame_rate=sampling_rate,
@@ -52,46 +42,20 @@ def numpy_to_mp3(audio_array, sampling_rate):
         channels=1
     )
 
-    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
     mp3_io = io.BytesIO()
     audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+    return mp3_io.getvalue()
 
-    # Get the MP3 bytes
-    mp3_bytes = mp3_io.getvalue()
-    mp3_io.close()
-
-    return mp3_bytes
-
-sampling_rate = model.audio_encoder.config.sampling_rate
-frame_rate = model.audio_encoder.config.frame_rate
-
+# TTS Function using fixed text
+def speak_fixed_text():
+    text = "This is a demo of Parler-TTS generating a voice from fixed text input."
 
-def generate_response(audio):
-    # gr.Info("Transcribing Audio", duration=5)
-    # question = client.automatic_speech_recognition(audio).text
-    # messages = [{"role": "system", "content": ("You are a magic 8 ball."
-    #     "Someone will present to you a situation or question and your job "
-    #     "is to answer with a cryptic addage or proverb such as "
-    #     "'curiosity killed the cat' or 'The early bird gets the worm'."
-    #     "Keep your answers short and do not include the phrase 'Magic 8 Ball' in your response. If the question does not make sense or is off-topic, say 'Foolish questions get foolish answers.'"
-    #     "For example, 'Magic 8 Ball, should I get a dog?', 'A dog is ready for you but are you ready for the dog?'")},
-    #     {"role": "user", "content": f"Magic 8 Ball please answer this question - {question}"}]
-
-    # response = client.chat_completion(messages, max_tokens=64, seed=random.randint(1, 5000), model="mistralai/Mistral-7B-Instruct-v0.3")
-    # response = response.choices[0].message.content.replace("Magic 8 Ball", "")
-    return "test response", None, None
-
-@spaces.GPU
-def read_response(answer):
-
-    play_steps_in_s = 2.0
-    play_steps = int(frame_rate * play_steps_in_s)
-
-    description = "Jenny speaks at an average pace with a calm delivery in a very confined sounding environment with clear audio quality."
+    description = "A calm, clear female voice speaking in a natural tone."
     description_tokens = tokenizer(description, return_tensors="pt").to(device)
+    prompt = tokenizer(text, return_tensors="pt").to(device)
 
+    play_steps = int(frame_rate * 2.0)
     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
-    prompt = tokenizer(answer, return_tensors="pt").to(device)
 
     generation_kwargs = dict(
         input_ids=description_tokens.input_ids,
@@ -105,29 +69,17 @@ def read_response(answer):
     set_seed(SEED)
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
+
     start = time.time()
     for new_audio in streamer:
-        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds after {time.time() - start} seconds")
-        yield answer, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
+        print(f"Audio sample: {round(new_audio.shape[0] / SAMPLE_RATE, 2)} sec (elapsed {round(time.time() - start, 2)} sec)")
+        return numpy_to_mp3(new_audio, sampling_rate=SAMPLE_RATE)
 
-
-with gr.Blocks() as block:
-    gr.HTML(
-        f"""
-        <h1 style='text-align: center;'> Magic 8 Ball 🎱 </h1>
-        <h3 style='text-align: center;'> Ask a question and receive wisdom </h3>
-        <p style='text-align: center;'> Powered by <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a>
-        """
-    )
-    with gr.Group():
-        with gr.Row():
-            audio_out = gr.Audio(label="Spoken Answer", streaming=True, autoplay=True, loop=False)
-            answer = gr.Textbox(label="Answer")
-            state = gr.State()
-        with gr.Row():
-            audio_in = gr.Audio(label="Speak you question", sources="microphone", type="filepath")
-        with gr.Row():
-            gr.HTML("""<h3 style='text-align: center;'> Examples: 'What is the meaning of life?', 'Should I get a dog?' </h3>""")
-    audio_in.stop_recording(generate_response, audio_in, [state, answer, audio_out]).then(fn=read_response, inputs=state, outputs=[answer, audio_out])
-
-block.launch()
+# Minimal Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("## 🔊 Text-to-Speech Demo")
+    output_audio = gr.Audio(label="Generated Audio", streaming=True, autoplay=True)
+    generate_btn = gr.Button("Generate Voice")
+    generate_btn.click(fn=speak_fixed_text, outputs=output_audio)
+
+demo.launch()
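Review note: the previous `read_response` was a generator, and yielding each chunk is what lets `gr.Audio(streaming=True)` begin playback while generation is still running. `speak_fixed_text` instead returns inside the `for new_audio in streamer:` loop, so only the first `play_steps` chunk (roughly two seconds of audio) reaches the UI and the rest of the background generation is discarded. The commit also drops `import spaces` and the `@spaces.GPU` decorator, which a ZeroGPU Space would need in order to get a GPU allocated. A minimal generator variant that keeps streaming playback is sketched below; the middle of `generation_kwargs` lies outside the hunks above, so the `prompt_input_ids=` and `streamer=` arguments here are assumptions about the unchanged lines, not part of this commit:

```python
def speak_fixed_text_streaming():
    text = "This is a demo of Parler-TTS generating a voice from fixed text input."
    description = "A calm, clear female voice speaking in a natural tone."
    description_tokens = tokenizer(description, return_tensors="pt").to(device)
    prompt = tokenizer(text, return_tensors="pt").to(device)

    streamer = ParlerTTSStreamer(model, device=device, play_steps=int(frame_rate * 2.0))
    generation_kwargs = dict(
        input_ids=description_tokens.input_ids,
        prompt_input_ids=prompt.input_ids,  # assumed from the unchanged context lines
        streamer=streamer,                  # assumed from the unchanged context lines
    )

    set_seed(SEED)
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    for new_audio in streamer:
        # Yield every chunk so the streaming Audio component plays audio
        # as it is decoded, instead of stopping after the first chunk.
        yield numpy_to_mp3(new_audio, sampling_rate=SAMPLE_RATE)
```

Wiring this through `generate_btn.click(fn=speak_fixed_text_streaming, outputs=output_audio)` would then stream each MP3 chunk to the browser as it arrives.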
 
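As a side note, `numpy_to_mp3` can be sanity-checked without loading the model. The snippet below is a hypothetical test, not part of the commit, and assumes pydub can find an ffmpeg binary (required for MP3 export):

```python
import numpy as np

# One second of a 440 Hz sine tone; floating-point input exercises the
# normalize-to-int16 branch inside numpy_to_mp3.
t = np.linspace(0, 1, SAMPLE_RATE, endpoint=False)
tone = 0.5 * np.sin(2 * np.pi * 440.0 * t)

mp3_bytes = numpy_to_mp3(tone, sampling_rate=SAMPLE_RATE)
with open("tone.mp3", "wb") as f:
    f.write(mp3_bytes)
```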