A Le Thanh Son committed on
Commit
955241f
·
1 Parent(s): c6d6c28
Files changed (1) hide show
  1. app.py +59 -14
app.py CHANGED
@@ -57,7 +57,12 @@ def save_audio(audio_tensor: torch.Tensor, sample_rate: int) -> str:
57
  def generate_speech(
58
  text: str,
59
  speaker_id: int,
60
- context_audio_files: List[Tuple[str, str, int]],
 
 
 
 
 
61
  max_duration_ms: float = 30000,
62
  temperature: float = 0.9,
63
  top_k: int = 50,
@@ -70,13 +75,21 @@ def generate_speech(
70
  context = []
71
  progress(0.1, "Đang xử lý ngữ cảnh...")
72
 
73
- for audio_file, text_content, speaker in context_audio_files:
74
- if audio_file and text_content:
75
- waveform, sample_rate = audio_to_tensor(audio_file)
76
- # Resample nếu cần
77
- if sample_rate != generator.sample_rate:
78
- waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=generator.sample_rate)
79
- context.append(Segment(speaker=speaker, text=text_content, audio=waveform))
 
 
 
 
 
 
 
 
80
 
81
  progress(0.3, "Đang tạo âm thanh...")
82
  # Tạo âm thanh từ văn bản
@@ -96,6 +109,37 @@ def generate_speech(
96
  progress(1.0, "Hoàn thành!")
97
  return output_path
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  # Tạo giao diện Gradio
100
  def create_demo():
101
  with gr.Blocks(title="CSM-1B Text-to-Speech") as demo:
@@ -255,11 +299,10 @@ def create_demo():
255
 
256
  # Kết nối các thành phần
257
  generate_btn.click(
258
- fn=generate_speech,
259
  inputs=[
260
  text_input,
261
  speaker_id,
262
- gr.State([]), # Không có ngữ cảnh
263
  max_duration,
264
  temperature,
265
  top_k
@@ -272,10 +315,12 @@ def create_demo():
272
  inputs=[
273
  text_input_context,
274
  speaker_id_context,
275
- gr.State([
276
- (context_audio1, context_text1, context_speaker1),
277
- (context_audio2, context_text2, context_speaker2)
278
- ]),
 
 
279
  max_duration_context,
280
  temperature_context,
281
  top_k_context
 
57
  def generate_speech(
58
  text: str,
59
  speaker_id: int,
60
+ context_audio_path1: str = None,
61
+ context_text1: str = None,
62
+ context_speaker1: int = 0,
63
+ context_audio_path2: str = None,
64
+ context_text2: str = None,
65
+ context_speaker2: int = 1,
66
  max_duration_ms: float = 30000,
67
  temperature: float = 0.9,
68
  top_k: int = 50,
 
75
  context = []
76
  progress(0.1, "Đang xử lý ngữ cảnh...")
77
 
78
+ # Xử lý ngữ cảnh 1
79
+ if context_audio_path1 and context_text1:
80
+ waveform, sample_rate = audio_to_tensor(context_audio_path1)
81
+ # Resample nếu cần
82
+ if sample_rate != generator.sample_rate:
83
+ waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=generator.sample_rate)
84
+ context.append(Segment(speaker=context_speaker1, text=context_text1, audio=waveform))
85
+
86
+ # Xử lý ngữ cảnh 2
87
+ if context_audio_path2 and context_text2:
88
+ waveform, sample_rate = audio_to_tensor(context_audio_path2)
89
+ # Resample nếu cần
90
+ if sample_rate != generator.sample_rate:
91
+ waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=generator.sample_rate)
92
+ context.append(Segment(speaker=context_speaker2, text=context_text2, audio=waveform))
93
 
94
  progress(0.3, "Đang tạo âm thanh...")
95
  # Tạo âm thanh từ văn bản
 
109
  progress(1.0, "Hoàn thành!")
110
  return output_path
111
 
112
+ # Hàm tạo âm thanh đơn giản không có ngữ cảnh
113
+ @spaces.GPU
114
+ def generate_speech_simple(
115
+ text: str,
116
+ speaker_id: int,
117
+ max_duration_ms: float = 30000,
118
+ temperature: float = 0.9,
119
+ top_k: int = 50,
120
+ progress=gr.Progress()
121
+ ) -> str:
122
+ # Tải mô hình nếu chưa tải
123
+ generator = load_model()
124
+
125
+ progress(0.3, "Đang tạo âm thanh...")
126
+ # Tạo âm thanh từ văn bản
127
+ audio = generator.generate(
128
+ text=text,
129
+ speaker=speaker_id,
130
+ context=[], # Không có ngữ cảnh
131
+ max_audio_length_ms=max_duration_ms,
132
+ temperature=temperature,
133
+ topk=top_k
134
+ )
135
+
136
+ progress(0.8, "Đang lưu âm thanh...")
137
+ # Lưu âm thanh thành file
138
+ output_path = save_audio(audio, generator.sample_rate)
139
+
140
+ progress(1.0, "Hoàn thành!")
141
+ return output_path
142
+
143
  # Tạo giao diện Gradio
144
  def create_demo():
145
  with gr.Blocks(title="CSM-1B Text-to-Speech") as demo:
 
299
 
300
  # Kết nối các thành phần
301
  generate_btn.click(
302
+ fn=generate_speech_simple,
303
  inputs=[
304
  text_input,
305
  speaker_id,
 
306
  max_duration,
307
  temperature,
308
  top_k
 
315
  inputs=[
316
  text_input_context,
317
  speaker_id_context,
318
+ context_audio1,
319
+ context_text1,
320
+ context_speaker1,
321
+ context_audio2,
322
+ context_text2,
323
+ context_speaker2,
324
  max_duration_context,
325
  temperature_context,
326
  top_k_context