Spaces:

alethanhson
/

csm-1b-gradio-v2

Running

App Files Files Community

A Le Thanh Son committed on Mar 17

Commit

955241f

1 Parent(s): c6d6c28

fix

Browse files

Files changed (1) hide show

app.py +59 -14

app.py CHANGED Viewed

@@ -57,7 +57,12 @@ def save_audio(audio_tensor: torch.Tensor, sample_rate: int) -> str:
 def generate_speech(
     text: str,
     speaker_id: int,
-    context_audio_files: List[Tuple[str, str, int]],
     max_duration_ms: float = 30000,
     temperature: float = 0.9,
     top_k: int = 50,
@@ -70,13 +75,21 @@ def generate_speech(
     context = []
     progress(0.1, "Đang xử lý ngữ cảnh...")
-    for audio_file, text_content, speaker in context_audio_files:
-        if audio_file and text_content:
-            waveform, sample_rate = audio_to_tensor(audio_file)
-            # Resample nếu cần
-            if sample_rate != generator.sample_rate:
-                waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=generator.sample_rate)
-            context.append(Segment(speaker=speaker, text=text_content, audio=waveform))
     progress(0.3, "Đang tạo âm thanh...")
     # Tạo âm thanh từ văn bản
@@ -96,6 +109,37 @@ def generate_speech(
     progress(1.0, "Hoàn thành!")
     return output_path
 # Tạo giao diện Gradio
 def create_demo():
     with gr.Blocks(title="CSM-1B Text-to-Speech") as demo:
@@ -255,11 +299,10 @@ def create_demo():
         # Kết nối các thành phần
         generate_btn.click(
-            fn=generate_speech,
             inputs=[
                 text_input,
                 speaker_id,
-                gr.State([]),  # Không có ngữ cảnh
                 max_duration,
                 temperature,
                 top_k
@@ -272,10 +315,12 @@ def create_demo():
             inputs=[
                 text_input_context,
                 speaker_id_context,
-                gr.State([
-                    (context_audio1, context_text1, context_speaker1),
-                    (context_audio2, context_text2, context_speaker2)
-                ]),
                 max_duration_context,
                 temperature_context,
                 top_k_context

 def generate_speech(
     text: str,
     speaker_id: int,
+    context_audio_path1: str = None,
+    context_text1: str = None,
+    context_speaker1: int = 0,
+    context_audio_path2: str = None,
+    context_text2: str = None,
+    context_speaker2: int = 1,
     max_duration_ms: float = 30000,
     temperature: float = 0.9,
     top_k: int = 50,
     context = []
     progress(0.1, "Đang xử lý ngữ cảnh...")
+    # Xử lý ngữ cảnh 1
+    if context_audio_path1 and context_text1:
+        waveform, sample_rate = audio_to_tensor(context_audio_path1)
+        # Resample nếu cần
+        if sample_rate != generator.sample_rate:
+            waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=generator.sample_rate)
+        context.append(Segment(speaker=context_speaker1, text=context_text1, audio=waveform))
+    # Xử lý ngữ cảnh 2
+    if context_audio_path2 and context_text2:
+        waveform, sample_rate = audio_to_tensor(context_audio_path2)
+        # Resample nếu cần
+        if sample_rate != generator.sample_rate:
+            waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=generator.sample_rate)
+        context.append(Segment(speaker=context_speaker2, text=context_text2, audio=waveform))
     progress(0.3, "Đang tạo âm thanh...")
     # Tạo âm thanh từ văn bản
     progress(1.0, "Hoàn thành!")
     return output_path
+# Simple speech-generation function that uses no context
+@spaces.GPU
+def generate_speech_simple(
+    text: str,
+    speaker_id: int,
+    max_duration_ms: float = 30000,
+    temperature: float = 0.9,
+    top_k: int = 50,
+    progress=gr.Progress()
+) -> str:  # returns the path produced by save_audio
+    # Load the model if it has not been loaded yet (load_model is defined elsewhere; caching presumed — confirm)
+    generator = load_model()
+    progress(0.3, "Đang tạo âm thanh...")  # user-facing status message; keep the text as-is
+    # Generate audio from the text
+    audio = generator.generate(
+        text=text,
+        speaker=speaker_id,
+        context=[],  # No context
+        max_audio_length_ms=max_duration_ms,
+        temperature=temperature,
+        topk=top_k  # note: the generator's keyword is `topk`, fed from this function's `top_k`
+    )
+    progress(0.8, "Đang lưu âm thanh...")
+    # Save the audio to a file
+    output_path = save_audio(audio, generator.sample_rate)
+    progress(1.0, "Hoàn thành!")
+    return output_path
 # Tạo giao diện Gradio
 def create_demo():
     with gr.Blocks(title="CSM-1B Text-to-Speech") as demo:
         # Kết nối các thành phần
         generate_btn.click(
+            fn=generate_speech_simple,
             inputs=[
                 text_input,
                 speaker_id,
                 max_duration,
                 temperature,
                 top_k
             inputs=[
                 text_input_context,
                 speaker_id_context,
+                context_audio1,
+                context_text1,
+                context_speaker1,
+                context_audio2,
+                context_text2,
+                context_speaker2,
                 max_duration_context,
                 temperature_context,
                 top_k_context