alethanhson commited on
Commit
69a5801
·
1 Parent(s): e0668e2
Files changed (5) hide show
  1. .huggingface/space.yml +3 -2
  2. Procfile +1 -1
  3. README.md +20 -20
  4. app.py +34 -34
  5. app_huggingface.py +214 -0
.huggingface/space.yml CHANGED
@@ -4,8 +4,9 @@ colorFrom: indigo
4
  colorTo: purple
5
  sdk: gradio
6
  sdk_version: 4.19.2
7
- app_file: app.py
8
  pinned: false
9
  license: apache-2.0
10
  models:
11
- - sesame/csm-1b
 
 
4
  colorTo: purple
5
  sdk: gradio
6
  sdk_version: 4.19.2
7
+ app_file: app_huggingface.py
8
  pinned: false
9
  license: apache-2.0
10
  models:
11
+ - sesame/csm-1b
12
+ description: "Convert text to natural-sounding speech with Sesame's Conversational Speech Model"
Procfile CHANGED
@@ -1 +1 @@
1
- web: python app.py
 
1
+ web: python app_huggingface.py
README.md CHANGED
@@ -13,32 +13,32 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
13
 
14
  # CSM-1B Gradio Demo
15
 
16
- Ứng dụng demo cho mô hình CSM-1B (Conversational Speech Model) sử dụng Gradio để tạo giao diện người dùng thân thiện.
17
 
18
- ## Tính năng
19
 
20
- - Chuyển đổi văn bản thành giọng nói tự nhiên
21
- - Hỗ trợ nhiều giọng đọc khác nhau (ID người nói)
22
- - Tạo giọng nói theo ngữ cảnh hội thoại
23
- - Tùy chỉnh các tham số như nhiệt độ và độ dài âm thanh
24
 
25
- ## Sử dụng
26
 
27
- 1. Nhập văn bản bạn muốn chuyển thành giọng nói
28
- 2. Chọn ID người nói (từ 0-10)
29
- 3. Tùy chỉnh các thông số nâng cao (không bắt buộc)
30
- 4. Thêm ngữ cảnh hội thoại nếu cần
31
- 5. Nhấn "Tạo âm thanh" để nghe kết quả
32
 
33
- ## Triển khai trên Hugging Face Spaces
34
 
35
- Ứng dụng này được thiết kế để chạy trên Hugging Face Spaces. Để triển khai:
36
 
37
- 1. Tạo một Space mới
38
- 2. Upload nguồn lên Space
39
- 3. Chọn Gradio framework
40
- 4. Chờ ứng dụng được xây dựng và khởi động
41
 
42
- ## Tài nguyên
43
 
44
- Mô hình CSM-1B của Sesame AI: [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
 
13
 
14
  # CSM-1B Gradio Demo
15
 
16
+ Demo application for the CSM-1B (Conversational Speech Model) using Gradio to create a user-friendly interface.
17
 
18
+ ## Features
19
 
20
+ - Convert text to natural-sounding speech
21
+ - Support for multiple speaker voices (Speaker IDs)
22
+ - Generate speech with conversation context
23
+ - Customize parameters like temperature and audio length
24
 
25
+ ## Usage
26
 
27
+ 1. Enter the text you want to convert to speech
28
+ 2. Choose a speaker ID (from 0-10)
29
+ 3. Customize advanced parameters (optional)
30
+ 4. Add conversation context if needed
31
+ 5. Click "Generate Audio" to hear the result
32
 
33
+ ## Deployment on Hugging Face Spaces
34
 
35
+ This application is designed to run on Hugging Face Spaces. To deploy:
36
 
37
+ 1. Create a new Space
38
+ 2. Upload the source code to the Space
39
+ 3. Select Gradio as the framework
40
+ 4. Wait for the application to build and start
41
 
42
+ ## Resources
43
 
44
+ Sesame AI's CSM-1B model: [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
app.py CHANGED
@@ -17,19 +17,19 @@ generator = None
17
 
18
  def initialize_model():
19
  global generator
20
- logger.info("Đang tải mô hình CSM 1B...")
21
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  if device == "cpu":
24
- logger.warning("GPU không khả dụng. Sử dụng CPU, hiệu suất thể chậm!")
25
- logger.info(f"Sử dụng thiết bị: {device}")
26
 
27
  try:
28
  generator = load_csm_1b(device=device)
29
- logger.info(f" hình đã được tải thành công trên thiết bị: {device}")
30
  return True
31
  except Exception as e:
32
- logger.error(f"Không thể tải mô hình: {str(e)}")
33
  return False
34
 
35
  def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
@@ -37,10 +37,10 @@ def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9
37
 
38
  if generator is None:
39
  if not initialize_model():
40
- return None, "Không thể tải mô hình. Vui lòng thử lại sau."
41
 
42
  try:
43
- # Xử context nếu được cung cấp
44
  context_segments = []
45
  if context_texts and context_speakers:
46
  for ctx_text, ctx_speaker in zip(context_texts, context_speakers):
@@ -49,7 +49,7 @@ def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9
49
  Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
50
  )
51
 
52
- # Tạo âm thanh từ văn bản
53
  audio = generator.generate(
54
  text=text,
55
  speaker=int(speaker_id),
@@ -59,15 +59,15 @@ def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9
59
  topk=int(topk),
60
  )
61
 
62
- # Chuyển đổi tensor thành numpy array để Gradio có thể xử lý
63
  audio_numpy = audio.cpu().numpy()
64
  sample_rate = generator.sample_rate
65
 
66
  return (sample_rate, audio_numpy), None
67
 
68
  except Exception as e:
69
- logger.error(f"Lỗi khi tạo âm thanh: {str(e)}")
70
- return None, f"Lỗi khi tạo âm thanh: {str(e)}"
71
 
72
  def clear_context():
73
  return [], []
@@ -78,36 +78,36 @@ def add_context(text, speaker_id, context_texts, context_speakers):
78
  context_speakers.append(int(speaker_id))
79
  return context_texts, context_speakers
80
 
81
- # Thiết lập giao diện Gradio
82
  with gr.Blocks(title="CSM 1B Demo") as demo:
83
- gr.Markdown("# CSM 1B - hình tạo giọng nói hội thoại")
84
- gr.Markdown("Nhập văn bản để tạo giọng nói tự nhiên với mô hình CSM 1B")
85
 
86
  with gr.Row():
87
  with gr.Column(scale=2):
88
  text_input = gr.Textbox(
89
- label="Văn bản để chuyển thành giọng nói",
90
- placeholder="Nhập văn bản ở đây...",
91
  lines=3
92
  )
93
  speaker_id = gr.Slider(
94
- label="ID người nói",
95
  minimum=0,
96
  maximum=10,
97
  step=1,
98
  value=0
99
  )
100
 
101
- with gr.Accordion("Tùy chọn nâng cao", open=False):
102
  max_length = gr.Slider(
103
- label="Độ dài tối đa (mili giây)",
104
  minimum=1000,
105
  maximum=30000,
106
  step=1000,
107
  value=10000
108
  )
109
  temp = gr.Slider(
110
- label="Nhiệt độ",
111
  minimum=0.1,
112
  maximum=1.5,
113
  step=0.1,
@@ -121,14 +121,14 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
121
  value=50
122
  )
123
 
124
- with gr.Accordion("Ngữ cảnh hội thoại", open=False):
125
  context_list = gr.State([])
126
  context_speakers_list = gr.State([])
127
 
128
  with gr.Row():
129
- context_text = gr.Textbox(label="Văn bản ngữ cảnh", lines=2)
130
  context_speaker = gr.Slider(
131
- label="ID người nói ngữ cảnh",
132
  minimum=0,
133
  maximum=10,
134
  step=1,
@@ -136,22 +136,22 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
136
  )
137
 
138
  with gr.Row():
139
- add_ctx_btn = gr.Button("Thêm ngữ cảnh")
140
- clear_ctx_btn = gr.Button("Xóa tất cả ngữ cảnh")
141
 
142
  context_display = gr.Dataframe(
143
- headers=["Văn bản", "ID người nói"],
144
- label="Ngữ cảnh hiện tại",
145
  interactive=False
146
  )
147
 
148
- generate_btn = gr.Button("Tạo âm thanh", variant="primary")
149
 
150
  with gr.Column(scale=1):
151
- audio_output = gr.Audio(label="Âm thanh được tạo", type="numpy")
152
- error_output = gr.Textbox(label="Thông báo lỗi", visible=False)
153
 
154
- # Kết nối các sự kiện
155
  generate_btn.click(
156
  fn=generate_speech,
157
  inputs=[
@@ -183,7 +183,7 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
183
  outputs=[context_list, context_speakers_list]
184
  )
185
 
186
- # Cập nhật hiển thị ngữ cảnh
187
  def update_context_display(texts, speakers):
188
  if not texts or not speakers:
189
  return []
@@ -201,9 +201,9 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
201
  outputs=[context_display]
202
  )
203
 
204
- # Khởi động ứng dụng khi tải trang
205
  initialize_model()
206
 
207
- # Cấu hình cho Hugging Face Spaces
208
  demo.launch(share=False)
209
 
 
17
 
18
  def initialize_model():
19
  global generator
20
+ logger.info("Loading CSM 1B model...")
21
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  if device == "cpu":
24
+ logger.warning("GPU not available. Using CPU, performance may be slow!")
25
+ logger.info(f"Using device: {device}")
26
 
27
  try:
28
  generator = load_csm_1b(device=device)
29
+ logger.info(f"Model loaded successfully on device: {device}")
30
  return True
31
  except Exception as e:
32
+ logger.error(f"Could not load model: {str(e)}")
33
  return False
34
 
35
  def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
 
37
 
38
  if generator is None:
39
  if not initialize_model():
40
+ return None, "Could not load model. Please try again later."
41
 
42
  try:
43
+ # Process context if provided
44
  context_segments = []
45
  if context_texts and context_speakers:
46
  for ctx_text, ctx_speaker in zip(context_texts, context_speakers):
 
49
  Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
50
  )
51
 
52
+ # Generate audio from text
53
  audio = generator.generate(
54
  text=text,
55
  speaker=int(speaker_id),
 
59
  topk=int(topk),
60
  )
61
 
62
+ # Convert tensor to numpy array for Gradio
63
  audio_numpy = audio.cpu().numpy()
64
  sample_rate = generator.sample_rate
65
 
66
  return (sample_rate, audio_numpy), None
67
 
68
  except Exception as e:
69
+ logger.error(f"Error generating audio: {str(e)}")
70
+ return None, f"Error generating audio: {str(e)}"
71
 
72
  def clear_context():
73
  return [], []
 
78
  context_speakers.append(int(speaker_id))
79
  return context_texts, context_speakers
80
 
81
+ # Set up Gradio interface
82
  with gr.Blocks(title="CSM 1B Demo") as demo:
83
+ gr.Markdown("# CSM 1B - Conversational Speech Model")
84
+ gr.Markdown("Enter text to generate natural-sounding speech with the CSM 1B model")
85
 
86
  with gr.Row():
87
  with gr.Column(scale=2):
88
  text_input = gr.Textbox(
89
+ label="Text to convert to speech",
90
+ placeholder="Enter your text here...",
91
  lines=3
92
  )
93
  speaker_id = gr.Slider(
94
+ label="Speaker ID",
95
  minimum=0,
96
  maximum=10,
97
  step=1,
98
  value=0
99
  )
100
 
101
+ with gr.Accordion("Advanced Options", open=False):
102
  max_length = gr.Slider(
103
+ label="Maximum length (milliseconds)",
104
  minimum=1000,
105
  maximum=30000,
106
  step=1000,
107
  value=10000
108
  )
109
  temp = gr.Slider(
110
+ label="Temperature",
111
  minimum=0.1,
112
  maximum=1.5,
113
  step=0.1,
 
121
  value=50
122
  )
123
 
124
+ with gr.Accordion("Conversation Context", open=False):
125
  context_list = gr.State([])
126
  context_speakers_list = gr.State([])
127
 
128
  with gr.Row():
129
+ context_text = gr.Textbox(label="Context text", lines=2)
130
  context_speaker = gr.Slider(
131
+ label="Context speaker ID",
132
  minimum=0,
133
  maximum=10,
134
  step=1,
 
136
  )
137
 
138
  with gr.Row():
139
+ add_ctx_btn = gr.Button("Add Context")
140
+ clear_ctx_btn = gr.Button("Clear All Context")
141
 
142
  context_display = gr.Dataframe(
143
+ headers=["Text", "Speaker ID"],
144
+ label="Current Context",
145
  interactive=False
146
  )
147
 
148
+ generate_btn = gr.Button("Generate Audio", variant="primary")
149
 
150
  with gr.Column(scale=1):
151
+ audio_output = gr.Audio(label="Generated Audio", type="numpy")
152
+ error_output = gr.Textbox(label="Error Message", visible=False)
153
 
154
+ # Connect events
155
  generate_btn.click(
156
  fn=generate_speech,
157
  inputs=[
 
183
  outputs=[context_list, context_speakers_list]
184
  )
185
 
186
+ # Update context display
187
  def update_context_display(texts, speakers):
188
  if not texts or not speakers:
189
  return []
 
201
  outputs=[context_display]
202
  )
203
 
204
+ # Initialize model when page loads
205
  initialize_model()
206
 
207
+ # Configuration for Hugging Face Spaces
208
  demo.launch(share=False)
209
 
app_huggingface.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import logging
4
+ from typing import List
5
+
6
+ import torch
7
+ import torchaudio
8
+ import gradio as gr
9
+ import numpy as np
10
+
11
+ from generator import load_csm_1b, Segment
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ generator = None
17
+
18
def initialize_model():
    """Load the CSM 1B generator into the module-level `generator` cache.

    Picks CUDA when available, otherwise falls back to CPU (with a warning,
    since CPU inference is slow for this model).

    Returns:
        bool: True when the model loaded successfully, False on any error
        (the error is logged, not raised).
    """
    global generator
    logger.info("Loading CSM 1B model...")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        logger.warning("GPU not available. Using CPU, performance may be slow!")
    logger.info(f"Using device: {device}")

    try:
        generator = load_csm_1b(device=device)
    except Exception as e:
        logger.error(f"Could not load model: {str(e)}")
        return False
    else:
        logger.info(f"Model loaded successfully on device: {device}")
        return True
34
+
35
def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
    """Synthesize speech for `text` with the cached CSM 1B generator.

    Args:
        text: Text to convert to speech.
        speaker_id: Speaker identity (coerced to int).
        max_audio_length_ms: Upper bound on generated audio length.
        temperature: Sampling temperature.
        topk: Top-k sampling cutoff.
        context_texts: Optional list of prior utterance texts.
        context_speakers: Optional list of speaker ids matching `context_texts`.

    Returns:
        Tuple of ((sample_rate, audio_numpy), None) on success, or
        (None, error_message) on failure — the shape Gradio's outputs expect.
    """
    global generator

    # Lazily (re)load the model if it is not cached yet.
    if generator is None and not initialize_model():
        return None, "Could not load model. Please try again later."

    try:
        # Build conversation context segments; skip empty texts / missing
        # speakers. Context audio is an empty tensor (text-only context).
        context_segments = [
            Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
            for ctx_text, ctx_speaker in zip(context_texts or [], context_speakers or [])
            if ctx_text and ctx_speaker is not None
        ]

        # Generate audio from text.
        audio = generator.generate(
            text=text,
            speaker=int(speaker_id),
            context=context_segments,
            max_audio_length_ms=float(max_audio_length_ms),
            temperature=float(temperature),
            topk=int(topk),
        )

        # Gradio's Audio(type="numpy") wants (sample_rate, numpy_array).
        return (generator.sample_rate, audio.cpu().numpy()), None

    except Exception as e:
        logger.error(f"Error generating audio: {str(e)}")
        return None, f"Error generating audio: {str(e)}"
71
+
72
def clear_context():
    """Reset the stored conversation context: empty texts, empty speakers."""
    empty_texts, empty_speakers = [], []
    return empty_texts, empty_speakers
74
+
75
def add_context(text, speaker_id, context_texts, context_speakers):
    """Append one (text, speaker) pair to the context lists in place.

    Entries with an empty text or a missing speaker id are ignored.
    Returns the (possibly updated) lists so Gradio can refresh its State.
    """
    # Guard clause: nothing to add.
    if not text or speaker_id is None:
        return context_texts, context_speakers

    context_texts.append(text)
    context_speakers.append(int(speaker_id))
    return context_texts, context_speakers
80
+
81
def update_context_display(texts, speakers):
    """Format the context lists as [[text, speaker], ...] rows for the Dataframe.

    Returns an empty list when either input is missing or empty.
    """
    if not texts or not speakers:
        return []
    return [list(row) for row in zip(texts, speakers)]
85
+
86
def create_demo():
    """Build the Gradio Blocks UI for the CSM 1B demo and return it.

    Layout: a two-column row — inputs (text, speaker, advanced options,
    conversation context) on the left, outputs (audio, error text) on the
    right — with event wiring for generation and context management.
    """
    with gr.Blocks(title="CSM 1B Demo") as demo:
        gr.Markdown("# CSM 1B - Conversational Speech Model")
        gr.Markdown("Enter text to generate natural-sounding speech with the CSM 1B model")

        with gr.Row():
            with gr.Column(scale=2):
                # Primary inputs.
                text_input = gr.Textbox(label="Text to convert to speech", placeholder="Enter your text here...", lines=3)
                speaker_id = gr.Slider(label="Speaker ID", minimum=0, maximum=10, step=1, value=0)

                # Sampling / length knobs, collapsed by default.
                with gr.Accordion("Advanced Options", open=False):
                    max_length = gr.Slider(label="Maximum length (milliseconds)", minimum=1000, maximum=30000, step=1000, value=10000)
                    temp = gr.Slider(label="Temperature", minimum=0.1, maximum=1.5, step=0.1, value=0.9)
                    top_k = gr.Slider(label="Top K", minimum=10, maximum=100, step=10, value=50)

                # Conversation context editor, collapsed by default.
                with gr.Accordion("Conversation Context", open=False):
                    context_list = gr.State([])
                    context_speakers_list = gr.State([])

                    with gr.Row():
                        context_text = gr.Textbox(label="Context text", lines=2)
                        context_speaker = gr.Slider(label="Context speaker ID", minimum=0, maximum=10, step=1, value=0)

                    with gr.Row():
                        add_ctx_btn = gr.Button("Add Context")
                        clear_ctx_btn = gr.Button("Clear All Context")

                    context_display = gr.Dataframe(headers=["Text", "Speaker ID"], label="Current Context", interactive=False)

                generate_btn = gr.Button("Generate Audio", variant="primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Audio", type="numpy")
                # NOTE(review): this textbox is created with visible=False and
                # nothing ever toggles it visible, so error messages from
                # generate_speech are never shown — confirm intended.
                error_output = gr.Textbox(label="Error Message", visible=False)

        # Wire up events.
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, speaker_id, max_length, temp, top_k, context_list, context_speakers_list],
            outputs=[audio_output, error_output],
        )

        add_ctx_btn.click(
            fn=add_context,
            inputs=[context_text, context_speaker, context_list, context_speakers_list],
            outputs=[context_list, context_speakers_list],
        )

        clear_ctx_btn.click(
            fn=clear_context,
            inputs=[],
            outputs=[context_list, context_speakers_list],
        )

        # Keep the Dataframe in sync whenever either State list changes.
        context_list.change(
            fn=update_context_display,
            inputs=[context_list, context_speakers_list],
            outputs=[context_display],
        )

        context_speakers_list.change(
            fn=update_context_display,
            inputs=[context_list, context_speakers_list],
            outputs=[context_display],
        )

    return demo
208
+
209
# Load the model eagerly at import time so the first request does not pay
# the load cost (initialize_model logs and returns False on failure; the
# app still starts and retries lazily inside generate_speech).
initialize_model()

# Build the Gradio app and serve it on all interfaces at port 7860
# (presumably the port the hosting platform routes to — verify).
# NOTE(review): there is no `if __name__ == "__main__":` guard, so launch()
# also runs when this module is merely imported — confirm that is intended.
demo = create_demo()
demo.launch(server_name="0.0.0.0", server_port=7860)