alethanhson commited on
Commit
69a5801
·
1 Parent(s): e0668e2
Files changed (5) hide show
  1. .huggingface/space.yml +3 -2
  2. Procfile +1 -1
  3. README.md +20 -20
  4. app.py +34 -34
  5. app_huggingface.py +214 -0
.huggingface/space.yml CHANGED
@@ -4,8 +4,9 @@ colorFrom: indigo
4
  colorTo: purple
5
  sdk: gradio
6
  sdk_version: 4.19.2
7
- app_file: app.py
8
  pinned: false
9
  license: apache-2.0
10
  models:
11
- - sesame/csm-1b
 
 
4
  colorTo: purple
5
  sdk: gradio
6
  sdk_version: 4.19.2
7
+ app_file: app_huggingface.py
8
  pinned: false
9
  license: apache-2.0
10
  models:
11
+ - sesame/csm-1b
12
+ description: "Convert text to natural-sounding speech with Sesame's Conversational Speech Model"
Procfile CHANGED
@@ -1 +1 @@
1
- web: python app.py
 
1
+ web: python app_huggingface.py
README.md CHANGED
@@ -13,32 +13,32 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
13
 
14
  # CSM-1B Gradio Demo
15
 
16
- Ứng dụng demo cho mô hình CSM-1B (Conversational Speech Model) sử dụng Gradio để tạo giao diện người dùng thân thiện.
17
 
18
- ## Tính năng
19
 
20
- - Chuyển đổi văn bản thành giọng nói tự nhiên
21
- - Hỗ trợ nhiều giọng đọc khác nhau (ID người nói)
22
- - Tạo giọng nói theo ngữ cảnh hội thoại
23
- - Tùy chỉnh các tham số như nhiệt độ và độ dài âm thanh
24
 
25
- ## Sử dụng
26
 
27
- 1. Nhập văn bản bạn muốn chuyển thành giọng nói
28
- 2. Chọn ID người nói (từ 0-10)
29
- 3. Tùy chỉnh các thông số nâng cao (không bắt buộc)
30
- 4. Thêm ngữ cảnh hội thoại nếu cần
31
- 5. Nhấn "Tạo âm thanh" để nghe kết quả
32
 
33
- ## Triển khai trên Hugging Face Spaces
34
 
35
- Ứng dụng này được thiết kế để chạy trên Hugging Face Spaces. Để triển khai:
36
 
37
- 1. Tạo một Space mới
38
- 2. Upload nguồn lên Space
39
- 3. Chọn Gradio framework
40
- 4. Chờ ứng dụng được xây dựng và khởi động
41
 
42
- ## Tài nguyên
43
 
44
- Mô hình CSM-1B của Sesame AI: [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
 
13
 
14
  # CSM-1B Gradio Demo
15
 
16
+ Demo application for the CSM-1B (Conversational Speech Model) using Gradio to create a user-friendly interface.
17
 
18
+ ## Features
19
 
20
+ - Convert text to natural-sounding speech
21
+ - Support for multiple speaker voices (Speaker IDs)
22
+ - Generate speech with conversation context
23
+ - Customize parameters like temperature and audio length
24
 
25
+ ## Usage
26
 
27
+ 1. Enter the text you want to convert to speech
28
+ 2. Choose a speaker ID (from 0-10)
29
+ 3. Customize advanced parameters (optional)
30
+ 4. Add conversation context if needed
31
+ 5. Click "Generate Audio" to hear the result
32
 
33
+ ## Deployment on Hugging Face Spaces
34
 
35
+ This application is designed to run on Hugging Face Spaces. To deploy:
36
 
37
+ 1. Create a new Space
38
+ 2. Upload the source code to the Space
39
+ 3. Select Gradio as the framework
40
+ 4. Wait for the application to build and start
41
 
42
+ ## Resources
43
 
44
+ Sesame AI's CSM-1B model: [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)
app.py CHANGED
@@ -17,19 +17,19 @@ generator = None
17
 
18
  def initialize_model():
19
  global generator
20
- logger.info("Đang tải mô hình CSM 1B...")
21
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  if device == "cpu":
24
- logger.warning("GPU không khả dụng. Sử dụng CPU, hiệu suất thể chậm!")
25
- logger.info(f"Sử dụng thiết bị: {device}")
26
 
27
  try:
28
  generator = load_csm_1b(device=device)
29
- logger.info(f" hình đã được tải thành công trên thiết bị: {device}")
30
  return True
31
  except Exception as e:
32
- logger.error(f"Không thể tải mô hình: {str(e)}")
33
  return False
34
 
35
  def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
@@ -37,10 +37,10 @@ def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9
37
 
38
  if generator is None:
39
  if not initialize_model():
40
- return None, "Không thể tải mô hình. Vui lòng thử lại sau."
41
 
42
  try:
43
- # Xử context nếu được cung cấp
44
  context_segments = []
45
  if context_texts and context_speakers:
46
  for ctx_text, ctx_speaker in zip(context_texts, context_speakers):
@@ -49,7 +49,7 @@ def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9
49
  Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
50
  )
51
 
52
- # Tạo âm thanh từ văn bản
53
  audio = generator.generate(
54
  text=text,
55
  speaker=int(speaker_id),
@@ -59,15 +59,15 @@ def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9
59
  topk=int(topk),
60
  )
61
 
62
- # Chuyển đổi tensor thành numpy array để Gradio có thể xử lý
63
  audio_numpy = audio.cpu().numpy()
64
  sample_rate = generator.sample_rate
65
 
66
  return (sample_rate, audio_numpy), None
67
 
68
  except Exception as e:
69
- logger.error(f"Lỗi khi tạo âm thanh: {str(e)}")
70
- return None, f"Lỗi khi tạo âm thanh: {str(e)}"
71
 
72
  def clear_context():
73
  return [], []
@@ -78,36 +78,36 @@ def add_context(text, speaker_id, context_texts, context_speakers):
78
  context_speakers.append(int(speaker_id))
79
  return context_texts, context_speakers
80
 
81
- # Thiết lập giao diện Gradio
82
  with gr.Blocks(title="CSM 1B Demo") as demo:
83
- gr.Markdown("# CSM 1B - hình tạo giọng nói hội thoại")
84
- gr.Markdown("Nhập văn bản để tạo giọng nói tự nhiên với mô hình CSM 1B")
85
 
86
  with gr.Row():
87
  with gr.Column(scale=2):
88
  text_input = gr.Textbox(
89
- label="Văn bản để chuyển thành giọng nói",
90
- placeholder="Nhập văn bản ở đây...",
91
  lines=3
92
  )
93
  speaker_id = gr.Slider(
94
- label="ID người nói",
95
  minimum=0,
96
  maximum=10,
97
  step=1,
98
  value=0
99
  )
100
 
101
- with gr.Accordion("Tùy chọn nâng cao", open=False):
102
  max_length = gr.Slider(
103
- label="Độ dài tối đa (mili giây)",
104
  minimum=1000,
105
  maximum=30000,
106
  step=1000,
107
  value=10000
108
  )
109
  temp = gr.Slider(
110
- label="Nhiệt độ",
111
  minimum=0.1,
112
  maximum=1.5,
113
  step=0.1,
@@ -121,14 +121,14 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
121
  value=50
122
  )
123
 
124
- with gr.Accordion("Ngữ cảnh hội thoại", open=False):
125
  context_list = gr.State([])
126
  context_speakers_list = gr.State([])
127
 
128
  with gr.Row():
129
- context_text = gr.Textbox(label="Văn bản ngữ cảnh", lines=2)
130
  context_speaker = gr.Slider(
131
- label="ID người nói ngữ cảnh",
132
  minimum=0,
133
  maximum=10,
134
  step=1,
@@ -136,22 +136,22 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
136
  )
137
 
138
  with gr.Row():
139
- add_ctx_btn = gr.Button("Thêm ngữ cảnh")
140
- clear_ctx_btn = gr.Button("Xóa tất cả ngữ cảnh")
141
 
142
  context_display = gr.Dataframe(
143
- headers=["Văn bản", "ID người nói"],
144
- label="Ngữ cảnh hiện tại",
145
  interactive=False
146
  )
147
 
148
- generate_btn = gr.Button("Tạo âm thanh", variant="primary")
149
 
150
  with gr.Column(scale=1):
151
- audio_output = gr.Audio(label="Âm thanh được tạo", type="numpy")
152
- error_output = gr.Textbox(label="Thông báo lỗi", visible=False)
153
 
154
- # Kết nối các sự kiện
155
  generate_btn.click(
156
  fn=generate_speech,
157
  inputs=[
@@ -183,7 +183,7 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
183
  outputs=[context_list, context_speakers_list]
184
  )
185
 
186
- # Cập nhật hiển thị ngữ cảnh
187
  def update_context_display(texts, speakers):
188
  if not texts or not speakers:
189
  return []
@@ -201,9 +201,9 @@ with gr.Blocks(title="CSM 1B Demo") as demo:
201
  outputs=[context_display]
202
  )
203
 
204
- # Khởi động ứng dụng khi tải trang
205
  initialize_model()
206
 
207
- # Cấu hình cho Hugging Face Spaces
208
  demo.launch(share=False)
209
 
 
17
 
18
  def initialize_model():
19
  global generator
20
+ logger.info("Loading CSM 1B model...")
21
 
22
  device = "cuda" if torch.cuda.is_available() else "cpu"
23
  if device == "cpu":
24
+ logger.warning("GPU not available. Using CPU, performance may be slow!")
25
+ logger.info(f"Using device: {device}")
26
 
27
  try:
28
  generator = load_csm_1b(device=device)
29
+ logger.info(f"Model loaded successfully on device: {device}")
30
  return True
31
  except Exception as e:
32
+ logger.error(f"Could not load model: {str(e)}")
33
  return False
34
 
35
  def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
 
37
 
38
  if generator is None:
39
  if not initialize_model():
40
+ return None, "Could not load model. Please try again later."
41
 
42
  try:
43
+ # Process context if provided
44
  context_segments = []
45
  if context_texts and context_speakers:
46
  for ctx_text, ctx_speaker in zip(context_texts, context_speakers):
 
49
  Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
50
  )
51
 
52
+ # Generate audio from text
53
  audio = generator.generate(
54
  text=text,
55
  speaker=int(speaker_id),
 
59
  topk=int(topk),
60
  )
61
 
62
+ # Convert tensor to numpy array for Gradio
63
  audio_numpy = audio.cpu().numpy()
64
  sample_rate = generator.sample_rate
65
 
66
  return (sample_rate, audio_numpy), None
67
 
68
  except Exception as e:
69
+ logger.error(f"Error generating audio: {str(e)}")
70
+ return None, f"Error generating audio: {str(e)}"
71
 
72
  def clear_context():
73
  return [], []
 
78
  context_speakers.append(int(speaker_id))
79
  return context_texts, context_speakers
80
 
81
+ # Set up Gradio interface
82
  with gr.Blocks(title="CSM 1B Demo") as demo:
83
+ gr.Markdown("# CSM 1B - Conversational Speech Model")
84
+ gr.Markdown("Enter text to generate natural-sounding speech with the CSM 1B model")
85
 
86
  with gr.Row():
87
  with gr.Column(scale=2):
88
  text_input = gr.Textbox(
89
+ label="Text to convert to speech",
90
+ placeholder="Enter your text here...",
91
  lines=3
92
  )
93
  speaker_id = gr.Slider(
94
+ label="Speaker ID",
95
  minimum=0,
96
  maximum=10,
97
  step=1,
98
  value=0
99
  )
100
 
101
+ with gr.Accordion("Advanced Options", open=False):
102
  max_length = gr.Slider(
103
+ label="Maximum length (milliseconds)",
104
  minimum=1000,
105
  maximum=30000,
106
  step=1000,
107
  value=10000
108
  )
109
  temp = gr.Slider(
110
+ label="Temperature",
111
  minimum=0.1,
112
  maximum=1.5,
113
  step=0.1,
 
121
  value=50
122
  )
123
 
124
+ with gr.Accordion("Conversation Context", open=False):
125
  context_list = gr.State([])
126
  context_speakers_list = gr.State([])
127
 
128
  with gr.Row():
129
+ context_text = gr.Textbox(label="Context text", lines=2)
130
  context_speaker = gr.Slider(
131
+ label="Context speaker ID",
132
  minimum=0,
133
  maximum=10,
134
  step=1,
 
136
  )
137
 
138
  with gr.Row():
139
+ add_ctx_btn = gr.Button("Add Context")
140
+ clear_ctx_btn = gr.Button("Clear All Context")
141
 
142
  context_display = gr.Dataframe(
143
+ headers=["Text", "Speaker ID"],
144
+ label="Current Context",
145
  interactive=False
146
  )
147
 
148
+ generate_btn = gr.Button("Generate Audio", variant="primary")
149
 
150
  with gr.Column(scale=1):
151
+ audio_output = gr.Audio(label="Generated Audio", type="numpy")
152
+ error_output = gr.Textbox(label="Error Message", visible=False)
153
 
154
+ # Connect events
155
  generate_btn.click(
156
  fn=generate_speech,
157
  inputs=[
 
183
  outputs=[context_list, context_speakers_list]
184
  )
185
 
186
+ # Update context display
187
  def update_context_display(texts, speakers):
188
  if not texts or not speakers:
189
  return []
 
201
  outputs=[context_display]
202
  )
203
 
204
+ # Initialize model when page loads
205
  initialize_model()
206
 
207
+ # Configuration for Hugging Face Spaces
208
  demo.launch(share=False)
209
 
app_huggingface.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import io
3
+ import logging
4
+ from typing import List
5
+
6
+ import torch
7
+ import torchaudio
8
+ import gradio as gr
9
+ import numpy as np
10
+
11
+ from generator import load_csm_1b, Segment
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ generator = None
17
+
18
def initialize_model():
    """Load the CSM 1B generator into the module-level `generator` cache.

    Picks CUDA when available, otherwise falls back to CPU (with a warning,
    since CPU inference is slow for this model).

    Returns:
        bool: True when the model loaded successfully, False on any error
        (the error is logged, not raised).
    """
    global generator
    logger.info("Loading CSM 1B model...")

    device = "cuda" if torch.cuda.is_available() else "cpu"
    if device == "cpu":
        logger.warning("GPU not available. Using CPU, performance may be slow!")
    logger.info(f"Using device: {device}")

    try:
        generator = load_csm_1b(device=device)
    except Exception as e:
        logger.error(f"Could not load model: {str(e)}")
        return False
    else:
        logger.info(f"Model loaded successfully on device: {device}")
        return True
34
+
35
def generate_speech(text, speaker_id, max_audio_length_ms=10000, temperature=0.9, topk=50, context_texts=None, context_speakers=None):
    """Synthesize speech for `text` with the cached CSM 1B generator.

    Args:
        text: Text to convert to speech.
        speaker_id: Speaker identity (coerced to int).
        max_audio_length_ms: Upper bound on generated audio length.
        temperature: Sampling temperature.
        topk: Top-k sampling cutoff.
        context_texts: Optional list of prior utterance texts.
        context_speakers: Optional list of speaker ids matching `context_texts`.

    Returns:
        Tuple of ((sample_rate, audio_numpy), None) on success, or
        (None, error_message) on failure — the shape Gradio's outputs expect.
    """
    global generator

    # Lazily (re)load the model if it is not cached yet.
    if generator is None and not initialize_model():
        return None, "Could not load model. Please try again later."

    try:
        # Build conversation context segments; skip empty texts / missing
        # speakers. Context audio is an empty tensor (text-only context).
        context_segments = [
            Segment(text=ctx_text, speaker=int(ctx_speaker), audio=torch.zeros(0, dtype=torch.float32))
            for ctx_text, ctx_speaker in zip(context_texts or [], context_speakers or [])
            if ctx_text and ctx_speaker is not None
        ]

        # Generate audio from text.
        audio = generator.generate(
            text=text,
            speaker=int(speaker_id),
            context=context_segments,
            max_audio_length_ms=float(max_audio_length_ms),
            temperature=float(temperature),
            topk=int(topk),
        )

        # Gradio's Audio(type="numpy") wants (sample_rate, numpy_array).
        return (generator.sample_rate, audio.cpu().numpy()), None

    except Exception as e:
        logger.error(f"Error generating audio: {str(e)}")
        return None, f"Error generating audio: {str(e)}"
71
+
72
def clear_context():
    """Reset the stored conversation context: empty texts, empty speakers."""
    empty_texts, empty_speakers = [], []
    return empty_texts, empty_speakers
74
+
75
def add_context(text, speaker_id, context_texts, context_speakers):
    """Append one (text, speaker) pair to the context lists in place.

    Entries with an empty text or a missing speaker id are ignored.
    Returns the (possibly updated) lists so Gradio can refresh its State.
    """
    # Guard clause: nothing to add.
    if not text or speaker_id is None:
        return context_texts, context_speakers

    context_texts.append(text)
    context_speakers.append(int(speaker_id))
    return context_texts, context_speakers
80
+
81
def update_context_display(texts, speakers):
    """Format the context lists as [[text, speaker], ...] rows for the Dataframe.

    Returns an empty list when either input is missing or empty.
    """
    if not texts or not speakers:
        return []
    return [list(row) for row in zip(texts, speakers)]
85
+
86
def create_demo():
    """Build the Gradio Blocks UI for the CSM 1B demo and return it.

    Layout: a two-column row — inputs (text, speaker, advanced options,
    conversation context) on the left, outputs (audio, error text) on the
    right — with event wiring for generation and context management.
    """
    with gr.Blocks(title="CSM 1B Demo") as demo:
        gr.Markdown("# CSM 1B - Conversational Speech Model")
        gr.Markdown("Enter text to generate natural-sounding speech with the CSM 1B model")

        with gr.Row():
            with gr.Column(scale=2):
                # Primary inputs.
                text_input = gr.Textbox(label="Text to convert to speech", placeholder="Enter your text here...", lines=3)
                speaker_id = gr.Slider(label="Speaker ID", minimum=0, maximum=10, step=1, value=0)

                # Sampling / length knobs, collapsed by default.
                with gr.Accordion("Advanced Options", open=False):
                    max_length = gr.Slider(label="Maximum length (milliseconds)", minimum=1000, maximum=30000, step=1000, value=10000)
                    temp = gr.Slider(label="Temperature", minimum=0.1, maximum=1.5, step=0.1, value=0.9)
                    top_k = gr.Slider(label="Top K", minimum=10, maximum=100, step=10, value=50)

                # Conversation context editor, collapsed by default.
                with gr.Accordion("Conversation Context", open=False):
                    context_list = gr.State([])
                    context_speakers_list = gr.State([])

                    with gr.Row():
                        context_text = gr.Textbox(label="Context text", lines=2)
                        context_speaker = gr.Slider(label="Context speaker ID", minimum=0, maximum=10, step=1, value=0)

                    with gr.Row():
                        add_ctx_btn = gr.Button("Add Context")
                        clear_ctx_btn = gr.Button("Clear All Context")

                    context_display = gr.Dataframe(headers=["Text", "Speaker ID"], label="Current Context", interactive=False)

                generate_btn = gr.Button("Generate Audio", variant="primary")

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Audio", type="numpy")
                # NOTE(review): this textbox is created with visible=False and
                # nothing ever toggles it visible, so error messages from
                # generate_speech are never shown — confirm intended.
                error_output = gr.Textbox(label="Error Message", visible=False)

        # Wire up events.
        generate_btn.click(
            fn=generate_speech,
            inputs=[text_input, speaker_id, max_length, temp, top_k, context_list, context_speakers_list],
            outputs=[audio_output, error_output],
        )

        add_ctx_btn.click(
            fn=add_context,
            inputs=[context_text, context_speaker, context_list, context_speakers_list],
            outputs=[context_list, context_speakers_list],
        )

        clear_ctx_btn.click(
            fn=clear_context,
            inputs=[],
            outputs=[context_list, context_speakers_list],
        )

        # Keep the Dataframe in sync whenever either State list changes.
        context_list.change(
            fn=update_context_display,
            inputs=[context_list, context_speakers_list],
            outputs=[context_display],
        )

        context_speakers_list.change(
            fn=update_context_display,
            inputs=[context_list, context_speakers_list],
            outputs=[context_display],
        )

    return demo
208
+
209
# Load the model eagerly at import time so the first request does not pay
# the load cost (initialize_model logs and returns False on failure; the
# app still starts and retries lazily inside generate_speech).
initialize_model()

# Build the Gradio app and serve it on all interfaces at port 7860
# (presumably the port the hosting platform routes to — verify).
# NOTE(review): there is no `if __name__ == "__main__":` guard, so launch()
# also runs when this module is merely imported — confirm that is intended.
demo = create_demo()
demo.launch(server_name="0.0.0.0", server_port=7860)