Hugging Face Space (status: Running) — commit "Update app.py": diff of app.py.
@@ -5,12 +5,8 @@ from tempfile import NamedTemporaryFile
|
|
5 |
import time
|
6 |
import gradio as gr
|
7 |
from openai import OpenAI
|
8 |
-
from pydub import AudioSegment
|
9 |
|
10 |
-
#
|
11 |
-
# pip install openai gradio pydub
|
12 |
-
|
13 |
-
# 標準音頻模型和聲音選項
|
14 |
STANDARD_AUDIO_MODELS = [
|
15 |
"tts-1",
|
16 |
"tts-1-hd",
|
@@ -24,7 +20,7 @@ STANDARD_VOICES = [
|
|
24 |
"shimmer",
|
25 |
]
|
26 |
|
27 |
-
#
|
28 |
def optimize_script(script):
|
29 |
lines = [line.strip() for line in script.splitlines() if line.strip()]
|
30 |
optimized = []
|
@@ -39,32 +35,30 @@ def optimize_script(script):
|
|
39 |
speaker = "speaker-2"
|
40 |
text = line.split(":", 1)[1].strip()
|
41 |
else:
|
42 |
-
speaker = "speaker-1" #
|
43 |
text = line
|
44 |
|
45 |
-
#
|
46 |
if speaker != current_speaker and current_text:
|
47 |
optimized.append((current_speaker, current_text))
|
48 |
current_text = text
|
49 |
current_speaker = speaker
|
50 |
else:
|
51 |
-
#
|
52 |
if current_text:
|
53 |
current_text += " " + text
|
54 |
else:
|
55 |
current_text = text
|
56 |
current_speaker = speaker
|
57 |
|
58 |
-
#
|
59 |
if current_text:
|
60 |
optimized.append((current_speaker, current_text))
|
61 |
|
62 |
return optimized
|
63 |
|
64 |
def get_mp3(text: str, voice: str, audio_model: str, audio_api_key: str) -> bytes:
|
65 |
-
"""
|
66 |
-
使用 OpenAI TTS API 生成音頻
|
67 |
-
"""
|
68 |
client = OpenAI(api_key=audio_api_key)
|
69 |
try:
|
70 |
with client.audio.speech.with_streaming_response.create(
|
@@ -87,61 +81,41 @@ def generate_audio_from_script(
|
|
87 |
speaker1_voice: str = "onyx",
|
88 |
speaker2_voice: str = "nova",
|
89 |
) -> tuple[bytes, str]:
|
90 |
-
"""
|
91 |
-
|
92 |
-
"""
|
93 |
status_log = []
|
94 |
-
optimized_script = optimize_script(script)
|
95 |
-
|
96 |
-
# 使用 pydub 處理音頻
|
97 |
-
final_audio = AudioSegment.silent(duration=0)
|
98 |
|
99 |
-
|
|
|
100 |
|
|
|
101 |
for speaker, text in optimized_script:
|
102 |
voice_to_use = speaker1_voice if speaker == "speaker-1" else speaker2_voice
|
103 |
status_log.append(f"[{speaker}] {text}")
|
104 |
|
105 |
try:
|
106 |
-
#
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
temp_file.write(chunk)
|
115 |
-
temp_file.flush()
|
116 |
-
|
117 |
-
# 使用 pydub 加載和合併音頻
|
118 |
-
segment = AudioSegment.from_file(temp_file.name, format="mp3")
|
119 |
-
final_audio += segment
|
120 |
-
|
121 |
-
# 刪除臨時文件
|
122 |
-
os.unlink(temp_file.name)
|
123 |
-
|
124 |
except Exception as e:
|
125 |
-
status_log.append(f"[
|
126 |
-
|
127 |
-
# 將最終音頻轉換為 bytes
|
128 |
-
output_buffer = io.BytesIO()
|
129 |
-
final_audio.export(output_buffer, format="mp3")
|
130 |
-
output_buffer.seek(0)
|
131 |
|
132 |
-
return
|
133 |
|
134 |
def save_audio_file(audio_data: bytes) -> str:
|
135 |
-
"""
|
136 |
-
將音頻數據保存為臨時檔案
|
137 |
-
"""
|
138 |
temp_dir = Path("./temp_audio")
|
139 |
temp_dir.mkdir(exist_ok=True)
|
140 |
-
#
|
141 |
for old_file in temp_dir.glob("*.mp3"):
|
142 |
-
if old_file.stat().st_mtime < (time.time() - 24*60*60): # 24
|
143 |
old_file.unlink()
|
144 |
-
#
|
145 |
temp_file = NamedTemporaryFile(
|
146 |
dir=temp_dir,
|
147 |
delete=False,
|
@@ -151,23 +125,40 @@ def save_audio_file(audio_data: bytes) -> str:
|
|
151 |
temp_file.close()
|
152 |
return temp_file.name
|
153 |
|
154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
def create_gradio_interface():
|
156 |
with gr.Blocks(title="TTS Generator", css="""
|
157 |
#header { text-align: center; margin-bottom: 20px; }
|
158 |
""") as demo:
|
159 |
-
gr.Markdown("#
|
160 |
with gr.Row():
|
161 |
with gr.Column(scale=1):
|
162 |
-
#
|
163 |
script_input = gr.Textbox(
|
164 |
-
label="
|
165 |
-
placeholder="""
|
166 |
-
speaker-1:
|
167 |
speaker-2: 大家好,我是 Cordelia...
|
168 |
-
|
169 |
|
170 |
-
|
171 |
lines=20
|
172 |
)
|
173 |
api_key = gr.Textbox(
|
@@ -176,74 +167,47 @@ speaker-2: 大家好,我是 Cordelia...
|
|
176 |
)
|
177 |
with gr.Row():
|
178 |
audio_model = gr.Dropdown(
|
179 |
-
label="
|
180 |
choices=STANDARD_AUDIO_MODELS,
|
181 |
value="tts-1"
|
182 |
)
|
183 |
speaker1_voice = gr.Dropdown(
|
184 |
-
label="
|
185 |
choices=STANDARD_VOICES,
|
186 |
value="onyx"
|
187 |
)
|
188 |
speaker2_voice = gr.Dropdown(
|
189 |
-
label="
|
190 |
choices=STANDARD_VOICES,
|
191 |
value="nova"
|
192 |
)
|
193 |
-
generate_button = gr.Button("
|
194 |
with gr.Column(scale=1):
|
195 |
-
#
|
196 |
audio_output = gr.Audio(
|
197 |
-
label="
|
198 |
type="filepath"
|
199 |
)
|
200 |
status_output = gr.Textbox(
|
201 |
-
label="
|
202 |
lines=20,
|
203 |
show_copy_button=True
|
204 |
)
|
205 |
-
|
206 |
-
# 進度條
|
207 |
-
progress_bar = gr.Progress()
|
208 |
|
209 |
-
#
|
210 |
generate_button.click(
|
211 |
-
fn=
|
212 |
-
script, key, model, v1, v2, p
|
213 |
-
),
|
214 |
inputs=[
|
215 |
script_input,
|
216 |
api_key,
|
217 |
audio_model,
|
218 |
speaker1_voice,
|
219 |
-
speaker2_voice
|
220 |
],
|
221 |
outputs=[audio_output, status_output]
|
222 |
)
|
223 |
return demo
|
224 |
|
225 |
-
def process_and_save_audio(script, api_key, model, voice1, voice2, progress=gr.Progress()):
    """Generate podcast audio from *script*, save it, and report progress.

    NOTE(review): this is the pre-commit ("before") revision shown by the
    diff; the committed version drops the progress-tracking parameter.

    Args:
        script: Raw dialogue text ("speaker-1: ..." / "speaker-2: ..." lines).
        api_key: OpenAI API key forwarded to the TTS calls.
        model: TTS model name (e.g. "tts-1").
        voice1: Voice used for speaker 1.
        voice2: Voice used for speaker 2.
        progress: Gradio progress tracker; `gr.Progress()` as a default is
            the Gradio dependency-injection pattern — TODO confirm against
            the Gradio version pinned by this Space.

    Returns:
        (audio_file_path, status_log) on success, or
        (None, error_message) if any step raises.
    """
    try:
        progress(0, desc="準備處理腳本...")
        audio_data, status_log = generate_audio_from_script(
            script,
            api_key,
            model,
            voice1,
            voice2
        )
        progress(0.9, desc="保存音頻文件...")
        audio_path = save_audio_file(audio_data)
        progress(1.0, desc="完成!")
        return audio_path, status_log
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, so the error
        # text is returned to the status box instead of crashing the app.
        error_message = f"生成音頻時發生錯誤: {str(e)}"
        print(error_message)
        return None, error_message
|
246 |
-
|
247 |
if __name__ == "__main__":
|
248 |
demo = create_gradio_interface()
|
249 |
demo.launch()
|
|
|
5 |
import time
|
6 |
import gradio as gr
|
7 |
from openai import OpenAI
|
|
|
8 |
|
9 |
+
# 标准音频模型和声音选项
|
|
|
|
|
|
|
10 |
STANDARD_AUDIO_MODELS = [
|
11 |
"tts-1",
|
12 |
"tts-1-hd",
|
|
|
20 |
"shimmer",
|
21 |
]
|
22 |
|
23 |
+
# 优化脚本处理 - 合并相同说话者连续文本
|
24 |
def optimize_script(script):
|
25 |
lines = [line.strip() for line in script.splitlines() if line.strip()]
|
26 |
optimized = []
|
|
|
35 |
speaker = "speaker-2"
|
36 |
text = line.split(":", 1)[1].strip()
|
37 |
else:
|
38 |
+
speaker = "speaker-1" # 默认使用说话者1
|
39 |
text = line
|
40 |
|
41 |
+
# 如果说话者变了,保存之前的文本并开始新的
|
42 |
if speaker != current_speaker and current_text:
|
43 |
optimized.append((current_speaker, current_text))
|
44 |
current_text = text
|
45 |
current_speaker = speaker
|
46 |
else:
|
47 |
+
# 相同说话者,合并文本(加空格)
|
48 |
if current_text:
|
49 |
current_text += " " + text
|
50 |
else:
|
51 |
current_text = text
|
52 |
current_speaker = speaker
|
53 |
|
54 |
+
# 添加最后一个说话者的文本
|
55 |
if current_text:
|
56 |
optimized.append((current_speaker, current_text))
|
57 |
|
58 |
return optimized
|
59 |
|
60 |
def get_mp3(text: str, voice: str, audio_model: str, audio_api_key: str) -> bytes:
|
61 |
+
"""使用 OpenAI TTS API 生成音频"""
|
|
|
|
|
62 |
client = OpenAI(api_key=audio_api_key)
|
63 |
try:
|
64 |
with client.audio.speech.with_streaming_response.create(
|
|
|
81 |
speaker1_voice: str = "onyx",
|
82 |
speaker2_voice: str = "nova",
|
83 |
) -> tuple[bytes, str]:
|
84 |
+
"""从脚本生成音频,支持两个说话者,并优化 API 调用"""
|
85 |
+
combined_audio = b""
|
|
|
86 |
status_log = []
|
|
|
|
|
|
|
|
|
87 |
|
88 |
+
# 优化脚本处理
|
89 |
+
optimized_script = optimize_script(script)
|
90 |
|
91 |
+
# 处理每一段
|
92 |
for speaker, text in optimized_script:
|
93 |
voice_to_use = speaker1_voice if speaker == "speaker-1" else speaker2_voice
|
94 |
status_log.append(f"[{speaker}] {text}")
|
95 |
|
96 |
try:
|
97 |
+
# 生成这一段的音频
|
98 |
+
audio_chunk = get_mp3(
|
99 |
+
text,
|
100 |
+
voice_to_use,
|
101 |
+
audio_model,
|
102 |
+
audio_api_key
|
103 |
+
)
|
104 |
+
combined_audio += audio_chunk
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
except Exception as e:
|
106 |
+
status_log.append(f"[错误] 无法生成音频: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
107 |
|
108 |
+
return combined_audio, "\n".join(status_log)
|
109 |
|
110 |
def save_audio_file(audio_data: bytes) -> str:
|
111 |
+
"""将音频数据保存为临时文件"""
|
|
|
|
|
112 |
temp_dir = Path("./temp_audio")
|
113 |
temp_dir.mkdir(exist_ok=True)
|
114 |
+
# 清理旧文件
|
115 |
for old_file in temp_dir.glob("*.mp3"):
|
116 |
+
if old_file.stat().st_mtime < (time.time() - 24*60*60): # 24小时前的文件
|
117 |
old_file.unlink()
|
118 |
+
# 创建新的临时文件
|
119 |
temp_file = NamedTemporaryFile(
|
120 |
dir=temp_dir,
|
121 |
delete=False,
|
|
|
125 |
temp_file.close()
|
126 |
return temp_file.name
|
127 |
|
128 |
+
def process_and_save_audio(script, api_key, model, voice1, voice2):
    """Run TTS generation for *script* and persist the resulting MP3.

    Returns:
        (audio_file_path, status_log) on success, or
        (None, error_message) when generation or saving fails — the tuple
        shape matches the Gradio outputs [audio_output, status_output].
    """
    try:
        audio_bytes, log_text = generate_audio_from_script(
            script, api_key, model, voice1, voice2
        )
        # Saving stays inside the try so a write failure is also reported
        # through the status box rather than raised to the UI.
        return save_audio_file(audio_bytes), log_text
    except Exception as exc:
        # UI-boundary handler: log locally and surface the message to the user.
        message = f"生成音频时发生错误: {str(exc)}"
        print(message)
        return None, message
|
144 |
+
|
145 |
+
# Gradio 界面
|
146 |
def create_gradio_interface():
|
147 |
with gr.Blocks(title="TTS Generator", css="""
|
148 |
#header { text-align: center; margin-bottom: 20px; }
|
149 |
""") as demo:
|
150 |
+
gr.Markdown("# 语音合成器 | TTS Generator", elem_id="header")
|
151 |
with gr.Row():
|
152 |
with gr.Column(scale=1):
|
153 |
+
# 输入区
|
154 |
script_input = gr.Textbox(
|
155 |
+
label="输入脚本 | Input Script",
|
156 |
+
placeholder="""请粘贴脚本内容,格式如下:
|
157 |
+
speaker-1: 欢迎来到 David888 Podcast,我是 David...
|
158 |
speaker-2: 大家好,我是 Cordelia...
|
159 |
+
没有标记说话者的行会默认使用说话者1的声音。
|
160 |
|
161 |
+
提示:为提高效率,相同说话者的多行文字将自动合并处理。""",
|
162 |
lines=20
|
163 |
)
|
164 |
api_key = gr.Textbox(
|
|
|
167 |
)
|
168 |
with gr.Row():
|
169 |
audio_model = gr.Dropdown(
|
170 |
+
label="音频模型 | Audio Model",
|
171 |
choices=STANDARD_AUDIO_MODELS,
|
172 |
value="tts-1"
|
173 |
)
|
174 |
speaker1_voice = gr.Dropdown(
|
175 |
+
label="说话者1声音 | Speaker 1 Voice",
|
176 |
choices=STANDARD_VOICES,
|
177 |
value="onyx"
|
178 |
)
|
179 |
speaker2_voice = gr.Dropdown(
|
180 |
+
label="说话者2声音 | Speaker 2 Voice",
|
181 |
choices=STANDARD_VOICES,
|
182 |
value="nova"
|
183 |
)
|
184 |
+
generate_button = gr.Button("生成音频 | Generate Audio")
|
185 |
with gr.Column(scale=1):
|
186 |
+
# 输出区
|
187 |
audio_output = gr.Audio(
|
188 |
+
label="生成的音频 | Generated Audio",
|
189 |
type="filepath"
|
190 |
)
|
191 |
status_output = gr.Textbox(
|
192 |
+
label="生成状态 | Generation Status",
|
193 |
lines=20,
|
194 |
show_copy_button=True
|
195 |
)
|
|
|
|
|
|
|
196 |
|
197 |
+
# 事件处理
|
198 |
generate_button.click(
|
199 |
+
fn=process_and_save_audio,
|
|
|
|
|
200 |
inputs=[
|
201 |
script_input,
|
202 |
api_key,
|
203 |
audio_model,
|
204 |
speaker1_voice,
|
205 |
+
speaker2_voice
|
206 |
],
|
207 |
outputs=[audio_output, status_output]
|
208 |
)
|
209 |
return demo
|
210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
211 |
# Script entry point: build the Gradio UI and start the local server.
if __name__ == "__main__":
    create_gradio_interface().launch()
|