Michael Hu committed on
Commit
34f1262
·
1 Parent(s): 9740afc

update app to include audio playback

Browse files
Files changed (2) hide show
  1. app.py +55 -43
  2. requirements.txt +1 -1
app.py CHANGED
@@ -3,7 +3,6 @@ Main entry point for the Audio Translation Web Application
3
  Handles file upload, processing pipeline, and UI rendering
4
  """
5
 
6
- # Configure logging first
7
  import logging
8
  logging.basicConfig(
9
  level=logging.INFO,
@@ -21,42 +20,7 @@ import time
21
  import subprocess
22
  from utils.stt import transcribe_audio
23
  from utils.translation import translate_text
24
- from utils.tts import generate_speech
25
-
26
- # Hugging Face Spaces Setup Automation
27
- def setup_huggingface_space():
28
- """Automatically configure Hugging Face Space requirements"""
29
- logger.info("Running Hugging Face space setup")
30
- st.sidebar.header("Space Configuration")
31
-
32
- try:
33
- subprocess.run(["espeak-ng", "--version"], check=True, capture_output=True)
34
- logger.info("espeak-ng verification successful")
35
- except (FileNotFoundError, subprocess.CalledProcessError):
36
- logger.error("Missing espeak-ng dependency")
37
- st.sidebar.error("""
38
- **Missing System Dependencies!** Add this to your Space settings:
39
- ```txt
40
- apt-get update && apt-get install -y espeak-ng
41
- ```
42
- """)
43
- st.stop()
44
-
45
- model_dir = "./kokoro"
46
- required_files = [
47
- f"{model_dir}/kokoro-v0_19.pth",
48
- f"{model_dir}/voices/af_bella.pt"
49
- ]
50
-
51
- if not all(os.path.exists(f) for f in required_files):
52
- logger.error("Missing model files in %s", model_dir)
53
- st.sidebar.warning("""
54
- **Missing Model Files!** Add this to your Space settings:
55
- ```txt
56
- git clone https://huggingface.co/hexgrad/Kokoro-82M ./kokoro
57
- ```
58
- """)
59
- st.stop()
60
 
61
  # Initialize environment configurations
62
  os.makedirs("temp/uploads", exist_ok=True)
@@ -111,15 +75,18 @@ def handle_file_processing(upload_path):
111
  # TTS Phase
112
  logger.info("Beginning TTS generation")
113
  status_text.markdown("🎵 **Generating Chinese Speech...**")
114
- with st.spinner("Initializing TTS engine..."):
115
- output_path = generate_speech(chinese_text, voice="zf_xiaobei")
 
 
 
 
116
  progress_bar.progress(100)
117
  logger.info(f"TTS completed. Output file: {output_path}")
118
 
 
 
119
 
120
- # Display results
121
-
122
- # Display results
123
  status_text.success("✅ Processing Complete!")
124
  return english_text, chinese_text, output_path
125
 
@@ -144,7 +111,10 @@ def render_results(english_text, chinese_text, output_path):
144
 
145
  with col2:
146
  st.subheader("Audio Output")
 
147
  st.audio(output_path)
 
 
148
  with open(output_path, "rb") as f:
149
  st.download_button(
150
  label="Download Audio",
@@ -152,15 +122,57 @@ def render_results(english_text, chinese_text, output_path):
152
  file_name="translated_audio.wav",
153
  mime="audio/wav"
154
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  def main():
157
  """Main application workflow"""
158
  logger.info("Starting application")
159
- # setup_huggingface_space() # First-run configuration checks
160
  configure_page()
 
 
161
  st.title("🎧 High-Quality Audio Translation System")
162
  st.markdown("Upload English Audio → Get Chinese Speech Output")
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
  uploaded_file = st.file_uploader(
165
  "Select Audio File (MP3/WAV)",
166
  type=["mp3", "wav"],
 
3
  Handles file upload, processing pipeline, and UI rendering
4
  """
5
 
 
6
  import logging
7
  logging.basicConfig(
8
  level=logging.INFO,
 
20
  import subprocess
21
  from utils.stt import transcribe_audio
22
  from utils.translation import translate_text
23
+ from utils.tts import get_tts_engine, generate_speech
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
  # Initialize environment configurations
26
  os.makedirs("temp/uploads", exist_ok=True)
 
75
  # TTS Phase
76
  logger.info("Beginning TTS generation")
77
  status_text.markdown("🎵 **Generating Chinese Speech...**")
78
+
79
+ # Initialize TTS engine with appropriate language code for Chinese
80
+ engine = get_tts_engine(lang_code='z') # 'z' for Mandarin Chinese
81
+
82
+ # Generate speech and get the file path
83
+ output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
84
  progress_bar.progress(100)
85
  logger.info(f"TTS completed. Output file: {output_path}")
86
 
87
+ # Store the text for streaming playback
88
+ st.session_state.current_text = chinese_text
89
 
 
 
 
90
  status_text.success("✅ Processing Complete!")
91
  return english_text, chinese_text, output_path
92
 
 
111
 
112
  with col2:
113
  st.subheader("Audio Output")
114
+ # Standard audio player for the full file
115
  st.audio(output_path)
116
+
117
+ # Download button
118
  with open(output_path, "rb") as f:
119
  st.download_button(
120
  label="Download Audio",
 
122
  file_name="translated_audio.wav",
123
  mime="audio/wav"
124
  )
125
+
126
+ # Streaming playback controls
127
+ st.subheader("Streaming Playback")
128
+ if st.button("Stream Audio"):
129
+ engine = get_tts_engine(lang_code='z')
130
+ streaming_placeholder = st.empty()
131
+
132
+ # Stream the audio in chunks
133
+ for sample_rate, audio_chunk in engine.generate_speech_stream(
134
+ chinese_text,
135
+ voice="zf_xiaobei"
136
+ ):
137
+ # Create a temporary file for each chunk
138
+ temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
139
+ import soundfile as sf
140
+ sf.write(temp_chunk_path, audio_chunk, sample_rate)
141
+
142
+ # Play the chunk
143
+ with streaming_placeholder:
144
+ st.audio(temp_chunk_path, sample_rate=sample_rate)
145
+
146
+ # Clean up the temporary chunk file
147
+ os.remove(temp_chunk_path)
148
+
149
+ def initialize_session_state():
150
+ """Initialize session state variables"""
151
+ if 'current_text' not in st.session_state:
152
+ st.session_state.current_text = None
153
 
154
  def main():
155
  """Main application workflow"""
156
  logger.info("Starting application")
 
157
  configure_page()
158
+ initialize_session_state()
159
+
160
  st.title("🎧 High-Quality Audio Translation System")
161
  st.markdown("Upload English Audio → Get Chinese Speech Output")
162
 
163
+ # Voice selection in sidebar
164
+ st.sidebar.header("TTS Settings")
165
+ voice_options = {
166
+ "Xiaobei (Female)": "zf_xiaobei",
167
+ "Yunjian (Male)": "zm_yunjian",
168
+ }
169
+ selected_voice = st.sidebar.selectbox(
170
+ "Select Voice",
171
+ list(voice_options.keys()),
172
+ format_func=lambda x: x
173
+ )
174
+ speed = st.sidebar.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
175
+
176
  uploaded_file = st.file_uploader(
177
  "Select Audio File (MP3/WAV)",
178
  type=["mp3", "wav"],
requirements.txt CHANGED
@@ -13,4 +13,4 @@ scipy>=1.11
13
  munch>=2.5
14
  accelerate>=1.2.0
15
  soundfile>=0.13.0
16
- # git+https://github.com/hexgrad/Kokoro-82M
 
13
  munch>=2.5
14
  accelerate>=1.2.0
15
  soundfile>=0.13.0
16
+ kokoro>=0.7.9