Michael Hu committed
Commit c72d839 · 1 Parent(s): f7102b4

add more logging

Files changed (4):
  1. app.py +39 -10
  2. utils/stt.py +51 -33
  3. utils/translation.py +42 -29
  4. utils/tts.py +42 -24
app.py CHANGED
@@ -3,6 +3,18 @@ Main entry point for the Audio Translation Web Application
 Handles file upload, processing pipeline, and UI rendering
 """
 
+# Configure logging first
+import logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.FileHandler("app.log"),
+        logging.StreamHandler()
+    ]
+)
+logger = logging.getLogger(__name__)
+
 import streamlit as st
 import os
 import time
@@ -14,12 +26,14 @@ from utils.tts_dummy import generate_speech
 # Hugging Face Spaces Setup Automation
 def setup_huggingface_space():
     """Automatically configure Hugging Face Space requirements"""
+    logger.debug("Running Hugging Face space setup")
     st.sidebar.header("Space Configuration")
 
-    # Check for required system packages
     try:
         subprocess.run(["espeak-ng", "--version"], check=True, capture_output=True)
+        logger.debug("espeak-ng verification successful")
     except (FileNotFoundError, subprocess.CalledProcessError):
+        logger.error("Missing espeak-ng dependency")
         st.sidebar.error("""
         **Missing System Dependencies!** Add this to your Space settings:
         ```txt
@@ -28,7 +42,6 @@ def setup_huggingface_space():
         """)
         st.stop()
 
-    # Verify model files
     model_dir = "./kokoro"
     required_files = [
         f"{model_dir}/kokoro-v0_19.pth",
@@ -36,6 +49,7 @@ def setup_huggingface_space():
     ]
 
     if not all(os.path.exists(f) for f in required_files):
+        logger.error("Missing model files in %s", model_dir)
         st.sidebar.warning("""
         **Missing Model Files!** Add this to your Space settings:
         ```txt
@@ -50,6 +64,7 @@ os.makedirs("temp/outputs", exist_ok=True)
 
 def configure_page():
     """Set up Streamlit page configuration"""
+    logger.debug("Configuring Streamlit page")
     st.set_page_config(
         page_title="Audio Translator",
         page_icon="🎧",
@@ -72,36 +87,51 @@ def handle_file_processing(upload_path):
     2. Machine Translation
     3. Text-to-Speech (TTS)
     """
+    logger.info(f"Starting processing for: {upload_path}")
     progress_bar = st.progress(0)
     status_text = st.empty()
 
     try:
         # STT Phase
+        logger.debug("Beginning STT processing")
         status_text.markdown("🔍 **Performing Speech Recognition...**")
-        english_text = transcribe_audio(upload_path)
+        with st.spinner("Initializing Whisper model..."):
+            english_text = transcribe_audio(upload_path)
         progress_bar.progress(30)
+        logger.info(f"STT completed. Text length: {len(english_text)} characters")
 
         # Translation Phase
+        logger.debug("Beginning translation")
         status_text.markdown("🌐 **Translating Content...**")
-        chinese_text = translate_text(english_text)
+        with st.spinner("Loading translation model..."):
+            chinese_text = translate_text(english_text)
        progress_bar.progress(60)
+        logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
 
         # TTS Phase
+        logger.debug("Beginning TTS generation")
         status_text.markdown("🎵 **Generating Chinese Speech...**")
-        output_path = generate_speech(chinese_text, language="zh")
+        with st.spinner("Initializing TTS engine..."):
+            output_path = generate_speech(chinese_text, language="zh")
         progress_bar.progress(100)
+        logger.info(f"TTS completed. Output file: {output_path}")
 
         # Display results
         status_text.success("✅ Processing Complete!")
         return english_text, chinese_text, output_path
 
     except Exception as e:
+        logger.error(f"Processing failed: {str(e)}", exc_info=True)
         status_text.error(f"❌ Processing Failed: {str(e)}")
         st.exception(e)
         raise
 
 def render_results(english_text, chinese_text, output_path):
     """Display processing results in organized columns"""
+    logger.debug("Rendering results")
     st.divider()
 
     col1, col2 = st.columns([2, 1])
@@ -125,12 +155,12 @@ def render_results(english_text, chinese_text, output_path):
 
 def main():
     """Main application workflow"""
+    logger.info("Starting application")
     # setup_huggingface_space()  # First-run configuration checks
     configure_page()
     st.title("🎧 High-Quality Audio Translation System")
     st.markdown("Upload English Audio → Get Chinese Speech Output")
 
-    # File uploader widget
     uploaded_file = st.file_uploader(
         "Select Audio File (MP3/WAV)",
         type=["mp3", "wav"],
@@ -138,12 +168,11 @@ def main():
     )
 
     if uploaded_file:
-        # Save uploaded file
+        logger.info(f"File uploaded: {uploaded_file.name}")
         upload_path = os.path.join("temp/uploads", uploaded_file.name)
         with open(upload_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
 
-        # Execute processing pipeline
         results = handle_file_processing(upload_path)
         if results:
             render_results(*results)
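
A note on the logging setup above: `logging.basicConfig` in app.py installs a `FileHandler` and a `StreamHandler` on the root logger, and the `logging.getLogger(__name__)` loggers created in the utils modules propagate to those root handlers, so no per-module handler wiring is needed. One consequence of `level=logging.INFO`: every `logger.debug(...)` call added in this commit is filtered out until that level is lowered. A minimal sketch of both behaviors (the child logger name is just illustrative):

```python
import logging

# Same configuration as app.py: root logger writes to app.log and stderr.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
)

# What each utils module does via logging.getLogger(__name__):
child = logging.getLogger("utils.stt")
child.info("reaches both root handlers")          # emitted
child.debug("dropped: DEBUG < INFO at the root")  # filtered out
```

Since Streamlit re-executes the script on every interaction, it also helps that `basicConfig` is a no-op once the root logger already has handlers, so handlers are not duplicated across reruns.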
utils/stt.py CHANGED
@@ -3,6 +3,9 @@ Speech Recognition Module using Whisper Large-v3
 Handles audio preprocessing and transcription
 """
 
+import logging
+logger = logging.getLogger(__name__)
+
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
 from pydub import AudioSegment
@@ -15,37 +18,52 @@ def transcribe_audio(audio_path):
     Returns:
         Transcribed English text
     """
-    # Configure hardware settings
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    # Convert to proper audio format
-    audio = AudioSegment.from_file(audio_path)
-    processed_audio = audio.set_frame_rate(16000).set_channels(1)
-    wav_path = audio_path.replace(".mp3", ".wav")
-    processed_audio.export(wav_path, format="wav")
-
-    # Initialize ASR model
-    model = AutoModelForSpeechSeq2Seq.from_pretrained(
-        "openai/whisper-large-v3",
-        torch_dtype=torch.float32,
-        low_cpu_mem_usage=True,
-        use_safetensors=True
-    ).to(device)
-
-    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
-
-    # Process audio input
-    inputs = processor(
-        wav_path,
-        sampling_rate=16000,
-        return_tensors="pt",
-        truncation=True,
-        chunk_length_s=30,
-        stride_length_s=5
-    ).to(device)
-
-    # Generate transcription
-    with torch.no_grad():
-        outputs = model.generate(**inputs, language="en", task="transcribe")
-
-    return processor.batch_decode(outputs, skip_special_tokens=True)[0]
+    logger.info(f"Starting transcription for: {audio_path}")
+
+    try:
+        # Audio conversion
+        logger.debug("Converting audio format")
+        audio = AudioSegment.from_file(audio_path)
+        processed_audio = audio.set_frame_rate(16000).set_channels(1)
+        wav_path = audio_path.replace(".mp3", ".wav")
+        processed_audio.export(wav_path, format="wav")
+        logger.debug(f"Audio converted to: {wav_path}")
+
+        # Model initialization
+        logger.info("Loading Whisper model")
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.debug(f"Using device: {device}")
+
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            "openai/whisper-large-v3",
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True,
+            use_safetensors=True
+        ).to(device)
+
+        processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
+        logger.debug("Model loaded successfully")
+
+        # Processing
+        logger.debug("Processing audio input")
+        inputs = processor(
+            wav_path,
+            sampling_rate=16000,
+            return_tensors="pt",
+            truncation=True,
+            chunk_length_s=30,
+            stride_length_s=5
+        ).to(device)
+
+        # Transcription
+        logger.info("Generating transcription")
+        with torch.no_grad():
+            outputs = model.generate(**inputs, language="en", task="transcribe")
+
+        result = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+        logger.info("Transcription completed successfully")
+        return result
+
+    except Exception as e:
+        logger.error(f"Transcription failed: {str(e)}", exc_info=True)
+        raise
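
One caveat in `transcribe_audio` that the new error logging will surface quickly: the Whisper processor expects a raw waveform (a float array plus `sampling_rate`), not a path string, so passing `wav_path` directly is likely to fail inside the feature extractor; `chunk_length_s`/`stride_length_s` are, as far as I can tell, arguments of the transformers ASR pipeline rather than of the processor call. A hedged sketch of the array-based call, reusing `model`, `processor`, and `device` as defined above and assuming the 16 kHz mono WAV produced by the pydub step:

```python
import numpy as np
import torch
from pydub import AudioSegment

# Load the converted 16 kHz mono WAV and scale 16-bit PCM to [-1, 1].
audio = AudioSegment.from_wav(wav_path)
samples = np.array(audio.get_array_of_samples()).astype(np.float32) / 32768.0

# Feed the waveform, not the path, to the Whisper processor.
inputs = processor(samples, sampling_rate=16000, return_tensors="pt").to(device)

with torch.no_grad():
    outputs = model.generate(**inputs, language="en", task="transcribe")

english_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
```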
utils/translation.py CHANGED
@@ -3,6 +3,9 @@ Text Translation Module using NLLB-3.3B model
 Handles text segmentation and batch translation
 """
 
+import logging
+logger = logging.getLogger(__name__)
+
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 
 def translate_text(text):
@@ -13,33 +16,43 @@ def translate_text(text):
     Returns:
         Translated Chinese text
     """
-    # Initialize translation model
-    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
-    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
-
-    # Split long text into manageable chunks
-    max_chunk_length = 1000
-    text_chunks = [
-        text[i:i+max_chunk_length]
-        for i in range(0, len(text), max_chunk_length)
-    ]
-
-    translated_chunks = []
-    for chunk in text_chunks:
-        # Prepare model inputs
-        inputs = tokenizer(
-            chunk,
-            return_tensors="pt",
-            max_length=1024,
-            truncation=True
-        )
-
-        # Generate translation
-        outputs = model.generate(
-            **inputs,
-            forced_bos_token_id=tokenizer.lang_code_to_id["zho_Hans"],
-            max_new_tokens=1024
-        )
-        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
-
-    return "".join(translated_chunks)
+    logger.info(f"Starting translation for text length: {len(text)}")
+
+    try:
+        # Model initialization
+        logger.info("Loading NLLB model")
+        tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
+        model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
+        logger.debug("Translation model loaded")
+
+        # Text processing
+        max_chunk_length = 1000
+        text_chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)]
+        logger.info(f"Split text into {len(text_chunks)} chunks")
+
+        translated_chunks = []
+        for i, chunk in enumerate(text_chunks):
+            logger.debug(f"Processing chunk {i+1}/{len(text_chunks)}")
+            inputs = tokenizer(
+                chunk,
+                return_tensors="pt",
+                max_length=1024,
+                truncation=True
+            )
+
+            outputs = model.generate(
+                **inputs,
+                forced_bos_token_id=tokenizer.lang_code_to_id["zho_Hans"],
+                max_new_tokens=1024
+            )
+            translated = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            translated_chunks.append(translated)
+            logger.debug(f"Chunk {i+1} translated successfully")
+
+        result = "".join(translated_chunks)
+        logger.info(f"Translation completed. Total length: {len(result)}")
+        return result
+
+    except Exception as e:
+        logger.error(f"Translation failed: {str(e)}", exc_info=True)
+        raise
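
A portability note on `tokenizer.lang_code_to_id["zho_Hans"]`: that mapping exists on older transformers releases but was removed from the NLLB tokenizer in newer ones, where `convert_tokens_to_ids("zho_Hans")` is the equivalent; the source language is also worth pinning via `src_lang` so inputs carry the English tag. A version-tolerant sketch, assuming the same checkpoint as above:

```python
from transformers import AutoTokenizer

# Pin the source language so encoder inputs are tagged eng_Latn.
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-3.3B", src_lang="eng_Latn"
)

# Resolve the target-language BOS id across transformers versions.
try:
    zho_hans_id = tokenizer.lang_code_to_id["zho_Hans"]        # older releases
except AttributeError:
    zho_hans_id = tokenizer.convert_tokens_to_ids("zho_Hans")  # newer releases

# Then, as in translate_text above:
# outputs = model.generate(**inputs, forced_bos_token_id=zho_hans_id, max_new_tokens=1024)
```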
utils/tts.py CHANGED
@@ -1,10 +1,13 @@
 import os
 import torch
 import time
+import logging
 from pydub import AudioSegment
 from phonemizer.backend.espeak.wrapper import EspeakWrapper
 from models import build_model
 
+logger = logging.getLogger(__name__)
+
 # Hugging Face Spaces setup
 MODEL_DIR = "./kokoro"
 os.makedirs(MODEL_DIR, exist_ok=True)
@@ -14,12 +17,17 @@ EspeakWrapper.set_library('/usr/lib/x86_64-linux-gnu/libespeak-ng.so.1')
 
 class TTSEngine:
     def __init__(self):
+        logger.info("Initializing TTS Engine")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.debug(f"Using device: {self.device}")
         self._verify_model_files()
+        logger.info("Loading Kokoro model")
         self.model = build_model(f"{MODEL_DIR}/kokoro-v0_19.pth", self.device)
+        logger.info("Loading voice model")
         self.voice = torch.load(f"{MODEL_DIR}/voices/af_bella.pt",
                                 map_location=self.device)
-
+        logger.info("TTS engine initialized")
+
     def _verify_model_files(self):
         """Ensure required model files exist"""
         required_files = [
@@ -29,6 +37,7 @@ class TTSEngine:
 
         missing = [f for f in required_files if not os.path.exists(f)]
         if missing:
+            logger.error(f"Missing model files: {missing}")
             raise FileNotFoundError(
                 f"Missing model files: {missing}\n"
                 "Add this to your Hugging Face Space settings:\n"
@@ -38,30 +47,39 @@ class TTSEngine:
 
     def generate_speech(self, text: str, language: str = "zh") -> str:
         """Generate speech from Chinese text"""
-        from kokoro import generate_full
-
-        # Safety checks for Hugging Face Free Tier
-        if len(text) > 500:
-            text = text[:495] + "[TRUNCATED]"
-
-        audio, _ = generate_full(
-            self.model,
-            text,
-            self.voice,
-            lang='en-us',
-            max_len=200 if self.device == "cpu" else 500
-        )
-
-        # Save output
-        output_path = f"temp/outputs/output_{int(time.time())}.wav"
-        AudioSegment(
-            audio.numpy().tobytes(),
-            frame_rate=24000,
-            sample_width=2,
-            channels=1
-        ).export(output_path, format="wav")
-
-        return output_path
+        logger.info(f"Generating speech for text length: {len(text)}")
+
+        try:
+            from kokoro import generate_full
+
+            if len(text) > 500:
+                logger.warning(f"Truncating long text ({len(text)} characters)")
+                text = text[:495] + "[TRUNCATED]"
+
+            logger.debug("Starting audio generation")
+            audio, _ = generate_full(
+                self.model,
+                text,
+                self.voice,
+                lang='en-us',
+                max_len=200 if self.device == "cpu" else 500
+            )
+
+            output_path = f"temp/outputs/output_{int(time.time())}.wav"
+            logger.debug(f"Saving audio to {output_path}")
+            AudioSegment(
+                audio.numpy().tobytes(),
+                frame_rate=24000,
+                sample_width=2,
+                channels=1
+            ).export(output_path, format="wav")
+
+            logger.info(f"Audio generation complete: {output_path}")
+            return output_path
+
+        except Exception as e:
+            logger.error(f"TTS generation failed: {str(e)}", exc_info=True)
+            raise
 
 # Initialize TTS engine once
 @st.cache_resource
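
The trailing context shows a `@st.cache_resource` decorator, but nothing in the visible diff imports streamlit into utils/tts.py; if that import is in fact missing, the module raises `NameError` at import time, before any of the new logging runs. A sketch of the cached-singleton pattern the decorator implies (`get_engine` and the module-level wrapper are hypothetical, since the diff is cut off here):

```python
import streamlit as st

@st.cache_resource  # construct the engine once per process, reuse across reruns
def get_engine() -> "TTSEngine":
    return TTSEngine()

def generate_speech(text: str, language: str = "zh") -> str:
    # Thin wrapper so app.py can keep importing a plain function.
    return get_engine().generate_speech(text, language)
```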