Nishur committed
Commit 8ae26b9 · verified · 1 Parent(s): 7ddad4d

Update app.py

Files changed (1):
  app.py  +38 -37
app.py CHANGED
@@ -11,7 +11,8 @@ import shutil
 from pathlib import Path
 import time
 from tqdm import tqdm
-from gtts import gTTS
+import torch
+from TTS.api import TTS
 
 # Set up logging
 logging.basicConfig(level=logging.INFO,
@@ -30,20 +31,35 @@ LANGUAGES = {
     "Hindi": "hi"
 }
 
-# TTS voice mapping for different languages
-TTS_VOICES = {
-    "en": "en-US",
-    "es": "es-ES",
-    "fr": "fr-FR",
-    "de": "de-DE",
-    "ja": "ja-JP",
-    "hi": "hi-IN"
+# TTS model mapping for different languages
+TTS_MODELS = {
+    "en": "tts_models/en/ljspeech/tacotron2-DDC_ph",
+    "es": "tts_models/es/css10/vits",
+    "fr": "tts_models/fr/css10/vits",
+    "de": "tts_models/de/thorsten/tacotron2-DDC",
+    "ja": "tts_models/ja/kokoro/tacotron2-DDC",
+    "hi": "tts_models/hi/kb/tacotron2-DDC"
 }
 
 # Create a permanent output directory
 OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 
+# Initialize TTS
+def init_tts():
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    tts_models = {}
+    for lang_code, model_name in TTS_MODELS.items():
+        try:
+            tts = TTS(model_name=model_name, progress_bar=False).to(device)
+            tts_models[lang_code] = tts
+            logger.info(f"Loaded TTS model for {lang_code}: {model_name}")
+        except Exception as e:
+            logger.warning(f"Failed to load TTS model for {lang_code}: {str(e)}")
+    return tts_models
+
+tts_models = init_tts()
+
 def extract_audio(video_path):
     """Extract audio from video file using ffmpeg"""
     try:
@@ -124,7 +140,7 @@ def translate_subtitles(srt_path, target_langs):
         raise Exception(f"Translation failed: {str(e)}")
 
 def generate_translated_audio(srt_path, target_lang):
-    """Generate translated audio using text-to-speech"""
+    """Generate translated audio using Coqui TTS"""
     try:
         logger.info(f"Generating translated audio for {target_lang}")
         subs = pysrt.open(srt_path, encoding="utf-8")
@@ -138,6 +154,11 @@ def generate_translated_audio(srt_path, target_lang):
         audio_files = []
         timings = []
 
+        # Get the appropriate TTS model
+        tts = tts_models.get(target_lang)
+        if tts is None:
+            raise Exception(f"No TTS model available for language: {target_lang}")
+
         for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} speech")):
             text = sub.text.strip()
             if not text:
@@ -157,31 +178,11 @@ def generate_translated_audio(srt_path, target_lang):
             duration = end_time - start_time
 
             # Generate TTS audio
-            tts_lang = TTS_VOICES.get(target_lang, target_lang)
-            audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.mp3")
+            audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.wav")
 
             try:
-                # Add a retry mechanism for Hindi and other potentially problematic languages
-                retry_count = 0
-                max_retries = 3
-                while retry_count < max_retries:
-                    try:
-                        # For Hindi, use slower speed which might improve reliability
-                        slow_option = target_lang == "hi"
-                        tts = gTTS(text=text, lang=target_lang, slow=slow_option)
-                        tts.save(audio_file)
-                        break
-                    except Exception as e:
-                        retry_count += 1
-                        logger.warning(f"TTS attempt {retry_count} failed for {target_lang}: {str(e)}")
-                        time.sleep(1)  # Wait before retrying
-
-                # If still failing after retries, try with shorter text
-                if retry_count == max_retries and len(text) > 100:
-                    logger.warning(f"Trying with shortened text for {target_lang}")
-                    shortened_text = text[:100] + "..."
-                    tts = gTTS(text=shortened_text, lang=target_lang, slow=True)
-                    tts.save(audio_file)
+                # For multi-speaker models, we might need to specify speaker
+                tts.tts_to_file(text=text, file_path=audio_file)
 
                 if os.path.exists(audio_file) and os.path.getsize(audio_file) > 0:
                     audio_files.append(audio_file)
@@ -587,11 +588,11 @@ if __name__ == "__main__":
         missing_deps.append("assemblyai")
 
     try:
-        import gtts
-        logger.info("gTTS package found")
+        import TTS
+        logger.info("Coqui TTS package found")
     except ImportError:
-        logger.warning("gTTS package not found - required for text-to-speech")
-        missing_deps.append("gtts")
+        logger.warning("Coqui TTS package not found - required for text-to-speech")
+        missing_deps.append("TTS")
 
     try:
         import deep_translator
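
For context, a minimal sketch (not part of the commit) of the Coqui TTS flow this change adopts: pick a model from the TTS_MODELS mapping above, load it onto the available device, and synthesize one subtitle chunk to a WAV file. The example text and output path are illustrative.

import torch
from TTS.api import TTS

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the English entry from TTS_MODELS; the other entries are driven the same way.
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC_ph", progress_bar=False).to(device)

# Write one synthesized chunk to disk (path is illustrative).
tts.tts_to_file(text="Hello from the dubbed track.", file_path="chunk_0000.wav")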