Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,7 +11,8 @@ import shutil
|
|
11 |
from pathlib import Path
|
12 |
import time
|
13 |
from tqdm import tqdm
|
14 |
-
|
|
|
15 |
|
16 |
# Set up logging
|
17 |
logging.basicConfig(level=logging.INFO,
|
@@ -30,20 +31,35 @@ LANGUAGES = {
|
|
30 |
"Hindi": "hi"
|
31 |
}
|
32 |
|
33 |
-
# TTS
|
34 |
-
|
35 |
-
"en": "en-
|
36 |
-
"es": "es
|
37 |
-
"fr": "fr
|
38 |
-
"de": "de-
|
39 |
-
"ja": "ja-
|
40 |
-
"hi": "hi-
|
41 |
}
|
42 |
|
43 |
# Create a permanent output directory
|
44 |
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
|
45 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def extract_audio(video_path):
|
48 |
"""Extract audio from video file using ffmpeg"""
|
49 |
try:
|
@@ -124,7 +140,7 @@ def translate_subtitles(srt_path, target_langs):
|
|
124 |
raise Exception(f"Translation failed: {str(e)}")
|
125 |
|
126 |
def generate_translated_audio(srt_path, target_lang):
|
127 |
-
"""Generate translated audio using
|
128 |
try:
|
129 |
logger.info(f"Generating translated audio for {target_lang}")
|
130 |
subs = pysrt.open(srt_path, encoding="utf-8")
|
@@ -138,6 +154,11 @@ def generate_translated_audio(srt_path, target_lang):
|
|
138 |
audio_files = []
|
139 |
timings = []
|
140 |
|
|
|
|
|
|
|
|
|
|
|
141 |
for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} speech")):
|
142 |
text = sub.text.strip()
|
143 |
if not text:
|
@@ -157,31 +178,11 @@ def generate_translated_audio(srt_path, target_lang):
|
|
157 |
duration = end_time - start_time
|
158 |
|
159 |
# Generate TTS audio
|
160 |
-
|
161 |
-
audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.mp3")
|
162 |
|
163 |
try:
|
164 |
-
#
|
165 |
-
|
166 |
-
max_retries = 3
|
167 |
-
while retry_count < max_retries:
|
168 |
-
try:
|
169 |
-
# For Hindi, use slower speed which might improve reliability
|
170 |
-
slow_option = target_lang == "hi"
|
171 |
-
tts = gTTS(text=text, lang=target_lang, slow=slow_option)
|
172 |
-
tts.save(audio_file)
|
173 |
-
break
|
174 |
-
except Exception as e:
|
175 |
-
retry_count += 1
|
176 |
-
logger.warning(f"TTS attempt {retry_count} failed for {target_lang}: {str(e)}")
|
177 |
-
time.sleep(1) # Wait before retrying
|
178 |
-
|
179 |
-
# If still failing after retries, try with shorter text
|
180 |
-
if retry_count == max_retries and len(text) > 100:
|
181 |
-
logger.warning(f"Trying with shortened text for {target_lang}")
|
182 |
-
shortened_text = text[:100] + "..."
|
183 |
-
tts = gTTS(text=shortened_text, lang=target_lang, slow=True)
|
184 |
-
tts.save(audio_file)
|
185 |
|
186 |
if os.path.exists(audio_file) and os.path.getsize(audio_file) > 0:
|
187 |
audio_files.append(audio_file)
|
@@ -587,11 +588,11 @@ if __name__ == "__main__":
|
|
587 |
missing_deps.append("assemblyai")
|
588 |
|
589 |
try:
|
590 |
-
import
|
591 |
-
logger.info("
|
592 |
except ImportError:
|
593 |
-
logger.warning("
|
594 |
-
missing_deps.append("
|
595 |
|
596 |
try:
|
597 |
import deep_translator
|
|
|
11 |
from pathlib import Path
|
12 |
import time
|
13 |
from tqdm import tqdm
|
14 |
+
import torch
|
15 |
+
from TTS.api import TTS
|
16 |
|
17 |
# Set up logging
|
18 |
logging.basicConfig(level=logging.INFO,
|
|
|
31 |
"Hindi": "hi"
|
32 |
}
|
33 |
|
34 |
+
# TTS model mapping for different languages
|
35 |
+
TTS_MODELS = {
|
36 |
+
"en": "tts_models/en/ljspeech/tacotron2-DDC_ph",
|
37 |
+
"es": "tts_models/es/css10/vits",
|
38 |
+
"fr": "tts_models/fr/css10/vits",
|
39 |
+
"de": "tts_models/de/thorsten/tacotron2-DDC",
|
40 |
+
"ja": "tts_models/ja/kokoro/tacotron2-DDC",
|
41 |
+
"hi": "tts_models/hi/kb/tacotron2-DDC"
|
42 |
}
|
43 |
|
44 |
# Create a permanent output directory
|
45 |
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
|
46 |
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
47 |
|
48 |
+
# Initialize TTS
|
49 |
+
def init_tts():
|
50 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
51 |
+
tts_models = {}
|
52 |
+
for lang_code, model_name in TTS_MODELS.items():
|
53 |
+
try:
|
54 |
+
tts = TTS(model_name=model_name, progress_bar=False).to(device)
|
55 |
+
tts_models[lang_code] = tts
|
56 |
+
logger.info(f"Loaded TTS model for {lang_code}: {model_name}")
|
57 |
+
except Exception as e:
|
58 |
+
logger.warning(f"Failed to load TTS model for {lang_code}: {str(e)}")
|
59 |
+
return tts_models
|
60 |
+
|
61 |
+
tts_models = init_tts()
|
62 |
+
|
63 |
def extract_audio(video_path):
|
64 |
"""Extract audio from video file using ffmpeg"""
|
65 |
try:
|
|
|
140 |
raise Exception(f"Translation failed: {str(e)}")
|
141 |
|
142 |
def generate_translated_audio(srt_path, target_lang):
|
143 |
+
"""Generate translated audio using Coqui TTS"""
|
144 |
try:
|
145 |
logger.info(f"Generating translated audio for {target_lang}")
|
146 |
subs = pysrt.open(srt_path, encoding="utf-8")
|
|
|
154 |
audio_files = []
|
155 |
timings = []
|
156 |
|
157 |
+
# Get the appropriate TTS model
|
158 |
+
tts = tts_models.get(target_lang)
|
159 |
+
if tts is None:
|
160 |
+
raise Exception(f"No TTS model available for language: {target_lang}")
|
161 |
+
|
162 |
for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} speech")):
|
163 |
text = sub.text.strip()
|
164 |
if not text:
|
|
|
178 |
duration = end_time - start_time
|
179 |
|
180 |
# Generate TTS audio
|
181 |
+
audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.wav")
|
|
|
182 |
|
183 |
try:
|
184 |
+
# For multi-speaker models, we might need to specify speaker
|
185 |
+
tts.tts_to_file(text=text, file_path=audio_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
if os.path.exists(audio_file) and os.path.getsize(audio_file) > 0:
|
188 |
audio_files.append(audio_file)
|
|
|
588 |
missing_deps.append("assemblyai")
|
589 |
|
590 |
try:
|
591 |
+
import TTS
|
592 |
+
logger.info("Coqui TTS package found")
|
593 |
except ImportError:
|
594 |
+
logger.warning("Coqui TTS package not found - required for text-to-speech")
|
595 |
+
missing_deps.append("TTS")
|
596 |
|
597 |
try:
|
598 |
import deep_translator
|