Nishur committed on
Commit
dc43c7f
·
verified ·
1 Parent(s): 68acba2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +404 -559
app.py CHANGED
@@ -1,616 +1,461 @@
1
  import gradio as gr
2
  import os
3
- import tempfile
4
  import subprocess
5
- import assemblyai as aai
 
6
  from deep_translator import GoogleTranslator
7
  import pysrt
8
- import logging
9
- import sys
10
  import shutil
11
- from pathlib import Path
12
  import time
13
  from tqdm import tqdm
14
- import torch
15
- from TTS.api import TTS
16
 
17
  # Set up logging
18
  logging.basicConfig(level=logging.INFO,
19
- format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
- stream=sys.stdout)
21
  logger = logging.getLogger(__name__)
22
 
23
  # Configuration
24
- aai.settings.api_key = os.getenv("ASSEMBLYAI_API_KEY")
25
  LANGUAGES = {
26
- "English": "en",
27
- "Spanish": "es",
28
- "French": "fr",
29
- "German": "de",
30
- "Japanese": "ja",
31
- "Hindi": "hi"
32
  }
33
 
34
- # TTS model mapping for different languages
35
- TTS_MODELS = {
36
- "en": "tts_models/en/ljspeech/tacotron2-DDC_ph",
37
- "es": "tts_models/es/css10/vits",
38
- "fr": "tts_models/fr/css10/vits",
39
- "de": "tts_models/de/thorsten/tacotron2-DDC",
40
- "ja": "tts_models/ja/kokoro/tacotron2-DDC",
41
- "hi": "tts_models/hi/kb/tacotron2-DDC"
42
  }
43
 
44
- # Create a permanent output directory
45
- OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
46
  os.makedirs(OUTPUT_DIR, exist_ok=True)
47
 
48
  # Initialize TTS
49
- def init_tts():
50
- device = "cuda" if torch.cuda.is_available() else "cpu"
51
- tts_models = {}
52
- for lang_code, model_name in TTS_MODELS.items():
53
- try:
54
- tts = TTS(model_name=model_name, progress_bar=False).to(device)
55
- tts_models[lang_code] = tts
56
- logger.info(f"Loaded TTS model for {lang_code}: {model_name}")
57
- except Exception as e:
58
- logger.warning(f"Failed to load TTS model for {lang_code}: {str(e)}")
59
- return tts_models
60
 
61
- tts_models = init_tts()
 
62
 
63
- def extract_audio(video_path):
64
- """Extract audio from video file using ffmpeg"""
65
- try:
66
- logger.info(f"Extracting audio from video: {video_path}")
67
- audio_path = os.path.join(OUTPUT_DIR, "audio.wav")
68
-
69
- # Use ffmpeg to extract audio
70
- cmd = [
71
- 'ffmpeg',
72
- '-i', video_path,
73
- '-vn', # No video
74
- '-acodec', 'pcm_s16le', # PCM format
75
- '-ar', '44100', # Sample rate
76
- '-ac', '2', # Stereo
77
- '-y', # Overwrite output file
78
- audio_path
79
- ]
80
-
81
- logger.info(f"Running command: {' '.join(cmd)}")
82
- process = subprocess.run(cmd, capture_output=True, text=True)
83
-
84
- if process.returncode != 0:
85
- logger.error(f"Audio extraction failed: {process.stderr}")
86
- raise Exception(f"Audio extraction failed: {process.stderr}")
87
-
88
- return audio_path
89
- except Exception as e:
90
- logger.error(f"Audio extraction failed: {str(e)}", exc_info=True)
91
- raise Exception(f"Audio extraction failed: {str(e)}")
92
 
93
- def generate_subtitles(audio_path):
94
- """Generate subtitles using AssemblyAI"""
95
- try:
96
- logger.info(f"Transcribing audio with AssemblyAI: {audio_path}")
97
- transcriber = aai.Transcriber()
98
- transcript = transcriber.transcribe(audio_path)
99
-
100
- srt_path = os.path.join(OUTPUT_DIR, "subtitles.srt")
101
- logger.info(f"Saving subtitles to: {srt_path}")
102
-
103
- with open(srt_path, "w", encoding="utf-8") as f:
104
- f.write(transcript.export_subtitles_srt())
105
-
106
- return srt_path
107
- except Exception as e:
108
- logger.error(f"Subtitle generation failed: {str(e)}", exc_info=True)
109
- raise Exception(f"Subtitle generation failed: {str(e)}")
110
 
111
- def translate_subtitles(srt_path, target_langs):
112
- """Translate subtitles to target languages"""
113
- try:
114
- logger.info(f"Loading subtitles from: {srt_path}")
115
- subs = pysrt.open(srt_path, encoding="utf-8")
116
- results = {}
117
-
118
- for lang_code in target_langs:
119
- logger.info(f"Translating to language code: {lang_code}")
120
- translated_subs = subs[:]
121
- translator = GoogleTranslator(source="auto", target=lang_code)
122
-
123
- for i, sub in enumerate(translated_subs):
124
- try:
125
- sub.text = translator.translate(sub.text)
126
- if i % 10 == 0: # Log progress every 10 subtitles
127
- logger.info(f"Translated {i+1}/{len(translated_subs)} subtitles to {lang_code}")
128
- except Exception as e:
129
- logger.warning(f"Failed to translate subtitle: {sub.text}. Error: {str(e)}")
130
- # Keep original text if translation fails
131
-
132
- output_path = os.path.join(OUTPUT_DIR, f"subtitles_{lang_code}.srt")
133
- logger.info(f"Saving translated subtitles to: {output_path}")
134
- translated_subs.save(output_path, encoding='utf-8')
135
- results[lang_code] = output_path
136
-
137
- return results
138
- except Exception as e:
139
- logger.error(f"Translation failed: {str(e)}", exc_info=True)
140
- raise Exception(f"Translation failed: {str(e)}")
141
 
142
- def generate_translated_audio(srt_path, target_lang):
143
- """Generate translated audio using Coqui TTS"""
144
- try:
145
- logger.info(f"Generating translated audio for {target_lang}")
146
- subs = pysrt.open(srt_path, encoding="utf-8")
147
- translated_text = [sub.text for sub in subs]
148
-
149
- # Create temporary directory for audio chunks
150
- temp_dir = os.path.join(OUTPUT_DIR, f"temp_audio_{target_lang}")
151
- os.makedirs(temp_dir, exist_ok=True)
152
-
153
- # Generate TTS for each subtitle
154
- audio_files = []
155
- timings = []
156
-
157
- # Get the appropriate TTS model
158
- tts = tts_models.get(target_lang)
159
- if tts is None:
160
- raise Exception(f"No TTS model available for language: {target_lang}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
 
162
- for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} speech")):
163
- text = sub.text.strip()
164
- if not text:
165
- continue
166
-
167
- # Get timing information
168
- start_time = (sub.start.hours * 3600 +
169
- sub.start.minutes * 60 +
170
- sub.start.seconds +
171
- sub.start.milliseconds / 1000)
172
-
173
- end_time = (sub.end.hours * 3600 +
174
- sub.end.minutes * 60 +
175
- sub.end.seconds +
176
- sub.end.milliseconds / 1000)
177
-
178
- duration = end_time - start_time
179
-
180
- # Generate TTS audio
181
- audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.wav")
182
-
183
  try:
184
- # For multi-speaker models, we might need to specify speaker
185
- tts.tts_to_file(text=text, file_path=audio_file)
186
-
187
- if os.path.exists(audio_file) and os.path.getsize(audio_file) > 0:
188
- audio_files.append(audio_file)
189
- timings.append((start_time, end_time, duration, audio_file))
190
- else:
191
- logger.warning(f"Generated audio file is empty or does not exist: {audio_file}")
192
-
193
  except Exception as e:
194
- logger.warning(f"Failed to generate TTS for: {text}. Error: {str(e)}")
195
-
196
- # Check if we actually generated any audio files
197
- if not audio_files:
198
- logger.warning(f"No audio files were generated for {target_lang}")
199
- # Create a silent audio file as fallback
200
- silent_audio = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
201
- silent_cmd = [
202
- 'ffmpeg',
203
- '-f', 'lavfi',
204
- '-i', f'anullsrc=r=44100:cl=stereo',
205
- '-t', '180', # 3 minutes default
206
- '-q:a', '0',
207
- '-y',
208
- silent_audio
209
- ]
210
- subprocess.run(silent_cmd, capture_output=True)
211
- return silent_audio
212
-
213
- # Create a silent audio track the same length as the original video
214
- silence_file = os.path.join(temp_dir, "silence.wav")
215
- try:
216
- video_duration_cmd = [
217
- 'ffprobe',
218
- '-v', 'error',
219
- '-show_entries', 'format=duration',
220
- '-of', 'default=noprint_wrappers=1:nokey=1',
221
- os.path.join(OUTPUT_DIR, "base_video.mp4")
222
- ]
223
-
224
- duration_result = subprocess.run(video_duration_cmd, capture_output=True, text=True)
225
- video_duration = float(duration_result.stdout.strip())
226
- except Exception as e:
227
- logger.warning(f"Could not determine video duration: {str(e)}. Using default of 180 seconds.")
228
- video_duration = 180.0
229
-
230
- # Create silent audio track
231
- silent_cmd = [
232
- 'ffmpeg',
233
- '-f', 'lavfi',
234
- '-i', f'anullsrc=r=44100:cl=stereo',
235
- '-t', str(video_duration),
236
- '-q:a', '0',
237
- '-y',
238
- silence_file
239
- ]
240
- subprocess.run(silent_cmd, capture_output=True)
241
-
242
- # Create a file with the audio mixing commands
243
- filter_complex = []
244
- input_count = 1 # Starting with 1 because 0 is the silence track
245
-
246
- # Start with silent track
247
- filter_parts = ["[0:a]"]
248
-
249
- # Add each audio segment
250
- for start_time, end_time, duration, audio_file in timings:
251
- filter_parts.append(f"[{input_count}:a]adelay={int(start_time*1000)}|{int(start_time*1000)}")
252
- input_count += 1
253
-
254
- # Mix all audio tracks
255
- filter_parts.append(f"amix=inputs={input_count}:dropout_transition=0:normalize=0[aout]")
256
- filter_complex = ";".join(filter_parts)
257
-
258
- # Build the ffmpeg command with all audio chunks
259
- cmd = ['ffmpeg', '-y']
260
-
261
- # Add silent base track
262
- cmd.extend(['-i', silence_file])
263
-
264
- # Add all audio chunks
265
- for audio_file in audio_files:
266
- cmd.extend(['-i', audio_file])
267
-
268
- # Add filter complex and output
269
- output_audio = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
270
- cmd.extend([
271
- '-filter_complex', filter_complex,
272
- '-map', '[aout]',
273
- output_audio
274
- ])
275
-
276
- # Run the command
277
- logger.info(f"Combining audio segments: {' '.join(cmd)}")
278
- process = subprocess.run(cmd, capture_output=True)
279
-
280
- if process.returncode != 0:
281
- logger.error(f"Audio combination failed: {process.stderr}")
282
- # Create a fallback silent audio as last resort
283
- silent_audio = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
284
- silent_cmd = [
285
- 'ffmpeg',
286
- '-f', 'lavfi',
287
- '-i', f'anullsrc=r=44100:cl=stereo',
288
- '-t', str(video_duration),
289
- '-q:a', '0',
290
- '-y',
291
- silent_audio
292
- ]
293
- subprocess.run(silent_cmd, capture_output=True)
294
- output_audio = silent_audio
295
-
296
- # Verify the output file exists
297
- if not os.path.exists(output_audio):
298
- logger.error(f"Output audio file does not exist: {output_audio}")
299
- # Create emergency fallback
300
- silent_audio = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
301
- silent_cmd = [
302
- 'ffmpeg',
303
- '-f', 'lavfi',
304
- '-i', f'anullsrc=r=44100:cl=stereo',
305
- '-t', '180',
306
- '-q:a', '0',
307
- '-y',
308
- silent_audio
309
- ]
310
- subprocess.run(silent_cmd, capture_output=True)
311
- output_audio = silent_audio
312
-
313
- # Clean up temporary files
314
- try:
315
- shutil.rmtree(temp_dir)
316
- except Exception as e:
317
- logger.warning(f"Failed to clean up temp directory: {str(e)}")
318
 
319
- return output_audio
320
- except Exception as e:
321
- logger.error(f"Audio translation failed: {str(e)}", exc_info=True)
322
- # Create an emergency fallback silent audio
323
- try:
324
- silent_audio = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
325
- silent_cmd = [
326
- 'ffmpeg',
327
- '-f', 'lavfi',
328
- '-i', f'anullsrc=r=44100:cl=stereo',
329
- '-t', '180',
330
- '-q:a', '0',
331
- '-y',
332
- silent_audio
333
- ]
334
- subprocess.run(silent_cmd, capture_output=True)
335
- return silent_audio
336
- except:
337
- raise Exception(f"Audio translation failed: {str(e)}")
338
 
339
- def combine_video_audio_subtitles(video_path, audio_path, srt_path, output_path):
340
- """Combine video with translated audio and subtitles"""
341
- try:
342
- logger.info(f"Combining video, audio, and subtitles")
343
-
344
- # Verify that all input files exist
345
- if not os.path.exists(video_path):
346
- raise Exception(f"Video file does not exist: {video_path}")
347
- if not os.path.exists(audio_path):
348
- raise Exception(f"Audio file does not exist: {audio_path}")
349
- if not os.path.exists(srt_path):
350
- raise Exception(f"Subtitle file does not exist: {srt_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- logger.info(f"Input files verified: Video: {os.path.getsize(video_path)} bytes, Audio: {os.path.getsize(audio_path)} bytes, Subtitles: {os.path.getsize(srt_path)} bytes")
 
353
 
354
- # Create a safe version of the subtitle path
355
- safe_srt_path = srt_path.replace(" ", "\\ ").replace(":", "\\:")
356
-
357
- # Command to combine video with translated audio and subtitles
358
  try:
359
- # Attempt method 1: Using subtitles filter
360
- cmd = [
361
- 'ffmpeg',
362
- '-i', video_path, # Input video
363
- '-i', audio_path, # Input translated audio
364
- '-map', '0:v', # Use video from first input
365
- '-map', '1:a', # Use audio from second input
366
- '-vf', f"subtitles={safe_srt_path}:force_style='FontSize=24,PrimaryColour=&H00FFFFFF,OutlineColour=&H00000000,BorderStyle=3'", # Burn subtitles
367
- '-c:v', 'libx264', # Video codec
368
- '-c:a', 'aac', # Audio codec
369
- '-shortest', # End when shortest input ends
370
- '-y', # Overwrite output file
371
- output_path
372
- ]
373
-
374
- logger.info(f"Running command: {' '.join(cmd)}")
375
- process = subprocess.run(cmd, capture_output=True, text=True)
376
-
377
- if process.returncode != 0:
378
- logger.warning(f"First method failed: {process.stderr}")
379
- raise Exception("First method failed")
380
-
381
  except Exception as e:
382
- logger.warning(f"First method failed: {str(e)}")
383
-
384
- try:
385
- # Attempt method 2: Using hardcoded subtitles approach
386
- temp_srt_dir = os.path.join(OUTPUT_DIR, "temp_srt")
387
- os.makedirs(temp_srt_dir, exist_ok=True)
388
-
389
- # Copy the SRT file to the temp directory
390
- temp_srt_path = os.path.join(temp_srt_dir, "temp.srt")
391
- shutil.copy(srt_path, temp_srt_path)
392
-
393
- cmd = [
394
- 'ffmpeg',
395
- '-i', video_path,
396
- '-i', audio_path,
397
- '-map', '0:v',
398
- '-map', '1:a',
399
- '-vf', f"subtitles={temp_srt_path}",
400
- '-c:v', 'libx264',
401
- '-c:a', 'aac',
402
- '-shortest',
403
- '-y',
404
- output_path
405
- ]
406
-
407
- logger.info(f"Running second method: {' '.join(cmd)}")
408
- process = subprocess.run(cmd, capture_output=True, text=True)
409
-
410
- if process.returncode != 0:
411
- logger.warning(f"Second method failed: {process.stderr}")
412
- raise Exception("Second method failed")
413
-
414
- # Clean up temp directory
415
- shutil.rmtree(temp_srt_dir)
416
-
417
- except Exception as e:
418
- logger.warning(f"Second method failed: {str(e)}")
419
-
420
- # Attempt method 3: No subtitles as last resort
421
- cmd = [
422
- 'ffmpeg',
423
- '-i', video_path,
424
- '-i', audio_path,
425
- '-map', '0:v',
426
- '-map', '1:a',
427
- '-c:v', 'libx264',
428
- '-c:a', 'aac',
429
- '-shortest',
430
- '-y',
431
- output_path
432
- ]
433
-
434
- logger.info(f"Running fallback method (no subtitles): {' '.join(cmd)}")
435
- process = subprocess.run(cmd, capture_output=True, text=True)
436
-
437
- if process.returncode != 0:
438
- logger.error(f"All methods failed: {process.stderr}")
439
- raise Exception(f"Failed to combine video and audio: {process.stderr}")
440
- else:
441
- logger.warning("Created video without subtitles as fallback")
442
-
443
- # Verify the output file exists and has a reasonable size
444
- if not os.path.exists(output_path):
445
- raise Exception(f"Output file does not exist: {output_path}")
446
-
447
- if os.path.getsize(output_path) < 1000:
448
- raise Exception(f"Output file is too small: {os.path.getsize(output_path)} bytes")
449
-
450
- logger.info(f"Successfully created output file: {output_path} ({os.path.getsize(output_path)} bytes)")
451
- return output_path
452
- except Exception as e:
453
- logger.error(f"Combining failed: {str(e)}", exc_info=True)
454
- raise Exception(f"Combining failed: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
- def process_video(video_file, source_lang, target_langs, progress=gr.Progress()):
457
- """Process video with translation of both subtitles and audio"""
 
 
 
 
 
 
 
458
  try:
459
- progress(0.05, "Starting processing...")
460
- logger.info(f"Processing video: {video_file}")
461
 
462
- # Make sure we have ffmpeg installed
463
- try:
464
- subprocess.run(['ffmpeg', '-version'], capture_output=True, check=True)
465
- logger.info("ffmpeg is installed and working")
466
- except (subprocess.SubprocessError, FileNotFoundError):
467
- error_msg = "ffmpeg is not installed or not in PATH. Please install ffmpeg."
468
- logger.error(error_msg)
469
- return None, error_msg
470
-
471
- # Extract audio
472
  progress(0.1, "Extracting audio...")
473
  audio_path = extract_audio(video_file)
474
 
475
- # Generate subtitles
476
- progress(0.25, "Generating subtitles...")
477
- srt_path = generate_subtitles(audio_path)
 
478
 
479
- # Translate subtitles
480
- progress(0.4, "Translating subtitles...")
481
- target_lang_codes = [LANGUAGES[lang] for lang in target_langs]
482
- translated_subs = translate_subtitles(srt_path, target_lang_codes)
 
 
483
 
484
- # Create a copy of the video file in our output directory
 
 
 
 
485
  base_video = os.path.join(OUTPUT_DIR, "base_video.mp4")
486
  shutil.copy(video_file, base_video)
487
 
488
- # Process each target language
489
- output_videos = []
490
-
491
- for i, (lang_code, sub_path) in enumerate(translated_subs.items()):
492
- lang_name = next(name for name, code in LANGUAGES.items() if code == lang_code)
493
- progress(0.5 + (i * 0.5 / len(translated_subs)), f"Processing {lang_name}...")
494
 
495
- try:
496
- # Generate translated audio
497
- logger.info(f"Generating translated audio for {lang_code}")
498
- translated_audio = generate_translated_audio(sub_path, lang_code)
499
-
500
- # Verify audio file exists
501
- if not os.path.exists(translated_audio):
502
- logger.error(f"Translated audio file does not exist: {translated_audio}")
503
- continue
504
-
505
- # Combine video, translated audio, and subtitles
506
- output_path = os.path.join(OUTPUT_DIR, f"output_{lang_code}.mp4")
507
- logger.info(f"Creating final video with {lang_code} audio and subtitles")
508
-
509
- output_video = combine_video_audio_subtitles(
510
- base_video,
511
- translated_audio,
512
- sub_path,
513
- output_path
514
- )
515
-
516
- # Verify the output file exists and has content
517
- if os.path.exists(output_video) and os.path.getsize(output_video) > 1000:
518
- logger.info(f"Successfully created output file: {output_video}")
519
- output_videos.append(output_video)
520
- else:
521
- logger.warning(f"Output file is missing or too small: {output_video}")
522
- except Exception as e:
523
- logger.error(f"Failed to process {lang_code}: {str(e)}")
524
-
525
- # If all output videos failed, return the original
526
- if not output_videos:
527
- logger.warning("All translations failed, returning original video")
528
- return base_video, "Failed to translate video, returning original"
529
 
530
- progress(1.0, "Done!")
531
- message = f"Processing complete. Created {len(output_videos)} translated videos."
532
- logger.info(message)
533
- return output_videos[0], message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
534
 
 
 
 
535
  except Exception as e:
536
  logger.error(f"Processing failed: {str(e)}", exc_info=True)
537
- return None, f"Processing failed: {str(e)}"
538
 
539
- with gr.Blocks() as demo:
540
- gr.Markdown("# Complete Video Translation System")
541
- gr.Markdown("Translates both subtitles and audio to target languages")
542
-
543
- with gr.Row():
544
- with gr.Column(scale=1):
545
- video_input = gr.Video(label="Upload Video")
546
- source_lang = gr.Dropdown(
547
- label="Source Language",
548
- choices=list(LANGUAGES.keys()),
549
- value="English"
550
- )
551
- target_langs = gr.CheckboxGroup(
552
- label="Target Languages (Both Audio & Subtitles)",
553
- choices=list(LANGUAGES.keys()),
554
- value=["Spanish"]
555
- )
556
- submit_btn = gr.Button("Translate", variant="primary")
557
-
558
- with gr.Column(scale=2):
559
- output_video = gr.Video(label="Translated Video")
560
- status_text = gr.Textbox(label="Status", interactive=False)
561
- output_info = gr.Markdown("Output videos will be saved in the 'outputs' directory")
562
 
563
- submit_btn.click(
564
- process_video,
565
- inputs=[video_input, source_lang, target_langs],
566
- outputs=[output_video, status_text]
567
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
 
569
  if __name__ == "__main__":
570
- # Check dependencies at startup
571
- missing_deps = []
572
-
573
- # Check ffmpeg
574
- try:
575
- version_info = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True)
576
- ffmpeg_version = version_info.stdout.split('\n')[0]
577
- logger.info(f"ffmpeg version: {ffmpeg_version}")
578
- except:
579
- logger.warning("ffmpeg not found - required for video processing")
580
- missing_deps.append("ffmpeg")
581
-
582
- # Check Python dependencies
583
- try:
584
- import assemblyai
585
- logger.info("AssemblyAI package found")
586
- except ImportError:
587
- logger.warning("AssemblyAI package not found - required for transcription")
588
- missing_deps.append("assemblyai")
589
-
590
- try:
591
- import TTS
592
- logger.info("Coqui TTS package found")
593
- except ImportError:
594
- logger.warning("Coqui TTS package not found - required for text-to-speech")
595
- missing_deps.append("TTS")
596
-
597
  try:
598
- import deep_translator
599
- logger.info("deep_translator package found")
600
- except ImportError:
601
- logger.warning("deep_translator package not found - required for translation")
602
- missing_deps.append("deep_translator")
603
-
604
- # Print installation instructions if dependencies are missing
605
- if missing_deps:
606
- logger.warning("Missing dependencies detected. Please install:")
607
- if "ffmpeg" in missing_deps:
608
- logger.warning("- ffmpeg: https://ffmpeg.org/download.html")
609
-
610
- python_deps = [dep for dep in missing_deps if dep != "ffmpeg"]
611
- if python_deps:
612
- deps_str = " ".join(python_deps)
613
- logger.warning(f"- Python packages: pip install {deps_str}")
614
-
615
- # Start the app
616
- demo.launch()
 
1
import copy
import logging
import os
import shutil
import subprocess
import time
from typing import Dict, List, Optional

import gradio as gr
import pysrt
import torch
import webvtt
import whisper  # Free speech-to-text
from deep_translator import GoogleTranslator
from tqdm import tqdm
from TTS.api import TTS
15
 
16
  # Set up logging
17
  logging.basicConfig(level=logging.INFO,
18
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
19
  logger = logging.getLogger(__name__)
20
 
21
  # Configuration
 
22
  LANGUAGES = {
23
+ "English": {"code": "en", "speakers": ["default"], "whisper": "en"},
24
+ "Spanish": {"code": "es", "speakers": ["default"], "whisper": "es"},
25
+ "French": {"code": "fr", "speakers": ["default"], "whisper": "fr"},
26
+ "German": {"code": "de", "speakers": ["thorsten", "eva_k"], "whisper": "de"},
27
+ "Japanese": {"code": "ja", "speakers": ["default"], "whisper": "ja"},
28
+ "Hindi": {"code": "hi", "speakers": ["default"], "whisper": "hi"}
29
  }
30
 
31
+ SUBTITLE_STYLES = {
32
+ "Default": "",
33
+ "White Text": "color: white;",
34
+ "Yellow Text": "color: yellow;",
35
+ "Large Text": "font-size: 24px;",
36
+ "Bold Text": "font-weight: bold;",
37
+ "Black Background": "background-color: black; padding: 5px;"
 
38
  }
39
 
40
+ # Create output directory
41
+ OUTPUT_DIR = "outputs"
42
  os.makedirs(OUTPUT_DIR, exist_ok=True)
43
 
44
# Initialize TTS
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model name per language code. Models are loaded one at a time below so a
# single failing download/load only disables that language (the entry is
# simply missing and tts_models.get(...) returns None) instead of crashing
# the whole app at import time, as the previous unguarded dict literal did.
_TTS_MODEL_NAMES = {
    "en": "tts_models/en/ljspeech/tacotron2-DDC",
    "es": "tts_models/es/css10/vits",
    "fr": "tts_models/fr/css10/vits",
    "de": "tts_models/de/thorsten/tacotron2-DDC",
    "ja": "tts_models/ja/kokoro/tacotron2-DDC",
    "hi": "tts_models/hi/kb/tacotron2-DDC",
}

tts_models = {}
for _lang_code, _model_name in _TTS_MODEL_NAMES.items():
    try:
        tts_models[_lang_code] = TTS(_model_name).to(device)
    except Exception as e:
        # Best-effort: log and continue so the remaining languages still work.
        logger.warning("Failed to load TTS model for %s (%s): %s", _lang_code, _model_name, e)
 
 
54
 
55
# Initialize Whisper (load when needed)
whisper_model = None

def get_whisper_model():
    """Return the shared Whisper model, loading the "small" checkpoint lazily on first use."""
    global whisper_model
    if whisper_model is not None:
        return whisper_model
    whisper_model = whisper.load_model("small")
    return whisper_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
def extract_audio(video_path: str) -> str:
    """Extract a mono 16 kHz PCM WAV track from *video_path* with ffmpeg.

    Returns the path of the extracted WAV file inside OUTPUT_DIR.
    Raises RuntimeError with ffmpeg's stderr when extraction fails —
    the previous bare `check=True` (without captured output) lost the
    actual ffmpeg error message.
    """
    audio_path = os.path.join(OUTPUT_DIR, "audio.wav")
    cmd = [
        'ffmpeg', '-i', video_path, '-vn',
        '-acodec', 'pcm_s16le', '-ar', '16000',
        '-ac', '1', '-y', audio_path
    ]
    # Capture output so ffmpeg chatter stays out of the console and the
    # real failure reason can be surfaced to the caller.
    process = subprocess.run(cmd, capture_output=True, text=True)
    if process.returncode != 0:
        raise RuntimeError(f"Audio extraction failed: {process.stderr}")
    return audio_path
 
 
 
 
 
 
 
74
 
75
def transcribe_with_whisper(audio_path: str, language: Optional[str] = None) -> dict:
    """Transcribe *audio_path* with the shared Whisper model.

    Returns the full Whisper result dict (contains at least "text",
    "segments" and "language") — the previous `-> str` annotation did not
    match the value actually returned.  `language=None` lets Whisper
    auto-detect the spoken language.
    """
    model = get_whisper_model()
    # word_timestamps=True yields per-word timing used for subtitle cues.
    return model.transcribe(audio_path, language=language, word_timestamps=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
def generate_srt_from_whisper(audio_path: str, language: str) -> str:
    """Transcribe *audio_path* and write the segments out as an SRT file.

    Returns the path of the generated subtitle file in OUTPUT_DIR.
    """
    transcription = transcribe_with_whisper(audio_path, language)

    srt_file = pysrt.SubRipFile()
    for index, segment in enumerate(transcription["segments"], start=1):
        item = pysrt.SubRipItem(
            index=index,
            start=pysrt.SubRipTime(seconds=segment["start"]),
            end=pysrt.SubRipTime(seconds=segment["end"]),
            text=segment["text"],
        )
        srt_file.append(item)

    srt_path = os.path.join(OUTPUT_DIR, "subtitles.srt")
    srt_file.save(srt_path, encoding='utf-8')
    return srt_path
97
+
98
def detect_language(audio_path: str) -> str:
    """Detect the spoken language of *audio_path* via Whisper.

    Returns the matching LANGUAGES display name, falling back to
    "English" when the detected code is not configured.
    """
    detected_code = transcribe_with_whisper(audio_path)["language"]
    matches = (name for name, cfg in LANGUAGES.items() if cfg["whisper"] == detected_code)
    return next(matches, "English")
106
+
107
def translate_subtitles(srt_path: str, target_langs: List[str]) -> Dict[str, str]:
    """Translate the SRT file at *srt_path* into each language in *target_langs*.

    Returns a mapping of language code -> path of the translated SRT file.

    Fix: the previous `subs[:]` shallow copy shared the SubRipItem objects
    between languages, so the second target language translated text that
    had already been translated into the first, and the source list was
    mutated.  Each language now works on a deep copy.
    """
    subs = pysrt.open(srt_path)
    results: Dict[str, str] = {}

    for lang_name in target_langs:
        lang_code = LANGUAGES[lang_name]["code"]
        # Deep copy so translating one language never mutates the original
        # cues or another language's output.
        translated_subs = copy.deepcopy(subs)
        translator = GoogleTranslator(source='auto', target=lang_code)

        for sub in translated_subs:
            try:
                sub.text = translator.translate(sub.text)
            except Exception as e:
                # Best-effort: keep the untranslated cue instead of dropping it.
                logger.warning(f"Translation failed: {str(e)}")
                continue

        output_path = os.path.join(OUTPUT_DIR, f"subtitles_{lang_code}.srt")
        translated_subs.save(output_path, encoding='utf-8')
        results[lang_code] = output_path

    return results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
def generate_webvtt_subtitles(srt_path: str, style: str = "") -> str:
    """Convert an SRT file to WebVTT, optionally embedding a ::cue style block.

    The language code is recovered from the `subtitles_<code>.srt` filename
    convention; the .vtt file is written next to it in OUTPUT_DIR.
    """
    subs = pysrt.open(srt_path)
    base_name = os.path.basename(srt_path)
    lang_code = base_name.split('_')[-1].replace('.srt', '')
    vtt_path = os.path.join(OUTPUT_DIR, f"subtitles_{lang_code}.vtt")

    chunks = ["WEBVTT\n\n"]
    if style:
        chunks.append(f"STYLE\n::cue {{\n{style}\n}}\n\n")
    for sub in subs:
        # SubRipTime -> datetime.time -> HH:MM:SS.mmm (trim microseconds to ms).
        start = sub.start.to_time().strftime('%H:%M:%S.%f')[:-3]
        end = sub.end.to_time().strftime('%H:%M:%S.%f')[:-3]
        chunks.append(f"{start} --> {end}\n{sub.text}\n\n")

    with open(vtt_path, 'w', encoding='utf-8') as f:
        f.writelines(chunks)

    return vtt_path
148
+
149
def generate_translated_audio(
    srt_path: str,
    target_lang: str,
    speaker: str = "default"
) -> str:
    """Synthesize TTS audio for each subtitle cue and mix it onto a silent base track.

    Returns the path of the mixed WAV file for *target_lang*.
    Raises Exception when no TTS model exists for the language or when no
    audio chunk could be generated at all.

    Fixes over the previous version:
    - `tts_models.get()` result is checked; a missing model used to crash
      later with AttributeError.
    - the filter_complex string was malformed (self-referencing labels);
      it is now a valid adelay-per-chunk + single amix graph.
    - `f'-i {f}'` was passed as ONE argv token and the whole command was
      joined and run with shell=True — broken for paths with spaces and a
      shell-injection risk.  The command is now a proper argument list.
    """
    tts = tts_models.get(target_lang)
    if tts is None:
        raise Exception(f"No TTS model available for language: {target_lang}")

    subs = pysrt.open(srt_path)
    temp_dir = os.path.join(OUTPUT_DIR, f"temp_audio_{target_lang}")
    os.makedirs(temp_dir, exist_ok=True)

    audio_files = []
    timings = []  # (start time in seconds, chunk path)

    for i, sub in enumerate(tqdm(subs, desc=f"Generating {target_lang} audio")):
        text = sub.text.strip()
        if not text:
            continue

        start_time = sub.start.ordinal / 1000  # ordinal is milliseconds
        audio_file = os.path.join(temp_dir, f"chunk_{i:04d}.wav")

        try:
            # Only pass a speaker for multi-speaker models.
            kwargs = {"speaker": speaker} if speaker != "default" and hasattr(tts, 'synthesizer') else {}
            tts.tts_to_file(text=text, file_path=audio_file, **kwargs)
            audio_files.append(audio_file)
            timings.append((start_time, audio_file))
        except Exception as e:
            # Best-effort: skip the failing cue, keep the rest.
            logger.warning(f"TTS failed: {str(e)}")

    if not audio_files:
        raise Exception("No audio generated")

    # Silent base track as long as the video so delayed chunks fit inside it.
    video_duration = get_video_duration(os.path.join(OUTPUT_DIR, "base_video.mp4"))
    silence_file = os.path.join(temp_dir, "silence.wav")
    subprocess.run([
        'ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=stereo',
        '-t', str(video_duration), '-y', silence_file
    ], check=True)

    # Delay each chunk to its cue start, then mix silence + all chunks once.
    delay_filters = [
        f"[{idx + 1}:a]adelay={int(start * 1000)}|{int(start * 1000)}[d{idx}]"
        for idx, (start, _) in enumerate(timings)
    ]
    mix_inputs = "[0:a]" + "".join(f"[d{idx}]" for idx in range(len(timings)))
    filter_complex = ";".join(delay_filters + [
        f"{mix_inputs}amix=inputs={len(timings) + 1}:dropout_transition=0:normalize=0[aout]"
    ])

    output_audio = os.path.join(OUTPUT_DIR, f"translated_audio_{target_lang}.wav")
    cmd = ['ffmpeg', '-y', '-i', silence_file]
    for audio_file in audio_files:
        cmd.extend(['-i', audio_file])  # one argv entry per token, no shell
    cmd.extend(['-filter_complex', filter_complex, '-map', '[aout]', output_audio])
    subprocess.run(cmd, check=True)

    shutil.rmtree(temp_dir)
    return output_audio
206
+
207
def get_video_duration(video_path: str) -> float:
    """Return the duration of *video_path* in seconds, probed with ffprobe.

    Falls back to 180.0 seconds when ffprobe is not installed, fails, or
    reports nothing usable — the previous version raised an unhandled
    FileNotFoundError when the ffprobe binary was missing.
    """
    try:
        result = subprocess.run([
            'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
            '-of', 'default=noprint_wrappers=1:nokey=1', video_path
        ], capture_output=True, text=True)
        # Empty stdout (probe failed) falls through to the 180-second default.
        return float(result.stdout.strip() or 180)
    except (OSError, ValueError) as e:
        logging.getLogger(__name__).warning(f"Could not determine video duration: {str(e)}")
        return 180.0
214
+
215
def create_html_player(
    video_path: str,
    subtitle_paths: Dict[str, str],
    style: str = ""
) -> str:
    """Write a standalone HTML5 player page next to the outputs.

    Args:
        video_path: path of the MP4 to embed; only its basename is used, so
            the page assumes the video sits in the same directory.
        subtitle_paths: mapping of language code -> .vtt file path; each
            becomes a <track> element (basenames again — same-directory
            assumption) plus a download link.
        style: optional CSS declarations injected as a video::cue rule.

    Returns:
        Path of the generated player.html inside OUTPUT_DIR.
    """
    html_path = os.path.join(OUTPUT_DIR, "player.html")
    video_name = os.path.basename(video_path)

    # One <track> per language; English is marked as the default track.
    subtitle_tracks = "\n".join(
        f'<track kind="subtitles" src="{os.path.basename(path)}" '
        f'srclang="{lang}" label="{lang.capitalize()}" '
        f'{"default" if lang == "en" else ""}>'
        for lang, path in subtitle_paths.items()
    )

    style_block = f"video::cue {{ {style} }}" if style else ""

    html_content = f"""<!DOCTYPE html>
    <html>
    <head>
    <title>Video Player</title>
    <style>
    body {{ font-family: Arial, sans-serif; margin: 20px; }}
    .container {{ max-width: 800px; margin: 0 auto; }}
    video {{ width: 100%; background: #000; }}
    .downloads {{ margin-top: 20px; }}
    {style_block}
    </style>
    </head>
    <body>
    <div class="container">
    <h2>Video Player with Subtitles</h2>
    <video controls>
    <source src="{video_name}" type="video/mp4">
    {subtitle_tracks}
    </video>

    <div class="downloads">
    <h3>Download Subtitles:</h3>
    {"".join(
    f'<a href="{os.path.basename(path)}" download>'
    f'{lang.upper()} Subtitles (.vtt)</a><br>'
    for lang, path in subtitle_paths.items()
    )}
    </div>
    </div>
    </body>
    </html>"""

    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html_content)

    return html_path
269
 
270
def process_video(
    video_file: str,
    source_lang: str,
    target_langs: List[str],
    subtitle_style: str,
    speaker_settings: Dict[str, str],
    progress: gr.Progress = gr.Progress()
) -> tuple:
    """Run the full translation pipeline on one uploaded video.

    Pipeline: extract audio -> (auto-)detect source language -> transcribe to
    SRT -> translate subtitles -> per target language, synthesize dubbed audio,
    emit styled WebVTT, and mux the dub into a copy of the video -> finally
    write an HTML player bundling everything.

    Args:
        video_file: Path to the uploaded source video.
        source_lang: Display name of the source language, or "Auto-detect".
        target_langs: Display names of the requested target languages.
        subtitle_style: Key into SUBTITLE_STYLES selecting cue CSS.
        speaker_settings: Mapping of language code -> preferred TTS speaker.
        progress: Gradio progress reporter (injected by the UI).

    Returns:
        A 2-tuple ``(output_files, status_message)``; ``output_files`` is
        ``None`` on failure.  (The previous annotation claimed ``List[str]``,
        but the function has always returned a tuple — corrected here.)
    """
    try:
        progress(0.05, "Initializing...")

        # 1. Extract audio
        progress(0.1, "Extracting audio...")
        audio_path = extract_audio(video_file)

        # 2. Detect language if needed
        if source_lang == "Auto-detect":
            source_lang = detect_language(audio_path)
            progress(0.15, f"Detected language: {source_lang}")

        # 3. Generate subtitles via Whisper, using the model's language code.
        progress(0.2, "Generating subtitles...")
        srt_path = generate_srt_from_whisper(
            audio_path,
            LANGUAGES[source_lang]["whisper"]
        )

        # 4. Translate subtitles
        progress(0.3, "Translating subtitles...")
        translated_subs = translate_subtitles(srt_path, target_langs)

        # 5. Save original video into OUTPUT_DIR so the HTML player can
        # reference it by basename.
        base_video = os.path.join(OUTPUT_DIR, "base_video.mp4")
        shutil.copy(video_file, base_video)

        # 6. Process each target language
        translated_vtts = {}
        for i, lang_name in enumerate(target_langs, 1):
            lang_code = LANGUAGES[lang_name]["code"]
            # Progress spans 0.4 -> 0.9 across the target languages.
            progress(0.4 + (i * 0.5 / len(target_langs)), f"Processing {lang_name}...")

            # Generate dubbed audio for this language.
            translated_audio = generate_translated_audio(
                translated_subs[lang_code],
                lang_code,
                speaker_settings.get(lang_code, "default")
            )

            # Generate styled WebVTT subtitles.
            vtt_path = generate_webvtt_subtitles(
                translated_subs[lang_code],
                SUBTITLE_STYLES.get(subtitle_style, "")
            )
            translated_vtts[lang_code] = vtt_path

            # Create translated video version: copy the video stream, swap
            # in the synthesized audio track re-encoded as AAC.
            output_video = os.path.join(OUTPUT_DIR, f"output_{lang_code}.mp4")
            subprocess.run([
                'ffmpeg', '-i', base_video, '-i', translated_audio,
                '-map', '0:v', '-map', '1:a', '-c:v', 'copy', '-c:a', 'aac',
                '-y', output_video
            ], check=True)

        # 7. Create HTML player
        progress(0.9, "Creating HTML player...")
        html_path = create_html_player(
            base_video,
            translated_vtts,
            SUBTITLE_STYLES.get(subtitle_style, "")
        )

        # Collect every artifact for the Gradio Files component.
        output_files = [html_path, base_video] + \
            list(translated_vtts.values()) + \
            [os.path.join(OUTPUT_DIR, f"output_{LANGUAGES[lang]['code']}.mp4")
             for lang in target_langs]

        progress(1.0, "Done!")
        return output_files, "Processing completed successfully!"

    except Exception as e:
        # Top-level pipeline boundary: log with traceback, surface a short
        # message to the UI instead of crashing the app.
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        return None, f"Error: {str(e)}"
354
 
355
def get_speaker_settings(*args) -> Dict[str, str]:
    """Map positional speaker selections onto their language codes.

    Inputs arrive in LANGUAGES order; falsy values (empty placeholder
    textboxes, unset dropdowns) are skipped.  Extra languages beyond the
    supplied args are ignored, matching the previous bounds check.
    """
    return {
        LANGUAGES[lang_name]["code"]: choice
        for lang_name, choice in zip(LANGUAGES, args)
        if choice
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
362
 
363
def create_interface():
    """Build the Gradio Blocks UI and wire its events.

    Fixes over the previous revision:
      * ``gr.Dropdown.update(...)`` (removed in Gradio 4) replaced with the
        component-agnostic ``gr.update(...)``.
      * The submit handler previously received
        ``gr.State(lambda: get_speaker_settings(*speaker_inputs))`` — a
        closure over the *component objects*, so the user's live speaker
        choices never reached ``process_video``.  The speaker components are
        now real event inputs, combined into a dict inside a wrapper.
    """
    with gr.Blocks(title="Video Translator") as demo:
        gr.Markdown("# Free Video Translation System")
        gr.Markdown("Translate videos with subtitles and audio dubbing using free/open-source tools")

        with gr.Row():
            with gr.Column(scale=1):
                video_input = gr.Video(label="Upload Video")

                with gr.Accordion("Source Settings", open=True):
                    source_lang = gr.Dropdown(
                        label="Source Language",
                        choices=["Auto-detect"] + list(LANGUAGES.keys()),
                        value="Auto-detect"
                    )

                with gr.Accordion("Target Languages", open=True):
                    target_langs = gr.CheckboxGroup(
                        label="Select target languages",
                        choices=list(LANGUAGES.keys()),
                        value=["English", "Spanish"]
                    )

                with gr.Accordion("Subtitle Styling", open=False):
                    subtitle_style = gr.Dropdown(
                        label="Subtitle Appearance",
                        choices=list(SUBTITLE_STYLES.keys()),
                        value="Default"
                    )

                with gr.Accordion("Voice Settings", open=False):
                    # One hidden control per language so positional indices
                    # line up with LANGUAGES ordering: a dropdown when the
                    # model offers several voices, else a placeholder textbox.
                    speaker_inputs = []
                    for lang_name in LANGUAGES.keys():
                        speakers = LANGUAGES[lang_name]["speakers"]
                        if len(speakers) > 1:
                            speaker_inputs.append(
                                gr.Dropdown(
                                    label=f"{lang_name} Speaker",
                                    choices=speakers,
                                    value=speakers[0],
                                    visible=False
                                )
                            )
                        else:
                            speaker_inputs.append(gr.Textbox(visible=False))

                submit_btn = gr.Button("Translate Video", variant="primary")

            with gr.Column(scale=2):
                output_files = gr.Files(label="Download Files")
                status = gr.Textbox(label="Status")

        gr.Markdown("""
        **Instructions:**
        1. Upload a video file
        2. Select source and target languages
        3. Customize subtitles and voices
        4. Click Translate
        5. Download the HTML player and open in browser
        """)

        def update_speaker_ui(selected_langs):
            # Show a speaker picker only for selected languages that actually
            # offer a choice of voices.
            updates = []
            for lang_name in LANGUAGES.keys():
                visible = lang_name in selected_langs and len(LANGUAGES[lang_name]["speakers"]) > 1
                updates.append(gr.update(visible=visible))
            return updates

        target_langs.change(
            update_speaker_ui,
            inputs=target_langs,
            outputs=speaker_inputs
        )

        def run_translation(video, src_lang, tgt_langs, style, *speaker_values):
            # Collect the live speaker selections into the mapping that
            # process_video expects.
            settings = get_speaker_settings(*speaker_values)
            return process_video(video, src_lang, tgt_langs, style, settings)

        submit_btn.click(
            run_translation,
            inputs=[video_input, source_lang, target_langs, subtitle_style] + speaker_inputs,
            outputs=[output_files, status]
        )

    return demo
451
 
452
if __name__ == "__main__":
    # Probe external requirements up front so the user gets an actionable
    # message instead of a mid-pipeline crash.
    try:
        # capture_output keeps ffmpeg's version banner off the console;
        # check=True still raises if the binary is missing or broken.
        subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
        import torch
        import whisper
        demo = create_interface()
        demo.launch()
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Please install all requirements: pip install -r requirements.txt")