testdeep123 committed
Commit 96c0d7e · verified · 1 Parent(s): 8f72d8b

Update app.py

Files changed (1)
  1. app.py +59 -147
app.py CHANGED
@@ -1,4 +1,3 @@
- # Import necessary libraries
  import os
  import re
  import time
@@ -9,21 +8,16 @@ import shutil
  import torch
  import numpy as np
  import soundfile as sf
- from PIL import Image, ImageDraw, ImageFont
  from pydub import AudioSegment
  from gtts import gTTS
- import whisper
+ import whisper # Ensure this is openai-whisper in requirements.txt
  import gradio as gr
  import requests
  import json
  from moviepy.editor import (
      VideoFileClip, concatenate_videoclips, AudioFileClip,
-     CompositeVideoClip, TextClip, CompositeAudioClip
+     CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
  )
- import subprocess
- import cv2
- import moviepy.config as mpy_config
- import moviepy.video.fx.all as vfx
  import logging

  # Set up logging
@@ -31,18 +25,15 @@ logging.basicConfig(level=logging.INFO,
                      format='%(asctime)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

- # Configure moviepy
- mpy_config.change_settings({"IMAGEMAGICK_BINARY": "convert"})
-
  # Global Configuration Variables
  OPENROUTER_API_KEY = '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'
  OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
- TARGET_RESOLUTION = (1080, 1920) # Fixed to vertical format for shorts
+ TARGET_RESOLUTION = (1080, 1920) # Vertical format for shorts
  OUTPUT_VIDEO_FILENAME = "final_video.mp4"
  TEMP_FOLDER = None
  CAPTION_COLOR = "white"

- # Additional global variables for the Gradio interface
+ # Additional global variables for Gradio interface
  selected_voice = 'en_us_001' # Default voice
  voice_speed = 1.0 # Default voice speed
  font_size = 45 # Default font size
@@ -50,7 +41,7 @@ bg_music_volume = 0.08 # Default background music volume
  fps = 30 # Default FPS
  preset = "veryfast" # Default preset

- # Initialize whisper model globally to avoid reloading
+ # Initialize whisper model globally
  whisper_model = None

  def load_whisper_model():
@@ -65,7 +56,6 @@ def load_whisper_model():
          logger.error(f"Failed to load Whisper model: {e}")
          return False

- # Helper Functions
  def generate_script(user_input):
      """Generate documentary script using OpenRouter API."""
      headers = {
@@ -94,31 +84,23 @@ Only output the script. No extra comments or text.
  Example:

  [Ocean]
-
  The ocean covers over seventy percent of the Earth's surface.

  [Currents]
-
  Ocean currents distribute heat and regulate global climate patterns.

  [Coral Reefs]
-
  These ecosystems support over one million species of marine life.

  [Pollution]
-
  Plastic waste threatens marine biodiversity and food chains.

  [Climate Impact]
-
  Rising temperatures are causing coral bleaching and habitat loss.

  [Subscribe]
-
  Follow to explore more about the changing planet we live on.

-
-
  Now here is the Topic: {user_input}
  """
 
@@ -147,7 +129,6 @@ Now here is the Topic: {user_input}
          else:
              logger.error(f"API Error {response.status_code}: {response.text}")
              return None
-
      except Exception as e:
          logger.error(f"Request failed: {str(e)}")
          return None
@@ -182,7 +163,7 @@ def parse_script(script_text):

          media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
          words = narration.split()
-         duration = max(3, len(words) * 0.5) # Estimate duration based on word count
+         duration = max(3, len(words) * 0.5) # Estimate duration
          tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
          elements.append(media_element)
          elements.append(tts_element)
@@ -205,7 +186,6 @@ def generate_tts(text, voice="en"):

          # Convert MP3 to WAV
          audio = AudioSegment.from_mp3(mp3_path)
-         # Adjust speed if needed
          if voice_speed != 1.0:
              audio = audio._spawn(audio.raw_data, overrides={
                  "frame_rate": int(audio.frame_rate * voice_speed)
@@ -220,7 +200,7 @@ def generate_tts(text, voice="en"):
          return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))

  def generate_silent_audio(duration, sample_rate=24000):
-     """Generate a silent WAV audio file lasting 'duration' seconds."""
+     """Generate a silent WAV audio file."""
      num_samples = int(duration * sample_rate)
      silence = np.zeros(num_samples, dtype=np.float32)
      silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
@@ -229,20 +209,14 @@ def generate_silent_audio(duration, sample_rate=24000):
      return silent_path

  def analyze_audio_with_whisper(audio_path):
-     """
-     Use Whisper to transcribe audio and generate word-level timestamps.
-     Returns a list of dictionaries with word, start_time, and end_time.
-     """
+     """Use Whisper to generate word-level timestamps."""
      try:
          if whisper_model is None:
              load_whisper_model()

          logger.info(f"Analyzing audio with Whisper: {audio_path}")
-
-         # Transcribe the audio file
          result = whisper_model.transcribe(audio_path, word_timestamps=True)

-         # Extract word-level segments
          word_segments = []
          for segment in result["segments"]:
              for word in segment["words"]:
@@ -259,24 +233,19 @@ def analyze_audio_with_whisper(audio_path):
          return []

  def get_video_clip_segment(video_path, start_time, duration):
-     """
-     Extract a segment from the video file starting at a random position,
-     but ensuring the segment is at least 'duration' seconds long.
-     """
+     """Extract a random video segment."""
      try:
          video = VideoFileClip(video_path)
          video_duration = video.duration

          if duration > video_duration:
-             logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s). Using full video.")
+             logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
              return video

-         # Calculate a random start time ensuring we have enough duration left
          max_start_time = video_duration - duration
          if start_time is None or start_time > max_start_time:
              start_time = random.uniform(0, max_start_time)

-         # Extract the segment
          clip = video.subclip(start_time, start_time + duration)
          logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
          return clip
@@ -285,13 +254,9 @@ def get_video_clip_segment(video_path, start_time, duration):
          return None

  def create_word_level_subtitles(clip, words_data, font_size=45):
-     """
-     Create subtitles that highlight words as they are spoken.
-     Takes a list of word dictionaries with timing information.
-     """
+     """Create synchronized subtitles without ImageMagick."""
      try:
          logger.info("Creating word-level synchronized subtitles")
-         # Group words into chunks of approximately 5 words
          chunks = []
          current_chunk = []
          current_chunk_words = []
@@ -310,7 +275,6 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
                  current_chunk = []
                  current_chunk_words = []

-         # Add any remaining words
          if current_chunk_words:
              chunks.append({
                  "text": " ".join(current_chunk_words),
@@ -319,25 +283,26 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
                  "end": current_chunk[-1]["end"]
              })

-         # Create subtitle clips for each chunk
          subtitle_clips = []
-
          for chunk in chunks:
              txt_clip = TextClip(
                  chunk["text"],
                  fontsize=font_size,
-                 font='Arial-Bold',
                  color=CAPTION_COLOR,
-                 bg_color='rgba(0, 0, 0, 0.5)',
-                 method='caption',
-                 align='center',
-                 stroke_width=2,
-                 stroke_color='black',
-                 size=(TARGET_RESOLUTION[0] * 0.9, None)
-             ).set_start(chunk["start"]).set_end(chunk["end"])
+                 method='label'
+             )
+
+             bg_clip = ColorClip(
+                 size=(txt_clip.w + 20, txt_clip.h + 10),
+                 color=(0, 0, 0, 128) # Semi-transparent black
+             )

-             txt_clip = txt_clip.set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-             subtitle_clips.append(txt_clip)
+             subtitle_clip = CompositeVideoClip([
+                 bg_clip.set_position('center'),
+                 txt_clip.set_position('center')
+             ])
+             subtitle_clip = subtitle_clip.set_start(chunk["start"]).set_end(chunk["end"]).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+             subtitle_clips.append(subtitle_clip)

          logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
          return subtitle_clips
@@ -346,7 +311,7 @@ def create_word_level_subtitles(clip, words_data, font_size=45):
          return []

  def add_background_music(final_video, bg_music_volume=0.08):
-     """Add background music to the final video."""
+     """Add background music to the video."""
      try:
          bg_music_path = "music.mp3"
          if bg_music_path and os.path.exists(bg_music_path):
@@ -367,61 +332,54 @@ def add_background_music(final_video, bg_music_volume=0.08):
          return final_video
      except Exception as e:
          logger.error(f"Error adding background music: {e}")
-         logger.info("Continuing without background music")
          return final_video

  def create_clip(tts_path, narration_text, segment_index=0):
-     """
-     Create a video clip with synchronized subtitles using whisper timestamps.
-     Uses a random segment from video.mp4 matching the audio duration.
-     """
+     """Create a video clip with synchronized subtitles."""
      try:
          logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
          if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
              logger.error("Missing video or TTS file")
              return None

-         # Get audio duration
          audio_clip = AudioFileClip(tts_path)
          audio_duration = audio_clip.duration
-         target_duration = audio_duration + 0.5 # Add a small buffer
+         target_duration = audio_duration + 0.5

-         # Get a random segment from the main video
          video_clip = get_video_clip_segment("video.mp4", None, target_duration)
          if video_clip is None:
              logger.error("Failed to extract video segment")
              return None

-         # Resize to target resolution
          video_clip = video_clip.resize(height=TARGET_RESOLUTION[1], width=TARGET_RESOLUTION[0])
-
-         # Set the audio
          video_clip = video_clip.set_audio(audio_clip)

-         # Generate word-level timestamps with Whisper
          word_data = analyze_audio_with_whisper(tts_path)

          if word_data:
-             # Create word-level subtitles
              subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
              if subtitle_clips:
-                 # Combine video with subtitles
                  video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
          else:
-             # Fallback to basic subtitle if whisper fails
              logger.warning("Falling back to basic subtitles")
              txt_clip = TextClip(
                  narration_text,
                  fontsize=font_size,
-                 font='Arial-Bold',
                  color=CAPTION_COLOR,
-                 bg_color='rgba(0, 0, 0, 0.5)',
-                 method='caption',
-                 align='center',
-                 size=(TARGET_RESOLUTION[0] * 0.9, None)
-             ).set_position(('center', TARGET_RESOLUTION[1] * 0.85)).set_duration(video_clip.duration)
+                 method='label'
+             )
+
+             bg_clip = ColorClip(
+                 size=(txt_clip.w + 20, txt_clip.h + 10),
+                 color=(0, 0, 0, 128)
+             )

-             video_clip = CompositeVideoClip([video_clip, txt_clip])
+             subtitle_clip = CompositeVideoClip([
+                 bg_clip.set_position('center'),
+                 txt_clip.set_position('center')
+             ])
+             subtitle_clip = subtitle_clip.set_duration(video_clip.duration).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+             video_clip = CompositeVideoClip([video_clip, subtitle_clip])

          logger.info(f"Clip created: {video_clip.duration:.1f}s")
          return video_clip
@@ -429,125 +387,82 @@ def create_clip(tts_path, narration_text, segment_index=0):
          logger.error(f"Error in create_clip: {str(e)}")
          return None

- # Main Video Generation Function
  def generate_video(user_input, resolution, caption_option):
-     """Generate a video based on user input via Gradio."""
+     """Generate a video based on user input."""
      global TEMP_FOLDER, CAPTION_COLOR

-     # Set caption color based on option
      CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
-
-     # Create a unique temporary folder
      TEMP_FOLDER = tempfile.mkdtemp()
      logger.info(f"Created temporary folder: {TEMP_FOLDER}")

-     # Check if video.mp4 exists
      if not os.path.exists("video.mp4"):
-         logger.error("video.mp4 not found in the current directory")
+         logger.error("video.mp4 not found")
          return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."

-     # Load Whisper model
      load_whisper_model()
-
-     # Generate script
-     logger.info("Generating script from API...")
      script = generate_script(user_input)
      if not script:
-         logger.error("Failed to generate script.")
          shutil.rmtree(TEMP_FOLDER)
-         return "Failed to generate script. Please try again."
+         return "Failed to generate script."

      logger.info("Generated Script:\n" + script)
-
-     # Parse script into elements
      elements = parse_script(script)
      if not elements:
-         logger.error("Failed to parse script into elements.")
          shutil.rmtree(TEMP_FOLDER)
-         return "Failed to parse script. Please try again."
+         return "Failed to parse script."

      logger.info(f"Parsed {len(elements)//2} script segments.")
-
-     # Group elements into pairs (media prompt + TTS)
-     paired_elements = []
-     for i in range(0, len(elements), 2):
-         if i + 1 < len(elements):
-             paired_elements.append((elements[i], elements[i + 1]))
-
+     paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
+
      if not paired_elements:
-         logger.error("No valid script segments found.")
          shutil.rmtree(TEMP_FOLDER)
-         return "No valid script segments were generated."
-
-     # Create video clips for each segment
+         return "No valid script segments generated."
+
      clips = []
      for idx, (media_elem, tts_elem) in enumerate(paired_elements):
          logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
-
-         # Generate TTS for the segment
          tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
          if not tts_path:
-             logger.error(f"Skipping segment {idx+1} due to TTS generation failure.")
              continue

-         # Create video clip with subtitles
-         clip = create_clip(
-             tts_path=tts_path,
-             narration_text=tts_elem['text'],
-             segment_index=idx
-         )
-
+         clip = create_clip(tts_path, tts_elem['text'], idx)
          if clip:
              clips.append(clip)
-         else:
-             logger.error(f"Clip creation failed for segment {idx+1}.")
-
+
      if not clips:
-         logger.error("No clips were successfully created.")
          shutil.rmtree(TEMP_FOLDER)
-         return "Failed to create any video clips. Please try again."
-
-     # Concatenate all clips
+         return "Failed to create any video clips."
+
      logger.info("\nConcatenating clips...")
      final_video = concatenate_videoclips(clips, method="compose")
-
-     # Add background music if available
      final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
-
-     # Export final video
+
      logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
      final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
      logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
-
-     # Clean up
-     logger.info("Cleaning up temporary files...")
+
      shutil.rmtree(TEMP_FOLDER)
      logger.info("Temporary files removed.")
-
      return OUTPUT_VIDEO_FILENAME

- # Gradio Interface Setup
  def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
+     """Generate video with Gradio options."""
      global voice_speed, font_size, bg_music_volume, fps, preset

-     # Update global variables with user selections
      voice_speed = v_speed
      font_size = caption_size
      bg_music_volume = bg_vol
      fps = video_fps
      preset = video_preset

-     # Handle music upload
      if music_file is not None:
-         target_path = "music.mp3"
-         shutil.copy(music_file.name, target_path)
-         logger.info(f"Uploaded music saved as: {target_path}")
+         shutil.copy(music_file.name, "music.mp3")
+         logger.info(f"Uploaded music saved as: music.mp3")

-     # Generate the video (always using vertical resolution)
      return generate_video(user_input, "Short", caption_option)

- # Create the Gradio interface
  def create_interface():
+     """Create Gradio interface."""
      iface = gr.Interface(
          fn=generate_video_with_options,
          inputs=[
@@ -570,16 +485,13 @@ def create_interface():
      3. Adjust settings as needed
      4. Click submit and wait for video generation

-     NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space for this app to work.
+     NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
      """
      )
      return iface

- # Launch the application
  if __name__ == "__main__":
-     # Create interface and launch
      demo = create_interface()
      demo.launch()
  else:
-     # For importing as a module
      demo = create_interface()
 
 
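Editor's notes — the sketches below are illustrative and not part of the commit.

The retained whisper import (now with a requirements reminder) feeds analyze_audio_with_whisper, which asks openai-whisper for word-level timestamps. A minimal sketch of that call pattern; the model size and file name here are assumptions, since the diff does not show what load_whisper_model() picks:

import whisper

model = whisper.load_model("base")  # model size is an assumption; the app chooses its own in load_whisper_model()
result = model.transcribe("tts_segment.wav", word_timestamps=True)

for segment in result["segments"]:
    for word in segment["words"]:
        # Each entry carries the token text plus its start/end times in seconds.
        print(word["word"], word["start"], word["end"])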
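generate_tts (unchanged context in this diff) speeds audio up by respawning it with a scaled frame rate. A sketch of the usual form of that pydub idiom, with a hypothetical input file; the trailing set_frame_rate is my completion of the pattern, not shown in the diff:

from pydub import AudioSegment

audio = AudioSegment.from_mp3("voice.mp3")  # hypothetical input
speed = 1.2
faster = audio._spawn(audio.raw_data, overrides={
    "frame_rate": int(audio.frame_rate * speed)  # same trick as in generate_tts
}).set_frame_rate(audio.frame_rate)  # resample back so players read the sped-up audio at the nominal rate
faster.export("voice.wav", format="wav")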
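The caption rework replaces 'caption'-mode TextClips (bg_color, stroke, fixed wrapping width) with a bare 'label' TextClip composited over a translucent ColorClip. One hedge on the new docstring: in moviepy 1.x, TextClip shells out to ImageMagick for both 'label' and 'caption', so the change drops the caption-layout options rather than the ImageMagick dependency itself. A standalone sketch of the new compositing pattern; the text, padding, and timing are illustrative:

from moviepy.editor import ColorClip, CompositeVideoClip, TextClip

# Render the text once; method='label' sizes the clip to fit the text.
txt_clip = TextClip("Hello world", fontsize=45, color="white", method="label")

# With a 4-tuple color, moviepy 1.x treats the fourth channel as alpha,
# giving a roughly 50%-transparent black backing.
bg_clip = ColorClip(size=(txt_clip.w + 20, txt_clip.h + 10), color=(0, 0, 0, 128))

caption = CompositeVideoClip([
    bg_clip.set_position("center"),
    txt_clip.set_position("center"),
]).set_start(0).set_end(3).set_position(("center", 1632))  # ~85% down a 1080x1920 frame

Composited this way, the backing stays exactly as large as the rendered text plus padding, and the text/background pair can be timed and positioned as a single unit, which is what the per-chunk loop in create_word_level_subtitles does.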