testdeep123 committed
Commit 0e69295 · verified · 1 Parent(s): 731f02e

Update app.py

Files changed (1):
  1. app.py +552 -191

app.py CHANGED
@@ -1,224 +1,585 @@
  import os
  import re
  import math
- import random
  import tempfile
  import shutil
- import requests
  import numpy as np
- from kokoro import KPipeline
  import soundfile as sf
  from pydub import AudioSegment
  from gtts import gTTS
  import gradio as gr
  from moviepy.editor import (
-     VideoFileClip, AudioFileClip, concatenate_audioclips,
-     CompositeAudioClip, CompositeVideoClip, TextClip
  )
-
- # ────────── GLOBAL CONFIG ──────────
- OPENROUTER_API_KEY = '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'  # ← your key here
- OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
- SOURCE_VIDEO_PATH = "video.mp4"     # your 13 min source
- OUTPUT_VIDEO_PATH = "final_video.mp4"
- TARGET_RESOLUTION = (1080, 1920)    # vertical
- VOICE_SPEED = 0.9
- CAPTION_FONT_SIZE = 45
- BG_MUSIC_VOLUME = 0.08
-
- # Kokoro TTS pipeline (American English)
- pipeline = KPipeline(lang_code='a')
-
- # ────────── UTILS ──────────
- def generate_script(topic: str) -> str:
-     """Ask the LLM to produce a tagged, one-sentence-per-scene script."""
      headers = {
          'Authorization': f'Bearer {OPENROUTER_API_KEY}',
          'X-Title': 'AI Documentary Maker'
      }
-     prompt = f"""
-     You're a professional documentary narrator.
-     Break your script into scenes with [Tags], one sentence each (≤12 words).
-     No slang or numbers. End with [Subscribe] + formal reason.
-     Topic: {topic}
      """
-     payload = {
          'model': OPENROUTER_MODEL,
-         'messages': [{'role': 'user', 'content': prompt}],
-         'temperature': 0.4,
-         'max_tokens': 5000
      }
-     r = requests.post('https://openrouter.ai/api/v1/chat/completions',
-                       headers=headers, json=payload, timeout=30)
-     r.raise_for_status()
-     return r.json()['choices'][0]['message']['content']

- def parse_script(script_text: str):
-     """
-     Return list of (tag, sentence), skipping empties.
-     If [Subscribe] has no sentence, fill a default call-to-action.
-     """
-     sections = []
-     current = None
-     for line in script_text.splitlines():
-         m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
-         if m:
-             if current:
-                 sections.append(tuple(current))
-             current = [m.group(1).strip(), m.group(2).strip()]
-         elif current and line.strip():
-             current[1] += ' ' + line.strip()
-     if current:
-         sections.append(tuple(current))
-
-     # filter & fix
-     cleaned = []
-     for tag, sentence in sections:
-         if not sentence:
-             if tag.lower() == 'subscribe':
-                 sentence = "Follow to explore more on this topic."
              else:
                  continue
-         cleaned.append((tag, sentence))
-     return cleaned

- def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
      """
-     Try Kokoro → fallback to gTTS. Returns a .wav path.
      """
-     safe = re.sub(r'[^\w]', '_', text[:10]).strip()
-     out_wav = os.path.join(dirpath, f"tts_{safe}.wav")
-     if os.path.exists(out_wav):
-         return out_wav

-     # 1) Kokoro
      try:
-         segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
-         arrays = [audio for _, _, audio in segments]
-         audio = np.concatenate(arrays, axis=0) if len(arrays) > 1 else arrays[0]
-         sf.write(out_wav, audio, 24000)
-         return out_wav
-     except Exception:
-         # 2) fallback to gTTS
-         mp3 = os.path.join(dirpath, f"{safe}.mp3")
-         gTTS(text=text, lang='en').save(mp3)
-         wav_seg = AudioSegment.from_mp3(mp3)
-         wav_seg.export(out_wav, format="wav")
-         os.remove(mp3)
-         return out_wav
-
- def add_pillow_subtitles(video_clip, sections):
      """
-     Break each sentence into ~5-word chunks and overlay as timed subtitles,
-     all via Pillow (no ImageMagick).
      """
-     subs = []
-     total_words = sum(len(s.split()) for _, s in sections)
-     cum_time = 0.0
-
-     for _, sentence in sections:
-         words = sentence.split()
-         seg_words = len(words)
-         seg_dur = video_clip.duration * (seg_words / total_words)
-         chunk_dur = seg_dur / max(1, math.ceil(seg_words / 5))
-
-         # group into 5-word chunks
-         for i in range(0, seg_words, 5):
-             chunk = " ".join(words[i:i+5])
-             txt_clip = (
-                 TextClip(
-                     chunk,
-                     fontsize=CAPTION_FONT_SIZE,
-                     font='Arial-Bold',
-                     color='white',
-                     bg_color='rgba(0,0,0,0.3)',
-                     size=(TARGET_RESOLUTION[0]*0.9, None),
-                     method='pillow'
-                 )
-                 .set_start(cum_time + (i//5)*chunk_dur)
-                 .set_duration(chunk_dur)
-                 .set_position(('center', int(TARGET_RESOLUTION[1]*0.8)))
-             )
-             subs.append(txt_clip)
-         cum_time += seg_dur
-
-     return subs
-
- # ────────── MAIN RENDER FUNCTION ──────────
- VOICE_MAP = {
-     'Emma (Female)': 'af_heart',
-     'Bella (Female)': 'af_bella',
-     'Nicole (Female)': 'af_nicole',
-     # … add your full list here …
- }
-
- def generate_video(topic, include_captions, music_file, voice_choice):
-     # 1) Script → sections
-     script = generate_script(topic)
-     sections = parse_script(script)
-
-     # 2) TTS each sentence
-     tmpdir = tempfile.mkdtemp()
-     tts_paths = []
-     voice_code = VOICE_MAP.get(voice_choice, 'af_heart')
-
-     for _, sentence in sections:
-         tts_paths.append(generate_tts_audio(sentence, voice_code, tmpdir))
-
-     # 3) Concatenate narration
-     aud_clips = [AudioFileClip(p) for p in tts_paths]
-     narration = concatenate_audioclips(aud_clips)
-     narration = narration.set_fps(24000)
-
-     # 4) Pick one random video subclip
-     src = VideoFileClip(SOURCE_VIDEO_PATH)
-     max_start = max(0, src.duration - narration.duration)
-     start = random.uniform(0, max_start)
-     vid = src.subclip(start, start + narration.duration).resize(TARGET_RESOLUTION)
-     src.close()
-
-     # 5) Overlay narration audio
-     vid = vid.set_audio(narration)
-
-     # 6) Add captions if requested
-     if include_captions:
-         subs = add_pillow_subtitles(vid, sections)
-         vid = CompositeVideoClip([vid, *subs])
-
-     # 7) Add background music
-     if music_file:
-         bg = AudioFileClip(music_file.name)
-         loops = math.ceil(vid.duration / bg.duration)
-         bg_full = concatenate_audioclips([bg]*loops).subclip(0, vid.duration)
-         bg_full = bg_full.volumex(BG_MUSIC_VOLUME)
-         vid = vid.set_audio(CompositeAudioClip([vid.audio, bg_full]))
-
-     # 8) Export
-     vid.write_videofile(
-         OUTPUT_VIDEO_PATH,
-         codec='libx264',
-         fps=30,
-         preset='veryfast',
-         audio_codec='aac'
-     )

-     # Cleanup
-     shutil.rmtree(tmpdir)
-     return OUTPUT_VIDEO_PATH
-
- # ────────── GRADIO UI ──────────
- iface = gr.Interface(
-     fn=generate_video,
-     inputs=[
-         gr.Textbox(label="Video Concept", placeholder="Enter your topic…"),
-         gr.Checkbox(label="Include Captions"),
-         gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
-         gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
-     ],
-     outputs=gr.Video(label="Generated Video"),
-     title="AI Documentary Video Generator",
-     description="Cuts one ~64 s clip from your video, adds AI narration & TikTok-style subtitles."
- )

  if __name__ == "__main__":
-     iface.launch(share=True)
+ # Import necessary libraries
  import os
  import re
+ import time
  import math
  import tempfile
+ import random
  import shutil
+ import torch
  import numpy as np
  import soundfile as sf
+ from PIL import Image, ImageDraw, ImageFont
  from pydub import AudioSegment
  from gtts import gTTS
+ import whisper
  import gradio as gr
+ import requests
+ import json
  from moviepy.editor import (
+     VideoFileClip, concatenate_videoclips, concatenate_audioclips,
+     AudioFileClip, CompositeVideoClip, TextClip, CompositeAudioClip
  )
+ import subprocess
+ import cv2
+ import moviepy.config as mpy_config
+ import moviepy.video.fx.all as vfx
+ import logging
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO,
+                     format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)

+ # Configure moviepy
+ mpy_config.change_settings({"IMAGEMAGICK_BINARY": "convert"})
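+ # Note: TextClip with method='caption' renders through ImageMagick, so the
+ # 'convert' binary configured above must be installed on the host (for a
+ # Hugging Face Space, e.g. via an apt package; this is an assumption about
+ # the runtime, not something the code checks).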
+
+ # Global Configuration Variables
+ OPENROUTER_API_KEY = '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'
+ OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+ TARGET_RESOLUTION = (1080, 1920)  # Fixed to vertical format for shorts
+ OUTPUT_VIDEO_FILENAME = "final_video.mp4"
+ TEMP_FOLDER = None
+ CAPTION_COLOR = "white"
+
+ # Additional global variables for the Gradio interface
+ selected_voice = 'en_us_001'  # Default voice
+ voice_speed = 1.0             # Default voice speed
+ font_size = 45                # Default font size
+ bg_music_volume = 0.08        # Default background music volume
+ fps = 30                      # Default FPS
+ preset = "veryfast"           # Default preset
+
+ # Initialize the Whisper model globally to avoid reloading
+ whisper_model = None
+
+ def load_whisper_model():
+     """Load the Whisper model."""
+     global whisper_model
+     try:
+         logger.info("Loading Whisper model...")
+         whisper_model = whisper.load_model("tiny")  # Using tiny for CPU efficiency
+         logger.info("Whisper model loaded successfully")
+         return True
+     except Exception as e:
+         logger.error(f"Failed to load Whisper model: {e}")
+         return False
+
+ # Helper Functions
+ def generate_script(user_input):
+     """Generate a documentary script using the OpenRouter API."""
      headers = {
          'Authorization': f'Bearer {OPENROUTER_API_KEY}',
+         'HTTP-Referer': 'https://huggingface.co/spaces',
          'X-Title': 'AI Documentary Maker'
      }
+
+     prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.
+
+ The script should sound like a real human voiceover from a TV show or documentary — clear, factual, and engaging, like something you'd hear on National Geographic or a news report.
+
+ Structure:
+ - Break the script into scenes using [Tags]. Each tag is a short title (1–2 words) that describes the scene.
+ - Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
+ - The full script should make sense as one connected narration — no randomness.
+ - Use natural, formal English. No slang, no fake AI language, and no robotic tone.
+ - Do not use humor, sarcasm, or casual language. This is a serious narration.
+ - No emotion-sound words like "aww," "eww," "whoa," etc.
+ - Do not use numbers like 1, 2, 3 — write them out as one, two, three.
+ - Make the total narration about 1 minute long (around 150-200 words total).
+ - At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.
+
+ Only output the script. No extra comments or text.
+
+ Example:
+
+ [Ocean]
+
+ The ocean covers over seventy percent of the Earth's surface.
+
+ [Currents]
+
+ Ocean currents distribute heat and regulate global climate patterns.
+
+ [Coral Reefs]
+
+ These ecosystems support over one million species of marine life.
+
+ [Pollution]
+
+ Plastic waste threatens marine biodiversity and food chains.
+
+ [Climate Impact]
+
+ Rising temperatures are causing coral bleaching and habitat loss.
+
+ [Subscribe]
+
+ Follow to explore more about the changing planet we live on.
+
+ Now here is the Topic: {user_input}
      """
+
+     data = {
          'model': OPENROUTER_MODEL,
+         'messages': [{'role': 'user', 'content': prompt}],
+         'temperature': 0.4,
+         'max_tokens': 2000
      }

+     try:
+         response = requests.post(
+             'https://openrouter.ai/api/v1/chat/completions',
+             headers=headers,
+             json=data,
+             timeout=30
+         )
+
+         if response.status_code == 200:
+             response_data = response.json()
+             if 'choices' in response_data and len(response_data['choices']) > 0:
+                 return response_data['choices'][0]['message']['content']
              else:
+                 logger.error(f"Unexpected response format: {response_data}")
+                 return None
+         else:
+             logger.error(f"API Error {response.status_code}: {response.text}")
+             return None
+
+     except Exception as e:
+         logger.error(f"Request failed: {str(e)}")
+         return None
+
+ def parse_script(script_text):
+     """Parse the generated script into a list of elements."""
+     sections = {}
+     current_title = None
+     current_text = ""
+
+     try:
+         for line in script_text.splitlines():
+             line = line.strip()
+             if line.startswith("[") and "]" in line:
+                 bracket_start = line.find("[")
+                 bracket_end = line.find("]", bracket_start)
+                 if bracket_start != -1 and bracket_end != -1:
+                     if current_title is not None:
+                         sections[current_title] = current_text.strip()
+                     current_title = line[bracket_start+1:bracket_end]
+                     current_text = line[bracket_end+1:].strip()
+             elif current_title:
+                 current_text += line + " "
+
+         if current_title:
+             sections[current_title] = current_text.strip()
+
+         elements = []
+         for title, narration in sections.items():
+             if not title or not narration:
                  continue

+             media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
+             words = narration.split()
+             duration = max(3, len(words) * 0.5)  # Estimate duration from word count
+             tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
+             elements.append(media_element)
+             elements.append(tts_element)
+
+         return elements
+     except Exception as e:
+         logger.error(f"Error parsing script: {e}")
+         return []
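+ # For reference, parse_script produces alternating media/tts elements; with
+ # illustrative values, the first pair looks like:
+ #   [{"type": "media", "prompt": "Ocean", "effects": "fade-in"},
+ #    {"type": "tts", "text": "The ocean covers ...", "voice": "en", "duration": 4.5}]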
+
+ def generate_tts(text, voice="en"):
+     """Generate TTS audio using gTTS."""
+     safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
+     file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
+
+     try:
+         logger.info(f"Generating TTS for: {text[:30]}...")
+         tts = gTTS(text=text, lang='en', slow=False)
+         mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
+         tts.save(mp3_path)
+
+         # Convert MP3 to WAV
+         audio = AudioSegment.from_mp3(mp3_path)
+         # Adjust speed if needed
+         if voice_speed != 1.0:
+             audio = audio._spawn(audio.raw_data, overrides={
+                 "frame_rate": int(audio.frame_rate * voice_speed)
+             })
+ })
213
+ audio.export(file_path, format="wav")
214
+ os.remove(mp3_path)
215
+
216
+ logger.info(f"TTS saved to {file_path}")
217
+ return file_path
218
+ except Exception as e:
219
+ logger.error(f"TTS generation error: {e}")
220
+ return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
221
+
222
+ def generate_silent_audio(duration, sample_rate=24000):
223
+ """Generate a silent WAV audio file lasting 'duration' seconds."""
224
+ num_samples = int(duration * sample_rate)
225
+ silence = np.zeros(num_samples, dtype=np.float32)
226
+ silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
227
+ sf.write(silent_path, silence, sample_rate)
228
+ logger.info(f"Silent audio generated: {silent_path}")
229
+ return silent_path
230
+
231
+ def analyze_audio_with_whisper(audio_path):
232
  """
233
+ Use Whisper to transcribe audio and generate word-level timestamps.
234
+ Returns a list of dictionaries with word, start_time, and end_time.
235
  """
236
+ try:
237
+ if whisper_model is None:
238
+ load_whisper_model()
239
+
240
+ logger.info(f"Analyzing audio with Whisper: {audio_path}")
241
+
242
+ # Transcribe the audio file
243
+ result = whisper_model.transcribe(audio_path, word_timestamps=True)
244
+
245
+ # Extract word-level segments
246
+ word_segments = []
247
+ for segment in result["segments"]:
248
+ for word in segment["words"]:
249
+ word_segments.append({
250
+ "word": word["word"].strip(),
251
+ "start": word["start"],
252
+ "end": word["end"]
253
+ })
254
+
255
+ logger.info(f"Extracted {len(word_segments)} word segments")
256
+ return word_segments
257
+ except Exception as e:
258
+ logger.error(f"Whisper analysis error: {e}")
259
+ return []
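+ # For reference, openai-whisper's transcribe(..., word_timestamps=True) returns
+ # a dict shaped roughly like:
+ #   {"segments": [{"words": [{"word": " The", "start": 0.0, "end": 0.18}, ...]}]}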

+ def get_video_clip_segment(video_path, start_time, duration):
+     """
+     Extract a segment from the video file starting at a random position,
+     but ensuring the segment is at least 'duration' seconds long.
+     """
      try:
+         video = VideoFileClip(video_path)
+         video_duration = video.duration
+
+         if duration > video_duration:
+             logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s). Using full video.")
+             return video
+
+         # Calculate a random start time, ensuring enough duration remains
+         max_start_time = video_duration - duration
+         if start_time is None or start_time > max_start_time:
+             start_time = random.uniform(0, max_start_time)
+
+         # Extract the segment
+         clip = video.subclip(start_time, start_time + duration)
+         logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
+         return clip
+     except Exception as e:
+         logger.error(f"Error extracting video segment: {e}")
+         return None
+
+ def create_word_level_subtitles(clip, words_data, font_size=45):
      """
+     Create subtitles that highlight words as they are spoken.
+     Takes a list of word dictionaries with timing information.
      """
+     try:
+         logger.info("Creating word-level synchronized subtitles")
+         # Group words into chunks of approximately 5 words
+         chunks = []
+         current_chunk = []
+         current_chunk_words = []
+
+         for word_data in words_data:
+             current_chunk_words.append(word_data["word"])
+             current_chunk.append(word_data)
+
+             if len(current_chunk_words) >= 5:
+                 chunks.append({
+                     "text": " ".join(current_chunk_words),
+                     "words": current_chunk,
+                     "start": current_chunk[0]["start"],
+                     "end": current_chunk[-1]["end"]
+                 })
+                 current_chunk = []
+                 current_chunk_words = []
+
+         # Add any remaining words
+         if current_chunk_words:
+             chunks.append({
+                 "text": " ".join(current_chunk_words),
+                 "words": current_chunk,
+                 "start": current_chunk[0]["start"],
+                 "end": current_chunk[-1]["end"]
+             })
+
+         # Create subtitle clips for each chunk
+         subtitle_clips = []
+
+         for chunk in chunks:
+             txt_clip = TextClip(
+                 chunk["text"],
+                 fontsize=font_size,
+                 font='Arial-Bold',
+                 color=CAPTION_COLOR,
+                 bg_color='rgba(0, 0, 0, 0.5)',
+                 method='caption',
+                 align='center',
+                 stroke_width=2,
+                 stroke_color='black',
+                 size=(TARGET_RESOLUTION[0] * 0.9, None)
+             ).set_start(chunk["start"]).set_end(chunk["end"])
+
+             txt_clip = txt_clip.set_position(('center', TARGET_RESOLUTION[1] * 0.85))
+             subtitle_clips.append(txt_clip)
+
+         logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
+         return subtitle_clips
+     except Exception as e:
+         logger.error(f"Error creating subtitles: {e}")
+         return []
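+ # Each chunk assembled above is a dict like (illustrative values):
+ #   {"text": "the ocean covers over seventy", "words": [...], "start": 0.0, "end": 1.92}
+ # so a caption is shown exactly while its roughly five words are spoken.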

+ def add_background_music(final_video, bg_music_volume=0.08):
+     """Add background music to the final video."""
+     try:
+         bg_music_path = "music.mp3"
+         if bg_music_path and os.path.exists(bg_music_path):
+             logger.info(f"Adding background music from: {bg_music_path}")
+             bg_music = AudioFileClip(bg_music_path)
+             if bg_music.duration < final_video.duration:
+                 loops_needed = math.ceil(final_video.duration / bg_music.duration)
+                 bg_segments = [bg_music] * loops_needed
+                 bg_music = concatenate_audioclips(bg_segments)
+             bg_music = bg_music.subclip(0, final_video.duration)
+             bg_music = bg_music.volumex(bg_music_volume)
+             video_audio = final_video.audio
+             mixed_audio = CompositeAudioClip([video_audio, bg_music])
+             final_video = final_video.set_audio(mixed_audio)
+             logger.info("Background music added successfully")
+         else:
+             logger.info("No music file found, skipping background music")
+         return final_video
+     except Exception as e:
+         logger.error(f"Error adding background music: {e}")
+         logger.info("Continuing without background music")
+         return final_video
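+ # Design note: CompositeAudioClip mixes clips that all start at time zero,
+ # which suits layering narration under music; looping a short track end to
+ # end is what concatenate_audioclips is for.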

+ def create_clip(tts_path, narration_text, segment_index=0):
+     """
+     Create a video clip with synchronized subtitles using whisper timestamps.
+     Uses a random segment from video.mp4 matching the audio duration.
+     """
+     try:
+         logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
+         if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
+             logger.error("Missing video or TTS file")
+             return None
+
+         # Get audio duration
+         audio_clip = AudioFileClip(tts_path)
+         audio_duration = audio_clip.duration
+         target_duration = audio_duration + 0.5  # Add a small buffer
+
+         # Get a random segment from the main video
+         video_clip = get_video_clip_segment("video.mp4", None, target_duration)
+         if video_clip is None:
+             logger.error("Failed to extract video segment")
+             return None
+
+         # Resize to the target (width, height); resize() ignores the width
+         # keyword when height is also given, so pass the full size tuple
+         video_clip = video_clip.resize(TARGET_RESOLUTION)
+
+         # Set the audio
+         video_clip = video_clip.set_audio(audio_clip)
+
+         # Generate word-level timestamps with Whisper
+         word_data = analyze_audio_with_whisper(tts_path)
+
+         if word_data:
+             # Create word-level subtitles
+             subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
+             if subtitle_clips:
+                 # Combine video with subtitles
+                 video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
+         else:
+             # Fall back to a single static subtitle if Whisper returns nothing
+             logger.warning("Falling back to basic subtitles")
+             txt_clip = TextClip(
+                 narration_text,
+                 fontsize=font_size,
+                 font='Arial-Bold',
+                 color=CAPTION_COLOR,
+                 bg_color='rgba(0, 0, 0, 0.5)',
+                 method='caption',
+                 align='center',
+                 size=(TARGET_RESOLUTION[0] * 0.9, None)
+             ).set_position(('center', TARGET_RESOLUTION[1] * 0.85)).set_duration(video_clip.duration)
+
+             video_clip = CompositeVideoClip([video_clip, txt_clip])
+
+         logger.info(f"Clip created: {video_clip.duration:.1f}s")
+         return video_clip
+     except Exception as e:
+         logger.error(f"Error in create_clip: {str(e)}")
+         return None
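+ # Standalone usage sketch (hypothetical file names; assumes TEMP_FOLDER is
+ # set and video.mp4 plus a narration WAV exist beside the script):
+ #   TEMP_FOLDER = tempfile.mkdtemp()
+ #   clip = create_clip("narration.wav", "The ocean covers the planet.", 0)
+ #   if clip:
+ #       clip.write_videofile("segment.mp4", codec="libx264", fps=30)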

+ # Main Video Generation Function
+ def generate_video(user_input, resolution, caption_option):
+     """Generate a video based on user input via Gradio."""
+     global TEMP_FOLDER, CAPTION_COLOR
+
+     # Set caption color based on option
+     CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
+
+     # Create a unique temporary folder
+     TEMP_FOLDER = tempfile.mkdtemp()
+     logger.info(f"Created temporary folder: {TEMP_FOLDER}")
+
+     # Check if video.mp4 exists
+     if not os.path.exists("video.mp4"):
+         logger.error("video.mp4 not found in the current directory")
+         return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."
+
+     # Load Whisper model
+     load_whisper_model()
+
+     # Generate script
+     logger.info("Generating script from API...")
+     script = generate_script(user_input)
+     if not script:
+         logger.error("Failed to generate script.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "Failed to generate script. Please try again."
+
+     logger.info("Generated Script:\n" + script)
+
+     # Parse script into elements
+     elements = parse_script(script)
+     if not elements:
+         logger.error("Failed to parse script into elements.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "Failed to parse script. Please try again."
+
+     logger.info(f"Parsed {len(elements)//2} script segments.")
+
+     # Group elements into pairs (media prompt + TTS)
+     paired_elements = []
+     for i in range(0, len(elements), 2):
+         if i + 1 < len(elements):
+             paired_elements.append((elements[i], elements[i + 1]))
+
+     if not paired_elements:
+         logger.error("No valid script segments found.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "No valid script segments were generated."
+
+     # Create video clips for each segment
+     clips = []
+     for idx, (media_elem, tts_elem) in enumerate(paired_elements):
+         logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
+
+         # Generate TTS for the segment
+         tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
+         if not tts_path:
+             logger.error(f"Skipping segment {idx+1} due to TTS generation failure.")
+             continue
+
+         # Create video clip with subtitles
+         clip = create_clip(
+             tts_path=tts_path,
+             narration_text=tts_elem['text'],
+             segment_index=idx
+         )
+
+         if clip:
+             clips.append(clip)
+         else:
+             logger.error(f"Clip creation failed for segment {idx+1}.")
+
+     if not clips:
+         logger.error("No clips were successfully created.")
+         shutil.rmtree(TEMP_FOLDER)
+         return "Failed to create any video clips. Please try again."
+
+     # Concatenate all clips
+     logger.info("\nConcatenating clips...")
+     final_video = concatenate_videoclips(clips, method="compose")
+
+     # Add background music if available
+     final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
+
+     # Export final video
+     logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
+     final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset, audio_codec='aac')
+     logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
+
+     # Clean up
+     logger.info("Cleaning up temporary files...")
+     shutil.rmtree(TEMP_FOLDER)
+     logger.info("Temporary files removed.")
+
+     return OUTPUT_VIDEO_FILENAME

+ # Gradio Interface Setup
+ def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
+     global voice_speed, font_size, bg_music_volume, fps, preset
+
+     # Update global variables with user selections
+     voice_speed = v_speed
+     font_size = caption_size
+     bg_music_volume = bg_vol
+     fps = video_fps
+     preset = video_preset
+
+     # Handle music upload
+     if music_file is not None:
+         target_path = "music.mp3"
+         shutil.copy(music_file.name, target_path)
+         logger.info(f"Uploaded music saved as: {target_path}")
+
+     # Generate the video (always using vertical resolution)
+     return generate_video(user_input, "Short", caption_option)
+
+ # Create the Gradio interface
+ def create_interface():
+     iface = gr.Interface(
+         fn=generate_video_with_options,
+         inputs=[
+             gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
+             gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
+             gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
+             gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
+             gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
+             gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
+                         value="veryfast", label="Export Preset"),
+             gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
+             gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
+         ],
+         outputs=gr.Video(label="Generated Video"),
+         title="AI Documentary Video Generator",
+         description="""
+         Create short documentary videos with AI narration and synchronized captions.
+         1. Enter a topic or concept for your documentary
+         2. Optionally upload background music
+         3. Adjust settings as needed
+         4. Click submit and wait for video generation
+
+         NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space for this app to work.
+         """
+     )
+     return iface

+ # Launch the application
  if __name__ == "__main__":
+     # Create interface and launch
+     demo = create_interface()
+     demo.launch()
+ else:
+     # When imported as a module
+     demo = create_interface()