testdeep123 committed
Commit 3d3cf6f · verified · 1 Parent(s): eb527ee

Update app.py

Files changed (1)
  1. app.py +102 -485
app.py CHANGED
@@ -1,497 +1,114 @@
- import os
- import re
- import time
- import math
  import tempfile
- import random
- import shutil
- import torch
- import numpy as np
- import soundfile as sf
  from pydub import AudioSegment
- from gtts import gTTS
- import whisper # Ensure this is openai-whisper in requirements.txt
- import gradio as gr
- import requests
  import json
- from moviepy.editor import (
-     VideoFileClip, concatenate_videoclips, AudioFileClip,
-     CompositeVideoClip, TextClip, CompositeAudioClip, ColorClip
- )
- import logging
-
- # Set up logging
- logging.basicConfig(level=logging.INFO,
-                     format='%(asctime)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)

- # Global Configuration Variables
  OPENROUTER_API_KEY = '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'
  OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
- TARGET_RESOLUTION = (1080, 1920) # Vertical format for shorts
  OUTPUT_VIDEO_FILENAME = "final_video.mp4"
- TEMP_FOLDER = None
  CAPTION_COLOR = "white"

- # Additional global variables for Gradio interface
- selected_voice = 'en_us_001' # Default voice
- voice_speed = 1.0 # Default voice speed
- font_size = 45 # Default font size
- bg_music_volume = 0.08 # Default background music volume
- fps = 30 # Default FPS
- preset = "veryfast" # Default preset
-
- # Initialize whisper model globally
- whisper_model = None
-
- def load_whisper_model():
-     """Load the Whisper model."""
-     global whisper_model
-     try:
-         logger.info("Loading Whisper model...")
-         whisper_model = whisper.load_model("tiny") # Using tiny for CPU efficiency
-         logger.info("Whisper model loaded successfully")
-         return True
-     except Exception as e:
-         logger.error(f"Failed to load Whisper model: {e}")
-         return False
-
- def generate_script(user_input):
-     """Generate documentary script using OpenRouter API."""
-     headers = {
-         'Authorization': f'Bearer {OPENROUTER_API_KEY}',
-         'HTTP-Referer': 'https://huggingface.co/spaces',
-         'X-Title': 'AI Documentary Maker'
-     }
-
-     prompt = f"""You're a professional documentary narrator. Your job is to write a serious, natural, and informative video script based on one topic.
-
- The script should sound like a real human voiceover from a TV show or documentary — clear, factual, and engaging, like something you'd hear on National Geographic or a news report.
-
- Structure:
- - Break the script into scenes using [Tags]. Each tag is a short title (1–2 words) that describes the scene.
- - Under each tag, write one sentence (max 12 words) that fits the tag and continues the topic.
- - The full script should make sense as one connected narration — no randomness.
- - Use natural, formal English. No slang, no fake AI language, and no robotic tone.
- - Do not use humor, sarcasm, or casual language. This is a serious narration.
- - No emotion-sound words like "aww," "eww," "whoa," etc.
- - Do not use numbers like 1, 2, 3 — write them out as one, two, three.
- - Make the total narration about 1 minute long (around 150-200 words total).
- - At the end, add a [Subscribe] tag with a formal or respectful reason to follow or subscribe.
-
- Only output the script. No extra comments or text.
-
- Example:
-
- [Ocean]
- The ocean covers over seventy percent of the Earth's surface.
-
- [Currents]
- Ocean currents distribute heat and regulate global climate patterns.
-
- [Coral Reefs]
- These ecosystems support over one million species of marine life.
-
- [Pollution]
- Plastic waste threatens marine biodiversity and food chains.
-
- [Climate Impact]
- Rising temperatures are causing coral bleaching and habitat loss.
-
- [Subscribe]
- Follow to explore more about the changing planet we live on.
-
- Now here is the Topic: {user_input}
- """
-
-     data = {
-         'model': OPENROUTER_MODEL,
-         'messages': [{'role': 'user', 'content': prompt}],
-         'temperature': 0.4,
-         'max_tokens': 2000
-     }
-
-     try:
-         response = requests.post(
-             'https://openrouter.ai/api/v1/chat/completions',
-             headers=headers,
-             json=data,
-             timeout=30
-         )
-
-         if response.status_code == 200:
-             response_data = response.json()
-             if 'choices' in response_data and len(response_data['choices']) > 0:
-                 return response_data['choices'][0]['message']['content']
-             else:
-                 logger.error(f"Unexpected response format: {response_data}")
-                 return None
-         else:
-             logger.error(f"API Error {response.status_code}: {response.text}")
-             return None
-     except Exception as e:
-         logger.error(f"Request failed: {str(e)}")
-         return None
-
- def parse_script(script_text):
-     """Parse the generated script into a list of elements."""
-     sections = {}
-     current_title = None
-     current_text = ""
-
-     try:
-         for line in script_text.splitlines():
-             line = line.strip()
-             if line.startswith("[") and "]" in line:
-                 bracket_start = line.find("[")
-                 bracket_end = line.find("]", bracket_start)
-                 if bracket_start != -1 and bracket_end != -1:
-                     if current_title is not None:
-                         sections[current_title] = current_text.strip()
-                     current_title = line[bracket_start+1:bracket_end]
-                     current_text = line[bracket_end+1:].strip()
-             elif current_title:
-                 current_text += line + " "
-
-         if current_title:
-             sections[current_title] = current_text.strip()
-
-         elements = []
-         for title, narration in sections.items():
-             if not title or not narration:
-                 continue
-
-             media_element = {"type": "media", "prompt": title, "effects": "fade-in"}
-             words = narration.split()
-             duration = max(3, len(words) * 0.5) # Estimate duration
-             tts_element = {"type": "tts", "text": narration, "voice": "en", "duration": duration}
-             elements.append(media_element)
-             elements.append(tts_element)
-
-         return elements
-     except Exception as e:
-         logger.error(f"Error parsing script: {e}")
-         return []
-
- def generate_tts(text, voice="en"):
-     """Generate TTS audio using gTTS."""
-     safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
-     file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
-
-     try:
-         logger.info(f"Generating TTS for: {text[:30]}...")
-         tts = gTTS(text=text, lang='en', slow=False)
-         mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
-         tts.save(mp3_path)
-
-         # Convert MP3 to WAV
-         audio = AudioSegment.from_mp3(mp3_path)
-         if voice_speed != 1.0:
-             audio = audio._spawn(audio.raw_data, overrides={
-                 "frame_rate": int(audio.frame_rate * voice_speed)
-             })
-         audio.export(file_path, format="wav")
-         os.remove(mp3_path)
-
-         logger.info(f"TTS saved to {file_path}")
-         return file_path
-     except Exception as e:
-         logger.error(f"TTS generation error: {e}")
-         return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
-
- def generate_silent_audio(duration, sample_rate=24000):
-     """Generate a silent WAV audio file."""
-     num_samples = int(duration * sample_rate)
-     silence = np.zeros(num_samples, dtype=np.float32)
-     silent_path = os.path.join(TEMP_FOLDER, f"silent_{int(time.time())}.wav")
-     sf.write(silent_path, silence, sample_rate)
-     logger.info(f"Silent audio generated: {silent_path}")
-     return silent_path
-
- def analyze_audio_with_whisper(audio_path):
-     """Use Whisper to generate word-level timestamps."""
-     try:
-         if whisper_model is None:
-             load_whisper_model()
-
-         logger.info(f"Analyzing audio with Whisper: {audio_path}")
-         result = whisper_model.transcribe(audio_path, word_timestamps=True)
-
-         word_segments = []
-         for segment in result["segments"]:
-             for word in segment["words"]:
-                 word_segments.append({
-                     "word": word["word"].strip(),
-                     "start": word["start"],
-                     "end": word["end"]
-                 })
-
-         logger.info(f"Extracted {len(word_segments)} word segments")
-         return word_segments
-     except Exception as e:
-         logger.error(f"Whisper analysis error: {e}")
-         return []
-
- def get_video_clip_segment(video_path, start_time, duration):
-     """Extract a random video segment."""
-     try:
-         video = VideoFileClip(video_path)
-         video_duration = video.duration
-
-         if duration > video_duration:
-             logger.warning(f"Requested duration ({duration}s) exceeds video length ({video_duration}s).")
-             return video
-
-         max_start_time = video_duration - duration
-         if start_time is None or start_time > max_start_time:
-             start_time = random.uniform(0, max_start_time)
-
-         clip = video.subclip(start_time, start_time + duration)
-         logger.info(f"Extracted video segment: {start_time:.2f}s to {start_time + duration:.2f}s")
-         return clip
-     except Exception as e:
-         logger.error(f"Error extracting video segment: {e}")
-         return None
-
- def create_word_level_subtitles(clip, words_data, font_size=45):
-     """Create synchronized subtitles without ImageMagick."""
-     try:
-         logger.info("Creating word-level synchronized subtitles")
-         chunks = []
-         current_chunk = []
-         current_chunk_words = []
-
-         for word_data in words_data:
-             current_chunk_words.append(word_data["word"])
-             current_chunk.append(word_data)
-
-             if len(current_chunk_words) >= 5:
-                 chunks.append({
-                     "text": " ".join(current_chunk_words),
-                     "words": current_chunk,
-                     "start": current_chunk[0]["start"],
-                     "end": current_chunk[-1]["end"]
-                 })
-                 current_chunk = []
-                 current_chunk_words = []
-
-         if current_chunk_words:
-             chunks.append({
-                 "text": " ".join(current_chunk_words),
-                 "words": current_chunk,
-                 "start": current_chunk[0]["start"],
-                 "end": current_chunk[-1]["end"]
-             })
-
-         subtitle_clips = []
-         for chunk in chunks:
-             txt_clip = TextClip(
-                 chunk["text"],
-                 fontsize=font_size,
-                 color=CAPTION_COLOR,
-                 method='label'
-             )
-
-             bg_clip = ColorClip(
-                 size=(txt_clip.w + 20, txt_clip.h + 10),
-                 color=(0, 0, 0, 128) # Semi-transparent black
-             )
-
-             subtitle_clip = CompositeVideoClip([
-                 bg_clip.set_position('center'),
-                 txt_clip.set_position('center')
-             ])
-             subtitle_clip = subtitle_clip.set_start(chunk["start"]).set_end(chunk["end"]).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-             subtitle_clips.append(subtitle_clip)
-
-         logger.info(f"Created {len(subtitle_clips)} subtitle chunks")
-         return subtitle_clips
-     except Exception as e:
-         logger.error(f"Error creating subtitles: {e}")
-         return []
-
- def add_background_music(final_video, bg_music_volume=0.08):
-     """Add background music to the video."""
-     try:
-         bg_music_path = "music.mp3"
-         if bg_music_path and os.path.exists(bg_music_path):
-             logger.info(f"Adding background music from: {bg_music_path}")
-             bg_music = AudioFileClip(bg_music_path)
-             if bg_music.duration < final_video.duration:
-                 loops_needed = math.ceil(final_video.duration / bg_music.duration)
-                 bg_segments = [bg_music] * loops_needed
-                 bg_music = CompositeAudioClip(bg_segments)
-             bg_music = bg_music.subclip(0, final_video.duration)
-             bg_music = bg_music.volumex(bg_music_volume)
-             video_audio = final_video.audio
-             mixed_audio = CompositeAudioClip([video_audio, bg_music])
-             final_video = final_video.set_audio(mixed_audio)
-             logger.info("Background music added successfully")
-         else:
-             logger.info("No music file found, skipping background music")
-         return final_video
-     except Exception as e:
-         logger.error(f"Error adding background music: {e}")
-         return final_video
-
- def create_clip(tts_path, narration_text, segment_index=0):
-     """Create a video clip with synchronized subtitles."""
-     try:
-         logger.info(f"Creating clip #{segment_index} with TTS: {tts_path}")
-         if not os.path.exists(tts_path) or not os.path.exists("video.mp4"):
-             logger.error("Missing video or TTS file")
-             return None
-
-         audio_clip = AudioFileClip(tts_path)
-         audio_duration = audio_clip.duration
-         target_duration = audio_duration + 0.5
-
-         video_clip = get_video_clip_segment("video.mp4", None, target_duration)
-         if video_clip is None:
-             logger.error("Failed to extract video segment")
-             return None
-
-         video_clip = video_clip.resize(height=TARGET_RESOLUTION[1], width=TARGET_RESOLUTION[0])
-         video_clip = video_clip.set_audio(audio_clip)
-
-         word_data = analyze_audio_with_whisper(tts_path)
-
-         if word_data:
-             subtitle_clips = create_word_level_subtitles(video_clip, word_data, font_size)
-             if subtitle_clips:
-                 video_clip = CompositeVideoClip([video_clip] + subtitle_clips)
-         else:
-             logger.warning("Falling back to basic subtitles")
-             txt_clip = TextClip(
-                 narration_text,
-                 fontsize=font_size,
-                 color=CAPTION_COLOR,
-                 method='label'
-             )
-
-             bg_clip = ColorClip(
-                 size=(txt_clip.w + 20, txt_clip.h + 10),
-                 color=(0, 0, 0, 128)
-             )
-
-             subtitle_clip = CompositeVideoClip([
-                 bg_clip.set_position('center'),
-                 txt_clip.set_position('center')
-             ])
-             subtitle_clip = subtitle_clip.set_duration(video_clip.duration).set_position(('center', TARGET_RESOLUTION[1] * 0.85))
-             video_clip = CompositeVideoClip([video_clip, subtitle_clip])
-
-         logger.info(f"Clip created: {video_clip.duration:.1f}s")
-         return video_clip
-     except Exception as e:
-         logger.error(f"Error in create_clip: {str(e)}")
-         return None
-
- def generate_video(user_input, resolution, caption_option):
-     """Generate a video based on user input."""
-     global TEMP_FOLDER, CAPTION_COLOR
-
-     CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
-     TEMP_FOLDER = tempfile.mkdtemp()
-     logger.info(f"Created temporary folder: {TEMP_FOLDER}")
-
-     if not os.path.exists("video.mp4"):
-         logger.error("video.mp4 not found")
-         return "Error: video.mp4 not found. Please upload a video file named 'video.mp4'."
-
-     load_whisper_model()
-     script = generate_script(user_input)
-     if not script:
-         shutil.rmtree(TEMP_FOLDER)
-         return "Failed to generate script."
-
-     logger.info("Generated Script:\n" + script)
-     elements = parse_script(script)
-     if not elements:
-         shutil.rmtree(TEMP_FOLDER)
-         return "Failed to parse script."
-
-     logger.info(f"Parsed {len(elements)//2} script segments.")
-     paired_elements = [(elements[i], elements[i + 1]) for i in range(0, len(elements), 2)]
-
-     if not paired_elements:
-         shutil.rmtree(TEMP_FOLDER)
-         return "No valid script segments generated."
-
-     clips = []
-     for idx, (media_elem, tts_elem) in enumerate(paired_elements):
-         logger.info(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
-         tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
-         if not tts_path:
-             continue
-
-         clip = create_clip(tts_path, tts_elem['text'], idx)
-         if clip:
-             clips.append(clip)
-
-     if not clips:
-         shutil.rmtree(TEMP_FOLDER)
-         return "Failed to create any video clips."
-
-     logger.info("\nConcatenating clips...")
-     final_video = concatenate_videoclips(clips, method="compose")
-     final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
-
-     logger.info(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
-     final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
-     logger.info(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
-
-     shutil.rmtree(TEMP_FOLDER)
-     logger.info("Temporary files removed.")
-     return OUTPUT_VIDEO_FILENAME
-
- def generate_video_with_options(user_input, caption_option, music_file, bg_vol, video_fps, video_preset, v_speed, caption_size):
-     """Generate video with Gradio options."""
-     global voice_speed, font_size, bg_music_volume, fps, preset
-
-     voice_speed = v_speed
-     font_size = caption_size
-     bg_music_volume = bg_vol
-     fps = video_fps
-     preset = video_preset
-
-     if music_file is not None:
-         shutil.copy(music_file.name, "music.mp3")
-         logger.info(f"Uploaded music saved as: music.mp3")
-
-     return generate_video(user_input, "Short", caption_option)
-
- def create_interface():
-     """Create Gradio interface."""
-     iface = gr.Interface(
-         fn=generate_video_with_options,
-         inputs=[
-             gr.Textbox(label="Video Concept", placeholder="Enter your video concept here..."),
-             gr.Radio(["Yes", "No"], label="Show Captions", value="Yes"),
-             gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
-             gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
-             gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
-             gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
-                         value="veryfast", label="Export Preset"),
-             gr.Slider(0.75, 1.25, value=1.0, step=0.05, label="Voice Speed"),
-             gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
-         ],
-         outputs=gr.Video(label="Generated Video"),
-         title="AI Documentary Video Generator",
-         description="""
- Create short documentary videos with AI narration and synchronized captions.
- 1. Enter a topic or concept for your documentary
- 2. Optionally upload background music
- 3. Adjust settings as needed
- 4. Click submit and wait for video generation
-
- NOTE: You must upload a file named 'video.mp4' to your Hugging Face Space.
- """
      )
-     return iface

  if __name__ == "__main__":
-     demo = create_interface()
-     demo.launch()
- else:
-     demo = create_interface()

+ import gradio as gr
  import tempfile
+ import os
+ from moviepy.editor import *
  from pydub import AudioSegment
+ import whisper
  import json
+ import requests

+ # Configuration
  OPENROUTER_API_KEY = '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'
  OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
+ TARGET_RESOLUTION = (1080, 1920)
  OUTPUT_VIDEO_FILENAME = "final_video.mp4"
  CAPTION_COLOR = "white"

+ # Placeholder for Kokoro TTS
+ def kokoro_tts(text):
+     # TODO: Replace with actual Kokoro TTS implementation
+     # Should return path to generated audio file
+     return "dummy_audio.wav"
+
+ def generate_script(topic):
+     prompt = f"Generate a script about {topic} divided into parts, and output it as a JSON array of strings."
+     response = requests.post(
+         "https://api.openrouter.com/v1/completions",
+         headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
+         json={"model": OPENROUTER_MODEL, "prompt": prompt}
      )
+     script_json = response.json()["choices"][0]["text"]
+     return json.loads(script_json)
+
+ def generate_audio(script_parts, temp_folder):
+     full_audio = AudioSegment.empty()
+     for part in script_parts:
+         audio_file = kokoro_tts(part)
+         audio_segment = AudioSegment.from_file(audio_file)
+         silence = AudioSegment.silent(duration=300) # 0.3s gap
+         full_audio += audio_segment + silence
+     full_audio = full_audio[:-300] # Remove last silence
+     audio_path = os.path.join(temp_folder, "full_audio.wav")
+     full_audio.export(audio_path, format="wav")
+     return audio_path
+
+ def generate_subtitles(audio_path):
+     model = whisper.load_model("base")
+     result = model.transcribe(audio_path, word_timestamps=True)
+     return result['segments']
+
+ def process_background_video(audio_duration):
+     background = VideoFileClip("video.mp4")
+     background = background.resize(height=1920)
+     if background.w > 1080:
+         background = background.crop(x_center=background.w/2, width=1080)
+     required_duration = audio_duration + 0.5
+     if background.duration < required_duration:
+         n_loops = int(required_duration / background.duration) + 1
+         background = concatenate_videoclips([background] * n_loops)
+     return background.set_duration(required_duration)
+
+ def create_subtitle_clips(segments, video_height=1920, font_size=24, color='white', highlight_color='yellow'):
+     subtitle_y = video_height - 200
+     all_words = [word for segment in segments for word in segment['words']]
+     chunks = [all_words[i:i+5] for i in range(0, len(all_words), 5)]
+     subtitle_clips = []
+     for chunk in chunks:
+         for i, word in enumerate(chunk):
+             line_clip = create_text_line(chunk, i, font_size, color, highlight_color)
+             line_clip = line_clip.set_start(word['start']).set_end(word['end']).set_pos(('center', subtitle_y))
+             subtitle_clips.append(line_clip)
+     return subtitle_clips
+
+ def create_text_line(words, highlighted_index, font_size, color, highlight_color):
+     space_clip = TextClip(" ", fontsize=font_size, color=color)
+     space_width = space_clip.w
+     text_clips = []
+     total_width = 0
+     for i, word in enumerate(words):
+         c = highlight_color if i == highlighted_index else color
+         text_clip = TextClip(word['word'], fontsize=font_size, color=c)
+         text_clips.append(text_clip)
+         total_width += text_clip.w + (space_width if i < len(words) - 1 else 0)
+     current_x = -total_width / 2
+     positioned_clips = []
+     for clip in text_clips:
+         positioned_clips.append(clip.set_pos((current_x, 0)))
+         current_x += clip.w + space_width
+     return CompositeVideoClip(positioned_clips, size=(total_width, text_clips[0].h))
+
+ def generate_video(topic):
+     with tempfile.TemporaryDirectory() as temp_folder:
+         script_parts = generate_script(topic)
+         audio_path = generate_audio(script_parts, temp_folder)
+         audio_duration = AudioSegment.from_file(audio_path).duration_seconds
+         segments = generate_subtitles(audio_path)
+         background = process_background_video(audio_duration)
+         subtitle_clips = create_subtitle_clips(segments)
+         audio_clip = AudioFileClip(audio_path)
+         final_video = background.set_audio(audio_clip)
+         final_video = CompositeVideoClip([final_video] + subtitle_clips)
+         output_path = os.path.join(temp_folder, OUTPUT_VIDEO_FILENAME)
+         final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")
+         return output_path
+
+ # Gradio UI
+ iface = gr.Interface(
+     fn=generate_video,
+     inputs=gr.Textbox(label="Topic"),
+     outputs=gr.Video(label="Generated YouTube Short"),
+     title="YouTube Short Creator"
+ )

  if __name__ == "__main__":
+     iface.launch()
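
The new generate_script posts to a completions-style endpoint and reads choices[0]["text"], while the version it replaces called OpenRouter's chat completions endpoint (https://openrouter.ai/api/v1/chat/completions) and read choices[0]["message"]["content"]. A minimal sketch of generate_script written against that chat endpoint, keeping the new JSON-array output contract and reusing the module's OPENROUTER_API_KEY and OPENROUTER_MODEL constants (it assumes the model actually returns a bare JSON array):

import json
import requests

def generate_script(topic):
    # Ask the model for the script as a JSON array of strings via
    # OpenRouter's chat completions endpoint (as in the previous app.py).
    prompt = (
        f"Generate a script about {topic} divided into parts, "
        "and output it as a JSON array of strings."
    )
    response = requests.post(
        "https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {OPENROUTER_API_KEY}"},
        json={
            "model": OPENROUTER_MODEL,
            "messages": [{"role": "user", "content": prompt}],
        },
        timeout=30,
    )
    response.raise_for_status()
    content = response.json()["choices"][0]["message"]["content"]
    return json.loads(content)  # assumes the reply is a bare JSON array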
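
kokoro_tts is left as a placeholder that returns "dummy_audio.wav". A minimal stand-in sketch, assuming gTTS (which the previous version of app.py used) is acceptable until the real Kokoro pipeline is wired in, could look like this; the gTTS/pydub calls and the temporary-file layout here are illustrative, not part of the commit:

import os
import tempfile

from gtts import gTTS
from pydub import AudioSegment

def kokoro_tts(text):
    # Stand-in TTS: synthesize with gTTS, then convert MP3 to WAV with pydub,
    # so generate_audio receives a path to a playable audio file.
    # Replace this body with the actual Kokoro TTS call when available.
    tmp_dir = tempfile.mkdtemp()
    mp3_path = os.path.join(tmp_dir, "tts.mp3")
    wav_path = os.path.join(tmp_dir, "tts.wav")
    gTTS(text=text, lang="en", slow=False).save(mp3_path)
    AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
    os.remove(mp3_path)
    return wav_path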