# sda / app.py
import os
import re
import math
import random
import tempfile
import shutil
import requests
import numpy as np
from kokoro import KPipeline
import soundfile as sf
from pydub import AudioSegment
from gtts import gTTS
import gradio as gr
from moviepy.editor import (
VideoFileClip, AudioFileClip, concatenate_audioclips,
CompositeAudioClip, CompositeVideoClip, TextClip
)
# ────────── GLOBAL CONFIG ──────────
OPENROUTER_API_KEY = 'sk-or-v1-…'
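# NOTE: in production this is better read from the environment, e.g. os.environ.get("OPENROUTER_API_KEY", "")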
OPENROUTER_MODEL = "google/gemini-2.0-flash-exp:free"
SOURCE_VIDEO_PATH = "video.mp4" # 13-min source
OUTPUT_VIDEO_PATH = "final_video.mp4"
TARGET_RESOLUTION = (1080, 1920) # Vertical TikTok style
VOICE_SPEED = 0.9
CAPTION_FONT_SIZE = 45
BG_MUSIC_VOLUME = 0.08
# Initialize Kokoro TTS
pipeline = KPipeline(lang_code='a') # American English
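# (lang_code 'a' pairs with the American-English voices, e.g. the af_* IDs in VOICE_MAP below)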
# ────────── SCRIPT GENERATION ──────────
def generate_script(topic: str) -> str:
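    """Request a short, scene-tagged documentary script for the topic from OpenRouter."""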
headers = {
'Authorization': f'Bearer {OPENROUTER_API_KEY}',
'HTTP-Referer': 'https://your-domain.com',
'X-Title': 'AI Documentary Maker'
}
prompt = f"""You’re a professional documentary narrator.
Break your script into scenes with [Tags], one sentence each (≀12 words).
No slang or numbers. At the end, include [Subscribe] with a formal reason.
Topic: {topic}
"""
payload = {
'model': OPENROUTER_MODEL,
'messages': [{'role':'user','content':prompt}],
'temperature':0.4,
'max_tokens':5000
}
r = requests.post('https://openrouter.ai/api/v1/chat/completions',
headers=headers, json=payload, timeout=30)
r.raise_for_status()
return r.json()['choices'][0]['message']['content']
def parse_script(script_text: str):
"""Return list of (scene_title, sentence_text)."""
sections = []
current = None
for line in script_text.splitlines():
m = re.match(r'^\[(.+?)\]\s*(.*)$', line)
if m:
if current:
sections.append(current)
current = [m.group(1), m.group(2)]
elif current and line.strip():
current[1] += ' ' + line.strip()
if current:
sections.append(current)
return sections
# ────────── TTS ──────────
def generate_tts_audio(text: str, voice_code: str, dirpath: str) -> str:
"""Produce a WAV file, using Kokoro then gTTS fallback."""
safe = re.sub(r'[^\w]', '_', text[:10]).strip()
out_path = os.path.join(dirpath, f"tts_{safe}.wav")
if os.path.exists(out_path):
return out_path
try:
        # Kokoro yields (graphemes, phonemes, audio) tuples; keep only the audio arrays
segments = pipeline(text, voice=voice_code, speed=VOICE_SPEED, split_pattern=r'\n+')
arrays = [seg_audio for _, _, seg_audio in segments]
combined = np.concatenate(arrays, axis=0)
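        # Kokoro's native sample rate is 24 kHz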
sf.write(out_path, combined, 24000)
except Exception:
# fallback to gTTS
mp3 = os.path.join(dirpath, f"{safe}.mp3")
gTTS(text=text, lang='en').save(mp3)
wav = AudioSegment.from_mp3(mp3)
wav.export(out_path, format="wav")
os.remove(mp3)
return out_path
# ────────── VIDEO + SUBTITLES ──────────
def add_pillow_subtitles(clip, sections):
"""Overlay each sentence as timed subtitles using Pillow (no ImageMagick)."""
subtitles = []
cum_time = 0
    # total word count across all scenes, used to apportion screen time
    total_words = sum(len(s.split()) for _, s in sections)
    for _title, sentence in sections:
        # break the sentence into 5-word chunks so each caption stays short
        words = sentence.split()
        chunks = [words[i:i+5] for i in range(0, len(words), 5)]
        # give each sentence screen time proportional to its share of the words
        seg_dur = clip.duration * (len(words) / total_words) if total_words else 0
        # approximate each chunk's duration within that window
        chunk_dur = seg_dur / len(chunks) if chunks else seg_dur
for i, chunk in enumerate(chunks):
txt = ' '.join(chunk)
txt_clip = (
                TextClip(txt, fontsize=CAPTION_FONT_SIZE, font='Arial-Bold',
                         color='white', bg_color='rgba(0,0,0,0.3)',
                         size=(int(TARGET_RESOLUTION[0] * 0.9), None),
                         method='caption')  # 'caption' wraps text; requires ImageMagick in MoviePy 1.x
.set_start(cum_time + i*chunk_dur)
.set_duration(chunk_dur)
.set_position(('center', int(TARGET_RESOLUTION[1]*0.8)))
)
subtitles.append(txt_clip)
cum_time += seg_dur
return subtitles
def generate_video(topic, include_captions, music_file, voice_choice):
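    """Full pipeline: script -> per-sentence TTS -> random source subclip -> optional captions and music -> MP4."""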
if not os.path.exists(SOURCE_VIDEO_PATH):
raise FileNotFoundError(f"{SOURCE_VIDEO_PATH} not found.")
# 1) get script & sections
script = generate_script(topic)
sections = parse_script(script)
# 2) TTS each sentence
tmp = tempfile.mkdtemp()
tts_paths = [
generate_tts_audio(sentence, voice_choice, tmp)
for _, sentence in sections
]
# 3) concatenate all TTS audios
clips_audio = [AudioFileClip(p) for p in tts_paths]
narration = concatenate_audioclips(clips_audio)
total_dur = narration.duration
# 4) pick one random subclip
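    #    (assumes the 13-min source is at least as long as the narration)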
src = VideoFileClip(SOURCE_VIDEO_PATH)
start = random.uniform(0, max(0, src.duration - total_dur))
video = src.subclip(start, start + total_dur).resize(TARGET_RESOLUTION)
src.close()
# 5) overlay narration
video = video.set_audio(narration)
# 6) optional subtitles
if include_captions:
subs = add_pillow_subtitles(video, sections)
video = CompositeVideoClip([video, *subs])
# 7) optional background music
    if music_file:
        # Gradio may hand back either a filepath string or a file object with .name
        music_path = music_file if isinstance(music_file, str) else music_file.name
        bg = AudioFileClip(music_path)
        # loop the music to cover the video, trim to length, then duck it under the narration
        loops = math.ceil(video.duration / bg.duration)
        bg = (concatenate_audioclips([bg] * loops)
              .subclip(0, video.duration)
              .volumex(BG_MUSIC_VOLUME))
mixed = CompositeAudioClip([video.audio, bg])
video = video.set_audio(mixed)
# 8) export
    video.write_videofile(OUTPUT_VIDEO_PATH, codec='libx264', audio_codec='aac',
                          fps=30, preset='veryfast')
# cleanup
shutil.rmtree(tmp)
return OUTPUT_VIDEO_PATH
# ────────── GRADIO UI ──────────
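# Display name shown in the dropdown -> Kokoro voice identifier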
VOICE_MAP = {
'Emma (Female)': 'af_heart',
'Bella (Female)': 'af_bella',
# … add others as needed …
}
iface = gr.Interface(
fn=generate_video,
inputs=[
gr.Textbox(label="Video Concept"),
gr.Checkbox(label="Include Captions"),
gr.File(label="Background Music (MP3)", file_types=[".mp3"]),
gr.Dropdown(list(VOICE_MAP.keys()), label="Voice", value="Emma (Female)")
],
outputs=gr.Video(label="Generated Video"),
title="AI Documentary Video Generator",
    description="Cuts a narration-length clip from the bundled ~13 min source video, adds AI narration and TikTok-style subtitles."
)
if __name__ == "__main__":
iface.launch(share=True)