import tempfile

import torch
import clip
import whisper
from PIL import Image
from moviepy.editor import (
    AudioFileClip,
    CompositeVideoClip,
    ImageClip,
    TextClip,
    concatenate_videoclips,
)
from transformers import pipeline
from langdetect import detect, LangDetectException
from gtts import gTTS
device = "cuda" if torch.cuda.is_available() else "cpu" | |
clip_model, preprocess = clip.load("ViT-B/32", device=device) | |
whisper_model = whisper.load_model("small") | |
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en") | |


def transcribe_audio_segments(audio_path):
    """Transcribe audio with Whisper and return its timestamped segments."""
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    return result["segments"]


def translate_text(text, target_lang="en"):
    """Translate text into English unless it is already in the target language.

    Note: opus-mt-mul-en only translates *into* English, so target_lang
    values other than "en" are not supported by the loaded model.
    """
    try:
        detected = detect(text)
    except LangDetectException:
        # detect() raises on empty or undecipherable text; pass it through.
        return text
    if detected != target_lang:
        return translator(text)[0]["translation_text"]
    return text


def match_images(text, image_paths):
    """Return the image whose CLIP embedding is most similar to the text."""
    text_tokens = clip.tokenize([text]).to(device)
    images = [preprocess(Image.open(p)).unsqueeze(0).to(device) for p in image_paths]
    images_tensor = torch.cat(images)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens)
        image_features = clip_model.encode_image(images_tensor)
        # Normalize so the dot product is cosine similarity; otherwise images
        # with larger feature norms would dominate the ranking.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        similarity = (image_features @ text_features.T).squeeze()
    best_index = similarity.argmax().item()
    return image_paths[best_index]


def generate_speech(text, lang="en"):
    """Synthesize speech with gTTS and return the path to a temporary MP3.

    The caller is responsible for deleting the file when done.
    """
    tts = gTTS(text, lang=lang)
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    temp_audio.close()  # release the handle so gTTS can write to the path
    tts.save(temp_audio.name)
    return temp_audio.name


def create_video_segments(segments, audio_path, image_paths, output_path="final_video.mp4"):
    """Build one image-plus-subtitle clip per transcript segment and concatenate them."""
    clips = []
    for seg in segments:
        segment_text = seg["text"].strip()
        translated = translate_text(segment_text)
        duration = seg["end"] - seg["start"]
        # Pick the best-matching background image for this segment's text.
        matched_img = match_images(translated, image_paths)
        image_clip = ImageClip(matched_img).set_duration(duration)
        # TextClip needs ImageMagick installed for moviepy to render text.
        txt_clip = TextClip(translated, fontsize=30, color="white",
                            bg_color="black", size=image_clip.size)
        txt_clip = txt_clip.set_duration(duration).set_position(("center", "bottom"))
        # Reuse the original audio for this segment's time span.
        audio = AudioFileClip(audio_path).subclip(seg["start"], seg["end"])
        final_clip = CompositeVideoClip([image_clip, txt_clip]).set_audio(audio)
        clips.append(final_clip)
    final_video = concatenate_videoclips(clips, method="compose")
    final_video.write_videofile(output_path, fps=24)
    return output_path
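

# Minimal usage sketch (an assumed entry point, not part of the original app):
# the file names below are hypothetical placeholders. Transcribe the source
# audio, then assemble the video from a folder of candidate images. Note that
# generate_speech is defined above but not wired into this pipeline; it could
# be used to voice the translated text instead of reusing the source audio.
if __name__ == "__main__":
    audio_path = "input_audio.mp3"          # hypothetical input file
    image_paths = ["img1.jpg", "img2.jpg"]  # hypothetical candidate images
    segments = transcribe_audio_segments(audio_path)
    create_video_segments(segments, audio_path, image_paths)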