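"""Narrated-audio-to-slideshow pipeline.

Transcribes audio with Whisper, translates each segment into English,
matches every segment to the most relevant image with CLIP, and renders
the images with subtitles and the original audio into a video via MoviePy.
"""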
import whisper
import os
import torch
import clip
from PIL import Image
from moviepy.editor import (
    AudioFileClip,
    CompositeVideoClip,
    ImageClip,
    TextClip,
    concatenate_videoclips,
)
from transformers import pipeline
from langdetect import detect
from gtts import gTTS
import tempfile

# Load the models once at import time: CLIP for text-image matching, Whisper
# for transcription, and a multilingual-to-English MarianMT translator.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
whisper_model = whisper.load_model("small")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

def transcribe_audio_segments(audio_path):
    """Transcribe the audio with Whisper and return its timestamped segments."""
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    return result["segments"]
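# Each segment is a dict like {"start": 0.0, "end": 4.2, "text": " Hello there"};
# with word_timestamps=True, Whisper also attaches per-word timing details.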

def translate_text(text, target_lang="en"):
    """Translate text into target_lang unless it is already in that language.

    Note: the loaded Helsinki-NLP/opus-mt-mul-en model only translates *into*
    English, so target_lang values other than "en" are not actually honored.
    """
    if not text.strip():
        return text  # langdetect raises an exception on empty input
    if detect(text) != target_lang:
        return translator(text)[0]["translation_text"]
    return text

def match_images(text, image_paths):
    """Return the image whose CLIP embedding best matches the given text."""
    # truncate=True keeps long segment texts from exceeding CLIP's 77-token limit.
    text_tokens = clip.tokenize([text], truncate=True).to(device)
    images = [preprocess(Image.open(path)).unsqueeze(0).to(device) for path in image_paths]
    images_tensor = torch.cat(images)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens)
        image_features = clip_model.encode_image(images_tensor)
        # Normalize so the dot product is a cosine similarity; raw dot products
        # would favor images with larger feature norms regardless of content.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        similarity = (image_features @ text_features.T).squeeze(1)
    best_index = similarity.argmax().item()
    return image_paths[best_index]

def generate_speech(text, lang="en"):
    """Synthesize speech with gTTS and return the path of a temporary MP3 file."""
    tts = gTTS(text, lang=lang)
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    temp_audio.close()  # release the handle so gTTS can write to the path (required on Windows)
    tts.save(temp_audio.name)
    return temp_audio.name
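
# generate_speech is not wired into create_video_segments below, which keeps
# the original audio. A minimal sketch of dubbing a segment with translated
# TTS instead (assumes the English speech roughly fits the segment timing):
#
#     tts_path = generate_speech(translated, lang="en")
#     audio = AudioFileClip(tts_path)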

def create_video_segments(segments, audio_path, image_paths, output_path="final_video.mp4"):
    """Render one image-plus-subtitle clip per transcript segment and concatenate them."""
    clips = []
    source_audio = AudioFileClip(audio_path)  # open the audio once and slice it per segment
    for seg in segments:
        segment_text = seg["text"].strip()
        translated = translate_text(segment_text)
        duration = seg["end"] - seg["start"]
        matched_img = match_images(translated, image_paths)
        image_clip = ImageClip(matched_img).set_duration(duration)

        # Subtitle overlay; note that TextClip requires ImageMagick to be installed.
        txt_clip = TextClip(translated, fontsize=30, color="white", bg_color="black", size=image_clip.size)
        txt_clip = txt_clip.set_duration(duration).set_position(("center", "bottom"))

        audio = source_audio.subclip(seg["start"], seg["end"])
        final_clip = CompositeVideoClip([image_clip, txt_clip]).set_audio(audio)
        clips.append(final_clip)

    final_video = concatenate_videoclips(clips, method="compose")
    final_video.write_videofile(output_path, fps=24)
    return output_path
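
# A minimal sketch of running the pipeline end to end. The paths below are
# hypothetical placeholders; point them at a real audio file and image folder.
if __name__ == "__main__":
    audio_path = "narration.mp3"  # hypothetical input audio
    image_dir = "images"          # hypothetical folder of candidate images
    image_paths = [
        os.path.join(image_dir, name)
        for name in sorted(os.listdir(image_dir))
        if name.lower().endswith((".jpg", ".jpeg", ".png"))
    ]
    segments = transcribe_audio_segments(audio_path)
    print(create_video_segments(segments, audio_path, image_paths))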