meraj12 committed · verified
Commit f998b37 · 1 Parent(s): b76d606

Create utils.py

Files changed (1)
  1. utils.py +65 -0
utils.py ADDED
@@ -0,0 +1,65 @@
import tempfile

import clip
import torch
import whisper
from PIL import Image
from gtts import gTTS
from langdetect import detect
from moviepy.editor import (
    AudioFileClip,
    CompositeVideoClip,
    ImageClip,
    TextClip,
    concatenate_videoclips,
)
from transformers import pipeline

# Load all models once at import time, on GPU when available.
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)
whisper_model = whisper.load_model("small")
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


def transcribe_audio_segments(audio_path):
    """Transcribe an audio file with Whisper and return its timestamped segments."""
    result = whisper_model.transcribe(audio_path, word_timestamps=True)
    return result["segments"]


def translate_text(text, target_lang="en"):
    """Translate text to target_lang unless it is already in that language."""
    if detect(text) != target_lang:
        return translator(text)[0]["translation_text"]
    return text


def match_images(text, image_paths):
    """Return the path of the image whose CLIP embedding best matches the text."""
    text_tokens = clip.tokenize([text]).to(device)
    images = [preprocess(Image.open(path)).unsqueeze(0).to(device) for path in image_paths]
    images_tensor = torch.cat(images)
    with torch.no_grad():
        text_features = clip_model.encode_text(text_tokens)
        image_features = clip_model.encode_image(images_tensor)
        # Normalize both embeddings so the dot product is a cosine similarity;
        # otherwise images with larger feature norms would dominate the ranking.
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        similarity = (image_features @ text_features.T).squeeze(-1)
    best_index = similarity.argmax().item()
    return image_paths[best_index]


def generate_speech(text, lang="en"):
    """Synthesize speech with gTTS and return the path of a temporary MP3 file."""
    tts = gTTS(text, lang=lang)
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    tts.save(temp_audio.name)
    return temp_audio.name


def create_video_segments(segments, audio_path, image_paths, output_path="final_video.mp4"):
    """Build one captioned image clip per transcript segment and concatenate them."""
    clips = []
    for seg in segments:
        segment_text = seg["text"].strip()
        translated = translate_text(segment_text)
        duration = seg["end"] - seg["start"]

        # Pick the best-matching image and hold it for the segment's duration.
        matched_img = match_images(translated, image_paths)
        image_clip = ImageClip(matched_img).set_duration(duration)

        # Caption rendering via TextClip requires ImageMagick to be installed.
        txt_clip = TextClip(translated, fontsize=30, color="white", bg_color="black", size=image_clip.size)
        txt_clip = txt_clip.set_duration(duration).set_position(("center", "bottom"))

        # Pair the composite with the matching slice of the original audio.
        audio = AudioFileClip(audio_path).subclip(seg["start"], seg["end"])
        final_clip = CompositeVideoClip([image_clip, txt_clip]).set_audio(audio)
        clips.append(final_clip)

    final_video = concatenate_videoclips(clips, method="compose")
    final_video.write_videofile(output_path, fps=24)
    return output_path
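A minimal usage sketch for these helpers (the audio and image file names below are hypothetical, not part of the commit):

    segments = transcribe_audio_segments("narration.mp3")
    image_paths = ["frame1.jpg", "frame2.jpg", "frame3.jpg"]
    create_video_segments(segments, "narration.mp3", image_paths, output_path="final_video.mp4")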