import os
import shutil

import cv2
import whisper
from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm

FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2


class VideoTranscriber:
    def __init__(self, model_path, video_path):
        self.model = whisper.load_model(model_path)
        self.video_path = video_path
        self.audio_path = ''
        self.text_array = []  # one [text, start_frame, end_frame] entry per caption line
        self.fps = 0
        self.char_width = 0

    def transcribe_video(self):
        """Transcribe the extracted audio with Whisper and split each segment
        into caption lines that fit the cropped frame width.
        extract_audio() must be called first."""
        print('Transcribing video')
        result = self.model.transcribe(self.audio_path)

        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        cap.release()

        # Estimate the average character width in pixels from the first segment
        text = result["segments"][0]["text"]
        textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        self.char_width = int(textsize[0] / len(text))

        # Frames are centre-cropped to a 9:16 portrait slice in extract_frames();
        # captions must fit inside that slice, minus a 10% margin.
        asp = 16 / 9
        crop_margin = int((width - height / asp) / 2)
        width = (width - 2 * crop_margin) * 0.9

        for segment in tqdm(result["segments"]):
            text = segment["text"]
            start = segment["start"] * self.fps
            total_frames = int((segment["end"] - segment["start"]) * self.fps)
            total_chars = len(text)
            words = text.split(" ")

            i = 0
            while i < len(words):
                words[i] = words[i].strip()
                if words[i] == "":
                    i += 1
                    continue
                # Start a new caption line and greedily append words until
                # the next word would overflow the usable width.
                line = words[i]
                remaining_pixels = width - (len(line) + 1) * self.char_width
                i += 1
                while i < len(words) and remaining_pixels > 0:
                    word = words[i].strip()
                    word_pixels = (len(word) + 1) * self.char_width
                    if remaining_pixels - word_pixels < 0:
                        break  # this word starts the next line
                    line += " " + word
                    remaining_pixels -= word_pixels
                    i += 1
                # Give the line a slice of the segment's frames proportional to
                # its share of the segment's characters, shifted by 15 frames.
                line_frames = int(len(line) / total_chars * total_frames)
                self.text_array.append([line, int(start) + 15, int(start) + line_frames + 15])
                start = int(start) + line_frames

        print('Transcription complete')

    def extract_audio(self):
        """Pull the audio track out of the source video as an MP3."""
        print('Extracting audio')
        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
        video = VideoFileClip(self.video_path)
        video.audio.write_audiofile(audio_path)
        self.audio_path = audio_path
        print('Audio extracted')

    def extract_frames(self, output_folder):
        """Write each frame as a JPEG, centre-cropped to 9:16 portrait, with
        its caption (if any) drawn across the middle of the frame."""
        print('Extracting frames')
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        asp = 16 / 9  # must match the crop assumed in transcribe_video()
        crop_margin = int((width - height / asp) / 2)
        N_frames = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, crop_margin:width - crop_margin]
            for line, start_frame, end_frame in self.text_array:
                if start_frame <= N_frames <= end_frame:
                    text_size, _ = cv2.getTextSize(line, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = int((frame.shape[1] - text_size[0]) / 2)
                    text_y = int(height / 2)
                    cv2.putText(frame, line, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
            N_frames += 1

        cap.release()
        print('Frames extracted')

    def create_video(self, output_video_path):
        """Render captioned frames, stitch them into a clip, re-attach the
        audio, and clean up the intermediate files."""
        print('Creating video')
        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
        if not os.path.exists(image_folder):
            os.makedirs(image_folder)

        self.extract_frames(image_folder)

        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
        images.sort(key=lambda x: int(x.split(".")[0]))

        clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.set_audio(audio)
        clip.write_videofile(output_video_path)

        # Remove the intermediate frames and extracted audio
        shutil.rmtree(image_folder)
        os.remove(self.audio_path)


# Example usage
model_path = "base"
video_path = "test_videos/videoplayback.mp4"
output_video_path = "output.mp4"
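
# A minimal driver sketch (an assumption: the class does not enforce call
# order itself). extract_audio() must run before transcribe_video(), which
# must run before create_video(), since each step consumes state set by the
# previous one (self.audio_path, self.text_array, self.fps).
transcriber = VideoTranscriber(model_path, video_path)
transcriber.extract_audio()
transcriber.transcribe_video()
transcriber.create_video(output_video_path)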