"""Whisper Video Subtitle Generator.

Extracts audio from an uploaded video, transcribes it with OpenAI Whisper,
burns the transcript into the frames with OpenCV, and reassembles the
annotated video (with the original audio) using MoviePy. A small Gradio UI
wraps the pipeline.
"""

import os
import shutil

import cv2
import gradio as gr
import whisper
from moviepy import AudioFileClip, ImageSequenceClip, VideoFileClip
from tqdm import tqdm

# Subtitle rendering constants
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2


class VideoTranscriber:
    """Transcribes a video with Whisper and burns the subtitles into its frames."""

    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []  # [text, start_frame, end_frame] triples
        self.fps = 0
        self.char_width = 0

    def extract_audio(self):
        """Extract the video's audio track to an MP3 next to the source file."""
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        video.audio.write_audiofile(audio_path)
        self.audio_path = audio_path
        print('[INFO] Audio extracted')

    def transcribe_video(self):
        """Run Whisper on the extracted audio and map segments to frame ranges."""
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]

        # Read basic video properties needed to convert timestamps to frames.
        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        cap.release()

        # Leave a 10% horizontal margin for the rendered subtitle lines.
        effective_width = int(width - (width * 0.1))

        # Estimate average character width from a sample segment.
        sample_text = segments[0]["text"] if segments else "Sample"
        textsize = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        self.char_width = max(int(textsize[0] / len(sample_text)), 1)

        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
        print('[INFO] Transcription complete')

    def _split_text_to_lines(self, text, max_width):
        """Greedily wrap text into lines that fit within max_width pixels."""
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = (line + ' ' + word) if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines

    def extract_and_annotate_frames(self, output_dir):
        """Write every frame to output_dir, drawing the active subtitle line onto it."""
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            for text, start, end in self.text_segments:
                if start <= frame_count <= end:
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = (frame.shape[1] - text_size[0]) // 2
                    text_y = frame.shape[0] - 30
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE,
                                (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1
        cap.release()
        print('[INFO] Frame extraction complete')

    def create_annotated_video(self, output_video_path):
        """Reassemble the annotated frames into a video and attach the original audio."""
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)

        image_files = sorted(
            os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")
        )
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")

        # Clean up temporary frames and the extracted audio file.
        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')


def process_video(video_path):
    """End-to-end pipeline: extract audio, transcribe, and render the subtitled video."""
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path


# Gradio Interface
def gradio_interface(video):
    return process_video(video)


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI.",
)

if __name__ == "__main__":
    iface.launch()
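
# ---------------------------------------------------------------------------
# Example usage without the Gradio UI (a minimal sketch; "clip.mp4" is a
# placeholder file name and the import assumes this script is saved as app.py):
#
#   from app import process_video
#   output_path = process_video("clip.mp4")   # writes clip_transcribed.mp4
#   print(output_path)
#
# Dependencies inferred from the imports above: openai-whisper, moviepy>=2.0
# (for the top-level ImageSequenceClip/with_audio API), opencv-python, gradio,
# and tqdm; ffmpeg must be available on PATH for Whisper and MoviePy.
# ---------------------------------------------------------------------------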