import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm
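
# NOTE: the bare `from moviepy import ...` above and the `.with_audio()` call
# below require MoviePy 2.x; MoviePy 1.x used `from moviepy.editor import ...`
# and `.set_audio()` instead.
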
# Constants
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
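
# Transcribes a video's audio track with Whisper and burns the resulting
# text onto each frame as subtitles.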
class VideoTranscriber:
    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []  # [text, start_frame, end_frame] triples
        self.fps = 0
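
    # Pull the audio track out of the source video so Whisper can transcribe it.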
    def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        video.audio.write_audiofile(audio_path)
        video.close()
        self.audio_path = audio_path
        print('[INFO] Audio extracted')
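
    # Run Whisper on the extracted audio and convert each timed segment into
    # frame-indexed subtitle lines that fit within 90% of the frame width.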
    def transcribe_video(self):
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        cap.release()
        effective_width = int(width * 0.9)  # leave a 10% horizontal margin
        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
        print('[INFO] Transcription complete')
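
    # Greedy word wrap: measure each candidate line with cv2.getTextSize and
    # start a new line once the rendered width would exceed max_width.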
    def _split_text_to_lines(self, text, max_width):
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = (line + " " + word) if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines
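
    # Decode the video frame by frame, draw every subtitle line active at that
    # frame near the bottom of the image, and dump the frames as numbered JPEGs.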
    def extract_and_annotate_frames(self, output_dir):
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Collect every subtitle line active at this frame; a wrapped
            # segment contributes several lines with the same time range.
            active_lines = [text for text, start, end in self.text_segments
                            if start <= frame_count <= end]
            for i, text in enumerate(active_lines):
                text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                text_x = (frame.shape[1] - text_size[0]) // 2
                # Stack lines upward from 30 px above the bottom edge.
                text_y = frame.shape[0] - 30 - (len(active_lines) - 1 - i) * (text_size[1] + 10)
                cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1
        cap.release()
        print('[INFO] Frame extraction complete')
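
    # Rebuild a video from the annotated frames, reattach the original audio,
    # and clean up the temporary frames directory and audio file.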
    def create_annotated_video(self, output_video_path):
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)
        image_files = sorted(os.path.join(frames_dir, f)
                             for f in os.listdir(frames_dir) if f.endswith(".jpg"))
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        clip.close()
        audio.close()
        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')
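
# Convenience wrapper: run the full pipeline on one video and return the
# path of the subtitled output file.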
def process_video(video_path):
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path

# Gradio Interface
def gradio_interface(video):
    return process_video(video)


iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)

if __name__ == "__main__":
    iface.launch()