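"""app.py for the Auto-caption Space: a Gradio app that extracts a video's audio,
transcribes it with OpenAI Whisper, and burns the transcript into the frames as subtitles."""
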
import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm
# Constants
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
class VideoTranscriber:
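    """Transcribes a video's audio with Whisper and overlays the caption text onto each frame."""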
    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []
        self.fps = 0
        self.char_width = 0

    def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        video.close()  # release the source clip's file handle
        self.audio_path = audio_path
        print('[INFO] Audio extracted')

    def transcribe_video(self):
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        # Measure a sample segment to estimate the average character width
        sample_text = segments[0]["text"] if segments else "Sample"
        textsize = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        aspect_ratio = width / height
        cap.release()
        # Keep a 10% horizontal margin free when wrapping caption lines
        effective_width = int(width - (width * 0.1))
        self.char_width = max(int(textsize[0] / len(sample_text)), 1)
        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
        print('[INFO] Transcription complete')

    def _split_text_to_lines(self, text, max_width):
        """Greedily wrap text so each rendered line fits within max_width pixels."""
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = (line + ' ' + word) if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines

    def extract_and_annotate_frames(self, output_dir):
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Draw the first caption line whose frame range covers the current frame
            for text, start, end in self.text_segments:
                if start <= frame_count <= end:
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = (frame.shape[1] - text_size[0]) // 2  # centre the text horizontally
                    text_y = frame.shape[0] - 30  # 30 px above the bottom edge
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1
        cap.release()
        print('[INFO] Frame extraction complete')

    def create_annotated_video(self, output_video_path):
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)
        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        # Clean up the temporary frames and the intermediate audio file
        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')

def process_video(video_path):
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path

# Gradio Interface
def gradio_interface(video):
    return process_video(video)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)

if __name__ == "__main__":
    iface.launch()