import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip  # MoviePy >= 2.0 import style
from tqdm import tqdm

# Caption rendering constants (OpenCV Hershey font)
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2

class VideoTranscriber:
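    """Transcribe a video's audio with Whisper, burn the resulting captions
    onto each frame with OpenCV, and reassemble the annotated video."""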
    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []  # entries of [caption_line, start_frame, end_frame]
        self.fps = 0
        self.char_width = 0  # approximate pixel width of one character in the caption font

    def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        self.audio_path = audio_path
        print('[INFO] Audio extracted')

    def transcribe_video(self):
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        sample_text = segments[0]["text"] if segments else "Sample"
        textsize = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]

        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        aspect_ratio = width / height  # computed for reference; not used below
        cap.release()

        effective_width = int(width - (width * 0.1))
        self.char_width = max(int(textsize[0] / len(sample_text)), 1)

        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])

        print('[INFO] Transcription complete')

    def _split_text_to_lines(self, text, max_width):
        # Greedy word wrap: grow each line until its rendered width would exceed max_width.
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = f"{line} {word}" if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines

    def extract_and_annotate_frames(self, output_dir):
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Draw the first caption line whose frame range covers this frame
            # (only one line is rendered per frame, even if wrapping produced several).
            for text, start, end in self.text_segments:
                if start <= frame_count <= end:
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = (frame.shape[1] - text_size[0]) // 2
                    text_y = frame.shape[0] - 30
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break

            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1

        cap.release()
        print('[INFO] Frame extraction complete')

    def create_annotated_video(self, output_video_path):
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)

        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")

        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')

def process_video(video_path):
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path

# Gradio Interface
def gradio_interface(video):
    return process_video(video)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)

if __name__ == "__main__":
    iface.launch()
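
# Usage note (assumed environment, not part of the original script): run the file
# directly (e.g. `python app.py`) and open the Gradio URL it prints, typically
# http://127.0.0.1:7860. This sketch assumes `gradio`, `openai-whisper`, `moviepy`,
# `opencv-python`, and `tqdm` are installed, and that ffmpeg is on PATH, since both
# Whisper and MoviePy rely on it for audio/video decoding and encoding.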