import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip  # MoviePy >= 2.0 import style
from tqdm import tqdm

# Caption rendering constants (OpenCV Hershey font)
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2

class VideoTranscriber:
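    """Transcribe a video's audio with Whisper, burn the resulting captions
    onto each frame with OpenCV, and reassemble the annotated video."""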
    def __init__(self, model_name, video_path):
        self.model = whisper.load_model(model_name)
        self.video_path = video_path
        self.audio_path = ''
        self.text_segments = []  # entries of [caption_line, start_frame, end_frame]
        self.fps = 0
        self.char_width = 0  # approximate pixel width of one character in the caption font

    def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        self.audio_path = audio_path
        print('[INFO] Audio extracted')

    def transcribe_video(self):
        print('[INFO] Transcribing audio...')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        sample_text = segments[0]["text"] if segments else "Sample"
        textsize = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]

        cap = cv2.VideoCapture(self.video_path)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        aspect_ratio = width / height  # computed for reference; not used below
        cap.release()

        effective_width = int(width - (width * 0.1))
        self.char_width = max(int(textsize[0] / len(sample_text)), 1)

        for seg in tqdm(segments, desc="Transcribing"):
            lines = self._split_text_to_lines(seg["text"], effective_width)
            start_frame = int(seg["start"] * self.fps)
            end_frame = int(seg["end"] * self.fps)
            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])

        print('[INFO] Transcription complete')

    def _split_text_to_lines(self, text, max_width):
        # Greedy word wrap: grow each line until its rendered width would exceed max_width.
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = f"{line} {word}" if line else word
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines

    def extract_and_annotate_frames(self, output_dir):
        print('[INFO] Extracting and annotating frames...')
        os.makedirs(output_dir, exist_ok=True)
        cap = cv2.VideoCapture(self.video_path)
        frame_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Draw the first caption line whose frame range covers this frame
            # (only one line is rendered per frame, even if wrapping produced several).
            for text, start, end in self.text_segments:
                if start <= frame_count <= end:
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = (frame.shape[1] - text_size[0]) // 2
                    text_y = frame.shape[0] - 30
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break

            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
            frame_count += 1

        cap.release()
        print('[INFO] Frame extraction complete')

    def create_annotated_video(self, output_video_path):
        print('[INFO] Creating final video...')
        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
        self.extract_and_annotate_frames(frames_dir)

        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
        clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")

        shutil.rmtree(frames_dir)
        os.remove(self.audio_path)
        print('[INFO] Video created successfully')

def process_video(video_path):
    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
    transcriber.extract_audio()
    transcriber.transcribe_video()
    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
    transcriber.create_annotated_video(output_path)
    return output_path

# Gradio Interface
def gradio_interface(video):
    return process_video(video)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="🎬 Whisper Video Subtitle Generator",
    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)

if __name__ == "__main__":
    iface.launch()
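
# Usage note (assumed environment, not part of the original script): run the file
# directly (e.g. `python app.py`) and open the Gradio URL it prints, typically
# http://127.0.0.1:7860. This sketch assumes `gradio`, `openai-whisper`, `moviepy`,
# `opencv-python`, and `tqdm` are installed, and that ffmpeg is on PATH, since both
# Whisper and MoviePy rely on it for audio/video decoding and encoding.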