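"""Whisper Video Subtitle Generator (Hugging Face Space).

Extracts the audio track from an uploaded video, transcribes it with OpenAI
Whisper, burns the transcript into each frame with OpenCV, and reassembles
the captioned video with moviepy behind a Gradio interface.
"""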
import gradio as gr
import whisper
import os
import shutil
import cv2
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip  # moviepy 2.x top-level imports (1.x used moviepy.editor)
from tqdm import tqdm
# Caption-rendering constants (OpenCV Hershey font)
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
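
# How these constants feed OpenCV's text metrics (illustrative, not measured):
#   (w, h), baseline = cv2.getTextSize("Hello", FONT, FONT_SCALE, FONT_THICKNESS)
# (w, h) is the rendered pixel box; it drives the centering and word wrapping below.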
class VideoTranscriber:
def __init__(self, model_name, video_path):
self.model = whisper.load_model(model_name)
self.video_path = video_path
self.audio_path = ''
self.text_segments = []
self.fps = 0
def extract_audio(self):
        print('[INFO] Extracting audio...')
        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
        video = VideoFileClip(self.video_path)
        video.audio.write_audiofile(audio_path)
        video.close()  # release the underlying file handle
        self.audio_path = audio_path
        print('[INFO] Audio extracted')
def transcribe_video(self):
print('[INFO] Transcribing audio...')
result = self.model.transcribe(self.audio_path)
segments = result["segments"]
cap = cv2.VideoCapture(self.video_path)
self.fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
cap.release()
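        # Note: CAP_PROP_FPS is a float (e.g. 29.97); it is kept unrounded so the
        # segment start/end timestamps below map to frame indices without drift.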
        effective_width = int(width * 0.9)  # leave a 10% horizontal margin for captions
for seg in tqdm(segments, desc="Transcribing"):
lines = self._split_text_to_lines(seg["text"], effective_width)
start_frame = int(seg["start"] * self.fps)
end_frame = int(seg["end"] * self.fps)
self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
print('[INFO] Transcription complete')
    def _split_text_to_lines(self, text, max_width):
        """Greedy word wrap based on rendered pixel width."""
        words = text.split()
        lines, line = [], ""
        for word in words:
            candidate = f"{line} {word}".strip()
            if cv2.getTextSize(candidate, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
                line = candidate
            else:
                if line:  # avoid emitting an empty line when a single word overflows
                    lines.append(line)
                line = word
        if line:
            lines.append(line)
        return lines
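
    # Sketch of the wrapping behaviour (hypothetical pixel width; the actual
    # split depends on cv2.getTextSize for this font and scale):
    #   self._split_text_to_lines("the quick brown fox jumps", 200)
    #   -> ["the quick", "brown fox", "jumps"]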
def extract_and_annotate_frames(self, output_dir):
print('[INFO] Extracting and annotating frames...')
os.makedirs(output_dir, exist_ok=True)
cap = cv2.VideoCapture(self.video_path)
frame_count = 0
while True:
ret, frame = cap.read()
if not ret:
break
            # Draw every caption line active for this frame, stacked upward from
            # the bottom so wrapped segment lines are not silently dropped.
            active = [text for text, start, end in self.text_segments
                      if start <= frame_count <= end]
            line_height = cv2.getTextSize("Ag", FONT, FONT_SCALE, FONT_THICKNESS)[0][1] + 12
            for i, text in enumerate(reversed(active)):
                text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                text_x = (frame.shape[1] - text_size[0]) // 2
                text_y = frame.shape[0] - 30 - i * line_height
                cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
frame_count += 1
cap.release()
print('[INFO] Frame extraction complete')
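
    # Design note: spilling frames to disk as JPEGs keeps memory flat for long
    # videos at the cost of temporary disk space; ImageSequenceClip streams
    # them back in order when the final video is assembled.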
def create_annotated_video(self, output_video_path):
print('[INFO] Creating final video...')
frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
self.extract_and_annotate_frames(frames_dir)
image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
clip = ImageSequenceClip(image_files, fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.with_audio(audio)
        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
        clip.close()
        audio.close()
        shutil.rmtree(frames_dir)  # remove temporary frame images
        os.remove(self.audio_path)  # remove temporary audio file
print('[INFO] Video created successfully')
def process_video(video_path):
transcriber = VideoTranscriber(model_name="base", video_path=video_path)
transcriber.extract_audio()
transcriber.transcribe_video()
output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
transcriber.create_annotated_video(output_path)
return output_path
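
# Illustrative direct usage outside Gradio (hypothetical file name):
#   process_video("demo.mp4")  # writes and returns "demo_transcribed.mp4"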
# Gradio Interface
def gradio_interface(video):
return process_video(video)
iface = gr.Interface(
fn=gradio_interface,
inputs=gr.Video(label="Upload Video"),
outputs=gr.Video(label="Transcribed Video"),
title="🎬 Whisper Video Subtitle Generator",
description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
)
if __name__ == "__main__":
    iface.launch()
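    # On hosts that support tunnelling, a public link can be requested instead:
    #   iface.launch(share=True)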