# Video caption burner: transcribes a video's audio with OpenAI Whisper and
# renders the resulting text onto the video frames, then reassembles the
# captioned video with its original audio track.
import whisper | |
import os | |
import shutil | |
import cv2 | |
from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip | |
from tqdm import tqdm | |
# Font settings used to MEASURE caption text width in transcribe_video().
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
class VideoTranscriber:
    """Burn Whisper-generated captions into a video.

    Typical flow:
        t = VideoTranscriber("base", "input.mp4")
        t.extract_audio()
        t.transcribe_video()
        t.create_video("output.mp4")
    """

    def __init__(self, model_path, video_path):
        # model_path: Whisper model name or path (e.g. "base"); loaded eagerly.
        self.model = whisper.load_model(model_path)
        self.video_path = video_path
        self.audio_path = ''    # set by extract_audio()
        self.text_array = []    # [line_text, start_frame, end_frame] per caption line
        self.fps = 0            # set by transcribe_video()
        self.char_width = 0     # estimated average pixel width of one character

    def transcribe_video(self):
        """Transcribe the extracted audio and build per-line caption frame spans.

        Populates ``self.text_array`` with ``[line_text, start_frame, end_frame]``
        entries. Requires ``extract_audio()`` to have been called first.
        """
        print('Transcribing video')
        result = self.model.transcribe(self.audio_path)
        segments = result["segments"]
        if not segments:
            # Silent/empty audio: nothing to caption; leave text_array empty.
            print('Transcription complete')
            return
        text = segments[0]["text"]
        textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        asp = 16 / 9
        ret, frame = cap.read()
        # Width of the centre crop that extract_frames() will produce, minus a
        # 10% margin so caption lines never touch the frame edges.
        # NOTE(review): this crop formula mirrors extract_frames(); it appears
        # to assume a landscape source wider than the derived crop — confirm
        # behaviour for portrait/square input.
        width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
        width = width - (width * 0.1)
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        # Average character width in pixels; guard against an empty first segment.
        self.char_width = int(textsize[0] / max(len(text), 1))

        for segment in tqdm(result["segments"]):
            text = segment["text"]
            end = segment["end"]
            start = segment["start"]
            total_frames = int((end - start) * self.fps)
            start = start * self.fps
            total_chars = len(text)
            words = text.split(" ")
            i = 0
            while i < len(words):
                words[i] = words[i].strip()
                if words[i] == "":
                    i += 1
                    continue
                length_in_pixels = (len(words[i]) + 1) * self.char_width
                remaining_pixels = width - length_in_pixels
                line = words[i]
                # Greedily pack the following words onto this line while they fit.
                while remaining_pixels > 0:
                    i += 1
                    if i >= len(words):
                        break
                    length_in_pixels = (len(words[i]) + 1) * self.char_width
                    remaining_pixels -= length_in_pixels
                    if remaining_pixels < 0:
                        # Word i did not fit: leave it for the outer loop to
                        # start the next line with.
                        continue
                    else:
                        line += " " + words[i]
                # Frame span proportional to this line's share of the segment's
                # characters; the +15 offset delays caption onset slightly.
                line_array = [line, int(start) + 15,
                              int(len(line) / total_chars * total_frames) + int(start) + 15]
                start = int(len(line) / total_chars * total_frames) + int(start)
                self.text_array.append(line_array)
        cap.release()
        print('Transcription complete')

    def extract_audio(self):
        """Extract the video's audio track to an MP3 next to the video file."""
        print('Extracting audio')
        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
        video = VideoFileClip(self.video_path)
        audio = video.audio
        audio.write_audiofile(audio_path)
        # Release the reader's file handles (original leaked them).
        video.close()
        self.audio_path = audio_path
        print('Audio extracted')

    def extract_frames(self, output_folder):
        """Write every frame, centre-cropped and captioned, as numbered JPEGs.

        Frames are saved as ``<index>.jpg`` in *output_folder*; captions from
        ``self.text_array`` are drawn on frames whose index falls inside a
        line's ``[start_frame, end_frame]`` span.
        """
        print('Extracting frames')
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        asp = width / height
        n_frames = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Same centre-crop as transcribe_video() so caption widths match.
            frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
            for entry in self.text_array:
                if entry[1] <= n_frames <= entry[2]:
                    text = entry[0]
                    # Measure and draw with the SAME font constants; the original
                    # measured at 0.8 but drew at 0.75, leaving text off-centre.
                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = int((frame.shape[1] - text_size[0]) / 2)  # horizontally centred
                    text_y = int(height / 2)
                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_folder, str(n_frames) + ".jpg"), frame)
            n_frames += 1
        cap.release()
        print('Frames extracted')

    def create_video(self, output_video_path):
        """Assemble the captioned frames plus the original audio into a video.

        Runs the frame-extraction step into a temporary "frames" folder, builds
        the clip, then removes the temporary frames and the extracted audio.
        """
        print('Creating video')
        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
        os.makedirs(image_folder, exist_ok=True)
        self.extract_frames(image_folder)
        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
        # Numeric sort so "10.jpg" comes after "9.jpg", not after "1.jpg".
        images.sort(key=lambda x: int(x.split(".")[0]))
        clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.set_audio(audio)
        clip.write_videofile(output_video_path)
        # Clean up the temporary frames and the audio file from extract_audio().
        shutil.rmtree(image_folder)
        os.remove(self.audio_path)
# Example usage
model_path = "base"  # Whisper model size passed to VideoTranscriber
# video_path = "test_videos/videoplayback.mp4"
output_video_path = "output.mp4"  # destination for create_video()
# output_audio_path = "test_videos/audio.mp3"