Spaces:
Running
Running
File size: 5,483 Bytes
34b89df |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
import whisper
import os
import shutil
import cv2
from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
from tqdm import tqdm
# Caption rendering settings, shared by measurement (cv2.getTextSize) and
# drawing (cv2.putText) so computed text widths match what is rendered.
FONT = cv2.FONT_HERSHEY_SIMPLEX
FONT_SCALE = 0.8
FONT_THICKNESS = 2
class VideoTranscriber:
    """Burn Whisper-generated captions into a centre-cropped vertical video.

    Typical pipeline::

        t = VideoTranscriber("base", "input.mp4")
        t.extract_audio()
        t.transcribe_video()
        t.create_video("output.mp4")

    Intermediate artifacts (an ``audio.mp3`` and a ``frames/`` folder next to
    the input video) are created and cleaned up by the pipeline itself.
    """

    def __init__(self, model_path, video_path):
        """Load the Whisper model and remember the input video path.

        model_path: Whisper model name/path passed to whisper.load_model()
                    (e.g. "base"); loading may download weights.
        video_path: path to the source video file.
        """
        self.model = whisper.load_model(model_path)
        self.video_path = video_path
        self.audio_path = ''   # set by extract_audio()
        self.text_array = []   # [line_text, first_frame, last_frame] per caption line
        self.fps = 0           # set by transcribe_video()
        self.char_width = 0    # estimated average glyph width in pixels

    def transcribe_video(self):
        """Transcribe the extracted audio and populate self.text_array.

        Each entry is ``[line_text, start_frame, end_frame]``.  Segment text is
        greedily word-wrapped to the width of the 9:16 centre crop (minus a 10%
        margin), and each wrapped line gets a frame interval proportional to its
        share of the segment's characters.  Requires extract_audio() first.
        """
        print('Transcribing video')
        result = self.model.transcribe(self.audio_path)

        # Estimate average character width from the first segment's rendered size.
        sample_text = result["segments"][0]["text"]
        sample_size = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
        self.char_width = int(sample_size[0] / len(sample_text))

        cap = cv2.VideoCapture(self.video_path)
        src_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        src_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        self.fps = cap.get(cv2.CAP_PROP_FPS)
        cap.release()

        # Width of the 9:16 centre crop applied later by extract_frames(),
        # minus a 10% margin.  floor(floor(x)/2) == floor(x/2), so this equals
        # the width of the sliced frame without decoding one.
        asp = 16 / 9
        crop = int((src_width - 1 / asp * src_height) / 2)
        usable_width = src_width - 2 * crop
        usable_width -= usable_width * 0.1

        for segment in tqdm(result["segments"]):
            text = segment["text"]
            seg_frames = int((segment["end"] - segment["start"]) * self.fps)
            total_chars = len(text)
            words = text.split(" ")
            frame_cursor = segment["start"] * self.fps

            idx = 0
            while idx < len(words):
                word = words[idx].strip()
                if word == "":
                    idx += 1
                    continue
                # Start a line with this word, then greedily append following
                # words while they still fit in the usable pixel width.
                line = word
                remaining = usable_width - (len(word) + 1) * self.char_width
                idx += 1
                while idx < len(words):
                    need = (len(words[idx]) + 1) * self.char_width
                    if need > remaining:
                        break
                    line += " " + words[idx]
                    remaining -= need
                    idx += 1
                # The line's frame span is its share of the segment's characters.
                line_end = int(len(line) / total_chars * seg_frames) + int(frame_cursor)
                # +15 frames: fixed display offset carried over from the original
                # implementation — presumably compensates for lead-in; TODO confirm.
                self.text_array.append([line, int(frame_cursor) + 15, line_end + 15])
                frame_cursor = line_end

        print('Transcription complete')

    def extract_audio(self):
        """Extract the video's audio track to an mp3 next to the source video."""
        print('Extracting audio')
        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
        video = VideoFileClip(self.video_path)
        try:
            video.audio.write_audiofile(audio_path)
        finally:
            video.close()  # release the reader even if the write fails
        self.audio_path = audio_path
        print('Audio extracted')

    def extract_frames(self, output_folder):
        """Write every frame, centre-cropped and captioned, as numbered JPEGs.

        output_folder: existing directory that receives ``0.jpg``, ``1.jpg``, ...
        Captions come from self.text_array (built by transcribe_video()).
        """
        print('Extracting frames')
        cap = cv2.VideoCapture(self.video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        asp = width / height
        # Pixels to trim from each side for the vertical centre crop.
        crop = int((width - 1 / asp * height) / 2)

        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = frame[:, crop:width - crop]
            for line_text, first, last in self.text_array:
                if first <= frame_idx <= last:
                    # Measure and draw with the SAME font scale so the
                    # computed x-offset actually centres the rendered text.
                    text_size, _ = cv2.getTextSize(line_text, FONT, FONT_SCALE, FONT_THICKNESS)
                    text_x = int((frame.shape[1] - text_size[0]) / 2)
                    text_y = int(height / 2)
                    cv2.putText(frame, line_text, (text_x, text_y),
                                FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                    break
            cv2.imwrite(os.path.join(output_folder, str(frame_idx) + ".jpg"), frame)
            frame_idx += 1
        cap.release()
        print('Frames extracted')

    def create_video(self, output_video_path):
        """Assemble captioned frames plus the extracted audio into a video.

        output_video_path: destination file for the final render.
        Intermediate frames and the temporary mp3 are deleted on success.
        """
        print('Creating video')
        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
        os.makedirs(image_folder, exist_ok=True)
        self.extract_frames(image_folder)

        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
        # Numeric sort: "10.jpg" must come after "9.jpg".
        images.sort(key=lambda name: int(name.split(".")[0]))

        clip = ImageSequenceClip([os.path.join(image_folder, img) for img in images],
                                 fps=self.fps)
        audio = AudioFileClip(self.audio_path)
        clip = clip.set_audio(audio)
        clip.write_videofile(output_video_path)

        # Clean up intermediates.
        shutil.rmtree(image_folder)
        os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
# Example usage
# Whisper model name passed to whisper.load_model() by VideoTranscriber.__init__.
model_path = "base"
# video_path = "test_videos/videoplayback.mp4"
# Destination file for the final captioned video.
output_video_path = "output.mp4"
# output_audio_path = "test_videos/audio.mp3"
|