ar08 committed (verified)
Commit: bd6a9e8
Parent: 778610f

Update app.py

Files changed (1)
  1. app.py +76 -106
app.py CHANGED
@@ -6,155 +6,125 @@ import cv2
 from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
 from tqdm import tqdm
 
+# Constants
 FONT = cv2.FONT_HERSHEY_SIMPLEX
 FONT_SCALE = 0.8
 FONT_THICKNESS = 2
 
 class VideoTranscriber:
-    def __init__(self, model_path, video_path):
-        self.model = whisper.load_model(model_path)
+    def __init__(self, model_name, video_path):
+        self.model = whisper.load_model(model_name)
         self.video_path = video_path
         self.audio_path = ''
-        self.text_array = []
+        self.text_segments = []
         self.fps = 0
         self.char_width = 0
 
-    def transcribe_video(self):
-        print('Transcribing video')
-        result = self.model.transcribe(self.audio_path)
-        text = result["segments"][0]["text"]
-        textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
-        cap = cv2.VideoCapture(self.video_path)
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        asp = 16/9
-        ret, frame = cap.read()
-        width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
-        width = width - (width * 0.1)
-        self.fps = cap.get(cv2.CAP_PROP_FPS)
-        self.char_width = int(textsize[0] / len(text))
-
-        for j in tqdm(result["segments"]):
-            lines = []
-            text = j["text"]
-            end = j["end"]
-            start = j["start"]
-            total_frames = int((end - start) * self.fps)
-            start = start * self.fps
-            total_chars = len(text)
-            words = text.split(" ")
-            i = 0
-
-            while i < len(words):
-                words[i] = words[i].strip()
-                if words[i] == "":
-                    i += 1
-                    continue
-                length_in_pixels = (len(words[i]) + 1) * self.char_width
-                remaining_pixels = width - length_in_pixels
-                line = words[i]
-
-                while remaining_pixels > 0:
-                    i += 1
-                    if i >= len(words):
-                        break
-                    length_in_pixels = (len(words[i]) + 1) * self.char_width
-                    remaining_pixels -= length_in_pixels
-                    if remaining_pixels < 0:
-                        continue
-                    else:
-                        line += " " + words[i]
-
-                line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
-                start = int(len(line) / total_chars * total_frames) + int(start)
-                lines.append(line_array)
-                self.text_array.append(line_array)
-
-        cap.release()
-        print('Transcription complete')
-
     def extract_audio(self):
-        print('Extracting audio')
-        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+        print('[INFO] Extracting audio...')
+        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
         video = VideoFileClip(self.video_path)
         audio = video.audio
         audio.write_audiofile(audio_path)
         self.audio_path = audio_path
-        print('Audio extracted')
+        print('[INFO] Audio extracted')
+
+    def transcribe_video(self):
+        print('[INFO] Transcribing audio...')
+        result = self.model.transcribe(self.audio_path)
+        segments = result["segments"]
+        sample_text = segments[0]["text"] if segments else "Sample"
+        textsize = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
 
-    def extract_frames(self, output_folder):
-        print('Extracting frames')
         cap = cv2.VideoCapture(self.video_path)
+        self.fps = cap.get(cv2.CAP_PROP_FPS)
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        asp = width / height
-        N_frames = 0
+        aspect_ratio = width / height
+        cap.release()
+
+        effective_width = int(width - (width * 0.1))
+        self.char_width = max(int(textsize[0] / len(sample_text)), 1)
+
+        for seg in tqdm(segments, desc="Transcribing"):
+            lines = self._split_text_to_lines(seg["text"], effective_width)
+            start_frame = int(seg["start"] * self.fps)
+            end_frame = int(seg["end"] * self.fps)
+            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
+
+        print('[INFO] Transcription complete')
+
+    def _split_text_to_lines(self, text, max_width):
+        words = text.split()
+        lines, line = [], ""
+        for word in words:
+            if cv2.getTextSize(line + ' ' + word, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
+                line += (" " if line else "") + word
+            else:
+                lines.append(line)
+                line = word
+        if line:
+            lines.append(line)
+        return lines
+
+    def extract_and_annotate_frames(self, output_dir):
+        print('[INFO] Extracting and annotating frames...')
+        os.makedirs(output_dir, exist_ok=True)
+        cap = cv2.VideoCapture(self.video_path)
+        frame_count = 0
 
         while True:
             ret, frame = cap.read()
             if not ret:
                 break
 
-            frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
-
-            for i in self.text_array:
-                if N_frames >= i[1] and N_frames <= i[2]:
-                    text = i[0]
-                    text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
-                    text_x = int((frame.shape[1] - text_size[0]) / 2)
-                    text_y = int(height/2)
-                    cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+            for text, start, end in self.text_segments:
+                if start <= frame_count <= end:
+                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
+                    text_x = (frame.shape[1] - text_size[0]) // 2
+                    text_y = frame.shape[0] - 30
+                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                     break
 
-            cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
-            N_frames += 1
+            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
+            frame_count += 1
 
         cap.release()
-        print('Frames extracted')
-
-    def create_video(self, output_video_path):
-        print('Creating video')
-        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
-        if not os.path.exists(image_folder):
-            os.makedirs(image_folder)
+        print('[INFO] Frame extraction complete')
 
-        self.extract_frames(image_folder)
+    def create_annotated_video(self, output_video_path):
+        print('[INFO] Creating final video...')
+        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
+        self.extract_and_annotate_frames(frames_dir)
 
-        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
-        images.sort(key=lambda x: int(x.split(".")[0]))
-
-        frame = cv2.imread(os.path.join(image_folder, images[0]))
-        height, width, layers = frame.shape
-
-        clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
+        clip = ImageSequenceClip(image_files, fps=self.fps)
         audio = AudioFileClip(self.audio_path)
         clip = clip.set_audio(audio)
-        clip.write_videofile(output_video_path)
-        shutil.rmtree(image_folder)
-        os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
+
+        shutil.rmtree(frames_dir)
+        os.remove(self.audio_path)
+        print('[INFO] Video created successfully')
 
-def process_video(video_path):
-    model_path = "base"
-    output_video_path = "output.mp4"
-
-    transcriber = VideoTranscriber(model_path, video_path)
+def process_video(video_path):
+    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
     transcriber.extract_audio()
     transcriber.transcribe_video()
-    transcriber.create_video(output_video_path)
-
-    return output_video_path
+    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
+    transcriber.create_annotated_video(output_path)
+    return output_path
 
 # Gradio Interface
 def gradio_interface(video):
-    output_video_path = process_video(video)
-    return output_video_path
+    return process_video(video)
 
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=gr.inputs.Video(label="Upload Video"),
-    outputs=gr.outputs.Video(label="Transcribed Video"),
-    title="Video Transcription App",
-    description="Upload a video to transcribe and generate a new video with subtitles."
+    inputs=gr.Video(label="Upload Video"),
+    outputs=gr.Video(label="Transcribed Video"),
+    title="🎬 Whisper Video Subtitle Generator",
+    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
 )
 
 if __name__ == "__main__":
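
Notes on the new code (illustrative sketches, not part of the commit):

1. `from moviepy import ...` is the moviepy 2.x import style, and moviepy 2.x renamed `set_audio` to `with_audio`, so the retained line `clip = clip.set_audio(audio)` in `create_annotated_video` may raise an AttributeError on current moviepy. A version-tolerant helper, as a minimal sketch assuming only the `clip` and `audio` objects already built in that method:

    # Sketch only: attach audio under either moviepy 1.x or 2.x.
    def attach_audio(clip, audio):
        # moviepy 2.x renamed set_audio to with_audio
        if hasattr(clip, "with_audio"):
            return clip.with_audio(audio)
        return clip.set_audio(audio)

2. The refactored pipeline can also be exercised without the Gradio UI. A minimal sketch, where "sample.mp4" is a hypothetical input file rather than anything shipped with this Space:

    # Sketch only: drive the pipeline in app.py directly.
    from app import process_video

    output = process_video("sample.mp4")  # writes sample_transcribed.mp4 next to the input
    print("Subtitled video written to:", output)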