ar08 committed
Commit 34b89df · verified · 1 Parent(s): 0c9f214

Upload 6 files

Files changed (6)
  1. .gitignore +5 -0
  2. README.md +40 -14
  3. UI.py +70 -0
  4. app.py +161 -0
  5. main.py +140 -0
  6. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+
+ __pycache__/main.cpython-311.pyc
+ test_videos
+ __pycache__/
+ output.mp4
README.md CHANGED
@@ -1,14 +1,40 @@
- ---
- title: Auto Caption
- emoji: 👀
- colorFrom: purple
- colorTo: pink
- sdk: gradio
- sdk_version: 5.29.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: Generates auto captions for shorts
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # auto-subtitle-generator
+ A program that generates subtitles in the style of Instagram and Facebook Reels, YouTube Shorts, and TikTok videos.
+ ***
+ ### Installation and usage:
+ 1. If using git to download the repo, type: `git clone https://github.com/zubu007/auto-subtitle-generator.git`
+ 2. Check whether [ffmpeg](https://ffmpeg.org) is installed on your system
+     * Open a terminal and type `ffmpeg -version`. If you get an error, you need to install ffmpeg.
+
+ 3. Install [ffmpeg](https://ffmpeg.org)
+     * On Windows
+         * Install [Chocolatey](https://chocolatey.org/install) and type `choco install ffmpeg`
+     * On Linux
+         * `sudo apt install ffmpeg`
+     * On Mac
+         * `brew install ffmpeg`
+
+ 4. Install the necessary Python packages in your environment using `pip install -r requirements.txt`
+
+ 5. Run the Python script
+     * Windows: `python GUI.py`
+     * Linux/Mac: `python3 GUI.py`
+
+
+ ***
+
+ ### TODO
+ - [ ] Control the number of words shown together with a variable
+ - [ ] Add support for multiple languages
+ - [ ] Add support for multiple video formats
+ - [ ] Add support for multiple video resolutions
+ - [ ] Add comments to the code
+ - [ ] Update this README to make it more professional
+ - [ ] Add an option to select the font color
+ - [ ] Add an option to set the font size
+
+ ### Done
+ - [x] Create a GUI for the program
+ - [x] Design UI for the program
+ - [x] Create variables for text size and font.
+
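For reference, the ffmpeg check in step 2 can also be done from Python before running any of the scripts in this commit. A minimal sketch using only the standard library (the error message wording is illustrative):

```python
import shutil
import subprocess

# Equivalent of the README's `ffmpeg -version` check: confirm ffmpeg is on PATH.
ffmpeg_path = shutil.which("ffmpeg")
if ffmpeg_path is None:
    raise SystemExit("ffmpeg not found - install it first (see step 3 of the README)")

# Print the first line of `ffmpeg -version` as a sanity check.
version = subprocess.run([ffmpeg_path, "-version"], capture_output=True, text=True)
print(version.stdout.splitlines()[0])
```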
UI.py ADDED
@@ -0,0 +1,70 @@
+ import tkinter as tk
+ from tkinter import filedialog
+
+ class VideoProcessor:
+     def __init__(self):
+         self.window = tk.Tk()
+         self.window.title("Video Processing GUI")
+
+         self.models = ["Whisper", "Model 2", "Model 3"]  # Add more models if needed
+         self.model_dropdown = tk.StringVar(self.window)
+         self.model_dropdown.set(self.models[0])  # Set the default model
+
+         self.setup_ui()
+
+     def process_video(self):
+         # Get the selected video file path
+         video_file_path = filedialog.askopenfilename()
+
+         # Get the selected model from the dropdown menu
+         selected_model = self.model_dropdown.get()
+
+         # Get the output file name and location
+         output_file_path = self.output_entry.get()
+
+         # Process the video using the selected model and output file path
+         # Add your code here
+
+         # Display a success message
+         self.result_label.config(text="Video processed successfully!")
+
+     def setup_ui(self):
+         # Create a label for the video file selection
+         video_label = tk.Label(self.window, text="Select Video File:")
+         video_label.pack()
+
+         # Create a button to browse and select the video file
+         video_button = tk.Button(self.window, text="Browse", command=self.process_video)
+         video_button.pack()
+
+         # Create a label for the model selection
+         model_label = tk.Label(self.window, text="Select Model:")
+         model_label.pack()
+
+         # Create a dropdown menu for model selection
+         model_menu = tk.OptionMenu(self.window, self.model_dropdown, *self.models)
+         model_menu.pack()
+
+         # Create a label for the output file name and location
+         output_label = tk.Label(self.window, text="Output File Name and Location:")
+         output_label.pack()
+
+         # Create an entry field for the output file name and location
+         self.output_entry = tk.Entry(self.window)
+         self.output_entry.pack()
+
+         # Create a button to start the video processing
+         process_button = tk.Button(self.window, text="Process Video", command=self.process_video)
+         process_button.pack()
+
+         # Create a label to display the result
+         self.result_label = tk.Label(self.window, text="")
+         self.result_label.pack()
+
+     def run(self):
+         # Start the GUI event loop
+         self.window.mainloop()
+
+ if __name__ == "__main__":
+     app = VideoProcessor()
+     app.run()
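`process_video` above stops at the `# Add your code here` placeholder, and the Browse button is currently wired to the same handler as the Process Video button. One possible way to fill the placeholder, reusing the `VideoTranscriber` class from `main.py` in this commit; the `CaptionApp` subclass, the hard-coded `"base"` model (mirroring `app.py`), and the `output.mp4` fallback are assumptions for illustration, not part of the uploaded file:

```python
from tkinter import filedialog

from UI import VideoProcessor        # Tkinter skeleton from this commit
from main import VideoTranscriber    # transcription pipeline from this commit


class CaptionApp(VideoProcessor):
    """Hypothetical subclass that fills in the '# Add your code here' placeholder."""

    def process_video(self):
        # The skeleton binds both Browse and Process Video to this handler,
        # so the file dialog is shown here before the pipeline runs.
        video_file_path = filedialog.askopenfilename()
        output_file_path = self.output_entry.get() or "output.mp4"

        # Run the same three-step pipeline that app.py wraps in process_video().
        transcriber = VideoTranscriber("base", video_file_path)
        transcriber.extract_audio()
        transcriber.transcribe_video()
        transcriber.create_video(output_file_path)

        self.result_label.config(text="Video processed successfully!")


if __name__ == "__main__":
    CaptionApp().run()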
app.py ADDED
@@ -0,0 +1,161 @@
+ import gradio as gr
+ import whisper
+ import os
+ import shutil
+ import cv2
+ from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
+ from tqdm import tqdm
+
+ FONT = cv2.FONT_HERSHEY_SIMPLEX
+ FONT_SCALE = 0.8
+ FONT_THICKNESS = 2
+
+ class VideoTranscriber:
+     def __init__(self, model_path, video_path):
+         self.model = whisper.load_model(model_path)
+         self.video_path = video_path
+         self.audio_path = ''
+         self.text_array = []
+         self.fps = 0
+         self.char_width = 0
+
+     def transcribe_video(self):
+         print('Transcribing video')
+         result = self.model.transcribe(self.audio_path)
+         text = result["segments"][0]["text"]
+         textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = 16/9
+         ret, frame = cap.read()
+         width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
+         width = width - (width * 0.1)
+         self.fps = cap.get(cv2.CAP_PROP_FPS)
+         self.char_width = int(textsize[0] / len(text))
+
+         for j in tqdm(result["segments"]):
+             lines = []
+             text = j["text"]
+             end = j["end"]
+             start = j["start"]
+             total_frames = int((end - start) * self.fps)
+             start = start * self.fps
+             total_chars = len(text)
+             words = text.split(" ")
+             i = 0
+
+             while i < len(words):
+                 words[i] = words[i].strip()
+                 if words[i] == "":
+                     i += 1
+                     continue
+                 length_in_pixels = (len(words[i]) + 1) * self.char_width
+                 remaining_pixels = width - length_in_pixels
+                 line = words[i]
+
+                 while remaining_pixels > 0:
+                     i += 1
+                     if i >= len(words):
+                         break
+                     length_in_pixels = (len(words[i]) + 1) * self.char_width
+                     remaining_pixels -= length_in_pixels
+                     if remaining_pixels < 0:
+                         continue
+                     else:
+                         line += " " + words[i]
+
+                 line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
+                 start = int(len(line) / total_chars * total_frames) + int(start)
+                 lines.append(line_array)
+                 self.text_array.append(line_array)
+
+         cap.release()
+         print('Transcription complete')
+
+     def extract_audio(self):
+         print('Extracting audio')
+         audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+         video = VideoFileClip(self.video_path)
+         audio = video.audio
+         audio.write_audiofile(audio_path)
+         self.audio_path = audio_path
+         print('Audio extracted')
+
+     def extract_frames(self, output_folder):
+         print('Extracting frames')
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = width / height
+         N_frames = 0
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
+
+             for i in self.text_array:
+                 if N_frames >= i[1] and N_frames <= i[2]:
+                     text = i[0]
+                     text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
+                     text_x = int((frame.shape[1] - text_size[0]) / 2)
+                     text_y = int(height/2)
+                     cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+                     break
+
+             cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
+             N_frames += 1
+
+         cap.release()
+         print('Frames extracted')
+
+     def create_video(self, output_video_path):
+         print('Creating video')
+         image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
+         if not os.path.exists(image_folder):
+             os.makedirs(image_folder)
+
+         self.extract_frames(image_folder)
+
+         images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
+         images.sort(key=lambda x: int(x.split(".")[0]))
+
+         frame = cv2.imread(os.path.join(image_folder, images[0]))
+         height, width, layers = frame.shape
+
+         clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+         audio = AudioFileClip(self.audio_path)
+         clip = clip.set_audio(audio)
+         clip.write_videofile(output_video_path)
+         shutil.rmtree(image_folder)
+         os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+
+ def process_video(video_path):
+     model_path = "base"
+     output_video_path = "output.mp4"
+
+     transcriber = VideoTranscriber(model_path, video_path)
+     transcriber.extract_audio()
+     transcriber.transcribe_video()
+     transcriber.create_video(output_video_path)
+
+     return output_video_path
+
+ # Gradio Interface
+ def gradio_interface(video):
+     output_video_path = process_video(video)
+     return output_video_path
+
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=gr.Video(label="Upload Video"),
+     outputs=gr.Video(label="Transcribed Video"),
+     title="Video Transcription App",
+     description="Upload a video to transcribe and generate a new video with subtitles."
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
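`process_video` above hard-codes the `"base"` Whisper model and writes `output.mp4` into the current working directory. Outside the Gradio UI, the same wrapper can be called directly; a minimal sketch, where the input filename is a placeholder:

```python
from app import process_video  # wrapper defined in app.py above

# Hypothetical local run: caption one clip and report where the result landed.
result_path = process_video("my_clip.mp4")
print(f"Captioned video written to {result_path}")
```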
main.py ADDED
@@ -0,0 +1,140 @@
+ import whisper
+ import os
+ import shutil
+ import cv2
+ from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
+ from tqdm import tqdm
+
+ FONT = cv2.FONT_HERSHEY_SIMPLEX
+ FONT_SCALE = 0.8
+ FONT_THICKNESS = 2
+
+ class VideoTranscriber:
+     def __init__(self, model_path, video_path):
+         self.model = whisper.load_model(model_path)
+         self.video_path = video_path
+         self.audio_path = ''
+         self.text_array = []
+         self.fps = 0
+         self.char_width = 0
+
+     def transcribe_video(self):
+         print('Transcribing video')
+         result = self.model.transcribe(self.audio_path)
+         text = result["segments"][0]["text"]
+         textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = 16/9
+         ret, frame = cap.read()
+         width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
+         width = width - (width * 0.1)
+         self.fps = cap.get(cv2.CAP_PROP_FPS)
+         self.char_width = int(textsize[0] / len(text))
+
+         for j in tqdm(result["segments"]):
+             lines = []
+             text = j["text"]
+             end = j["end"]
+             start = j["start"]
+             total_frames = int((end - start) * self.fps)
+             start = start * self.fps
+             total_chars = len(text)
+             words = text.split(" ")
+             i = 0
+
+             while i < len(words):
+                 words[i] = words[i].strip()
+                 if words[i] == "":
+                     i += 1
+                     continue
+                 length_in_pixels = (len(words[i]) + 1) * self.char_width
+                 remaining_pixels = width - length_in_pixels
+                 line = words[i]
+
+                 while remaining_pixels > 0:
+                     i += 1
+                     if i >= len(words):
+                         break
+                     length_in_pixels = (len(words[i]) + 1) * self.char_width
+                     remaining_pixels -= length_in_pixels
+                     if remaining_pixels < 0:
+                         continue
+                     else:
+                         line += " " + words[i]
+
+                 line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
+                 start = int(len(line) / total_chars * total_frames) + int(start)
+                 lines.append(line_array)
+                 self.text_array.append(line_array)
+
+         cap.release()
+         print('Transcription complete')
+
+     def extract_audio(self):
+         print('Extracting audio')
+         audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+         video = VideoFileClip(self.video_path)
+         audio = video.audio
+         audio.write_audiofile(audio_path)
+         self.audio_path = audio_path
+         print('Audio extracted')
+
+     def extract_frames(self, output_folder):
+         print('Extracting frames')
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = width / height
+         N_frames = 0
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
+
+             for i in self.text_array:
+                 if N_frames >= i[1] and N_frames <= i[2]:
+                     text = i[0]
+                     text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
+                     text_x = int((frame.shape[1] - text_size[0]) / 2)
+                     text_y = int(height/2)
+                     cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+                     break
+
+             cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
+             N_frames += 1
+
+         cap.release()
+         print('Frames extracted')
+
+     def create_video(self, output_video_path):
+         print('Creating video')
+         image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
+         if not os.path.exists(image_folder):
+             os.makedirs(image_folder)
+
+         self.extract_frames(image_folder)
+
+         images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
+         images.sort(key=lambda x: int(x.split(".")[0]))
+
+         frame = cv2.imread(os.path.join(image_folder, images[0]))
+         height, width, layers = frame.shape
+
+         clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+         audio = AudioFileClip(self.audio_path)
+         clip = clip.set_audio(audio)
+         clip.write_videofile(output_video_path)
+         shutil.rmtree(image_folder)
+         os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+
+ # Example usage
+ model_path = "base"
+ # video_path = "test_videos/videoplayback.mp4"
+ output_video_path = "output.mp4"
+ # output_audio_path = "test_videos/audio.mp3"
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ opencv-python
+ tqdm
+ openai-whisper
+ moviepy<2.0  # the code uses the 1.x API (moviepy.editor, set_audio)
+ customtkinter