ar08 committed
Commit 34b89df · verified · 1 Parent(s): 0c9f214

Upload 6 files

Files changed (6)
  1. .gitignore +5 -0
  2. README.md +40 -14
  3. UI.py +70 -0
  4. app.py +161 -0
  5. main.py +140 -0
  6. requirements.txt +5 -0
.gitignore ADDED
@@ -0,0 +1,5 @@
+
+ __pycache__/main.cpython-311.pyc
+ test_videos
+ __pycache__/
+ output.mp4
README.md CHANGED
@@ -1,14 +1,40 @@
- ---
- title: Auto Caption
- emoji: 👀
- colorFrom: purple
- colorTo: pink
- sdk: gradio
- sdk_version: 5.29.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- short_description: Generates auto captions for shorts
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # auto-subtitle-generator
+ A program that generates subtitles in the style of Instagram and Facebook Reels, YouTube Shorts, and TikTok videos.
+ ***
+ ### Installation and usage:
+ 1. If using git to download the repo, type: `git clone https://github.com/zubu007/auto-subtitle-generator.git`
+ 2. Check whether [ffmpeg](https://ffmpeg.org) is installed on your system
+     * Open a terminal and type `ffmpeg -version`. If you get an error, you need to install ffmpeg.
+
+ 3. Install [ffmpeg](https://ffmpeg.org)
+     * On Windows
+         * Install [Chocolatey](https://chocolatey.org/install) and type `choco install ffmpeg`
+     * On Linux
+         * `sudo apt install ffmpeg`
+     * On Mac
+         * `brew install ffmpeg`
+
+ 4. Install the necessary Python packages in your environment using `pip install -r requirements.txt`
+
+ 5. Run the Python script
+     * Windows: `python GUI.py`
+     * Linux/Mac: `python3 GUI.py`
+
+
+ ***
+
+ ### TODO
+ - [ ] Control the number of words shown together with a variable
+ - [ ] Add support for multiple languages
+ - [ ] Add support for multiple video formats
+ - [ ] Add support for multiple video resolutions
+ - [ ] Add comments to the code
+ - [ ] Update this README to make it more professional
+ - [ ] Add an option to select the font color
+ - [ ] Add an option to set the font size
+
+ ### Done
+ - [x] Create a GUI for the program
+ - [x] Design UI for the program
+ - [x] Create variables for text size and font.
+
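For reference, the ffmpeg check in step 2 can also be done from Python before running any of the scripts in this commit. A minimal sketch using only the standard library (the error message wording is illustrative):

```python
import shutil
import subprocess

# Equivalent of the README's `ffmpeg -version` check: confirm ffmpeg is on PATH.
ffmpeg_path = shutil.which("ffmpeg")
if ffmpeg_path is None:
    raise SystemExit("ffmpeg not found - install it first (see step 3 of the README)")

# Print the first line of `ffmpeg -version` as a sanity check.
version = subprocess.run([ffmpeg_path, "-version"], capture_output=True, text=True)
print(version.stdout.splitlines()[0])
```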
UI.py ADDED
@@ -0,0 +1,70 @@
+ import tkinter as tk
+ from tkinter import filedialog
+
+ class VideoProcessor:
+     def __init__(self):
+         self.window = tk.Tk()
+         self.window.title("Video Processing GUI")
+
+         self.models = ["Whisper", "Model 2", "Model 3"]  # Add more models if needed
+         self.model_dropdown = tk.StringVar(self.window)
+         self.model_dropdown.set(self.models[0])  # Set the default model
+
+         self.setup_ui()
+
+     def process_video(self):
+         # Get the selected video file path
+         video_file_path = filedialog.askopenfilename()
+
+         # Get the selected model from the dropdown menu
+         selected_model = self.model_dropdown.get()
+
+         # Get the output file name and location
+         output_file_path = self.output_entry.get()
+
+         # Process the video using the selected model and output file path
+         # Add your code here
+
+         # Display a success message
+         self.result_label.config(text="Video processed successfully!")
+
+     def setup_ui(self):
+         # Create a label for the video file selection
+         video_label = tk.Label(self.window, text="Select Video File:")
+         video_label.pack()
+
+         # Create a button to browse and select the video file
+         video_button = tk.Button(self.window, text="Browse", command=self.process_video)
+         video_button.pack()
+
+         # Create a label for the model selection
+         model_label = tk.Label(self.window, text="Select Model:")
+         model_label.pack()
+
+         # Create a dropdown menu for model selection
+         model_menu = tk.OptionMenu(self.window, self.model_dropdown, *self.models)
+         model_menu.pack()
+
+         # Create a label for the output file name and location
+         output_label = tk.Label(self.window, text="Output File Name and Location:")
+         output_label.pack()
+
+         # Create an entry field for the output file name and location
+         self.output_entry = tk.Entry(self.window)
+         self.output_entry.pack()
+
+         # Create a button to start the video processing
+         process_button = tk.Button(self.window, text="Process Video", command=self.process_video)
+         process_button.pack()
+
+         # Create a label to display the result
+         self.result_label = tk.Label(self.window, text="")
+         self.result_label.pack()
+
+     def run(self):
+         # Start the GUI event loop
+         self.window.mainloop()
+
+ if __name__ == "__main__":
+     app = VideoProcessor()
+     app.run()
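`process_video` above stops at the `# Add your code here` placeholder, and the Browse button is currently wired to the same handler as the Process Video button. One possible way to fill the placeholder, reusing the `VideoTranscriber` class from `main.py` in this commit; the `CaptionApp` subclass, the hard-coded `"base"` model (mirroring `app.py`), and the `output.mp4` fallback are assumptions for illustration, not part of the uploaded file:

```python
from tkinter import filedialog

from UI import VideoProcessor        # Tkinter skeleton from this commit
from main import VideoTranscriber    # transcription pipeline from this commit


class CaptionApp(VideoProcessor):
    """Hypothetical subclass that fills in the '# Add your code here' placeholder."""

    def process_video(self):
        # The skeleton binds both Browse and Process Video to this handler,
        # so the file dialog is shown here before the pipeline runs.
        video_file_path = filedialog.askopenfilename()
        output_file_path = self.output_entry.get() or "output.mp4"

        # Run the same three-step pipeline that app.py wraps in process_video().
        transcriber = VideoTranscriber("base", video_file_path)
        transcriber.extract_audio()
        transcriber.transcribe_video()
        transcriber.create_video(output_file_path)

        self.result_label.config(text="Video processed successfully!")


if __name__ == "__main__":
    CaptionApp().run()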
app.py ADDED
@@ -0,0 +1,161 @@
+ import gradio as gr
+ import whisper
+ import os
+ import shutil
+ import cv2
+ from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
+ from tqdm import tqdm
+
+ FONT = cv2.FONT_HERSHEY_SIMPLEX
+ FONT_SCALE = 0.8
+ FONT_THICKNESS = 2
+
+ class VideoTranscriber:
+     def __init__(self, model_path, video_path):
+         self.model = whisper.load_model(model_path)
+         self.video_path = video_path
+         self.audio_path = ''
+         self.text_array = []
+         self.fps = 0
+         self.char_width = 0
+
+     def transcribe_video(self):
+         print('Transcribing video')
+         result = self.model.transcribe(self.audio_path)
+         text = result["segments"][0]["text"]
+         textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = 16/9
+         ret, frame = cap.read()
+         width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
+         width = width - (width * 0.1)
+         self.fps = cap.get(cv2.CAP_PROP_FPS)
+         self.char_width = int(textsize[0] / len(text))
+
+         for j in tqdm(result["segments"]):
+             lines = []
+             text = j["text"]
+             end = j["end"]
+             start = j["start"]
+             total_frames = int((end - start) * self.fps)
+             start = start * self.fps
+             total_chars = len(text)
+             words = text.split(" ")
+             i = 0
+
+             while i < len(words):
+                 words[i] = words[i].strip()
+                 if words[i] == "":
+                     i += 1
+                     continue
+                 length_in_pixels = (len(words[i]) + 1) * self.char_width
+                 remaining_pixels = width - length_in_pixels
+                 line = words[i]
+
+                 while remaining_pixels > 0:
+                     i += 1
+                     if i >= len(words):
+                         break
+                     length_in_pixels = (len(words[i]) + 1) * self.char_width
+                     remaining_pixels -= length_in_pixels
+                     if remaining_pixels < 0:
+                         continue
+                     else:
+                         line += " " + words[i]
+
+                 line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
+                 start = int(len(line) / total_chars * total_frames) + int(start)
+                 lines.append(line_array)
+                 self.text_array.append(line_array)
+
+         cap.release()
+         print('Transcription complete')
+
+     def extract_audio(self):
+         print('Extracting audio')
+         audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+         video = VideoFileClip(self.video_path)
+         audio = video.audio
+         audio.write_audiofile(audio_path)
+         self.audio_path = audio_path
+         print('Audio extracted')
+
+     def extract_frames(self, output_folder):
+         print('Extracting frames')
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = width / height
+         N_frames = 0
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
+
+             for i in self.text_array:
+                 if N_frames >= i[1] and N_frames <= i[2]:
+                     text = i[0]
+                     text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
+                     text_x = int((frame.shape[1] - text_size[0]) / 2)
+                     text_y = int(height/2)
+                     cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+                     break
+
+             cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
+             N_frames += 1
+
+         cap.release()
+         print('Frames extracted')
+
+     def create_video(self, output_video_path):
+         print('Creating video')
+         image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
+         if not os.path.exists(image_folder):
+             os.makedirs(image_folder)
+
+         self.extract_frames(image_folder)
+
+         images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
+         images.sort(key=lambda x: int(x.split(".")[0]))
+
+         frame = cv2.imread(os.path.join(image_folder, images[0]))
+         height, width, layers = frame.shape
+
+         clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+         audio = AudioFileClip(self.audio_path)
+         clip = clip.set_audio(audio)
+         clip.write_videofile(output_video_path)
+         shutil.rmtree(image_folder)
+         os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+
+ def process_video(video_path):
+     model_path = "base"
+     output_video_path = "output.mp4"
+
+     transcriber = VideoTranscriber(model_path, video_path)
+     transcriber.extract_audio()
+     transcriber.transcribe_video()
+     transcriber.create_video(output_video_path)
+
+     return output_video_path
+
+ # Gradio Interface
+ def gradio_interface(video):
+     output_video_path = process_video(video)
+     return output_video_path
+
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=gr.Video(label="Upload Video"),
+     outputs=gr.Video(label="Transcribed Video"),
+     title="Video Transcription App",
+     description="Upload a video to transcribe and generate a new video with subtitles."
+ )
+
+ if __name__ == "__main__":
+     iface.launch()
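`process_video` above hard-codes the `"base"` Whisper model and writes `output.mp4` into the current working directory. Outside the Gradio UI, the same wrapper can be called directly; a minimal sketch, where the input filename is a placeholder:

```python
from app import process_video  # wrapper defined in app.py above

# Hypothetical local run: caption one clip and report where the result landed.
result_path = process_video("my_clip.mp4")
print(f"Captioned video written to {result_path}")
```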
main.py ADDED
@@ -0,0 +1,140 @@
+ import whisper
+ import os
+ import shutil
+ import cv2
+ from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
+ from tqdm import tqdm
+
+ FONT = cv2.FONT_HERSHEY_SIMPLEX
+ FONT_SCALE = 0.8
+ FONT_THICKNESS = 2
+
+ class VideoTranscriber:
+     def __init__(self, model_path, video_path):
+         self.model = whisper.load_model(model_path)
+         self.video_path = video_path
+         self.audio_path = ''
+         self.text_array = []
+         self.fps = 0
+         self.char_width = 0
+
+     def transcribe_video(self):
+         print('Transcribing video')
+         result = self.model.transcribe(self.audio_path)
+         text = result["segments"][0]["text"]
+         textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = 16/9
+         ret, frame = cap.read()
+         width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
+         width = width - (width * 0.1)
+         self.fps = cap.get(cv2.CAP_PROP_FPS)
+         self.char_width = int(textsize[0] / len(text))
+
+         for j in tqdm(result["segments"]):
+             lines = []
+             text = j["text"]
+             end = j["end"]
+             start = j["start"]
+             total_frames = int((end - start) * self.fps)
+             start = start * self.fps
+             total_chars = len(text)
+             words = text.split(" ")
+             i = 0
+
+             while i < len(words):
+                 words[i] = words[i].strip()
+                 if words[i] == "":
+                     i += 1
+                     continue
+                 length_in_pixels = (len(words[i]) + 1) * self.char_width
+                 remaining_pixels = width - length_in_pixels
+                 line = words[i]
+
+                 while remaining_pixels > 0:
+                     i += 1
+                     if i >= len(words):
+                         break
+                     length_in_pixels = (len(words[i]) + 1) * self.char_width
+                     remaining_pixels -= length_in_pixels
+                     if remaining_pixels < 0:
+                         continue
+                     else:
+                         line += " " + words[i]
+
+                 line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
+                 start = int(len(line) / total_chars * total_frames) + int(start)
+                 lines.append(line_array)
+                 self.text_array.append(line_array)
+
+         cap.release()
+         print('Transcription complete')
+
+     def extract_audio(self):
+         print('Extracting audio')
+         audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+         video = VideoFileClip(self.video_path)
+         audio = video.audio
+         audio.write_audiofile(audio_path)
+         self.audio_path = audio_path
+         print('Audio extracted')
+
+     def extract_frames(self, output_folder):
+         print('Extracting frames')
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = width / height
+         N_frames = 0
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
+
+             for i in self.text_array:
+                 if N_frames >= i[1] and N_frames <= i[2]:
+                     text = i[0]
+                     text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
+                     text_x = int((frame.shape[1] - text_size[0]) / 2)
+                     text_y = int(height/2)
+                     cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+                     break
+
+             cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
+             N_frames += 1
+
+         cap.release()
+         print('Frames extracted')
+
+     def create_video(self, output_video_path):
+         print('Creating video')
+         image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
+         if not os.path.exists(image_folder):
+             os.makedirs(image_folder)
+
+         self.extract_frames(image_folder)
+
+         images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
+         images.sort(key=lambda x: int(x.split(".")[0]))
+
+         frame = cv2.imread(os.path.join(image_folder, images[0]))
+         height, width, layers = frame.shape
+
+         clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+         audio = AudioFileClip(self.audio_path)
+         clip = clip.set_audio(audio)
+         clip.write_videofile(output_video_path)
+         shutil.rmtree(image_folder)
+         os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+
+ # Example usage
+ model_path = "base"
+ # video_path = "test_videos/videoplayback.mp4"
+ output_video_path = "output.mp4"
+ # output_audio_path = "test_videos/audio.mp3"
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ opencv-python
+ tqdm
+ openai-whisper
+ moviepy<2.0  # the code uses the 1.x API (moviepy.editor, set_audio)
+ customtkinter