ar08 committed
Commit 778610f · verified · 1 Parent(s): 2922284

Update app.py

Files changed (1)
  1. app.py +160 -160
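
Every line shows as rewritten (hence the +160/−160 stat), but the only substantive change is the moviepy import on line 6 of the file. A likely motivation, though the commit message doesn't say: moviepy 2.x dropped the `moviepy.editor` module, so the old import raises ModuleNotFoundError on current installs, and the same classes are now imported from the top-level package:

# Before (moviepy 1.x style; fails under moviepy >= 2.0):
from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip

# After (moviepy 2.x style):
from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
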
app.py CHANGED
@@ -1,161 +1,161 @@
- import gradio as gr
- import whisper
- import os
- import shutil
- import cv2
- from moviepy.editor import ImageSequenceClip, AudioFileClip, VideoFileClip
- from tqdm import tqdm
-
- FONT = cv2.FONT_HERSHEY_SIMPLEX
- FONT_SCALE = 0.8
- FONT_THICKNESS = 2
-
- class VideoTranscriber:
-     def __init__(self, model_path, video_path):
-         self.model = whisper.load_model(model_path)
-         self.video_path = video_path
-         self.audio_path = ''
-         self.text_array = []
-         self.fps = 0
-         self.char_width = 0
-
-     def transcribe_video(self):
-         print('Transcribing video')
-         result = self.model.transcribe(self.audio_path)
-         text = result["segments"][0]["text"]
-         textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
-         cap = cv2.VideoCapture(self.video_path)
-         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-         asp = 16/9
-         ret, frame = cap.read()
-         width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
-         width = width - (width * 0.1)
-         self.fps = cap.get(cv2.CAP_PROP_FPS)
-         self.char_width = int(textsize[0] / len(text))
-
-         for j in tqdm(result["segments"]):
-             lines = []
-             text = j["text"]
-             end = j["end"]
-             start = j["start"]
-             total_frames = int((end - start) * self.fps)
-             start = start * self.fps
-             total_chars = len(text)
-             words = text.split(" ")
-             i = 0
-
-             while i < len(words):
-                 words[i] = words[i].strip()
-                 if words[i] == "":
-                     i += 1
-                     continue
-                 length_in_pixels = (len(words[i]) + 1) * self.char_width
-                 remaining_pixels = width - length_in_pixels
-                 line = words[i]
-
-                 while remaining_pixels > 0:
-                     i += 1
-                     if i >= len(words):
-                         break
-                     length_in_pixels = (len(words[i]) + 1) * self.char_width
-                     remaining_pixels -= length_in_pixels
-                     if remaining_pixels < 0:
-                         continue
-                     else:
-                         line += " " + words[i]
-
-                 line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
-                 start = int(len(line) / total_chars * total_frames) + int(start)
-                 lines.append(line_array)
-                 self.text_array.append(line_array)
-
-         cap.release()
-         print('Transcription complete')
-
-     def extract_audio(self):
-         print('Extracting audio')
-         audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
-         video = VideoFileClip(self.video_path)
-         audio = video.audio
-         audio.write_audiofile(audio_path)
-         self.audio_path = audio_path
-         print('Audio extracted')
-
-     def extract_frames(self, output_folder):
-         print('Extracting frames')
-         cap = cv2.VideoCapture(self.video_path)
-         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-         asp = width / height
-         N_frames = 0
-
-         while True:
-             ret, frame = cap.read()
-             if not ret:
-                 break
-
-             frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
-
-             for i in self.text_array:
-                 if N_frames >= i[1] and N_frames <= i[2]:
-                     text = i[0]
-                     text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
-                     text_x = int((frame.shape[1] - text_size[0]) / 2)
-                     text_y = int(height/2)
-                     cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
-                     break
-
-             cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
-             N_frames += 1
-
-         cap.release()
-         print('Frames extracted')
-
-     def create_video(self, output_video_path):
-         print('Creating video')
-         image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
-         if not os.path.exists(image_folder):
-             os.makedirs(image_folder)
-
-         self.extract_frames(image_folder)
-
-         images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
-         images.sort(key=lambda x: int(x.split(".")[0]))
-
-         frame = cv2.imread(os.path.join(image_folder, images[0]))
-         height, width, layers = frame.shape
-
-         clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
-         audio = AudioFileClip(self.audio_path)
-         clip = clip.set_audio(audio)
-         clip.write_videofile(output_video_path)
-         shutil.rmtree(image_folder)
-         os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
-
- def process_video(video_path):
-     model_path = "base"
-     output_video_path = "output.mp4"
-
-     transcriber = VideoTranscriber(model_path, video_path)
-     transcriber.extract_audio()
-     transcriber.transcribe_video()
-     transcriber.create_video(output_video_path)
-
-     return output_video_path
-
- # Gradio Interface
- def gradio_interface(video):
-     output_video_path = process_video(video)
-     return output_video_path
-
- iface = gr.Interface(
-     fn=gradio_interface,
-     inputs=gr.inputs.Video(label="Upload Video"),
-     outputs=gr.outputs.Video(label="Transcribed Video"),
-     title="Video Transcription App",
-     description="Upload a video to transcribe and generate a new video with subtitles."
- )
-
- if __name__ == "__main__":
+ import gradio as gr
+ import whisper
+ import os
+ import shutil
+ import cv2
+ from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
+ from tqdm import tqdm
+
+ FONT = cv2.FONT_HERSHEY_SIMPLEX
+ FONT_SCALE = 0.8
+ FONT_THICKNESS = 2
+
+ class VideoTranscriber:
+     def __init__(self, model_path, video_path):
+         self.model = whisper.load_model(model_path)
+         self.video_path = video_path
+         self.audio_path = ''
+         self.text_array = []
+         self.fps = 0
+         self.char_width = 0
+
+     def transcribe_video(self):
+         print('Transcribing video')
+         result = self.model.transcribe(self.audio_path)
+         text = result["segments"][0]["text"]
+         textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = 16/9
+         ret, frame = cap.read()
+         width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
+         width = width - (width * 0.1)
+         self.fps = cap.get(cv2.CAP_PROP_FPS)
+         self.char_width = int(textsize[0] / len(text))
+
+         for j in tqdm(result["segments"]):
+             lines = []
+             text = j["text"]
+             end = j["end"]
+             start = j["start"]
+             total_frames = int((end - start) * self.fps)
+             start = start * self.fps
+             total_chars = len(text)
+             words = text.split(" ")
+             i = 0
+
+             while i < len(words):
+                 words[i] = words[i].strip()
+                 if words[i] == "":
+                     i += 1
+                     continue
+                 length_in_pixels = (len(words[i]) + 1) * self.char_width
+                 remaining_pixels = width - length_in_pixels
+                 line = words[i]
+
+                 while remaining_pixels > 0:
+                     i += 1
+                     if i >= len(words):
+                         break
+                     length_in_pixels = (len(words[i]) + 1) * self.char_width
+                     remaining_pixels -= length_in_pixels
+                     if remaining_pixels < 0:
+                         continue
+                     else:
+                         line += " " + words[i]
+
+                 line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
+                 start = int(len(line) / total_chars * total_frames) + int(start)
+                 lines.append(line_array)
+                 self.text_array.append(line_array)
+
+         cap.release()
+         print('Transcription complete')
+
+     def extract_audio(self):
+         print('Extracting audio')
+         audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+         video = VideoFileClip(self.video_path)
+         audio = video.audio
+         audio.write_audiofile(audio_path)
+         self.audio_path = audio_path
+         print('Audio extracted')
+
+     def extract_frames(self, output_folder):
+         print('Extracting frames')
+         cap = cv2.VideoCapture(self.video_path)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         asp = width / height
+         N_frames = 0
+
+         while True:
+             ret, frame = cap.read()
+             if not ret:
+                 break
+
+             frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
+
+             for i in self.text_array:
+                 if N_frames >= i[1] and N_frames <= i[2]:
+                     text = i[0]
+                     text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
+                     text_x = int((frame.shape[1] - text_size[0]) / 2)
+                     text_y = int(height/2)
+                     cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+                     break
+
+             cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
+             N_frames += 1
+
+         cap.release()
+         print('Frames extracted')
+
+     def create_video(self, output_video_path):
+         print('Creating video')
+         image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
+         if not os.path.exists(image_folder):
+             os.makedirs(image_folder)
+
+         self.extract_frames(image_folder)
+
+         images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
+         images.sort(key=lambda x: int(x.split(".")[0]))
+
+         frame = cv2.imread(os.path.join(image_folder, images[0]))
+         height, width, layers = frame.shape
+
+         clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+         audio = AudioFileClip(self.audio_path)
+         clip = clip.set_audio(audio)
+         clip.write_videofile(output_video_path)
+         shutil.rmtree(image_folder)
+         os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+
+ def process_video(video_path):
+     model_path = "base"
+     output_video_path = "output.mp4"
+
+     transcriber = VideoTranscriber(model_path, video_path)
+     transcriber.extract_audio()
+     transcriber.transcribe_video()
+     transcriber.create_video(output_video_path)
+
+     return output_video_path
+
+ # Gradio Interface
+ def gradio_interface(video):
+     output_video_path = process_video(video)
+     return output_video_path
+
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=gr.inputs.Video(label="Upload Video"),
+     outputs=gr.outputs.Video(label="Transcribed Video"),
+     title="Video Transcription App",
+     description="Upload a video to transcribe and generate a new video with subtitles."
+ )
+
+ if __name__ == "__main__":
      iface.launch()
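
A note on the updated file, beyond the import itself: two calls still target older library APIs. Under the same moviepy 2.x that the new import assumes, the `set_*` clip methods were renamed to `with_*`, so `clip = clip.set_audio(audio)` in `create_video` raises AttributeError; and the `gr.inputs` / `gr.outputs` namespaces used to build the Interface were removed in Gradio 4.x in favor of passing components directly. A minimal sketch of both adjustments, assuming moviepy >= 2.0 and gradio >= 4.0:

# moviepy 2.x treats clips as immutable; set_audio was renamed to with_audio
clip = clip.with_audio(audio)

# Gradio 4.x: pass component instances directly instead of gr.inputs/gr.outputs
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Video(label="Upload Video"),
    outputs=gr.Video(label="Transcribed Video"),
    title="Video Transcription App",
    description="Upload a video to transcribe and generate a new video with subtitles.",
)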