ar08 committed (verified)
Commit: bd6a9e8
Parent: 778610f

Update app.py

Files changed (1)
  1. app.py +76 -106
app.py CHANGED
@@ -6,155 +6,125 @@ import cv2
 from moviepy import ImageSequenceClip, AudioFileClip, VideoFileClip
 from tqdm import tqdm
 
+# Constants
 FONT = cv2.FONT_HERSHEY_SIMPLEX
 FONT_SCALE = 0.8
 FONT_THICKNESS = 2
 
 class VideoTranscriber:
-    def __init__(self, model_path, video_path):
-        self.model = whisper.load_model(model_path)
+    def __init__(self, model_name, video_path):
+        self.model = whisper.load_model(model_name)
         self.video_path = video_path
         self.audio_path = ''
-        self.text_array = []
+        self.text_segments = []
         self.fps = 0
         self.char_width = 0
 
-    def transcribe_video(self):
-        print('Transcribing video')
-        result = self.model.transcribe(self.audio_path)
-        text = result["segments"][0]["text"]
-        textsize = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
-        cap = cv2.VideoCapture(self.video_path)
-        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        asp = 16/9
-        ret, frame = cap.read()
-        width = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)].shape[1]
-        width = width - (width * 0.1)
-        self.fps = cap.get(cv2.CAP_PROP_FPS)
-        self.char_width = int(textsize[0] / len(text))
-
-        for j in tqdm(result["segments"]):
-            lines = []
-            text = j["text"]
-            end = j["end"]
-            start = j["start"]
-            total_frames = int((end - start) * self.fps)
-            start = start * self.fps
-            total_chars = len(text)
-            words = text.split(" ")
-            i = 0
-
-            while i < len(words):
-                words[i] = words[i].strip()
-                if words[i] == "":
-                    i += 1
-                    continue
-                length_in_pixels = (len(words[i]) + 1) * self.char_width
-                remaining_pixels = width - length_in_pixels
-                line = words[i]
-
-                while remaining_pixels > 0:
-                    i += 1
-                    if i >= len(words):
-                        break
-                    length_in_pixels = (len(words[i]) + 1) * self.char_width
-                    remaining_pixels -= length_in_pixels
-                    if remaining_pixels < 0:
-                        continue
-                    else:
-                        line += " " + words[i]
-
-                line_array = [line, int(start) + 15, int(len(line) / total_chars * total_frames) + int(start) + 15]
-                start = int(len(line) / total_chars * total_frames) + int(start)
-                lines.append(line_array)
-                self.text_array.append(line_array)
-
-        cap.release()
-        print('Transcription complete')
-
     def extract_audio(self):
-        print('Extracting audio')
-        audio_path = os.path.join(os.path.dirname(self.video_path), "audio.mp3")
+        print('[INFO] Extracting audio...')
+        audio_path = os.path.splitext(self.video_path)[0] + "_audio.mp3"
         video = VideoFileClip(self.video_path)
         audio = video.audio
         audio.write_audiofile(audio_path)
         self.audio_path = audio_path
-        print('Audio extracted')
+        print('[INFO] Audio extracted')
+
+    def transcribe_video(self):
+        print('[INFO] Transcribing audio...')
+        result = self.model.transcribe(self.audio_path)
+        segments = result["segments"]
+        sample_text = segments[0]["text"] if segments else "Sample"
+        textsize = cv2.getTextSize(sample_text, FONT, FONT_SCALE, FONT_THICKNESS)[0]
 
-    def extract_frames(self, output_folder):
-        print('Extracting frames')
         cap = cv2.VideoCapture(self.video_path)
+        self.fps = cap.get(cv2.CAP_PROP_FPS)
         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-        asp = width / height
-        N_frames = 0
+        aspect_ratio = width / height
+        cap.release()
+
+        effective_width = int(width - (width * 0.1))
+        self.char_width = max(int(textsize[0] / len(sample_text)), 1)
+
+        for seg in tqdm(segments, desc="Transcribing"):
+            lines = self._split_text_to_lines(seg["text"], effective_width)
+            start_frame = int(seg["start"] * self.fps)
+            end_frame = int(seg["end"] * self.fps)
+            self.text_segments.extend([[line, start_frame, end_frame] for line in lines])
+
+        print('[INFO] Transcription complete')
+
+    def _split_text_to_lines(self, text, max_width):
+        words = text.split()
+        lines, line = [], ""
+        for word in words:
+            if cv2.getTextSize(line + ' ' + word, FONT, FONT_SCALE, FONT_THICKNESS)[0][0] < max_width:
+                line += (" " if line else "") + word
+            else:
+                lines.append(line)
+                line = word
+        if line:
+            lines.append(line)
+        return lines
+
+    def extract_and_annotate_frames(self, output_dir):
+        print('[INFO] Extracting and annotating frames...')
+        os.makedirs(output_dir, exist_ok=True)
+        cap = cv2.VideoCapture(self.video_path)
+        frame_count = 0
 
         while True:
             ret, frame = cap.read()
             if not ret:
                 break
 
-            frame = frame[:, int(int(width - 1 / asp * height) / 2):width - int((width - 1 / asp * height) / 2)]
-
-            for i in self.text_array:
-                if N_frames >= i[1] and N_frames <= i[2]:
-                    text = i[0]
-                    text_size, _ = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
-                    text_x = int((frame.shape[1] - text_size[0]) / 2)
-                    text_y = int(height/2)
-                    cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, 0.75, (0, 0, 255), 2)
+            for text, start, end in self.text_segments:
+                if start <= frame_count <= end:
+                    text_size, _ = cv2.getTextSize(text, FONT, FONT_SCALE, FONT_THICKNESS)
+                    text_x = (frame.shape[1] - text_size[0]) // 2
+                    text_y = frame.shape[0] - 30
+                    cv2.putText(frame, text, (text_x, text_y), FONT, FONT_SCALE, (0, 0, 255), FONT_THICKNESS)
                     break
 
-            cv2.imwrite(os.path.join(output_folder, str(N_frames) + ".jpg"), frame)
-            N_frames += 1
+            cv2.imwrite(os.path.join(output_dir, f"{frame_count:05d}.jpg"), frame)
+            frame_count += 1
 
         cap.release()
-        print('Frames extracted')
-
-    def create_video(self, output_video_path):
-        print('Creating video')
-        image_folder = os.path.join(os.path.dirname(self.video_path), "frames")
-        if not os.path.exists(image_folder):
-            os.makedirs(image_folder)
+        print('[INFO] Frame extraction complete')
 
-        self.extract_frames(image_folder)
+    def create_annotated_video(self, output_video_path):
+        print('[INFO] Creating final video...')
+        frames_dir = os.path.join(os.path.dirname(self.video_path), "frames_temp")
+        self.extract_and_annotate_frames(frames_dir)
 
-        images = [img for img in os.listdir(image_folder) if img.endswith(".jpg")]
-        images.sort(key=lambda x: int(x.split(".")[0]))
-
-        frame = cv2.imread(os.path.join(image_folder, images[0]))
-        height, width, layers = frame.shape
-
-        clip = ImageSequenceClip([os.path.join(image_folder, image) for image in images], fps=self.fps)
+        image_files = sorted([os.path.join(frames_dir, f) for f in os.listdir(frames_dir) if f.endswith(".jpg")])
+        clip = ImageSequenceClip(image_files, fps=self.fps)
         audio = AudioFileClip(self.audio_path)
         clip = clip.set_audio(audio)
-        clip.write_videofile(output_video_path)
-        shutil.rmtree(image_folder)
-        os.remove(os.path.join(os.path.dirname(self.video_path), "audio.mp3"))
+        clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac")
+
+        shutil.rmtree(frames_dir)
+        os.remove(self.audio_path)
+        print('[INFO] Video created successfully')
 
-def process_video(video_path):
-    model_path = "base"
-    output_video_path = "output.mp4"
-
-    transcriber = VideoTranscriber(model_path, video_path)
+def process_video(video_path):
+    transcriber = VideoTranscriber(model_name="base", video_path=video_path)
     transcriber.extract_audio()
     transcriber.transcribe_video()
-    transcriber.create_video(output_video_path)
-
-    return output_video_path
+    output_path = os.path.splitext(video_path)[0] + "_transcribed.mp4"
+    transcriber.create_annotated_video(output_path)
+    return output_path
 
 # Gradio Interface
 def gradio_interface(video):
-    output_video_path = process_video(video)
-    return output_video_path
+    return process_video(video)
 
 iface = gr.Interface(
     fn=gradio_interface,
-    inputs=gr.inputs.Video(label="Upload Video"),
-    outputs=gr.outputs.Video(label="Transcribed Video"),
-    title="Video Transcription App",
-    description="Upload a video to transcribe and generate a new video with subtitles."
+    inputs=gr.Video(label="Upload Video"),
+    outputs=gr.Video(label="Transcribed Video"),
+    title="🎬 Whisper Video Subtitle Generator",
+    description="Upload a video to automatically transcribe and add subtitles using Whisper AI."
 )
 
 if __name__ == "__main__":
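
Notes on the new code (illustrative sketches, not part of the commit):

1. `from moviepy import ...` is the moviepy 2.x import style, and moviepy 2.x renamed `set_audio` to `with_audio`, so the retained line `clip = clip.set_audio(audio)` in `create_annotated_video` may raise an AttributeError on current moviepy. A version-tolerant helper, as a minimal sketch assuming only the `clip` and `audio` objects already built in that method:

    # Sketch only: attach audio under either moviepy 1.x or 2.x.
    def attach_audio(clip, audio):
        # moviepy 2.x renamed set_audio to with_audio
        if hasattr(clip, "with_audio"):
            return clip.with_audio(audio)
        return clip.set_audio(audio)

2. The refactored pipeline can also be exercised without the Gradio UI. A minimal sketch, where "sample.mp4" is a hypothetical input file rather than anything shipped with this Space:

    # Sketch only: drive the pipeline in app.py directly.
    from app import process_video

    output = process_video("sample.mp4")  # writes sample_transcribed.mp4 next to the input
    print("Subtitled video written to:", output)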