Update app.py
app.py
CHANGED
@@ -128,32 +128,33 @@ def handle_feedback(feedback):
 
 def segment_background_audio(audio_path, output_path="background_segments.wav"):
     # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
-    pipeline = Pipeline.from_pretrained(
-        "pyannote/voice-activity-detection",
-        use_auth_token=hf_api_key
-    )
-    # Step 3: Run VAD to get speech segments
-    vad_result = pipeline(audio_path)
-    print(f"Detected speech segments: {vad_result}")
-
-    # Step 4: Load full audio and subtract speech segments
-    full_audio = AudioSegment.from_wav(audio_path)
-    background_audio = AudioSegment.silent(duration=len(full_audio))
-
-    for segment in vad_result.itersegments():
-        start_ms = int(segment.start * 1000)
-        end_ms = int(segment.end * 1000)
-        # Remove speech by muting that portion
-        background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
-
-    # Step 5: Subtract background_audio from full_audio
-    result_audio = full_audio.overlay(background_audio)
-
-    # Step 6: Export non-speech segments
-    result_audio.export(output_path, format="wav")
-    print(f"Saved non-speech (background) audio to: {output_path}")
-
     return True
+
+    # pipeline = Pipeline.from_pretrained(
+    #     "pyannote/voice-activity-detection",
+    #     use_auth_token=hf_api_key
+    # )
+    # # Step 3: Run VAD to get speech segments
+    # vad_result = pipeline(audio_path)
+    # print(f"Detected speech segments: {vad_result}")
+
+    # # Step 4: Load full audio and subtract speech segments
+    # full_audio = AudioSegment.from_wav(audio_path)
+    # background_audio = AudioSegment.silent(duration=len(full_audio))
+
+    # for segment in vad_result.itersegments():
+    #     start_ms = int(segment.start * 1000)
+    #     end_ms = int(segment.end * 1000)
+    #     # Remove speech by muting that portion
+    #     background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
+
+    # # Step 5: Subtract background_audio from full_audio
+    # result_audio = full_audio.overlay(background_audio)
+
+    # # Step 6: Export non-speech segments
+    # result_audio.export(output_path, format="wav")
+    # print(f"Saved non-speech (background) audio to: {output_path}")
+
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
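For reference, the logic that this commit comments out tried to "mute" speech with pydub's overlay(), but overlay() mixes two segments rather than replacing samples, so overlaying silence leaves the underlying audio audible and the exported file is effectively the original recording. Below is a minimal sketch of one way the intended behaviour (keep everything except detected speech) could be implemented; it is not the code in app.py. The function name extract_background_audio, the hf_token parameter, and the HF_TOKEN environment variable are illustrative assumptions, and the gated pyannote/voice-activity-detection model still requires an accepted license and a valid Hugging Face token.

import os

from pydub import AudioSegment
from pyannote.audio import Pipeline


def extract_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
    """Mute detected speech in audio_path and write the remainder to output_path.

    Illustrative sketch only: assumes pyannote.audio and pydub are installed
    and that audio_path points to a WAV file.
    """
    # Run voice activity detection; the result is a pyannote Annotation whose
    # segments mark where speech occurs (times in seconds).
    pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token=hf_token or os.environ.get("HF_TOKEN"),
    )
    vad_result = pipeline(audio_path)

    full_audio = AudioSegment.from_wav(audio_path)
    background = full_audio

    # Replace each speech span with silence of the same length. Slicing and
    # re-concatenating actually removes the speech, unlike overlay(), which
    # only mixes the two segments together.
    for segment in vad_result.itersegments():
        start_ms = max(0, int(segment.start * 1000))
        end_ms = min(len(background), int(segment.end * 1000))
        if end_ms <= start_ms:
            continue
        silence = AudioSegment.silent(
            duration=end_ms - start_ms,
            frame_rate=background.frame_rate,
        )
        background = background[:start_ms] + silence + background[end_ms:]

    background.export(output_path, format="wav")
    return output_path

Because speech is replaced with equal-length silence instead of being cut out, the exported background track stays the same length as the input, so its timestamps remain aligned with the original recording.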