Commit
·
08e515a
1
Parent(s):
04bc49f
fade option
Browse files
helper.py
CHANGED
@@ -31,8 +31,8 @@ def overlay_audio(speech_audio, noise_audio):
|
|
31 |
|
32 |
def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
|
33 |
"""
|
34 |
-
Process speech and noise audio data
|
35 |
-
|
36 |
Args:
|
37 |
speech_data (numpy.ndarray): Speech audio data
|
38 |
noise_data (numpy.ndarray): Noise audio data
|
@@ -40,9 +40,6 @@ def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
|
|
40 |
noise_sr (int): Noise sample rate
|
41 |
alpha (float): Speech volume adjustment
|
42 |
beta (float): Noise volume adjustment
|
43 |
-
|
44 |
-
Returns:
|
45 |
-
AudioSegment: Processed audio
|
46 |
"""
|
47 |
# Convert numpy arrays to AudioSegment
|
48 |
speech_audio = AudioSegment(
|
@@ -62,35 +59,33 @@ def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
|
|
62 |
# Get speech duration
|
63 |
speech_duration = len(speech_audio) / 1000.0 # Convert ms to sec
|
64 |
|
65 |
-
#
|
|
|
|
|
|
|
66 |
if len(noise_audio) / 1000.0 <= speech_duration:
|
67 |
trimmed_noise = noise_audio
|
68 |
else:
|
69 |
-
start_time = random.uniform(0, len(noise_audio) / 1000.0 - speech_duration) * 1000
|
70 |
-
trimmed_noise = noise_audio[start_time:start_time + (speech_duration * 1000)]
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
#
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
#
|
87 |
-
|
88 |
-
|
89 |
-
adjusted_speech = adjust_volume(speech_audio, alpha)
|
90 |
-
adjusted_noise = adjust_volume(trimmed_noise, beta)
|
91 |
-
|
92 |
-
final_audio = overlay_audio(adjusted_speech, adjusted_noise)
|
93 |
-
|
94 |
return final_audio
|
95 |
|
96 |
|
|
|
31 |
|
32 |
def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
|
33 |
"""
|
34 |
+
Process speech and noise audio data with quality preservation.
|
35 |
+
|
36 |
Args:
|
37 |
speech_data (numpy.ndarray): Speech audio data
|
38 |
noise_data (numpy.ndarray): Noise audio data
|
|
|
40 |
noise_sr (int): Noise sample rate
|
41 |
alpha (float): Speech volume adjustment
|
42 |
beta (float): Noise volume adjustment
|
|
|
|
|
|
|
43 |
"""
|
44 |
# Convert numpy arrays to AudioSegment
|
45 |
speech_audio = AudioSegment(
|
|
|
59 |
# Get speech duration
|
60 |
speech_duration = len(speech_audio) / 1000.0 # Convert ms to sec
|
61 |
|
62 |
+
# Choose the fade duration in milliseconds based on the speech length
|
63 |
+
crossfade_duration = min(5, len(speech_audio) // 4) # Use 5ms or 1/4 of audio length, whichever is smaller
|
64 |
+
|
65 |
+
# Cut noise segment with crossfade to avoid clicks
|
66 |
if len(noise_audio) / 1000.0 <= speech_duration:
|
67 |
trimmed_noise = noise_audio
|
68 |
else:
|
69 |
+
start_time = random.uniform(0, len(noise_audio) / 1000.0 - speech_duration) * 1000
|
70 |
+
trimmed_noise = noise_audio[start_time:start_time + (speech_duration * 1000)]
|
71 |
+
# Fade the trimmed noise in and out so its cut edges do not click
|
72 |
+
trimmed_noise = trimmed_noise.fade_in(crossfade_duration).fade_out(crossfade_duration)
|
73 |
+
|
74 |
+
# Match sample rates before mixing
|
75 |
+
trimmed_noise = trimmed_noise.set_frame_rate(speech_sr)
|
76 |
+
|
77 |
+
# Adjust volume only when the adjustment is non-zero, then fade the edges in and out
|
78 |
+
adjusted_speech = speech_audio
|
79 |
+
if alpha != 0:
|
80 |
+
adjusted_speech = adjust_volume(speech_audio, alpha).fade_in(crossfade_duration).fade_out(crossfade_duration)
|
81 |
+
|
82 |
+
adjusted_noise = trimmed_noise
|
83 |
+
if beta != 0:
|
84 |
+
adjusted_noise = adjust_volume(trimmed_noise, beta).fade_in(crossfade_duration).fade_out(crossfade_duration)
|
85 |
+
|
86 |
+
# Overlay the noise onto the speech, leaving the speech gain unchanged (gain_during_overlay=0)
|
87 |
+
final_audio = adjusted_speech.overlay(adjusted_noise, gain_during_overlay=0)
|
88 |
+
|
|
|
|
|
|
|
|
|
|
|
89 |
return final_audio
|
90 |
|
91 |
|