Commit
·
b36d167
1
Parent(s):
7e40d92
added code
Browse files- .gitattributes +1 -0
- .gitignore +1 -0
- __pycache__/helper.cpython-312.pyc +0 -0
- anushka.wav +3 -0
- app.py +96 -0
- helper.py +106 -0
- traffic.wav +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
*.wav filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
/.venv
|
__pycache__/helper.cpython-312.pyc
ADDED
Binary file (3.25 kB). View file
|
|
anushka.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79d0cfa2385223555e11093ade0c9dcabe1932171318afcd70479e00176026cb
|
3 |
+
size 192780
|
app.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from helper import process_audio
|
3 |
+
import os
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
# Sample audio file paths
|
7 |
+
SAMPLE_SPEECH = "anushka.wav"
|
8 |
+
SAMPLE_NOISE = "traffic.wav"
|
9 |
+
|
10 |
+
def process_audio_files(speech_file, noise_file, alpha, beta):
    """
    Mix a speech recording with a noise recording at user-chosen gains.

    Args:
        speech_file (tuple): Speech audio as (sample_rate, numpy_data) from Gradio.
        noise_file (tuple): Noise audio as (sample_rate, numpy_data) from Gradio.
        alpha (float): Extra gain applied to the speech, in dB (-30 to +30).
        beta (float): Extra gain applied to the noise, in dB (-30 to +30).

    Returns:
        tuple: (sample_rate, processed_audio_data) for a Gradio Audio output.

    Raises:
        ValueError: If either audio input is missing (e.g. the user pressed
            the button before uploading both files).
    """
    # Gradio passes None for an empty Audio input; fail with a clear message
    # instead of an opaque unpacking TypeError.
    if speech_file is None or noise_file is None:
        raise ValueError("Both a speech file and a noise file are required.")

    speech_sr, speech_data = speech_file
    noise_sr, noise_data = noise_file

    # Delegate trimming, SNR balancing, and overlay to the helper module.
    output_audio = process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta)

    # process_audio returns a pydub AudioSegment; convert back to the
    # (rate, ndarray) pair Gradio's "numpy" audio type expects.
    samples = np.array(output_audio.get_array_of_samples())

    return (output_audio.frame_rate, samples)
|
33 |
+
|
34 |
+
|
35 |
+
# Build the Gradio UI: two audio inputs, two gain sliders, one mixed output.


def _gain_slider(label_text, info_text):
    """Return a -30..+30 dB slider initialised at 0 with 1 dB steps."""
    return gr.Slider(
        minimum=-30,
        maximum=30,
        value=0,
        step=1,
        label=label_text,
        info=info_text,
    )


with gr.Blocks() as app:
    gr.Markdown("# Audio Mixing Application")

    with gr.Row():
        # Left column: inputs, examples, gain controls, and the trigger button.
        with gr.Column():
            speech_input = gr.Audio(label="Speech Audio", type="numpy")
            noise_input = gr.Audio(label="Noise Audio", type="numpy")

            # Bundled sample clips so the app can be tried immediately.
            gr.Examples(
                examples=[[SAMPLE_SPEECH, SAMPLE_NOISE]],
                inputs=[speech_input, noise_input],
                label="Sample Audio Files",
            )

            alpha_slider = _gain_slider("Alpha (dB)", "Adjust alpha from -30 to +30 dB")
            beta_slider = _gain_slider("Beta (dB)", "Adjust beta from -30 to +30 dB")

            submit_btn = gr.Button("Process Audio")

        # Right column: the mixed result.
        with gr.Column():
            output_audio = gr.Audio(label="Mixed Audio", type="numpy")

    # Wire the button to the processing function.
    submit_btn.click(
        fn=process_audio_files,
        inputs=[speech_input, noise_input, alpha_slider, beta_slider],
        outputs=output_audio,
    )

if __name__ == "__main__":
    app.launch()
|
helper.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
from pydub import AudioSegment
|
3 |
+
# from pydub.effects import normalize
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
+
def get_audio_volume_db(audio):
    """Return the clip's loudness in dBFS, mapping silence to -50.0 dB."""
    level = audio.dBFS
    if level == float("-inf"):
        # Pure silence reports -inf dBFS; substitute a finite floor so the
        # downstream SNR arithmetic stays well-defined.
        return -50.0
    return level
|
10 |
+
|
11 |
+
|
12 |
+
def adjust_volume(audio, volume_change_db):
    """Return *audio* with its gain shifted by *volume_change_db* decibels."""
    # pydub overloads ``+`` with a number to mean a gain change in dB.
    shifted = audio + volume_change_db
    return shifted
|
15 |
+
|
16 |
+
|
17 |
+
# def compress_audio(audio):
|
18 |
+
# """Apply compression to normalize speech volume."""
|
19 |
+
# return normalize(audio)
|
20 |
+
|
21 |
+
|
22 |
+
def place_in_stereo(audio, pan_value):
    """Position *audio* in the stereo field.

    pan_value ranges from -1.0 (fully left) to 1.0 (fully right).
    """
    panned = audio.pan(pan_value)
    return panned
|
25 |
+
|
26 |
+
|
27 |
+
def overlay_audio(speech_audio, noise_audio):
    """Mix *noise_audio* on top of *speech_audio* and return the blend."""
    mixed = speech_audio.overlay(noise_audio)
    return mixed
|
30 |
+
|
31 |
+
|
32 |
+
def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta, target_snr=10):
    """
    Mix speech and noise into a single AudioSegment at roughly a target SNR.

    Args:
        speech_data (numpy.ndarray): Raw speech samples. Assumed mono integer
            PCM (sample width taken from the dtype) — TODO confirm against
            the Gradio caller, which may supply stereo data.
        noise_data (numpy.ndarray): Raw noise samples (same assumption).
        speech_sr (int): Speech sample rate in Hz.
        noise_sr (int): Noise sample rate in Hz.
        alpha (float): Extra gain in dB applied to speech on top of the
            automatic SNR correction.
        beta (float): Extra gain in dB applied to noise on top of the
            automatic SNR correction.
        target_snr (float): Desired speech-minus-noise level difference in dB.
            Defaults to 10, the value previously hard-coded.

    Returns:
        AudioSegment: The overlaid speech + noise audio.
    """
    # Wrap the raw sample buffers as mono AudioSegments; sample_width comes
    # from the numpy dtype (e.g. int16 -> 2 bytes).
    speech_audio = AudioSegment(
        speech_data.tobytes(),
        frame_rate=speech_sr,
        sample_width=speech_data.dtype.itemsize,
        channels=1,
    )
    noise_audio = AudioSegment(
        noise_data.tobytes(),
        frame_rate=noise_sr,
        sample_width=noise_data.dtype.itemsize,
        channels=1,
    )

    # Speech duration in seconds (pydub lengths are milliseconds).
    speech_duration = len(speech_audio) / 1000.0

    # Take a random slice of noise exactly as long as the speech; if the
    # noise clip is shorter, use all of it.
    if len(noise_audio) / 1000.0 <= speech_duration:
        trimmed_noise = noise_audio
    else:
        # pydub slices by milliseconds; round the random float offset to an
        # integer so the slice indices are well-defined.
        start_ms = int(random.uniform(0, len(noise_audio) / 1000.0 - speech_duration) * 1000)
        trimmed_noise = noise_audio[start_ms:start_ms + int(speech_duration * 1000)]

    # NOTE(review): the noise is resampled to 8 kHz while the speech keeps
    # its own rate — presumably intentional (telephone-band noise?); confirm
    # that mixed-rate overlay is the desired behavior.
    trimmed_noise = trimmed_noise.set_frame_rate(8000)

    # Measure current levels and how far we are from the target SNR.
    speech_vol = get_audio_volume_db(speech_audio)
    noise_vol = get_audio_volume_db(trimmed_noise)

    current_snr = speech_vol - noise_vol
    adjustment_needed = target_snr - current_snr

    # Split the correction between boosting speech and cutting noise,
    # clamped so no single stage changes the level too aggressively.
    if adjustment_needed > 0:  # speech too quiet relative to noise
        speech_adjust = min(adjustment_needed, 2)
        noise_adjust = -min(adjustment_needed / 2, 5)
    else:  # speech already at or above the target SNR
        speech_adjust = max(adjustment_needed, -5)
        noise_adjust = -5 / 2  # fixed -2.5 dB cut; NOTE(review): verify intent

    # Apply the automatic correction plus the user-requested alpha/beta gains.
    adjusted_speech = adjust_volume(speech_audio, speech_adjust + alpha)
    adjusted_noise = adjust_volume(trimmed_noise, noise_adjust + beta)

    final_audio = overlay_audio(adjusted_speech, adjusted_noise)

    return final_audio
|
95 |
+
|
96 |
+
|
97 |
+
# final_audio = process_audio("anushka.wav", "traffic.wav")
|
98 |
+
# # Single write operation at the end
|
99 |
+
# final_audio.export("output-traffic.wav", format="wav")
|
100 |
+
|
101 |
+
# print("Processing complete. Check output.wav!")
|
102 |
+
|
103 |
+
|
104 |
+
# -18, -20 for office
|
105 |
+
# -13 , -20 for market
|
106 |
+
# -18, -20 for traffic
|
traffic.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1fcab96844aaeebbdbd40b0d39df8689edefda2bd05ab6a44b74e5d96e7b852d
|
3 |
+
size 24178236
|