VigneshDark committed on
Commit
b36d167
·
1 Parent(s): 7e40d92

added code

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. .gitignore +1 -0
  3. __pycache__/helper.cpython-312.pyc +0 -0
  4. anushka.wav +3 -0
  5. app.py +96 -0
  6. helper.py +106 -0
  7. traffic.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ /.venv
__pycache__/helper.cpython-312.pyc ADDED
Binary file (3.25 kB). View file
 
anushka.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79d0cfa2385223555e11093ade0c9dcabe1932171318afcd70479e00176026cb
3
+ size 192780
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from helper import process_audio
3
+ import os
4
+ import numpy as np
5
+
6
+ # Sample audio file paths
7
+ SAMPLE_SPEECH = "anushka.wav"
8
+ SAMPLE_NOISE = "traffic.wav"
9
+
10
def process_audio_files(speech_file, noise_file, alpha, beta):
    """
    Mix a speech clip with a noise clip at user-controlled gain levels.

    Args:
        speech_file (tuple | None): (sample_rate, numpy data) from the speech
            gr.Audio input, or None if the user never uploaded/recorded one.
        noise_file (tuple | None): (sample_rate, numpy data) from the noise
            gr.Audio input, or None if missing.
        alpha (float): speech gain adjustment in dB (-30 to +30).
        beta (float): noise gain adjustment in dB (-30 to +30).

    Returns:
        tuple: (frame_rate, numpy samples) in the format gr.Audio can play.

    Raises:
        ValueError: if either audio input is missing, instead of the opaque
            TypeError that unpacking None would otherwise produce.
    """
    # Fail fast with a clear message when the user clicks "Process Audio"
    # before supplying both clips (gr.Audio passes None in that case).
    if speech_file is None or noise_file is None:
        raise ValueError("Both a speech clip and a noise clip are required.")

    speech_sr, speech_data = speech_file
    noise_sr, noise_data = noise_file

    # Delegate the actual SNR-based mixing to the helper module.
    output_audio = process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta)

    # pydub returns an AudioSegment; convert back to a numpy array for Gradio.
    samples = np.array(output_audio.get_array_of_samples())

    return (output_audio.frame_rate, samples)
33
+
34
+
35
+ # Create the Gradio interface
36
+
37
+
38
with gr.Blocks() as app:
    gr.Markdown("# Audio Mixing Application")

    with gr.Row():
        with gr.Column():
            # Upload/record widgets for the two source clips.
            speech_in = gr.Audio(label="Speech Audio", type="numpy")
            noise_in = gr.Audio(label="Noise Audio", type="numpy")

            # One-click demo pair shipped alongside the app.
            gr.Examples(
                examples=[[SAMPLE_SPEECH, SAMPLE_NOISE]],
                inputs=[speech_in, noise_in],
                label="Sample Audio Files",
            )

            # Per-track gain controls, expressed in decibels.
            gain_speech = gr.Slider(
                minimum=-30, maximum=30, value=0, step=1,
                label="Alpha (dB)",
                info="Adjust alpha from -30 to +30 dB",
            )
            gain_noise = gr.Slider(
                minimum=-30, maximum=30, value=0, step=1,
                label="Beta (dB)",
                info="Adjust beta from -30 to +30 dB",
            )

            run_button = gr.Button("Process Audio")

        with gr.Column():
            # Playback widget for the mixed result.
            mixed_out = gr.Audio(label="Mixed Audio", type="numpy")

    # Wire the button to the processing callback.
    run_button.click(
        fn=process_audio_files,
        inputs=[speech_in, noise_in, gain_speech, gain_noise],
        outputs=mixed_out,
    )

if __name__ == "__main__":
    app.launch()
helper.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ from pydub import AudioSegment
3
+ # from pydub.effects import normalize
4
+ import numpy as np
5
+
6
+
7
def get_audio_volume_db(audio):
    """Return the clip's loudness in dBFS, substituting -50.0 dB for digital silence."""
    level = audio.dBFS
    # pydub reports -inf dBFS for a perfectly silent segment; clamp that to a
    # finite floor so later arithmetic stays well-defined.
    if level == float('-inf'):
        return -50.0
    return level
10
+
11
+
12
def adjust_volume(audio, volume_change_db):
    """Return a copy of *audio* with its gain shifted by volume_change_db decibels."""
    # pydub overloads + on AudioSegment as a dB gain change.
    shifted = audio + volume_change_db
    return shifted
15
+
16
+
17
+ # def compress_audio(audio):
18
+ # """Apply compression to normalize speech volume."""
19
+ # return normalize(audio)
20
+
21
+
22
def place_in_stereo(audio, pan_value):
    """Pan *audio* across the stereo field: -1.0 is hard left, 1.0 is hard right."""
    panned = audio.pan(pan_value)
    return panned
25
+
26
+
27
def overlay_audio(speech_audio, noise_audio):
    """Mix *noise_audio* on top of *speech_audio* and return the combined segment."""
    mixed = speech_audio.overlay(noise_audio)
    return mixed
30
+
31
+
32
def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
    """
    Mix speech with a randomly-cropped noise segment at a heuristic SNR.

    The noise clip is cropped (at a random offset, if longer than the speech),
    both tracks get level adjustments nudging the speech/noise difference
    toward a hardcoded 10 dB target, the caller's alpha/beta offsets are added
    on top, and the two are overlaid into one segment.

    Args:
        speech_data (numpy.ndarray): Raw speech samples.
            NOTE(review): treated as mono, with sample width taken from the
            array dtype — stereo input would be misinterpreted; confirm the
            Gradio caller always delivers mono.
        noise_data (numpy.ndarray): Raw noise samples (same mono assumption).
        speech_sr (int): Speech sample rate in Hz.
        noise_sr (int): Noise sample rate in Hz.
        alpha (float): Extra speech gain in dB, applied after the SNR heuristic.
        beta (float): Extra noise gain in dB, applied after the SNR heuristic.

    Returns:
        AudioSegment: The mixed speech+noise audio.
    """
    # Wrap the raw numpy buffers as pydub segments. channels=1 and
    # sample_width from dtype.itemsize assume mono integer PCM input.
    speech_audio = AudioSegment(
        speech_data.tobytes(),
        frame_rate=speech_sr,
        sample_width=speech_data.dtype.itemsize,
        channels=1
    )

    noise_audio = AudioSegment(
        noise_data.tobytes(),
        frame_rate=noise_sr,
        sample_width=noise_data.dtype.itemsize,
        channels=1
    )

    # Speech duration in seconds (pydub lengths are in milliseconds).
    speech_duration = len(speech_audio) / 1000.0

    # Crop the noise to the speech's length. If the noise is longer, pick a
    # random starting offset so repeated runs use different noise sections
    # (output is therefore nondeterministic by design).
    if len(noise_audio) / 1000.0 <= speech_duration:
        trimmed_noise = noise_audio
    else:
        start_time = random.uniform(0, len(noise_audio) / 1000.0 - speech_duration) * 1000
        trimmed_noise = noise_audio[start_time:start_time + (speech_duration * 1000)]

    # NOTE(review): only the noise is resampled to 8 kHz while the speech
    # keeps its original rate; pydub reconciles rates during overlay, but
    # confirm this telephone-band downsample of the noise is intentional.
    trimmed_noise = trimmed_noise.set_frame_rate(8000)

    # Measure both tracks' loudness (silence is floored at -50 dBFS).
    speech_vol = get_audio_volume_db(speech_audio)
    noise_vol = get_audio_volume_db(trimmed_noise)

    # Heuristic nudge toward a 10 dB speech-over-noise difference; the caps
    # (+2/-5 dB speech, -5 dB noise) keep any single correction gentle.
    current_snr = speech_vol - noise_vol
    adjustment_needed = 10 - current_snr  # target_snr hardcoded to 10

    if adjustment_needed > 0:  # Speech too quiet relative to noise
        speech_adjust = min(adjustment_needed, 2)
        noise_adjust = -min(adjustment_needed / 2, 5)
    else:  # Speech too loud relative to noise
        speech_adjust = max(adjustment_needed, -5)
        noise_adjust = -5 / 2  # fixed -2.5 dB noise cut in this branch

    # Apply heuristic corrections plus the caller-supplied dB offsets.
    adjusted_speech = adjust_volume(speech_audio, speech_adjust + alpha)
    adjusted_noise = adjust_volume(trimmed_noise, noise_adjust + beta)

    # Overlay the noise onto the speech to produce the final mix.
    final_audio = overlay_audio(adjusted_speech, adjusted_noise)

    return final_audio
95
+
96
+
97
+ # final_audio = process_audio("anushka.wav", "traffic.wav")
98
+ # # Single write operation at the end
99
+ # final_audio.export("output-traffic.wav", format="wav")
100
+
101
+ # print("Processing complete. Check output.wav!")
102
+
103
+
104
+ # -18, -20 for office
105
+ # -13 , -20 for market
106
+ # -18, -20 for traffic
traffic.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fcab96844aaeebbdbd40b0d39df8689edefda2bd05ab6a44b74e5d96e7b852d
3
+ size 24178236