import random

from pydub import AudioSegment  # type: ignore

# from pydub.effects import normalize
# import numpy as np  # type: ignore

# Level reported for digitally silent audio, whose true dBFS is -inf.
_SILENCE_DBFS = -50.0


def get_audio_volume_db(audio):
    """Estimate the volume in dBFS (decibels relative to full scale).

    Args:
        audio: Object exposing a ``dBFS`` attribute (a PyDub AudioSegment).

    Returns:
        ``audio.dBFS``, or -50.0 when the segment is pure silence and
        ``dBFS`` is negative infinity.
    """
    # Read the attribute once; the original evaluated it twice.
    level = audio.dBFS
    return level if level != float("-inf") else _SILENCE_DBFS


def adjust_volume(audio, volume_change_db):
    """Return ``audio`` with ``volume_change_db`` added to it.

    For a PyDub AudioSegment, adding a number applies that gain in dB.
    """
    return audio + volume_change_db


# def compress_audio(audio):
#     """Apply compression to normalize speech volume."""
#     return normalize(audio)


def place_in_stereo(audio, pan_value):
    """Place audio in the stereo field (-1.0 = full left, 1.0 = full right)."""
    return audio.pan(pan_value)


def overlay_audio(speech_audio, noise_audio):
    """Overlay ``noise_audio`` on top of ``speech_audio`` using PyDub."""
    return speech_audio.overlay(noise_audio)


def _apply_edge_fades(audio, fade_ms):
    """Fade ``audio`` in and out over ``fade_ms`` ms; no-op for ``fade_ms <= 0``.

    A zero-length fade is skipped instead of being passed to PyDub, which
    (as far as I recall its ``fade`` implementation) fails with a
    ``TypeError`` when the duration is 0.
    """
    if fade_ms <= 0:
        return audio
    return audio.fade_in(fade_ms).fade_out(fade_ms)


def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
    """Mix speech with a randomly chosen stretch of background noise.

    Args:
        speech_data (numpy.ndarray): Speech audio data.
        noise_data (numpy.ndarray): Noise audio data.
        speech_sr (int): Speech sample rate.
        noise_sr (int): Noise sample rate.
        alpha (float): Speech volume adjustment, added to the segment via
            ``adjust_volume`` (0 leaves the speech untouched).
        beta (float): Noise volume adjustment, added to the segment via
            ``adjust_volume`` (0 leaves the noise level untouched).

    Returns:
        AudioSegment: The speech with the noise overlaid on it.

    Notes:
        Both arrays are wrapped as single-channel audio whose sample width
        is ``dtype.itemsize``.
        NOTE(review): this presumably expects integer PCM samples (e.g.
        int16); floating-point arrays would be reinterpreted as integers --
        confirm against callers.
    """
    # Convert numpy arrays to AudioSegment
    speech_audio = AudioSegment(
        speech_data.tobytes(),
        frame_rate=speech_sr,
        sample_width=speech_data.dtype.itemsize,
        channels=1,
    )
    noise_audio = AudioSegment(
        noise_data.tobytes(),
        frame_rate=noise_sr,
        sample_width=noise_data.dtype.itemsize,
        channels=1,
    )

    # len() of an AudioSegment is its duration in milliseconds.
    speech_duration = len(speech_audio) / 1000.0  # Convert ms to sec
    noise_duration = len(noise_audio) / 1000.0

    # Use 5 ms or 1/4 of the speech length, whichever is smaller. This is 0
    # for clips shorter than 4 ms, in which case fades are skipped entirely.
    crossfade_duration = min(5, len(speech_audio) // 4)

    # Cut a random window of noise as long as the speech. Noise that is not
    # longer than the speech is used whole (it is not looped).
    if noise_duration <= speech_duration:
        trimmed_noise = noise_audio
    else:
        start_time = random.uniform(0, noise_duration - speech_duration) * 1000
        trimmed_noise = noise_audio[
            start_time:start_time + (speech_duration * 1000)
        ]

    # Fade the edges of the noise to avoid clicks
    trimmed_noise = _apply_edge_fades(trimmed_noise, crossfade_duration)

    # Match sample rates before mixing
    trimmed_noise = trimmed_noise.set_frame_rate(speech_sr)

    # Volume adjustment; a changed level is followed by an edge fade.
    adjusted_speech = speech_audio
    if alpha != 0:
        adjusted_speech = _apply_edge_fades(
            adjust_volume(speech_audio, alpha), crossfade_duration
        )

    adjusted_noise = trimmed_noise
    if beta != 0:
        # NOTE(review): the noise was already faded above, so it is faded a
        # second time here; kept as in the original.
        adjusted_noise = _apply_edge_fades(
            adjust_volume(trimmed_noise, beta), crossfade_duration
        )

    # Overlay the noise on the speech, leaving the speech gain unchanged
    final_audio = adjusted_speech.overlay(adjusted_noise, gain_during_overlay=0)

    return final_audio


# NOTE(review): the example below predates the current process_audio
# signature (it passes two file names instead of arrays, sample rates and
# gains); kept for reference only.
# final_audio = process_audio("anushka.wav", "traffic.wav")
# # Single write operation at the end
# final_audio.export("output-traffic.wav", format="wav")
# print("Processing complete. Check output.wav!")

# -18, -20 for office
# -13 , -20 for market
# -18, -20 for traffic