Spaces:

VigneshDark
/

background_noise_over_speech

Sleeping

File size: 3,436 Bytes

import random
from pydub import AudioSegment  # type: ignore
# from pydub.effects import normalize
# import numpy as np  # type: ignore


def get_audio_volume_db(audio, silence_db=-50.0):
    """Estimate the volume in dBFS (decibels relative to full scale) using PyDub.

    Args:
        audio: Object exposing a ``dBFS`` attribute, e.g. a PyDub
            ``AudioSegment``.
        silence_db (float): Value returned when ``audio.dBFS`` is negative
            infinity (digital silence). Defaults to -50.0.

    Returns:
        float: ``audio.dBFS``, or ``silence_db`` when it is ``-inf``.
    """
    # Read the attribute once: on an AudioSegment, dBFS is a computed
    # property, so a second read would repeat the loudness measurement.
    level = audio.dBFS
    return silence_db if level == float('-inf') else level


def adjust_volume(audio, volume_change_db):
    """Return ``audio`` with ``volume_change_db`` applied through ``+``.

    For a PyDub ``AudioSegment``, adding a number applies that many dB of
    gain (negative values attenuate). The input segment is not modified.
    """
    adjusted = audio + volume_change_db
    return adjusted


# def compress_audio(audio):
#     """Apply compression to normalize speech volume."""
#     return normalize(audio)


def place_in_stereo(audio, pan_value):
    """Pan ``audio`` within the stereo field and return the result.

    ``pan_value`` is forwarded unchanged to ``audio.pan``:
    -1.0 = full left, 1.0 = full right.
    """
    panned = audio.pan(pan_value)
    return panned


def overlay_audio(speech_audio, noise_audio):
    """Mix ``noise_audio`` on top of ``speech_audio`` using PyDub's overlay.

    ``speech_audio`` is the base segment; the result is whatever its
    ``overlay`` method returns for ``noise_audio``.
    """
    mixed = speech_audio.overlay(noise_audio)
    return mixed


def _to_audio_segment(samples, sample_rate):
    """Wrap a numpy sample array in a PyDub AudioSegment."""
    # Only an (n_samples, 2) array is treated as stereo: its C-order bytes
    # are interleaved L/R frames, which is the raw layout AudioSegment takes.
    # Every other shape keeps the previous behaviour (flat mono stream).
    # NOTE(review): assumes integer PCM samples (e.g. int16); float arrays
    # would be reinterpreted as integers of the same width -- confirm callers.
    is_stereo = samples.ndim == 2 and samples.shape[1] == 2
    return AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=2 if is_stereo else 1,
    )


def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
    """
    Mix background noise over speech and return the combined audio.

    The noise is cut to the speech length at a random offset (or used whole
    when it is not longer than the speech), resampled to the speech sample
    rate, gain-adjusted, and overlaid on the speech.

    Args:
        speech_data (numpy.ndarray): Speech samples; 1-D mono, or
            (n_samples, 2) stereo.
        noise_data (numpy.ndarray): Noise samples, same layout rules.
        speech_sr (int): Speech sample rate
        noise_sr (int): Noise sample rate
        alpha (float): Speech volume adjustment passed to adjust_volume;
            0 leaves the speech untouched (no gain change, no fades).
        beta (float): Noise volume adjustment passed to adjust_volume;
            0 leaves the noise untouched (no gain change, no extra fades).

    Returns:
        AudioSegment: Speech with the noise overlaid from position 0.
    """
    speech_audio = _to_audio_segment(speech_data, speech_sr)
    noise_audio = _to_audio_segment(noise_data, noise_sr)

    # len() of an AudioSegment is its duration in milliseconds, so all the
    # trimming arithmetic stays in ms (no ms -> s -> ms float round trip).
    speech_ms = len(speech_audio)
    noise_ms = len(noise_audio)

    # Fade length: 5 ms, or a quarter of the speech for very short clips.
    fade_ms = min(5, speech_ms // 4)

    if noise_ms <= speech_ms:
        # Noise is not longer than the speech: use all of it. It will only
        # cover the beginning of the speech when overlaid.
        trimmed_noise = noise_audio
    else:
        # Pick a random window of the noise as long as the speech, and fade
        # its edges to avoid clicks at the cut points.
        start_ms = random.uniform(0, noise_ms - speech_ms)
        trimmed_noise = noise_audio[start_ms:start_ms + speech_ms]
        trimmed_noise = trimmed_noise.fade_in(fade_ms).fade_out(fade_ms)

    # Match sample rates before mixing
    trimmed_noise = trimmed_noise.set_frame_rate(speech_sr)

    # Apply the requested gains; fades accompany every gain change.
    adjusted_speech = speech_audio
    if alpha != 0:
        adjusted_speech = (
            adjust_volume(speech_audio, alpha)
            .fade_in(fade_ms)
            .fade_out(fade_ms)
        )

    adjusted_noise = trimmed_noise
    if beta != 0:
        adjusted_noise = (
            adjust_volume(trimmed_noise, beta)
            .fade_in(fade_ms)
            .fade_out(fade_ms)
        )

    # gain_during_overlay=0 leaves the speech level unchanged while the
    # noise plays.
    final_audio = adjusted_speech.overlay(adjusted_noise, gain_during_overlay=0)

    return final_audio


# final_audio = process_audio("anushka.wav", "traffic.wav")
# # Single write operation at the end
# final_audio.export("output-traffic.wav", format="wav")

# print("Processing complete. Check output-traffic.wav!")


# -18, -20 for office
# -13, -20 for market
# -18, -20 for traffic