Spaces:

VigneshDark
/

background_noise_over_speech

Sleeping

App Files Files Community

background_noise_over_speech / helper.py

VigneshDark

fade option

08e515a about 1 month ago

raw

history blame contribute delete

3.44 kB

	import random
	from pydub import AudioSegment # type: ignore
	# from pydub.effects import normalize
	# import numpy as np # type: ignore


	def get_audio_volume_db(audio, silence_db=-50.0):
	    """Estimate the loudness of *audio* in dBFS (decibels relative to full scale).

	    Args:
	        audio: Object exposing a ``dBFS`` attribute (a PyDub ``AudioSegment``).
	        silence_db (float): Value returned when ``audio.dBFS`` is negative
	            infinity, which is what the original code treats as silence.
	            Defaults to -50.0, the floor that was previously hard-coded.

	    Returns:
	        float: ``audio.dBFS``, or ``silence_db`` when that value is ``-inf``.
	    """
	    # Read the attribute once: on a PyDub segment ``dBFS`` is a computed
	    # property, so every access re-measures the whole clip.
	    level_db = audio.dBFS
	    if level_db == float('-inf'):
	        return silence_db
	    return level_db


	def adjust_volume(audio, volume_change_db):
	    """Return *audio* shifted by ``volume_change_db``.

	    The shift is delegated to the ``+`` operator of *audio*; for a PyDub
	    ``AudioSegment`` adding a number applies that many decibels of gain
	    (negative values attenuate). The input object is not modified here.
	    """
	    adjusted = audio + volume_change_db
	    return adjusted


	# def compress_audio(audio):
	# """Apply compression to normalize speech volume."""
	# return normalize(audio)


	def place_in_stereo(audio, pan_value):
	    """Position *audio* in the stereo field.

	    ``pan_value`` is forwarded unchanged to ``audio.pan``; -1.0 means fully
	    left and 1.0 means fully right.
	    """
	    panned = audio.pan(pan_value)
	    return panned


	def overlay_audio(speech_audio, noise_audio):
	    """Mix *noise_audio* on top of *speech_audio* and return the result.

	    The mixing itself is done by ``speech_audio.overlay`` (PyDub), called with
	    the noise as its only argument.
	    """
	    mixed = speech_audio.overlay(noise_audio)
	    return mixed


	def _to_segment(samples, sample_rate):
	    """Wrap a raw sample array in a PyDub ``AudioSegment``."""
	    # An array shaped (frames, 2) serialises (C order) to interleaved
	    # left/right samples, so it is declared as stereo. Every other shape
	    # keeps the original single-channel interpretation.
	    is_stereo = samples.ndim == 2 and samples.shape[1] == 2
	    return AudioSegment(
	        samples.tobytes(),
	        frame_rate=sample_rate,
	        sample_width=samples.dtype.itemsize,
	        channels=2 if is_stereo else 1,
	    )


	def _fade_edges(segment, fade_ms):
	    """Fade both ends of *segment* over ``fade_ms`` ms; no-op when nothing to fade."""
	    if fade_ms <= 0 or len(segment) == 0:
	        return segment
	    return segment.fade_in(fade_ms).fade_out(fade_ms)


	def process_audio(speech_data, noise_data, speech_sr, noise_sr, alpha, beta):
	    """
	    Mix background noise over a speech clip.

	    The noise is cut to the speech length (a random window when it is longer),
	    resampled to the speech sample rate, faded at both ends and overlaid on the
	    speech. Noise that is shorter than the speech is used as-is and is not
	    looped.

	    Args:
	        speech_data (numpy.ndarray): Speech samples. The bytes are handed to
	            PyDub unchanged with ``dtype.itemsize`` as the sample width.
	            NOTE(review): this presumes integer PCM samples; a float array
	            would be reinterpreted bytewise -- confirm against callers.
	            An array shaped ``(frames, 2)`` is treated as stereo.
	        noise_data (numpy.ndarray): Noise samples, same conventions.
	        speech_sr (int): Speech sample rate in Hz.
	        noise_sr (int): Noise sample rate in Hz.
	        alpha (float): Speech volume adjustment passed to ``adjust_volume``;
	            0 leaves the speech untouched (no gain, no fade).
	        beta (float): Noise volume adjustment passed to ``adjust_volume``;
	            0 applies no gain.

	    Returns:
	        AudioSegment: The speech with the noise mixed in.
	    """
	    speech_audio = _to_segment(speech_data, speech_sr)
	    noise_audio = _to_segment(noise_data, noise_sr)

	    # len() of a segment is its duration in milliseconds; staying in integer
	    # milliseconds avoids float round-trips and float slice indices.
	    speech_ms = len(speech_audio)

	    # Fade length: 5 ms, or a quarter of the speech for very short clips.
	    fade_ms = min(5, speech_ms // 4)

	    # Pick a random window of the noise when it is longer than the speech.
	    if len(noise_audio) > speech_ms:
	        start_ms = random.randint(0, len(noise_audio) - speech_ms)
	        noise_audio = noise_audio[start_ms:start_ms + speech_ms]

	    # Match sample rates before mixing.
	    noise_audio = noise_audio.set_frame_rate(speech_sr)

	    # The speech is only faded when its volume is changed, as before.
	    adjusted_speech = speech_audio
	    if alpha != 0:
	        adjusted_speech = _fade_edges(adjust_volume(speech_audio, alpha), fade_ms)

	    # Fade the noise exactly once, after any gain change. Previously a trimmed
	    # clip with beta != 0 was faded twice, steepening the intended envelope.
	    adjusted_noise = noise_audio
	    if beta != 0:
	        adjusted_noise = adjust_volume(noise_audio, beta)
	    adjusted_noise = _fade_edges(adjusted_noise, fade_ms)

	    # Plain overlay: a gain_during_overlay of 0 leaves the speech level as is.
	    return adjusted_speech.overlay(adjusted_noise, gain_during_overlay=0)


	# final_audio = process_audio("anushka.wav", "traffic.wav")
	# # Single write operation at the end
	# final_audio.export("output-traffic.wav", format="wav")

	# print("Processing complete. Check output.wav!")


	# -18, -20 for office
	# -13 , -20 for market
	# -18, -20 for traffic