Spaces:
Sleeping
Sleeping
File size: 5,297 Bytes
3aa4ef5 314bd70 3aa4ef5 314bd70 3aa4ef5 314bd70 3aa4ef5 314bd70 3aa4ef5 9a27afb 3aa4ef5 314bd70 3aa4ef5 314bd70 3aa4ef5 314bd70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
import librosa
import numpy as np
import matplotlib.pyplot as plt
import cv2
import io
import tempfile
from PIL import Image
import gradio as gr
from gradio_imageslider import ImageSlider
def generate_mel_spectrogram(audio_path, sr=22050, n_mels=128, fmin=0, fmax=7000):
    """Load an audio file and compute its Mel spectrogram in decibels.

    Returns a tuple of (dB-scaled spectrogram, raw audio samples, sample rate).
    The dB conversion is referenced to the spectrogram's maximum power, so the
    loudest bin sits at 0 dB.
    """
    signal, sample_rate = librosa.load(audio_path, sr=sr)
    mel_power = librosa.feature.melspectrogram(
        y=signal, sr=sample_rate, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    return mel_db, signal, sample_rate
def detect_zero_db(spectrogram, threshold, tol):
    """Flag spectrogram bins whose dB value lies near ``threshold``.

    Returns a boolean mask the same shape as ``spectrogram``; a bin is True
    when its value is within the absolute tolerance ``tol`` of ``threshold``
    (numpy's ``isclose`` comparison).
    """
    return np.isclose(spectrogram, threshold, atol=tol)
def plot_spectrogram(spectrogram, file_path, sr=22050, fmin=0, fmax=7000):
    """Render a dB-scaled Mel spectrogram to ``file_path`` as a borderless PNG.

    Parameters
    ----------
    spectrogram : np.ndarray
        dB-scaled Mel spectrogram (as produced by ``generate_mel_spectrogram``).
    file_path : str
        Destination image path.
    sr, fmin, fmax : int
        Display parameters. Previously hardcoded inside the body; now
        parameters (with the same defaults) so they can be kept in sync with
        the values used to compute the spectrogram.
    """
    plt.figure(figsize=(6, 6))
    plt.axis('off')  # image-only output: no axes, ticks, or labels
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel',
                             fmin=fmin, fmax=fmax)
    plt.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    plt.close()
def plot_edge_spectrogram(edges, file_path):
    """Write the boolean 0-dB mask to ``file_path`` as a borderless grayscale PNG."""
    fig = plt.figure(figsize=(6, 6))
    plt.axis('off')  # suppress axes so only the mask image is saved
    plt.imshow(edges, cmap='gray', aspect='auto', origin='lower')
    fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)
def plot_frequency(times, frequencies, label, color, file_path):
    """Plot a frequency track (Hz) against time (s) and save it to ``file_path``.

    ``label`` names the track (e.g. 'F0') in the title and legend; ``color``
    is any matplotlib color spec.
    """
    fig = plt.figure(figsize=(12, 6))
    plt.plot(times, frequencies, label=label, color=color, linewidth=2)
    plt.title(f'{label} Frequency')
    plt.xlabel('Time (s)')
    plt.ylabel('Frequency (Hz)')
    plt.legend()
    fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)
def process_audio(threshold, audio_file, tol):
    """Run the full analysis pipeline on an uploaded audio file.

    Parameters
    ----------
    threshold : float
        dB level around which spectrogram bins are flagged (from the slider).
    audio_file : str
        Filesystem path to the uploaded audio file.
    tol : float
        Absolute tolerance in dB for the threshold comparison.

    Returns
    -------
    tuple
        ``([mel_png, edge_png], f0_png, f1_png, f2_png)`` — file paths in the
        order the Gradio outputs expect them.
    """
    mel_spectrogram, y, sr = generate_mel_spectrogram(audio_file)
    edges = detect_zero_db(mel_spectrogram, threshold, tol)

    # Create temporary files for the generated images. delete=False so the
    # files survive this handler and Gradio can serve them afterwards.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as mel_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as edge_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f0_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f1_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f2_file:
        mel_spectrogram_img = mel_file.name
        edge_spectrogram_img = edge_file.name
        f0_img = f0_file.name
        f1_img = f1_file.name
        f2_img = f2_file.name
    # Save the Mel spectrogram and edge-detected spectrogram.
    plot_spectrogram(mel_spectrogram, mel_spectrogram_img)
    plot_edge_spectrogram(edges, edge_spectrogram_img)
    # Fundamental frequency (F0) via probabilistic YIN.
    f0, voiced_flag, voiced_probs = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    times = librosa.times_like(f0, sr=sr)
    plot_frequency(times, f0, 'F0', 'cyan', f0_img)
    # Formant frequency (F1 and F2) detection using LPC root-finding.
    lpc_order = 5  # low order keeps only the strongest resonances
    formants = np.empty((times.shape[0], 2))  # columns: F1, F2
    formants[:] = np.nan  # NaN marks unvoiced / skipped frames
    for i in range(len(times)):
        # BUG FIX: the original used `i * sr` as the start sample, treating
        # the frame index as whole seconds. pyin frames are hop-spaced
        # (times[i] seconds apart, ~23 ms by default), so nearly every frame
        # past the first second failed the bounds check and F1/F2 stayed NaN.
        # The correct start sample for frame i is times[i] * sr.
        start = int(times[i] * sr)
        if voiced_flag[i] and start < len(y):
            frame = y[start:start + sr]  # analyze up to 1 s from the frame start
            if len(frame) == 0:
                continue
            # LPC polynomial -> roots -> resonance frequencies (Hz).
            A = librosa.lpc(frame, order=lpc_order)
            rts = np.roots(A)
            rts = rts[np.imag(rts) >= 0]  # keep upper-half-plane roots only
            angz = np.arctan2(np.imag(rts), np.real(rts))
            frqs = np.sort(angz * (sr / (2 * np.pi)))
            if len(frqs) >= 2:
                # NOTE(review): frqs[0] can be ~0 Hz when a root is real;
                # a stricter tracker would drop sub-90 Hz candidates. Kept
                # as-is to preserve the original selection behavior.
                formants[i, 0] = frqs[0]  # F1
                formants[i, 1] = frqs[1]  # F2
    plot_frequency(times, formants[:, 0], 'F1', 'magenta', f1_img)
    plot_frequency(times, formants[:, 1], 'F2', 'yellow', f2_img)
    return [mel_spectrogram_img, edge_spectrogram_img], f0_img, f1_img, f2_img
# Gradio UI: sliders feed process_audio; outputs are the before/after
# spectrogram slider plus the three frequency plots.
with gr.Blocks() as demo:
    with gr.Group():
        threshold_slider = gr.Slider(
            -100, 0, value=-2, info="Choose between -100 and 0",
            label="dB Level")  # FIX: label typo "db" -> "dB"
        # FIX: help text previously said "between 0 and 25" while the slider
        # actually spans 0-45; the text now matches the real range.
        tol_slider = gr.Slider(
            0, 45, value=30, info="Choose between 0 and 45", label="Tolerance")
        audio_input = gr.Audio(
            label="Upload an audio file in WAV format", type="filepath")
        submit_button = gr.Button("Submit")
        img_slider = ImageSlider(
            label="Before and After Edge Detection", type="filepath",
            slider_color="pink")
        f0_plot = gr.Image(label="F0 Frequency Plot", type="filepath")
        f1_plot = gr.Image(label="F1 Frequency Plot", type="filepath")
        f2_plot = gr.Image(label="F2 Frequency Plot", type="filepath")
    # Argument order must match process_audio(threshold, audio_file, tol).
    submit_button.click(
        process_audio,
        inputs=[threshold_slider, audio_input, tol_slider],
        outputs=[img_slider, f0_plot, f1_plot, f2_plot],
    )
if __name__ == "__main__":
    demo.launch()