File size: 5,297 Bytes
3aa4ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314bd70
3aa4ef5
314bd70
 
3aa4ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314bd70
3aa4ef5
314bd70
3aa4ef5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9a27afb
 
3aa4ef5
314bd70
3aa4ef5
 
 
 
 
314bd70
 
3aa4ef5
 
314bd70
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import io
import tempfile

import cv2
import gradio as gr
import librosa
import librosa.display  # specshow lives in the display submodule; load it explicitly
import matplotlib.pyplot as plt
import numpy as np
from gradio_imageslider import ImageSlider
from PIL import Image

def generate_mel_spectrogram(audio_path, sr=22050, n_mels=128, fmin=0, fmax=7000):
    """Load an audio file and compute its mel spectrogram in decibels.

    Args:
        audio_path: Path to the audio file (anything librosa.load accepts).
        sr: Target sample rate for resampling on load.
        n_mels: Number of mel bands.
        fmin, fmax: Frequency range (Hz) of the mel filter bank.

    Returns:
        Tuple of (spectrogram in dB, waveform, sample rate).
    """
    waveform, sample_rate = librosa.load(audio_path, sr=sr)

    mel_power = librosa.feature.melspectrogram(
        y=waveform, sr=sample_rate, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    # Convert power to dB relative to the loudest bin.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    return mel_db, waveform, sample_rate

def detect_zero_db(spectrogram, threshold, tol):
    """Return a boolean mask of spectrogram bins near ``threshold`` dB.

    A bin is marked True when its value is within ``tol`` (absolute
    tolerance, via np.isclose) of ``threshold``.
    """
    return np.isclose(spectrogram, threshold, atol=tol)

def plot_spectrogram(spectrogram, file_path, sr=22050, fmin=0, fmax=7000):
    """Render a mel spectrogram (dB) to a borderless PNG at ``file_path``.

    Args:
        spectrogram: 2-D array of dB values, as produced by
            generate_mel_spectrogram.
        file_path: Destination PNG path.
        sr, fmin, fmax: Display parameters; defaults match the defaults of
            generate_mel_spectrogram. Previously these were hard-coded, so
            spectrograms computed with other settings were mislabeled —
            they are now parameters with the same default values.
    """
    plt.figure(figsize=(6, 6))
    plt.axis('off')  # image-only output, no axes/ticks
    librosa.display.specshow(spectrogram, sr=sr, x_axis='time', y_axis='mel',
                             fmin=fmin, fmax=fmax)
    plt.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    plt.close()  # free the figure so repeated calls don't accumulate memory

def plot_edge_spectrogram(edges, file_path):
    """Save the detected mask as a grayscale, axis-free PNG at ``file_path``."""
    fig = plt.figure(figsize=(6, 6))
    plt.axis('off')
    # origin='lower' keeps low frequencies at the bottom, matching specshow.
    plt.imshow(edges, cmap='gray', aspect='auto', origin='lower')
    fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)

def plot_frequency(times, frequencies, label, color, file_path):
    """Plot one frequency track over time and write it to ``file_path`` as PNG.

    Args:
        times: X values in seconds.
        frequencies: Y values in Hz (NaN gaps are simply not drawn).
        label: Series name used for the title and legend.
        color: Matplotlib color for the line.
        file_path: Destination PNG path.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(times, frequencies, label=label, color=color, linewidth=2)
    ax.set_title(f'{label} Frequency')
    ax.set_xlabel('Time (s)')
    ax.set_ylabel('Frequency (Hz)')
    ax.legend()
    fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    plt.close(fig)

def process_audio(threshold, audio_file, tol):
    """Run the full analysis pipeline on one audio file.

    Computes the mel spectrogram, a near-``threshold``-dB mask, the F0
    track (pyin), and rough F1/F2 formant estimates (LPC root-finding),
    saving each as a temporary PNG.

    Args:
        threshold: dB level to match in the mask (see detect_zero_db).
        audio_file: Path to the uploaded audio file.
        tol: Absolute tolerance around ``threshold``.

    Returns:
        ([mel_png, edge_png], f0_png, f1_png, f2_png) — file paths shaped
        for the Gradio outputs (ImageSlider pair plus three images).
    """
    mel_spectrogram, y, sr = generate_mel_spectrogram(audio_file)
    edges = detect_zero_db(mel_spectrogram, threshold, tol)

    # Temp files outlive the `with` (delete=False) so Gradio can read them.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as mel_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as edge_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f0_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f1_file, \
        tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f2_file:

        mel_spectrogram_img = mel_file.name
        edge_spectrogram_img = edge_file.name
        f0_img = f0_file.name
        f1_img = f1_file.name
        f2_img = f2_file.name

        # Spectrogram before/after the mask, for the image slider.
        plot_spectrogram(mel_spectrogram, mel_spectrogram_img)
        plot_edge_spectrogram(edges, edge_spectrogram_img)

        # Fundamental frequency track over the singing/speech range.
        f0, voiced_flag, voiced_probs = librosa.pyin(
            y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
        times = librosa.times_like(f0, sr=sr)

        plot_frequency(times, f0, 'F0', 'cyan', f0_img)

        # Formant (F1/F2) estimation via LPC on voiced frames.
        lpc_order = 5  # low order => at most ~2 resonances from the roots
        formants = np.full((times.shape[0], 2), np.nan)  # NaN = unvoiced/no estimate

        for i in range(len(times)):
            # BUG FIX: the original sliced y[int(i*sr):int(i*sr+sr)], using
            # the frame *index* i as if it were seconds — beyond the first
            # few frames that slice ran past the signal and every frame was
            # empty. Index by the frame's actual time instead.
            start = int(times[i] * sr)
            if not voiced_flag[i] or start >= len(y):
                continue
            frame = y[start:start + sr]  # up to 1 second of samples
            if len(frame) == 0:
                continue

            # Roots of the LPC polynomial in the upper half-plane map to
            # resonance frequencies via their angles.
            A = librosa.lpc(frame, order=lpc_order)
            rts = np.roots(A)
            rts = rts[np.imag(rts) >= 0]
            angz = np.arctan2(np.imag(rts), np.real(rts))
            frqs = np.sort(angz * (sr / (2 * np.pi)))

            # NOTE(review): real positive roots yield 0 Hz entries, so
            # frqs[0] may be a DC artifact rather than F1 — a cutoff like
            # frqs > 90 Hz is common; kept as-is to preserve output.
            if len(frqs) >= 2:
                formants[i, 0] = frqs[0]  # F1
                formants[i, 1] = frqs[1]  # F2

        plot_frequency(times, formants[:, 0], 'F1', 'magenta', f1_img)
        plot_frequency(times, formants[:, 1], 'F2', 'yellow', f2_img)

    return [mel_spectrogram_img, edge_spectrogram_img], f0_img, f1_img, f2_img

# Gradio UI: two sliders control the mask, one audio upload feeds
# process_audio, and four image widgets show the results.
with gr.Blocks() as demo:
    with gr.Group():
        threshold_slider = gr.Slider(-100, 0, value=-2, info="Choose between -100 and 0", label="db Level")
        # BUG FIX: the info text said "between 0 and 25" while the slider's
        # actual range is 0-45; the text now matches the range.
        tol_slider = gr.Slider(0, 45, value=30, info="Choose between 0 and 45", label="Tolerance")
        audio_input = gr.Audio(label="Upload an audio file in WAV format", type="filepath")
        submit_button = gr.Button("Submit")

        # Outputs: before/after comparison slider plus one plot per track.
        img_slider = ImageSlider(label="Before and After Edge Detection", type="filepath", slider_color="pink")
        f0_plot = gr.Image(label="F0 Frequency Plot", type="filepath")
        f1_plot = gr.Image(label="F1 Frequency Plot", type="filepath")
        f2_plot = gr.Image(label="F2 Frequency Plot", type="filepath")

        # Input order must match process_audio(threshold, audio_file, tol).
        submit_button.click(
            process_audio,
            inputs=[threshold_slider, audio_input, tol_slider],
            outputs=[img_slider, f0_plot, f1_plot, f2_plot],
        )

if __name__ == "__main__":
    demo.launch()