sjagird1 committed
Commit 5e1a51c · verified · Parent: f1fbf51

Create app.py

Files changed (1):
  1. app.py +108 -0
app.py ADDED
@@ -0,0 +1,108 @@
+ import os
+ import numpy as np
+ import librosa
+ import tensorflow as tf
+ import gradio as gr
+
+ class SpeechEmotionRecognizer:
+     def __init__(self, model_path):
+         self.model = tf.keras.models.load_model(model_path)
+         self.sample_rate = 22050
+         self.duration = 4  # seconds
+         self.emotion_labels = ['Anger', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']
+
+     def extract_melspectrogram(self, audio_path):
+         try:
+             # Load and resample audio
+             audio, sr = librosa.load(audio_path, sr=self.sample_rate)
+
+             # Ensure audio is exactly 4 seconds
+             target_length = self.sample_rate * self.duration
+             if len(audio) < target_length:
+                 audio = np.pad(audio, (0, int(target_length - len(audio))))
+             else:
+                 audio = audio[:int(target_length)]
+
+             # Extract mel-spectrogram
+             mel_spec = librosa.feature.melspectrogram(
+                 y=audio,
+                 sr=self.sample_rate,
+                 n_mels=128,
+                 n_fft=2048,
+                 hop_length=512,
+                 win_length=2048,
+                 fmax=8000
+             )
+
+             mel_spec_db = librosa.power_to_db(mel_spec + 1e-10, ref=np.max)
+
+             # Normalize
+             mean = np.mean(mel_spec_db)
+             std = np.std(mel_spec_db)
+             mel_spec_norm = (mel_spec_db - mean) / (std + 1e-10)
+
+             # Clip extreme values
+             mel_spec_norm = np.clip(mel_spec_norm, -5, 5)
+
+             # Ensure correct shape (128, 173)
+             target_length = 173
+             if mel_spec_norm.shape[1] > target_length:
+                 mel_spec_norm = mel_spec_norm[:, :target_length]
+             elif mel_spec_norm.shape[1] < target_length:
+                 pad_width = target_length - mel_spec_norm.shape[1]
+                 mel_spec_norm = np.pad(mel_spec_norm, ((0, 0), (0, pad_width)), mode='constant')
+
+             return mel_spec_norm.reshape((1, 128, 173, 1))
+
+         except Exception as e:
+             raise gr.Error(f"Error processing audio: {str(e)}")
+
+     def predict_emotion(self, audio_path):
+         try:
+             # Extract features
+             mel_spec = self.extract_melspectrogram(audio_path)
+
+             # Make prediction
+             prediction = self.model.predict(mel_spec)
+             emotion_index = np.argmax(prediction)
+             confidence = float(prediction[0][emotion_index])
+
+             # Create results dictionary with confidence scores
+             results = {emotion: float(pred) for emotion, pred in zip(self.emotion_labels, prediction[0])}
+
+             return results
+
+         except Exception as e:
+             raise gr.Error(f"Prediction error: {str(e)}")
+
+ # Initialize the model
+ recognizer = SpeechEmotionRecognizer('final_model_conv2d_1K_1.keras')
+
+ # Define the Gradio interface
+ def process_audio(audio):
+     if audio is None:
+         raise gr.Error("Please provide an audio input")
+
+     results = recognizer.predict_emotion(audio)
+     return results
+
+ # Create the Gradio interface
+ demo = gr.Interface(
+     fn=process_audio,
+     inputs=[
+         gr.Audio(
+             source="microphone",
+             type="filepath",
+             label="Record audio (4 seconds)"
+         )
+     ],
+     outputs=gr.Label(num_top_classes=6),
+     title="Speech Emotion Recognition",
+     description="Record a 4-second audio clip to detect the emotion in your voice.",
+     examples=None,  # You can add example audio files here
+     theme=gr.themes.Base()
+ )
+
+ # Launch the app
+ if __name__ == "__main__":
+     demo.launch()
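
A quick way to sanity-check the hard-coded (128, 173) input shape without needing the final_model_conv2d_1K_1.keras checkpoint: with hop_length=512 and librosa's default center=True, a 4-second clip at 22050 Hz produces 1 + (22050*4)//512 = 173 frames, so the pad/truncate branch above should rarely fire. A minimal sketch, assuming only numpy and librosa are installed:

import numpy as np
import librosa

# Mirror the parameters used in extract_melspectrogram above
sr, duration, hop = 22050, 4, 512
audio = np.zeros(sr * duration, dtype=np.float32)  # silent 4-second test clip
mel = librosa.feature.melspectrogram(
    y=audio, sr=sr, n_mels=128, n_fft=2048,
    hop_length=hop, win_length=2048, fmax=8000
)
assert mel.shape == (128, 173), mel.shape  # matches the model's expected input

One version caveat: gr.Audio(source="microphone") is the Gradio 3.x API; Gradio 4 renamed the parameter to sources=["microphone"] (a list), so the interface block would need that change on newer versions.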