Update app.py
app.py
CHANGED
@@ -4,36 +4,46 @@ from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
 import librosa
 
 # Placeholder model (Replace later with your trained model)
-MODEL_NAME = "
-
-# Load the pre-trained model and processor
+MODEL_NAME = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
 processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
 model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)
 
-#
-
-
+# Emotion labels (based on the dataset used to train the model)
+id2label = {
+    0: "Neutral",
+    1: "Happy",
+    2: "Sad",
+    3: "Angry",
+    4: "Surprised",
+    5: "Disgusted",
+    6: "Fearful"
+}
+
+# Function to classify emotions from audio
+def classify_emotion(audio_file):
+    # Load and process audio
     speech, sr = librosa.load(audio_file, sr=16000)
-
-    # Preprocess with Hugging Face's feature extractor
     inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True, truncation=True)
 
-    #
+    # Get predictions
     with torch.no_grad():
         logits = model(**inputs).logits
-
     predicted_class_id = torch.argmax(logits, dim=-1).item()
-
+
+    # Convert class ID to emotion label
+    predicted_emotion = id2label.get(predicted_class_id, "Unknown")
+
+    return f"Predicted Emotion: {predicted_emotion}"
 
 # Gradio Interface
 interface = gr.Interface(
-    fn=
+    fn=classify_emotion,
     inputs=gr.Audio(source="upload", type="filepath"),
     outputs="text",
-    title="
-    description="Upload an audio file
+    title="Speech Emotion Classifier 🎭",
+    description="Upload an audio file and the model will classify its emotion (e.g., Happy, Sad, Angry)."
 )
 
-# Launch the
+# Launch the app
 if __name__ == "__main__":
     interface.launch()
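For reference: the commit hardcodes a seven-entry id2label table, while a transformers checkpoint also declares its own label mapping in its config. Below is a minimal sketch, not part of the commit, for inspecting that mapping and smoke-testing the same pipeline the app uses on a local file; "sample.wav" is a hypothetical path, and whether the checkpoint's classification head and label names match the hardcoded table is an assumption to verify.

import torch
import librosa
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor

MODEL_NAME = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"

# Load the checkpoint once and look at the labels its config declares.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_NAME)
print(model.config.num_labels)  # how many classes the loaded head outputs
print(model.config.id2label)    # the checkpoint's own id -> label mapping

# Smoke-test the same steps classify_emotion performs in app.py
# ("sample.wav" is a hypothetical local file).
speech, sr = librosa.load("sample.wav", sr=16000)
inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    logits = model(**inputs).logits
predicted_class_id = torch.argmax(logits, dim=-1).item()
print(model.config.id2label.get(predicted_class_id, "Unknown"))

If the printed mapping disagrees with the hardcoded table, the config's labels are the ones the model's output indices actually correspond to.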