# voice_emotion_classification.py
import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")
def install_package(package, version=None):
    package_spec = f"{package}=={version}" if version else package
    print(f"Installing {package_spec}...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package_spec}: {e}")
        raise
# Required packages (you may add version pins if necessary)
required_packages = {
    "gradio": None,
    "torch": None,
    "torchaudio": None,
    "transformers": None,
    "librosa": None,
    "scipy": None,
    "matplotlib": None,
    "pydub": None
}

installed_packages = {pkg.key for pkg in pkg_resources.working_set}
for package, version in required_packages.items():
    if package not in installed_packages:
        install_package(package, version)
# Now import all necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend (set before importing pyplot)
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from pydub import AudioSegment
import scipy.io.wavfile  # imported explicitly so scipy.io.wavfile.write is available below
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
# Define emotion labels, tone mapping, and descriptions
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
}
# Map each emotion to a generalized tone (positive, neutral, or negative)
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"]
}
# Some Hugging Face models return short labels (e.g., "hap", "ang", etc.).
# This mapping translates them into our full canonical labels.
MODEL_TO_EMOTION_MAP = {
    "hap": "happy",
    "ang": "angry",
    "sad": "sad",
    "dis": "disgust",
    "fea": "fear",
    "neu": "neutral",
    "sur": "surprise"
}
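
# Note (assumption): superb/hubert-large-superb-er follows the 4-class SUPERB/IEMOCAP setup,
# so in practice it typically emits only "neu", "hap", "ang", and "sad". The extra entries
# above are kept so the mapping still works if a model with more labels is swapped in.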
# Global variable for the emotion classifier
audio_emotion_classifier = None


def load_emotion_model():
    """Load the emotion classification model once and cache it."""
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
            # Using the Hugging Face pipeline with the model that classifies speech emotion
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading emotion model: {e}")
            return False
    return True
def convert_audio_to_wav(audio_file):
    """Convert the uploaded audio to WAV format."""
    try:
        audio = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            wav_path = temp_wav.name
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None
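
# Note (assumption): pydub relies on an ffmpeg/libav binary being available on the PATH to
# decode non-WAV formats (mp3, m4a, ogg, ...). On a bare environment the conversion above is
# likely to fail until ffmpeg is installed on the system.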
def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
    """
    Analyze emotions in an audio file by processing it in chunks.
    Returns a visualization path, processed audio path, summary, and detailed results.
    """
    if not load_emotion_model():
        # Return the same 4-tuple shape as the success path so callers can unpack it safely.
        return None, None, "Failed to load emotion classification model. Please check console for details.", None

    # If the file is already a WAV, use it directly; otherwise convert it.
    if audio_file.endswith('.wav'):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
    if not audio_path:
        return None, None, "Failed to process audio file. Unsupported format or corrupted file.", None
    try:
        # Load the audio using librosa
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        # Process in chunks for long files
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions = []
        time_points = []
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Skip too-short chunks (< 0.5 seconds)
            if len(chunk) < 0.5 * sample_rate:
                continue

            # Create a temporary file for this audio chunk
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                chunk_path = temp_chunk.name
            scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))

            # Get emotion classification results on this chunk
            results = audio_emotion_classifier(chunk_path)
            os.unlink(chunk_path)  # Remove the temporary file

            all_emotions.append(results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))
        # Generate visualization and summary
        fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, duration)
        with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
            img_path = temp_img.name
        fig.savefig(img_path, dpi=100, bbox_inches='tight')
        plt.close(fig)

        summary = generate_emotion_summary(all_emotions, time_points)
        return img_path, audio_path, summary, detailed_results
    except Exception as e:
        print(f"Error analyzing audio: {e}")
        import traceback
        traceback.print_exc()
        return None, None, f"Error analyzing audio: {str(e)}", None
def generate_emotion_timeline(all_emotions, time_points, duration):
    """
    Generate a bar chart visualization of emotion percentages with tone analysis.
    Returns the matplotlib figure and a list of detailed results.
    """
    # All possible emotion labels from our dictionary
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # We'll accumulate counts based on our canonical labels (e.g., "happy", "angry").
    emotion_counts = {}
    for emotions in all_emotions:
        if not emotions:
            continue
        # The pipeline returns items like {"label": "hap", "score": 0.95}, etc.
        top_emotion = max(emotions, key=lambda x: x['score'])
        # Normalize the label from the model to a canonical label used in EMOTION_DESCRIPTIONS.
        # If there's no mapping, we leave it as raw_label, but it should typically be one of
        # "happy", "angry", "disgust", "fear", "sad", "neutral", "surprise".
        raw_label = top_emotion['label'].lower().strip()  # e.g., "hap", "ang", ...
        canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
        # Count how many times each canonical label appears
        emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
    total_chunks = len(all_emotions)
    emotion_percentages = {
        e: (count / total_chunks * 100) for e, count in emotion_counts.items()
    }

    # Create zero percentages for emotions that didn't appear
    for label in emotion_labels:
        if label not in emotion_percentages:
            emotion_percentages[label] = 0.0

    # Sort emotions by percentage
    sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)
    # Create the bar chart with subplots: one for emotions and one for tone
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10),
                                   gridspec_kw={'height_ratios': [3, 1], 'hspace': 0.3})

    # Capitalize each label for a nice display
    emotions = [item[0].capitalize() for item in sorted_emotions]
    percentages = [item[1] for item in sorted_emotions]

    # Custom colors for emotions (enough for 7 emotions)
    colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
    if len(emotions) <= len(colors):
        bar_colors = colors[:len(emotions)]
    else:
        # Fallback if there are more emotions than colors
        bar_colors = colors + ['#666666'] * (len(emotions) - len(colors))

    # Plot emotion bars
    bars = ax1.bar(emotions, percentages, color=bar_colors)

    # Add percentage labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax1.annotate(f'{height:.1f}%',
                     xy=(bar.get_x() + bar.get_width() / 2, height),
                     xytext=(0, 3),  # 3 points vertical offset
                     textcoords="offset points",
                     ha='center', va='bottom')

    ax1.set_ylim(0, 100)  # Fixed 100% scale
    ax1.set_ylabel('Percentage (%)')
    ax1.set_title('Emotion Distribution')
    ax1.grid(axis='y', linestyle='--', alpha=0.7)
    # Calculate tone percentages based on the canonical labels we found
    tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
    for emotion_label, percentage in emotion_percentages.items():
        for tone, emotions_list in TONE_MAPPING.items():
            if emotion_label in emotions_list:
                tone_percentages[tone] += percentage

    # Plot tone bars
    tones = list(tone_percentages.keys())
    tone_values = list(tone_percentages.values())
    tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
    tone_bars = ax2.bar(tones, tone_values, color=[tone_colors[t] for t in tones])

    # Add percentage labels on tone bars
    for bar in tone_bars:
        height = bar.get_height()
        if height > 0:  # Only add a label if there's a visible bar
            ax2.annotate(f'{height:.1f}%',
                         xy=(bar.get_x() + bar.get_width() / 2, height),
                         xytext=(0, 3),
                         textcoords="offset points",
                         ha='center', va='bottom')

    ax2.set_ylim(0, 100)
    ax2.set_ylabel('Percentage (%)')
    ax2.set_title('Tone Analysis')
    ax2.grid(axis='y', linestyle='--', alpha=0.7)

    plt.tight_layout()
    # Generate a more detailed time-segmented result
    detailed_results = []
    for idx, (emotions, (start_time, end_time)) in enumerate(zip(all_emotions, time_points)):
        if not emotions:
            continue
        top_emotion = max(emotions, key=lambda x: x['score'])
        raw_label = top_emotion['label'].lower().strip()
        canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
        # Determine the tone for this emotion
        # (based on canonical_label rather than the raw model label)
        tone = next((t for t, e_list in TONE_MAPPING.items() if canonical_label in e_list), "unknown")
        detailed_results.append({
            'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
            'Emotion': canonical_label,
            'Tone': tone.capitalize(),
            'Confidence': f"{top_emotion['score']:.2f}",
            'Description': EMOTION_DESCRIPTIONS.get(canonical_label, "")
        })
    return fig, detailed_results
def generate_emotion_summary(all_emotions, time_points):
    """
    Create a summary text from the emotion analysis.
    Counts occurrences and computes percentages of the dominant emotion.
    """
    if not all_emotions:
        return "No emotional content detected."

    emotion_counts = {}
    total_chunks = len(all_emotions)
    for emotions in all_emotions:
        if not emotions:
            continue
        top_emotion = max(emotions, key=lambda x: x['score'])
        # Normalize the label
        raw_label = top_emotion['label'].lower().strip()
        canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
        emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1

    emotion_percentages = {
        e: (count / total_chunks * 100)
        for e, count in emotion_counts.items()
    }
    if not emotion_percentages:
        return "No emotional content detected."

    # Find the dominant emotion (highest percentage)
    dominant_emotion = max(emotion_percentages.items(), key=lambda x: x[1])[0]
summary = f"### Voice Emotion Analysis Summary\n\n" | |
summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({emotion_percentages[dominant_emotion]:.1f}%)\n\n" | |
summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n" | |
summary += "**Emotion distribution:**\n" | |
for emotion, percentage in sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True): | |
summary += f"- {emotion.capitalize()}: {percentage:.1f}%\n" | |
summary += "\n**Interpretation:** The voice predominantly expresses {0} emotion".format(dominant_emotion) | |
return summary | |
def record_audio(audio):
    """Save recorded audio bytes to a temporary WAV file."""
    try:
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            audio_path = temp_file.name
        with open(audio_path, 'wb') as f:
            f.write(audio)
        return audio_path
    except Exception as e:
        print(f"Error saving recorded audio: {e}")
        return None
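
# Note (observation): record_audio expects raw audio bytes and is not wired into the UI below.
# Both tabs use gr.Audio(type="filepath") and pass the resulting path straight to process_audio,
# so this helper is effectively unused as written.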
def process_audio(audio_file, progress=gr.Progress()):
    """Process the audio file and analyze emotions."""
    if audio_file is None:
        return None, None, "No audio file provided.", None
    img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
    if img_path is None:
        return None, None, "Failed to analyze audio emotions.", None
    return img_path, processed_audio, summary, results
# Create Gradio interface
with gr.Blocks(title="Voice Emotion Analysis System") as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion Analysis System

    This app analyzes the emotional content of voice recordings.
    It detects emotions including:

    * 😡 **Anger**
    * 🤢 **Disgust**
    * 😨 **Fear**
    * 😊 **Happiness**
    * 😐 **Neutral**
    * 😢 **Sadness**
    * 😲 **Surprise**

    And provides a detailed analysis and timeline.
    """)
    with gr.Tabs():
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"]
                    )
                    process_btn = gr.Button("Analyze Voice Emotions")
                with gr.Column(scale=2):
                    emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
            with gr.Row():
                audio_playback = gr.Audio(label="Processed Audio", show_label=True)
                emotion_summary = gr.Markdown(label="Emotion Summary")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )
            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results]
            )
with gr.TabItem("Record Voice"): | |
with gr.Row(): | |
with gr.Column(scale=1): | |
record_input = gr.Audio( | |
label="Record Your Voice", | |
sources=["microphone"], | |
type="filepath" | |
) | |
analyze_btn = gr.Button("Analyze Recording") | |
with gr.Column(scale=2): | |
rec_emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True) | |
with gr.Row(): | |
rec_audio_playback = gr.Audio(label="Processed Audio", show_label=True) | |
rec_emotion_summary = gr.Markdown(label="Emotion Summary") | |
with gr.Row(): | |
rec_emotion_results = gr.DataFrame( | |
headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"], | |
label="Detailed Emotion Analysis" | |
) | |
analyze_btn.click( | |
fn=process_audio, | |
inputs=[record_input], | |
outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results] | |
) | |
gr.Markdown(""" | |
### How to Use | |
1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions". | |
2. **Record Voice Tab:** Record your voice and click "Analyze Recording". | |
**Tips:** | |
- Use clear recordings with minimal background noise. | |
- Longer recordings yield more consistent results. | |
""") | |
def initialize_app():
    print("Initializing voice emotion analysis app...")
    if load_emotion_model():
        print("Emotion model loaded successfully!")
    else:
        print("Failed to load emotion model.")


if __name__ == "__main__":
    initialize_app()
    demo.launch()
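
# Usage note (assumption): run the script directly with `python voice_emotion_classification.py`;
# demo.launch() then serves the Gradio app locally (port 7860 by default). On Hugging Face Spaces
# the file is typically saved as app.py and launched automatically, so no extra arguments are needed.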