Natwar committed
Commit 343474c · verified · 1 Parent(s): adc0b7c

Create app.py

Files changed (1)
  1. app.py +845 -0
app.py ADDED
@@ -0,0 +1,845 @@
import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

def install_package(package, version=None):
    package_spec = f"{package}=={version}" if version else package
    print(f"Installing {package_spec}...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package_spec}: {e}")
        raise
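# Example usage: install_package("librosa") installs the latest release, while
# install_package("librosa", "0.10.1") would pin that (illustrative) version.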

# Required packages (add version pins if needed)
required_packages = {
    "gradio": None,
    "torch": None,
    "torchaudio": None,
    "transformers": None,
    "librosa": None,
    "scipy": None,
    "matplotlib": None,
    "pydub": None,
    "plotly": None
}

installed_packages = {pkg.key for pkg in pkg_resources.working_set}
for package, version in required_packages.items():
    if package not in installed_packages:
        install_package(package, version)
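# Note: pkg_resources keys are lowercase distribution names; the check above works here
# because each listed package's distribution name matches the name used to install it.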

# Now import necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for any fallback
from pydub import AudioSegment
import scipy.io.wavfile  # explicit submodule import so scipy.io.wavfile.write is available
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import plotly.graph_objects as go

# Define emotion labels, tone mapping, and descriptions
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
}
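# These keys are assumed to match the label strings emitted by the audio-classification
# pipeline below; if the chosen checkpoint reports different label names, a mapping step
# would be needed before the lookups in the charting and summary code.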

# If you wish to group emotions by tone, you can do so here:
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"]
}

# Global variable for the emotion classifier
audio_emotion_classifier = None

def load_emotion_model():
    """Load and cache the speech emotion classification model."""
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading emotion model: {e}")
            return False
    return True

def convert_audio_to_wav(audio_file):
    """Convert uploaded audio to WAV format."""
    try:
        audio = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            wav_path = temp_wav.name
            audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None
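# Note: pydub delegates decoding of compressed formats (MP3, M4A, etc.) to ffmpeg/libav,
# which must be available on the system for the conversion above to succeed.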

def analyze_voice_tone(audio_file):
    """
    Analyze the tone characteristics of the voice using more robust measurements.
    Includes pitch variation, energy dynamics, and spectral features.
    """
    try:
        audio_data, sample_rate = librosa.load(audio_file, sr=16000)

        # 1. Basic audio features
        audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
        if audio_duration < 1.0:  # Too short for reliable analysis
            return "Audio too short for reliable tone analysis. Please provide at least a few seconds of speech."

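        # librosa.pyin estimates a per-frame fundamental frequency and marks unvoiced
        # frames as NaN, which is why the NaN values are filtered out afterwards.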
        # 2. Pitch analysis with more robust handling
        f0, voiced_flag, voiced_prob = librosa.pyin(
            audio_data,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sample_rate
        )

        # Filter out NaN values and get valid pitch points
        valid_f0 = f0[~np.isnan(f0)]

        # If no pitch detected, may be noise or silence
        if len(valid_f0) < 10:
            return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."

        # 3. Calculate improved statistics
        mean_pitch = np.mean(valid_f0)
        median_pitch = np.median(valid_f0)
        std_pitch = np.std(valid_f0)
        pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)

        # 4. Energy/volume dynamics
        rms_energy = librosa.feature.rms(y=audio_data)[0]
        mean_energy = np.mean(rms_energy)
        std_energy = np.std(rms_energy)
        energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)

        # 5. Speaking rate approximation (zero-crossing rate can help estimate this)
        zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
        mean_zcr = np.mean(zcr)

        # 6. Calculate pitch variability relative to the mean (coefficient of variation)
        # This gives a better measure than raw std dev
        pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0
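        # For example, a mean pitch of 150 Hz with a standard deviation of 15 Hz
        # gives pitch_cv = 10, i.e. a 10% coefficient of variation.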

        # 7. Tone classification logic using multiple features
        # Define tone characteristics based on combinations of features
        tone_class = ""
        tone_details = []

        # Pitch-based characteristics
        if pitch_cv < 5:
            tone_class = "Monotone"
            tone_details.append("Very little pitch variation - sounds flat and unexpressive")
        elif pitch_cv < 12:
            tone_class = "Steady"
            tone_details.append("Moderate pitch variation - sounds controlled and measured")
        elif pitch_cv < 20:
            tone_class = "Expressive"
            tone_details.append("Good pitch variation - sounds naturally engaging")
        else:
            tone_class = "Highly Dynamic"
            tone_details.append("Strong pitch variation - sounds animated and emphatic")

        # Pitch range classification
        if mean_pitch > 180:
            tone_details.append("Higher pitched voice - may convey excitement or tension")
        elif mean_pitch < 120:
            tone_details.append("Lower pitched voice - may convey calmness or authority")
        else:
            tone_details.append("Mid-range pitch - typically perceived as balanced")

        # Energy/volume characteristics
        energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
        if energy_cv < 10:
            tone_details.append("Consistent volume - sounds controlled and measured")
        elif energy_cv > 30:
            tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")

        # Speech rate approximation
        if mean_zcr > 0.1:
            tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
        elif mean_zcr < 0.05:
            tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")

        # Generate tone summary and interpretation
        tone_analysis = "### Voice Tone Analysis\n\n"
        tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
        tone_analysis += "**Tone characteristics:**\n"
        for detail in tone_details:
            tone_analysis += f"- {detail}\n"

        tone_analysis += "\n**Interpretation:**\n"

        # Generate interpretation based on the classified tone
        if tone_class == "Monotone":
            tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
                              "Consider adding more vocal variety to sound more engaging and authentic.")
        elif tone_class == "Steady":
            tone_analysis += ("Your steady tone suggests reliability and control. "
                              "This can be effective in professional settings or when conveying serious information.")
        elif tone_class == "Expressive":
            tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
                              "This naturally engaging quality helps convey authenticity and conviction.")
        else:  # Highly Dynamic
            tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
                              "This can be powerful for storytelling and persuasion, though in some contexts "
                              "a more measured approach might be appropriate.")

        return tone_analysis

    except Exception as e:
        print(f"Error in tone analysis: {e}")
        return "Tone analysis unavailable due to an error processing the audio."

def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
    """
    Analyze speech emotions in short chunks,
    building a timeline of confidence for each emotion.
    Returns a Plotly figure, summary text, detailed results.
    """
    if not load_emotion_model():
        return None, "Failed to load emotion classifier.", None

    # Use existing WAV if possible, else convert
    if audio_file.endswith(".wav"):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
        if not audio_path:
            return None, "Could not process audio file", None

    try:
        # Load with librosa
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        # Use shorter chunks for more granular analysis
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions = []
        time_points = []

        # For each chunk, run emotion classification
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Skip very short chunks
            if len(chunk) < 0.5 * sample_rate:
                continue

            # Write chunk to temp WAV
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                chunk_path = temp_chunk.name
                scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))

            # Classify - extract top-n predictions for each chunk
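            # (If the checkpoint defines fewer than 7 labels, the pipeline simply returns
            # scores for all labels it has; top_k is capped at the number of classes.)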
            raw_results = audio_emotion_classifier(chunk_path, top_k=7)  # Get all 7 emotions
            os.unlink(chunk_path)

            all_emotions.append(raw_results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # Skip if no valid emotions detected
        if not all_emotions:
            return None, "No speech detected in the audio.", None

        # Build Plotly chart with improved styling
        fig = build_plotly_line_chart(all_emotions, time_points, duration)

        # Build summary and detailed results
        summary_text = generate_emotion_summary(all_emotions)
        detailed_results = build_detailed_results(all_emotions, time_points)

        return fig, summary_text, detailed_results

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error analyzing audio: {str(e)}", None

def smooth_data(data, window_size=3):
    """Apply a moving average smoothing to the data"""
    smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
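    # 'valid' mode shortens the output: e.g. a length-10 input with window_size=3
    # produces 8 smoothed points, so 2 points are padded back in below.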

    # Add back points that were lost in the convolution
    padding = len(data) - len(smoothed)
    if padding > 0:
        # Add padding at the beginning
        padding_front = padding // 2
        padding_back = padding - padding_front

        # Use the first/last values for padding
        front_padding = [smoothed[0]] * padding_front
        back_padding = [smoothed[-1]] * padding_back

        smoothed = np.concatenate([front_padding, smoothed, back_padding])

    return smoothed

def build_plotly_line_chart(all_emotions, time_points, duration):
    """
    Create an improved Plotly line chart with toggles for each emotion.
    Shows all emotions for each time point rather than just the top one.
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Custom color scheme for emotions
    colors = {
        "angry": "#E53935",    # Red
        "disgust": "#8E24AA",  # Purple
        "fear": "#7B1FA2",     # Deep Purple
        "happy": "#FFC107",    # Amber/Yellow
        "neutral": "#78909C",  # Blue Grey
        "sad": "#1E88E5",      # Blue
        "surprise": "#43A047"  # Green
    }

    # Prepare data structure for all emotions
    emotion_data = {label: [] for label in emotion_labels}
    timeline_times = [(start + end) / 2 for start, end in time_points]

    # Process emotion scores - ensure all emotions have values
    for chunk_emotions in all_emotions:
        # Create a mapping of label to score for this chunk
        scores = {item["label"]: item["score"] for item in chunk_emotions}

        # Ensure all emotion labels have a value (default to 0.0)
        for label in emotion_labels:
            emotion_data[label].append(scores.get(label, 0.0))

    # Smooth the data
    for label in emotion_labels:
        if len(emotion_data[label]) > 2:
            emotion_data[label] = smooth_data(emotion_data[label])

    # Build the chart
    fig = go.Figure()

    # Add traces for each emotion
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='lines',
                name=label.capitalize(),
                line=dict(
                    color=colors.get(label, None),
                    width=3,
                    shape='spline',  # Curved lines
                    smoothing=1.3
                ),
                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
            )
        )

    # Add markers for dominant emotion at each point
    dominant_markers_x = []
    dominant_markers_y = []
    dominant_markers_text = []
    dominant_markers_color = []

    for i, t in enumerate(timeline_times):
        scores = {label: emotion_data[label][i] for label in emotion_labels}
        dominant = max(scores.items(), key=lambda x: x[1])

        dominant_markers_x.append(t)
        dominant_markers_y.append(dominant[1])
        dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
        dominant_markers_color.append(colors.get(dominant[0], "#000000"))

    fig.add_trace(
        go.Scatter(
            x=dominant_markers_x,
            y=dominant_markers_y,
            mode='markers',
            marker=dict(
                size=10,
                color=dominant_markers_color,
                line=dict(width=2, color='white')
            ),
            name="Dominant Emotion",
            text=dominant_markers_text,
            hoverinfo="text",
            hovertemplate='%{text}<extra></extra>'
        )
    )

    # Add area chart for better visualization
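    # The fillcolor expression below converts each hex colour into an
    # 'rgba(r, g, b, 0.1)' string, so every area is a translucent shade of its line.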
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='none',
                name=f"{label.capitalize()} Area",
                fill='tozeroy',
                fillcolor=f"rgba{tuple(list(int(colors.get(label, '#000000').lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + [0.1])}",
                showlegend=False,
                hoverinfo='skip'
            )
        )

    # Improve layout
    fig.update_layout(
        title={
            'text': "Voice Emotion Analysis Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Confidence Score",
        yaxis=dict(
            range=[0, 1.0],
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        xaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        plot_bgcolor='white',
        legend=dict(
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode='closest',
        height=500,  # Larger size for better viewing
        margin=dict(l=10, r=10, t=80, b=50)
    )

    return fig

def generate_alternative_chart(all_emotions, time_points):
    """
    Create a stacked area chart to better visualize emotion changes over time
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Custom color scheme for emotions - more visible/distinct
    colors = {
        "angry": "#F44336",    # Red
        "disgust": "#9C27B0",  # Purple
        "fear": "#673AB7",     # Deep Purple
        "happy": "#FFC107",    # Amber
        "neutral": "#607D8B",  # Blue Grey
        "sad": "#2196F3",      # Blue
        "surprise": "#4CAF50"  # Green
    }

    # Prepare timeline points
    timeline_times = [(start + end) / 2 for start, end in time_points]

    # Prepare data structure for all emotions
    emotion_data = {label: [] for label in emotion_labels}

    # Process emotion scores - ensure all emotions have values
    for chunk_emotions in all_emotions:
        # Create a mapping of label to score for this chunk
        scores = {item["label"]: item["score"] for item in chunk_emotions}

        # Ensure all emotion labels have a value (default to 0.0)
        for label in emotion_labels:
            emotion_data[label].append(scores.get(label, 0.0))

    # Create the stacked area chart
    fig = go.Figure()

    # Add each emotion as a separate trace
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='lines',
                name=label.capitalize(),
                line=dict(width=0.5, color=colors.get(label, None)),
                stackgroup='one',  # This makes it a stacked area chart
                fillcolor=colors.get(label, None),
                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
            )
        )

    # Improve layout
    fig.update_layout(
        title={
            'text': "Voice Emotion Distribution Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Emotion Intensity",
        yaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        xaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        plot_bgcolor='white',
        legend=dict(
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode='closest',
        height=500,
        margin=dict(l=10, r=10, t=80, b=50)
    )

    return fig

def generate_emotion_summary(all_emotions):
    """
    Produce an improved textual summary of the overall emotion distribution.
    """
    if not all_emotions:
        return "No emotional content detected."

    emotion_counts = {}
    emotion_confidence = {}
    total_chunks = len(all_emotions)

    for chunk_emotions in all_emotions:
        top_emotion = max(chunk_emotions, key=lambda x: x['score'])
        label = top_emotion["label"]
        confidence = top_emotion["score"]

        emotion_counts[label] = emotion_counts.get(label, 0) + 1
        emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence

    # Calculate average confidence for each emotion
    for emotion in emotion_confidence:
        if emotion_counts[emotion] > 0:
            emotion_confidence[emotion] /= emotion_counts[emotion]

    # Dominant emotion (highest percentage)
    dominant_emotion = max(emotion_counts, key=emotion_counts.get)
    dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100

    # Most confident emotion (might differ from dominant)
    most_confident = max(emotion_confidence, key=emotion_confidence.get)
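    # "Dominant" is the emotion that wins the most chunks; "most confident" is the one
    # with the highest average score when it wins, so the two can legitimately differ.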

    # Tone grouping analysis
    tone_group_counts = {group: 0 for group in TONE_MAPPING}
    for emotion, count in emotion_counts.items():
        for tone_group, emotions in TONE_MAPPING.items():
            if emotion in emotions:
                tone_group_counts[tone_group] += count

    dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
    dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100

    # Build summary with markdown formatting
    summary = "### Voice Emotion Analysis Summary\n\n"
    summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"

    if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
        summary += f"**Most confident detection:** {most_confident.capitalize()} "
        summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"

    summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"

    # Show emotion distribution as sorted list
    summary += "**Emotion distribution:**\n"
    for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_chunks) * 100
        avg_conf = emotion_confidence[emotion]
        summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"

    # Add interpretation based on dominant emotion
    summary += "\n**Interpretation:**\n"

    if dominant_emotion == "happy":
        summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
    elif dominant_emotion == "neutral":
        summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
    elif dominant_emotion == "sad":
        summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
    elif dominant_emotion == "angry":
        summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
    elif dominant_emotion == "fear":
        summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
    elif dominant_emotion == "disgust":
        summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
    elif dominant_emotion == "surprise":
        summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."

    return summary

def build_detailed_results(all_emotions, time_points):
    """
    Return a list of dictionaries containing chunk start-end, top emotion, confidence, description.
    Suitable for Gradio DataFrame display.
    """
    results_list = []
    for (emotions, (start_time, end_time)) in zip(all_emotions, time_points):
        top_emotion = max(emotions, key=lambda x: x['score'])
        label = top_emotion["label"]

        # Find second highest emotion if available
        if len(emotions) > 1:
            sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
            second_emotion = sorted_emotions[1]["label"].capitalize()
            second_score = sorted_emotions[1]["score"]
            secondary = f" ({second_emotion}: {second_score:.2f})"
        else:
            secondary = ""

        results_list.append({
            "Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
            "Primary Emotion": label.capitalize(),
            "Confidence": f"{top_emotion['score']:.2f}{secondary}",
            "Description": EMOTION_DESCRIPTIONS.get(label, "")
        })
    return results_list

def process_audio(audio_file, progress=gr.Progress()):
    """
    Main handler for Gradio:
    1) Emotion analysis (returns Plotly figure).
    2) Tone analysis (returns descriptive text).
    """
    if not audio_file:
        return None, None, "No audio file provided.", None, "No tone analysis."

    # 1) Analyze emotions
    fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
    if fig is None:  # Error or missing
        return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."

    # 2) Generate alternative chart
    # Extract the necessary data from detailed_results to create time_points
    time_points = []
    for result in detailed_results:
        time_range = result["Time Range"]
        start_time = float(time_range.split("s")[0])
        end_time = float(time_range.split(" - ")[1].split("s")[0])
        time_points.append((start_time, end_time))

    # Extract emotion data from detailed_results
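    # Note: the per-chunk scores are reconstructed here from the formatted strings in
    # detailed_results, so only the top one or two emotions per chunk are recovered and
    # the remaining labels default to 0.0; the area chart is therefore an approximation.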
    all_emotions = []
    for result in detailed_results:
        # Parse the primary emotion and confidence
        primary_emotion = result["Primary Emotion"].lower()
        confidence_str = result["Confidence"].split("(")[0].strip()
        primary_confidence = float(confidence_str)

        # Create a list of emotion dictionaries for this time point
        emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]

        # Check if there's a secondary emotion
        if "(" in result["Confidence"]:
            secondary_part = result["Confidence"].split("(")[1].split(")")[0]
            secondary_emotion = secondary_part.split(":")[0].strip().lower()
            secondary_confidence = float(secondary_part.split(":")[1].strip())
            emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})

        # Add remaining emotions with zero confidence
        for emotion in EMOTION_DESCRIPTIONS.keys():
            if emotion not in [e["label"] for e in emotions_at_time]:
                emotions_at_time.append({"label": emotion, "score": 0.0})

        all_emotions.append(emotions_at_time)

    # Now we can generate the alternative chart
    alt_fig = generate_alternative_chart(all_emotions, time_points)

    # 3) Analyze tone
    tone_analysis = analyze_voice_tone(audio_file)

    return fig, alt_fig, summary_text, detailed_results, tone_analysis

# Create Gradio interface with improved UI/UX
with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion & Tone Analysis System

    This app provides professional analysis of:
    - **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
    - **Tone characteristics** (based on pitch, energy, and speech patterns)

    The interactive timeline shows emotion confidence scores throughout your audio.
    """)

    with gr.Tabs():
        # Tab 1: Upload
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"],
                        elem_id="audio_upload"
                    )
                    process_btn = gr.Button("Analyze Voice", variant="primary")
                    gr.Markdown("""
                    **Supports:** MP3, WAV, M4A, and most audio formats
                    **For best results:** Use a clear voice recording with minimal background noise
                    """)
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                       elem_id="emotion_plot",
                                                       container=True)
                        with gr.TabItem("Area Chart"):
                            emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                         elem_id="emotion_area_plot",
                                                         container=True)
            with gr.Row():
                with gr.Column():
                    emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    tone_analysis_output = gr.Markdown(label="Tone Analysis")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )

            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
            )
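            # The outputs list above must stay in the same order as the 5-tuple returned
            # by process_audio (line chart, area chart, summary, results table, tone text).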

        # Tab 2: Record
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(
                        label="Record Your Voice",
                        sources=["microphone"],
                        type="filepath",
                        elem_id="record_audio"
                    )
                    analyze_btn = gr.Button("Analyze Recording", variant="primary")
                    gr.Markdown("""
                    **Tips:**
                    - Speak clearly and at a normal pace
                    - Record at least 10-15 seconds for more accurate analysis
                    - Try different emotional tones to see how they're detected
                    """)
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                           elem_id="record_emotion_plot",
                                                           container=True)
                        with gr.TabItem("Area Chart"):
                            rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                             elem_id="record_emotion_area_plot",
                                                             container=True)
            with gr.Row():
                with gr.Column():
                    rec_emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )

            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
            )

        # Tab 3: About & Help
        with gr.TabItem("About & Help"):
            gr.Markdown("""
            ## About This System

            This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on the HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.

            ### How It Works

            1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
            2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
            3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.

            ### Emotion Categories

            The system detects seven standard emotions:

            - **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
            - **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
            - **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
            - **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
            - **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
            - **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
            - **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.

            ### Tips for Best Results

            - Use clear audio with minimal background noise
            - Speak naturally at a comfortable volume
            - Record at least 10-15 seconds of speech
            - For tone analysis, longer recordings (30+ seconds) provide more accurate results

            ### Privacy Notice

            All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
            """)

            gr.Markdown("""
            ---
            ### System Information

            - **Model**: HuBERT Large for Speech Emotion Recognition
            - **Version**: 1.2.0
            - **Libraries**: PyTorch, Transformers, Librosa, Plotly

            This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
            """)

# Check if model can load before launching interface
print("Checking model availability...")
load_success = load_emotion_model()
if not load_success:
    print("Warning: Emotion model failed to load. Application may have limited functionality.")

# Launch the demo
if __name__ == "__main__":
    demo.launch()