Natwar committed on
Commit e89b55b · verified · 1 Parent(s): 343474c

Update app.py

Files changed (1): app.py +286 -665
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  import subprocess
3
  import sys
@@ -18,7 +20,7 @@ def install_package(package, version=None):
18
  print(f"Failed to install {package_spec}: {e}")
19
  raise
20
 
21
- # Required packages (add version pins if needed)
22
  required_packages = {
23
  "gradio": None,
24
  "torch": None,
@@ -27,8 +29,7 @@ required_packages = {
27
  "librosa": None,
28
  "scipy": None,
29
  "matplotlib": None,
30
- "pydub": None,
31
- "plotly": None
32
  }
33
 
34
  installed_packages = {pkg.key for pkg in pkg_resources.working_set}
@@ -36,18 +37,20 @@ for package, version in required_packages.items():
36
  if package not in installed_packages:
37
  install_package(package, version)
38
 
39
- # Now import necessary packages
40
  import gradio as gr
41
  import torch
42
  import torchaudio
43
  import librosa
44
- import matplotlib
45
- matplotlib.use('Agg') # non-interactive backend for any fallback
46
  from pydub import AudioSegment
47
  import scipy
48
  import io
49
  from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
50
- import plotly.graph_objects as go
 
 
51
 
52
  # Define emotion labels, tone mapping, and descriptions
53
  EMOTION_DESCRIPTIONS = {
@@ -60,22 +63,35 @@ EMOTION_DESCRIPTIONS = {
60
  "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
61
  }
62
 
63
- # If you wish to group emotions by tone, you can do so here:
64
  TONE_MAPPING = {
65
  "positive": ["happy", "surprise"],
66
  "neutral": ["neutral"],
67
  "negative": ["angry", "sad", "fear", "disgust"]
68
  }
69
 
70
  # Global variable for the emotion classifier
71
  audio_emotion_classifier = None
72
 
73
  def load_emotion_model():
74
- """Load and cache the speech emotion classification model."""
75
  global audio_emotion_classifier
76
  if audio_emotion_classifier is None:
77
  try:
78
  print("Loading emotion classification model...")
 
79
  model_name = "superb/hubert-large-superb-er"
80
  audio_emotion_classifier = pipeline("audio-classification", model=model_name)
81
  print("Emotion classification model loaded successfully")
@@ -86,7 +102,7 @@ def load_emotion_model():
86
  return True
87
 
88
  def convert_audio_to_wav(audio_file):
89
- """Convert uploaded audio to WAV format."""
90
  try:
91
  audio = AudioSegment.from_file(audio_file)
92
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
@@ -97,749 +113,354 @@ def convert_audio_to_wav(audio_file):
97
  print(f"Error converting audio: {e}")
98
  return None
99
 
100
- def analyze_voice_tone(audio_file):
101
- """
102
- Analyze the tone characteristics of the voice using more robust measurements.
103
- Includes pitch variation, energy dynamics, and spectral features.
104
- """
105
- try:
106
- audio_data, sample_rate = librosa.load(audio_file, sr=16000)
107
-
108
- # 1. Basic audio features
109
- audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
110
- if audio_duration < 1.0: # Too short for reliable analysis
111
- return "Audio too short for reliable tone analysis. Please provide at least 3 seconds."
112
-
113
- # 2. Pitch analysis with more robust handling
114
- f0, voiced_flag, voiced_prob = librosa.pyin(
115
- audio_data,
116
- fmin=librosa.note_to_hz('C2'),
117
- fmax=librosa.note_to_hz('C7'),
118
- sr=sample_rate
119
- )
120
-
121
- # Filter out NaN values and get valid pitch points
122
- valid_f0 = f0[~np.isnan(f0)]
123
-
124
- # If no pitch detected, may be noise or silence
125
- if len(valid_f0) < 10:
126
- return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."
127
-
128
- # 3. Calculate improved statistics
129
- mean_pitch = np.mean(valid_f0)
130
- median_pitch = np.median(valid_f0)
131
- std_pitch = np.std(valid_f0)
132
- pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)
133
-
134
- # 4. Energy/volume dynamics
135
- rms_energy = librosa.feature.rms(y=audio_data)[0]
136
- mean_energy = np.mean(rms_energy)
137
- std_energy = np.std(rms_energy)
138
- energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)
139
-
140
- # 5. Speaking rate approximation (zero-crossing rate can help estimate this)
141
- zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
142
- mean_zcr = np.mean(zcr)
143
-
144
- # 6. Calculate pitch variability relative to the mean (coefficient of variation)
145
- # This gives a better measure than raw std dev
146
- pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0
147
-
148
- # 7. Tone classification logic using multiple features
149
- # Define tone characteristics based on combinations of features
150
- tone_class = ""
151
- tone_details = []
152
-
153
- # Pitch-based characteristics
154
- if pitch_cv < 5:
155
- tone_class = "Monotone"
156
- tone_details.append("Very little pitch variation - sounds flat and unexpressive")
157
- elif pitch_cv < 12:
158
- tone_class = "Steady"
159
- tone_details.append("Moderate pitch variation - sounds controlled and measured")
160
- elif pitch_cv < 20:
161
- tone_class = "Expressive"
162
- tone_details.append("Good pitch variation - sounds naturally engaging")
163
- else:
164
- tone_class = "Highly Dynamic"
165
- tone_details.append("Strong pitch variation - sounds animated and emphatic")
166
-
167
- # Pitch range classification
168
- if mean_pitch > 180:
169
- tone_details.append("Higher pitched voice - may convey excitement or tension")
170
- elif mean_pitch < 120:
171
- tone_details.append("Lower pitched voice - may convey calmness or authority")
172
- else:
173
- tone_details.append("Mid-range pitch - typically perceived as balanced")
174
-
175
- # Energy/volume characteristics
176
- energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
177
- if energy_cv < 10:
178
- tone_details.append("Consistent volume - sounds controlled and measured")
179
- elif energy_cv > 30:
180
- tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")
181
-
182
- # Speech rate approximation
183
- if mean_zcr > 0.1:
184
- tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
185
- elif mean_zcr < 0.05:
186
- tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")
187
-
188
- # Generate tone summary and interpretation
189
- tone_analysis = f"### Voice Tone Analysis\n\n"
190
- tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
191
- tone_analysis += "**Tone characteristics:**\n"
192
- for detail in tone_details:
193
- tone_analysis += f"- {detail}\n"
194
-
195
- tone_analysis += "\n**Interpretation:**\n"
196
-
197
- # Generate interpretation based on the classified tone
198
- if tone_class == "Monotone":
199
- tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
200
- "Consider adding more vocal variety to sound more engaging and authentic.")
201
- elif tone_class == "Steady":
202
- tone_analysis += ("Your steady tone suggests reliability and control. "
203
- "This can be effective in professional settings or when conveying serious information.")
204
- elif tone_class == "Expressive":
205
- tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
206
- "This naturally engaging quality helps convey authenticity and conviction.")
207
- else: # Highly Dynamic
208
- tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
209
- "This can be powerful for storytelling and persuasion, though in some contexts "
210
- "a more measured approach might be appropriate.")
211
-
212
- return tone_analysis
213
-
214
- except Exception as e:
215
- print(f"Error in tone analysis: {e}")
216
- return "Tone analysis unavailable due to an error processing the audio."
217
-
218
- def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
219
  """
220
- Analyze speech emotions in short chunks,
221
- building a timeline of confidence for each emotion.
222
- Returns a Plotly figure, summary text, detailed results.
223
  """
224
  if not load_emotion_model():
225
- return None, "Failed to load emotion classifier.", None
226
-
227
- # Use existing WAV if possible, else convert
228
- if audio_file.endswith(".wav"):
229
  audio_path = audio_file
230
  else:
231
  audio_path = convert_audio_to_wav(audio_file)
232
  if not audio_path:
233
- return None, "Could not process audio file", None
234
-
235
  try:
236
- # Load with librosa
237
  audio_data, sample_rate = librosa.load(audio_path, sr=16000)
238
  duration = len(audio_data) / sample_rate
239
-
240
- # Use shorter chunks for more granular analysis
241
  chunk_samples = int(chunk_duration * sample_rate)
242
  num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
243
-
244
  all_emotions = []
245
  time_points = []
246
-
247
- # For each chunk, run emotion classification
248
  for i in range(num_chunks):
249
  progress((i + 1) / num_chunks, "Analyzing audio emotions...")
250
  start_idx = i * chunk_samples
251
  end_idx = min(start_idx + chunk_samples, len(audio_data))
252
  chunk = audio_data[start_idx:end_idx]
253
-
254
- # Skip very short chunks
255
  if len(chunk) < 0.5 * sample_rate:
256
  continue
257
-
258
- # Write chunk to temp WAV
259
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
260
  chunk_path = temp_chunk.name
261
  scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
262
-
263
- # Classify - extract top-n predictions for each chunk
264
- raw_results = audio_emotion_classifier(chunk_path, top_k=7) # Get all 7 emotions
265
- os.unlink(chunk_path)
266
-
267
- all_emotions.append(raw_results)
268
  time_points.append((start_idx / sample_rate, end_idx / sample_rate))
269
-
270
- # Skip if no valid emotions detected
271
- if not all_emotions:
272
- return None, "No speech detected in the audio.", None
273
-
274
- # Build Plotly chart with improved styling
275
- fig = build_plotly_line_chart(all_emotions, time_points, duration)
276
-
277
- # Build summary and detailed results
278
- summary_text = generate_emotion_summary(all_emotions)
279
- detailed_results = build_detailed_results(all_emotions, time_points)
280
-
281
- return fig, summary_text, detailed_results
282
-
283
  except Exception as e:
 
284
  import traceback
285
  traceback.print_exc()
286
- return None, f"Error analyzing audio: {str(e)}", None
287
-
288
- def smooth_data(data, window_size=3):
289
- """Apply a moving average smoothing to the data"""
290
- smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
291
-
292
- # Add back points that were lost in the convolution
293
- padding = len(data) - len(smoothed)
294
- if padding > 0:
295
- # Add padding at the beginning
296
- padding_front = padding // 2
297
- padding_back = padding - padding_front
298
-
299
- # Use the first/last values for padding
300
- front_padding = [smoothed[0]] * padding_front
301
- back_padding = [smoothed[-1]] * padding_back
302
-
303
- smoothed = np.concatenate([front_padding, smoothed, back_padding])
304
-
305
- return smoothed
306
 
307
- def build_plotly_line_chart(all_emotions, time_points, duration):
308
  """
309
- Create an improved Plotly line chart with toggles for each emotion.
310
- Shows all emotions for each time point rather than just the top one.
311
  """
 
312
  emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
313
-
314
- # Custom color scheme for emotions
315
- colors = {
316
- "angry": "#E53935", # Red
317
- "disgust": "#8E24AA", # Purple
318
- "fear": "#7B1FA2", # Deep Purple
319
- "happy": "#FFC107", # Amber/Yellow
320
- "neutral": "#78909C", # Blue Grey
321
- "sad": "#1E88E5", # Blue
322
- "surprise": "#43A047" # Green
323
- }
324
-
325
- # Prepare data structure for all emotions
326
- emotion_data = {label: [] for label in emotion_labels}
327
- timeline_times = [(start + end) / 2 for start, end in time_points]
328
-
329
- # Process emotion scores - ensure all emotions have values
330
- for chunk_emotions in all_emotions:
331
- # Create a mapping of label to score for this chunk
332
- scores = {item["label"]: item["score"] for item in chunk_emotions}
333
-
334
- # Ensure all emotion labels have a value (default to 0.0)
335
- for label in emotion_labels:
336
- emotion_data[label].append(scores.get(label, 0.0))
337
-
338
- # Smooth the data
339
- for label in emotion_labels:
340
- if len(emotion_data[label]) > 2:
341
- emotion_data[label] = smooth_data(emotion_data[label])
342
-
343
- # Build the chart
344
- fig = go.Figure()
345
-
346
- # Add traces for each emotion
347
- for label in emotion_labels:
348
- fig.add_trace(
349
- go.Scatter(
350
- x=timeline_times,
351
- y=emotion_data[label],
352
- mode='lines',
353
- name=label.capitalize(),
354
- line=dict(
355
- color=colors.get(label, None),
356
- width=3,
357
- shape='spline', # Curved lines
358
- smoothing=1.3
359
- ),
360
- hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
361
- )
362
- )
363
-
364
- # Add markers for dominant emotion at each point
365
- dominant_markers_x = []
366
- dominant_markers_y = []
367
- dominant_markers_text = []
368
- dominant_markers_color = []
369
-
370
- for i, time in enumerate(timeline_times):
371
- scores = {label: emotion_data[label][i] for label in emotion_labels}
372
- dominant = max(scores.items(), key=lambda x: x[1])
373
-
374
- dominant_markers_x.append(time)
375
- dominant_markers_y.append(dominant[1])
376
- dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
377
- dominant_markers_color.append(colors.get(dominant[0], "#000000"))
378
-
379
- fig.add_trace(
380
- go.Scatter(
381
- x=dominant_markers_x,
382
- y=dominant_markers_y,
383
- mode='markers',
384
- marker=dict(
385
- size=10,
386
- color=dominant_markers_color,
387
- line=dict(width=2, color='white')
388
- ),
389
- name="Dominant Emotion",
390
- text=dominant_markers_text,
391
- hoverinfo="text",
392
- hovertemplate='%{text}<extra></extra>'
393
- )
394
- )
395
-
396
- # Add area chart for better visualization
397
- for label in emotion_labels:
398
- fig.add_trace(
399
- go.Scatter(
400
- x=timeline_times,
401
- y=emotion_data[label],
402
- mode='none',
403
- name=f"{label.capitalize()} Area",
404
- fill='tozeroy',
405
- fillcolor=f"rgba{tuple(list(int(colors.get(label, '#000000').lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + [0.1])}",
406
- showlegend=False,
407
- hoverinfo='skip'
408
- )
409
- )
410
-
411
- # Improve layout
412
- fig.update_layout(
413
- title={
414
- 'text': "Voice Emotion Analysis Over Time",
415
- 'font': {'size': 22, 'family': 'Arial, sans-serif'}
416
- },
417
- xaxis_title="Time (seconds)",
418
- yaxis_title="Confidence Score",
419
- yaxis=dict(
420
- range=[0, 1.0],
421
- showgrid=True,
422
- gridcolor='rgba(230, 230, 230, 0.8)'
423
- ),
424
- xaxis=dict(
425
- showgrid=True,
426
- gridcolor='rgba(230, 230, 230, 0.8)'
427
- ),
428
- plot_bgcolor='white',
429
- legend=dict(
430
- bordercolor='rgba(0,0,0,0.1)',
431
- borderwidth=1,
432
- orientation="h",
433
- yanchor="bottom",
434
- y=1.02,
435
- xanchor="right",
436
- x=1
437
- ),
438
- hovermode='closest',
439
- height=500, # Larger size for better viewing
440
- margin=dict(l=10, r=10, t=80, b=50)
441
- )
442
-
443
- return fig
444
-
445
- def generate_alternative_chart(all_emotions, time_points):
446
- """
447
- Create a stacked area chart to better visualize emotion changes over time
448
- """
449
- emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
450
-
451
- # Custom color scheme for emotions - more visible/distinct
452
- colors = {
453
- "angry": "#F44336", # Red
454
- "disgust": "#9C27B0", # Purple
455
- "fear": "#673AB7", # Deep Purple
456
- "happy": "#FFC107", # Amber
457
- "neutral": "#607D8B", # Blue Grey
458
- "sad": "#2196F3", # Blue
459
- "surprise": "#4CAF50" # Green
460
  }
461
-
462
- # Prepare timeline points
463
- timeline_times = [(start + end) / 2 for start, end in time_points]
464
-
465
- # Prepare data structure for all emotions
466
- emotion_data = {label: [] for label in emotion_labels}
467
-
468
- # Process emotion scores - ensure all emotions have values
469
- for chunk_emotions in all_emotions:
470
- # Create a mapping of label to score for this chunk
471
- scores = {item["label"]: item["score"] for item in chunk_emotions}
472
-
473
- # Ensure all emotion labels have a value (default to 0.0)
474
- for label in emotion_labels:
475
- emotion_data[label].append(scores.get(label, 0.0))
476
-
477
- # Create the stacked area chart
478
- fig = go.Figure()
479
-
480
- # Add each emotion as a separate trace
481
  for label in emotion_labels:
482
- fig.add_trace(
483
- go.Scatter(
484
- x=timeline_times,
485
- y=emotion_data[label],
486
- mode='lines',
487
- name=label.capitalize(),
488
- line=dict(width=0.5, color=colors.get(label, None)),
489
- stackgroup='one', # This makes it a stacked area chart
490
- fillcolor=colors.get(label, None),
491
- hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
492
- )
493
- )
494
-
495
- # Improve layout
496
- fig.update_layout(
497
- title={
498
- 'text': "Voice Emotion Distribution Over Time",
499
- 'font': {'size': 22, 'family': 'Arial, sans-serif'}
500
- },
501
- xaxis_title="Time (seconds)",
502
- yaxis_title="Emotion Intensity",
503
- yaxis=dict(
504
- showgrid=True,
505
- gridcolor='rgba(230, 230, 230, 0.8)'
506
- ),
507
- xaxis=dict(
508
- showgrid=True,
509
- gridcolor='rgba(230, 230, 230, 0.8)'
510
- ),
511
- plot_bgcolor='white',
512
- legend=dict(
513
- bordercolor='rgba(0,0,0,0.1)',
514
- borderwidth=1,
515
- orientation="h",
516
- yanchor="bottom",
517
- y=1.02,
518
- xanchor="right",
519
- x=1
520
- ),
521
- hovermode='closest',
522
- height=500,
523
- margin=dict(l=10, r=10, t=80, b=50)
524
- )
525
-
526
- return fig
527
 
528
- def generate_emotion_summary(all_emotions):
529
  """
530
- Produce an improved textual summary of the overall emotion distribution.
 
531
  """
532
  if not all_emotions:
533
  return "No emotional content detected."
534
-
535
  emotion_counts = {}
536
- emotion_confidence = {}
537
  total_chunks = len(all_emotions)
538
-
539
- for chunk_emotions in all_emotions:
540
- top_emotion = max(chunk_emotions, key=lambda x: x['score'])
541
- label = top_emotion["label"]
542
- confidence = top_emotion["score"]
543
-
544
- emotion_counts[label] = emotion_counts.get(label, 0) + 1
545
- emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence
546
-
547
- # Calculate average confidence for each emotion
548
- for emotion in emotion_confidence:
549
- if emotion_counts[emotion] > 0:
550
- emotion_confidence[emotion] /= emotion_counts[emotion]
551
-
552
- # Dominant emotion (highest percentage)
553
- dominant_emotion = max(emotion_counts, key=emotion_counts.get)
554
- dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100
555
-
556
- # Most confident emotion (might differ from dominant)
557
- most_confident = max(emotion_confidence, key=emotion_confidence.get)
558
-
559
- # Tone grouping analysis
560
- tone_group_counts = {group: 0 for group in TONE_MAPPING}
561
- for emotion, count in emotion_counts.items():
562
- for tone_group, emotions in TONE_MAPPING.items():
563
- if emotion in emotions:
564
- tone_group_counts[tone_group] += count
565
-
566
- dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
567
- dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100
568
-
569
- # Build summary with markdown formatting
570
  summary = f"### Voice Emotion Analysis Summary\n\n"
571
- summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"
572
-
573
- if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
574
- summary += f"**Most confident detection:** {most_confident.capitalize()} "
575
- summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"
576
-
577
- summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
578
  summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"
579
-
580
- # Show emotion distribution as sorted list
581
  summary += "**Emotion distribution:**\n"
582
- for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
583
- percentage = (count / total_chunks) * 100
584
- avg_conf = emotion_confidence[emotion]
585
- summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"
586
-
587
- # Add interpretation based on dominant emotion
588
- summary += f"\n**Interpretation:**\n"
589
-
590
- if dominant_emotion == "happy":
591
- summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
592
- elif dominant_emotion == "neutral":
593
- summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
594
- elif dominant_emotion == "sad":
595
- summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
596
- elif dominant_emotion == "angry":
597
- summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
598
- elif dominant_emotion == "fear":
599
- summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
600
- elif dominant_emotion == "disgust":
601
- summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
602
- elif dominant_emotion == "surprise":
603
- summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."
604
-
605
  return summary
606
 
607
- def build_detailed_results(all_emotions, time_points):
608
- """
609
- Return a list of dictionaries containing chunk start-end, top emotion, confidence, description.
610
- Suitable for Gradio DataFrame display.
611
- """
612
- results_list = []
613
- for (emotions, (start_time, end_time)) in zip(all_emotions, time_points):
614
- top_emotion = max(emotions, key=lambda x: x['score'])
615
- label = top_emotion["label"]
616
-
617
- # Find second highest emotion if available
618
- if len(emotions) > 1:
619
- sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
620
- second_emotion = sorted_emotions[1]["label"].capitalize()
621
- second_score = sorted_emotions[1]["score"]
622
- secondary = f" ({second_emotion}: {second_score:.2f})"
623
- else:
624
- secondary = ""
625
-
626
- results_list.append({
627
- "Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
628
- "Primary Emotion": label.capitalize(),
629
- "Confidence": f"{top_emotion['score']:.2f}{secondary}",
630
- "Description": EMOTION_DESCRIPTIONS.get(label, "")
631
- })
632
- return results_list
633
 
634
  def process_audio(audio_file, progress=gr.Progress()):
635
- """
636
- Main handler for Gradio:
637
- 1) Emotion analysis (returns Plotly figure).
638
- 2) Tone analysis (returns descriptive text).
639
- """
640
- if not audio_file:
641
- return None, None, "No audio file provided.", None, "No tone analysis."
642
-
643
- # 1) Analyze emotions
644
- fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
645
- if not fig: # Error or missing
646
- return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."
647
-
648
- # 2) Generate alternative chart
649
- # Extract the necessary data from detailed_results to create time_points
650
- time_points = []
651
- for result in detailed_results:
652
- time_range = result["Time Range"]
653
- start_time = float(time_range.split("s")[0])
654
- end_time = float(time_range.split(" - ")[1].split("s")[0])
655
- time_points.append((start_time, end_time))
656
-
657
- # Extract emotion data from detailed_results
658
- all_emotions = []
659
- for result in detailed_results:
660
- # Parse the primary emotion and confidence
661
- primary_emotion = result["Primary Emotion"].lower()
662
- confidence_str = result["Confidence"].split("(")[0].strip()
663
- primary_confidence = float(confidence_str)
664
-
665
- # Create a list of emotion dictionaries for this time point
666
- emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]
667
-
668
- # Check if there's a secondary emotion
669
- if "(" in result["Confidence"]:
670
- secondary_part = result["Confidence"].split("(")[1].split(")")[0]
671
- secondary_emotion = secondary_part.split(":")[0].strip().lower()
672
- secondary_confidence = float(secondary_part.split(":")[1].strip())
673
- emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})
674
-
675
- # Add remaining emotions with zero confidence
676
- for emotion in EMOTION_DESCRIPTIONS.keys():
677
- if emotion not in [e["label"] for e in emotions_at_time]:
678
- emotions_at_time.append({"label": emotion, "score": 0.0})
679
-
680
- all_emotions.append(emotions_at_time)
681
-
682
- # Now we can generate the alternative chart
683
- alt_fig = generate_alternative_chart(all_emotions, time_points)
684
-
685
- # 3) Analyze tone
686
- tone_analysis = analyze_voice_tone(audio_file)
687
-
688
- return fig, alt_fig, summary_text, detailed_results, tone_analysis
689
-
690
- # Create Gradio interface with improved UI/UX
691
- with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
692
  gr.Markdown("""
693
- # 🎙️ Voice Emotion & Tone Analysis System
694
-
695
- This app provides professional analysis of:
696
- - **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
697
- - **Tone characteristics** (based on pitch, energy, and speech patterns)
698
-
699
- The interactive timeline shows emotion confidence scores throughout your audio.
700
  """)
701
-
702
  with gr.Tabs():
703
- # Tab 1: Upload
704
  with gr.TabItem("Upload Audio"):
705
  with gr.Row():
706
  with gr.Column(scale=1):
707
  audio_input = gr.Audio(
708
  label="Upload Audio File",
709
  type="filepath",
710
- sources=["upload"],
711
- elem_id="audio_upload"
712
  )
713
- process_btn = gr.Button("Analyze Voice", variant="primary")
714
- gr.Markdown("""
715
- **Supports:** MP3, WAV, M4A, and most audio formats
716
- **For best results:** Use a clear voice recording with minimal background noise
717
- """)
718
  with gr.Column(scale=2):
719
- with gr.Tabs():
720
- with gr.TabItem("Line Chart"):
721
- emotion_timeline = gr.Plot(label="Emotion Timeline",
722
- elem_id="emotion_plot",
723
- container=True)
724
- with gr.TabItem("Area Chart"):
725
- emotion_area_chart = gr.Plot(label="Emotion Distribution",
726
- elem_id="emotion_area_plot",
727
- container=True)
728
  with gr.Row():
729
- with gr.Column():
730
- emotion_summary = gr.Markdown(label="Emotion Summary")
731
- with gr.Column():
732
- tone_analysis_output = gr.Markdown(label="Tone Analysis")
733
  with gr.Row():
734
  emotion_results = gr.DataFrame(
735
- headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
736
  label="Detailed Emotion Analysis"
737
  )
738
-
739
  process_btn.click(
740
  fn=process_audio,
741
  inputs=[audio_input],
742
- outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
743
  )
744
-
745
- # Tab 2: Record
746
  with gr.TabItem("Record Voice"):
747
  with gr.Row():
748
  with gr.Column(scale=1):
749
  record_input = gr.Audio(
750
  label="Record Your Voice",
751
  sources=["microphone"],
752
- type="filepath",
753
- elem_id="record_audio"
754
  )
755
- analyze_btn = gr.Button("Analyze Recording", variant="primary")
756
- gr.Markdown("""
757
- **Tips:**
758
- - Speak clearly and at a normal pace
759
- - Record at least 10-15 seconds for more accurate analysis
760
- - Try different emotional tones to see how they're detected
761
- """)
762
  with gr.Column(scale=2):
763
- with gr.Tabs():
764
- with gr.TabItem("Line Chart"):
765
- rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
766
- elem_id="record_emotion_plot",
767
- container=True)
768
- with gr.TabItem("Area Chart"):
769
- rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
770
- elem_id="record_emotion_area_plot",
771
- container=True)
772
  with gr.Row():
773
- with gr.Column():
774
- rec_emotion_summary = gr.Markdown(label="Emotion Summary")
775
- with gr.Column():
776
- rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
777
  with gr.Row():
778
  rec_emotion_results = gr.DataFrame(
779
- headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
780
  label="Detailed Emotion Analysis"
781
  )
782
-
783
  analyze_btn.click(
784
  fn=process_audio,
785
  inputs=[record_input],
786
- outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
787
  )
788
-
789
- # Tab 3: About & Help
790
- with gr.TabItem("About & Help"):
791
- gr.Markdown("""
792
- ## About This System
793
-
794
- This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.
795
-
796
- ### How It Works
797
-
798
- 1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
799
- 2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
800
- 3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.
801
-
802
- ### Emotion Categories
803
-
804
- The system detects seven standard emotions:
805
-
806
- - **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
807
- - **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
808
- - **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
809
- - **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
810
- - **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
811
- - **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
812
- - **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.
813
-
814
- ### Tips for Best Results
815
-
816
- - Use clear audio with minimal background noise
817
- - Speak naturally at a comfortable volume
818
- - Record at least 10-15 seconds of speech
819
- - For tone analysis, longer recordings (30+ seconds) provide more accurate results
820
-
821
- ### Privacy Notice
822
-
823
- All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
824
- """)
825
-
826
  gr.Markdown("""
827
- ---
828
- ### System Information
829
-
830
- - **Model**: HuBERT Large for Speech Emotion Recognition
831
- - **Version**: 1.2.0
832
- - **Libraries**: PyTorch, Transformers, Librosa, Plotly
833
-
834
- This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
835
  """)
836
 
837
- # Check if model can load before launching interface
838
- print("Checking model availability...")
839
- load_success = load_emotion_model()
840
- if not load_success:
841
- print("Warning: Emotion model failed to load. Application may have limited functionality.")
 
842
 
843
- # Launch the demo
844
  if __name__ == "__main__":
 
845
  demo.launch()
 
1
+ # voice_emotion_classification.py
2
+
3
  import os
4
  import subprocess
5
  import sys
 
20
  print(f"Failed to install {package_spec}: {e}")
21
  raise
22
 
23
+ # Required packages (you may add version pins if necessary)
24
  required_packages = {
25
  "gradio": None,
26
  "torch": None,
 
29
  "librosa": None,
30
  "scipy": None,
31
  "matplotlib": None,
32
+ "pydub": None
 
33
  }
34
 
35
  installed_packages = {pkg.key for pkg in pkg_resources.working_set}
 
37
  if package not in installed_packages:
38
  install_package(package, version)
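To pin a specific release, the corresponding entry in required_packages carries the version string. A minimal illustration (the version number is hypothetical and not part of this commit; it assumes install_package builds a pip spec such as f"{package}=={version}" when a version is given, matching the package_spec name used in its error message):

    required_packages = {
        "gradio": "4.26.0",  # hypothetical pin, for illustration only
        "torch": None,       # None -> let pip resolve the version
    }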
39
 
40
+ # Now import all necessary packages
41
  import gradio as gr
42
  import torch
43
  import torchaudio
44
  import librosa
45
+ import matplotlib.pyplot as plt
46
+ from matplotlib.colors import LinearSegmentedColormap
47
  from pydub import AudioSegment
48
  import scipy
49
  import io
50
  from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
51
+ from pathlib import Path
52
+ import matplotlib
53
+ matplotlib.use('Agg') # Use non-interactive backend
54
 
55
  # Define emotion labels, tone mapping, and descriptions
56
  EMOTION_DESCRIPTIONS = {
 
63
  "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
64
  }
65
 
66
+ # Here we map emotion to a generalized tone (for example, negative or positive)
67
  TONE_MAPPING = {
68
  "positive": ["happy", "surprise"],
69
  "neutral": ["neutral"],
70
  "negative": ["angry", "sad", "fear", "disgust"]
71
  }
72
 
73
+ # Some Hugging Face models return short labels (e.g., "hap", "ang", etc.).
74
+ # This mapping will ensure they're translated into our full canonical labels.
75
+ MODEL_TO_EMOTION_MAP = {
76
+ "hap": "happy",
77
+ "ang": "angry",
78
+ "sad": "sad",
79
+ "dis": "disgust",
80
+ "fea": "fear",
81
+ "neu": "neutral",
82
+ "sur": "surprise"
83
+ }
84
+
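For reference, a minimal sketch of how this mapping is applied to raw classifier output (the labels and scores below are illustrative, not actual model output):

    # Hypothetical output of audio_emotion_classifier for one chunk
    raw_results = [{"label": "hap", "score": 0.62}, {"label": "neu", "score": 0.31}]
    # Normalize each short label to the canonical form used in EMOTION_DESCRIPTIONS
    normalized = [
        {"label": MODEL_TO_EMOTION_MAP.get(r["label"].lower().strip(), r["label"]),
         "score": r["score"]}
        for r in raw_results
    ]
    # normalized == [{"label": "happy", "score": 0.62}, {"label": "neutral", "score": 0.31}]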
85
  # Global variable for the emotion classifier
86
  audio_emotion_classifier = None
87
 
88
  def load_emotion_model():
89
+ """Load the emotion classification model once and cache it."""
90
  global audio_emotion_classifier
91
  if audio_emotion_classifier is None:
92
  try:
93
  print("Loading emotion classification model...")
94
+ # Use the Hugging Face audio-classification pipeline with a pretrained speech emotion recognition model
95
  model_name = "superb/hubert-large-superb-er"
96
  audio_emotion_classifier = pipeline("audio-classification", model=model_name)
97
  print("Emotion classification model loaded successfully")
 
102
  return True
103
 
104
  def convert_audio_to_wav(audio_file):
105
+ """Convert the uploaded audio to WAV format."""
106
  try:
107
  audio = AudioSegment.from_file(audio_file)
108
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
 
113
  print(f"Error converting audio: {e}")
114
  return None
115
 
116
+ def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=5):
117
  """
118
+ Analyze emotions in an audio file by processing it in chunks.
119
+ Returns a visualization, processed audio path, summary, and detailed results.
 
120
  """
121
  if not load_emotion_model():
122
+ return None, "Failed to load emotion classification model. Please check console for details."
123
+
124
+ # If the file is already a WAV, use it directly; else convert it.
125
+ if audio_file.endswith('.wav'):
126
  audio_path = audio_file
127
  else:
128
  audio_path = convert_audio_to_wav(audio_file)
129
  if not audio_path:
130
+ return None, "Failed to process audio file. Unsupported format or corrupted file."
131
+
132
  try:
133
+ # Load the audio using librosa
134
  audio_data, sample_rate = librosa.load(audio_path, sr=16000)
135
  duration = len(audio_data) / sample_rate
136
+
137
+ # Process in chunks for long files
138
  chunk_samples = int(chunk_duration * sample_rate)
139
  num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))
140
+
141
  all_emotions = []
142
  time_points = []
143
+
 
144
  for i in range(num_chunks):
145
  progress((i + 1) / num_chunks, "Analyzing audio emotions...")
146
  start_idx = i * chunk_samples
147
  end_idx = min(start_idx + chunk_samples, len(audio_data))
148
  chunk = audio_data[start_idx:end_idx]
149
+
150
+ # Skip too-short chunks (<0.5 seconds)
151
  if len(chunk) < 0.5 * sample_rate:
152
  continue
153
+
154
+ # Create a temporary file for this audio chunk
155
  with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
156
  chunk_path = temp_chunk.name
157
  scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))
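  # librosa.load returns float samples in [-1, 1]; scaling by 32767 and casting to int16 stores the chunk as 16-bit PCM for scipy.io.wavfile.write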
158
+
159
+ # Get emotion classification results on this chunk
160
+ results = audio_emotion_classifier(chunk_path)
161
+ os.unlink(chunk_path) # Remove the temporary file
162
+
163
+ all_emotions.append(results)
164
  time_points.append((start_idx / sample_rate, end_idx / sample_rate))
165
+
166
+ # Generate visualization and summary
167
+ fig, detailed_results = generate_emotion_timeline(all_emotions, time_points, duration)
168
+ with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as temp_img:
169
+ img_path = temp_img.name
170
+ fig.savefig(img_path, dpi=100, bbox_inches='tight')
171
+ plt.close(fig)
172
+
173
+ summary = generate_emotion_summary(all_emotions, time_points)
174
+ return img_path, audio_path, summary, detailed_results
175
+
176
  except Exception as e:
177
+ print(f"Error analyzing audio: {e}")
178
  import traceback
179
  traceback.print_exc()
180
+ return None, None, f"Error analyzing audio: {str(e)}", None
181
 
182
+ def generate_emotion_timeline(all_emotions, time_points, duration):
183
  """
184
+ Generate a bar chart visualization of emotion percentages with tone analysis.
185
+ Returns the matplotlib figure and a list of detailed results.
186
  """
187
+ # All possible emotion labels from our dictionary
188
  emotion_labels = list(EMOTION_DESCRIPTIONS.keys())
189
+
190
+ # We'll accumulate counts based on our canonical labels (e.g., "happy", "angry").
191
+ emotion_counts = {}
192
+
193
+ for emotions in all_emotions:
194
+ if not emotions:
195
+ continue
196
+
197
+ # The pipeline returns items like {"label": "Hap", "score": 0.95}, etc.
198
+ top_emotion = max(emotions, key=lambda x: x['score'])
199
+
200
+ # Normalize the label from the model to a canonical label used in EMOTION_DESCRIPTIONS
201
+ raw_label = top_emotion['label'].lower().strip() # e.g., "hap", "ang", ...
202
+ canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
203
+ # If there's no mapping, we leave it as raw_label.
204
+ # But typically, it should be one of "happy", "angry", "disgust", "fear", "sad", "neutral", "surprise".
205
+
206
+ # Count how many times each canonical label appears
207
+ emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
208
+
209
+ total_chunks = len(all_emotions)
210
+ emotion_percentages = {
211
+ e: (count / total_chunks * 100) for e, count in emotion_counts.items()
212
  }
213
+
214
+ # Add zero percentages for emotions that didn't appear
215
  for label in emotion_labels:
216
+ if label not in emotion_percentages:
217
+ emotion_percentages[label] = 0.0
218
+
219
+ # Sort emotions by percentage
220
+ sorted_emotions = sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True)
221
+
222
+ # Create the bar chart with subplots: one for emotions and one for tone
223
+ fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), height_ratios=[3, 1], gridspec_kw={'hspace': 0.3})
224
+
225
+ # Capitalize each label for a nice display
226
+ emotions = [item[0].capitalize() for item in sorted_emotions]
227
+ percentages = [item[1] for item in sorted_emotions]
228
+
229
+ # Custom colors for emotions (enough for 7 emotions)
230
+ colors = ['red', 'brown', 'purple', 'green', 'gray', 'blue', 'orange']
231
+ if len(emotions) <= len(colors):
232
+ bar_colors = colors[:len(emotions)]
233
+ else:
234
+ # Fallback if there are more emotions than colors
235
+ bar_colors = colors + ['#666666'] * (len(emotions) - len(colors))
236
+
237
+ # Plot emotion bars
238
+ bars = ax1.bar(emotions, percentages, color=bar_colors)
239
+
240
+ # Add percentage labels on top of each bar
241
+ for bar in bars:
242
+ height = bar.get_height()
243
+ ax1.annotate(f'{height:.1f}%',
244
+ xy=(bar.get_x() + bar.get_width() / 2, height),
245
+ xytext=(0, 3), # 3 points vertical offset
246
+ textcoords="offset points",
247
+ ha='center', va='bottom')
248
+
249
+ ax1.set_ylim(0, 100) # Fixed 100% scale
250
+ ax1.set_ylabel('Percentage (%)')
251
+ ax1.set_title('Emotion Distribution')
252
+ ax1.grid(axis='y', linestyle='--', alpha=0.7)
253
+
254
+ # Calculate tone percentages based on the canonical labels we found
255
+ tone_percentages = {"positive": 0, "neutral": 0, "negative": 0}
256
+
257
+ for emotion_label, percentage in emotion_percentages.items():
258
+ for tone, emotions_list in TONE_MAPPING.items():
259
+ if emotion_label in emotions_list:
260
+ tone_percentages[tone] += percentage
261
+
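A worked example of this tone aggregation (percentages are illustrative):

    # emotion_percentages = {"happy": 50.0, "surprise": 10.0, "neutral": 25.0, "sad": 15.0}
    # TONE_MAPPING folds these into:
    #   positive = 50 + 10 = 60%,  neutral = 25%,  negative = 15%
    # which is what the second subplot (ax2) displays.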
262
+ # Plot tone bars
263
+ tones = list(tone_percentages.keys())
264
+ tone_values = list(tone_percentages.values())
265
+ tone_colors = {'positive': 'green', 'neutral': 'gray', 'negative': 'red'}
266
+ tone_bars = ax2.bar(tones, tone_values, color=[tone_colors[t] for t in tones])
267
+
268
+ # Add percentage labels on tone bars
269
+ for bar in tone_bars:
270
+ height = bar.get_height()
271
+ if height > 0: # Only add label if there's a visible bar
272
+ ax2.annotate(f'{height:.1f}%',
273
+ xy=(bar.get_x() + bar.get_width() / 2, height),
274
+ xytext=(0, 3),
275
+ textcoords="offset points",
276
+ ha='center', va='bottom')
277
+
278
+ ax2.set_ylim(0, 100)
279
+ ax2.set_ylabel('Percentage (%)')
280
+ ax2.set_title('Tone Analysis')
281
+ ax2.grid(axis='y', linestyle='--', alpha=0.7)
282
+
283
+ plt.tight_layout()
284
+
285
+ # Generate a more detailed time-segmented result
286
+ detailed_results = []
287
+ for idx, (emotions, (start_time, end_time)) in enumerate(zip(all_emotions, time_points)):
288
+ if not emotions:
289
+ continue
290
+
291
+ top_emotion = max(emotions, key=lambda x: x['score'])
292
+ raw_label = top_emotion['label'].lower().strip()
293
+ canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
294
+
295
+ # Determine the tone for this emotion
296
+ # (based on canonical_label rather than the raw model label)
297
+ tone = next((t for t, e_list in TONE_MAPPING.items() if canonical_label in e_list), "unknown")
298
+
299
+ detailed_results.append({
300
+ 'Time Range': f"{start_time:.1f}s - {end_time:.1f}s",
301
+ 'Emotion': canonical_label,
302
+ 'Tone': tone.capitalize(),
303
+ 'Confidence': f"{top_emotion['score']:.2f}",
304
+ 'Description': EMOTION_DESCRIPTIONS.get(canonical_label, "")
305
+ })
306
+
307
+ return fig, detailed_results
308
 
309
+ def generate_emotion_summary(all_emotions, time_points):
310
  """
311
+ Create a summary text from the emotion analysis.
312
+ Counts occurrences and computes percentages of the dominant emotion.
313
  """
314
  if not all_emotions:
315
  return "No emotional content detected."
316
+
317
  emotion_counts = {}
 
318
  total_chunks = len(all_emotions)
319
+
320
+ for emotions in all_emotions:
321
+ if not emotions:
322
+ continue
323
+ top_emotion = max(emotions, key=lambda x: x['score'])
324
+
325
+ # Normalize the label
326
+ raw_label = top_emotion['label'].lower().strip()
327
+ canonical_label = MODEL_TO_EMOTION_MAP.get(raw_label, raw_label)
328
+
329
+ emotion_counts[canonical_label] = emotion_counts.get(canonical_label, 0) + 1
330
+
331
+ emotion_percentages = {
332
+ e: (count / total_chunks * 100)
333
+ for e, count in emotion_counts.items()
334
+ }
335
+
336
+ if not emotion_percentages:
337
+ return "No emotional content detected."
338
+
339
+ # Find the dominant emotion (highest percentage)
340
+ dominant_emotion = max(emotion_percentages.items(), key=lambda x: x[1])[0]
341
+
342
  summary = f"### Voice Emotion Analysis Summary\n\n"
343
+ summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({emotion_percentages[dominant_emotion]:.1f}%)\n\n"
344
  summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"
 
 
345
  summary += "**Emotion distribution:**\n"
346
+
347
+ for emotion, percentage in sorted(emotion_percentages.items(), key=lambda x: x[1], reverse=True):
348
+ summary += f"- {emotion.capitalize()}: {percentage:.1f}%\n"
349
+
350
+ summary += "\n**Interpretation:** The voice predominantly expresses {0} emotion".format(dominant_emotion)
351
  return summary
352
 
353
+ def record_audio(audio):
+ """Save recorded audio to a temporary WAV file and return its path."""
+ try:
+ # gr.Audio(type="filepath") already supplies a path on disk; reuse it directly.
+ if isinstance(audio, str):
+ return audio
+ # Otherwise treat the input as raw bytes and write them out.
+ with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
+ temp_file.write(audio)
+ return temp_file.name
+ except Exception as e:
+ print(f"Error saving recorded audio: {e}")
+ return None
364
 
365
  def process_audio(audio_file, progress=gr.Progress()):
366
+ """Process the audio file and analyze emotions."""
367
+ if audio_file is None:
368
+ return None, None, "No audio file provided.", None
369
+
370
+ img_path, processed_audio, summary, results = analyze_audio_emotions(audio_file, progress)
371
+ if img_path is None:
372
+ return None, None, "Failed to analyze audio emotions.", None
373
+ return img_path, processed_audio, summary, results
374
+
375
+ # Create Gradio interface
376
+ with gr.Blocks(title="Voice Emotion Analysis System") as demo:
377
  gr.Markdown("""
378
+ # 🎙️ Voice Emotion Analysis System
379
+
380
+ This app analyzes the emotional content of voice recordings.
381
+
382
+ It detects emotions including:
383
+
384
+ * 😠 **Anger**
+ * 🤢 **Disgust**
+ * 😨 **Fear**
+ * 😊 **Happiness**
+ * 😐 **Neutral**
+ * 😢 **Sadness**
+ * 😲 **Surprise**
391
+
392
+ And provides a detailed analysis and timeline.
393
  """)
394
+
395
  with gr.Tabs():
 
396
  with gr.TabItem("Upload Audio"):
397
  with gr.Row():
398
  with gr.Column(scale=1):
399
  audio_input = gr.Audio(
400
  label="Upload Audio File",
401
  type="filepath",
402
+ sources=["upload"]
 
403
  )
404
+ process_btn = gr.Button("Analyze Voice Emotions")
405
  with gr.Column(scale=2):
406
+ emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
407
  with gr.Row():
408
+ audio_playback = gr.Audio(label="Processed Audio", show_label=True)
409
+ emotion_summary = gr.Markdown(label="Emotion Summary")
 
 
410
  with gr.Row():
411
  emotion_results = gr.DataFrame(
412
+ headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
413
  label="Detailed Emotion Analysis"
414
  )
 
415
  process_btn.click(
416
  fn=process_audio,
417
  inputs=[audio_input],
418
+ outputs=[emotion_timeline, audio_playback, emotion_summary, emotion_results]
419
  )
420
+
 
421
  with gr.TabItem("Record Voice"):
422
  with gr.Row():
423
  with gr.Column(scale=1):
424
  record_input = gr.Audio(
425
  label="Record Your Voice",
426
  sources=["microphone"],
427
+ type="filepath"
 
428
  )
429
+ analyze_btn = gr.Button("Analyze Recording")
430
  with gr.Column(scale=2):
431
+ rec_emotion_timeline = gr.Image(label="Emotion Timeline", show_label=True)
432
  with gr.Row():
433
+ rec_audio_playback = gr.Audio(label="Processed Audio", show_label=True)
434
+ rec_emotion_summary = gr.Markdown(label="Emotion Summary")
 
 
435
  with gr.Row():
436
  rec_emotion_results = gr.DataFrame(
437
+ headers=["Time Range", "Emotion", "Tone", "Confidence", "Description"],
438
  label="Detailed Emotion Analysis"
439
  )
 
440
  analyze_btn.click(
441
  fn=process_audio,
442
  inputs=[record_input],
443
+ outputs=[rec_emotion_timeline, rec_audio_playback, rec_emotion_summary, rec_emotion_results]
444
  )
445
+
446
  gr.Markdown("""
447
+ ### How to Use
448
+
449
+ 1. **Upload Audio Tab:** Upload an audio file and click "Analyze Voice Emotions".
450
+ 2. **Record Voice Tab:** Record your voice and click "Analyze Recording".
451
+
452
+ **Tips:**
453
+ - Use clear recordings with minimal background noise.
454
+ - Longer recordings yield more consistent results.
455
  """)
456
 
457
+ def initialize_app():
458
+ print("Initializing voice emotion analysis app...")
459
+ if load_emotion_model():
460
+ print("Emotion model loaded successfully!")
461
+ else:
462
+ print("Failed to load emotion model.")
463
 
 
464
  if __name__ == "__main__":
465
+ initialize_app()
466
  demo.launch()