Natwar committed
Commit 343474c · verified · 1 Parent(s): adc0b7c

Create app.py

Files changed (1)
  1. app.py +845 -0
app.py ADDED
@@ -0,0 +1,845 @@
import os
import subprocess
import sys
import pkg_resources
import time
import tempfile
import numpy as np
import warnings
from pathlib import Path
warnings.filterwarnings("ignore")

def install_package(package, version=None):
    package_spec = f"{package}=={version}" if version else package
    print(f"Installing {package_spec}...")
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--no-cache-dir", package_spec])
    except subprocess.CalledProcessError as e:
        print(f"Failed to install {package_spec}: {e}")
        raise
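# Example usage: install_package("librosa") installs the latest release, while
# install_package("librosa", "0.10.1") would pin that (illustrative) version.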

# Required packages (add version pins if needed)
required_packages = {
    "gradio": None,
    "torch": None,
    "torchaudio": None,
    "transformers": None,
    "librosa": None,
    "scipy": None,
    "matplotlib": None,
    "pydub": None,
    "plotly": None
}

installed_packages = {pkg.key for pkg in pkg_resources.working_set}
for package, version in required_packages.items():
    if package not in installed_packages:
        install_package(package, version)
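# Note: pkg_resources keys are lowercase distribution names; the check above works here
# because each listed package's distribution name matches the name used to install it.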

# Now import necessary packages
import gradio as gr
import torch
import torchaudio
import librosa
import matplotlib
matplotlib.use('Agg')  # non-interactive backend for any fallback
from pydub import AudioSegment
import scipy.io.wavfile  # explicit submodule import so scipy.io.wavfile.write is available
import io
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import plotly.graph_objects as go

# Define emotion labels, tone mapping, and descriptions
EMOTION_DESCRIPTIONS = {
    "angry": "Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.",
    "disgust": "Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.",
    "fear": "Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.",
    "happy": "Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.",
    "neutral": "Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.",
    "sad": "Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.",
    "surprise": "Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic."
}
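# These keys are assumed to match the label strings emitted by the audio-classification
# pipeline below; if the chosen checkpoint reports different label names, a mapping step
# would be needed before the lookups in the charting and summary code.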

# If you wish to group emotions by tone, you can do so here:
TONE_MAPPING = {
    "positive": ["happy", "surprise"],
    "neutral": ["neutral"],
    "negative": ["angry", "sad", "fear", "disgust"]
}

# Global variable for the emotion classifier
audio_emotion_classifier = None

def load_emotion_model():
    """Load and cache the speech emotion classification model."""
    global audio_emotion_classifier
    if audio_emotion_classifier is None:
        try:
            print("Loading emotion classification model...")
            model_name = "superb/hubert-large-superb-er"
            audio_emotion_classifier = pipeline("audio-classification", model=model_name)
            print("Emotion classification model loaded successfully")
            return True
        except Exception as e:
            print(f"Error loading emotion model: {e}")
            return False
    return True

def convert_audio_to_wav(audio_file):
    """Convert uploaded audio to WAV format."""
    try:
        audio = AudioSegment.from_file(audio_file)
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_wav:
            wav_path = temp_wav.name
            audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        print(f"Error converting audio: {e}")
        return None
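# Note: pydub delegates decoding of compressed formats (MP3, M4A, etc.) to ffmpeg/libav,
# which must be available on the system for the conversion above to succeed.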

def analyze_voice_tone(audio_file):
    """
    Analyze the tone characteristics of the voice using more robust measurements.
    Includes pitch variation, energy dynamics, and spectral features.
    """
    try:
        audio_data, sample_rate = librosa.load(audio_file, sr=16000)

        # 1. Basic audio features
        audio_duration = librosa.get_duration(y=audio_data, sr=sample_rate)
        if audio_duration < 1.0:  # Too short for reliable analysis
            return "Audio too short for reliable tone analysis. Please provide at least a few seconds of speech."

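        # librosa.pyin estimates a per-frame fundamental frequency and marks unvoiced
        # frames as NaN, which is why the NaN values are filtered out afterwards.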
        # 2. Pitch analysis with more robust handling
        f0, voiced_flag, voiced_prob = librosa.pyin(
            audio_data,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sample_rate
        )

        # Filter out NaN values and get valid pitch points
        valid_f0 = f0[~np.isnan(f0)]

        # If no pitch detected, may be noise or silence
        if len(valid_f0) < 10:
            return "**Voice Tone Analysis:** Unable to detect sufficient pitched content for analysis. The audio may contain primarily noise, silence, or non-speech sounds."

        # 3. Calculate improved statistics
        mean_pitch = np.mean(valid_f0)
        median_pitch = np.median(valid_f0)
        std_pitch = np.std(valid_f0)
        pitch_range = np.percentile(valid_f0, 95) - np.percentile(valid_f0, 5)

        # 4. Energy/volume dynamics
        rms_energy = librosa.feature.rms(y=audio_data)[0]
        mean_energy = np.mean(rms_energy)
        std_energy = np.std(rms_energy)
        energy_range = np.percentile(rms_energy, 95) - np.percentile(rms_energy, 5)

        # 5. Speaking rate approximation (zero-crossing rate can help estimate this)
        zcr = librosa.feature.zero_crossing_rate(audio_data)[0]
        mean_zcr = np.mean(zcr)

        # 6. Calculate pitch variability relative to the mean (coefficient of variation)
        # This gives a better measure than raw std dev
        pitch_cv = (std_pitch / mean_pitch) * 100 if mean_pitch > 0 else 0
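        # For example, a mean pitch of 150 Hz with a standard deviation of 15 Hz
        # gives pitch_cv = 10, i.e. a 10% coefficient of variation.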

        # 7. Tone classification logic using multiple features
        # Define tone characteristics based on combinations of features
        tone_class = ""
        tone_details = []

        # Pitch-based characteristics
        if pitch_cv < 5:
            tone_class = "Monotone"
            tone_details.append("Very little pitch variation - sounds flat and unexpressive")
        elif pitch_cv < 12:
            tone_class = "Steady"
            tone_details.append("Moderate pitch variation - sounds controlled and measured")
        elif pitch_cv < 20:
            tone_class = "Expressive"
            tone_details.append("Good pitch variation - sounds naturally engaging")
        else:
            tone_class = "Highly Dynamic"
            tone_details.append("Strong pitch variation - sounds animated and emphatic")

        # Pitch range classification
        if mean_pitch > 180:
            tone_details.append("Higher pitched voice - may convey excitement or tension")
        elif mean_pitch < 120:
            tone_details.append("Lower pitched voice - may convey calmness or authority")
        else:
            tone_details.append("Mid-range pitch - typically perceived as balanced")

        # Energy/volume characteristics
        energy_cv = (std_energy / mean_energy) * 100 if mean_energy > 0 else 0
        if energy_cv < 10:
            tone_details.append("Consistent volume - sounds controlled and measured")
        elif energy_cv > 30:
            tone_details.append("Variable volume - suggests emotional emphasis or expressiveness")

        # Speech rate approximation
        if mean_zcr > 0.1:
            tone_details.append("Faster speech rate - may convey urgency or enthusiasm")
        elif mean_zcr < 0.05:
            tone_details.append("Slower speech rate - may convey thoughtfulness or hesitation")

        # Generate tone summary and interpretation
        tone_analysis = "### Voice Tone Analysis\n\n"
        tone_analysis += f"**Primary tone quality:** {tone_class}\n\n"
        tone_analysis += "**Tone characteristics:**\n"
        for detail in tone_details:
            tone_analysis += f"- {detail}\n"

        tone_analysis += "\n**Interpretation:**\n"

        # Generate interpretation based on the classified tone
        if tone_class == "Monotone":
            tone_analysis += ("A monotone delivery can create distance and reduce engagement. "
                              "Consider adding more vocal variety to sound more engaging and authentic.")
        elif tone_class == "Steady":
            tone_analysis += ("Your steady tone suggests reliability and control. "
                              "This can be effective in professional settings or when conveying serious information.")
        elif tone_class == "Expressive":
            tone_analysis += ("Your expressive tone helps maintain listener interest and emphasize key points. "
                              "This naturally engaging quality helps convey authenticity and conviction.")
        else:  # Highly Dynamic
            tone_analysis += ("Your highly dynamic vocal style conveys strong emotion and energy. "
                              "This can be powerful for storytelling and persuasion, though in some contexts "
                              "a more measured approach might be appropriate.")

        return tone_analysis

    except Exception as e:
        print(f"Error in tone analysis: {e}")
        return "Tone analysis unavailable due to an error processing the audio."

def analyze_audio_emotions(audio_file, progress=gr.Progress(), chunk_duration=2):
    """
    Analyze speech emotions in short chunks,
    building a timeline of confidence for each emotion.
    Returns a Plotly figure, summary text, detailed results.
    """
    if not load_emotion_model():
        return None, "Failed to load emotion classifier.", None

    # Use existing WAV if possible, else convert
    if audio_file.endswith(".wav"):
        audio_path = audio_file
    else:
        audio_path = convert_audio_to_wav(audio_file)
        if not audio_path:
            return None, "Could not process audio file", None

    try:
        # Load with librosa
        audio_data, sample_rate = librosa.load(audio_path, sr=16000)
        duration = len(audio_data) / sample_rate

        # Use shorter chunks for more granular analysis
        chunk_samples = int(chunk_duration * sample_rate)
        num_chunks = max(1, int(np.ceil(len(audio_data) / chunk_samples)))

        all_emotions = []
        time_points = []

        # For each chunk, run emotion classification
        for i in range(num_chunks):
            progress((i + 1) / num_chunks, "Analyzing audio emotions...")
            start_idx = i * chunk_samples
            end_idx = min(start_idx + chunk_samples, len(audio_data))
            chunk = audio_data[start_idx:end_idx]

            # Skip very short chunks
            if len(chunk) < 0.5 * sample_rate:
                continue

            # Write chunk to temp WAV
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_chunk:
                chunk_path = temp_chunk.name
                scipy.io.wavfile.write(chunk_path, sample_rate, (chunk * 32767).astype(np.int16))

            # Classify - extract top-n predictions for each chunk
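            # (If the checkpoint defines fewer than 7 labels, the pipeline simply returns
            # scores for all labels it has; top_k is capped at the number of classes.)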
            raw_results = audio_emotion_classifier(chunk_path, top_k=7)  # Get all 7 emotions
            os.unlink(chunk_path)

            all_emotions.append(raw_results)
            time_points.append((start_idx / sample_rate, end_idx / sample_rate))

        # Skip if no valid emotions detected
        if not all_emotions:
            return None, "No speech detected in the audio.", None

        # Build Plotly chart with improved styling
        fig = build_plotly_line_chart(all_emotions, time_points, duration)

        # Build summary and detailed results
        summary_text = generate_emotion_summary(all_emotions)
        detailed_results = build_detailed_results(all_emotions, time_points)

        return fig, summary_text, detailed_results

    except Exception as e:
        import traceback
        traceback.print_exc()
        return None, f"Error analyzing audio: {str(e)}", None

def smooth_data(data, window_size=3):
    """Apply a moving average smoothing to the data"""
    smoothed = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
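    # 'valid' mode shortens the output: e.g. a length-10 input with window_size=3
    # produces 8 smoothed points, so 2 points are padded back in below.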

    # Add back points that were lost in the convolution
    padding = len(data) - len(smoothed)
    if padding > 0:
        # Add padding at the beginning
        padding_front = padding // 2
        padding_back = padding - padding_front

        # Use the first/last values for padding
        front_padding = [smoothed[0]] * padding_front
        back_padding = [smoothed[-1]] * padding_back

        smoothed = np.concatenate([front_padding, smoothed, back_padding])

    return smoothed

def build_plotly_line_chart(all_emotions, time_points, duration):
    """
    Create an improved Plotly line chart with toggles for each emotion.
    Shows all emotions for each time point rather than just the top one.
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Custom color scheme for emotions
    colors = {
        "angry": "#E53935",    # Red
        "disgust": "#8E24AA",  # Purple
        "fear": "#7B1FA2",     # Deep Purple
        "happy": "#FFC107",    # Amber/Yellow
        "neutral": "#78909C",  # Blue Grey
        "sad": "#1E88E5",      # Blue
        "surprise": "#43A047"  # Green
    }

    # Prepare data structure for all emotions
    emotion_data = {label: [] for label in emotion_labels}
    timeline_times = [(start + end) / 2 for start, end in time_points]

    # Process emotion scores - ensure all emotions have values
    for chunk_emotions in all_emotions:
        # Create a mapping of label to score for this chunk
        scores = {item["label"]: item["score"] for item in chunk_emotions}

        # Ensure all emotion labels have a value (default to 0.0)
        for label in emotion_labels:
            emotion_data[label].append(scores.get(label, 0.0))

    # Smooth the data
    for label in emotion_labels:
        if len(emotion_data[label]) > 2:
            emotion_data[label] = smooth_data(emotion_data[label])

    # Build the chart
    fig = go.Figure()

    # Add traces for each emotion
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='lines',
                name=label.capitalize(),
                line=dict(
                    color=colors.get(label, None),
                    width=3,
                    shape='spline',  # Curved lines
                    smoothing=1.3
                ),
                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>',
            )
        )

    # Add markers for dominant emotion at each point
    dominant_markers_x = []
    dominant_markers_y = []
    dominant_markers_text = []
    dominant_markers_color = []

    for i, t in enumerate(timeline_times):
        scores = {label: emotion_data[label][i] for label in emotion_labels}
        dominant = max(scores.items(), key=lambda x: x[1])

        dominant_markers_x.append(t)
        dominant_markers_y.append(dominant[1])
        dominant_markers_text.append(f"{dominant[0].capitalize()}: {dominant[1]:.2f}")
        dominant_markers_color.append(colors.get(dominant[0], "#000000"))

    fig.add_trace(
        go.Scatter(
            x=dominant_markers_x,
            y=dominant_markers_y,
            mode='markers',
            marker=dict(
                size=10,
                color=dominant_markers_color,
                line=dict(width=2, color='white')
            ),
            name="Dominant Emotion",
            text=dominant_markers_text,
            hoverinfo="text",
            hovertemplate='%{text}<extra></extra>'
        )
    )

    # Add area chart for better visualization
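    # The fillcolor expression below converts each hex colour into an
    # 'rgba(r, g, b, 0.1)' string, so every area is a translucent shade of its line.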
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='none',
                name=f"{label.capitalize()} Area",
                fill='tozeroy',
                fillcolor=f"rgba{tuple(list(int(colors.get(label, '#000000').lstrip('#')[i:i+2], 16) for i in (0, 2, 4)) + [0.1])}",
                showlegend=False,
                hoverinfo='skip'
            )
        )

    # Improve layout
    fig.update_layout(
        title={
            'text': "Voice Emotion Analysis Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Confidence Score",
        yaxis=dict(
            range=[0, 1.0],
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        xaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        plot_bgcolor='white',
        legend=dict(
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode='closest',
        height=500,  # Larger size for better viewing
        margin=dict(l=10, r=10, t=80, b=50)
    )

    return fig

def generate_alternative_chart(all_emotions, time_points):
    """
    Create a stacked area chart to better visualize emotion changes over time
    """
    emotion_labels = list(EMOTION_DESCRIPTIONS.keys())

    # Custom color scheme for emotions - more visible/distinct
    colors = {
        "angry": "#F44336",    # Red
        "disgust": "#9C27B0",  # Purple
        "fear": "#673AB7",     # Deep Purple
        "happy": "#FFC107",    # Amber
        "neutral": "#607D8B",  # Blue Grey
        "sad": "#2196F3",      # Blue
        "surprise": "#4CAF50"  # Green
    }

    # Prepare timeline points
    timeline_times = [(start + end) / 2 for start, end in time_points]

    # Prepare data structure for all emotions
    emotion_data = {label: [] for label in emotion_labels}

    # Process emotion scores - ensure all emotions have values
    for chunk_emotions in all_emotions:
        # Create a mapping of label to score for this chunk
        scores = {item["label"]: item["score"] for item in chunk_emotions}

        # Ensure all emotion labels have a value (default to 0.0)
        for label in emotion_labels:
            emotion_data[label].append(scores.get(label, 0.0))

    # Create the stacked area chart
    fig = go.Figure()

    # Add each emotion as a separate trace
    for label in emotion_labels:
        fig.add_trace(
            go.Scatter(
                x=timeline_times,
                y=emotion_data[label],
                mode='lines',
                name=label.capitalize(),
                line=dict(width=0.5, color=colors.get(label, None)),
                stackgroup='one',  # This makes it a stacked area chart
                fillcolor=colors.get(label, None),
                hovertemplate=f'{label.capitalize()}: %{{y:.2f}}<extra></extra>'
            )
        )

    # Improve layout
    fig.update_layout(
        title={
            'text': "Voice Emotion Distribution Over Time",
            'font': {'size': 22, 'family': 'Arial, sans-serif'}
        },
        xaxis_title="Time (seconds)",
        yaxis_title="Emotion Intensity",
        yaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        xaxis=dict(
            showgrid=True,
            gridcolor='rgba(230, 230, 230, 0.8)'
        ),
        plot_bgcolor='white',
        legend=dict(
            bordercolor='rgba(0,0,0,0.1)',
            borderwidth=1,
            orientation="h",
            yanchor="bottom",
            y=1.02,
            xanchor="right",
            x=1
        ),
        hovermode='closest',
        height=500,
        margin=dict(l=10, r=10, t=80, b=50)
    )

    return fig

def generate_emotion_summary(all_emotions):
    """
    Produce an improved textual summary of the overall emotion distribution.
    """
    if not all_emotions:
        return "No emotional content detected."

    emotion_counts = {}
    emotion_confidence = {}
    total_chunks = len(all_emotions)

    for chunk_emotions in all_emotions:
        top_emotion = max(chunk_emotions, key=lambda x: x['score'])
        label = top_emotion["label"]
        confidence = top_emotion["score"]

        emotion_counts[label] = emotion_counts.get(label, 0) + 1
        emotion_confidence[label] = emotion_confidence.get(label, 0) + confidence

    # Calculate average confidence for each emotion
    for emotion in emotion_confidence:
        if emotion_counts[emotion] > 0:
            emotion_confidence[emotion] /= emotion_counts[emotion]

    # Dominant emotion (highest percentage)
    dominant_emotion = max(emotion_counts, key=emotion_counts.get)
    dominant_pct = (emotion_counts[dominant_emotion] / total_chunks) * 100

    # Most confident emotion (might differ from dominant)
    most_confident = max(emotion_confidence, key=emotion_confidence.get)
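    # "Dominant" is the emotion that wins the most chunks; "most confident" is the one
    # with the highest average score when it wins, so the two can legitimately differ.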

    # Tone grouping analysis
    tone_group_counts = {group: 0 for group in TONE_MAPPING}
    for emotion, count in emotion_counts.items():
        for tone_group, emotions in TONE_MAPPING.items():
            if emotion in emotions:
                tone_group_counts[tone_group] += count

    dominant_tone = max(tone_group_counts, key=tone_group_counts.get)
    dominant_tone_pct = (tone_group_counts[dominant_tone] / total_chunks) * 100

    # Build summary with markdown formatting
    summary = "### Voice Emotion Analysis Summary\n\n"
    summary += f"**Dominant emotion:** {dominant_emotion.capitalize()} ({dominant_pct:.1f}%)\n\n"

    if dominant_emotion != most_confident and emotion_confidence[most_confident] > 0.7:
        summary += f"**Most confident detection:** {most_confident.capitalize()} "
        summary += f"(avg. confidence: {emotion_confidence[most_confident]:.2f})\n\n"

    summary += f"**Overall tone:** {dominant_tone.capitalize()} ({dominant_tone_pct:.1f}%)\n\n"
    summary += f"**Description:** {EMOTION_DESCRIPTIONS.get(dominant_emotion, '')}\n\n"

    # Show emotion distribution as sorted list
    summary += "**Emotion distribution:**\n"
    for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_chunks) * 100
        avg_conf = emotion_confidence[emotion]
        summary += f"- {emotion.capitalize()}: {percentage:.1f}% (confidence: {avg_conf:.2f})\n"

    # Add interpretation based on dominant emotion
    summary += "\n**Interpretation:**\n"

    if dominant_emotion == "happy":
        summary += "The voice conveys primarily positive emotions, suggesting enthusiasm, satisfaction, or joy."
    elif dominant_emotion == "neutral":
        summary += "The voice maintains an even emotional tone, suggesting composure or professional delivery."
    elif dominant_emotion == "sad":
        summary += "The voice conveys melancholy or disappointment, potentially indicating concern or distress."
    elif dominant_emotion == "angry":
        summary += "The voice shows frustration or assertiveness, suggesting strong conviction or displeasure."
    elif dominant_emotion == "fear":
        summary += "The voice reveals anxiety or nervousness, suggesting uncertainty or concern."
    elif dominant_emotion == "disgust":
        summary += "The voice expresses disapproval or aversion, suggesting rejection of discussed concepts."
    elif dominant_emotion == "surprise":
        summary += "The voice shows unexpected reactions, suggesting discovery of new information or astonishment."

    return summary

def build_detailed_results(all_emotions, time_points):
    """
    Return a list of dictionaries containing chunk start-end, top emotion, confidence, description.
    Suitable for Gradio DataFrame display.
    """
    results_list = []
    for (emotions, (start_time, end_time)) in zip(all_emotions, time_points):
        top_emotion = max(emotions, key=lambda x: x['score'])
        label = top_emotion["label"]

        # Find second highest emotion if available
        if len(emotions) > 1:
            sorted_emotions = sorted(emotions, key=lambda x: x['score'], reverse=True)
            second_emotion = sorted_emotions[1]["label"].capitalize()
            second_score = sorted_emotions[1]["score"]
            secondary = f" ({second_emotion}: {second_score:.2f})"
        else:
            secondary = ""

        results_list.append({
            "Time Range": f"{start_time:.1f}s - {end_time:.1f}s",
            "Primary Emotion": label.capitalize(),
            "Confidence": f"{top_emotion['score']:.2f}{secondary}",
            "Description": EMOTION_DESCRIPTIONS.get(label, "")
        })
    return results_list

def process_audio(audio_file, progress=gr.Progress()):
    """
    Main handler for Gradio:
    1) Emotion analysis (returns Plotly figure).
    2) Tone analysis (returns descriptive text).
    """
    if not audio_file:
        return None, None, "No audio file provided.", None, "No tone analysis."

    # 1) Analyze emotions
    fig, summary_text, detailed_results = analyze_audio_emotions(audio_file, progress)
    if fig is None:  # Error or missing
        return None, None, "Failed to analyze audio emotions.", None, "Tone analysis unavailable."

    # 2) Generate alternative chart
    # Extract the necessary data from detailed_results to create time_points
    time_points = []
    for result in detailed_results:
        time_range = result["Time Range"]
        start_time = float(time_range.split("s")[0])
        end_time = float(time_range.split(" - ")[1].split("s")[0])
        time_points.append((start_time, end_time))

    # Extract emotion data from detailed_results
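    # Note: the per-chunk scores are reconstructed here from the formatted strings in
    # detailed_results, so only the top one or two emotions per chunk are recovered and
    # the remaining labels default to 0.0; the area chart is therefore an approximation.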
    all_emotions = []
    for result in detailed_results:
        # Parse the primary emotion and confidence
        primary_emotion = result["Primary Emotion"].lower()
        confidence_str = result["Confidence"].split("(")[0].strip()
        primary_confidence = float(confidence_str)

        # Create a list of emotion dictionaries for this time point
        emotions_at_time = [{"label": primary_emotion, "score": primary_confidence}]

        # Check if there's a secondary emotion
        if "(" in result["Confidence"]:
            secondary_part = result["Confidence"].split("(")[1].split(")")[0]
            secondary_emotion = secondary_part.split(":")[0].strip().lower()
            secondary_confidence = float(secondary_part.split(":")[1].strip())
            emotions_at_time.append({"label": secondary_emotion, "score": secondary_confidence})

        # Add remaining emotions with zero confidence
        for emotion in EMOTION_DESCRIPTIONS.keys():
            if emotion not in [e["label"] for e in emotions_at_time]:
                emotions_at_time.append({"label": emotion, "score": 0.0})

        all_emotions.append(emotions_at_time)

    # Now we can generate the alternative chart
    alt_fig = generate_alternative_chart(all_emotions, time_points)

    # 3) Analyze tone
    tone_analysis = analyze_voice_tone(audio_file)

    return fig, alt_fig, summary_text, detailed_results, tone_analysis

# Create Gradio interface with improved UI/UX
with gr.Blocks(title="Voice Emotion & Tone Analysis System", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎙️ Voice Emotion & Tone Analysis System

    This app provides professional analysis of:
    - **Emotions** in your voice (Anger, Disgust, Fear, Happy, Neutral, Sad, Surprise)
    - **Tone characteristics** (based on pitch, energy, and speech patterns)

    The interactive timeline shows emotion confidence scores throughout your audio.
    """)

    with gr.Tabs():
        # Tab 1: Upload
        with gr.TabItem("Upload Audio"):
            with gr.Row():
                with gr.Column(scale=1):
                    audio_input = gr.Audio(
                        label="Upload Audio File",
                        type="filepath",
                        sources=["upload"],
                        elem_id="audio_upload"
                    )
                    process_btn = gr.Button("Analyze Voice", variant="primary")
                    gr.Markdown("""
                    **Supports:** MP3, WAV, M4A, and most audio formats
                    **For best results:** Use a clear voice recording with minimal background noise
                    """)
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                       elem_id="emotion_plot",
                                                       container=True)
                        with gr.TabItem("Area Chart"):
                            emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                         elem_id="emotion_area_plot",
                                                         container=True)
            with gr.Row():
                with gr.Column():
                    emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    tone_analysis_output = gr.Markdown(label="Tone Analysis")
            with gr.Row():
                emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )

            process_btn.click(
                fn=process_audio,
                inputs=[audio_input],
                outputs=[emotion_timeline, emotion_area_chart, emotion_summary, emotion_results, tone_analysis_output]
            )
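            # The outputs list above must stay in the same order as the 5-tuple returned
            # by process_audio (line chart, area chart, summary, results table, tone text).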

        # Tab 2: Record
        with gr.TabItem("Record Voice"):
            with gr.Row():
                with gr.Column(scale=1):
                    record_input = gr.Audio(
                        label="Record Your Voice",
                        sources=["microphone"],
                        type="filepath",
                        elem_id="record_audio"
                    )
                    analyze_btn = gr.Button("Analyze Recording", variant="primary")
                    gr.Markdown("""
                    **Tips:**
                    - Speak clearly and at a normal pace
                    - Record at least 10-15 seconds for more accurate analysis
                    - Try different emotional tones to see how they're detected
                    """)
                with gr.Column(scale=2):
                    with gr.Tabs():
                        with gr.TabItem("Line Chart"):
                            rec_emotion_timeline = gr.Plot(label="Emotion Timeline",
                                                           elem_id="record_emotion_plot",
                                                           container=True)
                        with gr.TabItem("Area Chart"):
                            rec_emotion_area_chart = gr.Plot(label="Emotion Distribution",
                                                             elem_id="record_emotion_area_plot",
                                                             container=True)
            with gr.Row():
                with gr.Column():
                    rec_emotion_summary = gr.Markdown(label="Emotion Summary")
                with gr.Column():
                    rec_tone_analysis_output = gr.Markdown(label="Tone Analysis")
            with gr.Row():
                rec_emotion_results = gr.DataFrame(
                    headers=["Time Range", "Primary Emotion", "Confidence", "Description"],
                    label="Detailed Emotion Analysis"
                )

            analyze_btn.click(
                fn=process_audio,
                inputs=[record_input],
                outputs=[rec_emotion_timeline, rec_emotion_area_chart, rec_emotion_summary, rec_emotion_results, rec_tone_analysis_output]
            )

        # Tab 3: About & Help
        with gr.TabItem("About & Help"):
            gr.Markdown("""
            ## About This System

            This voice emotion & tone analysis system uses state-of-the-art deep learning models to detect emotions and analyze vocal characteristics. The system is built on the HuBERT (Hidden Unit BERT) architecture trained on speech emotion recognition tasks.

            ### How It Works

            1. **Audio Processing**: Your audio is processed in short segments (chunks) to capture emotion variations over time.
            2. **Emotion Classification**: Each segment is analyzed by a neural network to detect emotional patterns.
            3. **Tone Analysis**: Acoustic features like pitch, energy, and rhythm are analyzed to describe voice tone characteristics.

            ### Emotion Categories

            The system detects seven standard emotions:

            - **Angry**: Voice shows irritation, hostility, or aggression. Tone may be harsh, loud, or intense.
            - **Disgust**: Voice expresses revulsion or strong disapproval. Tone may sound repulsed or contemptuous.
            - **Fear**: Voice reveals anxiety, worry, or dread. Tone may be shaky, hesitant, or tense.
            - **Happy**: Voice conveys joy, pleasure, or positive emotions. Tone is often bright, energetic, and uplifted.
            - **Neutral**: Voice lacks strong emotional signals. Tone is even, moderate, and relatively flat.
            - **Sad**: Voice expresses sorrow, unhappiness, or melancholy. Tone may be quiet, heavy, or subdued.
            - **Surprise**: Voice reflects unexpected reactions. Tone may be higher pitched, quick, or energetic.

            ### Tips for Best Results

            - Use clear audio with minimal background noise
            - Speak naturally at a comfortable volume
            - Record at least 10-15 seconds of speech
            - For tone analysis, longer recordings (30+ seconds) provide more accurate results

            ### Privacy Notice

            All audio processing happens on your device. No audio recordings or analysis results are stored or transmitted to external servers.
            """)

            gr.Markdown("""
            ---
            ### System Information

            - **Model**: HuBERT Large for Speech Emotion Recognition
            - **Version**: 1.2.0
            - **Libraries**: PyTorch, Transformers, Librosa, Plotly

            This application demonstrates the use of AI for speech emotion recognition and acoustic analysis. For research and educational purposes only.
            """)

# Check if model can load before launching interface
print("Checking model availability...")
load_success = load_emotion_model()
if not load_success:
    print("Warning: Emotion model failed to load. Application may have limited functionality.")

# Launch the demo
if __name__ == "__main__":
    demo.launch()