Upload 7 files
- app.py +171 -0
- backend.py +92 -0
- emotion_test.py +39 -0
- packages.txt +1 -0
- requirements.txt +7 -0
- summarizer_test.py +17 -0
- whisper_test.py +4 -0
app.py
ADDED
@@ -0,0 +1,171 @@
import streamlit as st
import torch
import torchaudio
import whisper
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

# Constants
MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Required sample rate for Wav2Vec2
AUDIO_SAVE_PATH = "temp_audio.wav"

emotion_labels = {
    0: "Neutral",
    1: "Calm",
    2: "Happy",
    3: "Sad",
    4: "Angry",
    5: "Fearful",
    6: "Disgust",
    7: "Surprised"
}

# Load models with caching
@st.cache_resource
def load_models():
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
    ser_model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)

    whisper_model = whisper.load_model("base")
    summarizer = pipeline("summarization", model="t5-base", framework="pt")

    return feature_extractor, ser_model, whisper_model, summarizer

feature_extractor, ser_model, whisper_model, summarizer = load_models()

# UI Layout
st.set_page_config(page_title="Speech Analysis App", layout="wide")
st.title("Speech Emotion Recognition & Summarization")
st.markdown("Upload an audio file to analyze emotions, transcribe speech, and get a concise summary.")

uploaded_file = st.file_uploader("Upload Audio File", type=["wav", "mp3", "ogg"])

if uploaded_file:
    # Save the upload to disk so torchaudio and Whisper can read it
    with open(AUDIO_SAVE_PATH, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.audio(AUDIO_SAVE_PATH, format="audio/wav")

    waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)

    # Convert stereo to mono
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample to 16 kHz if needed
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)

    y = waveform.squeeze().numpy()

    # Audio Visualization
    st.subheader("Audio Visualizations")

    # Waveform
    st.markdown("**Waveform**")
    fig_wave, ax_wave = plt.subplots(figsize=(8, 1.8))  # Reduced vertical height
    ax_wave.plot(y, linewidth=0.8)
    ax_wave.set_xlabel("Samples")
    ax_wave.set_ylabel("Amplitude")
    ax_wave.set_title("Waveform", fontsize=10)
    ax_wave.tick_params(labelsize=8)
    fig_wave.tight_layout(pad=0.3)
    st.pyplot(fig_wave)

    col1, col2 = st.columns([1, 1])
    with col1:
        st.markdown("**Spectrogram**")
        fig, ax = plt.subplots(figsize=(6, 3))
        D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
        img = librosa.display.specshow(D, sr=TARGET_SAMPLE_RATE, x_axis='time', y_axis='log', ax=ax)
        fig.colorbar(img, ax=ax, format="%+2.0f dB")
        fig.tight_layout(pad=0.5)
        st.pyplot(fig)

    with col2:
        st.markdown("**MFCCs**")
        fig2, ax2 = plt.subplots(figsize=(6, 3))
        mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SAMPLE_RATE, n_mfcc=13)
        img2 = librosa.display.specshow(mfccs, x_axis='time', ax=ax2)
        fig2.colorbar(img2, ax=ax2)
        fig2.tight_layout(pad=0.5)
        st.pyplot(fig2)

    # Emotion Prediction
    st.subheader("Emotion Recognition")
    inputs = feature_extractor(y, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        logits = ser_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    predicted_emotion = emotion_labels[predicted_class]
    st.success(f"Predicted Emotion: {predicted_emotion}")

    # Transcription & Summarization
    st.subheader("Speech Transcription & Summary")
    transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
    st.info(f"Transcription: {transcription}")

    summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
    st.success(f"Summary: {summary}")

    # Playback speed
    st.subheader("Playback Options")
    speed = st.select_slider("Playback Speed", options=[0.5, 0.75, 1.0, 1.25, 1.5], value=1.0)
    st.markdown(f"Playback speed set to {speed}x (use an external player to preview the adjusted audio)")

    # Audio Info
    st.markdown("**Audio Metadata**")
    st.write(f"Duration: {round(len(y) / TARGET_SAMPLE_RATE, 2)} seconds")
    st.write(f"Sample Rate: {TARGET_SAMPLE_RATE} Hz")


# # Earlier Streamlit UI, kept commented out (superseded by the layout above)
# st.title("Speech Analysis: Emotion & Summarization")

# # Upload audio file
# uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])

# if uploaded_file is not None:
#     # Save the uploaded file
#     with open(AUDIO_SAVE_PATH, "wb") as f:
#         f.write(uploaded_file.getbuffer())

#     # Display the audio
#     st.audio(AUDIO_SAVE_PATH, format="audio/wav")

#     # **Speech Emotion Recognition**
#     st.subheader("Speech Emotion Recognition")
#     waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)

#     # Convert stereo to mono
#     if waveform.shape[0] > 1:
#         waveform = torch.mean(waveform, dim=0, keepdim=True)

#     # Resample if needed
#     if sample_rate != TARGET_SAMPLE_RATE:
#         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
#         waveform = resampler(waveform)

#     # Extract features
#     inputs = feature_extractor(waveform.squeeze(0), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")

#     # Get emotion prediction
#     with torch.no_grad():
#         logits = ser_model(**inputs).logits

#     predicted_class = torch.argmax(logits, dim=-1).item()
#     emotion = emotion_labels.get(predicted_class, "Unknown")
#     st.success(f"Predicted Emotion: {emotion} ({predicted_class})")

#     # **Speech Summarization**
#     st.subheader("Speech Summarization")
#     transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
#     st.info(f"Transcription: {transcription}")

#     # Generate summary
#     summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
#     st.success(f"The Summary: {summary}")
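Note that app.py points MODEL_PATH at a local Windows folder (D:/SER MiniProj/wav2vec2_model/), which a deployed Space cannot read, and maps eight labels by hand. A minimal sketch of one workaround follows; it is not part of the commit, SER_MODEL_PATH is a hypothetical environment variable, and the fallback checkpoint shown is simply the same superb/wav2vec2-base-superb-er that backend.py loads (its own label names are available from model.config.id2label rather than the hard-coded dict). Locally the app is started with "streamlit run app.py".

import os

# Sketch only: allow the checkpoint to be overridden at deploy time, falling back
# to a public Hub model (assumption) when the local fine-tuned folder is absent.
MODEL_PATH = os.getenv("SER_MODEL_PATH", "superb/wav2vec2-base-superb-er")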
backend.py
ADDED
@@ -0,0 +1,92 @@
from fastapi import FastAPI, File, UploadFile
import uvicorn
import openai
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
import whisper
import os

app = FastAPI()

# Load Whisper model for transcription
whisper_model = whisper.load_model("small")

# Load speech emotion recognition model
ser_model_name = "superb/wav2vec2-base-superb-er"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ser_model_name)
ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)

# OpenAI API key (must be set in the environment before running)
openai.api_key = os.getenv("OPENAI_API_KEY")

@app.post("/process_audio/")
async def process_audio(file: UploadFile = File(...)):
    try:
        print(f"File received: {file.filename}")

        # Save the uploaded audio to disk
        audio_path = "temp_audio.wav"
        with open(audio_path, "wb") as f:
            f.write(await file.read())
        print("Audio saved successfully!")

        # TEST 1: Check if the file is corrupted
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            print(f"Audio loaded! Shape: {waveform.shape}, Sample Rate: {sample_rate}")
        except Exception as e:
            return {"error": f"Audio loading failed: {e}"}

        # TEST 2: Whisper transcription
        try:
            transcription = whisper_model.transcribe(audio_path)["text"]
            print(f"Whisper Transcription: {transcription}")
        except Exception as e:
            return {"error": f"Whisper failed: {e}"}

        # TEST 3: Emotion recognition
        try:
            # Convert stereo to mono and resample to 16 kHz
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sample_rate != 16000:
                resampler = T.Resample(sample_rate, 16000)
                waveform = resampler(waveform)

            inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = ser_model(**inputs).logits
            predicted_class = torch.argmax(logits, dim=-1).item()
            emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
            emotion_detected = emotions[predicted_class] if predicted_class < len(emotions) else "unknown"
            print(f"Emotion Detected: {emotion_detected}")
        except Exception as e:
            return {"error": f"Emotion recognition failed: {e}"}

        # TEST 4: OpenAI API summarization
        try:
            summary_response = openai.ChatCompletion.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "Summarize the following text."},
                    {"role": "user", "content": transcription}
                ]
            )
            summary = summary_response["choices"][0]["message"]["content"]
            print(f"OpenAI Summary: {summary}")
        except Exception as e:
            return {"error": f"OpenAI Summarization failed: {e}"}

        return {
            "transcription": transcription,
            "emotion": emotion_detected,
            "summary": summary
        }

    except Exception as e:
        print(f"Error in process_audio: {e}")
        return {"error": str(e)}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
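Once backend.py is running (python backend.py, or uvicorn backend:app --port 8000), the endpoint can be exercised with a small client. The sketch below is illustrative and not part of the commit; sample.wav is a placeholder filename, and the response is expected to carry transcription, emotion, and summary keys, or an error key on failure.

import requests

# Sketch only: post an audio file to the /process_audio/ endpoint defined above.
with open("sample.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/process_audio/",
        files={"file": ("sample.wav", f, "audio/wav")},
    )
print(response.json())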
emotion_test.py
ADDED
@@ -0,0 +1,39 @@
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Model requires 16 kHz audio

# Load feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)

print("Feature extractor and model loaded successfully!")

# Load an audio file
audio_file = "D:/SER MiniProj/temp_audio.wav"
waveform, sample_rate = torchaudio.load(audio_file)

# Convert to mono if needed
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample if the sample rate is not 16 kHz
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)
    sample_rate = TARGET_SAMPLE_RATE  # Update sample rate

# Process the audio for the model
inputs = feature_extractor(waveform.squeeze(0), sampling_rate=sample_rate, return_tensors="pt")

# Perform inference
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted emotion
predicted_label = torch.argmax(logits, dim=-1).item()

# Print the output
print(f"Predicted Emotion Class: {predicted_label}")
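emotion_test.py prints only the numeric class index. If the fine-tuned checkpoint was saved with named labels, they can be recovered from the model config instead of a hand-written mapping; a short sketch, assuming the checkpoint's config.json carries an id2label table:

# Sketch only: translate the predicted index into a label name, when available.
label_name = model.config.id2label.get(predicted_label, "unknown")
print(f"Predicted Emotion: {label_name}")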
packages.txt
ADDED
@@ -0,0 +1 @@
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit
librosa
transformers
torch
numpy
scipy
soundfile
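requirements.txt lists fewer packages than the committed code imports: app.py also needs torchaudio, openai-whisper, matplotlib, and altair, and backend.py needs fastapi, uvicorn, and openai. A fuller list might look like the sketch below (unpinned, since the commit specifies no versions):

streamlit
librosa
transformers
torch
torchaudio
numpy
scipy
soundfile
matplotlib
altair
openai-whisper
fastapi
uvicorn
openai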
summarizer_test.py
ADDED
@@ -0,0 +1,17 @@
import whisper
from transformers import pipeline

AUDIO_FILE = "D:/SER MiniProj/temp_audio.wav"

# Load Whisper model for transcription
whisper_model = whisper.load_model("base")  # Use "small", "medium", or "large" for better accuracy

# Transcribe the audio
transcription = whisper_model.transcribe(AUDIO_FILE)["text"]
print(f"Transcribed Text: {transcription}")

# Load summarization model
summarizer = pipeline("summarization", model="t5-base", framework="pt")

# Generate summary
summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
print(f"Summary: {summary}")
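t5-base accepts only a limited input length (the summarization pipeline truncates long inputs), so a long transcription is effectively summarized from its beginning only. A rough workaround is to summarize in chunks and join the pieces; the sketch below is illustrative, with chunk_chars chosen arbitrarily:

# Sketch only: split a long transcription into character chunks, summarize each,
# and join the partial summaries.
def summarize_long(text, chunk_chars=2000):
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partial = [
        summarizer(c, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
        for c in chunks
    ]
    return " ".join(partial)

print(f"Summary: {summarize_long(transcription)}")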
whisper_test.py
ADDED
@@ -0,0 +1,4 @@
import whisper

model = whisper.load_model("small")
result = model.transcribe("temp_audio.wav")
print("Whisper Output:", result["text"])
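whisper_test.py is a smoke test for the transcription path; Whisper decodes the input through ffmpeg, which is what packages.txt installs. An illustrative variation that pins the language and runs in fp32 (avoiding the fp16 warning on CPU):

# Sketch only: force English decoding and fp32 inference on CPU.
result = model.transcribe("temp_audio.wav", language="en", fp16=False)
print("Whisper Output:", result["text"])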