Upload 7 files
- app.py +171 -0
- backend.py +92 -0
- emotion_test.py +39 -0
- packages.txt +1 -0
- requirements.txt +7 -0
- summarizer_test.py +17 -0
- whisper_test.py +4 -0
app.py
ADDED
@@ -0,0 +1,171 @@
import streamlit as st
import torch
import torchaudio
import whisper
from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

# Constants
MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Required sample rate for Wav2Vec2
AUDIO_SAVE_PATH = "temp_audio.wav"

emotion_labels = {
    0: "Neutral",
    1: "Calm",
    2: "Happy",
    3: "Sad",
    4: "Angry",
    5: "Fearful",
    6: "Disgust",
    7: "Surprised"
}

# Load models with caching
@st.cache_resource
def load_models():
    feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
    ser_model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)

    whisper_model = whisper.load_model("base")
    summarizer = pipeline("summarization", model="t5-base", framework="pt")

    return feature_extractor, ser_model, whisper_model, summarizer

feature_extractor, ser_model, whisper_model, summarizer = load_models()

# UI Layout
st.set_page_config(page_title="Speech Analysis App", layout="wide")
st.title("Speech Emotion Recognition & Summarization")
st.markdown("Upload an audio file to analyze emotions, transcribe speech, and get a concise summary.")

uploaded_file = st.file_uploader("Upload Audio File", type=["wav", "mp3", "ogg"])

if uploaded_file:
    # Save the upload to disk so torchaudio and Whisper can read it
    with open(AUDIO_SAVE_PATH, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.audio(AUDIO_SAVE_PATH, format="audio/wav")

    waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)

    # Convert stereo to mono
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    # Resample to 16 kHz if needed
    if sample_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
        waveform = resampler(waveform)

    y = waveform.squeeze().numpy()

    # Audio Visualization
    st.subheader("Audio Visualizations")

    # Waveform
    st.markdown("**Waveform**")
    fig_wave, ax_wave = plt.subplots(figsize=(8, 1.8))  # Reduced vertical height
    ax_wave.plot(y, linewidth=0.8)
    ax_wave.set_xlabel("Samples")
    ax_wave.set_ylabel("Amplitude")
    ax_wave.set_title("Waveform", fontsize=10)
    ax_wave.tick_params(labelsize=8)
    fig_wave.tight_layout(pad=0.3)
    st.pyplot(fig_wave)

    col1, col2 = st.columns([1, 1])
    with col1:
        st.markdown("**Spectrogram**")
        fig, ax = plt.subplots(figsize=(6, 3))
        D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
        img = librosa.display.specshow(D, sr=TARGET_SAMPLE_RATE, x_axis='time', y_axis='log', ax=ax)
        fig.colorbar(img, ax=ax, format="%+2.0f dB")
        fig.tight_layout(pad=0.5)
        st.pyplot(fig)

    with col2:
        st.markdown("**MFCCs**")
        fig2, ax2 = plt.subplots(figsize=(6, 3))
        mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SAMPLE_RATE, n_mfcc=13)
        img2 = librosa.display.specshow(mfccs, x_axis='time', ax=ax2)
        fig2.colorbar(img2, ax=ax2)
        fig2.tight_layout(pad=0.5)
        st.pyplot(fig2)

    # Emotion Prediction
    st.subheader("Emotion Recognition")
    inputs = feature_extractor(y, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
    with torch.no_grad():
        logits = ser_model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    predicted_emotion = emotion_labels[predicted_class]
    st.success(f"Predicted Emotion: {predicted_emotion}")

    # Transcription & Summarization
    st.subheader("Speech Transcription & Summary")
    transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
    st.info(f"Transcription: {transcription}")

    summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
    st.success(f"Summary: {summary}")

    # Playback speed
    st.subheader("Playback Options")
    speed = st.select_slider("Playback Speed", options=[0.5, 0.75, 1.0, 1.25, 1.5], value=1.0)
    st.markdown(f"Playback speed set to {speed}x (use an external player to preview the adjusted audio)")

    # Audio Info
    st.markdown("**Audio Metadata**")
    st.write(f"Duration: {round(len(y) / TARGET_SAMPLE_RATE, 2)} seconds")
    st.write(f"Sample Rate: {TARGET_SAMPLE_RATE} Hz")


# # Earlier Streamlit UI, kept commented out (superseded by the layout above)
# st.title("Speech Analysis: Emotion & Summarization")

# # Upload audio file
# uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])

# if uploaded_file is not None:
#     # Save the uploaded file
#     with open(AUDIO_SAVE_PATH, "wb") as f:
#         f.write(uploaded_file.getbuffer())

#     # Display the audio
#     st.audio(AUDIO_SAVE_PATH, format="audio/wav")

#     # **Speech Emotion Recognition**
#     st.subheader("Speech Emotion Recognition")
#     waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)

#     # Convert stereo to mono
#     if waveform.shape[0] > 1:
#         waveform = torch.mean(waveform, dim=0, keepdim=True)

#     # Resample if needed
#     if sample_rate != TARGET_SAMPLE_RATE:
#         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
#         waveform = resampler(waveform)

#     # Extract features
#     inputs = feature_extractor(waveform.squeeze(0), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")

#     # Get emotion prediction
#     with torch.no_grad():
#         logits = ser_model(**inputs).logits

#     predicted_class = torch.argmax(logits, dim=-1).item()
#     emotion = emotion_labels.get(predicted_class, "Unknown")
#     st.success(f"Predicted Emotion: {emotion} ({predicted_class})")

#     # **Speech Summarization**
#     st.subheader("Speech Summarization")
#     transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
#     st.info(f"Transcription: {transcription}")

#     # Generate summary
#     summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
#     st.success(f"The Summary: {summary}")
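Note that app.py points MODEL_PATH at a local Windows folder (D:/SER MiniProj/wav2vec2_model/), which a deployed Space cannot read, and maps eight labels by hand. A minimal sketch of one workaround follows; it is not part of the commit, SER_MODEL_PATH is a hypothetical environment variable, and the fallback checkpoint shown is simply the same superb/wav2vec2-base-superb-er that backend.py loads (its own label names are available from model.config.id2label rather than the hard-coded dict). Locally the app is started with "streamlit run app.py".

import os

# Sketch only: allow the checkpoint to be overridden at deploy time, falling back
# to a public Hub model (assumption) when the local fine-tuned folder is absent.
MODEL_PATH = os.getenv("SER_MODEL_PATH", "superb/wav2vec2-base-superb-er")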
backend.py
ADDED
@@ -0,0 +1,92 @@
from fastapi import FastAPI, File, UploadFile
import uvicorn
import openai
import torch
import torchaudio
import torchaudio.transforms as T
from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
import whisper
import os

app = FastAPI()

# Load Whisper model for transcription
whisper_model = whisper.load_model("small")

# Load speech emotion recognition model
ser_model_name = "superb/wav2vec2-base-superb-er"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ser_model_name)
ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)

# OpenAI API key (must be set in the environment before running)
openai.api_key = os.getenv("OPENAI_API_KEY")

@app.post("/process_audio/")
async def process_audio(file: UploadFile = File(...)):
    try:
        print(f"File received: {file.filename}")

        # Save the uploaded audio to disk
        audio_path = "temp_audio.wav"
        with open(audio_path, "wb") as f:
            f.write(await file.read())
        print("Audio saved successfully!")

        # TEST 1: Check if the file is corrupted
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            print(f"Audio loaded! Shape: {waveform.shape}, Sample Rate: {sample_rate}")
        except Exception as e:
            return {"error": f"Audio loading failed: {e}"}

        # TEST 2: Whisper transcription
        try:
            transcription = whisper_model.transcribe(audio_path)["text"]
            print(f"Whisper Transcription: {transcription}")
        except Exception as e:
            return {"error": f"Whisper failed: {e}"}

        # TEST 3: Emotion recognition
        try:
            # Convert stereo to mono and resample to 16 kHz
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sample_rate != 16000:
                resampler = T.Resample(sample_rate, 16000)
                waveform = resampler(waveform)

            inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = ser_model(**inputs).logits
            predicted_class = torch.argmax(logits, dim=-1).item()
            emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
            emotion_detected = emotions[predicted_class] if predicted_class < len(emotions) else "unknown"
            print(f"Emotion Detected: {emotion_detected}")
        except Exception as e:
            return {"error": f"Emotion recognition failed: {e}"}

        # TEST 4: OpenAI API summarization
        try:
            summary_response = openai.ChatCompletion.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "Summarize the following text."},
                    {"role": "user", "content": transcription}
                ]
            )
            summary = summary_response["choices"][0]["message"]["content"]
            print(f"OpenAI Summary: {summary}")
        except Exception as e:
            return {"error": f"OpenAI Summarization failed: {e}"}

        return {
            "transcription": transcription,
            "emotion": emotion_detected,
            "summary": summary
        }

    except Exception as e:
        print(f"Error in process_audio: {e}")
        return {"error": str(e)}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
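Once backend.py is running (python backend.py, or uvicorn backend:app --port 8000), the endpoint can be exercised with a small client. The sketch below is illustrative and not part of the commit; sample.wav is a placeholder filename, and the response is expected to carry transcription, emotion, and summary keys, or an error key on failure.

import requests

# Sketch only: post an audio file to the /process_audio/ endpoint defined above.
with open("sample.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/process_audio/",
        files={"file": ("sample.wav", f, "audio/wav")},
    )
print(response.json())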
emotion_test.py
ADDED
@@ -0,0 +1,39 @@
import torch
import torchaudio
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
TARGET_SAMPLE_RATE = 16000  # Model requires 16 kHz audio

# Load feature extractor and model
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)

print("Feature extractor and model loaded successfully!")

# Load an audio file
audio_file = "D:/SER MiniProj/temp_audio.wav"
waveform, sample_rate = torchaudio.load(audio_file)

# Convert to mono if needed
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample if the sample rate is not 16 kHz
if sample_rate != TARGET_SAMPLE_RATE:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
    waveform = resampler(waveform)
    sample_rate = TARGET_SAMPLE_RATE  # Update sample rate

# Process the audio for the model
inputs = feature_extractor(waveform.squeeze(0), sampling_rate=sample_rate, return_tensors="pt")

# Perform inference
with torch.no_grad():
    logits = model(**inputs).logits

# Get the predicted emotion
predicted_label = torch.argmax(logits, dim=-1).item()

# Print the output
print(f"Predicted Emotion Class: {predicted_label}")
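emotion_test.py prints only the numeric class index. If the fine-tuned checkpoint was saved with named labels, they can be recovered from the model config instead of a hand-written mapping; a short sketch, assuming the checkpoint's config.json carries an id2label table:

# Sketch only: translate the predicted index into a label name, when available.
label_name = model.config.id2label.get(predicted_label, "unknown")
print(f"Predicted Emotion: {label_name}")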
packages.txt
ADDED
@@ -0,0 +1 @@
ffmpeg
requirements.txt
ADDED
@@ -0,0 +1,7 @@
streamlit
librosa
transformers
torch
numpy
scipy
soundfile
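requirements.txt lists fewer packages than the committed code imports: app.py also needs torchaudio, openai-whisper, matplotlib, and altair, and backend.py needs fastapi, uvicorn, and openai. A fuller list might look like the sketch below (unpinned, since the commit specifies no versions):

streamlit
librosa
transformers
torch
torchaudio
numpy
scipy
soundfile
matplotlib
altair
openai-whisper
fastapi
uvicorn
openai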
summarizer_test.py
ADDED
@@ -0,0 +1,17 @@
import whisper
from transformers import pipeline

AUDIO_FILE = "D:/SER MiniProj/temp_audio.wav"

# Load Whisper model for transcription
whisper_model = whisper.load_model("base")  # Use "small", "medium", or "large" for better accuracy

# Transcribe the audio
transcription = whisper_model.transcribe(AUDIO_FILE)["text"]
print(f"Transcribed Text: {transcription}")

# Load summarization model
summarizer = pipeline("summarization", model="t5-base", framework="pt")

# Generate summary
summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
print(f"Summary: {summary}")
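t5-base accepts only a limited input length (the summarization pipeline truncates long inputs), so a long transcription is effectively summarized from its beginning only. A rough workaround is to summarize in chunks and join the pieces; the sketch below is illustrative, with chunk_chars chosen arbitrarily:

# Sketch only: split a long transcription into character chunks, summarize each,
# and join the partial summaries.
def summarize_long(text, chunk_chars=2000):
    chunks = [text[i:i + chunk_chars] for i in range(0, len(text), chunk_chars)]
    partial = [
        summarizer(c, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
        for c in chunks
    ]
    return " ".join(partial)

print(f"Summary: {summarize_long(transcription)}")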
whisper_test.py
ADDED
@@ -0,0 +1,4 @@
import whisper

model = whisper.load_model("small")
result = model.transcribe("temp_audio.wav")
print("Whisper Output:", result["text"])
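whisper_test.py is a smoke test for the transcription path; Whisper decodes the input through ffmpeg, which is what packages.txt installs. An illustrative variation that pins the language and runs in fp32 (avoiding the fp16 warning on CPU):

# Sketch only: force English decoding and fp32 inference on CPU.
result = model.transcribe("temp_audio.wav", language="en", fp16=False)
print("Whisper Output:", result["text"])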