Arjunadhithya committed
Commit 1f75494 · verified
1 Parent(s): 83560b9

Upload 7 files

Files changed (7)
  1. app.py +171 -0
  2. backend.py +92 -0
  3. emotion_test.py +39 -0
  4. packages.txt +1 -0
  5. requirements.txt +7 -0
  6. summarizer_test.py +17 -0
  7. whisper_test.py +4 -0
app.py ADDED
@@ -0,0 +1,171 @@
+ import streamlit as st
+ import torch
+ import torchaudio
+ import whisper
+ from transformers import pipeline, AutoFeatureExtractor, AutoModelForAudioClassification
+ import librosa
+ import librosa.display
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import altair as alt
+
+ # Constants
+ MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
+ TARGET_SAMPLE_RATE = 16000  # Required sample rate for Wav2Vec2
+ AUDIO_SAVE_PATH = "temp_audio.wav"
+
+ emotion_labels = {
+     0: "Neutral",
+     1: "Calm",
+     2: "Happy",
+     3: "Sad",
+     4: "Angry",
+     5: "Fearful",
+     6: "Disgust",
+     7: "Surprised"
+ }
+
+ # Load models with caching
+ @st.cache_resource
+ def load_models():
+     feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
+     ser_model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)
+
+     whisper_model = whisper.load_model("base")
+     summarizer = pipeline("summarization", model="t5-base", framework="pt")
+
+     return feature_extractor, ser_model, whisper_model, summarizer
+
+ feature_extractor, ser_model, whisper_model, summarizer = load_models()
+
+ # UI Layout
+ st.set_page_config(page_title="Speech Analysis App", layout="wide")
+ st.title("Speech Emotion Recognition & Summarization")
+ st.markdown("Upload an audio file to analyze emotions, transcribe speech, and get a concise summary.")
+
+ uploaded_file = st.file_uploader("Upload Audio File", type=["wav", "mp3", "ogg"])
+
+ if uploaded_file:
+     with open(AUDIO_SAVE_PATH, "wb") as f:
+         f.write(uploaded_file.getbuffer())
+
+     st.audio(AUDIO_SAVE_PATH, format="audio/wav")
+
+     waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)
+
+     if waveform.shape[0] > 1:
+         waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+     if sample_rate != TARGET_SAMPLE_RATE:
+         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
+         waveform = resampler(waveform)
+
+     y = waveform.squeeze().numpy()
+
+     # Audio Visualization
+     st.subheader("Audio Visualizations")
+
+     # Waveform
+     st.markdown("**Waveform**")
+     fig_wave, ax_wave = plt.subplots(figsize=(8, 1.8))  # Reduced vertical height
+     ax_wave.plot(y, linewidth=0.8)
+     ax_wave.set_xlabel("Samples")
+     ax_wave.set_ylabel("Amplitude")
+     ax_wave.set_title("Waveform", fontsize=10)
+     ax_wave.tick_params(labelsize=8)
+     fig_wave.tight_layout(pad=0.3)
+     st.pyplot(fig_wave)
+
+     col1, col2 = st.columns([1, 1])
+     with col1:
+         st.markdown("**Spectrogram**")
+         fig, ax = plt.subplots(figsize=(6, 3))
+         D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
+         img = librosa.display.specshow(D, sr=TARGET_SAMPLE_RATE, x_axis='time', y_axis='log', ax=ax)
+         fig.colorbar(img, ax=ax, format="%+2.0f dB")
+         fig.tight_layout(pad=0.5)
+         st.pyplot(fig)
+
+     with col2:
+         st.markdown("**MFCCs**")
+         fig2, ax2 = plt.subplots(figsize=(6, 3))
+         mfccs = librosa.feature.mfcc(y=y, sr=TARGET_SAMPLE_RATE, n_mfcc=13)
+         img2 = librosa.display.specshow(mfccs, x_axis='time', ax=ax2)
+         fig2.colorbar(img2, ax=ax2)
+         fig2.tight_layout(pad=0.5)
+         st.pyplot(fig2)
+
+     # Emotion Prediction
+     st.subheader("Emotion Recognition")
+     inputs = feature_extractor(y, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
+     with torch.no_grad():
+         logits = ser_model(**inputs).logits
+     predicted_class = torch.argmax(logits, dim=-1).item()
+     predicted_emotion = emotion_labels[predicted_class]
+     st.success(f"Predicted Emotion: {predicted_emotion}")
+
+     # Transcription & Summarization
+     st.subheader("Speech Transcription & Summary")
+     transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
+     st.info(f"Transcription: {transcription}")
+
+     summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
+     st.success(f"Summary: {summary}")
+
+     # Playback speed
+     st.subheader("Playback Options")
+     speed = st.select_slider("Playback Speed", options=[0.5, 0.75, 1.0, 1.25, 1.5], value=1.0)
+     st.markdown(f"Playback speed set to {speed}x (use an external player to preview the adjusted audio)")
+
+     # Audio Info
+     st.markdown("**Audio Metadata**")
+     st.write(f"Duration: {round(len(y) / TARGET_SAMPLE_RATE, 2)} seconds")
+     st.write(f"Sample Rate: {TARGET_SAMPLE_RATE} Hz")
+
+
+ # # Streamlit UI (earlier version, kept commented out)
+ # st.title("🎤 Speech Analysis: Emotion & Summarization")
+
+ # # Upload audio file
+ # uploaded_file = st.file_uploader("Upload an audio file", type=["wav", "mp3", "ogg"])
+
+ # if uploaded_file is not None:
+ #     # Save the uploaded file
+ #     with open(AUDIO_SAVE_PATH, "wb") as f:
+ #         f.write(uploaded_file.getbuffer())
+
+ #     # Display the audio
+ #     st.audio(AUDIO_SAVE_PATH, format="audio/wav")
+
+ #     # **Speech Emotion Recognition**
+ #     st.subheader("Speech Emotion Recognition")
+ #     waveform, sample_rate = torchaudio.load(AUDIO_SAVE_PATH)
+
+ #     # Convert stereo to mono
+ #     if waveform.shape[0] > 1:
+ #         waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+ #     # Resample if needed
+ #     if sample_rate != TARGET_SAMPLE_RATE:
+ #         resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
+ #         waveform = resampler(waveform)
+
+ #     # Extract features
+ #     inputs = feature_extractor(waveform.squeeze(0), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt")
+
+ #     # Get emotion prediction
+ #     with torch.no_grad():
+ #         logits = ser_model(**inputs).logits
+
+ #     predicted_class = torch.argmax(logits, dim=-1).item()
+ #     emotion = emotion_labels.get(predicted_class, "Unknown")
+ #     st.success(f"Predicted Emotion: {emotion} ({predicted_class})")
+
+ #     # **Speech Summarization**
+ #     st.subheader("Speech Summarization")
+ #     transcription = whisper_model.transcribe(AUDIO_SAVE_PATH)["text"]
+ #     st.info(f"Transcription: {transcription}")
+
+ #     # Generate summary
+ #     summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
+ #     st.success(f"The Summary: {summary}")
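Note: the "Playback Options" slider above only reports the chosen factor; the audio itself is never modified. A minimal sketch of how the adjusted clip could be rendered in-app, assuming librosa and soundfile (both already in requirements.txt) and a hypothetical temp_audio_stretched.wav output path:

# Hypothetical extension of the "Playback Options" section in app.py:
# write a time-stretched copy of the clip so st.audio can play it directly.
import soundfile as sf

if speed != 1.0:
    # rate > 1.0 speeds the clip up, rate < 1.0 slows it down; pitch is preserved
    y_stretched = librosa.effects.time_stretch(y, rate=float(speed))
    sf.write("temp_audio_stretched.wav", y_stretched, TARGET_SAMPLE_RATE)
    st.audio("temp_audio_stretched.wav", format="audio/wav")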
backend.py ADDED
@@ -0,0 +1,92 @@
+ from fastapi import FastAPI, File, UploadFile
+ import uvicorn
+ import openai
+ import torch
+ import torchaudio
+ import torchaudio.transforms as T
+ from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification
+ import whisper
+ import os
+
+ app = FastAPI()
+
+ # Load Whisper model for transcription
+ whisper_model = whisper.load_model("small")
+
+ # Load speech emotion recognition model
+ ser_model_name = "superb/wav2vec2-base-superb-er"
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ser_model_name)
+ ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)
+
+ # OpenAI API Key
+ openai.api_key = os.getenv("OPENAI_API_KEY")  # Ensure you set this in the terminal before running
+
+ @app.post("/process_audio/")
+ async def process_audio(file: UploadFile = File(...)):
+     try:
+         print(f"✅ File received: {file.filename}")
+
+         # Save audio
+         audio_path = "temp_audio.wav"
+         with open(audio_path, "wb") as f:
+             f.write(await file.read())
+         print("✅ Audio saved successfully!")
+
+         # 🟢 TEST 1: Check if the file is corrupted
+         try:
+             waveform, sample_rate = torchaudio.load(audio_path)
+             print(f"✅ Audio loaded! Shape: {waveform.shape}, Sample Rate: {sample_rate}")
+         except Exception as e:
+             return {"error": f"❌ Audio loading failed: {e}"}
+
+         # 🟢 TEST 2: Whisper Transcription
+         try:
+             transcription = whisper_model.transcribe(audio_path)["text"]
+             print(f"✅ Whisper Transcription: {transcription}")
+         except Exception as e:
+             return {"error": f"❌ Whisper failed: {e}"}
+
+         # 🟢 TEST 3: Emotion Recognition
+         try:
+             if waveform.shape[0] > 1:
+                 waveform = torch.mean(waveform, dim=0, keepdim=True)
+             if sample_rate != 16000:
+                 resampler = T.Resample(sample_rate, 16000)
+                 waveform = resampler(waveform)
+
+             inputs = feature_extractor(waveform.squeeze(0), sampling_rate=16000, return_tensors="pt", padding=True)  # pass a 1-D waveform, as in emotion_test.py
+             with torch.no_grad():
+                 logits = ser_model(**inputs).logits
+             predicted_class = torch.argmax(logits, dim=-1).item()
+             emotions = ["neutral", "happy", "sad", "angry", "fearful", "disgust", "surprised"]
+             emotion_detected = emotions[predicted_class] if predicted_class < len(emotions) else "unknown"
+             print(f"✅ Emotion Detected: {emotion_detected}")
+         except Exception as e:
+             return {"error": f"❌ Emotion recognition failed: {e}"}
+
+         # 🟢 TEST 4: OpenAI API Summarization
+         try:
+             summary_response = openai.ChatCompletion.create(
+                 model="gpt-4-turbo",
+                 messages=[
+                     {"role": "system", "content": "Summarize the following text."},
+                     {"role": "user", "content": transcription}
+                 ]
+             )
+             summary = summary_response["choices"][0]["message"]["content"]
+             print(f"✅ OpenAI Summary: {summary}")
+         except Exception as e:
+             return {"error": f"❌ OpenAI Summarization failed: {e}"}
+
+         return {
+             "transcription": transcription,
+             "emotion": emotion_detected,
+             "summary": summary
+         }
+
+     except Exception as e:
+         print(f"❌ Error in process_audio: {e}")
+         return {"error": str(e)}
+
+ if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=8000)
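For reference, a minimal client-side sketch for exercising the /process_audio/ endpoint defined in backend.py, assuming the server is running locally on port 8000 and that the requests package is installed (it is not in requirements.txt); the form field name must be "file" to match the UploadFile parameter:

import requests

# Hypothetical smoke test against the running FastAPI backend.
with open("temp_audio.wav", "rb") as f:
    response = requests.post(
        "http://localhost:8000/process_audio/",
        files={"file": ("temp_audio.wav", f, "audio/wav")},
    )
# On success the JSON contains "transcription", "emotion", and "summary";
# on failure it contains a single "error" field (see backend.py above).
print(response.json())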
emotion_test.py ADDED
@@ -0,0 +1,39 @@
+ import torch
+ import torchaudio
+ from transformers import AutoFeatureExtractor, AutoModelForAudioClassification
+
+ MODEL_PATH = "D:/SER MiniProj/wav2vec2_model/"
+ TARGET_SAMPLE_RATE = 16000  # Model requires 16 kHz audio
+
+ # Load feature extractor and model
+ feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_PATH)
+ model = AutoModelForAudioClassification.from_pretrained(MODEL_PATH)
+
+ print("Feature extractor and model loaded successfully!")
+
+ # Load an audio file
+ audio_file = "D:/SER MiniProj/temp_audio.wav"
+ waveform, sample_rate = torchaudio.load(audio_file)
+
+ # Convert to mono if needed
+ if waveform.shape[0] > 1:
+     waveform = torch.mean(waveform, dim=0, keepdim=True)
+
+ # Resample if the sample rate is not 16 kHz
+ if sample_rate != TARGET_SAMPLE_RATE:
+     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)
+     waveform = resampler(waveform)
+     sample_rate = TARGET_SAMPLE_RATE  # Update sample rate
+
+ # Process the audio for the model
+ inputs = feature_extractor(waveform.squeeze(0), sampling_rate=sample_rate, return_tensors="pt")
+
+ # Perform inference
+ with torch.no_grad():
+     logits = model(**inputs).logits
+
+ # Get the predicted emotion
+ predicted_label = torch.argmax(logits, dim=-1).item()
+
+ # Print the output
+ print(f"Predicted Emotion Class: {predicted_label}")
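emotion_test.py stops at the numeric class index. To mirror app.py's readable output, the same label mapping could be appended at the end of the script (a small sketch reusing the emotion_labels dictionary from app.py):

# Optional: translate the class index with the same mapping app.py uses.
emotion_labels = {
    0: "Neutral", 1: "Calm", 2: "Happy", 3: "Sad",
    4: "Angry", 5: "Fearful", 6: "Disgust", 7: "Surprised",
}
print(f"Predicted Emotion: {emotion_labels.get(predicted_label, 'Unknown')}")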
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
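ffmpeg is listed here because Whisper shells out to it to decode uploaded audio; without the system package, whisper_model.transcribe() fails even when all Python dependencies are installed.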
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ streamlit
+ librosa
+ transformers
+ torch
+ numpy
+ scipy
+ soundfile
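Note: the uploaded scripts import several packages that are not listed above (torchaudio, whisper, matplotlib, and altair in app.py; fastapi, uvicorn, and openai in backend.py). If the Space fails on missing imports, additions along these lines would likely be needed (PyPI names assumed; python-multipart is what FastAPI needs for file uploads):

torchaudio
openai-whisper
matplotlib
altair
fastapi
uvicorn
openai
python-multipart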
summarizer_test.py ADDED
@@ -0,0 +1,17 @@
+ import whisper
+ from transformers import pipeline
+
+ AUDIO_FILE = "D:/SER MiniProj/temp_audio.wav"
+ # Load Whisper model for transcription
+ whisper_model = whisper.load_model("base")  # You can use "small", "medium", or "large" for better accuracy
+
+ # Transcribe the audio
+ transcription = whisper_model.transcribe(AUDIO_FILE)["text"]
+ print(f"📝 Transcribed Text: {transcription}")
+
+ # Load summarization model
+ summarizer = pipeline("summarization", model="t5-base", framework="pt")
+
+ # Generate summary
+ summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
+ print(f"📌 Summary: {summary}")
whisper_test.py ADDED
@@ -0,0 +1,4 @@
+ import whisper
+ model = whisper.load_model("small")
+ result = model.transcribe("temp_audio.wav")
+ print("Whisper Output:", result["text"])