joey1101 committed on
Commit d7ef86b · verified · 1 Parent(s): b1dada0

Update app.py

Files changed (1)
  1. app.py +165 -146
app.py CHANGED
@@ -1,191 +1,210 @@
  ##########################################
- # Step 0: Essential imports
  ##########################################
- import streamlit as st  # Web interface
- from transformers import (  # AI components: emotion analysis, text-to-speech, text generation
      pipeline,
      SpeechT5Processor,
      SpeechT5ForTextToSpeech,
      SpeechT5HifiGan,
      AutoModelForCausalLM,
      AutoTokenizer
- )
- from datasets import load_dataset  # To load the speaker embeddings dataset
- import torch  # For tensor operations
- import soundfile as sf  # For audio file writing
- import sentencepiece  # Required for SpeechT5Processor tokenization

  ##########################################
- # Initial configuration (MUST BE FIRST)
  ##########################################
- st.set_page_config(  # Set page configuration
      page_title="Just Comment",
      page_icon="💬",
-     layout="centered"
  )

  ##########################################
- # Optimized model loader with caching
  ##########################################
  @st.cache_resource(show_spinner=False)
- def _load_components():
-     """Load and cache all models with hardware optimization."""
-     device = "cuda" if torch.cuda.is_available() else "cpu"  # Detect the available device
-
-     # Load the emotion classifier (fast; input truncated)
-     emotion_pipe = pipeline(
-         "text-classification",
-         model="Thea231/jhartmann_emotion_finetuning",
-         device=device,
-         truncation=True
-     )
-
-     # Load the text generation components with conditional device mapping
-     text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
-     if device == "cuda":
-         text_model = AutoModelForCausalLM.from_pretrained(
              "Qwen/Qwen1.5-0.5B",
-             torch_dtype=torch.float16,
-             device_map="auto"
-         )
-     else:
-         text_model = AutoModelForCausalLM.from_pretrained(
              "Qwen/Qwen1.5-0.5B",
-             torch_dtype=torch.float16
-         ).to(device)
-
-     # Load the TTS components
-     tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-     tts_model = SpeechT5ForTextToSpeech.from_pretrained(
-         "microsoft/speecht5_tts",
-         torch_dtype=torch.float16
-     ).to(device)
-     tts_vocoder = SpeechT5HifiGan.from_pretrained(
-         "microsoft/speecht5_hifigan",
-         torch_dtype=torch.float16
-     ).to(device)
-
-     # Load a pre-trained speaker embedding (neutral voice)
-     speaker_emb = torch.tensor(
-         load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
-     ).unsqueeze(0).to(device)
-
-     return {
-         "emotion": emotion_pipe,
-         "text_model": text_model,
-         "text_tokenizer": text_tokenizer,
-         "tts_processor": tts_processor,
-         "tts_model": tts_model,
-         "tts_vocoder": tts_vocoder,
-         "speaker_emb": speaker_emb,
-         "device": device
      }

  ##########################################
- # User interface components
  ##########################################
- def _show_interface():
-     """Render the input interface."""
-     st.title("🚀 Just Comment")  # Title with a rocket emoji
-     st.markdown("### I'm listening to you, my friend~")  # Friendly subtitle
-     return st.text_area(  # Return the user's comment
          "📝 Enter your comment:",
-         placeholder="Share your thoughts...",
          height=150,
-         key="input"
      )

  ##########################################
- # Core processing functions
  ##########################################
- def _fast_emotion(text, analyzer):
-     """Quickly detect the dominant emotion using a truncated input."""
-     result = analyzer(text[:256], return_all_scores=True)[0]  # Analyze the first 256 characters
-     valid_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
-     return max(
-         (e for e in result if e['label'].lower() in valid_emotions),
-         key=lambda x: x['score'],
-         default={'label': 'neutral', 'score': 0}
-     )

- def _build_prompt(text, emotion):
-     """Build a continuous prompt (1–3 sentences) based on the detected emotion."""
-     templates = {
-         "sadness": "I sensed sadness in your comment: {text}. We are sorry and ready to support you.",
-         "joy": "Your comment shows joy: {text}. Thank you for your positive feedback; we are excited to serve you better.",
-         "love": "Your comment expresses love: {text}. We appreciate your heartfelt words and value our connection.",
-         "anger": "I understand your comment reflects anger: {text}. Please accept our sincere apologies as we address your concerns.",
-         "fear": "It seems you feel fear: {text}. Rest assured, your safety and satisfaction are our top priorities.",
-         "surprise": "Your comment exudes surprise: {text}. We are pleased by your experience and will strive to exceed your expectations.",
-         "neutral": "Thank you for your comment: {text}. We are committed to providing you with excellent service."
      }
-     # Use the template for the detected emotion (default to neutral)
-     return templates.get(emotion.lower(), templates["neutral"]).format(text=text[:200])

- def _generate_response(text, models):
-     """Generate a response by combining emotion detection and text generation."""
-     # Detect the emotion quickly
-     detected_emotion = _fast_emotion(text, models["emotion"])
-     # Build a prompt based on the detected emotion
-     prompt = _build_prompt(text, detected_emotion["label"])
-     print(f"Generated prompt: {prompt}")  # Debug print
-     # Tokenize and generate a response with the Qwen model
-     inputs = models["text_tokenizer"](
-         prompt,
-         return_tensors="pt",
-         max_length=100,
-         truncation=True
-     ).to(models["device"])
-     output = models["text_model"].generate(
          inputs.input_ids,
-         max_new_tokens=120,  # Cap new tokens for a roughly 50–200 token response
-         min_length=50,
          temperature=0.7,
          top_p=0.9,
          do_sample=True,
-         pad_token_id=models["text_tokenizer"].eos_token_id
      )
-     input_len = inputs.input_ids.shape[1]  # Length of the prompt in tokens
-     full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
-     # Keep only the generated portion (after a "Response:" marker, if present)
-     response = full_text.split("Response:")[-1].strip()
-     print(f"Generated response: {response}")  # Debug print
-     return response[:200]  # Truncate to roughly 200 characters

- def _text_to_speech(text, models):
-     """Convert the generated response to speech and return the audio file path."""
-     inputs = models["tts_processor"](
-         text=text[:150],  # Limit TTS input to 150 characters for speed
-         return_tensors="pt"
-     ).to(models["device"])
-     with torch.inference_mode():  # Accelerate inference
-         spectrogram = models["tts_model"].generate_speech(
-             inputs["input_ids"],
-             models["speaker_emb"]
-         )
-         audio = models["tts_vocoder"](spectrogram)
-     sf.write("output.wav", audio.cpu().numpy(), 16000)  # Save at a 16 kHz sample rate
-     return "output.wav"  # Path to the audio file

  ##########################################
- # Main application flow
  ##########################################
  def main():
-     """Primary execution controller."""
-     models = _load_components()  # Load all models and components
-     user_input = _show_interface()  # Render the interface and get the user's comment
-     if user_input:  # Proceed only if a comment is provided
-         with st.spinner("🔍 Generating response..."):
-             generated_response = _generate_response(user_input, models)
-         st.subheader("📄 Response")
-         st.markdown(
-             f"<p style='color:#3498DB; font-size:20px;'>{generated_response}</p>",
-             unsafe_allow_html=True
-         )  # Display the styled response
-         with st.spinner("🔊 Synthesizing audio..."):
-             audio_file = _text_to_speech(generated_response, models)
-         st.audio(audio_file, format="audio/wav", start_time=0)  # Embedded audio player
-         print(f"Final generated response: {generated_response}")  # Debug print

  if __name__ == "__main__":
-     main()
 
  ##########################################
+ # Step 0: Import required libraries
  ##########################################
+ import streamlit as st  # For the web interface
+ from transformers import (
      pipeline,
      SpeechT5Processor,
      SpeechT5ForTextToSpeech,
      SpeechT5HifiGan,
      AutoModelForCausalLM,
      AutoTokenizer
+ )  # AI model components
+ from datasets import load_dataset  # For voice embeddings
+ import torch  # Tensor computations
+ import soundfile as sf  # Audio file handling
+ import re  # Regular expressions for text processing
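Two things worth flagging in the new import list: the explicit `import sentencepiece` was dropped, which is safe only because the package must still be installed (SpeechT5Processor's tokenizer loads it at runtime), and `re` is imported but never used anywhere below. A quick sanity check, assuming dependencies are managed outside app.py:

    # Sanity check (not part of the commit): sentencepiece is still a runtime
    # dependency of the SpeechT5 tokenizer even without an explicit import.
    import importlib.util
    if importlib.util.find_spec("sentencepiece") is None:
        raise ImportError("sentencepiece is required by SpeechT5Processor")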
 
  ##########################################
+ # Initial configuration (MUST be first)
  ##########################################
+ st.set_page_config(
      page_title="Just Comment",
      page_icon="💬",
+     layout="centered",
+     initial_sidebar_state="collapsed"
  )

  ##########################################
+ # Global model loading with caching
  ##########################################
  @st.cache_resource(show_spinner=False)
+ def _load_models():
+     """Load and cache all ML models with optimized settings."""
+     return {
+         # Emotion classification pipeline
+         'emotion': pipeline(
+             "text-classification",
+             model="Thea231/jhartmann_emotion_finetuning",
+             truncation=True  # Truncate long inputs
+         ),
+
+         # Text generation components
+         'textgen_tokenizer': AutoTokenizer.from_pretrained(
              "Qwen/Qwen1.5-0.5B",
+             use_fast=True  # Enable fast tokenization
+         ),
+         'textgen_model': AutoModelForCausalLM.from_pretrained(
              "Qwen/Qwen1.5-0.5B",
+             torch_dtype=torch.float16  # Half precision for a smaller footprint
+         ),
+
+         # Text-to-speech components
+         'tts_processor': SpeechT5Processor.from_pretrained("microsoft/speecht5_tts"),
+         'tts_model': SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts"),
+         'tts_vocoder': SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan"),
+
+         # Preloaded speaker embeddings
+         'speaker_embeddings': torch.tensor(
+             load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
+         ).unsqueeze(0)
      }
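Note that the new loader drops the old version's device handling entirely: everything stays on the default (CPU) placement, and Qwen is loaded in float16, which is typically slow on CPU. A minimal device-aware sketch of the text-generation branch, assuming a GPU may or may not be present (an assumption, not what this commit ships):

    # Hypothetical device-aware loading: float32 on CPU avoids slow
    # half-precision ops; float16 on GPU keeps the memory savings.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B", use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    ).to(device)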
 
  ##########################################
+ # UI Components
  ##########################################
+ def _display_interface():
+     """Render user interface elements."""
+     st.title("Just Comment")
+     st.markdown("### I'm listening to you, my friend~")
+
+     return st.text_area(
          "📝 Enter your comment:",
+         placeholder="Type your message here...",
          height=150,
+         key="user_input"
      )

  ##########################################
+ # Core Processing Functions
  ##########################################
+ def _analyze_emotion(text, classifier):
+     """Identify the dominant emotion among the supported labels."""
+     results = classifier(text, return_all_scores=True)[0]
+     valid_emotions = {'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'}
+     filtered = [e for e in results if e['label'].lower() in valid_emotions]
+     return max(filtered, key=lambda x: x['score'])

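`_analyze_emotion` has no fallback: if none of the classifier's labels land in the whitelist, `max()` over the empty list raises `ValueError` (the removed `_fast_emotion` passed a `default` for exactly this case). A guarded sketch; the `_safe` name is hypothetical:

    # Hypothetical guarded variant: returns a neutral result instead of raising.
    def _analyze_emotion_safe(text, classifier):
        results = classifier(text, return_all_scores=True)[0]
        valid = {'sadness', 'joy', 'love', 'anger', 'fear', 'surprise'}
        candidates = [e for e in results if e['label'].lower() in valid]
        return max(candidates, key=lambda x: x['score'],
                   default={'label': 'neutral', 'score': 0.0})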
+ def _generate_prompt(text, emotion):
+     """Create structured prompts for all supported emotion types."""
+     prompt_templates = {
+         "sadness": (
+             "Sadness detected: {input}\n"
+             "Required response structure:\n"
+             "1. Empathetic acknowledgment\n2. Support offer\n3. Solution proposal\n"
+             "Response:"
+         ),
+         "joy": (
+             "Joy detected: {input}\n"
+             "Required response structure:\n"
+             "1. Enthusiastic thanks\n2. Positive reinforcement\n3. Future engagement\n"
+             "Response:"
+         ),
+         "love": (
+             "Affection detected: {input}\n"
+             "Required response structure:\n"
+             "1. Warm appreciation\n2. Community focus\n3. Exclusive benefit\n"
+             "Response:"
+         ),
+         "anger": (
+             "Anger detected: {input}\n"
+             "Required response structure:\n"
+             "1. Sincere apology\n2. Action steps\n3. Compensation\n"
+             "Response:"
+         ),
+         "fear": (
+             "Concern detected: {input}\n"
+             "Required response structure:\n"
+             "1. Reassurance\n2. Safety measures\n3. Support options\n"
+             "Response:"
+         ),
+         "surprise": (
+             "Surprise detected: {input}\n"
+             "Required response structure:\n"
+             "1. Acknowledge uniqueness\n2. Creative solution\n3. Follow-up\n"
+             "Response:"
+         )
      }
+     return prompt_templates.get(emotion.lower(), "").format(input=text)

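Because the lookup falls back to an empty string, any label outside these six yields an empty prompt and the model generates from nothing; the removed `_build_prompt` defaulted to a neutral template instead. For a covered label the rendered prompt looks like this (the input text is assumed):

    # Worked example for a "joy" classification:
    print(_generate_prompt("Great service, thank you!", "joy"))
    # Joy detected: Great service, thank you!
    # Required response structure:
    # 1. Enthusiastic thanks
    # 2. Positive reinforcement
    # 3. Future engagement
    # Response: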
 
+ def _process_response(raw_text):
+     """Clean and format the generated response."""
+     # Extract the text after the last "Response:" marker
+     processed = raw_text.split("Response:")[-1].strip()
+
+     # Drop a trailing incomplete sentence
+     if '.' in processed:
+         processed = processed.rsplit('.', 1)[0] + '.'
+
+     # Enforce a 50–200 character window (canned fallback if too short)
+     return processed[:200].strip() if len(processed) > 50 else "Thank you for your feedback. We value your input and will respond shortly."
+
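Note that the length check substitutes rather than trims: anything at 50 characters or fewer is replaced wholesale by the fallback sentence. A behavior sketch on an assumed raw generation:

    raw = ("Joy detected: Great service!\n"
           "Response: Thank you for the kind words! We are thrilled you "
           "enjoyed the experience. More soon")
    print(_process_response(raw))
    # -> "Thank you for the kind words! We are thrilled you enjoyed the experience."
    # The fragment after the last "." is dropped; a reply of 50 characters or
    # fewer would have been replaced by the fallback sentence.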
+ def _generate_text_response(input_text, models):
+     """Generate the text response with tuned decoding parameters."""
+     # Emotion analysis
+     emotion = _analyze_emotion(input_text, models['emotion'])
+
+     # Prompt engineering
+     prompt = _generate_prompt(input_text, emotion['label'])
+
+     # Text generation with optimized parameters
+     inputs = models['textgen_tokenizer'](prompt, return_tensors="pt").to('cpu')
+     outputs = models['textgen_model'].generate(
          inputs.input_ids,
+         max_new_tokens=100,  # Strict token limit
          temperature=0.7,
          top_p=0.9,
          do_sample=True,
+         pad_token_id=models['textgen_tokenizer'].eos_token_id
+     )
+
+     return _process_response(
+         models['textgen_tokenizer'].decode(outputs[0], skip_special_tokens=True)
      )
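Passing only `inputs.input_ids` makes transformers warn that no attention mask was supplied; since the tokenizer output already contains one, unpacking it is a small tweak (a sketch, not what the commit does):

    # Hypothetical tweak: forward the attention mask along with the input ids.
    outputs = models['textgen_model'].generate(
        **inputs,  # input_ids plus attention_mask
        max_new_tokens=100,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models['textgen_tokenizer'].eos_token_id,
    )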
 
+ def _generate_audio_response(text, models):
+     """Convert text to speech with performance optimizations."""
+     # Process the text input
+     inputs = models['tts_processor'](text=text, return_tensors="pt")
+
+     # Generate the spectrogram
+     spectrogram = models['tts_model'].generate_speech(
+         inputs["input_ids"],
+         models['speaker_embeddings']
+     )
+
+     # Generate the waveform without tracking gradients
+     with torch.no_grad():
+         waveform = models['tts_vocoder'](spectrogram)
+
+     # Save the audio file
+     sf.write("response.wav", waveform.numpy(), samplerate=16000)
+     return "response.wav"
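`generate_speech` can also take the vocoder directly and return a waveform in one step. A self-contained sketch of the same pipeline outside Streamlit (the sample text is assumed; SpeechT5 produces 16 kHz audio):

    import soundfile as sf
    import torch
    from datasets import load_dataset
    from transformers import (SpeechT5ForTextToSpeech, SpeechT5HifiGan,
                              SpeechT5Processor)

    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    xvector = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    speaker = torch.tensor(xvector).unsqueeze(0)

    inputs = processor(text="Thank you for your feedback.", return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model.generate_speech(inputs["input_ids"], speaker, vocoder=vocoder)
    sf.write("response.wav", waveform.numpy(), samplerate=16000)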
 
  ##########################################
+ # Main Application Flow
  ##########################################
  def main():
+     """Primary execution flow."""
+     # Load models once
+     ml_models = _load_models()
+
+     # Display the interface
+     user_input = _display_interface()
+
+     if user_input:
+         # Text generation stage
+         with st.spinner("🔍 Analyzing emotions and generating response..."):
+             text_response = _generate_text_response(user_input, ml_models)
+
+         # Display the result
+         st.subheader("📄 Generated Response")
+         st.markdown(f"```\n{text_response}\n```")  # Render the reply as a preformatted block
+
+         # Audio generation stage
+         with st.spinner("🔊 Converting to speech..."):
+             audio_file = _generate_audio_response(text_response, ml_models)
+         st.audio(audio_file, format="audio/wav")
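Rendering the reply through an f-string code fence breaks if the generated text itself contains backticks; `st.code` avoids that edge case (an alternative, not what the commit uses):

    st.code(text_response, language=None)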
 
  if __name__ == "__main__":
+     main()