##########################################
# Step 0: Essential imports
##########################################
import streamlit as st # Web interface
from transformers import ( # AI components: emotion analysis, text-to-speech, text generation
pipeline,
SpeechT5Processor,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan,
AutoModelForCausalLM,
AutoTokenizer
)
from datasets import load_dataset # To load speaker embeddings dataset
import torch # For tensor operations
import soundfile as sf # For audio file writing
import sentencepiece # Only needs to be installed for SpeechT5Processor tokenization; imported here to fail fast if missing
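# The imports above assume these packages are installed (one illustrative way):
#   pip install streamlit transformers datasets torch soundfile sentencepiece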
##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config( # Set page configuration
page_title="Just Comment",
page_icon="π¬",
layout="centered"
)
##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
"""Load and cache all models with hardware optimization."""
device = "cuda" if torch.cuda.is_available() else "cpu" # Detect available device
# Load emotion classifier (fast; input truncated)
emotion_pipe = pipeline(
"text-classification",
model="Thea231/jhartmann_emotion_finetuning",
device=device,
truncation=True
)
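    # The classifier above is a community fine-tune; it is assumed to emit the six
    # dair-ai/emotion labels (sadness, joy, love, anger, fear, surprise) that
    # _fast_emotion() filters on below.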
# Load text generation components with conditional device mapping
text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
if device == "cuda":
        text_model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen1.5-0.5B",
            torch_dtype=torch.float16, # Half precision is safe on GPU
            device_map="auto"
        )
    else:
        text_model = AutoModelForCausalLM.from_pretrained(
            "Qwen/Qwen1.5-0.5B" # Keep default float32 weights; half precision is unreliable on CPU
        ).to(device)
    # Load TTS components (half precision on GPU only; CPU kernels lack broad float16 support)
    tts_dtype = torch.float16 if device == "cuda" else torch.float32
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=tts_dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=tts_dtype
    ).to(device)
    # Load a pre-trained speaker embedding (neutral voice); its dtype must match the TTS model
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device, dtype=tts_dtype)
return {
"emotion": emotion_pipe,
"text_model": text_model,
"text_tokenizer": text_tokenizer,
"tts_processor": tts_processor,
"tts_model": tts_model,
"tts_vocoder": tts_vocoder,
"speaker_emb": speaker_emb,
"device": device
}
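# Illustrative use of the cached loader (the score shown is made up):
#   models = _load_components()
#   models["emotion"]("I love this!")  # -> [{'label': 'joy', 'score': 0.99}]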
##########################################
# User interface components
##########################################
def _show_interface():
"""Render input interface."""
st.title("π Just Comment") # Display title with rocket emoji
st.markdown("### I'm listening to you, my friendο½") # Display friendly subtitle
return st.text_area( # Return user comment input
"π Enter your comment:",
placeholder="Share your thoughts...",
height=150,
key="input"
)
##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
"""Rapidly detect dominant emotion using a truncated input."""
result = analyzer(text[:256], return_all_scores=True)[0] # Analyze first 256 characters
valid_emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
return max(
(e for e in result if e['label'].lower() in valid_emotions),
key=lambda x: x['score'],
default={'label': 'neutral', 'score': 0}
)
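# Example (the score is illustrative, not a real model output):
#   _fast_emotion("The package arrived broken!", models["emotion"])
#   # -> {'label': 'anger', 'score': 0.92}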
def _build_prompt(text, emotion):
"""Build a continuous prompt (1β3 sentences) based on detected emotion."""
templates = {
"sadness": "I sensed sadness in your comment: {text}. We are sorry and ready to support you.",
"joy": "Your comment shows joy: {text}. Thank you for your positive feedback; we are excited to serve you better.",
"love": "Your comment expresses love: {text}. We appreciate your heartfelt words and value our connection.",
"anger": "I understand your comment reflects anger: {text}. Please accept our sincere apologies as we address your concerns.",
"fear": "It seems you feel fear: {text}. Rest assured, your safety and satisfaction are our top priorities.",
"surprise": "Your comment exudes surprise: {text}. We are pleased by your experience and will strive to exceed your expectations.",
"neutral": "Thank you for your comment: {text}. We are committed to providing you with excellent service."
}
# Use the template corresponding to the detected emotion (default to neutral)
return templates.get(emotion.lower(), templates["neutral"]).format(text=text[:200])
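# For instance, a comment of "Great service!" detected as joy yields the prompt:
#   "Your comment shows joy: Great service!. Thank you for your positive
#   feedback; we are excited to serve you better."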
def _generate_response(text, models):
"""Generate a response by combining emotion detection and text generation."""
# Detect emotion quickly
detected_emotion = _fast_emotion(text, models["emotion"])
# Build prompt based on the detected emotion in a continuous format
prompt = _build_prompt(text, detected_emotion["label"])
print(f"Generated prompt: {prompt}") # Debug print with f-string
# Tokenize and generate response using the Qwen model
inputs = models["text_tokenizer"](
prompt,
return_tensors="pt",
max_length=100,
truncation=True
).to(models["device"])
output = models["text_model"].generate(
inputs.input_ids,
max_new_tokens=120, # Constrain length for 50-200 tokens response
min_length=50,
temperature=0.7,
top_p=0.9,
do_sample=True,
pad_token_id=models["text_tokenizer"].eos_token_id
)
input_len = inputs.input_ids.shape[1] # Length of prompt tokens
full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
# Extract only the generated response portion (after any "Response:" marker if present)
response = full_text.split("Response:")[-1].strip()
print(f"Generated response: {response}") # Debug print with f-string
return response[:200] # Return response truncated to around 200 characters as an approximation
def _text_to_speech(text, models):
"""Convert the generated response text to speech and return the audio file path."""
inputs = models["tts_processor"](
text=text[:150], # Limit TTS input to 150 characters for speed
return_tensors="pt"
).to(models["device"])
with torch.inference_mode(): # Accelerate inference
spectrogram = models["tts_model"].generate_speech(
inputs["input_ids"],
models["speaker_emb"]
)
audio = models["tts_vocoder"](spectrogram)
sf.write("output.wav", audio.cpu().numpy(), 16000) # Save the audio file with 16kHz sample rate
return "output.wav" # Return the path to the audio file
##########################################
# Main application flow
##########################################
def main():
"""Primary execution controller."""
models = _load_components() # Load all necessary models and components
user_input = _show_interface() # Render the input interface and get user comment
if user_input: # Proceed only if a comment is provided
with st.spinner("π Generating response..."):
generated_response = _generate_response(user_input, models)
st.subheader("π Response")
st.markdown(
f"<p style='color:#3498DB; font-size:20px;'>{generated_response}</p>",
unsafe_allow_html=True
) # Display the generated response in styled format
with st.spinner("π Synthesizing audio..."):
audio_file = _text_to_speech(generated_response, models)
        st.audio(audio_file, format="audio/wav", start_time=0) # Embed an audio player for the reply (playback starts on user click)
        print(f"Final generated response: {generated_response}") # Debug: log the final reply
if __name__ == "__main__":
main() # Call the main function
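# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py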