Spaces:
Running
Running
File size: 6,695 Bytes
152d61c 5e4841e 152d61c 5e4841e 7abe73c 5e4841e c39c802 0e85ac7 5e4841e 0e85ac7 5e4841e 05be7ae 0e85ac7 5e4841e 0e85ac7 152d61c 5e4841e 152d61c 0a4b920 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 7abe73c 5e4841e 7abe73c 3970052 152d61c 5e4841e 152d61c 5e4841e 6597a2f 5e4841e 05be7ae 5e4841e 0a4b920 5e4841e 0a4b920 e4cf4e2 152d61c 5e4841e 7abe73c 5e4841e 6597a2f 5e4841e 6597a2f 0a4b920 5e4841e 0a4b920 5e4841e 0a4b920 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 0a4b920 5e4841e 0a4b920 5e4841e 0a4b920 5e2d609 0a4b920 5e4841e 0a4b920 5e4841e 6597a2f 5e4841e 0a4b920 5e4841e 6597a2f 5e4841e 5e2d609 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 152d61c 5e4841e 152d61c 5e4841e 5e2d609 5e4841e 7abe73c 6597a2f 5e4841e 0a4b920 5e4841e 7abe73c 6597a2f 5e4841e c39c802 152d61c 7abe73c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 |
##########################################
# Step 0: Essential imports
##########################################
import streamlit as st # Web interface
from transformers import ( # AI components
pipeline,
SpeechT5Processor,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan,
AutoModelForCausalLM,
AutoTokenizer
)
from datasets import load_dataset # Voice data
import torch # Tensor operations
import soundfile as sf # Audio processing
##########################################
# Initial configuration (MUST BE FIRST)
##########################################
# Streamlit demands set_page_config() before any other st.* call.
_PAGE_CONFIG = {
    "page_title": "Just Comment",
    "page_icon": "๐ฌ",
    "layout": "centered",
}
st.set_page_config(**_PAGE_CONFIG)
##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models, once per process.

    Returns:
        dict with keys: "emotion" (classification pipeline),
        "text_model"/"text_tokenizer" (Qwen causal LM),
        "tts_processor"/"tts_model"/"tts_vocoder" (SpeechT5 stack),
        "speaker_emb" (1 x 512 x-vector tensor), "device" ("cuda" or "cpu").
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 halves memory/latency on GPU, but many CPU kernels do not
    # support half precision (or run far slower) - use float32 on CPU.
    dtype = torch.float16 if device == "cuda" else torch.float32
    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )
    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"
    )
    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)
    # Preloaded voice profile - cast to the model dtype as well, otherwise
    # a float32 embedding fed into a float16 SpeechT5 raises a dtype mismatch.
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device=device, dtype=dtype)
    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }
##########################################
# User interface components
##########################################
def _show_interface():
    """Render the page header and return the comment typed by the user."""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend๏ฝ")
    user_text = st.text_area(
        "๐ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input",
    )
    return user_text
##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
"""Rapid emotion detection with input limits"""
result = analyzer(text[:256], return_all_scores=True)[0] # Limit input length
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
return max(
(e for e in result if e['label'].lower() in emotions),
key=lambda x: x['score'],
default={'label': 'neutral', 'score': 0}
)
def _build_prompt(text, emotion):
"""Template-based prompt engineering"""
templates = {
"sadness": f"Sadness detected: {{text}}\nRespond with: 1. Empathy 2. Support 3. Solution\nResponse:",
"joy": f"Joy detected: {{text}}\nRespond with: 1. Thanks 2. Praise 3. Engagement\nResponse:",
"love": f"Love detected: {{text}}\nRespond with: 1. Appreciation 2. Connection 3. Offer\nResponse:",
"anger": f"Anger detected: {{text}}\nRespond with: 1. Apology 2. Action 3. Compensation\nResponse:",
"fear": f"Fear detected: {{text}}\nRespond with: 1. Reassurance 2. Safety 3. Support\nResponse:",
"surprise": f"Surprise detected: {{text}}\nRespond with: 1. Acknowledgement 2. Solution 3. Follow-up\nResponse:",
"neutral": f"Feedback: {{text}}\nProfessional response:\n1. Acknowledgement\n2. Assistance\n3. Next steps\nResponse:"
}
return templates[emotion.lower()].format(text=text[:200]) # Input truncation
def _generate_response(text, models):
    """Generate an emotion-aware reply to *text*.

    Args:
        text: the raw user comment.
        models: component dict produced by _load_components().

    Returns:
        Reply string (at most 200 chars), with a canned fallback when
        generation yields nothing usable.
    """
    # 1) Detect the dominant emotion, 2) build a matching prompt.
    emotion = _fast_emotion(text, models["emotion"])
    prompt = _build_prompt(text, emotion["label"])
    # Tokenize the (truncated) prompt.
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    # inference_mode: skip autograd bookkeeping during sampling.
    with torch.inference_mode():
        output = models["text_model"].generate(
            inputs.input_ids,
            # Passing the mask explicitly avoids the transformers warning and
            # keeps sampling correct when pad_token_id == eos_token_id.
            attention_mask=inputs.attention_mask,
            max_new_tokens=120,  # Balanced length
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=models["text_tokenizer"].eos_token_id
        )
    # Keep only what the model wrote after the final "Response:" marker.
    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
    response = full_text.split("Response:")[-1].strip()
    # Trim to the last complete sentence when possible.
    if "." in response:
        response = response.rsplit(".", 1)[0] + "."
    return response[:200] or "Thank you for your feedback. We'll respond shortly."
def _text_to_speech(text, models):
    """Synthesize *text* to "output.wav" (16 kHz) and return the file path.

    Args:
        text: reply text; only the first 150 chars are voiced.
        models: component dict produced by _load_components().
    """
    inputs = models["tts_processor"](
        text=text[:150],  # Limit text length
        return_tensors="pt"
    ).to(models["device"])
    with torch.inference_mode():  # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(
            inputs["input_ids"],
            models["speaker_emb"]
        )
        audio = models["tts_vocoder"](spectrogram)
    # soundfile cannot write float16 buffers - when the vocoder runs in half
    # precision the raw .numpy() would fail, so cast to float32 first
    # (a no-op when the tensor is already float32).
    sf.write("output.wav", audio.to(torch.float32).cpu().numpy(), 16000)
    return "output.wav"
##########################################
# Main application flow
##########################################
def main():
    """Wire the UI to the generation and speech-synthesis pipelines."""
    components = _load_components()
    user_input = _show_interface()
    # Guard clause: nothing to do until the user submits a comment.
    if not user_input:
        return
    with st.spinner("๐ Analyzing..."):
        reply = _generate_response(user_input, components)
    st.subheader("๐ Response")
    st.markdown(f"```\n{reply}\n```")
    with st.spinner("๐ Synthesizing..."):
        wav_path = _text_to_speech(reply, components)
    st.audio(wav_path, format="audio/wav")
if __name__ == "__main__":
    main()