Spaces:
Running
Running
File size: 5,697 Bytes
152d61c 5e4841e 152d61c 5e4841e befe307 7abe73c 5e4841e befe307 c39c802 0e85ac7 5e4841e 0e85ac7 befe307 05be7ae 0e85ac7 5e4841e 0e85ac7 152d61c 5e4841e 152d61c 0a4b920 5e4841e befe307 6597a2f 5e4841e 6597a2f befe307 5e4841e 6597a2f befe307 5e4841e 6597a2f befe307 5e4841e 6597a2f befe307 7abe73c 5e4841e 7abe73c 3970052 152d61c 5e4841e 152d61c 5e4841e befe307 05be7ae 5e4841e 0a4b920 5e4841e 0a4b920 e4cf4e2 152d61c 5e4841e 7abe73c 5e4841e befe307 5e4841e befe307 6597a2f 5e4841e 6597a2f 0a4b920 5e4841e befe307 0a4b920 5e4841e befe307 5e4841e 6597a2f 5e4841e 6597a2f 5e4841e 0a4b920 5e4841e 0a4b920 befe307 0a4b920 5e2d609 0a4b920 5e4841e 0a4b920 befe307 0a4b920 5e4841e befe307 5e2d609 befe307 5e4841e 6597a2f befe307 5e4841e 152d61c 5e4841e 152d61c befe307 7abe73c befe307 f551227 befe307 c39c802 152d61c befe307 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
##########################################
# Step 0: Essential imports
##########################################
import streamlit as st # Web interface
from transformers import ( # AI components
pipeline,
SpeechT5Processor,
SpeechT5ForTextToSpeech,
SpeechT5HifiGan,
AutoModelForCausalLM,
AutoTokenizer
)
from datasets import load_dataset # Voice data
import torch # Tensor operations
import soundfile as sf # Audio processing
##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(  # Streamlit requires page config before any other st.* call
    page_title="Just Comment",
    page_icon="๐ฌ",  # NOTE(review): icon string looks mojibake-encoded (double-decoded emoji?) — confirm intended glyph
    layout="centered"
)
##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware optimization.

    Returns:
        dict with keys:
            "emotion"        - text-classification pipeline
            "text_model"     - causal LM for response generation
            "text_tokenizer" - tokenizer paired with the causal LM
            "tts_processor"  - SpeechT5 text pre-processor
            "tts_model"      - SpeechT5 text-to-speech model
            "tts_vocoder"    - HiFi-GAN vocoder
            "speaker_emb"    - (1, 512)-style x-vector voice profile tensor
            "device"         - "cuda" or "cpu"

    Cached by st.cache_resource, so models load once per server process.
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    # BUG FIX: the original forced torch.float16 unconditionally; fp16 has
    # poor/absent kernel support on CPU, so only use it when CUDA is present.
    dtype = torch.float16 if use_cuda else torch.float32

    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )

    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"
    )

    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)

    # Preloaded voice profile (CMU ARCTIC x-vector, validation sample 7306).
    # BUG FIX: cast to the model dtype — a float32 embedding fed into an
    # fp16 SpeechT5 model raises a dtype mismatch at synthesis time.
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device=device, dtype=dtype)

    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }
##########################################
# User interface components
##########################################
def _show_interface():
    """Draw the page header and return the contents of the comment box."""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend๏ฝ")
    user_text = st.text_area(
        "๐ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input",
    )
    return user_text
##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
"""Rapid emotion detection with input limits"""
result = analyzer(text[:256], return_all_scores=True)[0] # Limit input length
emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
return max(
(e for e in result if e['label'].lower() in emotions),
key=lambda x: x['score'],
default={'label': 'neutral', 'score': 0}
)
def _build_prompt(text, emotion):
"""Template-based prompt engineering for response generation"""
return f"{emotion.capitalize()} detected: {text}\nRespond with a coherent and supportive response."
def _generate_response(text, models):
    """Generate an emotion-aware reply to the user's comment.

    Detects the dominant emotion, builds a prompt, samples up to 100 new
    tokens from the causal LM, and returns only the newly generated text
    (capped at 200 chars), with a fallback message when generation is empty.
    """
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])
    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])
    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    output = models["text_model"].generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # explicit mask avoids pad ambiguity
        max_new_tokens=100,  # Balanced length for response
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )
    # BUG FIX: generate() returns prompt + continuation; the original decoded
    # output[0] whole, so the "response" echoed the prompt. Decode only the
    # tokens generated after the prompt.
    new_tokens = output[0][inputs.input_ids.shape[-1]:]
    response = models["text_tokenizer"].decode(new_tokens, skip_special_tokens=True)
    return response.strip()[:200] or "Thank you for your feedback."
def _text_to_speech(text, models):
    """Synthesize *text* (truncated to 150 chars) into 'output.wav' at 16 kHz.

    Returns the path of the written WAV file.
    NOTE(review): the fixed filename is shared by all concurrent Streamlit
    sessions — consider a per-session temp file if multi-user use matters.
    """
    inputs = models["tts_processor"](text=text[:150], return_tensors="pt").to(models["device"])
    with torch.inference_mode():  # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(
            inputs["input_ids"], models["speaker_emb"]
        )
        audio = models["tts_vocoder"](spectrogram)
    # BUG FIX: the models run in fp16, and soundfile cannot write float16
    # buffers — cast to float32 on CPU before writing.
    waveform = audio.detach().cpu().to(torch.float32).numpy()
    sf.write("output.wav", waveform, 16000)
    return "output.wav"
##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller: wire up models, UI, text, and audio."""
    components = _load_components()
    comment = _show_interface()
    # Guard clause: nothing to do until the user submits some text.
    if not comment:
        return
    # Text generation
    with st.spinner("๐ Analyzing..."):
        reply = _generate_response(comment, components)
    # Display result
    st.subheader("๐ Response")
    st.markdown(f"```\n{reply}\n```")
    # Audio generation
    with st.spinner("๐ Synthesizing..."):
        wav_path = _text_to_speech(reply, components)
    st.audio(wav_path, format="audio/wav")
if __name__ == "__main__":
    main()  # Run the app only when executed as a script, not on import