##########################################
# Step 0: Essential imports
##########################################
import streamlit as st                     # Web interface
from transformers import (                 # AI components
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    AutoModelForCausalLM,
    AutoTokenizer
)
from datasets import load_dataset          # Voice data
import torch                               # Tensor operations
import soundfile as sf                     # Audio processing

##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(                        # Must be the first Streamlit call
    page_title="Just Comment",
    page_icon="πŸ’¬",
    layout="centered"
)

##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware optimization."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 only pays off on GPU; CPU half-precision kernels are slow or missing
    dtype = torch.float16 if device == "cuda" else torch.float32

    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )

    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"
    )

    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)

    # Preloaded voice profile (x-vector speaker embedding)
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device, dtype)       # Match the TTS model's device and dtype

    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }

##########################################
# User interface components
##########################################
def _show_interface():
    """Render the input interface."""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend~")
    return st.text_area(                   # Input field
        "πŸ“ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input"
    )

##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
    """Rapid emotion detection with input limits."""
    result = analyzer(text[:256], return_all_scores=True)[0]  # Limit input length
    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    # Keep only the six target emotions; fall back to neutral if none match
    return max(
        (e for e in result if e['label'].lower() in emotions),
        key=lambda x: x['score'],
        default={'label': 'neutral', 'score': 0.0}
    )


def _build_prompt(text, emotion):
    """Template-based prompt engineering."""
    templates = {
        "sadness": "Sadness detected: {text}\nRespond with: 1. Empathy 2. Support 3. Solution\nResponse:",
        "joy": "Joy detected: {text}\nRespond with: 1. Thanks 2. Praise 3. Engagement\nResponse:",
        "love": "Love detected: {text}\nRespond with: 1. Appreciation 2. Connection 3. Offer\nResponse:",
        "anger": "Anger detected: {text}\nRespond with: 1. Apology 2. Action 3. Compensation\nResponse:",
        "fear": "Fear detected: {text}\nRespond with: 1. Reassurance 2. Safety 3. Support\nResponse:",
        "surprise": "Surprise detected: {text}\nRespond with: 1. Acknowledgement 2. Solution 3. Follow-up\nResponse:",
        "neutral": "Feedback: {text}\nProfessional response:\n1. Acknowledgement\n2. Assistance\n3. Next steps\nResponse:"
    }
    return templates[emotion.lower()].format(text=text[:200])  # Input truncation


def _generate_response(text, models):
    """Optimized text generation pipeline."""
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])

    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])

    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    output = models["text_model"].generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Avoid pad/attention warnings
        max_new_tokens=120,                    # Balanced length
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )

    # Process output: keep only the text after the final "Response:" marker
    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
    response = full_text.split("Response:")[-1].strip()

    # Trim to the last complete sentence
    if "." in response:
        response = response.rsplit(".", 1)[0] + "."
    return response[:200] or "Thank you for your feedback. We'll respond shortly."


def _text_to_speech(text, models):
    """High-speed audio synthesis."""
    inputs = models["tts_processor"](
        text=text[:150],                   # Limit text length
        return_tensors="pt"
    ).to(models["device"])

    with torch.inference_mode():           # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(
            inputs["input_ids"],
            models["speaker_emb"]
        )
        audio = models["tts_vocoder"](spectrogram)

    # Cast to float32 before writing: soundfile cannot serialize float16 arrays
    sf.write("output.wav", audio.cpu().float().numpy(), 16000)
    return "output.wav"

##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller."""
    # Load components
    components = _load_components()

    # Show interface
    user_input = _show_interface()

    if user_input:
        # Text generation
        with st.spinner("πŸ” Analyzing..."):
            response = _generate_response(user_input, components)

        # Display result
        st.subheader("πŸ“„ Response")
        st.code(response)                  # Safer than hand-built markdown fences

        # Audio generation
        with st.spinner("πŸ”Š Synthesizing..."):
            audio_path = _text_to_speech(response, components)
        st.audio(audio_path, format="audio/wav")


if __name__ == "__main__":
    main()
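
##########################################
# Usage (assumes the script is saved as
# app.py; the filename is not specified
# in the original)
##########################################
# Install dependencies, then launch via Streamlit:
#   pip install streamlit transformers datasets torch soundfile sentencepiece accelerate
#   streamlit run app.py
# Notes: accelerate backs device_map="auto" in the text-model loader,
# and sentencepiece is required by SpeechT5Processor's tokenizer.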