##########################################
# Step 0: Essential imports
##########################################
import streamlit as st                     # Web interface
from transformers import (                 # AI components
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    AutoModelForCausalLM,
    AutoTokenizer
)
from datasets import load_dataset          # Voice data
import torch                               # Tensor operations
import soundfile as sf                     # Audio processing

##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(                        # Must be the first Streamlit call
    page_title="Just Comment",
    page_icon="πŸ’¬",
    layout="centered"
)

##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware optimization."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # float16 only pays off on GPU; CPU half-precision kernels are slow or missing
    dtype = torch.float16 if device == "cuda" else torch.float32

    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )

    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=dtype,
        device_map="auto"
    )

    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=dtype
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=dtype
    ).to(device)

    # Preloaded voice profile (x-vector speaker embedding)
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device, dtype)       # Match the TTS model's device and dtype

    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }

##########################################
# User interface components
##########################################
def _show_interface():
    """Render the input interface."""
    st.title("Just Comment")
    st.markdown("### I'm listening to you, my friend~")
    return st.text_area(                   # Input field
        "πŸ“ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input"
    )

##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
    """Rapid emotion detection with input limits."""
    result = analyzer(text[:256], return_all_scores=True)[0]  # Limit input length
    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    # Keep only the six target emotions; fall back to neutral if none match
    return max(
        (e for e in result if e['label'].lower() in emotions),
        key=lambda x: x['score'],
        default={'label': 'neutral', 'score': 0.0}
    )


def _build_prompt(text, emotion):
    """Template-based prompt engineering."""
    templates = {
        "sadness": "Sadness detected: {text}\nRespond with: 1. Empathy 2. Support 3. Solution\nResponse:",
        "joy": "Joy detected: {text}\nRespond with: 1. Thanks 2. Praise 3. Engagement\nResponse:",
        "love": "Love detected: {text}\nRespond with: 1. Appreciation 2. Connection 3. Offer\nResponse:",
        "anger": "Anger detected: {text}\nRespond with: 1. Apology 2. Action 3. Compensation\nResponse:",
        "fear": "Fear detected: {text}\nRespond with: 1. Reassurance 2. Safety 3. Support\nResponse:",
        "surprise": "Surprise detected: {text}\nRespond with: 1. Acknowledgement 2. Solution 3. Follow-up\nResponse:",
        "neutral": "Feedback: {text}\nProfessional response:\n1. Acknowledgement\n2. Assistance\n3. Next steps\nResponse:"
    }
    return templates[emotion.lower()].format(text=text[:200])  # Input truncation


def _generate_response(text, models):
    """Optimized text generation pipeline."""
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])

    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])

    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    output = models["text_model"].generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Avoid pad/attention warnings
        max_new_tokens=120,                    # Balanced length
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )

    # Process output: keep only the text after the final "Response:" marker
    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
    response = full_text.split("Response:")[-1].strip()

    # Trim to the last complete sentence
    if "." in response:
        response = response.rsplit(".", 1)[0] + "."
    return response[:200] or "Thank you for your feedback. We'll respond shortly."


def _text_to_speech(text, models):
    """High-speed audio synthesis."""
    inputs = models["tts_processor"](
        text=text[:150],                   # Limit text length
        return_tensors="pt"
    ).to(models["device"])

    with torch.inference_mode():           # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(
            inputs["input_ids"],
            models["speaker_emb"]
        )
        audio = models["tts_vocoder"](spectrogram)

    # Cast to float32 before writing: soundfile cannot serialize float16 arrays
    sf.write("output.wav", audio.cpu().float().numpy(), 16000)
    return "output.wav"

##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller."""
    # Load components
    components = _load_components()

    # Show interface
    user_input = _show_interface()

    if user_input:
        # Text generation
        with st.spinner("πŸ” Analyzing..."):
            response = _generate_response(user_input, components)

        # Display result
        st.subheader("πŸ“„ Response")
        st.code(response)                  # Safer than hand-built markdown fences

        # Audio generation
        with st.spinner("πŸ”Š Synthesizing..."):
            audio_path = _text_to_speech(response, components)
        st.audio(audio_path, format="audio/wav")


if __name__ == "__main__":
    main()
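
##########################################
# Usage (assumes the script is saved as
# app.py; the filename is not specified
# in the original)
##########################################
# Install dependencies, then launch via Streamlit:
#   pip install streamlit transformers datasets torch soundfile sentencepiece accelerate
#   streamlit run app.py
# Notes: accelerate backs device_map="auto" in the text-model loader,
# and sentencepiece is required by SpeechT5Processor's tokenizer.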