File size: 6,695 Bytes
152d61c
5e4841e
152d61c
5e4841e
 
7abe73c
 
 
 
 
 
5e4841e
 
 
 
c39c802
0e85ac7
5e4841e
0e85ac7
5e4841e
05be7ae
0e85ac7
5e4841e
0e85ac7
 
152d61c
5e4841e
152d61c
0a4b920
5e4841e
 
6597a2f
 
5e4841e
6597a2f
 
 
 
5e4841e
6597a2f
 
5e4841e
 
 
6597a2f
 
 
 
 
5e4841e
6597a2f
 
 
 
 
 
 
 
 
 
5e4841e
 
6597a2f
 
 
7abe73c
5e4841e
 
 
 
 
 
 
 
7abe73c
3970052
152d61c
5e4841e
152d61c
5e4841e
 
6597a2f
5e4841e
 
05be7ae
5e4841e
0a4b920
5e4841e
0a4b920
e4cf4e2
152d61c
5e4841e
7abe73c
5e4841e
 
 
 
 
 
6597a2f
5e4841e
6597a2f
0a4b920
5e4841e
 
 
 
 
 
 
 
 
 
0a4b920
5e4841e
0a4b920
5e4841e
 
 
 
6597a2f
5e4841e
 
6597a2f
5e4841e
 
6597a2f
 
5e4841e
6597a2f
5e4841e
0a4b920
5e4841e
0a4b920
5e4841e
0a4b920
5e2d609
0a4b920
5e4841e
0a4b920
 
5e4841e
 
 
6597a2f
5e4841e
 
 
 
0a4b920
5e4841e
 
 
6597a2f
 
5e4841e
5e2d609
5e4841e
 
6597a2f
5e4841e
6597a2f
5e4841e
6597a2f
5e4841e
 
152d61c
 
5e4841e
152d61c
 
5e4841e
 
 
5e2d609
5e4841e
 
7abe73c
 
6597a2f
5e4841e
 
0a4b920
5e4841e
 
 
7abe73c
6597a2f
5e4841e
 
 
c39c802
152d61c
7abe73c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
##########################################
# Step 0: Essential imports
##########################################
import streamlit as st  # Web interface
from transformers import (  # AI components
    pipeline,
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    AutoModelForCausalLM,
    AutoTokenizer
)
from datasets import load_dataset  # Voice data
import torch  # Tensor operations
import soundfile as sf  # Audio processing

##########################################
# Initial configuration (MUST BE FIRST)
##########################################
st.set_page_config(  # Set page config first
    page_title="Just Comment",
    page_icon="๐Ÿ’ฌ",
    layout="centered"
)

##########################################
# Optimized model loader with caching
##########################################
@st.cache_resource(show_spinner=False)
def _load_components():
    """Load and cache all models with hardware optimization"""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    # Emotion classifier (fast)
    emotion_pipe = pipeline(
        "text-classification",
        model="Thea231/jhartmann_emotion_finetuning",
        device=device,
        truncation=True
    )
    
    # Text generator (optimized)
    text_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B")
    text_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-0.5B",
        torch_dtype=torch.float16,
        device_map="auto"
    )
    
    # TTS system (accelerated)
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "microsoft/speecht5_tts",
        torch_dtype=torch.float16
    ).to(device)
    tts_vocoder = SpeechT5HifiGan.from_pretrained(
        "microsoft/speecht5_hifigan",
        torch_dtype=torch.float16
    ).to(device)
    
    # Preloaded voice profile
    speaker_emb = torch.tensor(
        load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")[7306]["xvector"]
    ).unsqueeze(0).to(device)
    
    return {
        "emotion": emotion_pipe,
        "text_model": text_model,
        "text_tokenizer": text_tokenizer,
        "tts_processor": tts_processor,
        "tts_model": tts_model,
        "tts_vocoder": tts_vocoder,
        "speaker_emb": speaker_emb,
        "device": device
    }

##########################################
# User interface components
##########################################
def _show_interface():
    """Render input interface"""
    st.title("Just Comment")
    st.markdown(f"### I'm listening to you, my friend๏ฝž")
    return st.text_area(  # Input field
        "๐Ÿ“ Enter your comment:",
        placeholder="Share your thoughts...",
        height=150,
        key="input"
    )

##########################################
# Core processing functions
##########################################
def _fast_emotion(text, analyzer):
    """Rapid emotion detection with input limits"""
    result = analyzer(text[:256], return_all_scores=True)[0]  # Limit input length
    emotions = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
    return max(
        (e for e in result if e['label'].lower() in emotions),
        key=lambda x: x['score'],
        default={'label': 'neutral', 'score': 0}
    )

def _build_prompt(text, emotion):
    """Template-based prompt engineering"""
    templates = {
        "sadness": f"Sadness detected: {{text}}\nRespond with: 1. Empathy 2. Support 3. Solution\nResponse:",
        "joy": f"Joy detected: {{text}}\nRespond with: 1. Thanks 2. Praise 3. Engagement\nResponse:",
        "love": f"Love detected: {{text}}\nRespond with: 1. Appreciation 2. Connection 3. Offer\nResponse:",
        "anger": f"Anger detected: {{text}}\nRespond with: 1. Apology 2. Action 3. Compensation\nResponse:",
        "fear": f"Fear detected: {{text}}\nRespond with: 1. Reassurance 2. Safety 3. Support\nResponse:",
        "surprise": f"Surprise detected: {{text}}\nRespond with: 1. Acknowledgement 2. Solution 3. Follow-up\nResponse:",
        "neutral": f"Feedback: {{text}}\nProfessional response:\n1. Acknowledgement\n2. Assistance\n3. Next steps\nResponse:"
    }
    return templates[emotion.lower()].format(text=text[:200])  # Input truncation

def _generate_response(text, models):
    """Optimized text generation pipeline"""
    # Emotion detection
    emotion = _fast_emotion(text, models["emotion"])
    
    # Prompt construction
    prompt = _build_prompt(text, emotion["label"])
    
    # Generate text
    inputs = models["text_tokenizer"](
        prompt,
        return_tensors="pt",
        max_length=100,
        truncation=True
    ).to(models["device"])
    
    output = models["text_model"].generate(
        inputs.input_ids,
        max_new_tokens=120,  # Balanced length
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=models["text_tokenizer"].eos_token_id
    )
    
    # Process output
    full_text = models["text_tokenizer"].decode(output[0], skip_special_tokens=True)
    response = full_text.split("Response:")[-1].strip()
    
    # Ensure completeness
    if "." in response:
        response = response.rsplit(".", 1)[0] + "."
    return response[:200] or "Thank you for your feedback. We'll respond shortly."

def _text_to_speech(text, models):
    """High-speed audio synthesis"""
    inputs = models["tts_processor"](
        text=text[:150],  # Limit text length
        return_tensors="pt"
    ).to(models["device"])
    
    with torch.inference_mode():  # Accelerated inference
        spectrogram = models["tts_model"].generate_speech(
            inputs["input_ids"],
            models["speaker_emb"]
        )
        audio = models["tts_vocoder"](spectrogram)
    
    sf.write("output.wav", audio.cpu().numpy(), 16000)
    return "output.wav"

##########################################
# Main application flow
##########################################
def main():
    """Primary execution controller"""
    # Load components
    components = _load_components()
    
    # Show interface
    user_input = _show_interface()
    
    if user_input:
        # Text generation
        with st.spinner("๐Ÿ” Analyzing..."):
            response = _generate_response(user_input, components)
        
        # Display result
        st.subheader(f"๐Ÿ“„ Response")
        st.markdown(f"```\n{response}\n```")  # f-string formatted
        
        # Audio generation
        with st.spinner("๐Ÿ”Š Synthesizing..."):
            audio_path = _text_to_speech(response, components)
            st.audio(audio_path, format="audio/wav")

if __name__ == "__main__":
    main()