import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile

# Initialize the inference client (pass token=... if the model requires an
# authenticated Hugging Face token)
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# Initialize the ASR pipeline
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

INITIAL_MESSAGE = (
    "Hi! I'm your music buddy—tell me about your mood and the type of "
    "tunes you're in the mood for today!"
)


def speech_to_text(speech):
    """Converts speech to text using the ASR pipeline."""
    return asr(speech)["text"]


def classify_mood(input_string):
    """Classifies the mood based on keywords in the input string."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False


def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Generates the assistant's reply via the hosted LLM."""
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)

    # Stream the generation and accumulate the full response
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        output += response.token.text

    # Check if the output is a single mood word (confirmed by user)
    if output.strip().lower() in ["happy", "sad", "instrumental", "party"]:
        return f"Playing {output.strip().capitalize()} playlist for you!"
    elif output.strip().lower() == "unclear":
        return "I'm having trouble determining your mood. Could you tell me more explicitly how you're feeling?"
    else:
        return output.strip()


def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
You are a smart mood analyzer for a music recommendation system. Your goal is to determine the user's current mood and suggest an appropriate music playlist. Follow these instructions carefully:

1. Engage in a conversation to understand the user's mood. Don't assume their mood based on activities or preferences.
2. Classify the mood into one of four categories: Happy, Sad, Instrumental, or Party.
3. If the mood is unclear, ask relevant follow-up questions. Do not classify prematurely.
4. Before suggesting a playlist, always ask for confirmation. For example: "It sounds like you might be in a [mood] mood. Would you like me to play a [mood] playlist for you?"
5. Only respond with a single mood word (Happy, Sad, Instrumental, or Party) if the user explicitly confirms they want that type of playlist.
6. If you can't determine the mood after 5 exchanges, respond with "Unclear".
7. Stay on topic and focus on understanding the user's current emotional state.

Remember: Your primary goal is accurate mood classification and appropriate music suggestion. Always get confirmation before playing a playlist.
"""
    prompt = f"{fixed_prompt}\n"

    # Add conversation history
    for i, (user_prompt, bot_response) in enumerate(history):
        prompt += f"User: {user_prompt}\nAssistant: {bot_response}\n"
        if i == 3:  # This is the 4th exchange (0-indexed)
            prompt += "Note: This is the last exchange. If the mood is still unclear, respond with 'Unclear'.\n"

    prompt += f"User: {message}\nAssistant:"
    return prompt


async def text_to_speech(text):
    """Synthesizes speech for the given text and returns the audio file path."""
    communicate = edge_tts.Communicate(text)
    # edge-tts emits MP3 by default, so use a matching suffix; the handle is
    # closed before saving so the path is safe to reuse on all platforms
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path


def process_input(input_text, history):
    """Runs one chat turn: generates a reply and appends it to the history."""
    if not input_text:
        return history, history, "", None
    response = generate(input_text, history)
    history.append((input_text, response))
    return history, history, "", None


async def generate_audio(history):
    """Voices the most recent assistant response, if any."""
    if history and len(history) > 0:
        last_response = history[-1][1]
        audio_path = await text_to_speech(last_response)
        return audio_path
    return None


async def init_chat():
    """Seeds the chat with the greeting and its spoken version."""
    history = [("", INITIAL_MESSAGE)]
    audio_path = await text_to_speech(INITIAL_MESSAGE)
    return history, history, audio_path


# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here or use the microphone to speak...")
    audio_output = gr.Audio(label="AI Response", autoplay=True)
    state = gr.State([])

    with gr.Row():
        submit = gr.Button("Send")
        voice_input = gr.Audio(sources=["microphone"], type="filepath", label="Voice Input")

    # Initialize chat with greeting
    demo.load(init_chat, outputs=[state, chatbot, audio_output])

    # Handle text input
    msg.submit(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )
    submit.click(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

    # Handle voice input: transcribe, then run the same chat pipeline
    voice_input.stop_recording(
        lambda x: speech_to_text(x) if x else "", inputs=[voice_input], outputs=[msg]
    ).then(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

if __name__ == "__main__":
    demo.launch(share=True)
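
# Dependency sketch (assumed, not pinned by this script): the app relies on
# gradio, huggingface_hub, transformers (with torch as the ASR backend), and
# edge-tts, e.g.:
#   pip install gradio huggingface_hub transformers torch edge-tts
# Gated or rate-limited models may also need a Hugging Face token, supplied
# via InferenceClient(..., token=...) or the HF_TOKEN environment variable.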