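"""Mood-Based Music Recommender with Continuous Voice Chat.

A Gradio app that chats with the user to classify their mood (Happy, Sad,
Instrumental, or Party) using Mistral-7B-Instruct, transcribes voice input
with wav2vec2, and speaks replies aloud with edge-tts.
"""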
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio
# Initialize the inference client; with no explicit token, huggingface_hub falls
# back to the locally saved Hugging Face token (e.g. from `huggingface-cli login`)
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
# Initialize the ASR pipeline
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")
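# Note: wav2vec2-base-960h is an English-only model trained on 16 kHz LibriSpeech
# audio; the transformers ASR pipeline resamples file inputs to that rate as needed.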
INITIAL_MESSAGE = "Hi! I'm your music buddy—tell me how you're feeling and what kind of tunes you're in the mood for today!"
def speech_to_text(speech):
    """Converts speech to text using the ASR pipeline."""
    return asr(speech)["text"]
def classify_mood(input_string):
    """Classifies the mood based on keywords in the input string."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False
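# classify_mood is a simple keyword-matching fallback; the main flow relies on the
# LLM-based classification in generate() below.
# Example: classify_mood("Play something happy") -> ("happy", True)
#          classify_mood("Surprise me")          -> (None, False)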
def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Generates a response from the LLM, streaming tokens and post-processing the result."""
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        output += response.token.text

    # Check if the full output is a single mood word (confirmed by user)
    if output.strip().lower() in ["happy", "sad", "instrumental", "party"]:
        return f"Playing {output.strip().capitalize()} playlist for you!"
    elif output.strip().lower() == "unclear":
        return "I'm having trouble determining your mood. Could you tell me more explicitly how you're feeling?"
    else:
        return output.strip()
def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
You are a smart mood analyzer for a music recommendation system. Your goal is to determine the user's current mood and suggest an appropriate music playlist. Follow these instructions carefully:
1. Engage in a conversation to understand the user's mood. Don't assume their mood based on activities or preferences.
2. Classify the mood into one of four categories: Happy, Sad, Instrumental, or Party.
3. If the mood is unclear, ask relevant follow-up questions. Do not classify prematurely.
4. Before suggesting a playlist, always ask for confirmation. For example: "It sounds like you might be in a [mood] mood. Would you like me to play a [mood] playlist for you?"
5. Only respond with a single mood word (Happy, Sad, Instrumental, or Party) if the user explicitly confirms they want that type of playlist.
6. If you can't determine the mood after 5 exchanges, respond with "Unclear".
7. Stay on topic and focus on understanding the user's current emotional state.
Remember: Your primary goal is accurate mood classification and appropriate music suggestion. Always get confirmation before playing a playlist.
"""
    prompt = f"{fixed_prompt}\n"
    # Add conversation history
    for i, (user_prompt, bot_response) in enumerate(history):
        prompt += f"User: {user_prompt}\nAssistant: {bot_response}\n"
        if i == 3:  # This is the 4th exchange (0-indexed)
            prompt += "Note: This is the last exchange. If the mood is still unclear, respond with 'Unclear'.\n"
    prompt += f"User: {message}\nAssistant:"
    return prompt
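# The formatted prompt has the shape:
#   <fixed instructions>
#   User: <turn 1>
#   Assistant: <reply 1>
#   ...
#   User: <current message>
#   Assistant:
# The model completes the text after the trailing "Assistant:" marker.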
async def text_to_speech(text):
    """Synthesizes speech for the given text with edge-tts and returns a temp-file path."""
    communicate = edge_tts.Communicate(text)
    # edge-tts produces MP3 audio by default, so use an .mp3 suffix
    # (the original .wav suffix mislabeled the file contents)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
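# With no voice argument, edge_tts.Communicate uses its default English voice. A
# different voice could be passed, e.g. edge_tts.Communicate(text, voice="en-GB-SoniaNeural");
# that voice name is illustrative, run `edge-tts --list-voices` to see what is available.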
def process_input(input_text, history):
    """Handles a user turn: generates a reply and appends it to the chat history."""
    if not input_text:
        return history, history, "", None
    response = generate(input_text, history)
    history.append((input_text, response))
    return history, history, "", None
async def generate_audio(history):
    """Synthesizes audio for the most recent assistant reply, if any."""
    if history:
        last_response = history[-1][1]
        audio_path = await text_to_speech(last_response)
        return audio_path
    return None
async def init_chat():
    """Seeds the chat with the greeting message and its synthesized audio."""
    history = [("", INITIAL_MESSAGE)]
    audio_path = await text_to_speech(INITIAL_MESSAGE)
    return history, history, audio_path
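# init_chat returns (history, history, audio_path) so that demo.load below can fill
# the session state, the visible chatbot, and the autoplaying audio component in one call.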
# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here or use the microphone to speak...")
    audio_output = gr.Audio(label="AI Response", autoplay=True)
    state = gr.State([])

    with gr.Row():
        submit = gr.Button("Send")
        voice_input = gr.Audio(sources=["microphone"], type="filepath", label="Voice Input")

    # Initialize chat with greeting
    demo.load(init_chat, outputs=[state, chatbot, audio_output])

    # Handle text input
    msg.submit(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )
    submit.click(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

    # Handle voice input: transcribe the recording, then run it through the text flow
    voice_input.stop_recording(
        lambda x: speech_to_text(x) if x else "",
        inputs=[voice_input],
        outputs=[msg],
    ).then(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )
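# Design note: voice input is transcribed into the same textbox that typed messages
# use, so voice and text turns share one process_input/generate_audio code path.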
if __name__ == "__main__":
    demo.launch(share=True)