File size: 5,200 Bytes
e4c39da
f8d538d
 
 
 
 
e4c39da
f8d538d
 
 
 
e4c39da
f8d538d
 
 
e4c39da
f8d538d
 
 
 
 
 
 
 
e4c39da
f8d538d
 
 
 
 
e4c39da
f8d538d
 
 
 
 
 
 
 
e4c39da
f8d538d
2bcdf1f
f8d538d
 
2bcdf1f
f8d538d
 
 
 
 
 
 
2bcdf1f
f8d538d
 
1eacd2b
 
2bcdf1f
1eacd2b
e4c39da
1eacd2b
f8d538d
1eacd2b
f8d538d
1eacd2b
2bcdf1f
f39012c
1eacd2b
6167dce
f8d538d
6167dce
 
f8d538d
8413e00
f8d538d
 
 
 
 
 
 
6167dce
 
c163697
6167dce
 
c163697
6167dce
 
 
 
 
f8d538d
 
 
1eacd2b
f8d538d
6167dce
f8d538d
1eacd2b
 
6167dce
5251d1b
15196a0
 
6167dce
 
f39012c
8413e00
6167dce
c163697
6167dce
 
c163697
6167dce
15196a0
 
6167dce
 
 
 
 
 
c163697
15196a0
6167dce
15196a0
e4c39da
f8d538d
f39012c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio

# Text-generation backend: Hugging Face Inference API client for Mistral-7B-Instruct.
# NOTE(review): no token is passed here despite the original comment mentioning one;
# anonymous Inference API access is rate-limited — confirm whether a token should be supplied.
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
# Local ASR pipeline used to transcribe microphone input to text.
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

def speech_to_text(speech):
    """Transcribe an audio input to text using the module-level ASR pipeline."""
    transcription = asr(speech)
    return transcription["text"]

def classify_mood(input_string):
    """Look for one of the four supported mood keywords in the input.

    Returns a ``(mood, found)`` pair: the matched keyword and ``True`` when
    the lower-cased input contains one of happy/sad/instrumental/party,
    otherwise ``(None, False)``.
    """
    lowered = input_string.lower()
    for mood in ("happy", "sad", "instrumental", "party"):
        if mood in lowered:
            return mood, True
    return None, False

def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    """Stream a completion from the LLM for the given prompt and history.

    Tokens are accumulated as they arrive; as soon as the accumulated text
    contains one of the four mood keywords, a playlist message is returned
    immediately. Otherwise the full generated text is returned.
    """
    # Clamp temperature away from zero so sampling stays valid.
    temperature = max(float(temperature), 1e-2)
    top_p = float(top_p)

    generate_kwargs = {
        "temperature": temperature,
        "max_new_tokens": max_new_tokens,
        "top_p": top_p,
        "repetition_penalty": repetition_penalty,
        "do_sample": True,
        "seed": 42,
    }

    stream = client.text_generation(
        format_prompt(prompt, history),
        **generate_kwargs,
        stream=True,
        details=True,
        return_full_text=False,
    )

    accumulated = ""
    for chunk in stream:
        accumulated += chunk.token.text
        mood, found = classify_mood(accumulated)
        if found:
            # Short-circuit: stop streaming once a mood keyword appears.
            return f"Playing {mood.capitalize()} playlist for you!"
    return accumulated

def format_prompt(message, history):
    """Assemble the full LLM prompt.

    Layout: the fixed mood-classifier instructions, a newline, the prior
    conversation as alternating User/LLM lines, then the new user message
    ending with an open ``LLM Response:`` for the model to complete.
    """
    fixed_prompt = """
    You are a smart mood analyser, who determines user mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If you are finding it difficult to classify into one of these four moods, keep the conversation going on until we classify the user's mood. Return a single-word reply from one of the options if you have classified. Suppose you classify a sentence as happy, then just respond with "happy".

    Note: Do not write anything else other than the classified mood if classified.

    Note: If any question or any user text cannot be classified, follow up with a question to know the user's mood until you classify the mood.

    Note: Mood should be classified only from any of these 4 classes {Happy, Sad, Instrumental, Party}, if not any of these 4 then continue with a follow-up question until you classify the mood.

    Note: if user asks something like i need a coffee then do not classify the mood directly and ask more follow-up questions as asked in examples.

    [Examples omitted for brevity]
    """
    transcript = "".join(
        f"User: {user_msg}\nLLM Response: {bot_msg}\n" for user_msg, bot_msg in history
    )
    return f"{fixed_prompt}\n{transcript}User: {message}\nLLM Response:"

async def text_to_speech(text):
    """Synthesize *text* with edge-tts and return the path of a temp audio file.

    The file is created with ``delete=False``, so it persists after this call;
    the caller (Gradio's audio component) reads it from disk.
    """
    communicate = edge_tts.Communicate(text)
    # Create the temp file only to reserve a unique path, then close our handle
    # BEFORE edge-tts writes to it: saving while the handle is still open fails
    # on Windows, where a file cannot be opened for writing twice.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    # NOTE(review): edge-tts emits MP3 by default despite the .wav suffix;
    # suffix kept unchanged for compatibility — confirm downstream players cope.
    await communicate.save(tmp_path)
    return tmp_path

def process_input(input_text, history):
    """Run one chat turn and produce the Gradio output tuple.

    Returns ``(state, chatbot, textbox, voice_input)``: the history twice
    (once for the State, once for the Chatbot), an empty string to clear
    the textbox, and ``None`` to reset the microphone widget.
    """
    if not input_text:
        # Nothing typed or transcribed: leave the history untouched,
        # just clear the input widgets.
        return history, history, "", None

    reply = generate(input_text, history)
    history.append((input_text, reply))
    return history, history, "", None

async def generate_audio(history):
    """Synthesize audio for the latest bot reply, or return None if empty."""
    if not history:
        return None
    _, last_reply = history[-1]
    return await text_to_speech(last_reply)

# Gradio interface setup: a chatbot with text + microphone input; every turn
# ends with the bot reply being synthesized to audio and autoplayed.
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")
    
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here or use the microphone to speak...")
    audio_output = gr.Audio(label="AI Response", autoplay=True)

    # Conversation history as a list of (user, bot) tuples, shared by all handlers.
    state = gr.State([])

    with gr.Row():
        submit = gr.Button("Send")
        # NOTE(review): `source=` is the Gradio 3.x kwarg; Gradio 4.x renamed it
        # to `sources=["microphone"]` — confirm the pinned Gradio version.
        voice_input = gr.Audio(source="microphone", type="filepath", label="Voice Input")

    # Handle text input: run the chat turn, then synthesize the reply to audio.
    msg.submit(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )
    submit.click(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

    # Handle voice input: transcribe the recording into the textbox first,
    # then chain the same chat-turn + audio-synthesis pipeline as text input.
    voice_input.stop_recording(
        lambda x: speech_to_text(x) if x else "",
        inputs=[voice_input],
        outputs=[msg]
    ).then(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

if __name__ == "__main__":
    # share=True creates a public gradio.live tunnel in addition to localhost.
    demo.launch(share=True)