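# Mood-Based Music Recommender with Continuous Voice Chat (Hugging Face Space app.py)
# Chats with the user via Mistral-7B-Instruct to classify their mood into
# {Happy, Sad, Instrumental, Party}, then announces a matching playlist.
# Voice input is transcribed with wav2vec2; replies are spoken aloud with edge-tts.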
import gradio as gr
from huggingface_hub import InferenceClient
from transformers import pipeline
import edge_tts
import tempfile
import asyncio

# Initialize the inference client (pass token=... or set HF_TOKEN if the model requires authentication)
client = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")

# Initialize the ASR pipeline used to transcribe microphone input
asr = pipeline("automatic-speech-recognition", "facebook/wav2vec2-base-960h")

def speech_to_text(speech):
    """Converts speech to text using the ASR pipeline."""
    return asr(speech)["text"]

def classify_mood(input_string):
    """Classifies the mood based on keywords in the input string."""
    input_string = input_string.lower()
    mood_words = {"happy", "sad", "instrumental", "party"}
    for word in mood_words:
        if word in input_string:
            return word, True
    return None, False
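
# Stream the model's reply token by token; as soon as the accumulated text contains one of
# the four mood keywords, return the playlist message instead of the raw model output.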
def generate(prompt, history, temperature=0.1, max_new_tokens=2048, top_p=0.8, repetition_penalty=1.0):
    temperature = float(temperature)
    if temperature < 1e-2:
        temperature = 1e-2
    top_p = float(top_p)

    generate_kwargs = dict(
        temperature=temperature,
        max_new_tokens=max_new_tokens,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        seed=42,
    )

    formatted_prompt = format_prompt(prompt, history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""

    for response in stream:
        output += response.token.text
        mood, is_classified = classify_mood(output)
        if is_classified:
            playlist_message = f"Playing {mood.capitalize()} playlist for you!"
            return playlist_message
    return output

def format_prompt(message, history):
    """Formats the prompt including fixed instructions and conversation history."""
    fixed_prompt = """
You are a smart mood analyser that determines the user's mood. Based on the user input, classify the mood of the user into one of the four moods {Happy, Sad, Instrumental, Party}. If it is difficult to classify into one of these four moods, keep the conversation going until the user's mood can be classified. Once classified, return a single-word reply from one of the options. For example, if you classify a sentence as happy, just respond with "happy".

Note: Do not write anything other than the classified mood once classified.
Note: If a question or user text cannot be classified, follow up with a question to learn the user's mood until you can classify it.
Note: The mood must be classified only as one of these 4 classes {Happy, Sad, Instrumental, Party}; if it is none of these 4, continue with follow-up questions until you classify the mood.
Note: If the user says something like "I need a coffee", do not classify the mood directly; ask more follow-up questions, as shown in the examples.

[Examples omitted for brevity]
"""
    prompt = f"{fixed_prompt}\n"
    for user_prompt, bot_response in history:
        prompt += f"User: {user_prompt}\nLLM Response: {bot_response}\n"
    prompt += f"User: {message}\nLLM Response:"
    return prompt

async def text_to_speech(text):
    """Synthesizes speech for the given text with edge-tts and returns the audio file path."""
    communicate = edge_tts.Communicate(text)
    # edge-tts produces MP3 audio by default, so save with a matching suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
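
# Chat step: generate a reply for the latest user message and append the turn to the history.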
def process_input(input_text, history):
    if not input_text:
        return history, history, "", None
    response = generate(input_text, history)
    history.append((input_text, response))
    # Return history twice (once for state, once for the chatbot), clear the textbox, and reset the voice input
    return history, history, "", None

async def generate_audio(history):
    if history and len(history) > 0:
        last_response = history[-1][1]
        audio_path = await text_to_speech(last_response)
        return audio_path
    return None

# Gradio interface setup
with gr.Blocks() as demo:
    gr.Markdown("# Mood-Based Music Recommender with Continuous Voice Chat")
    chatbot = gr.Chatbot()
    msg = gr.Textbox(placeholder="Type your message here or use the microphone to speak...")
    audio_output = gr.Audio(label="AI Response", autoplay=True)
    state = gr.State([])

    with gr.Row():
        submit = gr.Button("Send")
        voice_input = gr.Audio(source="microphone", type="filepath", label="Voice Input")

    # Handle text input
    msg.submit(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )
    submit.click(process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

    # Handle voice input
    voice_input.stop_recording(
        lambda x: speech_to_text(x) if x else "",
        inputs=[voice_input],
        outputs=[msg],
    ).then(
        process_input, inputs=[msg, state], outputs=[state, chatbot, msg, voice_input]
    ).then(
        generate_audio, inputs=[state], outputs=[audio_output]
    )

if __name__ == "__main__":
    demo.launch(share=True)