# fast-chatbot/app.py
import os
import time
import gradio as gr
from huggingface_hub import InferenceClient
# Retrieve API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")
# Ensure the token is available
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token! Set 'HF_TOKEN' in Hugging Face Secrets.")
# Initialize clients with authentication
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
client_deepseek = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)
# Function for normal fast responses
def models(query):
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content or ""  # delta.content can be None on some chunks
            response += token
            yield response
            time.sleep(0.3)  # Throttle streamed updates
    except Exception as e:
        yield f"Error: {str(e)}"
# Function for detailed critical thinking responses
def nemo(query):
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution.
You have {budget} steps to solve the problem.
<answer> Your final answer should be here. </answer>
[QUERY] {query} [/INST] [ASSISTANT] """
    output = ""
    try:
        stream = client_deepseek.text_generation(
            message, max_new_tokens=2048, stream=True, details=True, return_full_text=False
        )
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Throttle streamed updates
    except Exception as e:
        yield f"Error: {str(e)}"
# Description for Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"
# Gradio interfaces, one per mode
demo1 = gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
demo2 = gr.Interface(
    fn=nemo,
    inputs=["text"],
    outputs="text",
    description="Critical Thinking Mode",
    api_name="critical_thinker",
    concurrency_limit=5,
)
# Combine the two modes into a tabbed interface
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])
print("HF_TOKEN:", HF_TOKEN)
# Launch the application
demo.queue(max_size=100000)  # Allow a large number of queued requests
demo.launch()
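# To run locally (assuming a Hugging Face token with Inference API access):
#   export HF_TOKEN=hf_...
#   python app.py
# In a Hugging Face Space, set HF_TOKEN as a secret in the Space settings instead.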