# fast-chatbot/app.py
import os
import time
import gradio as gr
from huggingface_hub import InferenceClient
# Retrieve API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")
# Ensure the token is available
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token! Set 'HF_TOKEN' in Hugging Face Secrets.")
# Initialize clients with authentication
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
client_deepseek = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)
# Function for normal fast responses
def models(query):
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content or ""  # delta.content can be None on some chunks
            response += token
            yield response
            time.sleep(0.3)  # Throttle streamed updates
    except Exception as e:
        yield f"Error: {str(e)}"
# Function for detailed critical thinking responses
def nemo(query):
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution.
You have {budget} steps to solve the problem.
<answer> Your final answer should be here. </answer>
[QUERY] {query} [/INST] [ASSISTANT] """
    output = ""
    try:
        stream = client_deepseek.text_generation(
            message, max_new_tokens=2048, stream=True, details=True, return_full_text=False
        )
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Throttle streamed updates
    except Exception as e:
        yield f"Error: {str(e)}"
# Description for Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"
# Gradio interfaces, one per mode
demo1 = gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
demo2 = gr.Interface(
    fn=nemo,
    inputs=["text"],
    outputs="text",
    description="Critical Thinking Mode",
    api_name="critical_thinker",
    concurrency_limit=5,
)
# Combine the two modes into a tabbed interface
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])
print("HF_TOKEN:", HF_TOKEN)
# Launch the application
demo.queue(max_size=100000)  # Allow a large number of queued requests
demo.launch()
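# To run locally (assuming a Hugging Face token with Inference API access):
#   export HF_TOKEN=hf_...
#   python app.py
# In a Hugging Face Space, set HF_TOKEN as a secret in the Space settings instead.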