import os
import time

import gradio as gr
from huggingface_hub import InferenceClient
# Retrieve the API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# Fail fast if the token is missing
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token! Set 'HF_TOKEN' in Hugging Face Secrets.")
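# For local development outside Spaces, the same variable can be supplied
# via the shell, e.g. `export HF_TOKEN=hf_...` before running this script.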
# Initialize inference clients with authentication.
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
# Renamed from "client_mistral": this endpoint actually serves a DeepSeek-R1 distill of Qwen 1.5B.
client_deepseek = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)
# Generator for normal fast responses, streamed token by token
def models(query):
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content or ""  # delta.content can be None on the final chunk
            response += token
            yield response
            time.sleep(0.3)  # Crude pause to avoid rate limiting
    except Exception as e:
        yield f"Error: {e}"
# Generator for detailed, step-by-step critical-thinking responses
def nemo(query):
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution.
You have {budget} steps to solve the problem.
<answer> Your final answer should be here. </answer>
[QUERY] {query} [/INST] [ASSISTANT] """
    output = ""
    try:
        # With stream=True and details=True, text_generation yields objects
        # whose .token.text holds the newly generated token.
        stream = client_deepseek.text_generation(
            message,
            max_new_tokens=2048,
            stream=True,
            details=True,
            return_full_text=False,
        )
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Crude pause to avoid rate limiting
    except Exception as e:
        yield f"Error: {e}"
# Description for the Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"
# Gradio interfaces (built at the top level rather than inside redundant
# gr.Blocks wrappers, so TabbedInterface receives them directly)
demo1 = gr.Interface(fn=models, inputs="text", outputs="text", description=description)
demo2 = gr.Interface(
    fn=nemo,
    inputs="text",
    outputs="text",
    description="Critical Thinking Mode",
    api_name="critical_thinker",
    concurrency_limit=5,
)

# Combine both interfaces into a tabbed layout
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])
print("HF_TOKEN:", HF_TOKEN) | |
# Launch the application
demo.queue(max_size=100000)  # Adjust the maximum queue size as needed
demo.launch()
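# Hedged usage sketch (not part of the app): once the Space is live, the
# "critical_thinker" endpoint can be called remotely with gradio_client.
# The Space id below is a placeholder assumption.
#
#   from gradio_client import Client
#   client = Client("your-username/chat-go")  # hypothetical Space id
#   result = client.predict("Explain quicksort", api_name="/critical_thinker")
#   print(result)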