import os
import time
import gradio as gr
from huggingface_hub import InferenceClient
# Retrieve API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")
# Ensure the token is available
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API Token! Set 'HF_TOKEN' in Hugging Face Secrets.")
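# For local testing outside Spaces, the same variable can be exported before launch
# (a sketch; the token value below is a placeholder):
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   python app.py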
# Initialize clients with authentication
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
client_mistral = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)
# Function for fast responses: streams tokens from the Gemma chat model
def models(query):
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content
            if token:  # some stream chunks may carry no content
                response += token
            yield response
            time.sleep(0.3)  # Small pause between chunks to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"
# Function for detailed critical thinking responses
def nemo(query):
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution.
You have {budget} steps to solve the problem.
<answer> Your final answer should be here. </answer>
[QUERY] {query} [/INST] [ASSISTANT] """
    output = ""
    try:
        stream = client_mistral.text_generation(
            message, max_new_tokens=2048, stream=True, details=True, return_full_text=False
        )
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Small pause between chunks to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"
# Description for Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"
# Gradio interfaces for the two modes
demo1 = gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
demo2 = gr.Interface(fn=nemo, inputs=["text"], outputs="text", description="Critical Thinking Mode", api_name="critical_thinker", concurrency_limit=5)
# Combine both interfaces into a tabbed layout
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])
print("HF_TOKEN:", HF_TOKEN)
# Launch the application
demo.queue(max_size=100000) # Adjust max queue size
demo.launch()
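# A minimal client-side sketch (not part of the app): querying the "critical_thinker"
# endpoint with gradio_client. The Space ID below is a placeholder, not the real one.
#
#   from gradio_client import Client
#
#   client = Client("your-username/your-space")  # hypothetical Space ID
#   job = client.submit("Outline a 3-step solution.", api_name="/critical_thinker")
#   for partial in job:  # stream partial outputs as they arrive
#       print(partial)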