import os
import time

import gradio as gr
from huggingface_hub import InferenceClient

# Retrieve the API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# Fail fast if the token is missing
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token! Set 'HF_TOKEN' in Hugging Face Secrets.")

# Initialize authenticated inference clients
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
# Note: despite the original variable name (client_mistral), this repo is a
# DeepSeek-R1 distill of Qwen 1.5B, so it is named accordingly here.
client_deepseek = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)


# Fast responses via streaming chat completion
def models(query):
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content
            if token:  # delta.content can be None on some stream chunks
                response += token
                yield response
            time.sleep(0.3)  # Throttle the stream to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"


# Detailed, step-by-step "critical thinking" responses
def nemo(query):
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution.
You have {budget} steps to solve the problem. Your final answer should be here.
[QUERY] {query} [/INST]
[ASSISTANT] """
    output = ""
    try:
        stream = client_deepseek.text_generation(
            message,
            max_new_tokens=2048,
            stream=True,
            details=True,
            return_full_text=False,
        )
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Throttle the stream to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"


# Description for the Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"

# Build the two interfaces and combine them into a tabbed app.
# (Interfaces are passed directly to TabbedInterface rather than being
# wrapped in extra gr.Blocks contexts, which is the idiomatic pattern.)
demo1 = gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
demo2 = gr.Interface(
    fn=nemo,
    inputs=["text"],
    outputs="text",
    description="Critical Thinking Mode",
    api_name="critical_thinker",
    concurrency_limit=5,
)
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])

# Launch the application. (The original script printed HF_TOKEN here, which
# would leak the secret into the logs; that line has been removed.)
demo.queue(max_size=100000)  # Adjust max queue size as needed
demo.launch()