import os
import time

import gradio as gr
from huggingface_hub import InferenceClient

# Retrieve the API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# Fail fast if the token is missing
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API token! Set 'HF_TOKEN' in Hugging Face Secrets.")

# Initialize authenticated inference clients
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
# Note: despite the original variable name (client_mistral), this repo is a
# DeepSeek-R1 distill of Qwen 1.5B, so it is named accordingly here.
client_deepseek = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)


# Fast responses via streaming chat completion
def models(query):
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content
            if token:  # delta.content can be None on some stream chunks
                response += token
                yield response
            time.sleep(0.3)  # Throttle the stream to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"


# Detailed, step-by-step "critical thinking" responses
def nemo(query):
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution.
You have {budget} steps to solve the problem. Your final answer should be here.
[QUERY] {query} [/INST]
[ASSISTANT] """
    output = ""
    try:
        stream = client_deepseek.text_generation(
            message,
            max_new_tokens=2048,
            stream=True,
            details=True,
            return_full_text=False,
        )
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Throttle the stream to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"


# Description for the Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"

# Build the two interfaces and combine them into a tabbed app.
# (Interfaces are passed directly to TabbedInterface rather than being
# wrapped in extra gr.Blocks contexts, which is the idiomatic pattern.)
demo1 = gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
demo2 = gr.Interface(
    fn=nemo,
    inputs=["text"],
    outputs="text",
    description="Critical Thinking Mode",
    api_name="critical_thinker",
    concurrency_limit=5,
)
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])

# Launch the application. (The original script printed HF_TOKEN here, which
# would leak the secret into the logs; that line has been removed.)
demo.queue(max_size=100000)  # Adjust max queue size as needed
demo.launch()