import os
import time
import gradio as gr
from huggingface_hub import InferenceClient

# Retrieve API token from Hugging Face Secrets
HF_TOKEN = os.getenv("HF_TOKEN")

# Ensure the token is available
if not HF_TOKEN:
    raise ValueError("Missing Hugging Face API Token! Set 'HF_TOKEN' in Hugging Face Secrets.")
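
# Note: outside Hugging Face Spaces (e.g. running locally) the same variable can be
# supplied by exporting HF_TOKEN in the shell or by logging in with `huggingface-cli login`.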

# Initialize clients with authentication
client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
client_deepseek = InferenceClient("rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto", token=HF_TOKEN)
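# (Assumption: "rgb2gbr/deepseek-r1-distill-qwen-1-5b-kto" is a community DeepSeek-R1
# distill; the "Critical" tab only works while that model is reachable through the
# Inference API, which may vary over time.)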

# Function for normal fast responses
def models(query): 
    messages = [{"role": "user", "content": f"[USER] {query}"}]
    
    response = ""
    try:
        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
            token = message.choices[0].delta.content
            if token:  # the final streamed chunk may carry no content
                response += token
                yield response
            time.sleep(0.3)  # Throttle requests to avoid rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"

# Function for detailed critical thinking responses
def nemo(query):
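    # 'budget' below is only a prompt-level hint asking the model for a fixed number
    # of reasoning steps; nothing on the generation side enforces it.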
    budget = 3
    message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
Your task is to provide a detailed, step-by-step solution. 
You have {budget} steps to solve the problem.
<answer> Your final answer should be here. </answer>
[QUERY] {query} [/INST] [ASSISTANT] """

    output = ""
    try:
        stream = client_deepseek.text_generation(message, max_new_tokens=2048, stream=True, details=True, return_full_text=False)
        for response in stream:
            output += response.token.text
            yield output
            time.sleep(0.3)  # Prevent rate limiting
    except Exception as e:
        yield f"Error: {str(e)}"

# Description for Gradio UI
description = "# Chat GO\n### Enter your query and get lightning-fast responses"

# Gradio Interfaces
demo1 = gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
demo2 = gr.Interface(fn=nemo, inputs=["text"], outputs="text", description="Critical Thinking Mode", api_name="critical_thinker", concurrency_limit=5)

# Combine both interfaces into a single tabbed app
demo = gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])

# Confirm the token was picked up without printing the secret itself
print("HF_TOKEN loaded:", HF_TOKEN is not None)

# Launch the application
demo.queue(max_size=100000)  # Adjust max queue size
demo.launch()
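
# --- Usage sketch (illustrative only, not executed by this app) ---------------
# Once the Space is running, the "Critical" tab exposes its function under
# api_name="critical_thinker", so it could be queried from another process with
# gradio_client. The Space id below is a placeholder assumption; substitute the
# real <user>/<space-name>.
#
#   from gradio_client import Client
#
#   client = Client("your-username/chat-go")          # hypothetical Space id
#   result = client.predict("Summarize quicksort in 3 steps",
#                           api_name="/critical_thinker")
#   print(result)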