ayouuubBn committed
Commit f488ab5 · verified · 1 Parent(s): 016ea58

Update app.py

Files changed (1):
  1. app.py +42 -27

app.py CHANGED
@@ -1,27 +1,34 @@
+import os
+import time
 import gradio as gr
 from huggingface_hub import InferenceClient
 
-client = InferenceClient("google/gemma-1.1-2b-it")
-client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407")
+# Retrieve API token from Hugging Face Secrets
+HF_TOKEN = os.getenv("HF_TOKEN")
 
-def models(Query):
-
-    messages = []
-
-    messages.append({"role": "user", "content": f"[SYSTEM] You are ASSISTANT who answer question asked by user in short and concise manner. [USER] {Query}"})
-
-    Response = ""
+# Ensure the token is available
+if not HF_TOKEN:
+    raise ValueError("Missing Hugging Face API Token! Set 'HF_TOKEN' in Hugging Face Secrets.")
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=2048,
-        stream=True
-    ):
-        token = message.choices[0].delta.content
+# Initialize clients with authentication
+client_gemma = InferenceClient("google/gemma-1.1-2b-it", token=HF_TOKEN)
+client_mistral = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=HF_TOKEN)
 
-        Response += token
-        yield Response
+# Function for normal fast responses
+def models(query):
+    messages = [{"role": "user", "content": f"[SYSTEM] You are an AI assistant. Answer concisely. [USER] {query}"}]
+
+    response = ""
+    try:
+        for message in client_gemma.chat_completion(messages, max_tokens=1024, stream=True):
+            token = message.choices[0].delta.content
+            response += token
+            yield response
+            time.sleep(0.3)  # Prevent rate limiting
+    except Exception as e:
+        yield f"Error: {str(e)}"
 
+# Function for detailed critical thinking responses
 def nemo(query):
     budget = 3
     message = f"""[INST] [SYSTEM] You are a helpful assistant in normal conversation.
@@ -54,22 +61,30 @@ Example format:
 <reflection> [Evaluation of the solution] </reflection>
 <reward> [Float between 0.0 and 1.0] </reward> [/INST] [INST] [QUERY] {query} [/INST] [ASSISTANT] """
 
-    stream = client.text_generation(message, max_new_tokens=4096, stream=True, details=True, return_full_text=False)
     output = ""
+    try:
+        stream = client_mistral.text_generation(message, max_new_tokens=2048, stream=True, details=True, return_full_text=False)
+        for response in stream:
+            output += response.token.text
+            yield output
+            time.sleep(0.3)  # Prevent rate limiting
+    except Exception as e:
+        yield f"Error: {str(e)}"
 
-    for response in stream:
-        output += response.token.text
-    return output
-
-description="# Chat GO\n### Enter your query and Press enter and get lightning fast response"
+# Description for Gradio UI
+description = "# Chat GO\n### Enter your query and get lightning-fast responses"
 
+# Gradio Interfaces
 with gr.Blocks() as demo1:
-    gr.Interface(description=description,fn=models, inputs=["text"], outputs="text")
+    gr.Interface(fn=models, inputs=["text"], outputs="text", description=description)
+
 with gr.Blocks() as demo2:
-    gr.Interface(description="Very low but critical thinker",fn=nemo, inputs=["text"], outputs="text", api_name="critical_thinker", concurrency_limit=10)
+    gr.Interface(fn=nemo, inputs=["text"], outputs="text", description="Critical Thinking Mode", api_name="critical_thinker", concurrency_limit=5)
 
+# Create a tabbed interface
 with gr.Blocks() as demo:
-    gr.TabbedInterface([demo1, demo2] , ["Fast", "Critical"])
+    gr.TabbedInterface([demo1, demo2], ["Fast", "Critical"])
 
+# Launch the application
+demo.queue(max_size=100000)  # Adjust max queue size
 demo.launch()
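
The updated script keeps api_name="critical_thinker" on the second interface, so the Space can also be called programmatically once deployed. A minimal client-side sketch using gradio_client; the Space ID "ayouuubBn/chat-go" is hypothetical and must be replaced with the real <user>/<space> name:

from gradio_client import Client

# Space ID is hypothetical; substitute the actual <user>/<space> once deployed.
client = Client("ayouuubBn/chat-go")

# The second gr.Interface registers its endpoint under api_name="critical_thinker".
result = client.predict(
    "Is 1001 a prime number?",  # the query argument
    api_name="/critical_thinker",
)
print(result)

Because nemo now streams with yield, predict returns the last yielded chunk, which is the complete output; client.submit(...) instead returns a job that exposes the intermediate outputs as they arrive.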
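
For local testing outside Spaces, where there is no Secrets panel, the same HF_TOKEN check applies: the token just has to be in the environment before launch. A minimal sketch that mirrors the streaming loop in models to verify token and model access (the prompt text is only an example):

import os
from huggingface_hub import InferenceClient

# Requires e.g.  export HF_TOKEN=hf_xxx  (placeholder value) in the shell first.
token = os.getenv("HF_TOKEN")
if not token:
    raise ValueError("Set HF_TOKEN before running this check.")

client = InferenceClient("google/gemma-1.1-2b-it", token=token)
for chunk in client.chat_completion(
    [{"role": "user", "content": "Say hello in five words."}],
    max_tokens=32,
    stream=True,
):
    # Mirrors app.py: each streamed chunk carries a token delta.
    print(chunk.choices[0].delta.content or "", end="", flush=True)
print()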