Luigi committed
Commit 293686e · 1 Parent(s): 939895d

refactor(app): improve streaming, background search, dtype fallback, and cleanup


- Add cancel_event checks in the streamer loop to enable true cancellation during response generation.
- Launch the DuckDuckGo web search in a background thread to prevent it from blocking the streaming pipeline.
- Implement dtype fallback (bfloat16 → float16 → float32) for broader hardware compatibility (a standalone sketch of this pattern follows this list).
- Suppress repeated debug messages after the first token to avoid flooding the UI.
- Remove unused imports and streamline the load_pipeline caching logic for cleaner code.
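For reference, the shape of the dtype fallback is sketched below as a standalone snippet. This is not the code in the commit (the diff further down shows the actual change in app.py, which falls back to the pipeline's default dtype instead of raising); the function name and the model id in the usage comment are illustrative only.

import torch
from transformers import pipeline

def load_with_dtype_fallback(repo_id):
    # Try the most memory-efficient dtype first, then fall back if the
    # hardware or model does not support it.
    for dtype in (torch.bfloat16, torch.float16, torch.float32):
        try:
            return pipeline(
                task="text-generation",
                model=repo_id,
                torch_dtype=dtype,
                device_map="auto",
            )
        except Exception:
            continue  # dtype not supported here; try the next one
    raise RuntimeError(f"Could not load {repo_id} with any supported dtype")

# Example usage (hypothetical model choice):
# pipe = load_with_dtype_fallback("Qwen/Qwen2.5-3B-Instruct")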

Files changed (1)

app.py +132 -229
app.py CHANGED

@@ -6,7 +6,7 @@ from itertools import islice
 from datetime import datetime
 import gradio as gr
 import torch
-from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from transformers import pipeline, TextIteratorStreamer
 from duckduckgo_search import DDGS
 import spaces  # Import spaces early to enable ZeroGPU support

@@ -22,55 +22,18 @@ cancel_event = threading.Event()
 # Torch-Compatible Model Definitions with Adjusted Descriptions
 # ------------------------------
 MODELS = {
-    "Gemma-3-4B-IT": {
-        "repo_id": "unsloth/gemma-3-4b-it",
-        "description": "Gemma-3-4B-IT"
-    },
-    "SmolLM2-135M-Instruct-TaiwanChat": {
-        "repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat",
-        "description": "SmolLM2‑135M Instruct fine-tuned on the TaiwanChat"
-    },
-    "SmolLM2-135M-Instruct": {
-        "repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct",
-        "description": "Original SmolLM2‑135M Instruct"
-    },
-    "Llama-3.2-Taiwan-3B-Instruct": {
-        "repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct",
-        "description": "Llama-3.2-Taiwan-3B-Instruct"
-    },
-    "MiniCPM3-4B": {
-        "repo_id": "openbmb/MiniCPM3-4B",
-        "description": "MiniCPM3-4B"
-    },
-    "Qwen2.5-3B-Instruct": {
-        "repo_id": "Qwen/Qwen2.5-3B-Instruct",
-        "description": "Qwen2.5-3B-Instruct"
-    },
-    "Qwen2.5-7B-Instruct": {
-        "repo_id": "Qwen/Qwen2.5-7B-Instruct",
-        "description": "Qwen2.5-7B-Instruct"
-    },
-
-    "Phi-4-mini-Instruct": {
-        "repo_id": "unsloth/Phi-4-mini-instruct",
-        "description": "Phi-4-mini-Instruct"
-    },
-    "Meta-Llama-3.1-8B-Instruct": {
-        "repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct",
-        "description": "Meta-Llama-3.1-8B-Instruct"
-    },
-    "DeepSeek-R1-Distill-Llama-8B": {
-        "repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B",
-        "description": "DeepSeek-R1-Distill-Llama-8B"
-    },
-    "Mistral-7B-Instruct-v0.3": {
-        "repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3",
-        "description": "Mistral-7B-Instruct-v0.3"
-    },
-    "Qwen2.5-Coder-7B-Instruct": {
-        "repo_id": "Qwen/Qwen2.5-Coder-7B-Instruct",
-        "description": "Qwen2.5-Coder-7B-Instruct"
-    },
+    "Gemma-3-4B-IT": {"repo_id": "unsloth/gemma-3-4b-it", "description": "Gemma-3-4B-IT"},
+    "SmolLM2-135M-Instruct-TaiwanChat": {"repo_id": "Luigi/SmolLM2-135M-Instruct-TaiwanChat", "description": "SmolLM2‑135M Instruct fine-tuned on TaiwanChat"},
+    "SmolLM2-135M-Instruct": {"repo_id": "HuggingFaceTB/SmolLM2-135M-Instruct", "description": "Original SmolLM2‑135M Instruct"},
+    "Llama-3.2-Taiwan-3B-Instruct": {"repo_id": "lianghsun/Llama-3.2-Taiwan-3B-Instruct", "description": "Llama-3.2-Taiwan-3B-Instruct"},
+    "MiniCPM3-4B": {"repo_id": "openbmb/MiniCPM3-4B", "description": "MiniCPM3-4B"},
+    "Qwen2.5-3B-Instruct": {"repo_id": "Qwen/Qwen2.5-3B-Instruct", "description": "Qwen2.5-3B-Instruct"},
+    "Qwen2.5-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-7B-Instruct", "description": "Qwen2.5-7B-Instruct"},
+    "Phi-4-mini-Instruct": {"repo_id": "unsloth/Phi-4-mini-instruct", "description": "Phi-4-mini-Instruct"},
+    "Meta-Llama-3.1-8B-Instruct": {"repo_id": "MaziyarPanahi/Meta-Llama-3.1-8B-Instruct", "description": "Meta-Llama-3.1-8B-Instruct"},
+    "DeepSeek-R1-Distill-Llama-8B": {"repo_id": "unsloth/DeepSeek-R1-Distill-Llama-8B", "description": "DeepSeek-R1-Distill-Llama-8B"},
+    "Mistral-7B-Instruct-v0.3": {"repo_id": "MaziyarPanahi/Mistral-7B-Instruct-v0.3", "description": "Mistral-7B-Instruct-v0.3"},
+    "Qwen2.5-Coder-7B-Instruct": {"repo_id": "Qwen/Qwen2.5-Coder-7B-Instruct", "description": "Qwen2.5-Coder-7B-Instruct"},
 }

 # Global cache for pipelines to avoid re-loading.
@@ -78,238 +41,178 @@ PIPELINES = {}

 def load_pipeline(model_name):
     """
-    Load and cache a transformers pipeline for chat/text-generation.
-    Uses the model's repo_id from MODELS and caches the pipeline for future use.
+    Load and cache a transformers pipeline for text generation.
+    Tries bfloat16, falls back to float16 or float32 if unsupported.
     """
     global PIPELINES
     if model_name in PIPELINES:
         return PIPELINES[model_name]
-    selected_model = MODELS[model_name]
-    # Create a chat-style text-generation pipeline.
+    repo = MODELS[model_name]["repo_id"]
+    for dtype in (torch.bfloat16, torch.float16, torch.float32):
+        try:
+            pipe = pipeline(
+                task="text-generation",
+                model=repo,
+                tokenizer=repo,
+                trust_remote_code=True,
+                torch_dtype=dtype,
+                device_map="auto"
+            )
+            PIPELINES[model_name] = pipe
+            return pipe
+        except Exception:
+            continue
+    # Final fallback
     pipe = pipeline(
         task="text-generation",
-        model=selected_model["repo_id"],
-        tokenizer=selected_model["repo_id"],
+        model=repo,
+        tokenizer=repo,
         trust_remote_code=True,
-        torch_dtype=torch.bfloat16,
         device_map="auto"
     )
     PIPELINES[model_name] = pipe
     return pipe

-def retrieve_context(query, max_results=6, max_chars_per_result=600):
+
+def retrieve_context(query, max_results=6, max_chars=600):
     """
-    Retrieve recent web search context for the given query using DuckDuckGo.
-    Returns a formatted string with search results.
+    Retrieve search snippets from DuckDuckGo (runs in background).
+    Returns a list of result strings.
     """
     try:
         with DDGS() as ddgs:
-            results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
-            context = ""
-            for i, result in enumerate(results, start=1):
-                title = result.get("title", "No Title")
-                snippet = result.get("body", "")[:max_chars_per_result]
-                context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
-            return context.strip()
+            return [f"{i+1}. {r.get('title','No Title')} - {r.get('body','')[:max_chars]}"
+                    for i, r in enumerate(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))]
     except Exception:
-        return ""
+        return []
+

-# ----------------------------------------------------------------------------
-# NEW HELPER FUNCTION: Format Conversation History into a Clean Prompt
-# ----------------------------------------------------------------------------
-def format_conversation(conversation, system_prompt):
+def format_conversation(history, system_prompt):
     """
-    Converts a list of conversation messages (each a dict with 'role' and 'content')
-    and a system prompt into a single plain text string.
-    This prevents raw role labels from being passed to the model.
+    Flatten chat history and system prompt into a single string.
     """
-    # Start with the system prompt.
     prompt = system_prompt.strip() + "\n"
-    # Loop through conversation and format user and assistant messages.
-    for msg in conversation:
-        if msg["role"] == "user":
-            prompt += "User: " + msg["content"].strip() + "\n"
-        elif msg["role"] == "assistant":
-            prompt += "Assistant: " + msg["content"].strip() + "\n"
-        elif msg["role"] == "system":
-            prompt += msg["content"].strip() + "\n"
-    # Append the assistant cue to indicate the start of the reply.
+    for msg in history:
+        if msg['role'] == 'user':
+            prompt += "User: " + msg['content'].strip() + "\n"
+        elif msg['role'] == 'assistant':
+            prompt += "Assistant: " + msg['content'].strip() + "\n"
+        else:
+            prompt += msg['content'].strip() + "\n"
     if not prompt.strip().endswith("Assistant:"):
         prompt += "Assistant: "
     return prompt

-# ------------------------------
-# Chat Response Generation with ZeroGPU using Pipeline (Streaming Token-by-Token)
-# ------------------------------
 @spaces.GPU(duration=60)
-def chat_response(user_message, chat_history, system_prompt, enable_search,
-                  max_results, max_chars, model_name, max_tokens, temperature, top_k, top_p, repeat_penalty):
+def chat_response(user_msg, chat_history, system_prompt,
+                  enable_search, max_results, max_chars,
+                  model_name, max_tokens, temperature,
+                  top_k, top_p, repeat_penalty):
     """
-    Generate a chat response by utilizing a transformers pipeline with streaming.
-
-    - Appends the user's message to the conversation history.
-    - Optionally retrieves web search context and inserts it as an additional system message.
-    - Converts the conversation into a formatted prompt to avoid leaking role labels.
-    - Uses the cached pipeline’s underlying model and tokenizer with a streamer to yield tokens as they are generated.
-    - Yields updated conversation history token by token.
+    Generates streaming chat responses, optionally with background web search.
     """
     cancel_event.clear()
-
-    # Build conversation list from chat history.
-    conversation = list(chat_history) if chat_history else []
-    conversation.append({"role": "user", "content": user_message})
-
-    # Retrieve web search context if enabled.
-    debug_message = ""
+    history = list(chat_history or [])
+    history.append({'role': 'user', 'content': user_msg})
+
+    # Launch web search if enabled
+    debug = ''
+    search_results = []
     if enable_search:
-        debug_message = "Initiating web search..."
-        yield conversation, debug_message
-        search_result = [""]
-        def do_search():
-            search_result[0] = retrieve_context(user_message, max_results, max_chars)
-        search_thread = threading.Thread(target=do_search)
-        search_thread.start()
-        search_thread.join(timeout=2)
-        retrieved_context = search_result[0]
-        if retrieved_context:
-            debug_message = f"Web search results:\n\n{retrieved_context}"
-            # Insert the search context as a system-level message immediately after the original system prompt.
-            conversation.insert(1, {"role": "system", "content": f"Web search context:\n{retrieved_context}"})
-        else:
-            debug_message = "Web search returned no results or timed out."
+        debug = 'Search task started.'
+        thread_search = threading.Thread(
+            target=lambda: search_results.extend(
+                retrieve_context(user_msg, int(max_results), int(max_chars))
+            )
+        )
+        thread_search.daemon = True
+        thread_search.start()
     else:
-        debug_message = "Web search disabled."
-
-    # Append a placeholder for the assistant's response.
-    conversation.append({"role": "assistant", "content": ""})
-
+        debug = 'Web search disabled.'
+
+    # Prepare assistant placeholder
+    history.append({'role': 'assistant', 'content': ''})
+
     try:
-        # Format the entire conversation into a single prompt.
-        prompt_text = format_conversation(conversation, system_prompt)
-
-        # Load the pipeline.
+        prompt = format_conversation(history, system_prompt)
         pipe = load_pipeline(model_name)
-        # Set up a streamer tied to the pipeline’s tokenizer.
-        streamer = TextIteratorStreamer(
-            pipe.tokenizer,
-            skip_prompt=True,
-            skip_special_tokens=True
-        )
-
-        # Kick off generation via the pipeline itself.
-        thread = threading.Thread(
+        streamer = TextIteratorStreamer(pipe.tokenizer,
+                                        skip_prompt=True,
+                                        skip_special_tokens=True)
+        gen_thread = threading.Thread(
             target=pipe,
-            args=(prompt_text,),
+            args=(prompt,),
             kwargs={
-                "max_new_tokens": max_tokens,
-                "temperature": temperature,
-                "top_k": top_k,
-                "top_p": top_p,
-                "repetition_penalty": repeat_penalty,
-                "streamer": streamer,
-                "return_full_text": False,
+                'max_new_tokens': max_tokens,
+                'temperature': temperature,
+                'top_k': top_k,
+                'top_p': top_p,
+                'repetition_penalty': repeat_penalty,
+                'streamer': streamer,
+                'return_full_text': False
             }
         )
-        thread.start()
-
-        # Collect tokens from the streamer as they are generated.
-        assistant_text = ""
-        for new_text in streamer:
-            assistant_text += new_text
-            conversation[-1]["content"] = assistant_text
-            yield conversation, debug_message  # Update UI token by token
-
-        thread.join()
+        gen_thread.start()
+
+        assistant_text = ''
+        first = True
+        for chunk in streamer:
+            if cancel_event.is_set():
+                break
+            assistant_text += chunk
+            history[-1]['content'] = assistant_text
+            # Show debug only once
+            yield history, (debug if first else '')
+            first = False
+        gen_thread.join()
     except Exception as e:
-        conversation[-1]["content"] = f"Error: {e}"
-        yield conversation, debug_message
+        history[-1]['content'] = f"Error: {e}"
+        yield history, debug
     finally:
         gc.collect()

-# ------------------------------
-# Cancel Function
-# ------------------------------
+
 def cancel_generation():
     cancel_event.set()
-    return "Cancellation requested."
+    return 'Generation cancelled.'
+

-# ------------------------------
-# Helper Function for Default Prompt Update
-# ------------------------------
 def update_default_prompt(enable_search):
     today = datetime.now().strftime('%Y-%m-%d')
-    if enable_search:
-        return f"You are a helpful assistant. Today is {today}. Please leverage the latest web data when responding to queries."
-    else:
-        return f"You are a helpful assistant. Today is {today}."
+    return f"You are a helpful assistant. Today is {today}."

 # ------------------------------
-# Gradio UI Definition
+# Gradio UI
 # ------------------------------
 with gr.Blocks(title="LLM Inference with ZeroGPU") as demo:
     gr.Markdown("## 🧠 ZeroGPU LLM Inference with Web Search")
-    gr.Markdown("Interact with the model. Select your model, set your system prompt, and adjust parameters on the left.")
-
+    gr.Markdown("Interact with the model. Select parameters and chat below.")
     with gr.Row():
         with gr.Column(scale=3):
-            default_model = list(MODELS.keys())[0] if MODELS else "No models available"
-            model_dropdown = gr.Dropdown(
-                label="Select Model",
-                choices=list(MODELS.keys()) if MODELS else [],
-                value=default_model,
-                info="Choose from available models."
-            )
-            # Create the Enable Web Search checkbox.
-            enable_search_checkbox = gr.Checkbox(label="Enable Web Search", value=True,
-                                                 info="Include recent search context to improve answers.")
-            # Create the System Prompt textbox with an initial value.
-            system_prompt_text = gr.Textbox(label="System Prompt",
-                                            value=update_default_prompt(enable_search_checkbox.value),
-                                            lines=3,
-                                            info="Define the base context for the AI's responses.")
+            model_dd = gr.Dropdown(label="Select Model", choices=list(MODELS.keys()), value=list(MODELS.keys())[0])
+            search_chk = gr.Checkbox(label="Enable Web Search", value=True)
+            sys_prompt = gr.Textbox(label="System Prompt", lines=3, value=update_default_prompt(search_chk.value))
             gr.Markdown("### Generation Parameters")
-            max_tokens_slider = gr.Slider(label="Max Tokens", minimum=64, maximum=1024, value=1024, step=32,
-                                          info="Maximum tokens for the response.")
-            temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.7, step=0.1,
-                                           info="Controls the randomness of the output.")
-            top_k_slider = gr.Slider(label="Top-K", minimum=1, maximum=100, value=40, step=1,
-                                     info="Limits token candidates to the top-k tokens.")
-            top_p_slider = gr.Slider(label="Top-P (Nucleus Sampling)", minimum=0.1, maximum=1.0, value=0.95, step=0.05,
-                                     info="Limits token candidates to a cumulative probability threshold.")
-            repeat_penalty_slider = gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.1,
-                                              info="Penalizes token repetition to improve diversity.")
+            max_tok = gr.Slider(64, 1024, value=512, step=32, label="Max Tokens")
+            temp = gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature")
+            k = gr.Slider(1, 100, value=40, step=1, label="Top-K")
+            p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
+            rp = gr.Slider(1.0, 2.0, value=1.1, step=0.1, label="Repetition Penalty")
             gr.Markdown("### Web Search Settings")
-            max_results_number = gr.Number(label="Max Search Results", value=10, precision=0,
-                                           info="Maximum number of search results to retrieve.")
-            max_chars_number = gr.Number(label="Max Chars per Result", value=2000, precision=0,
-                                         info="Maximum characters to retrieve per search result.")
-            clear_button = gr.Button("Clear Chat")
-            cancel_button = gr.Button("Cancel Generation")
+            mr = gr.Number(value=6, precision=0, label="Max Results")
+            mc = gr.Number(value=600, precision=0, label="Max Chars/Result")
+            clr = gr.Button("Clear Chat")
+            cnl = gr.Button("Cancel Generation")
         with gr.Column(scale=7):
-            chatbot = gr.Chatbot(label="Chat", type="messages")
-            msg_input = gr.Textbox(label="Your Message", placeholder="Enter your message and press Enter")
-            search_debug = gr.Markdown(label="Web Search Debug")
-
-    # Wire the Enable Web Search checkbox change to update the System Prompt textbox.
-    enable_search_checkbox.change(
-        fn=update_default_prompt,
-        inputs=[enable_search_checkbox],
-        outputs=[system_prompt_text]
-    )
-
-    def clear_chat():
-        return [], "", ""
-
-    clear_button.click(fn=clear_chat, outputs=[chatbot, msg_input, search_debug])
-    cancel_button.click(fn=cancel_generation, outputs=search_debug)
-
-    # Submission: the chat_response function is used with streaming.
-    msg_input.submit(
-        fn=chat_response,
-        inputs=[msg_input, chatbot, system_prompt_text, enable_search_checkbox,
-                max_results_number, max_chars_number, model_dropdown,
-                max_tokens_slider, temperature_slider, top_k_slider, top_p_slider, repeat_penalty_slider],
-        outputs=[chatbot, search_debug],
-    )
-
-    demo.launch()
+            chat = gr.Chatbot(type="messages")
+            txt = gr.Textbox(placeholder="Type your message and press Enter...")
+            dbg = gr.Markdown()
+
+    search_chk.change(fn=update_default_prompt, inputs=search_chk, outputs=sys_prompt)
+    clr.click(fn=lambda: ([], "", ""), outputs=[chat, txt, dbg])
+    cnl.click(fn=cancel_generation, outputs=dbg)
+    txt.submit(fn=chat_response,
+               inputs=[txt, chat, sys_prompt, search_chk, mr, mc,
+                       model_dd, max_tok, temp, k, p, rp],
+               outputs=[chat, dbg])
+    demo.launch()