import re
import threading
import gradio as gr
import spaces
import transformers
from transformers import pipeline
# loading model and tokenizer
model_name = "Qwen/Qwen2-1.5B-Instruct"
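# code under `if gr.NO_RELOAD:` is not re-executed when Gradio's reload mode
# (`gradio app.py`) re-imports this file, so the pipeline below is only built once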
if gr.NO_RELOAD:
pipe = pipeline(
"text-generation",
model=model_name,
device_map="auto",
torch_dtype="auto",
)
# the marker used to detect the final answer in the model output
ANSWER_MARKER = "**ANSWER**"
# the sentences that start each step of the step-by-step reasoning
rethink_prepends = [
"OK, I need to figure out ",
"I think ",
"Wait, I think ",
"Let me check if ",
"I should also remember that ",
"Another thing to note is that ",
"I also recall that ",
"I think I have a good grasp ",
"Now, using all the above information, I can answer the question using the original language used for the question:"
"\n{question}\n"
f"\n{ANSWER_MARKER}\n",
]
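# bot() feeds these prefixes to the model one at a time, letting it continue the
# assistant message after each one, so the "reasoning" is built step by step.
# The last entry re-injects the question and the ANSWER_MARKER to get the final answer.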
# delimiters passed to gr.Chatbot to fix some problems with math (KaTeX) display
latex_delimiters = [
{"left": "$$", "right": "$$", "display": True},
{"left": "$", "right": "$", "display": False},
]
def reformat_math(text):
"""Fix MathJax delimiters to use the Gradio syntax (Katex).
This is a workaround to display math formulas in Gradio. For now, I havn't found a way to
make it work as expected using others latex_delimiters...
"""
text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
return text
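# illustrative example of the substitutions above:
#   reformat_math(r"Euler: \[ e^{i\pi} + 1 = 0 \] and inline \( x \)")
#   -> "Euler: $$e^{i\pi} + 1 = 0$$ and inline $x$"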
def user_input(message, history: list):
"""Append the user input in the history and clean the input textbox"""
return "", history + [
gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
]
def rebuild_messages(history: list):
"""Rebuid the messages from the history to be used by the model without the intermediate thoughs"""
messages = []
for h in history:
if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
messages.append(h)
elif (
isinstance(h, gr.ChatMessage)
and h.metadata.get("title")
and isinstance(h.content, str)
):
messages.append({"role": h.role, "content": h.content})
return messages
@spaces.GPU
def bot(history: list, max_num_tokens: int, final_num_tokens: int):
"""Make the model answering the question"""
# to get token as a stream, later in a thread
streamer = transformers.TextIteratorStreamer(
pipe.tokenizer, # pyright: ignore
skip_special_tokens=True,
skip_prompt=True,
)
# to reinsert the question in the reasoning if needed
question = history[-1]["content"]
# prepare the assistant message
history.append(
gr.ChatMessage(
role="assistant",
content=str(""),
metadata={"title": "🧠 Thinking...", "status": "pending"},
)
)
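    # a ChatMessage with a metadata "title" is rendered by gr.Chatbot as a
    # collapsible "thought" bubble instead of a normal reply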
    # for the moment, display the reasoning in the chat
messages = rebuild_messages(history)
for i, prepend in enumerate(rethink_prepends):
if i > 0:
messages[-1]["content"] += "\n\n"
messages[-1]["content"] += prepend.format(question=question)
num_tokens = int(
max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
)
t = threading.Thread(
target=pipe,
args=(messages,),
kwargs=dict(
max_new_tokens=num_tokens,
streamer=streamer,
),
)
t.start()
# rebuild the history with the new content
history[-1].content += prepend.format(question=question)
if ANSWER_MARKER in prepend:
            history[-1].metadata = {"title": "💭 Thoughts", "status": "done"}
# stop thinking, this is the answer now (no metadata for intermediate steps)
history.append(gr.ChatMessage(role="assistant", content=""))
for token in streamer:
history[-1].content += token
history[-1].content = reformat_math(history[-1].content)
yield history
t.join()
yield history
with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
with gr.Row(scale=1):
with gr.Column(scale=5):
gr.Markdown(f"""
# Force reasoning for any model
            This is a simple proof of concept to get any LLM to reason ahead of its response.

            This interface uses the *{model_name}* model, which is **not** a reasoning model. The method
            used only forces some "reasoning" steps with prefixes to help the model enhance its answer.

            See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
""")
chatbot = gr.Chatbot(
scale=1,
type="messages",
latex_delimiters=latex_delimiters,
)
msg = gr.Textbox(
submit_btn=True,
label="",
show_label=False,
placeholder="Type your question here.",
autofocus=True,
)
with gr.Column(scale=1):
gr.Markdown("""## Tweaks""")
num_tokens = gr.Slider(
50,
255,
100,
step=1,
label="Max tokens per reasoning step",
interactive=True,
)
final_num_tokens = gr.Slider(
50,
255,
200,
step=1,
label="Max token for the final answer",
interactive=True,
)
gr.Markdown("""
            Using a smaller number of tokens in the reasoning steps makes the model
            answer faster, but it may not be able to go deep enough in its reasoning.
            A good value is 100.

            Using a smaller number of tokens for the final answer makes the model
            less verbose, but it may not be able to give a complete answer.
            A good value is 200 to 255.
""")
gr.Markdown("""
            This interface can run on a personal computer with 6 GB of VRAM (e.g. an NVIDIA 3050/3060 laptop GPU).
            Feel free to fork the application and try other instruct models.
""")
    # when the user submits a message, the bot answers
msg.submit(
user_input,
[msg, chatbot], # inputs
[msg, chatbot], # outputs
).then(
bot,
[chatbot, num_tokens, final_num_tokens], # actually, the "history" input
chatbot, # to store the new history from the output
)
if __name__ == "__main__":
demo.queue().launch()