import re
import threading
import gradio as gr
import spaces
import transformers
from transformers import pipeline
# loading model and tokenizer
model_name = "Qwen/Qwen2-1.5B-Instruct"
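# code under `if gr.NO_RELOAD:` is not re-executed when Gradio's reload mode
# (`gradio app.py`) re-imports this file, so the pipeline below is only built once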
if gr.NO_RELOAD:
pipe = pipeline(
"text-generation",
model=model_name,
device_map="auto",
torch_dtype="auto",
)
# the marker used to detect the final answer in the model output
ANSWER_MARKER = "**ANSWER**"
# the sentences that start each step of the step-by-step reasoning
rethink_prepends = [
"OK, I need to figure out ",
"I think ",
"Wait, I think ",
"Let me check if ",
"I should also remember that ",
"Another thing to note is that ",
"I also recall that ",
"I think I have a good grasp ",
"Now, using all the above information, I can answer the question using the original language used for the question:"
"\n{question}\n"
f"\n{ANSWER_MARKER}\n",
]
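# bot() feeds these prefixes to the model one at a time, letting it continue the
# assistant message after each one, so the "reasoning" is built step by step.
# The last entry re-injects the question and the ANSWER_MARKER to get the final answer.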
# delimiters passed to gr.Chatbot to fix some problems with math (KaTeX) display
latex_delimiters = [
{"left": "$$", "right": "$$", "display": True},
{"left": "$", "right": "$", "display": False},
]
def reformat_math(text):
"""Fix MathJax delimiters to use the Gradio syntax (Katex).
This is a workaround to display math formulas in Gradio. For now, I havn't found a way to
make it work as expected using others latex_delimiters...
"""
text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
return text
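# illustrative example of the substitutions above:
#   reformat_math(r"Euler: \[ e^{i\pi} + 1 = 0 \] and inline \( x \)")
#   -> "Euler: $$e^{i\pi} + 1 = 0$$ and inline $x$"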
def user_input(message, history: list):
"""Append the user input in the history and clean the input textbox"""
return "", history + [
gr.ChatMessage(role="user", content=message.replace(ANSWER_MARKER, ""))
]
def rebuild_messages(history: list):
"""Rebuid the messages from the history to be used by the model without the intermediate thoughs"""
messages = []
for h in history:
if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
messages.append(h)
elif (
isinstance(h, gr.ChatMessage)
and h.metadata.get("title")
and isinstance(h.content, str)
):
messages.append({"role": h.role, "content": h.content})
return messages
@spaces.GPU
def bot(history: list, max_num_tokens: int, final_num_tokens: int):
"""Make the model answering the question"""
# to get token as a stream, later in a thread
streamer = transformers.TextIteratorStreamer(
pipe.tokenizer, # pyright: ignore
skip_special_tokens=True,
skip_prompt=True,
)
# to reinsert the question in the reasoning if needed
question = history[-1]["content"]
# prepare the assistant message
history.append(
gr.ChatMessage(
role="assistant",
content=str(""),
metadata={"title": "🧠 Thinking...", "status": "pending"},
)
)
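    # a ChatMessage with a metadata "title" is rendered by gr.Chatbot as a
    # collapsible "thought" bubble instead of a normal reply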
    # for the moment, display the reasoning in the chat
messages = rebuild_messages(history)
for i, prepend in enumerate(rethink_prepends):
if i > 0:
messages[-1]["content"] += "\n\n"
messages[-1]["content"] += prepend.format(question=question)
num_tokens = int(
max_num_tokens if ANSWER_MARKER not in prepend else final_num_tokens
)
t = threading.Thread(
target=pipe,
args=(messages,),
kwargs=dict(
max_new_tokens=num_tokens,
streamer=streamer,
),
)
t.start()
# rebuild the history with the new content
history[-1].content += prepend.format(question=question)
if ANSWER_MARKER in prepend:
            history[-1].metadata = {"title": "💭 Thoughts", "status": "done"}
# stop thinking, this is the answer now (no metadata for intermediate steps)
history.append(gr.ChatMessage(role="assistant", content=""))
for token in streamer:
history[-1].content += token
history[-1].content = reformat_math(history[-1].content)
yield history
t.join()
yield history
with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
with gr.Row(scale=1):
with gr.Column(scale=5):
gr.Markdown(f"""
# Force reasoning for any model
            This is a simple proof of concept to get any LLM to reason ahead of its response.

            This interface uses the *{model_name}* model, which is **not** a reasoning model. The method
            used only forces some "reasoning" steps with prefixes to help the model enhance its answer.

            See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
""")
chatbot = gr.Chatbot(
scale=1,
type="messages",
latex_delimiters=latex_delimiters,
)
msg = gr.Textbox(
submit_btn=True,
label="",
show_label=False,
placeholder="Type your question here.",
autofocus=True,
)
with gr.Column(scale=1):
gr.Markdown("""## Tweaks""")
num_tokens = gr.Slider(
50,
255,
100,
step=1,
label="Max tokens per reasoning step",
interactive=True,
)
final_num_tokens = gr.Slider(
50,
255,
200,
step=1,
label="Max token for the final answer",
interactive=True,
)
gr.Markdown("""
            Using a smaller number of tokens in the reasoning steps makes the model
            answer faster, but it may not be able to go deep enough in its reasoning.
            A good value is 100.

            Using a smaller number of tokens for the final answer makes the model
            less verbose, but it may not be able to give a complete answer.
            A good value is 200 to 255.
""")
gr.Markdown("""
            This interface can run on a personal computer with 6 GB of VRAM (e.g. an NVIDIA 3050/3060 laptop GPU).
            Feel free to fork the application and try other instruct models.
""")
    # when the user submits a message, the bot answers
msg.submit(
user_input,
[msg, chatbot], # inputs
[msg, chatbot], # outputs
).then(
bot,
[chatbot, num_tokens, final_num_tokens], # actually, the "history" input
chatbot, # to store the new history from the output
)
if __name__ == "__main__":
demo.queue().launch()