import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# --- Configuration ---
MODEL_ID = "microsoft/bitnet-b1.58-2B-4T"

# Try 'cuda' if you have a GPU Space, 'cpu' otherwise (will be slow)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# --- Load Model and Tokenizer ---
# Note: Loading might require trust_remote_code=True or other flags depending on
# the model implementation. Check the model card on Hugging Face. You might also
# need specific quantization configs if they are not handled automatically.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    # Adjust loading parameters as needed (e.g., torch_dtype, device_map)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,  # Or float16; adjust based on hardware/model requirements
        device_map="auto",           # Automatically distribute across available devices (GPU/CPU)
        trust_remote_code=True,      # May be required for some custom model code
    )
    # model.to(DEVICE)  # Usually handled by device_map="auto"
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model or tokenizer: {e}")
    # Exit if loading fails so the Space surfaces the error instead of serving a broken app
    raise SystemExit("Failed to load model/tokenizer.")
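
# Note (assumption, per the model card; verify for your setup): plain transformers runs
# the 1.58-bit weights without the specialized low-bit kernels, so memory use and speed
# here are roughly bf16-level; the dedicated bitnet.cpp runtime is what provides the
# actual 1.58-bit efficiency gains.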

# --- Chat Processing Function ---
def predict(message, history):
    """
    Generates a streaming response to the user's message, using the chat history.
    """
    history_transformer_format = []
    for human, assistant in history:
        # Basic alternating user/assistant format - adjust if the model expects something different
        history_transformer_format.append({"role": "user", "content": human})
        history_transformer_format.append({"role": "assistant", "content": assistant})

    # Add the current user message
    history_transformer_format.append({"role": "user", "content": message})

    # Use the tokenizer's chat template if available, otherwise fall back to manual formatting.
    # Base models might not have a chat template.
    try:
        prompt = tokenizer.apply_chat_template(
            history_transformer_format,
            tokenize=False,
            add_generation_prompt=True,  # Appends the assistant prefix so the model continues as the assistant
        )
    except Exception:
        # Manual fallback prompt formatting (example - adjust as needed!)
        print("Warning: Using basic manual prompt formatting.")
        prompt_parts = ["Chat History:"]
        for turn in history_transformer_format:
            prompt_parts.append(f"{turn['role'].capitalize()}: {turn['content']}")
        prompt = "\n".join(prompt_parts) + "\nAssistant:"  # End with the assistant prefix, ready for generation
print(f"\n--- Prompt Sent to Model ---\n{prompt}\n---------------------------\n") | |
# Use a streamer for interactive generation | |
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) | |
inputs = tokenizer(prompt, return_tensors="pt").to(model.device) | |
generation_kwargs = dict( | |
inputs, | |
streamer=streamer, | |
max_new_tokens=512, | |
do_sample=True, | |
top_p=0.9, | |
temperature=0.7, | |
# Add other generation parameters as needed | |
# eos_token_id=tokenizer.eos_token_id # Important if model needs it | |
pad_token_id=tokenizer.eos_token_id # Often set for open-end generation | |
) | |
# Run generation in a separate thread for streaming | |
thread = Thread(target=model.generate, kwargs=generation_kwargs) | |
thread.start() | |
# Yield tokens as they become available | |
partial_message = "" | |
for new_token in streamer: | |
partial_message += new_token | |
yield partial_message | |

# --- Gradio Interface ---
# gr.ChatInterface handles history management automatically
chatbot_interface = gr.ChatInterface(
    fn=predict,
    chatbot=gr.Chatbot(height=500),
    textbox=gr.Textbox(placeholder="Ask me anything...", container=False, scale=7),
    title="Chat with microsoft/bitnet-b1.58-2B-4T",
    description="A basic chat interface for the BitNet 1.58-bit 2B parameter model. Remember it's a base model, so prompting matters!",
    theme="soft",
    examples=[["Hello!"], ["Explain the concept of 1.58-bit quantization."]],
    cache_examples=False,  # Set to True to cache example results
    # The three button kwargs below exist in Gradio 4.x but were removed in Gradio 5;
    # drop them if the Space runs a newer Gradio version.
    retry_btn=None,
    undo_btn="Delete Previous Turn",
    clear_btn="Clear Chat",
)

# --- Launch the Interface ---
if __name__ == "__main__":
    chatbot_interface.launch()  # Use share=True for a public link when running locally
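    # If streaming appears to stall on an older Gradio release, enabling the queue
    # explicitly may help (it is on by default in recent versions):
    # chatbot_interface.queue().launch()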