# Command-A / app.py
import spaces  # ZeroGPU helper; import before any CUDA initialization
import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face access token, read from the environment (set as a Space secret)
HF_TOKEN = os.environ.get("HF_TOKEN")
title = """
# Welcome to 🌟Tonic's🫡Command-A
🫡Command-A is a Large Language Model optimized for conversational interaction and long context tasks. It targets the “scalable” category of models that balance high performance with strong accuracy, enabling companies to move beyond proof of concept, and into production. 🫡Command-A boasts high precision on retrieval augmented generation (RAG) and tool use tasks, low latency and high throughput, a long 128k context, and strong capabilities across 10 key languages. You can build with this endpoint using🫡Command-R available here : [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01). You can also use 🫡Command-A by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/Command-A?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [DataTonic](https://huggingface.co/DataTonic)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
# 4-bit quantization config (the checkpoint was exported with FP4 weights)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,               # enable 4-bit quantization
    bnb_4bit_quant_type="fp4",       # FP4 quantization, matching the checkpoint
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants to save memory
    # llm_int8_enable_fp32_cpu_offload=True  # allow CPU offload of fp32 modules if GPU memory is tight
)
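# A minimal alternative sketch (an assumption, not used below): NF4 with an explicit
# bf16 compute dtype is a common BitsAndBytes variant; it would only matter when
# loading non-pre-quantized weights, since this checkpoint already ships FP4 weights.
# alt_quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",              # normal-float 4-bit instead of fp4
#     bnb_4bit_use_double_quant=True,         # nested quantization of the quant constants
#     bnb_4bit_compute_dtype=torch.bfloat16,  # run dequantized matmuls in bf16
# )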
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,  # apply 4-bit quantization
    # device_map="auto",                      # automatically map layers to available devices
    torch_dtype=torch.bfloat16,               # compute in bfloat16 for non-quantized parts
    token=HF_TOKEN,
    max_position_embeddings=8192,             # reduce context window to 8k tokens (from 128k)
)
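# Optional sanity check, a small addition: report the quantized model's memory footprint.
# (`get_memory_footprint` is a standard transformers PreTrainedModel helper.)
print(f"Model loaded; approx. memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")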
@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    )
    input_ids = input_ids.to(model.device)
    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
    )
    # Decode only the newly generated tokens; slicing off the prompt is more robust
    # than string-prefix stripping, since the chat template adds special tokens.
    gen_text = tokenizer.decode(gen_tokens[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return gen_text.strip()
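# Hypothetical direct usage for quick testing outside the UI (prompt/values are illustrative):
# print(generate_response("Explain retrieval-augmented generation in one sentence.",
#                         max_new_tokens=128, temperature=0.3))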
examples = [
    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4},
]
example_choices = [f"Example {i+1}" for i in range(len(examples))]

def load_example(choice):
    # Map the dropdown label back to the corresponding example preset
    index = example_choices.index(choice)
    example = examples[index]
    return example["message"], example["max_new_tokens"], example["temperature"]
with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        max_new_tokens_slider = gr.Slider(minimum=100, maximum=4000, value=980, label="Max New Tokens")
        temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.3, label="Temperature")
    message_box = gr.Textbox(lines=2, label="Your Message")
    generate_button = gr.Button("Try 🫡Command-A")
    output_box = gr.Textbox(label="🫡Command-A")
    generate_button.click(
        fn=generate_response,
        inputs=[message_box, max_new_tokens_slider, temperature_slider],
        outputs=output_box,
    )
    example_dropdown = gr.Dropdown(label="🫡Load Example", choices=example_choices)
    example_button = gr.Button("🫡Load")
    example_button.click(
        fn=load_example,
        inputs=example_dropdown,
        outputs=[message_box, max_new_tokens_slider, temperature_slider],
    )

demo.launch(ssr_mode=False)