import spaces
import gradio as gr
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import os

HF_TOKEN = os.environ.get("HF_TOKEN")  # Hugging Face token, needed for gated/private model access

title = """
# Welcome to 🌟Tonic's🫡Command-A
🫡Command-A is a large language model optimized for conversational interaction and long-context tasks. It targets the “scalable” category of models that balance high performance with strong accuracy, enabling companies to move beyond proof of concept and into production. 🫡Command-A offers high precision on retrieval-augmented generation (RAG) and tool-use tasks, low latency and high throughput, a long 128k context, and strong capabilities across 10 key languages. You can also build with 🫡Command-R, available here: [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01), or use 🫡Command-A by cloning this space. Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/Command-A?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a>
Join us: 🌟TeamTonic🌟 is always making cool demos! Join our active builders' 🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Hugging Face: [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐GitHub: [Tonic-AI](https://github.com/tonic-ai) & contribute to 🌟[DataTonic](https://huggingface.co/DataTonic). 🤗 Big thanks to Yuvi Sharma and all the folks at Hugging Face for the community grant 🤗
"""

model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"

# Define quantization config with CPU offloading support
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="fp4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading
)
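
# Rough memory sketch (a hedged estimate, not measured here): 4-bit weights
# take about num_params / 2 bytes, so a model on the order of 111B parameters
# needs roughly 55 GB for weights alone, before activations and the KV cache.
# Double quantization (bnb_4bit_use_double_quant) saves a further ~0.4 bits
# per parameter by quantizing the quantization constants themselves.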

# Custom device map to split the model across GPU and CPU.
# Note: keys must match the checkpoint's actual module names; the names below
# assume the Cohere architecture in transformers (model.embed_tokens /
# model.layers / model.norm / lm_head).
custom_device_map = {
    "model.embed_tokens": "cuda",  # Token embeddings on GPU
    "model.layers": "cuda",        # Main transformer layers on GPU
    "model.norm": "cpu",           # Final layer norm on CPU
    "lm_head": "cpu"               # Language model head on CPU
}
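
# To check the module names for a given checkpoint without downloading the
# weights, a meta-device sketch like this can be used (hedged; requires
# accelerate, and stays commented out so the Space startup is unchanged):
#
#   from transformers import AutoConfig
#   from accelerate import init_empty_weights
#   config = AutoConfig.from_pretrained(model_id, token=HF_TOKEN)
#   with init_empty_weights():
#       meta_model = AutoModelForCausalLM.from_config(config)
#   print(sorted({name.split(".")[0] for name, _ in meta_model.named_modules() if name}))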

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map=custom_device_map,  # Use custom device mapping
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN,
    max_position_embeddings=8192  # Cap the context window at 8k tokens for memory efficiency
)
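
# Alternative placement (a hedged sketch): device_map="auto" lets accelerate
# assign modules itself, spilling to CPU as the GPU fills; it trades control
# for convenience versus the hand-written map above:
#
#   model = AutoModelForCausalLM.from_pretrained(
#       model_id,
#       quantization_config=quantization_config,
#       device_map="auto",
#       torch_dtype=torch.bfloat16,
#       token=HF_TOKEN,
#   )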

@spaces.GPU
def generate_response(user_input, max_new_tokens, temperature):
    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages, 
        tokenize=True, 
        add_generation_prompt=True, 
        return_tensors="pt"
    )
    
    # Move inputs to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    input_ids = input_ids.to(device)
    
    # Generate with memory-efficient settings. max_new_tokens alone bounds the
    # output length; setting max_length as well would conflict with it
    # (transformers gives max_new_tokens precedence and warns).
    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens; slicing off the prompt tokens is
    # more robust than string-matching the raw user input, since the chat
    # template rewrites the prompt before generation.
    gen_text = tokenizer.decode(gen_tokens[0][input_ids.shape[-1]:], skip_special_tokens=True)

    return gen_text.strip()
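
# Quick smoke test (a hedged sketch; in this Space the Gradio UI is the real
# caller, and @spaces.GPU expects the ZeroGPU runtime, so this stays commented):
#   print(generate_response("Say hello in French.", 64, 0.3))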

examples = [
    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4}
]
example_choices = [f"Example {i+1}" for i in range(len(examples))]

def load_example(choice):
    index = example_choices.index(choice)
    example = examples[index]
    return example["message"], example["max_new_tokens"], example["temperature"]

with gr.Blocks() as demo:
    gr.Markdown(title)
    with gr.Row():
        max_new_tokens_slider = gr.Slider(minimum=100, maximum=4000, value=980, label="Max New Tokens")
        temperature_slider = gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.3, label="Temperature")
    message_box = gr.Textbox(lines=2, label="Your Message")
    generate_button = gr.Button("Try🫡Command-A")
    output_box = gr.Textbox(label="🫡Command-A")

    generate_button.click(
        fn=generate_response,
        inputs=[message_box, max_new_tokens_slider, temperature_slider],
        outputs=output_box
    )
    example_dropdown = gr.Dropdown(label="🫡Load Example", choices=example_choices)
    example_button = gr.Button("🫡Load")
    example_button.click(
        fn=load_example,
        inputs=example_dropdown,
        outputs=[message_box, max_new_tokens_slider, temperature_slider]
    )

demo.launch(ssr_mode=False)