Tonic committed
Commit 8c33a08 (unverified)
1 Parent(s): 098ce94

memory efficient loading

Files changed (1)
  1. app.py +37 -26
app.py CHANGED
@@ -15,43 +15,57 @@ Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder
 
 model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
 
-# Define quantization config for 4-bit
+# Define quantization config with CPU offloading support
 quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,  # Enable 4-bit quantization
-    bnb_4bit_quant_type="fp4",  # Use FP4 quantization
-    bnb_4bit_use_double_quant=True,  # Optional: double quantization for better precision
-    # llm_int8_enable_fp32_cpu_offload=True  # Allow CPU offloading for 32-bit modules
-    bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation to save memory
+    load_in_4bit=True,
+    bnb_4bit_quant_type="fp4",
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading
 )
 
-# # Custom device map to offload non-critical components
-# custom_device_map = {
-#     "transformer": "cuda",  # Keep transformer layers on GPU
-#     "lm_head": "cpu",  # Offload language model head to CPU
-# }
+# Custom device map to split model across GPU and CPU
+custom_device_map = {
+    "transformer.word_embeddings": "cuda",
+    "transformer.h": "cuda",  # Main transformer layers on GPU
+    "transformer.ln_f": "cpu",  # Layer norm to CPU
+    "lm_head": "cpu"  # Language model head to CPU
+}
 
 # Load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    quantization_config=quantization_config,  # Apply quantization
-    device_map="auto",  # Automatically map to available devices
-    # device_map=custom_device_map,  # Use custom device map
+    quantization_config=quantization_config,
+    device_map=custom_device_map,  # Use custom device mapping
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
-    max_position_embeddings=4096  # Reduce context window to 8k tokens (from 128k)
+    max_position_embeddings=8192  # Adjusted to 8k tokens for memory efficiency
 )
 
 @spaces.GPU
 def generate_response(user_input, max_new_tokens, temperature):
     messages = [{"role": "user", "content": user_input}]
-    input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-    input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")  # Dynamic device placement
+    input_ids = tokenizer.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    )
+
+    # Move inputs to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    input_ids = input_ids.to(device)
+
+    # Generate with memory-efficient settings
     gen_tokens = model.generate(
-        input_ids = input_ids,
+        input_ids=input_ids,
         max_new_tokens=max_new_tokens,
-        do_sample=True,
+        do_sample=True,
         temperature=temperature,
+        pad_token_id=tokenizer.eos_token_id,
+        # Add memory-efficient parameters
+        max_length=min(4000, max_new_tokens + input_ids.shape[-1]),  # Cap at context length
     )
 
     gen_text = tokenizer.decode(gen_tokens[0], skip_special_tokens=True)
@@ -60,12 +74,10 @@ def generate_response(user_input, max_new_tokens, temperature):
 
     return gen_text
 
-
-
 examples = [
-    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
-    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
-    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4}
+    {"message": "What is the weather like today?", "max_new_tokens": 250, "temperature": 0.5},
+    {"message": "Tell me a joke.", "max_new_tokens": 650, "temperature": 0.7},
+    {"message": "Explain the concept of machine learning.", "max_new_tokens": 980, "temperature": 0.4}
 ]
 example_choices = [f"Example {i+1}" for i in range(len(examples))]
 
@@ -74,7 +86,6 @@ def load_example(choice):
     example = examples[index]
     return example["message"], example["max_new_tokens"], example["temperature"]
 
-
 with gr.Blocks() as demo:
     gr.Markdown(title)
     with gr.Row():
@@ -97,4 +108,4 @@ with gr.Blocks() as demo:
         outputs=[message_box, max_new_tokens_slider, temperature_slider]
     )
 
-demo.launch(ssr_mode=False)
+demo.launch(ssr_mode=False)
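
For reference, here is a minimal self-contained sketch of the loading pattern this commit moves to: 4-bit FP4 weights with double quantization, bf16 compute, fp32 CPU offload, and a GPU/CPU split. It is illustrative only and not part of the commit; the max_memory budget is a hypothetical placeholder, and device_map="auto" is used instead of a hand-written map because valid device-map keys must match the model's actual module names (the chosen placement can be inspected via model.hf_device_map after loading).

# Illustrative sketch only (not the committed app code).
# Assumes HF_TOKEN is set in the environment and enough CPU RAM for offloaded modules.
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
hf_token = os.environ.get("HF_TOKEN")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="fp4",              # FP4 quantization ("nf4" is the common alternative)
    bnb_4bit_use_double_quant=True,         # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # run compute in bf16
    llm_int8_enable_fp32_cpu_offload=True,  # allow fp32 modules to live on the CPU
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",                        # let accelerate place modules; an explicit dict also works
    max_memory={0: "40GiB", "cpu": "64GiB"},  # hypothetical budget; tune to the actual hardware
    token=hf_token,
)

# Check where each module landed and the resulting weight footprint.
print(model.hf_device_map)
print(f"weights: {model.get_memory_footprint() / 1e9:.1f} GB")

Offloading layers to the CPU trades generation speed for fitting within VRAM; on the generation side, capping the output length (the new max_length / max_new_tokens settings) bounds KV-cache growth, which is where memory climbs during decoding.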