Spaces:

Tonic
/

Command-A

Runtime error

Tonic committed on Mar 13

Commit

1584225

unverified ·

1 Parent(s): d559f10

reduce memory footprint bfloat16

Files changed (1) hide show

app.py CHANGED Viewed

@@ -19,8 +19,9 @@ model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,  # Enable 4-bit quantization
     bnb_4bit_quant_type="fp4",  # Use FP4 quantization
-    bnb_4bit_use_double_quant=True#,  # Optional: double quantization for better precision
     # llm_int8_enable_fp32_cpu_offload=True  # Allow CPU offloading for 32-bit modules
 )
 # Load tokenizer and model
@@ -31,7 +32,7 @@ model = AutoModelForCausalLM.from_pretrained(
     # device_map="auto",  # Automatically map to available devices
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
-    max_position_embeddings=8192  # Reduce context window to 8k tokens (from 128k)
 )
 @spaces.GPU

 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,  # Enable 4-bit quantization
     bnb_4bit_quant_type="fp4",  # Use FP4 quantization
+    bnb_4bit_use_double_quant=True,  # Optional: double (nested) quantization of the quantization constants to save additional memory
     # llm_int8_enable_fp32_cpu_offload=True  # Allow CPU offloading for 32-bit modules
+    bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation to save memory
 )
 # Load tokenizer and model
     # device_map="auto",  # Automatically map to available devices
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
+    max_position_embeddings=4096  # Reduce context window to 4k tokens (from 128k)
 )
 @spaces.GPU