Tonic committed on
Commit
1584225
unverified
1 Parent(s): d559f10

reduce memory footprint bfloat16

Browse files
Files changed (1) hide show
  1. app.py +3 -2
app.py CHANGED
@@ -19,8 +19,9 @@ model_id = "Tonic/c4ai-command-a-03-2025-4bit_fp4"
19
  quantization_config = BitsAndBytesConfig(
20
  load_in_4bit=True, # Enable 4-bit quantization
21
  bnb_4bit_quant_type="fp4", # Use FP4 quantization
22
- bnb_4bit_use_double_quant=True#, # Optional: double quantization for better precision
23
  # llm_int8_enable_fp32_cpu_offload=True # Allow CPU offloading for 32-bit modules
 
24
  )
25
 
26
  # Load tokenizer and model
@@ -31,7 +32,7 @@ model = AutoModelForCausalLM.from_pretrained(
31
  # device_map="auto", # Automatically map to available devices
32
  torch_dtype=torch.bfloat16,
33
  token=HF_TOKEN,
34
- max_position_embeddings=8192 # Reduce context window to 8k tokens (from 128k)
35
  )
36
 
37
  @spaces.GPU
 
19
  quantization_config = BitsAndBytesConfig(
20
  load_in_4bit=True, # Enable 4-bit quantization
21
  bnb_4bit_quant_type="fp4", # Use FP4 quantization
22
+ bnb_4bit_use_double_quant=True, # Optional: double quantization for better precision
23
  # llm_int8_enable_fp32_cpu_offload=True # Allow CPU offloading for 32-bit modules
24
+ bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
25
  )
26
 
27
  # Load tokenizer and model
 
32
  # device_map="auto", # Automatically map to available devices
33
  torch_dtype=torch.bfloat16,
34
  token=HF_TOKEN,
35
+ max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)
36
  )
37
 
38
  @spaces.GPU