Tonic committed on
Commit
ac9fe9d
unverified
1 Parent(s): 1584225

reduce memory footprint bfloat16

Browse files
Files changed (1) hide show
  1. app.py +7 -0
app.py CHANGED
@@ -24,12 +24,19 @@ quantization_config = BitsAndBytesConfig(
24
  bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
25
  )
26
 
 
 
 
 
 
 
27
  # Load tokenizer and model
28
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
29
  model = AutoModelForCausalLM.from_pretrained(
30
  model_id,
31
  quantization_config=quantization_config, # Apply quantization
32
  # device_map="auto", # Automatically map to available devices
 
33
  torch_dtype=torch.bfloat16,
34
  token=HF_TOKEN,
35
  max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)
 
24
  bnb_4bit_compute_dtype=torch.bfloat16 # Use bfloat16 for computation to save memory
25
  )
26
 
27
+ # Custom device map to offload non-critical components
28
+ custom_device_map = {
29
+ "transformer": "cuda", # Keep transformer layers on GPU
30
+ "lm_head": "cpu", # Offload language model head to CPU
31
+ }
32
+
33
  # Load tokenizer and model
34
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
35
  model = AutoModelForCausalLM.from_pretrained(
36
  model_id,
37
  quantization_config=quantization_config, # Apply quantization
38
  # device_map="auto", # Automatically map to available devices
39
+ device_map=custom_device_map, # Use custom device map
40
  torch_dtype=torch.bfloat16,
41
  token=HF_TOKEN,
42
  max_position_embeddings=4096 # Reduce context window to 4k tokens (from 128k)