Tonic committed on
Commit
098ce94
unverified
1 Parent(s): ac9fe9d
Files changed (1)
  1. app.py +8 -8
app.py CHANGED
@@ -24,19 +24,19 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation to save memory
 )
 
-# Custom device map to offload non-critical components
-custom_device_map = {
-    "transformer": "cuda",  # Keep transformer layers on GPU
-    "lm_head": "cpu",  # Offload language model head to CPU
-}
+# # Custom device map to offload non-critical components
+# custom_device_map = {
+#     "transformer": "cuda",  # Keep transformer layers on GPU
+#     "lm_head": "cpu",  # Offload language model head to CPU
+# }
 
 # Load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     quantization_config=quantization_config,  # Apply quantization
-    # device_map="auto",  # Automatically map to available devices
-    device_map=custom_device_map,  # Use custom device map
+    device_map="auto",  # Automatically map to available devices
+    # device_map=custom_device_map,  # Use custom device map
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
     max_position_embeddings=4096  # Reduce context window to 4k tokens (from 128k)
@@ -46,7 +46,7 @@ model = AutoModelForCausalLM.from_pretrained(
 def generate_response(user_input, max_new_tokens, temperature):
     messages = [{"role": "user", "content": user_input}]
     input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-    input_ids = input_ids.to(model.device)
+    input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")  # Dynamic device placement
     gen_tokens = model.generate(
         input_ids=input_ids,
         max_new_tokens=max_new_tokens,
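For context: the first hunk drops the hand-written device map in favor of accelerate's automatic placement. Below is a minimal sketch of the resulting loading path; model_id, the HF_TOKEN lookup, and load_in_4bit=True are assumptions here, since the commit shows only part of app.py.

import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_id = "CohereForAI/c4ai-command-r-v01"  # assumed placeholder; app.py defines its own model_id
HF_TOKEN = os.environ.get("HF_TOKEN")  # assumed placeholder; app.py reads its own token

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # assumed: the bnb_4bit_* option above implies 4-bit loading
    bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation to save memory
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,  # Apply quantization
    device_map="auto",  # accelerate picks the GPU/CPU split per layer
    torch_dtype=torch.bfloat16,
    token=HF_TOKEN,
    max_position_embeddings=4096,  # Reduce context window to 4k tokens
)

Since device_map="auto" already handles partial CPU offload when VRAM runs short, the per-module map pinning lm_head to "cpu" could simply be commented out rather than tuned further.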
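The second hunk only changes where the prompt tensor lands before generation. A hedged sketch of generate_response after this commit; everything past the truncated lines (do_sample, the decode step) is an assumption:

def generate_response(user_input, max_new_tokens, temperature):
    messages = [{"role": "user", "content": user_input}]
    input_ids = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    )
    # Send inputs to the GPU when one is visible; on CPU-only hardware this
    # avoids the device-mismatch error a hard-coded .to("cuda") would raise.
    input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")
    gen_tokens = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        do_sample=True,  # assumed: sampling is the usual companion to a temperature control
        temperature=temperature,
    )
    # Decode only the newly generated tokens, not the echoed prompt.
    return tokenizer.decode(gen_tokens[0, input_ids.shape[-1]:], skip_special_tokens=True)

The pre-commit input_ids.to(model.device) is the other idiomatic choice: with device_map="auto" it typically resolves to the GPU holding the model's first layers, so either form works when a GPU is present.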