last try
app.py
CHANGED
@@ -24,19 +24,19 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation to save memory
 )
 
-# Custom device map to offload non-critical components
-custom_device_map = {
-    "transformer": "cuda",  # Keep transformer layers on GPU
-    "lm_head": "cpu",  # Offload language model head to CPU
-}
+# # Custom device map to offload non-critical components
+# custom_device_map = {
+#     "transformer": "cuda",  # Keep transformer layers on GPU
+#     "lm_head": "cpu",  # Offload language model head to CPU
+# }
 
 # Load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     quantization_config=quantization_config,  # Apply quantization
-    device_map=custom_device_map,  # Use custom device map
+    device_map="auto",  # Automatically map to available devices
+    # device_map=custom_device_map,  # Use custom device map
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
     max_position_embeddings=4096  # Reduce context window to 4k tokens (from 128k)
@@ -46,7 +46,7 @@ model = AutoModelForCausalLM.from_pretrained(
 def generate_response(user_input, max_new_tokens, temperature):
     messages = [{"role": "user", "content": user_input}]
     input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-    input_ids = input_ids.to(
+    input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")  # Dynamic device placement
     gen_tokens = model.generate(
         input_ids = input_ids,
         max_new_tokens=max_new_tokens,