last try
app.py
CHANGED
@@ -24,19 +24,19 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for computation to save memory
 )
 
-# Custom device map to offload non-critical components
-custom_device_map = {
-    "transformer": "cuda",  # Keep transformer layers on GPU
-    "lm_head": "cpu",  # Offload language model head to CPU
-}
+# # Custom device map to offload non-critical components
+# custom_device_map = {
+#     "transformer": "cuda",  # Keep transformer layers on GPU
+#     "lm_head": "cpu",  # Offload language model head to CPU
+# }
 
 # Load tokenizer and model
 tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     quantization_config=quantization_config,  # Apply quantization
-    device_map=custom_device_map,  # Use custom device map
+    device_map="auto",  # Automatically map to available devices
+    # device_map=custom_device_map,  # Use custom device map
     torch_dtype=torch.bfloat16,
     token=HF_TOKEN,
     max_position_embeddings=4096  # Reduce context window to 4k tokens (from 128k)
@@ -46,7 +46,7 @@ model = AutoModelForCausalLM.from_pretrained(
 def generate_response(user_input, max_new_tokens, temperature):
     messages = [{"role": "user", "content": user_input}]
     input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
-    input_ids = input_ids.to(
+    input_ids = input_ids.to("cuda" if torch.cuda.is_available() else "cpu")  # Dynamic device placement
     gen_tokens = model.generate(
         input_ids = input_ids,
         max_new_tokens=max_new_tokens,