Update app.py
app.py
CHANGED
@@ -24,7 +24,7 @@ def load_quantized_model(model_path):
     # Use Hugging Face transformers to load the quantized model directly
     model = LlamaForCausalLM.from_pretrained(
         model_path,
-        use_auth_token=
+        use_auth_token=HUGGINGFACE_TOKEN,
         device_map="auto",  # Auto-distributes across CPU/GPU
         torch_dtype=torch.float16,  # Reduces memory usage
         low_cpu_mem_usage=True  # Optimized RAM loading
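For context, a minimal sketch of the surrounding code as it might look after this change. The commit does not show where HUGGINGFACE_TOKEN is defined, so reading it from an environment variable here is an assumption, not part of the actual app.py.

# Sketch only: HUGGINGFACE_TOKEN's real source is not shown in this commit;
# loading it from an environment variable is an assumption for illustration.
import os

import torch
from transformers import LlamaForCausalLM

HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")  # assumed token source

def load_quantized_model(model_path):
    # Use Hugging Face transformers to load the quantized model directly
    model = LlamaForCausalLM.from_pretrained(
        model_path,
        use_auth_token=HUGGINGFACE_TOKEN,  # access token for gated/private repos
        device_map="auto",                 # Auto-distributes across CPU/GPU
        torch_dtype=torch.float16,         # Reduces memory usage
        low_cpu_mem_usage=True             # Optimized RAM loading
    )
    return model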