Spaces:

Steph254
/

demo_1

Runtime error

Steph254 committed on Mar 18

Commit

48bf8a4

verified ·

1 Parent(s): f2b9562

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import gradio as gr
 import torch
 import json
-from transformers import LlamaTokenizer, LlamaForCausalLM
 from peft import PeftModel
 # Set Hugging Face Token for Authentication
@@ -21,14 +21,25 @@ LLAMA_GUARD_NAME = "meta-llama/Llama-Guard-3-1B-INT4"
 def load_quantized_model(model_path):
     print(f"🔄 Loading Quantized Model: {model_path}")
-    # Use Hugging Face transformers to load the quantized model directly
-    model = LlamaForCausalLM.from_pretrained(
-        model_path,
-        use_auth_token=HUGGINGFACE_TOKEN,
-        device_map="auto",  # Auto-distributes across CPU/GPU
-        torch_dtype=torch.float16,  # Reduces memory usage
-        low_cpu_mem_usage=True  # Optimized RAM loading
-    )
     print("✅ Quantized model loaded successfully!")
     return model

 import gradio as gr
 import torch
 import json
+from transformers import LlamaTokenizer, LlamaForCausalLM, LlamaConfig
 from peft import PeftModel
 # Set Hugging Face Token for Authentication
 def load_quantized_model(model_path):
     print(f"🔄 Loading Quantized Model: {model_path}")
+    # Load the config manually
+    config = LlamaConfig.from_pretrained(model_path)
+    # Initialize model
+    model = LlamaForCausalLM(config)
+    # Load the quantized weights manually
+    checkpoint_path = os.path.join(model_path, "consolidated.00.pth")
+    if not os.path.exists(checkpoint_path):
+        raise FileNotFoundError(f"❌ Checkpoint file not found: {checkpoint_path}")
+    state_dict = torch.load(checkpoint_path, map_location="cpu")
+    # Load the state dict into the model
+    model.load_state_dict(state_dict, strict=False)
+    # Move model to GPU if available
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model.to(device)
     print("✅ Quantized model loaded successfully!")
     return model