Spaces:

Steph254
/

demo_1

Runtime error

Steph254 committed on Mar 18

Commit

609a610

verified ·

1 Parent(s): 6d76df7

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -18,29 +18,22 @@ MODEL_PATH = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"  # Directly using
 LLAMA_GUARD_NAME = "meta-llama/Llama-Guard-3-1B-INT4"
 # Function to load Llama model (without LoRA)
-# Load Model Manually (for Quantized Models)
 def load_quantized_model(model_path):
     print(f"🔄 Loading Quantized Model: {model_path}")
-    # Load config file manually
-    from transformers import LlamaConfig
-    config = LlamaConfig.from_pretrained(model_path)
-    # Initialize model
-    model = LlamaForCausalLM(config)
-    # Load quantized state_dict
-    checkpoint_path = os.path.join(model_path, "consolidated.00.pth")
-    state_dict = torch.load(checkpoint_path, map_location="cpu")
-    # Load state dict into model
-    model.load_state_dict(state_dict, strict=False)
     print("✅ Quantized model loaded successfully!")
     return model
 # Load Tokenizer
-tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, token=HUGGINGFACE_TOKEN)
 # Load the model
 model = load_quantized_model(MODEL_PATH)
@@ -104,7 +97,7 @@ def generate_response(prompt_type, **kwargs):
     with torch.no_grad():
         outputs = model.generate(
             inputs.input_ids,
-            max_length=1024,
             temperature=0.7 if prompt_type == "project_analysis" else 0.5,
             top_p=0.9,
             do_sample=True

 LLAMA_GUARD_NAME = "meta-llama/Llama-Guard-3-1B-INT4"
 # Function to load Llama model (without LoRA)
 def load_quantized_model(model_path):
     print(f"🔄 Loading Quantized Model: {model_path}")
+    # Use Hugging Face transformers to load the quantized model directly
+    model = LlamaForCausalLM.from_pretrained(
+        model_path,
+        device_map="auto",  # Auto-distributes across CPU/GPU
+        torch_dtype=torch.float16,  # Reduces memory usage
+        low_cpu_mem_usage=True  # Optimized RAM loading
+    )
     print("✅ Quantized model loaded successfully!")
     return model
 # Load Tokenizer
+tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, token=HUGGINGFACE_TOKEN, legacy=False)
 # Load the model
 model = load_quantized_model(MODEL_PATH)
     with torch.no_grad():
         outputs = model.generate(
             inputs.input_ids,
+            max_length=512,
             temperature=0.7 if prompt_type == "project_analysis" else 0.5,
             top_p=0.9,
             do_sample=True