Steph254 committed
Commit 609a610 · verified · 1 Parent(s): 6d76df7

Update app.py

Files changed (1)
  1. app.py +9 -16
app.py CHANGED
@@ -18,29 +18,22 @@ MODEL_PATH = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8" # Directly using
 LLAMA_GUARD_NAME = "meta-llama/Llama-Guard-3-1B-INT4"
 
 # Function to load Llama model (without LoRA)
-# Load Model Manually (for Quantized Models)
 def load_quantized_model(model_path):
     print(f"🔄 Loading Quantized Model: {model_path}")
 
-    # Load config file manually
-    from transformers import LlamaConfig
-    config = LlamaConfig.from_pretrained(model_path)
-
-    # Initialize model
-    model = LlamaForCausalLM(config)
-
-    # Load quantized state_dict
-    checkpoint_path = os.path.join(model_path, "consolidated.00.pth")
-    state_dict = torch.load(checkpoint_path, map_location="cpu")
-
-    # Load state dict into model
-    model.load_state_dict(state_dict, strict=False)
+    # Use Hugging Face transformers to load the quantized model directly
+    model = LlamaForCausalLM.from_pretrained(
+        model_path,
+        device_map="auto",          # Auto-distributes across CPU/GPU
+        torch_dtype=torch.float16,  # Reduces memory usage
+        low_cpu_mem_usage=True      # Optimized RAM loading
+    )
 
     print("✅ Quantized model loaded successfully!")
     return model
 
 # Load Tokenizer
-tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, token=HUGGINGFACE_TOKEN)
+tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, token=HUGGINGFACE_TOKEN, legacy=False)
 
 # Load the model
 model = load_quantized_model(MODEL_PATH)
@@ -104,7 +97,7 @@ def generate_response(prompt_type, **kwargs):
     with torch.no_grad():
         outputs = model.generate(
             inputs.input_ids,
-            max_length=1024,
+            max_length=512,
             temperature=0.7 if prompt_type == "project_analysis" else 0.5,
             top_p=0.9,
             do_sample=True
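
For reference, the loading and generation path after this commit roughly amounts to the sketch below. This is a minimal standalone approximation, not the full app.py: it assumes the repository weights are in standard transformers format, omits the HUGGINGFACE_TOKEN / Llama Guard handling, and hard-codes a sample prompt instead of the per-prompt_type logic.

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

MODEL_PATH = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"

# Load the model via from_pretrained instead of manually reading consolidated.00.pth
model = LlamaForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",          # spread weights across available CPU/GPU memory
    torch_dtype=torch.float16,  # half precision to reduce memory use
    low_cpu_mem_usage=True,     # avoid materializing a full extra copy in RAM
)

tokenizer = LlamaTokenizer.from_pretrained(MODEL_PATH, legacy=False)

# Sample prompt (hypothetical; app.py builds its prompts per prompt_type)
inputs = tokenizer("Summarize this project in one paragraph.", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(
        inputs.input_ids,
        max_length=512,  # shortened from 1024 in this commit
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True))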