Steph254 committed
Commit 8c6c12f · verified · 1 Parent(s): be4cb79

Update app.py

Files changed (1): app.py +12 -18
app.py CHANGED
@@ -13,37 +13,31 @@ if not HUGGINGFACE_TOKEN:
 
 print("✅ HUGGINGFACE_TOKEN is set.")
 
-# Model Paths (Replace with your actual Hugging Face Model Names)
-BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
-QLORA_ADAPTER = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"
+# Model Paths
+QUANTIZED_MODEL = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"  # Directly using quantized model
 LLAMA_GUARD_NAME = "meta-llama/Llama-Guard-3-1B-INT4"
 
-# Function to load Llama model
-def load_llama_model(base_model=BASE_MODEL, adapter=None, is_guard=False):
-    print(f"🔄 Loading Model: {base_model}")
+# Function to load Llama model (without LoRA)
+def load_llama_model(model_name, is_guard=False):
+    print(f"🔄 Loading Model: {model_name}")
 
-    tokenizer = AutoTokenizer.from_pretrained(base_model, token=HUGGINGFACE_TOKEN)
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGINGFACE_TOKEN)
     model = AutoModelForCausalLM.from_pretrained(
-        base_model,
+        model_name,
         token=HUGGINGFACE_TOKEN,
-        torch_dtype=torch.float32,  # Using float32 for CPU compatibility
+        torch_dtype=torch.float16,  # Use float16 for optimized performance
         low_cpu_mem_usage=True
     )
 
-    if adapter and not is_guard:
-        print(f"🔄 Loading Adapter: {adapter}")
-        model = PeftModel.from_pretrained(model, adapter, token=HUGGINGFACE_TOKEN)
-        model = model.merge_and_unload()
-        print("✅ Adapter Loaded Successfully")
-
     model.eval()
+    print("✅ Model Loaded Successfully")
     return tokenizer, model
 
-# Load Llama 3.2 model
-tokenizer, model = load_llama_model(adapter=QLORA_ADAPTER)
+# Load the quantized Llama model
+tokenizer, model = load_llama_model(QUANTIZED_MODEL)
 
 # Load Llama Guard for content moderation
-guard_tokenizer, guard_model = load_llama_model(base_model=LLAMA_GUARD_NAME, is_guard=True)
+guard_tokenizer, guard_model = load_llama_model(LLAMA_GUARD_NAME, is_guard=True)
 
 # Define Prompt Templates
 PROMPTS = {
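For context, here is a self-contained sketch of how the model-loading section reads after this commit. The preamble (imports, environment-variable lookup, and the exact error raised by the `if not HUGGINGFACE_TOKEN:` guard visible in the hunk header) sits outside the hunk, so those parts are assumptions, as is the short generation smoke test at the end; everything else mirrors the new side of the diff.

```python
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed preamble: the hunk header shows an `if not HUGGINGFACE_TOKEN:` guard,
# so the token presumably comes from the environment; the exact error handling
# here is hypothetical.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
if not HUGGINGFACE_TOKEN:
    raise RuntimeError("HUGGINGFACE_TOKEN is not set.")
print("✅ HUGGINGFACE_TOKEN is set.")

# Model paths, as in the new version of app.py
QUANTIZED_MODEL = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"
LLAMA_GUARD_NAME = "meta-llama/Llama-Guard-3-1B-INT4"

def load_llama_model(model_name, is_guard=False):
    """Load a tokenizer/model pair; no LoRA adapter step remains after this commit."""
    print(f"🔄 Loading Model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGINGFACE_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=HUGGINGFACE_TOKEN,
        torch_dtype=torch.float16,  # as in the commit; halves memory vs. float32
        low_cpu_mem_usage=True,
    )
    model.eval()
    print("✅ Model Loaded Successfully")
    return tokenizer, model

tokenizer, model = load_llama_model(QUANTIZED_MODEL)
guard_tokenizer, guard_model = load_llama_model(LLAMA_GUARD_NAME, is_guard=True)

# Illustrative smoke test (not in the diff): run a short generation.
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Note that after dropping the adapter branch, `is_guard` no longer changes any behavior inside the function; it survives only as call-site documentation, and the `PeftModel` import the old branch relied on is no longer needed.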