Steph254 committed on
Commit
3ecadea
·
verified ·
1 Parent(s): b94b847

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -14
app.py CHANGED
@@ -2,27 +2,39 @@ import os
2
  import gradio as gr
3
  import torch
4
  import json
5
- from transformers import AutoTokenizer
 
6
 
7
  # Set Hugging Face Token for Authentication (ensure it's set in your environment)
8
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
9
 
 
 
 
 
10
  # Function to load Llama model
11
- def load_llama_model(model_name):
12
- from transformers import LlamaForCausalLM, LlamaTokenizer
13
-
14
- # Use AutoTokenizer which will handle various tokenizer types
15
- tokenizer = AutoTokenizer.from_pretrained(model_name, token=HUGGINGFACE_TOKEN, use_fast=False)
16
-
17
- # Use the LlamaForCausalLM class which can properly load the consolidated.00.pth format
18
- model = LlamaForCausalLM.from_pretrained(
19
- model_name,
20
- token=HUGGINGFACE_TOKEN,
21
- torch_dtype=torch.float16, # Use float16 to reduce memory usage on CPU
22
- low_cpu_mem_usage=True, # Optimize for low memory usage
23
- device_map="cpu"
 
 
 
 
24
  )
25
 
 
 
 
26
  return tokenizer, model
27
 
28
  # Load Llama 3.2 model
 
2
  import gradio as gr
3
  import torch
4
  import json
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM
6
+ from peft import PeftModel
7
 
8
  # Set Hugging Face Token for Authentication (ensure it's set in your environment)
9
  HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
10
 
11
# Base model (needed for QLoRA adapter).
# The base id MUST match the adapter's base: the adapter below targets
# Llama-3.2-1B-Instruct, so the base is Llama-3.2 (not "Llama-3-1B-Instruct",
# which is a mismatched/non-existent repo id and breaks adapter loading).
BASE_MODEL = "meta-llama/Llama-3.2-1B-Instruct"
QLORA_ADAPTER = "meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8"

# Function to load Llama model
def load_llama_model():
    """Load the base Llama model, its tokenizer, and the QLoRA adapter.

    Downloads the base model and the QLoRA adapter from the Hugging Face Hub
    (authenticated with HUGGINGFACE_TOKEN), attaches the adapter via PEFT, and
    merges the LoRA weights into the base model for plain inference on CPU.

    Returns:
        tuple: (tokenizer, model) — the merged model is a regular
        transformers causal-LM, ready for generation.
    """
    print("Loading base model...")
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        # torch.has_bfloat16 does not exist on the torch module and raises
        # AttributeError at runtime; probe defensively and fall back to
        # float32 for CPU inference.
        torch_dtype=torch.bfloat16 if getattr(torch, "has_bfloat16", False) else torch.float32,
        device_map="cpu",  # Ensure it runs on CPU
        token=HUGGINGFACE_TOKEN,
    )

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False, token=HUGGINGFACE_TOKEN)

    print("Loading QLoRA adapter...")
    model = PeftModel.from_pretrained(
        model,
        QLORA_ADAPTER,
        token=HUGGINGFACE_TOKEN,
    )

    print("Merging LoRA weights...")
    model = model.merge_and_unload()  # Merge LoRA weights for inference

    return tokenizer, model
39
 
40
  # Load Llama 3.2 model