sagar007 committed on
Commit
a66049a
·
verified ·
1 Parent(s): 6cde641

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -2,7 +2,6 @@ import torch
2
  import gradio as gr
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from peft import PeftModel, PeftConfig
5
- import spaces
6
 
7
  # Load model and tokenizer
8
  MODEL_PATH = "sagar007/phi2_finetune"
@@ -10,18 +9,29 @@ MODEL_PATH = "sagar007/phi2_finetune"
10
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
11
  tokenizer.pad_token = tokenizer.eos_token
12
 
 
13
  base_model = AutoModelForCausalLM.from_pretrained(
14
  "microsoft/phi-2",
15
- torch_dtype=torch.float32, # Use float32 for CPU
16
- device_map="auto",
17
- trust_remote_code=True
 
18
  )
19
 
 
20
  peft_config = PeftConfig.from_pretrained(MODEL_PATH)
21
- model = PeftModel.from_pretrained(base_model, MODEL_PATH)
 
 
 
 
 
 
 
 
 
22
  model.eval()
23
 
24
- @spaces.GPU(duration=60)
25
  def generate_response(instruction, max_length=512):
26
  prompt = f"Instruction: {instruction}\nResponse:"
27
  inputs = tokenizer(prompt, return_tensors="pt")
@@ -45,15 +55,15 @@ def chatbot(message, history):
45
 
46
  demo = gr.ChatInterface(
47
  chatbot,
48
- title="Fine-tuned Phi-2 Chatbot",
49
- description="This is a chatbot using a fine-tuned version of the Phi-2 model.",
50
  theme="default",
51
  examples=[
52
  "Explain the concept of machine learning.",
53
  "Write a short story about a robot learning to paint.",
54
  "What are some effective ways to reduce stress?",
55
  ],
56
- cache_examples=True,
57
  )
58
 
59
  if __name__ == "__main__":
 
2
  import gradio as gr
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  from peft import PeftModel, PeftConfig
 
5
 
6
  # Load model and tokenizer
7
  MODEL_PATH = "sagar007/phi2_finetune"
 
9
  tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
10
  tokenizer.pad_token = tokenizer.eos_token
11
 
12
+ # Load the base model
13
  base_model = AutoModelForCausalLM.from_pretrained(
14
  "microsoft/phi-2",
15
+ torch_dtype=torch.float32,
16
+ device_map="cpu",
17
+ trust_remote_code=True,
18
+ low_cpu_mem_usage=True
19
  )
20
 
21
+ # Apply PEFT
22
  peft_config = PeftConfig.from_pretrained(MODEL_PATH)
23
+ model = PeftModel.from_pretrained(base_model, MODEL_PATH, device_map="cpu")
24
+
25
+ # Merge the PEFT model with the base model
26
+ model = model.merge_and_unload()
27
+
28
+ # Quantize the model
29
+ model = torch.quantization.quantize_dynamic(
30
+ model, {torch.nn.Linear}, dtype=torch.qint8
31
+ )
32
+
33
  model.eval()
34
 
 
35
  def generate_response(instruction, max_length=512):
36
  prompt = f"Instruction: {instruction}\nResponse:"
37
  inputs = tokenizer(prompt, return_tensors="pt")
 
55
 
56
  demo = gr.ChatInterface(
57
  chatbot,
58
+ title="Fine-tuned Phi-2 Chatbot (CPU Optimized)",
59
+ description="This is a chatbot using a quantized, fine-tuned version of the Phi-2 model, optimized for CPU inference.",
60
  theme="default",
61
  examples=[
62
  "Explain the concept of machine learning.",
63
  "Write a short story about a robot learning to paint.",
64
  "What are some effective ways to reduce stress?",
65
  ],
66
+ cache_examples=False,
67
  )
68
 
69
  if __name__ == "__main__":