Spaces:

sagar007
/

phi2_finetune

Runtime error

App Files Files Community

sagar007 committed on Sep 3, 2024

Commit

a66049a

verified ·

1 Parent(s): 6cde641

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -9

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, PeftConfig
-import spaces
 # Load model and tokenizer
 MODEL_PATH = "sagar007/phi2_finetune"
@@ -10,18 +9,29 @@ MODEL_PATH = "sagar007/phi2_finetune"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 tokenizer.pad_token = tokenizer.eos_token
 base_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/phi-2",
-    torch_dtype=torch.float32,  # Use float32 for CPU
-    device_map="auto",
-    trust_remote_code=True
 )
 peft_config = PeftConfig.from_pretrained(MODEL_PATH)
-model = PeftModel.from_pretrained(base_model, MODEL_PATH)
 model.eval()
-@spaces.GPU(duration=60)
 def generate_response(instruction, max_length=512):
     prompt = f"Instruction: {instruction}\nResponse:"
     inputs = tokenizer(prompt, return_tensors="pt")
@@ -45,15 +55,15 @@ def chatbot(message, history):
 demo = gr.ChatInterface(
     chatbot,
-    title="Fine-tuned Phi-2 Chatbot",
-    description="This is a chatbot using a fine-tuned version of the Phi-2 model.",
     theme="default",
     examples=[
         "Explain the concept of machine learning.",
         "Write a short story about a robot learning to paint.",
         "What are some effective ways to reduce stress?",
     ],
-    cache_examples=True,
 )
 if __name__ == "__main__":

 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel, PeftConfig
 # Load model and tokenizer
 MODEL_PATH = "sagar007/phi2_finetune"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
 tokenizer.pad_token = tokenizer.eos_token
+# Load the base model
 base_model = AutoModelForCausalLM.from_pretrained(
     "microsoft/phi-2",
+    torch_dtype=torch.float32,
+    device_map="cpu",
+    trust_remote_code=True,
+    low_cpu_mem_usage=True
 )
+# Apply PEFT
 peft_config = PeftConfig.from_pretrained(MODEL_PATH)
+model = PeftModel.from_pretrained(base_model, MODEL_PATH, device_map="cpu")
+# Merge the PEFT model with the base model
+model = model.merge_and_unload()
+# Quantize the model
+model = torch.quantization.quantize_dynamic(
+    model, {torch.nn.Linear}, dtype=torch.qint8
+)
 model.eval()
 def generate_response(instruction, max_length=512):
     prompt = f"Instruction: {instruction}\nResponse:"
     inputs = tokenizer(prompt, return_tensors="pt")
 demo = gr.ChatInterface(
     chatbot,
+    title="Fine-tuned Phi-2 Chatbot (CPU Optimized)",
+    description="This is a chatbot using a quantized, fine-tuned version of the Phi-2 model, optimized for CPU inference.",
     theme="default",
     examples=[
         "Explain the concept of machine learning.",
         "Write a short story about a robot learning to paint.",
         "What are some effective ways to reduce stress?",
     ],
+    cache_examples=False,
 )
 if __name__ == "__main__":