IST199655 committed on
Commit ca66e0b · 1 Parent(s): 64cfbfa
Files changed (2)
  1. app.py +2 -5
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient
 Copied from inference in colab notebook
 """
 
+from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaForCausalLM, AutoTokenizer
 import torch
 
@@ -15,11 +16,7 @@ model_path = "llama_lora_model_1"
 tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)
 
 # Load model
-model = LlamaForCausalLM.from_pretrained(
-    model_path,
-    torch_dtype=torch.float32,  # Adjust based on your environment
-    device_map="cpu"  # Use CPU for inference
-)
+model = ORTModelForCausalLM.from_pretrained(model_path, from_transformers=True)
 
 # Define the response function
 def respond(
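
For reference, the new loading path exports the checkpoint to ONNX and runs it on ONNX Runtime's CPU execution provider, which is why the torch_dtype and device_map arguments are dropped. Below is a minimal sketch of driving the exported model for generation, assuming the same local llama_lora_model_1 checkpoint and a made-up test prompt; note that newer optimum releases deprecate from_transformers=True in favor of export=True.

# Sketch only: CPU generation through the ONNX Runtime backend.
# Assumes the same local checkpoint as app.py; newer optimum releases
# use export=True where older ones accepted from_transformers=True.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_path = "llama_lora_model_1"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)
model = ORTModelForCausalLM.from_pretrained(model_path, export=True)

# Tokenize a prompt and generate a short completion on CPU.
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))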
requirements.txt CHANGED
@@ -2,4 +2,5 @@ huggingface_hub==0.25.2
 
 unsloth
 transformers
-accelerate
+accelerate
+optimum[onnxruntime]
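
As a quick sanity check that the new optimum[onnxruntime] dependency resolves in a CPU-only environment like this one, the execution providers ONNX Runtime was built with can be listed; on a CPU-only install, only the CPU provider is expected.

import onnxruntime

# CPU-only installs of optimum[onnxruntime] typically report just this provider.
print(onnxruntime.get_available_providers())  # e.g. ['CPUExecutionProvider']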