IST199655 committed on
Commit ca66e0b · 1 Parent(s): 64cfbfa
Files changed (2)
  1. app.py +2 -5
  2. requirements.txt +2 -1
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import InferenceClient
 Copied from inference in colab notebook
 """
 
+from optimum.onnxruntime import ORTModelForCausalLM
 from transformers import LlamaForCausalLM, AutoTokenizer
 import torch
 
@@ -15,11 +16,7 @@ model_path = "llama_lora_model_1"
 tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)
 
 # Load model
-model = LlamaForCausalLM.from_pretrained(
-    model_path,
-    torch_dtype=torch.float32,  # Adjust based on your environment
-    device_map="cpu"  # Use CPU for inference
-)
+model = ORTModelForCausalLM.from_pretrained(model_path, from_transformers=True)
 
 # Define the response function
 def respond(
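
For reference, the new loading path exports the checkpoint to ONNX and runs it on ONNX Runtime's CPU execution provider, which is why the torch_dtype and device_map arguments are dropped. Below is a minimal sketch of driving the exported model for generation, assuming the same local llama_lora_model_1 checkpoint and a made-up test prompt; note that newer optimum releases deprecate from_transformers=True in favor of export=True.

# Sketch only: CPU generation through the ONNX Runtime backend.
# Assumes the same local checkpoint as app.py; newer optimum releases
# use export=True where older ones accepted from_transformers=True.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_path = "llama_lora_model_1"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)
model = ORTModelForCausalLM.from_pretrained(model_path, export=True)

# Tokenize a prompt and generate a short completion on CPU.
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))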
requirements.txt CHANGED
@@ -2,4 +2,5 @@ huggingface_hub==0.25.2
 
 unsloth
 transformers
-accelerate
+accelerate
+optimum[onnxruntime]
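
As a quick sanity check that the new optimum[onnxruntime] dependency resolves in a CPU-only environment like this one, the execution providers ONNX Runtime was built with can be listed; on a CPU-only install, only the CPU provider is expected.

import onnxruntime

# CPU-only installs of optimum[onnxruntime] typically report just this provider.
print(onnxruntime.get_available_providers())  # e.g. ['CPUExecutionProvider']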