IST199655 committed on
Commit 6c6f896 · 1 Parent(s): ca66e0b
Files changed (1)
  1. app.py +2 -44
app.py CHANGED
@@ -5,8 +5,7 @@ from huggingface_hub import InferenceClient
 Copied from inference in colab notebook
 """

-from optimum.onnxruntime import ORTModelForCausalLM
-from transformers import LlamaForCausalLM, AutoTokenizer
+from transformers import AutoModel, AutoTokenizer
 import torch

 # Load model and tokenizer globally to avoid reloading for every request
@@ -16,7 +15,7 @@ model_path = "llama_lora_model_1"
 tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)

 # Load model
-model = ORTModelForCausalLM.from_pretrained(model_path, from_transformers=True)
+model = AutoModel.from_pretrained("Heit39/llama_lora_model_1")

 # Define the response function
 def respond(
@@ -71,47 +70,6 @@ def respond(
         response += token + " "
         yield response.strip()

-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-# client = InferenceClient(model="https://huggingface.co/Heit39/llama_lora_model_1")
-
-
-
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     messages = [{"role": "system", "content": system_message}]
-
-#     for val in history:
-#         if val[0]:
-#             messages.append({"role": "user", "content": val[0]})
-#         if val[1]:
-#             messages.append({"role": "assistant", "content": val[1]})
-
-#     messages.append({"role": "user", "content": message})
-
-#     response = ""
-
-#     for message in client.chat_completion(
-#         messages,
-#         max_tokens=max_tokens,
-#         stream=True,
-#         temperature=temperature,
-#         top_p=top_p,
-#     ):
-#         token = message.choices[0].delta.content
-
-#         response += token
-#         yield response
-
-
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """