IST199655 committed on
Commit 6c6f896 · 1 Parent(s): ca66e0b
Files changed (1)
  1. app.py +2 -44
app.py CHANGED
@@ -5,8 +5,7 @@ from huggingface_hub import InferenceClient
 Copied from inference in colab notebook
 """

-from optimum.onnxruntime import ORTModelForCausalLM
-from transformers import LlamaForCausalLM, AutoTokenizer
+from transformers import AutoModel, AutoTokenizer
 import torch

 # Load model and tokenizer globally to avoid reloading for every request
@@ -16,7 +15,7 @@ model_path = "llama_lora_model_1"
 tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)

 # Load model
-model = ORTModelForCausalLM.from_pretrained(model_path, from_transformers=True)
+model = AutoModel.from_pretrained("Heit39/llama_lora_model_1")

 # Define the response function
 def respond(
@@ -71,47 +70,6 @@ def respond(
         response += token + " "
         yield response.strip()

-
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-# client = InferenceClient(model="https://huggingface.co/Heit39/llama_lora_model_1")
-
-
-
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     messages = [{"role": "system", "content": system_message}]
-
-#     for val in history:
-#         if val[0]:
-#             messages.append({"role": "user", "content": val[0]})
-#         if val[1]:
-#             messages.append({"role": "assistant", "content": val[1]})
-
-#     messages.append({"role": "user", "content": message})
-
-#     response = ""
-
-#     for message in client.chat_completion(
-#         messages,
-#         max_tokens=max_tokens,
-#         stream=True,
-#         temperature=temperature,
-#         top_p=top_p,
-#     ):
-#         token = message.choices[0].delta.content
-
-#         response += token
-#         yield response
-
-
 """
 For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 """