import gradio as gr
from huggingface_hub import InferenceClient
"""
Copied from inference in colab notebook
"""
from transformers import LlamaForCausalLM, LlamaTokenizer
import torch
# Load model and tokenizer globally to avoid reloading for every request
model_path = "llama_lora_model_1"
# Load tokenizer
tokenizer = LlamaTokenizer.from_pretrained(model_path)
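# LlamaTokenizer is the slow sentencepiece implementation; AutoTokenizer.from_pretrained
# would select the fast variant automatically and is a drop-in alternative here.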
# Load model
model = LlamaForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float32,  # float32 is the safe default on CPU; use float16/bfloat16 on GPU
    device_map="cpu",           # run inference on the CPU
)
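# Note (assumption): model_path is taken to be a local directory holding fully merged
# weights. If only LoRA adapter weights were saved, load the base model first and
# attach the adapter with peft, e.g. PeftModel.from_pretrained(base_model, model_path).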
# Define the response function
def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    # Assemble the system message, chat history, and new message into a message list
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
    # Create a single text prompt from the messages
    prompt = ""
    for msg in messages:
        if msg["role"] == "system":
            prompt += f"[System]: {msg['content']}\n\n"
        elif msg["role"] == "user":
            prompt += f"[User]: {msg['content']}\n\n"
        elif msg["role"] == "assistant":
            prompt += f"[Assistant]: {msg['content']}\n\n"
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    input_ids = inputs.input_ids.to("cpu")  # keep inputs on the CPU alongside the model
    # Generate a completion; max_new_tokens bounds only the generated portion
    output_ids = model.generate(
        input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,  # Llama ships no pad token; reuse EOS to avoid a warning
    )
    # Decode only the newly generated tokens; slicing the decoded string by
    # len(prompt) is fragile because detokenization does not round-trip exactly
    assistant_response = tokenizer.decode(
        output_ids[0][input_ids.shape[1]:], skip_special_tokens=True
    ).strip()
    # Yield responses incrementally (simulate streaming)
    response = ""
    for token in assistant_response.split():  # split "tokens" by whitespace
        response += token + " "
        yield response.strip()
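
# A sketch of true token-level streaming, should the whitespace-split simulation above
# prove too coarse. Assumes the same globals (model, tokenizer) defined above; untested here.
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def stream_generate(input_ids, max_tokens, temperature, top_p):
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     thread = Thread(target=model.generate, kwargs=dict(
#         input_ids=input_ids,
#         max_new_tokens=max_tokens,
#         temperature=temperature,
#         top_p=top_p,
#         do_sample=True,
#         streamer=streamer,
#     ))
#     thread.start()
#     response = ""
#     for text in streamer:  # yields decoded text chunks as generate produces them
#         response += text
#         yield response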
"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
# client = InferenceClient(model="https://huggingface.co/Heit39/llama_lora_model_1")

# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#     response = ""
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content
#         response += token
#         yield response
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
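# additional_inputs are passed positionally to respond() after (message, history),
# in order: system_message, max_tokens, temperature, top_p.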
if __name__ == "__main__":
    demo.launch()