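# Gradio chat demo that runs the GPTQ-quantized phi-2 model on CPU and streams
# its replies to a gr.ChatInterface.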
import gradio as gr
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    pipeline,
    AutoConfig,
)
from threading import Thread
## The Hugging Face model id for Microsoft's phi-2 model
# checkpoint = "microsoft/phi-2"
## Download and load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(
#     checkpoint, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True
# )
model_name_or_path = "TheBloke/phi-2-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
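# The ExLlama GPTQ kernels require a GPU, so they are disabled here to allow
# CPU-only inference of the quantized weights.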
config.quantization_config["use_exllama"] = False
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="cpu",
    trust_remote_code=True,
    revision="main",
    config=config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Text generation pipeline
phi2 = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    device_map="cpu",
)
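# A minimal sketch of calling the pipeline directly, outside the chat UI; the prompt
# wording below is illustrative only (the real prompt is assembled in generate()):
#   result = phi2("Instruction: You are a helpful assistant.\nUser: What is 2 + 2?\nOutput:",
#                 max_new_tokens=32)
#   print(result[0]["generated_text"])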
# Function that accepts a prompt and generates text using the phi2 pipeline
def generate(message, chat_history, max_new_tokens):
    # instruction = "You are a helpful assistant to 'User'. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
    instruction = "You are a helpful assistant to 'User'. You will answer any question for 'User'."
    final_prompt = f"Instruction: {instruction}\n"
    for sent, received in chat_history:
        final_prompt += "User: " + sent + "\n"
        final_prompt += "Assistant: " + received + "\n"
    final_prompt += "User: " + message + "\n"
    final_prompt += "Output:"
    if (
        len(tokenizer.tokenize(final_prompt)) >= tokenizer.model_max_length - max_new_tokens
    ):
        final_prompt = "Instruction: Say 'Input exceeded context size, please clear the chat history and retry!' Output:"
    # Streamer
    streamer = TextIteratorStreamer(
        tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
    )
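    # The streamer yields decoded text pieces as soon as they are generated; skip_prompt
    # keeps the input prompt out of the stream, and timeout bounds how long the loop
    # below waits for the next piece.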
    thread = Thread(
        target=phi2,
        kwargs={
            "text_inputs": final_prompt,
            "max_new_tokens": max_new_tokens,
            "streamer": streamer,
        },
    )
    thread.start()
    generated_text = ""
    for word in streamer:
        generated_text += word
        response = generated_text.strip()
        if "User:" in response:
            response = response.split("User:")[0].strip()
        if "Assistant:" in response:
            response = response.split("Assistant:")[1].strip()
        yield response
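# A hypothetical manual use of generate() outside Gradio; it is a generator that
# yields progressively longer partial responses:
#   for partial in generate("Who is Leonhard Euler?", [], 64):
#       print(partial)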
# Chat interface with gradio
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Phi-2 Chatbot Demo
This chatbot uses TheBloke/phi-2-GPTQ, a GPTQ-quantized version of Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) Transformer model.
To reduce the response time on this hardware, set `max_new_tokens` to a lower number in the text generation pipeline.
"""
    )
    tokens_slider = gr.Slider(
        8,
        128,
        value=128,
        label="Maximum new tokens",
        info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.",
    )
    chatbot = gr.ChatInterface(
        fn=generate,
        additional_inputs=[tokens_slider],
        stop_btn=None,
        examples=[["Who is Leonhard Euler?"]],
    )
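    # gr.ChatInterface calls generate(message, chat_history, max_new_tokens), forwarding
    # the slider value via additional_inputs, and renders each yielded string as the
    # progressively updated assistant reply.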
demo.queue().launch()