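# Gradio chat demo that runs the GPTQ-quantized phi-2 model on CPU and streams
# its replies to a gr.ChatInterface.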
import gradio as gr
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    pipeline,
    AutoConfig,
)
from threading import Thread
## The Hugging Face model id for Microsoft's phi-2 model
# checkpoint = "microsoft/phi-2"
## Download and load model and tokenizer
# tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(
#     checkpoint, torch_dtype=torch.float32, device_map="cpu", trust_remote_code=True
# )
model_name_or_path = "TheBloke/phi-2-GPTQ"
# To use a different branch, change revision
# For example: revision="gptq-4bit-32g-actorder_True"
config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
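# The ExLlama GPTQ kernels require a GPU, so they are disabled here to allow
# CPU-only inference of the quantized weights.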
config.quantization_config["use_exllama"] = False
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="cpu",
    trust_remote_code=True,
    revision="main",
    config=config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
# Text generation pipeline
phi2 = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    device_map="cpu",
)
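# A minimal sketch of calling the pipeline directly, outside the chat UI; the prompt
# wording below is illustrative only (the real prompt is assembled in generate()):
#   result = phi2("Instruction: You are a helpful assistant.\nUser: What is 2 + 2?\nOutput:",
#                 max_new_tokens=32)
#   print(result[0]["generated_text"])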
# Function that accepts a prompt and generates text using the phi2 pipeline
def generate(message, chat_history, max_new_tokens):
    # instruction = "You are a helpful assistant to 'User'. You do not respond as 'User' or pretend to be 'User'. You only respond once as 'Assistant'."
    instruction = "You are a helpful assistant to 'User'. You will answer any question for 'User'."
    final_prompt = f"Instruction: {instruction}\n"
    for sent, received in chat_history:
        final_prompt += "User: " + sent + "\n"
        final_prompt += "Assistant: " + received + "\n"
    final_prompt += "User: " + message + "\n"
    final_prompt += "Output:"
    if (
        len(tokenizer.tokenize(final_prompt)) >= tokenizer.model_max_length - max_new_tokens
    ):
        final_prompt = "Instruction: Say 'Input exceeded context size, please clear the chat history and retry!' Output:"
    # Streamer
    streamer = TextIteratorStreamer(
        tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
    )
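    # The streamer yields decoded text pieces as soon as they are generated; skip_prompt
    # keeps the input prompt out of the stream, and timeout bounds how long the loop
    # below waits for the next piece.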
    thread = Thread(
        target=phi2,
        kwargs={
            "text_inputs": final_prompt,
            "max_new_tokens": max_new_tokens,
            "streamer": streamer,
        },
    )
    thread.start()
    generated_text = ""
    for word in streamer:
        generated_text += word
        response = generated_text.strip()
        if "User:" in response:
            response = response.split("User:")[0].strip()
        if "Assistant:" in response:
            response = response.split("Assistant:")[1].strip()
        yield response
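# A hypothetical manual use of generate() outside Gradio; it is a generator that
# yields progressively longer partial responses:
#   for partial in generate("Who is Leonhard Euler?", [], 64):
#       print(partial)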
# Chat interface with gradio
with gr.Blocks() as demo:
    gr.Markdown(
        """
# Phi-2 Chatbot Demo
This chatbot uses TheBloke/phi-2-GPTQ, a GPTQ-quantized version of Microsoft's 2.7 billion parameter [phi-2](https://huggingface.co/microsoft/phi-2) Transformer model.
To reduce the response time on this hardware, set `max_new_tokens` to a lower number in the text generation pipeline.
"""
    )
    tokens_slider = gr.Slider(
        8,
        128,
        value=128,
        label="Maximum new tokens",
        info="A larger `max_new_tokens` parameter value gives you longer text responses but at the cost of a slower response time.",
    )
    chatbot = gr.ChatInterface(
        fn=generate,
        additional_inputs=[tokens_slider],
        stop_btn=None,
        examples=[["Who is Leonhard Euler?"]],
    )
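    # gr.ChatInterface calls generate(message, chat_history, max_new_tokens), forwarding
    # the slider value via additional_inputs, and renders each yielded string as the
    # progressively updated assistant reply.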
demo.queue().launch()