import gradio as gr
from huggingface_hub import InferenceClient

"""
Copied from inference in colab notebook
"""
import torch
from threading import Thread

from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

# Load the model and tokenizer globally to avoid reloading for every request
base_model_name = "Helsinki-NLP/europarl"
model_path = "Mat17892/t5small_enfr_opus"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)

# Load the base model (e.g., LLaMA)
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, model_path)
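
# Note (assumption): `torch` appears to be imported for the Colab inference code this
# was copied from; the merged model is left on the CPU here. Moving it to a GPU would
# also require moving the tokenized `inputs` in `respond` onto the same device before
# calling `model.generate`.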

def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    # Combine the system message, chat history, and new message into a single prompt
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})

    # Tokenize the messages using the model's chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must be set for generation
        return_tensors="pt",
    )

    # Stream generated tokens incrementally
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "input_ids": inputs,
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        "streamer": streamer,
    }

    # Run generation in a background thread so tokens can be yielded as they arrive
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the growing response as each new token is produced
    response = ""
    for token in streamer:
        response += token
        yield response
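
# Optional, hypothetical sanity check: this block is not part of the original Space.
# Setting RUN_LOCAL_TEST=1 streams a reply for a fixed prompt from the command line,
# exercising `respond` without going through the Gradio UI below.
import os

if os.environ.get("RUN_LOCAL_TEST") == "1":
    last = ""
    for last in respond(
        message="Hello!",
        history=[],
        system_message="You are a friendly Chatbot.",
        max_tokens=64,
        temperature=0.7,
        top_p=0.95,
    ):
        pass
    print(last)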
"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-p (nucleus sampling)",
),
],
)
if __name__ == "__main__":
    demo.launch()