import gradio as gr
import os

"""
Copied from inference in colab notebook
"""

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TextIteratorStreamer
from threading import Thread

# Load the model and tokenizer globally to avoid reloading on every request
base_model_id = "google-t5/t5-small"
adapter_path = "Mat17892/t5small_enfr_opus"

# Load the tokenizer from the fine-tuned adapter repo
tokenizer = AutoTokenizer.from_pretrained(adapter_path, use_fast=True, legacy=False)

# Load the T5 base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model_id, token=os.getenv("huggingface_token")
)

# Attach the LoRA adapter to the base model
from peft import PeftModel

model = PeftModel.from_pretrained(
    base_model, adapter_path, token=os.getenv("huggingface_token")
)
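
# A quick sanity check, left commented out so it does not run at Space startup.
# Assumption (not taken from this repo): the adapter was trained with the usual
# T5 task prefix "translate English to French: ". Uncomment locally to confirm
# the loaded adapter produces a French translation before serving the app:
#
#     check = tokenizer("translate English to French: Hello, how are you?", return_tensors="pt")
#     check_ids = model.generate(**check, max_new_tokens=40)
#     print(tokenizer.decode(check_ids[0], skip_special_tokens=True))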

def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    # T5 tokenizers do not ship a chat template, so build a plain-text prompt
    # from the system message, the conversation history, and the new message.
    prompt_parts = [system_message]
    for user_msg, assistant_msg in history:
        if user_msg:
            prompt_parts.append(f"User: {user_msg}")
        if assistant_msg:
            prompt_parts.append(f"Assistant: {assistant_msg}")
    prompt_parts.append(f"User: {message}")
    prompt = "\n".join(prompt_parts)

    # Tokenize the prompt for the seq2seq encoder
    inputs = tokenizer(prompt, return_tensors="pt")

    # Generate tokens incrementally on a background thread and stream them back
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        "streamer": streamer,
    }
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the partial response as new tokens arrive
    response = ""
    for token in streamer:
        response += token
        yield response
    thread.join()


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
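
# Optional, sketched here as a suggestion rather than part of the original
# template: enabling Gradio's request queue lets concurrent visitors share the
# single in-process model instead of running overlapping generate() calls.
# demo.queue(max_size=8)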


if __name__ == "__main__":
    demo.launch()