import gradio as gr

"""
Inference code adapted from the Colab notebook.
"""

from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from peft import PeftModel
import torch
from threading import Thread

# Load model and tokenizer globally to avoid reloading for every request
base_model_id = "Helsinki-NLP/europarl"
model_path = "Mat17892/t5small_enfr_opus"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, legacy=False)

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_id)

# Load the LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, model_path)
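# Note: for a standard LoRA adapter, the weights could optionally be merged into the
# base model (model = model.merge_and_unload()) to remove adapter overhead at
# inference time; it is left unmerged here so the adapter stays swappable.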

def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    # Build the chat message list from the system message and conversation history
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
    
    # Tokenize the messages using the tokenizer's chat template
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # must be added for generation
        return_tensors="pt",
    )
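    # Note: apply_chat_template relies on the tokenizer defining a chat template;
    # if the tokenizer ships without one, transformers may fall back to a default
    # template or raise an error depending on the version.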
    # Stream generated tokens incrementally instead of waiting for the full output
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        "input_ids": inputs,
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": True,
        "streamer": streamer,
    }
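    # model.generate() blocks until generation finishes, so run it in a background
    # thread and let the streamer feed tokens back to this generator as they arrive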
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield responses as they are generated
    response = ""
    for token in streamer:
        response += token
        yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)


if __name__ == "__main__":
    demo.launch()