import gradio as gr
from huggingface_hub import InferenceClient

"""
Copied from inference in colab notebook
"""
# import torch

# # Monkey-patch to avoid CUDA initialization issues
# torch.cuda.get_device_capability = lambda *args, **kwargs: (0, 0)

# from unsloth.chat_templates import get_chat_template
# from unsloth import FastLanguageModel

# # IMPORTING MODEL AND TOKENIZER --------

# max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
# dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "llama_lora_model_1",
#     max_seq_length = max_seq_length,
#     dtype = dtype,
#     load_in_4bit = load_in_4bit,
# )

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "llama-3.1",
# )
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# # RUNNING INFERENCE ------------------------

# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]

#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})

#     messages.append({"role": "user", "content": message})

#     inputs = tokenizer.apply_chat_template(
#         messages,
#         tokenize = True,
#         add_generation_prompt = True, # Must add for generation
#         return_tensors = "pt",
#     )

#     outputs = model.generate(input_ids = inputs, max_new_tokens = max_tokens, use_cache = True,
#                             temperature = temperature, top_p = top_p, min_p = 0.1)
#     # batch_decode returns a list of strings; take the single decoded sequence
#     response = tokenizer.batch_decode(outputs)[0]

#     yield response

"""
For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
"""
client = InferenceClient(model="Heit39/llama_lora_model_1")



def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
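    """Stream a reply for `message` given the chat `history` and the UI-controlled settings.

    Yields the accumulated response after every streamed token so the Gradio
    chat window can render it incrementally.
    """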
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Stream the completion from the Inference API and accumulate the tokens.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content

        # Some stream chunks (e.g. the final one) carry no content.
        if token:
            response += token
        yield response


"""
For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
"""
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
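
# Launch the app when this file is run directly (python app.py).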


if __name__ == "__main__":
    demo.launch()