import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
import threading
import spaces
from openai import OpenAI 
import sys

# Init ZeroGPU
# spaces.initialize_zero_gpu()

TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)

# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Loki 👁️</h1>
<p>This demo uses <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a> together: Llama 3 drafts an answer and GPT-4o produces the final generation.</p>
</div>
'''

# Load the Llama 3 tokenizer and model, placing the model on the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
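# Llama 3 Instruct can end a turn with either the standard EOS token or <|eot_id|>, so collect both as stop tokens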
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Collect streamed chunks into a single response string
def output_list(output: list):
    """
    Joins the streamed output chunks into a single string,
    dropping any empty entries, and returns it as the response.
    """
    cleaned_output = ''.join(filter(None, output))

    return cleaned_output

# Take Llama's output (if any), make sure it came back as expected, and fold it into a base
# prompt for GPT-4o
def gpt_generation(input: str,
                   llama_output: str,
                   mode: str):
    """
    Builds a prompt from the user input and (optionally) the Llama output,
    and returns an OpenAI streaming response so it can be yielded in the final generation.
    """
    if llama_output:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
        The Llama3 LLM gave the user this response:\n\n {llama_output}\n
        Answer the user's question with the help of Llama3; if Llama3's response wasn't accurate,
        then ignore its output and give your own answer.'''

        prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
    else:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
        Respond in a thorough and complete way.'''

        prompt = base_prompt.format(llama_input=input)

    # Set up the OpenAI client
    client = OpenAI(api_key=API_KEY)

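    # Request a streaming completion so chunks can be yielded back to the Gradio UI as they arrive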
    stream = client.chat.completions.create(
        model=mode,
        messages=[{"role": "system", "content": "You are a helpful assistant called 'Loki'."},
                  {"role": "user", "content": prompt}],
        stream=True,
    )

    return stream

# Run Llama 3 on the user input and chat history and return a streamer over its output
def llama_generation(input_text: str,
                     history: list,
                     temperature: float,
                     max_new_tokens: int):
    """
    Tokenizes the conversation, runs generation on a background thread,
    and returns a streamer over the decoded output text.
    """

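    # Rebuild the Gradio chat history (a list of [user, assistant] pairs) as chat-template messages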
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})

    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)

    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Generation arguments passed to llama_model.generate() on the worker thread
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators
    )

    # When temperature is 0, switch to greedy decoding: always pick the highest-probability token instead of sampling
    if temperature == 0:
        generate_kwargs["do_sample"] = False
        
    # Start generation on a background thread and return the streamer immediately
    # so the caller can yield tokens as they are produced
    thread = threading.Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()
    return streamer

def check_cuda():
    if torch.cuda.is_available():
        return f"GPU Being Used: {torch.cuda.get_device_name(0)}"
    else:
        return "No GPU is being used right now."
        
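# Module-level state tracking which backend (Llama, GPT-4o, GPT-3.5-turbo, or the default Loki pipeline) the user has selected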
first_time = True
llm_mode = ""

@spaces.GPU(duration=30)
def bot_comms(input_text: str,
              history: list,
              temperature: float,
              max_new_tokens: int):
    """
    The connection between Gradio and the LLMs.
    """
    global first_time
    global llm_mode

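    # Special chat commands: these exact strings report diagnostics or switch generation modes instead of querying a model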
    if input_text == "system details":
        yield f"Python: {sys.version}\nGradio Version: {gr.__version__}\nPyTorch Version: {torch.__version__}"
        return

    if input_text == "mode":
        if llm_mode == "":
            yield "The current mode is the default Loki mode"
            return
        else:
            yield f"The current mode: {llm_mode}"
            return

    if input_text == "check cuda":
        cuda_info = check_cuda()
        yield cuda_info
        return

    if input_text == "switch to loki":
        llm_mode = input_text
        yield "Loki is on 👁️"
        return
    
    if input_text == "switch to llama":
        llm_mode = input_text
        yield "Got it! Llama is now active for your questions only 🦙"
        return

    if input_text == "switch to gpt-4o":
        llm_mode = input_text
        yield "Understood! GPT-4o is now handling your questions only 👾"
        return
    
    if input_text == "switch to gpt-3.5-turbo":
        llm_mode = input_text
        yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
        return

    if llm_mode == "switch to llama":
        streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
        outputs = []
        for text in streamer:
            outputs.append(text)
            yield "".join(outputs)

    if llm_mode == "switch to gpt-4o":
        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)

    if llm_mode == "switch to gpt-3.5-turbo":
        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
    
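    # Default Loki mode: collect a full Llama 3 answer first, then hand it to GPT-4o to produce the final streamed response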
    if llm_mode is None or llm_mode == "" or llm_mode == "switch to loki":
        streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
        output_text = output_list([text for text in streamer])
        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")

        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)

chatbot = gr.Chatbot(height=600, label="Loki AI")

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=bot_comms,
        chatbot=chatbot,
        fill_height=True,
        # These additional inputs map to the temperature and max_new_tokens arguments of llama_generation, so the UI can control them
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider users can interact with to adjust the model's temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens the model may generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Write a poem about Batman inside Willy Wonka's factory"],
            ["How can you make a burrito with just flour?"],
            ["How was Saturn formed, in 3 sentences?"],
            ["How does the frontal lobe affect playing soccer?"],
            ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()