import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login, HfApi, HfFolder, Repository
import os
from threading import Thread
from openai import OpenAI

# Space id
space_id = "sandz7"

# Hugging Face Hub API client
api = HfApi()

# switch hardware function
def space_hardware_config(instance_size: str="gpu",
                          instance_type: str="1xL4",
                          vcpus: int=8,
                          memory: int=30):
    """
    This will manually select what hardware we'll use in the space.
    """

    api = HfApi()
    token = HfFolder.get_token()
    if token is None:
        raise ValueError("Hugging Face token not found. Please log in using huggingface-cli or set the token manually.")
    
    space_id = os.getenv("SPACE_ID")
    if not space_id:
        raise ValueError("SPACE_ID environment variable not found.")
    
    space_info = api.repo_info(repo_id=space_id, repo_type="space", token=token)
    print(space_info)

    # # Hardware Configuration
    # space.config["compute"] = {
    #     "instance_type": instance_type,
    #     "instance_size": instance_size,
    #     "disk_size": 50,
    #     "vcpus": vcpus, # number of virtual CPU's
    #     "memory": memory # amount of memory in gb
    # }

    # # Save updated space config
    # api.push_to_hub(space)
    # print("Hardware configuration successfull. Check the cuda command.")

# Automatically apply the standard config we need for Loki
space_hardware_config()

TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)

# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Loki 👁️</h1>
<p>This Space uses Llama 3 and GPT-4o for generation; both models contribute to the final response. <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a></p>
</div>
'''

# Load the tokenizer and model onto the GPU, ready for processing and generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
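# Stop generation at either the model's EOS token or Llama 3's end-of-turn token <|eot_id|>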
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


# Helper to flatten the streamed output
def output_list(output: list):
    """
    Joins the non-empty chunks of the list
    into a single response string.
    """
    cleaned_output = ''.join(filter(None, output))

    return cleaned_output

# Make sure Llama returns as it should, then fold that output into the base
# prompt for GPT-4o
def gpt_generation(input: str,
                   llama_output: str,
                   mode: str):
    """
    Passes the llama output and all input,
    returns the stream, so we can yield it in final generation. 
    """
    if llama_output is not None:
        base_prompt = '''Here is the users question:\n\n {llama_input}\n\n
        Llama3 LLM gave the user this response:\n\n {llama_output}\n
        Answer the users question with the help of Llama3, if Llama3 response wasn't accurate,
        than ignore it's output and give your's alone.'''

        prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
    else:
        base_prompt = '''Here is the users question:\n\n {llama_input}\n\n
        Respond in a thorough and complete way.'''

        prompt = base_prompt.format(llama_input=input)

    # Set up the OpenAI client
    client = OpenAI(api_key=API_KEY)

    stream = client.chat.completions.create(
        model=mode,
        messages=[{"role": "system", "content": "You are a helpful assistant called 'Loki'."},
                  {"role": "user", "content": prompt}],
        stream=True,
    )

    return stream
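
# Illustrative (hedged) sketch of consuming the stream returned by
# gpt_generation(); the streamed text lives in chunk.choices[0].delta.content:
#
#     for chunk in gpt_generation(input="Hi", llama_output=None, mode="gpt-4o"):
#         if chunk.choices[0].delta.content is not None:
#             print(chunk.choices[0].delta.content, end="")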

# Pass the input through the selected model(s) and return the generated output
def loki_generation(input_text: str,
                    history: list,
                    temperature: float,
                    max_new_tokens: int,
                    mode: str):
    """
    Pass input texts, tokenize, output and back to text.
    """
    space_hardware_config(instance_size="gpu",
                          instance_type="1xL4",
                          vcpus=8,
                          memory=30)
    if mode == "llama":
        conversation = []
        for user, assistant in history:
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
        conversation.append({"role": "user", "content": input_text})

        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)

        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

        # Generation arguments to pass into the model's generate() call
        generate_kwargs = dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            eos_token_id=terminators[0]
        )

        # A temperature of 0 means greedy decoding: always pick the highest-probability next token
        if temperature == 0:
            generate_kwargs["do_sample"] = False

        # Run generation in a background thread so we can read from the streamer
        # while the model is still generating
        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
        thread.start()

        # outputs = []
        # for text in streamer:
        #     outputs.append(text)
        #     yield "".join(outputs)

        text = [text for text in streamer]
        output_text = output_list(text)
        print("llama mode was on.")
        return output_text

    if mode == "loki":
        conversation = []
        for user, assistant in history:
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
        conversation.append({"role": "user", "content": input_text})

        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)

        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

        # Generation arguments to pass into the model's generate() call
        generate_kwargs = dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            eos_token_id=terminators[0]
        )

        # A temperature of 0 means greedy decoding: always pick the highest-probability next token
        if temperature == 0:
            generate_kwargs["do_sample"] = False

        # Run generation in a background thread so we can read from the streamer
        # while the model is still generating
        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
        thread.start()
        
        llama_outputs = [text for text in streamer]
        output_text = output_list(llama_outputs)
        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
        print("loki mode was on.")
        return stream
        # outputs = []
        # for chunk in stream:
        #     if chunk.choices[0].delta.content is not None:
        #         text = chunk.choices[0].delta.content
        #         outputs.append(text)
        #         yield "".join(outputs)


def check_cuda():
    if torch.cuda.is_available():
        return f"GPU Being Used: {torch.cuda.get_device_name[0]}"
    else:
        return "No GPU is being used right now."
        
# Global state: llm_mode holds the currently selected mode ("" means the default Loki pipeline)
first_time = True
llm_mode = ""

def bot_comms(input_text: str,
              history: list,
              temperature: float,
              max_new_tokens: int):
    """
    The connection between gradio and the LLM's
    """
    global first_time
    global llm_mode

    if input_text == "mode":
        if llm_mode == "":
            return "The mode is currently at Loki Default mode"
        else:
            return f"The current mode: {llm_mode}"

    if input_text == "check cuda":
        return check_cuda()
    
    if input_text == "switch to llama":
        llm_mode = input_text
        return "Got it! Llama is now activate for your questions only πŸ¦™"

    if input_text == "switch to gpt-4o":
        llm_mode = input_text
        return "Understood! GPT-4o is now hearing your responses only πŸ‘Ύ"
    
    if input_text == "switch to gpt-3.5-turbo":
        llm_mode = input_text
        return "Done. GPT-3.5-turbo is ready for your questions! πŸƒ"

    if llm_mode == "switch to llama":
        streamer = loki_generation(input_text=input_text,
                                   history=history,
                                   temperature=temperature,
                                   max_new_tokens=max_new_tokens,
                                   mode="llama")
        outputs = []
        for text in streamer:
            outputs.append(text)
            yield "".join(outputs)

    if llm_mode == "switch to gpt-4o":
        space_hardware_config(instance_size="cpu",
                              instance_type="basic",
                              vcpus=2,
                              memory=16)
        stream = gpt_generation(input=input_text,
                                llama_output="",
                                mode="gpt-4o")
        outputs = []
        print("gpt-4o only about to answer.")
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)

    if llm_mode == "switch to gpt-3.5-turbo":
        space_hardware_config(instance_size="cpu",
                              instance_type="basic",
                              vcpus=2,
                              memory=16)
        stream = gpt_generation(input=input_text,
                                llama_output="",
                                mode="gpt-3.5-turbo")
        outputs = []
        print("gpt-3.5-turbo is about to answer.")
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
    
    # Default Loki pipeline (llm_mode is still the empty string)
    if llm_mode == "":
        stream = loki_generation(input_text=input_text,
                                 history=history,
                                 temperature=temperature,
                                 max_new_tokens=max_new_tokens,
                                 mode="loki")
        outputs = []
        print("Loki is activated to answer.")
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                outputs.append(chunk.choices[0].delta.content)
                yield "".join(outputs)

chatbot=gr.Chatbot(height=600, label="Loki AI")

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=bot_comms,
        chatbot=chatbot,
        fill_height=True,
        # These additional inputs map to the temperature and max_new_tokens arguments of bot_comms, so the UI can control generation
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider users can interact with to adjust the model's temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens to generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Make a poem of batman inside willy wonka"],
            ["How can you a burrito with just flour?"],
            ["How was saturn formed in 3 sentences"],
            ["How does the frontal lobe effect playing soccer"],
            ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()