import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
import threading
import spaces
from openai import OpenAI
import sys
# Init ZeroGPU
# spaces.initialize_zero_gpu()
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN, add_to_git_credential=False)
# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Loki</h1>
<p>This demo uses <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a> for generation; both models contribute to the final response.</p>
</div>
'''
# Load the Llama 3 tokenizer and model onto the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
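# Stop generation at either the tokenizer's EOS token or Llama 3's <|eot_id|> end-of-turn token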
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
# Helper to collapse streamed output chunks into a single string
def output_list(output: list):
    """
    Joins all non-empty chunks from the output list
    and returns them as a single response string.
    """
    cleaned_output = ''.join(filter(None, output))
    return cleaned_output
# Fold Llama's output (when present) into a base prompt for GPT-4o
def gpt_generation(input: str,
                   llama_output: str,
                   mode: str):
    """
    Builds a prompt from the user's question and (optionally) Llama's answer,
    and returns an OpenAI chat-completion stream to yield from during final generation.
    """
    # Treat an empty string the same as no Llama output
    if llama_output:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
Llama3 LLM gave the user this response:\n\n {llama_output}\n
Answer the user's question with the help of Llama3; if Llama3's response wasn't accurate,
then ignore its output and answer on your own.'''
        prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
    else:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
Respond in a thorough and complete way.'''
        prompt = base_prompt.format(llama_input=input)
    # Setup the client
    client = OpenAI(api_key=API_KEY)
    stream = client.chat.completions.create(
        model=mode,
        messages=[{"role": "system", "content": "You are a helpful assistant called 'Loki'."},
                  {"role": "user", "content": prompt}],
        stream=True,
    )
    return stream
# Run Llama generation for the given input and chat history, returning a streamer
def llama_generation(input_text: str,
                     history: list,
                     temperature: float,
                     max_new_tokens: int):
    """
    Builds the conversation from history, tokenizes it, and streams the generated text back.
    """
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})
    # add_generation_prompt=True appends the assistant header so the model starts a fresh reply
    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)
    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    # Generation arguments passed to the model's generate() call
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators  # stop on either EOS or <|eot_id|>
    )
    # Temperature 0 means greedy decoding: always pick the highest-probability next token
    if temperature == 0:
        generate_kwargs["do_sample"] = False
    # Start generation in a background thread and return the streamer right away
    # so the caller can consume tokens as they are produced.
    thread = threading.Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()
    return streamer
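# Report which GPU (if any) is available; exposed via the "check cuda" chat command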
def check_cuda():
    if torch.cuda.is_available():
        return f"GPU Being Used: {torch.cuda.get_device_name(0)}"
    else:
        return "No GPU is being used right now."
first_time = True
llm_mode = ""
# Request a ZeroGPU slot for up to 30 seconds per call
@spaces.GPU(duration=30)
def bot_comms(input_text: str,
              history: list,
              temperature: float,
              max_new_tokens: int):
    """
    The connection between Gradio and the LLMs.
    Handles chat commands and routes generation to the selected backend.
    """
    global first_time
    global llm_mode
if input_text == "system details":
yield f"Python: {sys.version}\nGradio Version: {gr.__version__}\nPyTorch Version: {torch.__version__}"
return
if input_text == "mode":
if llm_mode == "":
yield "The mode is currently at Loki Default mode"
return
else:
yield f"The current mode: {llm_mode}"
return
if input_text == "check cuda":
cuda_info = check_cuda()
yield cuda_info
return
if input_text == "switch to loki":
llm_mode = input_text
yield "Loki is on ποΈ"
return
if input_text == "switch to llama":
llm_mode = input_text
yield "Got it! Llama is now activate for your questions only π¦"
return
if input_text == "switch to gpt-4o":
llm_mode = input_text
yield "Understood! GPT-4o is now hearing your responses only πΎ"
return
if input_text == "switch to gpt-3.5-turbo":
llm_mode = input_text
yield "Done. GPT-3.5-turbo is ready for your questions! π"
return
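    # Dispatch generation based on the currently selected mode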
if llm_mode == "switch to llama":
streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
outputs = []
for text in streamer:
outputs.append(text)
yield "".join(outputs)
if llm_mode == "switch to gpt-4o":
stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
if llm_mode == "switch to gpt-3.5-turbo":
stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
outputs = []
for chunk in stream:
if chunk.choices[0].delta.content is not None:
text = chunk.choices[0].delta.content
outputs.append(text)
yield "".join(outputs)
    # Default Loki mode: run Llama first, then hand its answer to GPT-4o for the final response
    if llm_mode is None or llm_mode == "" or llm_mode == "switch to loki":
        streamer = llama_generation(input_text=input_text, history=history, temperature=temperature, max_new_tokens=max_new_tokens)
        output_text = output_list([text for text in streamer])
        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
chatbot = gr.Chatbot(height=600, label="Loki AI")
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=bot_comms,
        chatbot=chatbot,
        fill_height=True,
        # These expose the temperature and max_new_tokens parameters of llama_generation in the UI
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider controlling the model's sampling temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider controlling the maximum number of new tokens to generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Make a poem about Batman inside Willy Wonka"],
            ["How can you make a burrito with just flour?"],
            ["How was Saturn formed, in 3 sentences?"],
            ["How does the frontal lobe affect playing soccer?"],
        ],
        cache_examples=False
    )
if __name__ == "__main__":
    demo.launch()