import os
import sys
import threading

import torch
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
from openai import OpenAI

# Init ZeroGPU
# spaces.initialize_zero_gpu()

# Hugging Face token used for the gated Llama 3 checkpoints
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN, add_to_git_credential=False)

# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
'''

# Load the Llama tokenizer and model and place the model on the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=TOKEN,
    torch_dtype=torch.float16,
).to('cuda')

# Stop tokens that end a Llama 3 chat turn
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]


def output_list(output: list):
    """
    Joins the streamed output chunks into a single string and
    returns it as the response.
    """
    cleaned_output = ''.join(filter(None, output))
    return cleaned_output


# Take Llama's output, make sure it came back as expected, and fit it into a base prompt for GPT-4o
def gpt_generation(input: str, llama_output: str, mode: str):
    """
    Passes the Llama output and the user input to the OpenAI API and
    returns the stream, so we can yield it in the final generation.
    """
    if llama_output:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n The Llama3 LLM gave the user this response:\n\n {llama_output}\n Answer the user's question with the help of Llama3; if the Llama3 response wasn't accurate, ignore its output and give your own answer.'''
        prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
    else:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n Respond in a thorough and complete way.'''
        prompt = base_prompt.format(llama_input=input)

    # Set up the OpenAI client
    client = OpenAI(api_key=API_KEY)
    stream = client.chat.completions.create(
        model=mode,
        messages=[
            {"role": "system", "content": "You are a helpful assistant called 'Loki'."},
            {"role": "user", "content": prompt},
        ],
        stream=True,
    )
    return stream


# Pass the input through Llama and return the generation output as a stream
def llama_generation(input_text: str, history: list, temperature: float, max_new_tokens: int):
    """
    Builds the chat conversation, tokenizes it, and streams the generated text back.
    """
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user},
                             {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})

    input_ids = llama_tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors='pt',
    ).to(llama_model.device)

    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Generation arguments passed to the model's generate() call
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators,
    )

    # A temperature of 0 switches to greedy decoding: the model simply picks
    # the highest-probability token at every step.
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Run generation in a background thread so the streamer can be consumed
    # while tokens are still being produced (joining the thread here would
    # block until generation finishes and defeat streaming).
    thread = threading.Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()

    return streamer


def check_cuda():
    if torch.cuda.is_available():
        return f"GPU Being Used: {torch.cuda.get_device_name(0)}"
    else:
        return "No GPU is being used right now."
first_time = True
llm_mode = ""


@spaces.GPU(duration=30)
def bot_comms(input_text: str, history: list, temperature: float, max_new_tokens: int):
    """
    The connection between Gradio and the LLMs.
    """
    global first_time
    global llm_mode

    if input_text == "system details":
        yield f"Python: {sys.version}\nGradio Version: {gr.__version__}\nPyTorch Version: {torch.__version__}"
        return

    if input_text == "mode":
        if llm_mode == "":
            yield "The mode is currently set to the default Loki mode."
        else:
            yield f"The current mode: {llm_mode}"
        return

    if input_text == "check cuda":
        cuda_info = check_cuda()
        yield cuda_info
        return

    if input_text == "switch to loki":
        llm_mode = input_text
        yield "Loki is on 👁️"
        return

    if input_text == "switch to llama":
        llm_mode = input_text
        yield "Got it! Llama is now active for your questions only 🦙"
        return

    if input_text == "switch to gpt-4o":
        llm_mode = input_text
        yield "Understood! GPT-4o is now answering your questions only 👾"
        return

    if input_text == "switch to gpt-3.5-turbo":
        llm_mode = input_text
        yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
        return

    if llm_mode == "switch to llama":
        streamer = llama_generation(input_text=input_text,
                                    history=history,
                                    temperature=temperature,
                                    max_new_tokens=max_new_tokens)
        outputs = []
        for text in streamer:
            outputs.append(text)
            yield "".join(outputs)
        return

    if llm_mode == "switch to gpt-4o":
        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
        return

    if llm_mode == "switch to gpt-3.5-turbo":
        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
        return

    # Default "Loki" mode: run Llama first, then let GPT-4o refine its answer
    if llm_mode is None or llm_mode == "" or llm_mode == "switch to loki":
        streamer = llama_generation(input_text=input_text,
                                    history=history,
                                    temperature=temperature,
                                    max_new_tokens=max_new_tokens)
        output_text = output_list([text for text in streamer])

        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)


chatbot = gr.Chatbot(height=600, label="Loki AI")

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=bot_comms,
        chatbot=chatbot,
        fill_height=True,
        # These inputs map to the temperature and max_new_tokens arguments of
        # bot_comms, so the UI can control generation directly
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider that lets users adjust the model's sampling temperature
            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
            # Slider for the maximum number of new tokens the model may generate
            gr.Slider(minimum=128, maximum=1500, step=1, value=512, label="Max new tokens", render=False),
        ],
        examples=[
            ["Make a poem about Batman inside Willy Wonka's factory"],
            ["How can you make a burrito with just flour?"],
            ["How was Saturn formed, in 3 sentences?"],
            ["How does the frontal lobe affect playing soccer?"],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()