import os
import sys
import threading

import torch
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
from openai import OpenAI

# Init ZeroGPU
# spaces.initialize_zero_gpu()

# Hugging Face token used for the gated Llama 3 checkpoints
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN, add_to_git_credential=False)

# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
'''

# Load the Llama tokenizer and model and place the model on the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    token=TOKEN,
    torch_dtype=torch.float16,
).to('cuda')

# Stop tokens that end a Llama 3 chat turn
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]


def output_list(output: list):
    """
    Joins the streamed output chunks into a single string and
    returns it as the response.
    """
    cleaned_output = ''.join(filter(None, output))
    return cleaned_output


# Take Llama's output, make sure it came back as expected, and fit it into a base prompt for GPT-4o
def gpt_generation(input: str, llama_output: str, mode: str):
    """
    Passes the Llama output and the user input to the OpenAI API and
    returns the stream, so we can yield it in the final generation.
    """
    if llama_output:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n The Llama3 LLM gave the user this response:\n\n {llama_output}\n Answer the user's question with the help of Llama3; if the Llama3 response wasn't accurate, ignore its output and give your own answer.'''
        prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
    else:
        base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n Respond in a thorough and complete way.'''
        prompt = base_prompt.format(llama_input=input)

    # Set up the OpenAI client
    client = OpenAI(api_key=API_KEY)
    stream = client.chat.completions.create(
        model=mode,
        messages=[
            {"role": "system", "content": "You are a helpful assistant called 'Loki'."},
            {"role": "user", "content": prompt},
        ],
        stream=True,
    )
    return stream


# Pass the input through Llama and return the generation output as a stream
def llama_generation(input_text: str, history: list, temperature: float, max_new_tokens: int):
    """
    Builds the chat conversation, tokenizes it, and streams the generated text back.
    """
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user},
                             {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})

    input_ids = llama_tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors='pt',
    ).to(llama_model.device)

    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Generation arguments passed to the model's generate() call
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators,
    )

    # A temperature of 0 switches to greedy decoding: the model simply picks
    # the highest-probability token at every step.
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Run generation in a background thread so the streamer can be consumed
    # while tokens are still being produced (joining the thread here would
    # block until generation finishes and defeat streaming).
    thread = threading.Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()

    return streamer


def check_cuda():
    if torch.cuda.is_available():
        return f"GPU Being Used: {torch.cuda.get_device_name(0)}"
    else:
        return "No GPU is being used right now."
first_time = True
llm_mode = ""


@spaces.GPU(duration=30)
def bot_comms(input_text: str, history: list, temperature: float, max_new_tokens: int):
    """
    The connection between Gradio and the LLMs.
    """
    global first_time
    global llm_mode

    if input_text == "system details":
        yield f"Python: {sys.version}\nGradio Version: {gr.__version__}\nPyTorch Version: {torch.__version__}"
        return

    if input_text == "mode":
        if llm_mode == "":
            yield "The mode is currently set to the default Loki mode."
        else:
            yield f"The current mode: {llm_mode}"
        return

    if input_text == "check cuda":
        cuda_info = check_cuda()
        yield cuda_info
        return

    if input_text == "switch to loki":
        llm_mode = input_text
        yield "Loki is on 👁️"
        return

    if input_text == "switch to llama":
        llm_mode = input_text
        yield "Got it! Llama is now active for your questions only 🦙"
        return

    if input_text == "switch to gpt-4o":
        llm_mode = input_text
        yield "Understood! GPT-4o is now answering your questions only 👾"
        return

    if input_text == "switch to gpt-3.5-turbo":
        llm_mode = input_text
        yield "Done. GPT-3.5-turbo is ready for your questions! 🏃"
        return

    if llm_mode == "switch to llama":
        streamer = llama_generation(input_text=input_text,
                                    history=history,
                                    temperature=temperature,
                                    max_new_tokens=max_new_tokens)
        outputs = []
        for text in streamer:
            outputs.append(text)
            yield "".join(outputs)
        return

    if llm_mode == "switch to gpt-4o":
        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-4o")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
        return

    if llm_mode == "switch to gpt-3.5-turbo":
        stream = gpt_generation(input=input_text, llama_output="", mode="gpt-3.5-turbo")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
        return

    # Default "Loki" mode: run Llama first, then let GPT-4o refine its answer
    if llm_mode is None or llm_mode == "" or llm_mode == "switch to loki":
        streamer = llama_generation(input_text=input_text,
                                    history=history,
                                    temperature=temperature,
                                    max_new_tokens=max_new_tokens)
        output_text = output_list([text for text in streamer])

        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
        outputs = []
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)


chatbot = gr.Chatbot(height=600, label="Loki AI")

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=bot_comms,
        chatbot=chatbot,
        fill_height=True,
        # These inputs map to the temperature and max_new_tokens arguments of
        # bot_comms, so the UI can control generation directly
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider that lets users adjust the model's sampling temperature
            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
            # Slider for the maximum number of new tokens the model may generate
            gr.Slider(minimum=128, maximum=1500, step=1, value=512, label="Max new tokens", render=False),
        ],
        examples=[
            ["Make a poem about Batman inside Willy Wonka's factory"],
            ["How can you make a burrito with just flour?"],
            ["How was Saturn formed, in 3 sentences?"],
            ["How does the frontal lobe affect playing soccer?"],
        ],
        cache_examples=False,
    )

if __name__ == "__main__":
    demo.launch()