import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login, HfApi, HfFolder, Repository
import os
from threading import Thread
from openai import OpenAI

# Space id
space_id = "sandz7"

# Hugging Face Hub API client
api = HfApi()

# switch hardware function
def space_hardware_config(instance_size: str="gpu",
                          instance_type: str="1xL4",
                          vcpus: int=8,
                          memory: int=30):
    """
    This will manually select what hardware we'll use in the space.
    """

    api = HfApi()
    token = HfFolder.get_token()
    if token is None:
        raise ValueError("Hugging Face token not found. Please log in using huggingface-cli or set the token manually.")
    
    space_id = os.getenv("SPACE_ID")
    if not space_id:
        raise ValueError("SPACE_ID environment variable not found.")
    
    space_info = api.repo_info(repo_id=space_id, repo_type="space", token=token)
    print(space_info)

    # # Hardware Configuration
    # space.config["compute"] = {
    #     "instance_type": instance_type,
    #     "instance_size": instance_size,
    #     "disk_size": 50,
    #     "vcpus": vcpus, # number of virtual CPU's
    #     "memory": memory # amount of memory in gb
    # }

    # # Save updated space config
    # api.push_to_hub(space)
    # print("Hardware configuration successfull. Check the cuda command.")

# Automatically apply the standard config we need for Loki
space_hardware_config()

TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)

# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Loki 👁️</h1>
<p>This Space uses Llama 3 and GPT-4o for generation; both models contribute to the final response. <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a></p>
</div>
'''

# Load the tokenizer and model onto the GPU, ready for processing and generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
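# Stop generation at either the model's EOS token or Llama 3's end-of-turn token <|eot_id|>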
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]


# Helper to flatten the streamed output
def output_list(output: list):
    """
    Joins the non-empty chunks of the list
    into a single response string.
    """
    cleaned_output = ''.join(filter(None, output))

    return cleaned_output

# Make sure Llama returns as it should, then fold that output into the base
# prompt for GPT-4o
def gpt_generation(input: str,
                   llama_output: str,
                   mode: str):
    """
    Passes the llama output and all input,
    returns the stream, so we can yield it in final generation. 
    """
    if llama_output is not None:
        base_prompt = '''Here is the users question:\n\n {llama_input}\n\n
        Llama3 LLM gave the user this response:\n\n {llama_output}\n
        Answer the users question with the help of Llama3, if Llama3 response wasn't accurate,
        than ignore it's output and give your's alone.'''

        prompt = base_prompt.format(llama_input=input, llama_output=llama_output)
    else:
        base_prompt = '''Here is the users question:\n\n {llama_input}\n\n
        Respond in a thorough and complete way.'''

        prompt = base_prompt.format(llama_input=input)

    # Set up the OpenAI client
    client = OpenAI(api_key=API_KEY)

    stream = client.chat.completions.create(
        model=mode,
        messages=[{"role": "system", "content": "You are a helpful assistant called 'Loki'."},
                  {"role": "user", "content": prompt}],
        stream=True,
    )

    return stream
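
# Illustrative (hedged) sketch of consuming the stream returned by
# gpt_generation(); the streamed text lives in chunk.choices[0].delta.content:
#
#     for chunk in gpt_generation(input="Hi", llama_output=None, mode="gpt-4o"):
#         if chunk.choices[0].delta.content is not None:
#             print(chunk.choices[0].delta.content, end="")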

# Pass the input through the selected model(s) and return the generated output
def loki_generation(input_text: str,
                    history: list,
                    temperature: float,
                    max_new_tokens: int,
                    mode: str):
    """
    Pass input texts, tokenize, output and back to text.
    """
    space_hardware_config(instance_size="gpu",
                          instance_type="1xL4",
                          vcpus=8,
                          memory=30)
    if mode == "llama":
        conversation = []
        for user, assistant in history:
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
        conversation.append({"role": "user", "content": input_text})

        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)

        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

        # Generation arguments to pass into the model's generate() call
        generate_kwargs = dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            eos_token_id=terminators[0]
        )

        # A temperature of 0 means greedy decoding: always pick the highest-probability next token
        if temperature == 0:
            generate_kwargs["do_sample"] = False

        # Run generation in a background thread so we can read from the streamer
        # while the model is still generating
        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
        thread.start()

        # outputs = []
        # for text in streamer:
        #     outputs.append(text)
        #     yield "".join(outputs)

        text = [text for text in streamer]
        output_text = output_list(text)
        print("llama mode was on.")
        return output_text

    if mode == "loki":
        conversation = []
        for user, assistant in history:
            conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
        conversation.append({"role": "user", "content": input_text})

        input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)

        streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

        # Generation arguments to pass into the model's generate() call
        generate_kwargs = dict(
            input_ids=input_ids,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            eos_token_id=terminators[0]
        )

        # A temperature of 0 means greedy decoding: always pick the highest-probability next token
        if temperature == 0:
            generate_kwargs["do_sample"] = False

        # Run generation in a background thread so we can read from the streamer
        # while the model is still generating
        thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
        thread.start()
        
        llama_outputs = [text for text in streamer]
        output_text = output_list(llama_outputs)
        stream = gpt_generation(input=input_text, llama_output=output_text, mode="gpt-4o")
        print("loki mode was on.")
        return stream
        # outputs = []
        # for chunk in stream:
        #     if chunk.choices[0].delta.content is not None:
        #         text = chunk.choices[0].delta.content
        #         outputs.append(text)
        #         yield "".join(outputs)


def check_cuda():
    if torch.cuda.is_available():
        return f"GPU Being Used: {torch.cuda.get_device_name[0]}"
    else:
        return "No GPU is being used right now."
        
# Global state: llm_mode holds the currently selected mode ("" means the default Loki pipeline)
first_time = True
llm_mode = ""

def bot_comms(input_text: str,
              history: list,
              temperature: float,
              max_new_tokens: int):
    """
    The connection between gradio and the LLM's
    """
    global first_time
    global llm_mode

    if input_text == "mode":
        if llm_mode == "":
            return "The mode is currently at Loki Default mode"
        else:
            return f"The current mode: {llm_mode}"

    if input_text == "check cuda":
        return check_cuda()
    
    if input_text == "switch to llama":
        llm_mode = input_text
        return "Got it! Llama is now activate for your questions only πŸ¦™"

    if input_text == "switch to gpt-4o":
        llm_mode = input_text
        return "Understood! GPT-4o is now hearing your responses only πŸ‘Ύ"
    
    if input_text == "switch to gpt-3.5-turbo":
        llm_mode = input_text
        return "Done. GPT-3.5-turbo is ready for your questions! πŸƒ"

    if llm_mode == "switch to llama":
        streamer = loki_generation(input_text=input_text,
                                   history=history,
                                   temperature=temperature,
                                   max_new_tokens=max_new_tokens,
                                   mode="llama")
        outputs = []
        for text in streamer:
            outputs.append(text)
            yield "".join(outputs)

    if llm_mode == "switch to gpt-4o":
        space_hardware_config(instance_size="cpu",
                              instance_type="basic",
                              vcpus=2,
                              memory=16)
        stream = gpt_generation(input=input_text,
                                llama_output="",
                                mode="gpt-4o")
        outputs = []
        print("gpt-4o only about to answer.")
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)

    if llm_mode == "switch to gpt-3.5-turbo":
        space_hardware_config(instance_size="cpu",
                              instance_type="basic",
                              vcpus=2,
                              memory=16)
        stream = gpt_generation(input=input_text,
                                llama_output="",
                                mode="gpt-3.5-turbo")
        outputs = []
        print("gpt-3.5-turbo is about to answer.")
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                text = chunk.choices[0].delta.content
                outputs.append(text)
                yield "".join(outputs)
    
    # Default Loki pipeline (llm_mode is still the empty string)
    if llm_mode == "":
        stream = loki_generation(input_text=input_text,
                                 history=history,
                                 temperature=temperature,
                                 max_new_tokens=max_new_tokens,
                                 mode="loki")
        outputs = []
        print("Loki is activated to answer.")
        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                outputs.append(chunk.choices[0].delta.content)
                yield "".join(outputs)

chatbot=gr.Chatbot(height=600, label="Loki AI")

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=bot_comms,
        chatbot=chatbot,
        fill_height=True,
        # These additional inputs map to the temperature and max_new_tokens arguments of bot_comms, so the UI can control generation
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider users can interact with to adjust the model's temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens to generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Make a poem of batman inside willy wonka"],
            ["How can you a burrito with just flour?"],
            ["How was saturn formed in 3 sentences"],
            ["How does the frontal lobe effect playing soccer"],
            ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()