import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
from threading import Thread
from openai import OpenAI


# Log in to the Hugging Face Hub (assumes the HF_AUTH_TOKEN environment variable is set);
# the token is needed to download the gated Llama 3 weights.
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)

# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Amphisbeana 🐍</h1>
<p>This app chains two models: Llama 3 drafts a response and GPT-4o uses that draft to produce the final generation. <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a></p>
</div>
'''

# Load the Llama 3 tokenizer and model, moving the model to the GPU in float16 for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
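# Llama 3 Instruct marks the end of an assistant turn with <|eot_id|> in addition to the regular
# EOS token, so both are collected here as candidate stop tokens.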

# Join the streamed output chunks into a single response string
def output_list(output: list):
    """
    Joins the non-empty chunks collected from the streamer
    and returns them as a single response string.
    """
    cleaned_output = ''.join(filter(None, output))

    return cleaned_output

# Take the user's question and Llama 3's draft answer and fold them into a base prompt for GPT-4o
def gpt_4o_generation(llama_input: str,
                      llama_output: str):
    """
    Builds a prompt from the user's question and Llama 3's draft output,
    then returns the GPT-4o response stream so it can be yielded in the final generation.
    """

    base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
    The Llama3 LLM gave the user this response:\n\n {llama_output}\n
    Answer the user's question with the help of Llama3. If Llama3's response wasn't accurate,
    then ignore its output and answer on your own.'''

    prompt = base_prompt.format(llama_input=llama_input, llama_output=llama_output)

    # Setup the client
    client = OpenAI(api_key=API_KEY)

    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a helpful assistant called 'Amphisbeana'."},
                  {"role": "user", "content": prompt}],
        stream=True,
    )

    return stream
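
# Note: the stream returned above is an iterator of chunk objects; each chunk carries the newly
# generated text in chunk.choices[0].delta.content, and it is consumed in llama_generation below.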

# Main chat function: generate a Llama 3 draft from the input, then stream GPT-4o's final answer
def llama_generation(input_text: str,
                     history: list,
                     temperature: float,
                     max_new_tokens: int):
    """
    Tokenizes the chat history plus the new input, generates a draft with Llama 3,
    then streams GPT-4o's final answer built on that draft.
    """

    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})

    # add_generation_prompt=True appends the assistant header so the model generates a reply rather than continuing the user turn
    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)

    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Generation arguments to pass to llama_model.generate() below
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators  # stop on either the standard EOS token or <|eot_id|>
    )

    # A temperature of 0 means greedy decoding: disable sampling so the model always picks the highest-probability next token
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Run generate() in a background thread so the streamer can be consumed here
    # while tokens are still being produced.
    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()

    # Drain the streamer fully so we have Llama 3's complete draft before calling GPT-4o
    llama_outputs = [text for text in streamer]
    output_text = output_list(llama_outputs)
    # Hand the user input and the Llama 3 draft to GPT-4o for the final answer
    stream = gpt_4o_generation(llama_input=input_text, llama_output=output_text)
    outputs = []
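    # Accumulate GPT-4o's streamed chunks and yield the running text so the Gradio chat
    # window updates incrementally as tokens arrive.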
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            text = chunk.choices[0].delta.content
            outputs.append(text)
            yield "".join(outputs)


chatbot = gr.Chatbot(height=600, label="Amphisbeana AI")
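# Shared chat display component; passing it to ChatInterface below makes the interface render
# into this component instead of creating its own default one.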

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=llama_generation,
        chatbot=chatbot,
        fill_height=True,
        # These additional inputs map onto llama_generation's temperature and max_new_tokens parameters, so the UI can control them
        additional_inputs_accordion=gr.Accordion(label="βš™οΈ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider that lets users adjust the model's sampling temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens to generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Make a poem of batman inside willy wonka"],
            ["How can you a burrito with just flour?"],
            ["How was saturn formed in 3 sentences"],
            ["How does the frontal lobe effect playing soccer"],
            ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()
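
# To run locally (a sketch, assuming this file is saved as app.py, the HF_AUTH_TOKEN and
# OPEN_AI_API_KEY environment variables are set, and a CUDA GPU is available for the .to('cuda') call):
#     python app.py
# demo.launch() then serves the Gradio UI, by default at http://127.0.0.1:7860.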