import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
from threading import Thread
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)
# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Amphisbeana π</h1>
<p>This demo uses Llama 3 and GPT-4o for generation; both contribute to the final response: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a></p>
</div>
'''
# Load the Llama 3 tokenizer and model and place the model on the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
# Join the streamed output chunks
def output_list(output: list):
    """
    Joins the non-empty chunks collected from the streamer
    and returns them as a single response string.
    """
    cleaned_output = ' '.join(filter(None, output))
    return cleaned_output
# Take the user input and chat history, generate, and stream the output back
def llama_generation(input_text: str,
                     history: list,
                     temperature: float,
                     max_new_tokens: int):
    """
    Builds the conversation from the history, tokenizes it, generates,
    and yields the decoded text as it streams.
    """
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})
    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)
    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    # Generation arguments to pass to the model's generate() call
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators
    )
    # A temperature of 0 switches to greedy decoding: sampling is turned off and
    # the model simply picks the highest-probability token at each step.
    if temperature == 0:
        generate_kwargs["do_sample"] = False
    # Run generation in a separate thread so the streamer can be consumed here;
    # generate is the thread target and generate_kwargs are its keyword arguments.
    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

    # Log the full joined output string
    print(output_list(outputs))
# Once Llama is returning output as it should, that output can then be folded into a base prompt for GPT-4o (see the sketch below)
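# A minimal sketch of the planned GPT-4o step, not yet wired into the Gradio interface.
# It assumes the official `openai` Python client (>= 1.0) is installed and that
# OPEN_AI_API_KEY holds a valid key; the function name `gpt_generation` and the
# prompts are illustrative placeholders, not part of the original code.
def gpt_generation(input_text: str, llama_output: str) -> str:
    """Ask GPT-4o to polish the Llama draft into the final response."""
    # Imported lazily so the demo still runs if the openai package is not installed
    from openai import OpenAI

    client = OpenAI(api_key=API_KEY)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Combine and polish the draft answer."},
            {"role": "user", "content": f"Question: {input_text}\nDraft answer: {llama_output}"},
        ],
    )
    return response.choices[0].message.content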
chatbot = gr.Chatbot(height=600, label="Amphisbeana AI")
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=llama_generation,
        chatbot=chatbot,
        fill_height=True,
        # These additional inputs are forwarded as arguments to llama_generation,
        # so the UI controls below map directly onto its parameters
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider users can interact with to set the model temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens the model may generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Make a poem of batman inside willy wonka"],
            ["How can you make a burrito with just flour?"],
            ["How was saturn formed in 3 sentences"],
            ["How does the frontal lobe affect playing soccer"],
        ],
        cache_examples=False
    )
if __name__ == "__main__":
    demo.launch()