import torch
import pandas as pd
import numpy as np
import gradio as gr
import re
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
from threading import Thread

# Hugging Face token for access to the gated Llama 3 model
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)

# OpenAI API key (reserved for the GPT-4o step)
API_KEY = os.getenv('OPEN_AI_API_KEY')

DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Amphisbeana 🐍</h1>
<p>This Space uses Llama 3 and GPT-4o together; both models contribute to the final generation. <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a></p>
</div>
'''

# Load the tokenizer and model, and move the model to the GPU for generation
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", token=TOKEN, torch_dtype=torch.float16).to('cuda')
terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

# Take the user input plus chat history and stream back the generated text
def llama_generation(input_text: str,
                     history: list,
                     temperature: float,
                     max_new_tokens: int):
    """
    Tokenize the conversation, run Llama 3 generation, and stream the decoded text back.
    """

    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})

    input_ids = llama_tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors='pt').to(llama_model.device)

    # skip_prompt=True keeps the input prompt out of the streamed output
    streamer = TextIteratorStreamer(llama_tokenizer, skip_prompt=True, skip_special_tokens=True)

    # Generation arguments to pass to llama_model.generate()
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators
    )

    # A temperature of 0 switches to greedy decoding: at each step the single highest-probability token is selected
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Run generate() in a background thread so the streamer can yield tokens while the UI stays responsive;
    # pass the function as target and the generation arguments as kwargs
    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)

# Next step: confirm that the Llama call returns as expected, then feed that output
# into a function that fits it into a base prompt for GPT-4o
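
# A hedged sketch (not part of the original app) of how the Llama draft could be folded
# into a base prompt for GPT-4o. gpt_generation() is a hypothetical helper; it assumes the
# `openai` >= 1.0 client package is installed and uses the API_KEY loaded above.
def gpt_generation(input_text: str, llama_output: str) -> str:
    # Imported lazily so the rest of the app still runs if the openai package is absent
    from openai import OpenAI

    client = OpenAI(api_key=API_KEY)
    base_prompt = (
        f"Question: {input_text}\n\n"
        f"Draft answer from Llama 3:\n{llama_output}\n\n"
        "Refine and improve this draft into a final answer."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": base_prompt}],
    )
    return response.choices[0].message.content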

chatbot = gr.Chatbot(height=600, label="Amphisbeana AI")

with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=llama_generation,
        chatbot=chatbot,
        fill_height=True,
        # These map to the extra parameters of llama_generation so the UI can control them
        additional_inputs_accordion=gr.Accordion(label="βš™οΈ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider that lets users adjust the model temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens to generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Make a poem of batman inside willy wonka"],
            ["How can you a burrito with just flour?"],
            ["How was saturn formed in 3 sentences"],
            ["How does the frontal lobe effect playing soccer"],
            ],
        cache_examples=False
    )

if __name__ == "__main__":
    demo.launch()