import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from huggingface_hub import login
import os
from threading import Thread
from openai import OpenAI
TOKEN = os.getenv('HF_AUTH_TOKEN')
login(token=TOKEN,
      add_to_git_credential=False)
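# Logging in with HF_AUTH_TOKEN is required because the Meta-Llama-3 weights are gated on the Hub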
# OpenAI API key
API_KEY = os.getenv('OPEN_AI_API_KEY')
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">Amphisbeana π</h1>
<p>This app chains two models: <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B"><b>Llama3-8b</b></a> drafts a response and <a href="https://platform.openai.com/docs/models/gpt-4o"><b>GPT-4o</b></a> refines it into the final answer.</p>
</div>
'''
# Load the Llama 3 tokenizer and model, and move the model onto the GPU
llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", token=TOKEN, torch_dtype=torch.float16).to('cuda')

terminators = [
    llama_tokenizer.eos_token_id,
    llama_tokenizer.convert_tokens_to_ids("<|eot_id|>")
]
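# "<|eot_id|>" is Llama 3's end-of-turn token; note that only terminators[0]
# (the tokenizer's eos token) is actually passed to generate() further below.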
# Join the streamed output chunks into the final response string
def output_list(output: list):
    """
    Joins the list of streamed output chunks, dropping empty entries,
    and returns the result as a single response string.
    """
    cleaned_output = ''.join(filter(None, output))
    return cleaned_output
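# For example, output_list(["Hel", None, "lo"]) returns "Hello".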
# Take the user's question and the Llama 3 output, wrap them in a base prompt,
# and have GPT-4o produce the final streamed answer
def gpt_4o_generation(llama_input: str,
                      llama_output: str):
    """
    Passes the user input and the Llama 3 output to GPT-4o and
    returns the stream, so it can be yielded in the final generation.
    """
    base_prompt = '''Here is the user's question:\n\n {llama_input}\n\n
Llama3 LLM gave the user this response:\n\n {llama_output}\n
Answer the user's question with the help of Llama3. If the Llama3 response wasn't accurate,
then ignore its output and give your own answer instead.'''
    prompt = base_prompt.format(llama_input=llama_input, llama_output=llama_output)

    # Set up the OpenAI client and request a streamed chat completion
    client = OpenAI(api_key=API_KEY)
    stream = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "system", "content": "You are a helpful assistant called 'Amphisbeana'."},
                  {"role": "user", "content": prompt}],
        stream=True,
    )

    return stream
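# Each item in the returned stream is a ChatCompletionChunk; the incremental text
# lives in chunk.choices[0].delta.content (which can be None for role-only or final chunks).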
# Generate with Llama 3 first, then stream GPT-4o's final answer built on top of it
def llama_generation(input_text: str,
                     history: list,
                     temperature: float,
                     max_new_tokens: int):
    """
    Builds the chat history, tokenizes it, generates a response with Llama 3,
    then passes that response to GPT-4o and yields the final streamed answer.
    """
    conversation = []
    for user, assistant in history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": input_text})

    input_ids = llama_tokenizer.apply_chat_template(conversation, return_tensors='pt').to(llama_model.device)

    streamer = TextIteratorStreamer(llama_tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)

    # Generation arguments to pass to the model's generate() call
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=terminators[0]
    )

    # A temperature of 0 switches to greedy decoding: the model always picks
    # the single highest-probability next token instead of sampling.
    if temperature == 0:
        generate_kwargs["do_sample"] = False

    # Run generation in a background thread so the streamer can be consumed here
    thread = Thread(target=llama_model.generate, kwargs=generate_kwargs)
    thread.start()

    # Collect the full Llama 3 output, then hand it to GPT-4o
    llama_outputs = [text for text in streamer]
    output_text = output_list(llama_outputs)

    stream = gpt_4o_generation(llama_input=input_text, llama_output=output_text)

    # Yield the GPT-4o answer incrementally as chunks arrive
    outputs = []
    for chunk in stream:
        if chunk.choices[0].delta.content is not None:
            text = chunk.choices[0].delta.content
            outputs.append(text)
            yield "".join(outputs)
chatbot=gr.Chatbot(height=600, label="Amphisbeana AI")
with gr.Blocks(fill_height=True) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=llama_generation,
        chatbot=chatbot,
        fill_height=True,
        # These additional inputs map to the extra parameters of llama_generation
        # (temperature, max_new_tokens), so the UI controls the generation settings
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # Slider that lets users set the model's sampling temperature
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
                      value=0.95,
                      label="Temperature",
                      render=False),
            # Slider for the maximum number of new tokens the model may generate
            gr.Slider(minimum=128,
                      maximum=1500,
                      step=1,
                      value=512,
                      label="Max new tokens",
                      render=False),
        ],
        examples=[
            ["Write a poem about Batman inside Willy Wonka's factory"],
            ["How can you make a burrito with just flour?"],
            ["How was Saturn formed, in 3 sentences?"],
            ["How does the frontal lobe affect playing soccer?"],
        ],
        cache_examples=False
    )
if __name__ == "__main__":
    demo.launch()