import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
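
# Load the tokenizer and weights once at startup. float16 halves the memory
# footprint of the 7B model, and device_map="auto" lets the accelerate backend
# place the layers on the available GPU(s).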
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
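

# gr.ChatInterface calls this with the newest user message and the running
# history; the h[0]/h[1] indexing below assumes Gradio's classic "tuples"
# history format, a list of (user_message, assistant_reply) pairs.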
def generate_response(message, history):
    # Rebuild the conversation as a plain-text transcript and append the new turn.
    prompt = "".join([f"Human: {h[0]}\nAssistant: {h[1]}\n" for h in history])
    prompt += f"Human: {message}\nAssistant:"

    # Tokenize the prompt, move it to the model's device, and sample a completion.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=1000, temperature=0.7, do_sample=True)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # The decoded output echoes the whole prompt, so keep only the text after
    # the final "Assistant:" marker.
    assistant_response = response.split("Assistant:")[-1].strip()
    return assistant_response
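

# Wire the generator into a ready-made chat UI. cache_examples=False keeps the
# app from running the model on the example prompts at startup.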
iface = gr.ChatInterface(
    generate_response,
    title="Llama-2-7b Chat Interface",
    description="Chat with the Llama-2-7b model. Type your message and press Enter.",
    examples=[
        "What is the capital of France?",
        "Explain quantum computing in simple terms.",
        "Write a short poem about artificial intelligence."
    ],
    cache_examples=False,
)
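
# launch() serves the app locally (http://127.0.0.1:7860 by default); pass
# share=True for a temporary public link.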
iface.launch()