import time
import gradio as gr
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# Hub repository that hosts the quantized weights, and the specific GGUF file
# inside it. BUG FIX: the original fused repo id and file name into one string
# ("TheBloke/WizardLM-7B-uncensored.Q3_K_S.gguf"), which from_pretrained cannot
# resolve — the repo id and the weight file must be passed separately.
model_repo = "TheBloke/WizardLM-7B-uncensored-GGUF"
model_file = "WizardLM-7B-uncensored.Q3_K_S.gguf"
# stream=True makes llm(...) return a token generator instead of a full string.
# gpu_layers is left at its default (0 = CPU only); raise it to offload to GPU.
llm = AutoModelForCausalLM.from_pretrained(
    model_repo,
    model_file=model_file,
    model_type="llama",
    stream=True,
)
def generate_response(message):
    """Stream the model's reply to *message*.

    Because the model was loaded with stream=True, ``llm(message)`` returns a
    generator of tokens. The original code yielded that generator object once
    instead of any text. Here we accumulate tokens and yield the full reply so
    far after each one — the contract gr.ChatInterface expects from a
    streaming callback (each yield replaces the displayed message).
    """
    partial = ""
    for token in llm(message):
        partial += token
        yield partial
def chatbot(message, history):
    """Gradio ChatInterface callback: relay the streamed reply for *message*.

    *history* is supplied by Gradio but is not used here — each turn is
    answered independently.
    """
    for chunk in generate_response(message):
        time.sleep(0.3)  # Optional delay for a natural chat feel
        yield chunk
# Wrap the streaming callback in a chat UI and serve it locally.
gr.ChatInterface(chatbot).launch()