import time

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no
# GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/WizardLM-7B-uncensored-GGUF",
    model_file="WizardLM-7B-uncensored.Q4_K_M.gguf",
    model_type="llama",
    stream=True,
)


def generate_response(message):
    """Stream the model's reply to *message*, yielding the partial text so far.

    Fixes over the original: tokens were appended to a module-level ``history``
    list that was never cleared, so every previous conversation's tokens (and
    the seed string ``"Chatbot:"``) were prepended to each new reply and memory
    grew without bound. Tokens were also joined with spaces, which corrupts
    sub-word tokens emitted by the llama tokenizer. Here the partial reply is
    accumulated per call by direct concatenation.

    Yields:
        str: the reply generated so far (grows with each streamed token).
    """
    partial = ""
    for token in llm(message):
        partial += token
        yield partial


def chatbot(message, history):
    """Gradio ChatInterface callback: stream the assistant's reply.

    Args:
        message: the user's latest message.
        history: chat history supplied by ``gr.ChatInterface`` (unused here;
            each reply is generated from *message* alone).

    Yields:
        str: progressively longer partial replies, for streaming display.
    """
    for partial in generate_response(message):
        time.sleep(0.1)  # optional delay for a natural chat feel
        yield partial


iface = gr.ChatInterface(chatbot)

# Guard the launch so importing this module doesn't start the server.
if __name__ == "__main__":
    iface.launch()