import os

# Enable hf_transfer-accelerated downloads. The flag must be set before
# importing huggingface_hub, which reads it at import time, and it requires
# the hf_transfer package to be installed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF weights from the Hub (cached after the first run) and load them.
model = Llama(
    model_path=hf_hub_download(
        repo_id="unsloth/Qwen3-8B-GGUF",
        filename="Qwen3-8B-UD-Q8_K_XL.gguf",
    )
)


def infer(message, history, temperature=0.7, max_tokens=1024, top_p=0.95):
    # Sampling defaults are reasonable placeholders; tune them for your use case.
    # Replay the prior turns so the model sees the whole conversation, then
    # append the new user message. With type="messages" (set below), history
    # is a list of {"role": ..., "content": ...} dicts.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})

    partial = ""
    response = model.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stream=True,
    )
    for chunk in response:
        # Each streamed chunk carries an incremental delta; the first chunk
        # may only contain the role, so fall back to an empty string.
        delta = chunk["choices"][0].get("delta", {})
        partial += delta.get("content", "")
        yield partial  # ChatInterface renders the accumulated text so far.


with gr.Blocks() as app:
    chat = gr.ChatInterface(fn=infer, type="messages")

app.launch()
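
# A possible extension (a sketch, not part of the original app): expose the
# sampling parameters as UI controls. gr.ChatInterface passes additional_inputs
# to infer() after message and history, matching the extra keyword arguments
# defined above. The slider ranges and defaults below are assumptions.
#
# chat = gr.ChatInterface(
#     fn=infer,
#     type="messages",
#     additional_inputs=[
#         gr.Slider(0.0, 2.0, value=0.7, label="Temperature"),
#         gr.Slider(1, 4096, value=1024, step=1, label="Max tokens"),
#         gr.Slider(0.0, 1.0, value=0.95, label="Top-p"),
#     ],
# )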