import os

# Enable hf_transfer-accelerated downloads. The flag must be set before
# importing huggingface_hub, which reads it at import time, and it requires
# the hf_transfer package to be installed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF weights from the Hub (cached after the first run) and load them.
model = Llama(
    model_path=hf_hub_download(
        repo_id="unsloth/Qwen3-8B-GGUF",
        filename="Qwen3-8B-UD-Q8_K_XL.gguf",
    )
)


def infer(message, history, temperature=0.7, max_tokens=1024, top_p=0.95):
    # Sampling defaults are reasonable placeholders; tune them for your use case.
    # Replay the prior turns so the model sees the whole conversation, then
    # append the new user message. With type="messages" (set below), history
    # is a list of {"role": ..., "content": ...} dicts.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})

    partial = ""
    response = model.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stream=True,
    )
    for chunk in response:
        # Each streamed chunk carries an incremental delta; the first chunk
        # may only contain the role, so fall back to an empty string.
        delta = chunk["choices"][0].get("delta", {})
        partial += delta.get("content", "")
        yield partial  # ChatInterface renders the accumulated text so far.


with gr.Blocks() as app:
    chat = gr.ChatInterface(fn=infer, type="messages")

app.launch()
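
# A possible extension (a sketch, not part of the original app): expose the
# sampling parameters as UI controls. gr.ChatInterface passes additional_inputs
# to infer() after message and history, matching the extra keyword arguments
# defined above. The slider ranges and defaults below are assumptions.
#
# chat = gr.ChatInterface(
#     fn=infer,
#     type="messages",
#     additional_inputs=[
#         gr.Slider(0.0, 2.0, value=0.7, label="Temperature"),
#         gr.Slider(1, 4096, value=1024, step=1, label="Max tokens"),
#         gr.Slider(0.0, 1.0, value=0.95, label="Top-p"),
#     ],
# )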