File size: 832 Bytes
db59ff8
8c9b649
 
 
c281c55
8c9b649
 
 
 
8cda0b2
 
8c9b649
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import os
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Enable the Rust-based `hf_transfer` download backend for faster Hub pulls.
# NOTE(review): set *after* importing huggingface_hub — presumably the flag is
# read lazily at download time; confirm, otherwise move it above the import.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


# Fetch the quantized GGUF weights from the Hugging Face Hub (cached locally
# after the first run) and load them into a llama.cpp inference engine.
_model_path = hf_hub_download(
    repo_id="unsloth/Qwen3-8B-GGUF",
    filename="Qwen3-8B-UD-Q8_K_XL.gguf",
)
model = Llama(model_path=_model_path)

def infer(message, history, temperature=0.7, max_tokens=1024, top_p=0.95):
    """Stream a chat completion for *message*, yielding the accumulated text.

    Gradio's ChatInterface calls this with ``(message, history)`` and renders
    each yielded string as the progressively growing assistant reply.

    Args:
        message: The user's latest message.
        history: Prior chat turns supplied by Gradio.
            NOTE(review): currently unused — only the latest message is sent
            to the model, so replies have no multi-turn context. Confirm
            whether history should be folded into ``messages``.
        temperature: Sampling temperature passed to the model.
        max_tokens: Maximum number of tokens to generate.
        top_p: Nucleus-sampling cutoff.

    Yields:
        The full response text accumulated so far (not just the new chunk).
    """
    accumulated = ""
    response = model.create_chat_completion(
        messages=[{"role": "user", "content": message}],
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stream=True,
    )
    for streamed in response:
        delta = streamed["choices"][0].get("delta", {})
        # Role/finish chunks may carry "content": None (or omit it entirely);
        # `or ""` coerces both cases so concatenation never sees None.
        accumulated += delta.get("content") or ""
        yield accumulated
# Assemble the UI: a single chat panel wired to the streaming inference
# function, then start the Gradio web server.
app = gr.Blocks()
with app:
    chat = gr.ChatInterface(fn=infer)
app.launch()