import gradio as gr
import time
import subprocess
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download

# Download the model and convert it to GGUF
def setup_model(model_id):
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    
    # Convert to GGUF format (requires a local llama.cpp checkout)
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        subprocess.run(["python", "llama.cpp/convert_hf_to_gguf.py", local_dir, "--outfile", gguf_path], check=True)
    
    # Quantize to Q2_K (requires the built llama.cpp binaries)
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        subprocess.run(["./llama.cpp/build/bin/llama-quantize", gguf_path, quantized_path, "Q2_K"], check=True)
    
    return quantized_path
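
# Note: a pre-quantized GGUF could also be downloaded directly, skipping the
# convert/quantize steps above (a sketch, assuming a repo that already publishes
# GGUF files; the repo id and filename below are placeholders, not verified):
#
#   from huggingface_hub import hf_hub_download
#   MODEL_PATH = hf_hub_download(repo_id="<user>/<model>-GGUF", filename="<model>-Q2_K.gguf")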

# Set the model ID and prepare the quantized weights
MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
MODEL_PATH = setup_model(MODEL_ID)

# Load the quantized model with llama-cpp-python
llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,
    n_threads=4,  # number of CPU threads; tune for your machine
    n_ctx=32768   # context window size
)
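
# If llama-cpp-python was built with GPU support, layers can be offloaded via
# n_gpu_layers (a sketch, not part of this CPU setup; -1 offloads all layers):
#
#   llm = Llama(model_path=MODEL_PATH, n_gpu_layers=-1, n_ctx=32768, verbose=False)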

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """调用 Llama 模型生成回复"""
    start_time = time.time()
    
    messages = [{"role": "system", "content": system_prompt}]
    # history arrives as (user, assistant) pairs (Gradio's tuple chat format)
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content") or ""  # delta may lack content (e.g. the initial role-only chunk)
            response += text
            yield response  # stream the accumulated text back to the UI

    print(f"生成耗时: {time.time() - start_time:.2f} 秒")

# Launch the Gradio ChatInterface
gr.ChatInterface(
    fn=chat_with_model,
    title="Llama GGUF Chatbot",
    description="使用 Llama GGUF 量化模型进行推理",
    additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(100, 4096, 1000, step=1, label="Max Tokens"),
        gr.Slider(1, 100, 40, step=1, label="Top K"),
        gr.Slider(0, 1, 0.85, label="Top P"),
    ],
).queue().launch()
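
# To expose the app beyond localhost (e.g. inside a container), standard Gradio
# launch options can be passed instead (a sketch):
#
#   .queue().launch(server_name="0.0.0.0", server_port=7860)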