import gradio as gr
import time
import subprocess
import sys
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download
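
# Overall flow of this script:
#   1. Clone and build llama.cpp (provides convert_hf_to_gguf.py and llama-quantize).
#   2. Download the Hugging Face model with snapshot_download.
#   3. Convert the checkpoint to GGUF and quantize it to Q2_K.
#   4. Load the quantized file with llama-cpp-python and serve it through a
#      streaming Gradio ChatInterface.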

def run_command(command, cwd=None):
    """Run a shell command and abort the script if it fails."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        sys.exit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)


# Clone and build llama.cpp (provides the conversion and quantization tools)
def git_llama():
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')

        # Enter the repository, install the conversion dependencies, and build
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')  # return to the parent directory


def setup_model(model_id):
    """Download the HF model, convert it to GGUF, and quantize it to Q2_K."""
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)

    # Convert to GGUF format
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        subprocess.run(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}', shell=True, check=True)

    # Quantize the model
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        subprocess.run(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K', shell=True, check=True)

    return quantized_path
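
# Note: Q2_K is one of the most aggressive llama.cpp quantization presets and
# trades noticeable quality for size. If more disk and RAM are available, a
# preset such as Q4_K_M or Q8_0 (also accepted by llama-quantize) would likely
# give better answers; Q2_K here is an assumption about a constrained environment.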

# Model repository to download and quantize
MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"

git_llama()
MODEL_PATH = setup_model(MODEL_ID)

# Load the quantized GGUF model
llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,
    n_threads=4,  # number of CPU threads; tune to the available cores
    n_ctx=32768   # context window size
)
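
# A possible GPU-offloaded variant (a sketch, assuming llama-cpp-python was
# installed with CUDA or Metal support; n_gpu_layers=-1 offloads all layers):
# llm = Llama(
#     model_path=MODEL_PATH,
#     verbose=False,
#     n_threads=4,
#     n_ctx=32768,
#     n_gpu_layers=-1,
# )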

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Stream a reply from the Llama model."""
    start_time = time.time()

    # Rebuild the full conversation as chat-completion messages
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )

    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response  # stream the partial text back to Gradio

    print(f"Generation took {time.time() - start_time:.2f} s")
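
# Note: iterating over history as (user, assistant) tuples relies on Gradio's
# tuple-style chat history; newer Gradio releases deprecate that format in
# favor of "messages" (role/content dicts), so depending on the installed
# Gradio version this loop may need adapting or type="tuples" may need to be set.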

# Launch the Gradio ChatInterface
gr.ChatInterface(
    fn=chat_with_model,
    title="Llama GGUF Chatbot",
    description="Inference with a quantized Llama GGUF model",
    additional_inputs_accordion=gr.Accordion(label="⚙️ Generation Settings", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(100, 4096, 1000, label="Max Tokens"),
        gr.Slider(1, 100, 40, label="Top K"),
        gr.Slider(0, 1, 0.85, label="Top P"),
    ],
).queue().launch()
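
# The default launch() binds to localhost; when running inside a container or
# on a remote host, launch(server_name="0.0.0.0", server_port=7860) or
# share=True (temporary public link) are common alternatives.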