#MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
QUANT = "Q5_K_M"

import subprocess
def run_command(command, cwd=None):
    """Run a shell command, echoing its output; abort the script if it fails."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        raise SystemExit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)

run_command('pip install llama-cpp-python')

import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download

def setup_llama_cpp():
    """克隆并编译llama.cpp仓库"""
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')

def setup_model(model_id):
    """下载并转换模型为GGUF格式,返回量化模型路径"""
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
    quantized_path = f"{local_dir}-{QUANT}.gguf"
    if not os.path.exists(quantized_path):
        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} {QUANT}')
    return quantized_path
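# For the default MODEL_ID above, setup_model() should leave roughly these files on disk
# (names are derived from the repo name, so they change if MODEL_ID changes):
#   DeepSeek-R1-Distill-Qwen-1.5B/              <- snapshot_download() checkout
#   DeepSeek-R1-Distill-Qwen-1.5B.gguf          <- output of convert_hf_to_gguf.py
#   DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf   <- llama-quantize output, returned above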

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """调用Llama模型生成回复"""
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response
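# Note: create_chat_completion(stream=True) yields OpenAI-style chunks, so each step's new
# text sits under choices[0]["delta"]["content"]; yielding the accumulated string is what
# gr.ChatInterface expects from a streaming generator (it redraws the whole reply each time).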

if __name__ == "__main__":
    setup_llama_cpp()
    MODEL_PATH = setup_model(MODEL_ID)
    llm = Llama(
        model_path=MODEL_PATH,
        verbose=False,
        n_threads=4,
        n_ctx=32768
    )
    gr.ChatInterface(
        fn=chat_with_model,
        title="Llama GGUF Chatbot",
        description="使用Llama GGUF量化模型进行推理",
        additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
        additional_inputs=[
            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
            gr.Slider(0, 1, 0.6, label="Temperature"),
            gr.Slider(100, 4096, 1000, label="Max Tokens"),
            gr.Slider(1, 100, 40, label="Top K"),
            gr.Slider(0, 1, 0.85, label="Top P"),
        ],
    ).queue().launch()
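# To run this locally (assuming the script is saved as app.py; the filename is not fixed by
# the code itself): `python app.py`, then open the local URL that Gradio prints on launch.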