#MODEL_ID = "ibm-granite/granite-3.1-2b-instruct" | |
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" | |
QUANT = "Q5_K_M" | |
import subprocess
import sys

def run_command(command, cwd=None):
    """Run a shell command, echo its output, and exit on failure."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        sys.exit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)
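# Illustrative usage of the helper above (not part of the original Space): any shell step
# can be run the same way, e.g. run_command('nvidia-smi') would print the GPU status or
# terminate the app with the command's exit code if it fails.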
# Install the Python bindings at runtime so `from llama_cpp import Llama` below succeeds.
run_command('pip install llama-cpp-python')

import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download
def setup_llama_cpp():
    """Clone and build the llama.cpp repository."""
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
    os.chdir('llama.cpp')
    run_command('pip install -r requirements.txt')  # dependencies of convert_hf_to_gguf.py
    run_command('cmake -B build')
    run_command('cmake --build build --config Release -j 8')
    os.chdir('..')
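# After setup_llama_cpp() runs, the two artifacts used by setup_model() below are expected
# to exist: ./llama.cpp/convert_hf_to_gguf.py (HF checkpoint -> GGUF conversion script) and
# ./llama.cpp/build/bin/llama-quantize (GGUF quantization binary).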
def setup_model(model_id):
    """Download the model, convert it to GGUF, quantize it, and return the quantized path."""
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    # Convert the Hugging Face checkpoint to a full-precision GGUF file.
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
    # Quantize the GGUF file with llama-quantize.
    quantized_path = f"{local_dir}-{QUANT}.gguf"
    if not os.path.exists(quantized_path):
        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} {QUANT}')
    return quantized_path
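# For the MODEL_ID configured above, setup_model() produces (names derived from the code,
# not verified on disk):
#   ./DeepSeek-R1-Distill-Qwen-1.5B/             downloaded checkpoint
#   ./DeepSeek-R1-Distill-Qwen-1.5B.gguf         full-precision GGUF conversion
#   ./DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf  quantized model loaded below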
def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Stream a reply from the Llama model, rebuilding the chat history each turn."""
    messages = [{"role": "system", "content": system_prompt}]
    # Gradio passes history as (user, assistant) pairs; flatten them into chat messages.
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    # Accumulate streamed deltas and yield the growing response so the UI updates live.
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response
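# Standalone usage sketch (illustrative, not part of the Space): once `llm` is loaded, the
# generator can be exercised without the UI, e.g.
#   for partial in chat_with_model("Hello", [], "You are a helpful assistant.", 0.6, 256, 40, 0.85):
#       pass
#   print(partial)  # final accumulated reply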
if __name__ == "__main__":
    setup_llama_cpp()
    MODEL_PATH = setup_model(MODEL_ID)
    # Global Llama instance referenced by chat_with_model() above.
    llm = Llama(
        model_path=MODEL_PATH,
        verbose=False,
        n_threads=4,
        n_ctx=32768
    )
    gr.ChatInterface(
        fn=chat_with_model,
        title="Llama GGUF Chatbot",
        description="Inference with a quantized Llama GGUF model",
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameter settings", open=False),
        additional_inputs=[
            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
            gr.Slider(0, 1, 0.6, label="Temperature"),
            gr.Slider(100, 4096, 1000, label="Max Tokens"),
            gr.Slider(1, 100, 40, label="Top K"),
            gr.Slider(0, 1, 0.85, label="Top P"),
        ],
    ).queue().launch()
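# To try this outside Spaces (assuming the script is saved as app.py, the usual Space entry
# point), `python app.py` clones and builds llama.cpp, downloads and quantizes the model,
# then serves the chat UI on Gradio's default port (7860).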