import gradio as gr
import time
import subprocess
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download
def run_command(command, cwd=None):
"""运行系统命令"""
result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
if result.returncode != 0:
print(f"命令执行失败: {command}")
print(f"错误信息: {result.stderr}")
exit(result.returncode)
else:
print(f"命令执行成功: {command}")
print(result.stdout)

# Clone and build llama.cpp (provides the GGUF conversion script and llama-quantize)
def git_llama():
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
        # Enter the repository and build it
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')  # Return to the parent directory

def setup_model(model_id):
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    # Convert the downloaded model to GGUF format
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        subprocess.run(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}', shell=True, check=True)
    # Quantize the GGUF model to Q2_K
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        subprocess.run(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K', shell=True, check=True)
    return quantized_path

# Set the model path
MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
git_llama()
MODEL_PATH = setup_model(MODEL_ID)

# Load the quantized Llama model
llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,
    n_threads=4,   # Number of CPU threads
    n_ctx=32768    # Context window size
)

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Stream a reply from the Llama model."""
    start_time = time.time()

    # Rebuild the full conversation from the system prompt and the chat history
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )

    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response  # Stream the accumulated text back to the UI
    print(f"Generation took {time.time() - start_time:.2f} s")

# Launch the Gradio ChatInterface
gr.ChatInterface(
    fn=chat_with_model,
    title="Llama GGUF Chatbot",
    description="Inference with a quantized Llama GGUF model",
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(100, 4096, 1000, label="Max Tokens"),
        gr.Slider(1, 100, 40, label="Top K"),
        gr.Slider(0, 1, 0.85, label="Top P"),
    ],
).queue().launch()