import gradio as gr
import time
import subprocess
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download
def run_command(command, cwd=None):
    """Run a shell command and exit the process if it fails."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        exit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)
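# run_command is used for the clone/build steps below; the GGUF conversion and
# quantization steps call subprocess.run directly with check=True instead.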
# Download and convert the model
def git_llama():
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
        # Enter the repository and build
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')  # Return to the parent directory
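# Note: the clone provides the convert_hf_to_gguf.py script and the CMake build is expected to
# produce llama.cpp/build/bin/llama-quantize, both used by setup_model below; exact script and
# binary names can vary between llama.cpp releases.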
def setup_model(model_id):
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    # Convert to GGUF format
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        subprocess.run(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}', shell=True, check=True)
    # Quantize the model
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        subprocess.run(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K', shell=True, check=True)
    return quantized_path
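# Expected on-disk layout for MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
# (illustrative; derived from the path construction above):
#   ./granite-3.1-2b-instruct/           <- Hugging Face snapshot
#   ./granite-3.1-2b-instruct.gguf       <- unquantized GGUF conversion
#   ./granite-3.1-2b-instruct-Q2_K.gguf  <- Q2_K-quantized model returned by setup_model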
# Set the model path
MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
git_llama()
MODEL_PATH = setup_model(MODEL_ID)

# Load the quantized GGUF model with llama-cpp-python
llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,
    n_threads=4,   # Number of CPU threads
    n_ctx=32768    # Context window size (tokens)
)
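# Optional sanity check (a minimal sketch; uncomment to confirm the model loads and replies):
# print(llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Hello"}],
#     max_tokens=16,
# )["choices"][0]["message"]["content"])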
def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Generate a streamed reply with the Llama model."""
    start_time = time.time()
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response  # Stream partial text back to the UI
    print(f"Generation took {time.time() - start_time:.2f} s")
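# With stream=True, create_chat_completion yields chunks shaped roughly like
# {"choices": [{"delta": {"content": "..."}}]}, which is why the loop above reads
# chunk["choices"][0]["delta"]; field details may differ across llama-cpp-python versions.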
# Launch the Gradio ChatInterface
gr.ChatInterface(
    fn=chat_with_model,
    title="Llama GGUF Chatbot",
    description="Inference with a quantized Llama GGUF model",
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(100, 4096, 1000, label="Max Tokens"),
        gr.Slider(1, 100, 40, label="Top K"),
        gr.Slider(0, 1, 0.85, label="Top P"),
    ],
).queue().launch()
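# Note: the widgets in additional_inputs are passed to chat_with_model positionally after
# (message, history), so their order here must match the function's extra parameters:
# system_prompt, temperature, max_tokens, top_k, top_p.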