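# Gradio chat app: download a Hugging Face model, convert it to GGUF with
# llama.cpp, quantize it, and serve it for streaming chat via llama-cpp-python.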
#MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
import subprocess
import sys


def run_command(command, cwd=None):
    """Run a shell command and exit with its return code on failure."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        sys.exit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)
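
# Install the llama-cpp-python bindings at startup so they can be imported below.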
run_command('pip install llama-cpp-python')
import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download

def setup_llama_cpp():
    """Clone and build the llama.cpp repository if it is not already present."""
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')
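
# The CMake build above produces the llama-quantize binary under
# llama.cpp/build/bin/, which setup_model() relies on below.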

def setup_model(model_id):
    """Download the model, convert it to GGUF, and return the path to the quantized file."""
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
    quantized_path = f"{local_dir}-IQ2_XXS.gguf"
    if not os.path.exists(quantized_path):
        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} IQ2_XXS')
    return quantized_path
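
# For the default MODEL_ID above, the quantized file ends up at
# ./DeepSeek-R1-Distill-Qwen-1.5B-IQ2_XXS.gguf.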

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Rebuild the chat history into messages and stream a reply from the Llama model."""
    messages = [{"role": "system", "content": system_prompt}]
    # gr.ChatInterface passes history as (user, assistant) tuple pairs by default.
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response

if __name__ == "__main__":
    setup_llama_cpp()
    MODEL_PATH = setup_model(MODEL_ID)
    llm = Llama(
        model_path=MODEL_PATH,
        verbose=False,
        n_threads=4,
        n_ctx=32768
    )
    gr.ChatInterface(
        fn=chat_with_model,
        title="Llama GGUF Chatbot",
        description="Inference with a quantized Llama GGUF model",
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameter Settings", open=False),
        additional_inputs=[
            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
            gr.Slider(0, 1, 0.6, label="Temperature"),
            gr.Slider(100, 4096, 1000, label="Max Tokens"),
            gr.Slider(1, 100, 40, label="Top K"),
            gr.Slider(0, 1, 0.85, label="Top P"),
        ],
    ).queue().launch()
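    # Running this file directly (python app.py) builds llama.cpp, prepares the
    # quantized model, and launches the Gradio UI; the first start is slow because
    # of the clone/convert/quantize steps.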