import gradio as gr
import time
import subprocess
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download


def run_command(command, cwd=None):
    """Run a shell command and abort the script if it fails."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        exit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)


# Clone and build llama.cpp (provides the GGUF conversion and quantization tools)
def git_llama():
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
        # Enter the repository and build
        os.chdir('llama.cpp')
        run_command('pip install -r requirements.txt')
        run_command('cmake -B build')
        run_command('cmake --build build --config Release -j 8')
        os.chdir('..')  # return to the parent directory


def setup_model(model_id):
    """Download the model, convert it to GGUF, and quantize it to Q2_K."""
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)

    # Convert the Hugging Face checkpoint to GGUF format
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        subprocess.run(
            f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}',
            shell=True, check=True
        )

    # Quantize the model
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        subprocess.run(
            f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K',
            shell=True, check=True
        )

    return quantized_path


# Model to download and quantize
MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
git_llama()
MODEL_PATH = setup_model(MODEL_ID)

# Load the quantized Llama model
llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,
    n_threads=4,   # adjust to the number of available CPU cores
    n_ctx=32768    # context window size
)


def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Stream a reply from the Llama model."""
    start_time = time.time()

    # Rebuild the conversation: system prompt, prior turns, then the new message
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )

    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response  # stream the partial text back to the UI
    print(f"Generation took {time.time() - start_time:.2f} s")


# Launch the Gradio ChatInterface
gr.ChatInterface(
    fn=chat_with_model,
    title="Llama GGUF Chatbot",
    description="Run inference with a quantized Llama GGUF model",
    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(100, 4096, 1000, label="Max Tokens"),
        gr.Slider(1, 100, 40, label="Top K"),
        gr.Slider(0, 1, 0.85, label="Top P"),
    ],
).queue().launch()
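
# Usage note (an assumption, not part of the original script): the imports above
# require the llama-cpp-python, gradio, and huggingface_hub packages, which can be
# installed with something like:
#
#   pip install llama-cpp-python gradio huggingface_hub
#
# Saved as app.py (hypothetical filename) and run with `python app.py`, the script
# clones and builds llama.cpp, downloads and quantizes the model on first launch,
# and then serves the chat UI at the local URL that Gradio prints.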