#MODEL_ID = "ibm-granite/granite-3.1-2b-instruct" MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" import subprocess def run_command(command, cwd=None): """运行系统命令""" result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True) if result.returncode != 0: print(f"命令执行失败: {command}") print(f"错误信息: {result.stderr}") exit(result.returncode) else: print(f"命令执行成功: {command}") print(result.stdout) run_command('pip install llama-cpp-python') import gradio as gr import os from llama_cpp import Llama from huggingface_hub import snapshot_download def setup_llama_cpp(): """克隆并编译llama.cpp仓库""" if not os.path.exists('llama.cpp'): run_command('git clone https://github.com/ggml-org/llama.cpp.git') os.chdir('llama.cpp') run_command('pip install -r requirements.txt') run_command('cmake -B build') run_command('cmake --build build --config Release -j 8') os.chdir('..') def setup_model(model_id): """下载并转换模型为GGUF格式,返回量化模型路径""" local_dir = model_id.split('/')[-1] if not os.path.exists(local_dir): snapshot_download(repo_id=model_id, local_dir=local_dir) gguf_path = f"{local_dir}.gguf" if not os.path.exists(gguf_path): run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}') quantized_path = f"{local_dir}-IQ2_XXS.gguf" if not os.path.exists(quantized_path): run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} IQ2_XXS') return quantized_path def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p): """调用Llama模型生成回复""" messages = [{"role": "system", "content": system_prompt}] for user_msg, assistant_msg in history: messages.append({"role": "user", "content": user_msg}) messages.append({"role": "assistant", "content": assistant_msg}) messages.append({"role": "user", "content": message}) stream = llm.create_chat_completion( messages=messages, stream=True, temperature=temperature, top_k=top_k, top_p=top_p, max_tokens=max_tokens, stop=["<|im_end|>"] ) response = "" for chunk in stream: if "choices" in chunk and chunk["choices"]: text = chunk["choices"][0].get("delta", {}).get("content", "") response += text yield response if __name__ == "__main__": setup_llama_cpp() MODEL_PATH = setup_model(MODEL_ID) llm = Llama( model_path=MODEL_PATH, verbose=False, n_threads=4, n_ctx=32768 ) gr.ChatInterface( fn=chat_with_model, title="Llama GGUF Chatbot", description="使用Llama GGUF量化模型进行推理", additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False), additional_inputs=[ gr.Textbox("You are a helpful assistant.", label="System Prompt"), gr.Slider(0, 1, 0.6, label="Temperature"), gr.Slider(100, 4096, 1000, label="Max Tokens"), gr.Slider(1, 100, 40, label="Top K"), gr.Slider(0, 1, 0.85, label="Top P"), ], ).queue().launch()