#MODEL_ID = "ibm-granite/granite-3.1-2b-instruct" | |
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" | |
QUANT = "Q5_K_M" | |
import subprocess
import sys

def run_command(command, cwd=None):
    """Run a shell command, echo its output, and exit on failure."""
    result = subprocess.run(command, shell=True, cwd=cwd, text=True, capture_output=True)
    if result.returncode != 0:
        print(f"Command failed: {command}")
        print(f"Error output: {result.stderr}")
        sys.exit(result.returncode)
    else:
        print(f"Command succeeded: {command}")
        print(result.stdout)
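# Illustrative usage of the helper above (not part of the original Space): any shell step
# can be run the same way, e.g. run_command('nvidia-smi') would print the GPU status or
# terminate the app with the command's exit code if it fails.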
# Install the Python bindings at runtime so `from llama_cpp import Llama` below succeeds.
run_command('pip install llama-cpp-python')

import gradio as gr
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download
def setup_llama_cpp():
    """Clone and build the llama.cpp repository."""
    if not os.path.exists('llama.cpp'):
        run_command('git clone https://github.com/ggml-org/llama.cpp.git')
    os.chdir('llama.cpp')
    run_command('pip install -r requirements.txt')  # dependencies of convert_hf_to_gguf.py
    run_command('cmake -B build')
    run_command('cmake --build build --config Release -j 8')
    os.chdir('..')
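# After setup_llama_cpp() runs, the two artifacts used by setup_model() below are expected
# to exist: ./llama.cpp/convert_hf_to_gguf.py (HF checkpoint -> GGUF conversion script) and
# ./llama.cpp/build/bin/llama-quantize (GGUF quantization binary).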
def setup_model(model_id):
    """Download the model, convert it to GGUF, quantize it, and return the quantized path."""
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    # Convert the Hugging Face checkpoint to a full-precision GGUF file.
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
    # Quantize the GGUF file with llama-quantize.
    quantized_path = f"{local_dir}-{QUANT}.gguf"
    if not os.path.exists(quantized_path):
        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} {QUANT}')
    return quantized_path
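# For the MODEL_ID configured above, setup_model() produces (names derived from the code,
# not verified on disk):
#   ./DeepSeek-R1-Distill-Qwen-1.5B/             downloaded checkpoint
#   ./DeepSeek-R1-Distill-Qwen-1.5B.gguf         full-precision GGUF conversion
#   ./DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf  quantized model loaded below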
def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """Stream a reply from the Llama model, rebuilding the chat history each turn."""
    messages = [{"role": "system", "content": system_prompt}]
    # Gradio passes history as (user, assistant) pairs; flatten them into chat messages.
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    # Accumulate streamed deltas and yield the growing response so the UI updates live.
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content", "")
            response += text
            yield response
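# Standalone usage sketch (illustrative, not part of the Space): once `llm` is loaded, the
# generator can be exercised without the UI, e.g.
#   for partial in chat_with_model("Hello", [], "You are a helpful assistant.", 0.6, 256, 40, 0.85):
#       pass
#   print(partial)  # final accumulated reply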
if __name__ == "__main__":
    setup_llama_cpp()
    MODEL_PATH = setup_model(MODEL_ID)
    # Global Llama instance referenced by chat_with_model() above.
    llm = Llama(
        model_path=MODEL_PATH,
        verbose=False,
        n_threads=4,
        n_ctx=32768
    )
    gr.ChatInterface(
        fn=chat_with_model,
        title="Llama GGUF Chatbot",
        description="Inference with a quantized Llama GGUF model",
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameter settings", open=False),
        additional_inputs=[
            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
            gr.Slider(0, 1, 0.6, label="Temperature"),
            gr.Slider(100, 4096, 1000, label="Max Tokens"),
            gr.Slider(1, 100, 40, label="Top K"),
            gr.Slider(0, 1, 0.85, label="Top P"),
        ],
    ).queue().launch()
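# To try this outside Spaces (assuming the script is saved as app.py, the usual Space entry
# point), `python app.py` clones and builds llama.cpp, downloads and quantizes the model,
# then serves the chat UI on Gradio's default port (7860).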