Update app.py
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import time
 import subprocess
 import os
 from llama_cpp import Llama
@@ -16,61 +15,36 @@ def run_command(command, cwd=None):
     print(f"命令执行成功: {command}")
     print(result.stdout)

-
-
-
-    if not os.path.exists('llama.cpp'):
+def setup_llama_cpp():
+    """Clone and build the llama.cpp repository."""
+    if not os.path.exists('llama.cpp'):
         run_command('git clone https://github.com/ggml-org/llama.cpp.git')
-
-    # change into the repository directory and build
     os.chdir('llama.cpp')
     run_command('pip install -r requirements.txt')
     run_command('cmake -B build')
     run_command('cmake --build build --config Release -j 8')
-    os.chdir('..')
-
+    os.chdir('..')

 def setup_model(model_id):
+    """Download the model, convert it to GGUF format, and return the quantized model path."""
     local_dir = model_id.split('/')[-1]
     if not os.path.exists(local_dir):
         snapshot_download(repo_id=model_id, local_dir=local_dir)
-
-    # convert to GGUF format
     gguf_path = f"{local_dir}.gguf"
     if not os.path.exists(gguf_path):
-
-
+        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
-    # quantize the model
     quantized_path = f"{local_dir}-Q2_K.gguf"
     if not os.path.exists(quantized_path):
-
-
+        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K')
     return quantized_path

-# set the model path
-MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
-
-git_llama()
-MODEL_PATH = setup_model(MODEL_ID)
-
-# load the Llama model
-llm = Llama(
-    model_path=MODEL_PATH,
-    verbose=False,
-    n_threads=4,  # adjust the number of threads
-    n_ctx=32768  # context window size
-)
-
 def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
-    """调用
-    start_time = time.time()
-
+    """Call the Llama model to generate a reply."""
     messages = [{"role": "system", "content": system_prompt}]
     for user_msg, assistant_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-
     stream = llm.create_chat_completion(
         messages=messages,
         stream=True,
@@ -80,27 +54,33 @@ def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
         max_tokens=max_tokens,
         stop=["<|im_end|>"]
     )
-
     response = ""
     for chunk in stream:
         if "choices" in chunk and chunk["choices"]:
             text = chunk["choices"][0].get("delta", {}).get("content", "")
             response += text
-        yield response
+            yield response

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+if __name__ == "__main__":
+    MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
+    setup_llama_cpp()
+    MODEL_PATH = setup_model(MODEL_ID)
+    llm = Llama(
+        model_path=MODEL_PATH,
+        verbose=False,
+        n_threads=4,
+        n_ctx=32768
+    )
+    gr.ChatInterface(
+        fn=chat_with_model,
+        title="Llama GGUF Chatbot",
+        description="使用Llama GGUF量化模型进行推理",
+        additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
+        additional_inputs=[
+            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
+            gr.Slider(0, 1, 0.6, label="Temperature"),
+            gr.Slider(100, 4096, 1000, label="Max Tokens"),
+            gr.Slider(1, 100, 40, label="Top K"),
+            gr.Slider(0, 1, 0.85, label="Top P"),
+        ],
+    ).queue().launch()
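The hunks above call a run_command(command, cwd=None) helper whose body is collapsed in this diff; only its success prints are visible. A minimal sketch of what such a helper could look like, assuming it wraps subprocess.run with shell=True and fails loudly on a non-zero exit code (the actual implementation in app.py may differ):

import subprocess

def run_command(command, cwd=None):
    # Hypothetical reconstruction of the collapsed helper: run a shell command,
    # echo its output, and raise if it fails so a broken build step stops the Space early.
    result = subprocess.run(command, shell=True, cwd=cwd, capture_output=True, text=True)
    if result.returncode != 0:
        print(result.stderr)
        raise RuntimeError(f"command failed: {command}")
    print(f"命令执行成功: {command}")
    print(result.stdout)
    return result.stdout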
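setup_model leans on two llama.cpp tools: the bundled convert_hf_to_gguf.py script turns the checkpoint downloaded by snapshot_download into a GGUF file, and the llama-quantize binary produced by the cmake build rewrites it as a 2-bit Q2_K file. The same two steps as a standalone sketch, assuming llama.cpp has already been cloned and built as in setup_llama_cpp and that the checkpoint folder name matches the one in the diff:

import subprocess

LOCAL_DIR = "granite-3.1-2b-instruct"   # folder created by snapshot_download
GGUF_PATH = f"{LOCAL_DIR}.gguf"         # unquantized GGUF written by the converter
QUANT_PATH = f"{LOCAL_DIR}-Q2_K.gguf"   # quantized output the app loads

# 1) Hugging Face checkpoint -> GGUF (the converter needs llama.cpp's requirements.txt installed)
subprocess.run(
    ["python", "llama.cpp/convert_hf_to_gguf.py", LOCAL_DIR, "--outfile", GGUF_PATH],
    check=True,
)

# 2) GGUF -> Q2_K quantized GGUF (llama-quantize is produced by `cmake --build build`)
subprocess.run(
    ["./llama.cpp/build/bin/llama-quantize", GGUF_PATH, QUANT_PATH, "Q2_K"],
    check=True,
)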
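Once the quantized file exists, the new __main__ block loads it with llama-cpp-python's Llama class, and chat_with_model reads that module-level llm. A quick standalone smoke test of the same load, assuming the Q2_K file produced above sits in the working directory (n_ctx and n_threads mirror the diff and may need tuning for the Space's hardware):

from llama_cpp import Llama

llm = Llama(
    model_path="granite-3.1-2b-instruct-Q2_K.gguf",  # path setup_model returns in the diff
    n_ctx=32768,      # context window configured in the __main__ block
    n_threads=4,      # CPU threads used for generation
    verbose=False,
)

# One non-streamed completion to confirm the model loads and answers.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
    max_tokens=8,
)
print(out["choices"][0]["message"]["content"])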
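chat_with_model streams by accumulating each delta and yielding the running text, which is the shape gr.ChatInterface expects from a generator: every yield replaces the partial assistant reply in the UI. The same consumption pattern outside Gradio, assuming llm is an already-loaded llama_cpp.Llama instance as above:

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hello in one short sentence."},
]

response = ""
for chunk in llm.create_chat_completion(messages=messages, stream=True, max_tokens=64):
    if "choices" in chunk and chunk["choices"]:
        text = chunk["choices"][0].get("delta", {}).get("content", "")
        response += text
        print(text, end="", flush=True)  # tokens appear as they are generated
print()  # response now holds the full reply, like the final yield in chat_with_model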
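In the new __main__ block the widgets listed in additional_inputs are forwarded to the chat function positionally after (message, history), so their order has to match the trailing parameters of chat_with_model: system_prompt, temperature, max_tokens, top_k, top_p. A toy sketch of that wiring with a stand-in function, using the same widget definitions as the diff:

import gradio as gr

# Stand-in with the same signature as chat_with_model, just to show the positional mapping.
def chat_fn(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    yield f"T={temperature}, max={max_tokens}, k={top_k}, p={top_p} :: {message}"

gr.ChatInterface(
    fn=chat_fn,
    additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),  # -> system_prompt
        gr.Slider(0, 1, 0.6, label="Temperature"),                          # -> temperature
        gr.Slider(100, 4096, 1000, label="Max Tokens"),                     # -> max_tokens
        gr.Slider(1, 100, 40, label="Top K"),                               # -> top_k
        gr.Slider(0, 1, 0.85, label="Top P"),                               # -> top_p
    ],
).queue().launch()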