hsuwill000 committed
Commit cb2e59c · verified · 1 Parent(s): 73dbe3a

Update app.py

Files changed (1)
app.py +32 -52
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import time
 import subprocess
 import os
 from llama_cpp import Llama
@@ -16,61 +15,36 @@ def run_command(command, cwd=None):
         print(f"Command succeeded: {command}")
         print(result.stdout)
 
-
-# Download and convert the model
-def git_llama():
-    if not os.path.exists('llama.cpp'):
+def setup_llama_cpp():
+    """Clone and build the llama.cpp repository"""
+    if not os.path.exists('llama.cpp'):
         run_command('git clone https://github.com/ggml-org/llama.cpp.git')
-
-    # Enter the repository directory and build
     os.chdir('llama.cpp')
     run_command('pip install -r requirements.txt')
     run_command('cmake -B build')
     run_command('cmake --build build --config Release -j 8')
-    os.chdir('..')  # go back to the parent directory
-
+    os.chdir('..')
 
 def setup_model(model_id):
+    """Download the model, convert it to GGUF, and return the quantized model path"""
     local_dir = model_id.split('/')[-1]
     if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
-
-    # Convert to GGUF format
     gguf_path = f"{local_dir}.gguf"
     if not os.path.exists(gguf_path):
-        subprocess.run(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}', shell=True, check=True)
-
-    # Quantize the model
+        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
     quantized_path = f"{local_dir}-Q2_K.gguf"
     if not os.path.exists(quantized_path):
-        subprocess.run(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K', shell=True, check=True)
-
+        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K')
     return quantized_path
 
-# Set the model path
-MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
-
-git_llama()
-MODEL_PATH = setup_model(MODEL_ID)
-
-# Load the Llama model
-llm = Llama(
-    model_path=MODEL_PATH,
-    verbose=False,
-    n_threads=4,   # adjust the thread count
-    n_ctx=32768    # context window size
-)
-
 def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
-    """Generate a reply with the Llama model"""
-    start_time = time.time()
-
+    """Generate a reply with the Llama model"""
     messages = [{"role": "system", "content": system_prompt}]
     for user_msg, assistant_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-
     stream = llm.create_chat_completion(
         messages=messages,
         stream=True,
@@ -80,27 +54,33 @@ def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
         max_tokens=max_tokens,
         stop=["<|im_end|>"]
     )
-
     response = ""
     for chunk in stream:
         if "choices" in chunk and chunk["choices"]:
             text = chunk["choices"][0].get("delta", {}).get("content", "")
             response += text
-            yield response  # stream the text back
+            yield response
 
-    print(f"Generation time: {time.time() - start_time:.2f} s")
-
-# Launch the Gradio ChatInterface
-gr.ChatInterface(
-    fn=chat_with_model,
-    title="Llama GGUF Chatbot",
-    description="Run inference with a quantized Llama GGUF model",
-    additional_inputs_accordion=gr.Accordion(label="⚙️ Parameter Settings", open=False),
-    additional_inputs=[
-        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
-        gr.Slider(0, 1, 0.6, label="Temperature"),
-        gr.Slider(100, 4096, 1000, label="Max Tokens"),
-        gr.Slider(1, 100, 40, label="Top K"),
-        gr.Slider(0, 1, 0.85, label="Top P"),
-    ],
-).queue().launch()
+if __name__ == "__main__":
+    MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
+    setup_llama_cpp()
+    MODEL_PATH = setup_model(MODEL_ID)
+    llm = Llama(
+        model_path=MODEL_PATH,
+        verbose=False,
+        n_threads=4,
+        n_ctx=32768
+    )
+    gr.ChatInterface(
+        fn=chat_with_model,
+        title="Llama GGUF Chatbot",
+        description="Run inference with a quantized Llama GGUF model",
+        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameter Settings", open=False),
+        additional_inputs=[
+            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
+            gr.Slider(0, 1, 0.6, label="Temperature"),
+            gr.Slider(100, 4096, 1000, label="Max Tokens"),
+            gr.Slider(1, 100, 40, label="Top K"),
+            gr.Slider(0, 1, 0.85, label="Top P"),
+        ],
+    ).queue().launch()
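
Note: the `run_command` helper called throughout this diff is defined above the first hunk and is not shown here; only its signature (`def run_command(command, cwd=None):`, from the hunk header) and its success-path prints are visible. A minimal sketch of what that helper presumably looks like (the subprocess.run flags and the error handling are assumptions, not part of this commit):

# Hypothetical reconstruction of the run_command helper; only the signature
# and the two success-path prints are confirmed by the diff context.
import subprocess

def run_command(command, cwd=None):
    # Run the shell command, capturing stdout/stderr as text.
    result = subprocess.run(command, shell=True, cwd=cwd,
                            capture_output=True, text=True)
    if result.returncode != 0:
        # Assumed failure handling; the actual script may differ.
        raise RuntimeError(f"Command failed: {command}\n{result.stderr}")
    print(f"Command succeeded: {command}")
    print(result.stdout)
    return result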