Update app.py
app.py CHANGED
@@ -1,5 +1,4 @@
 import gradio as gr
-import time
 import subprocess
 import os
 from llama_cpp import Llama
@@ -16,61 +15,36 @@ def run_command(command, cwd=None):
     print(f"命令执行成功: {command}")
     print(result.stdout)

-
-
-
-    if not os.path.exists('llama.cpp'):
+def setup_llama_cpp():
+    """Clone and build the llama.cpp repository."""
+    if not os.path.exists('llama.cpp'):
         run_command('git clone https://github.com/ggml-org/llama.cpp.git')
-
-    # change into the repository directory and build
     os.chdir('llama.cpp')
     run_command('pip install -r requirements.txt')
     run_command('cmake -B build')
     run_command('cmake --build build --config Release -j 8')
-    os.chdir('..')
-
+    os.chdir('..')

 def setup_model(model_id):
+    """Download the model, convert it to GGUF format, and return the quantized model path."""
     local_dir = model_id.split('/')[-1]
     if not os.path.exists(local_dir):
         snapshot_download(repo_id=model_id, local_dir=local_dir)
-
-    # convert to GGUF format
     gguf_path = f"{local_dir}.gguf"
     if not os.path.exists(gguf_path):
-
-
+        run_command(f'python llama.cpp/convert_hf_to_gguf.py ./{local_dir} --outfile {gguf_path}')
-    # quantize the model
     quantized_path = f"{local_dir}-Q2_K.gguf"
     if not os.path.exists(quantized_path):
-
-
+        run_command(f'./llama.cpp/build/bin/llama-quantize ./{gguf_path} {quantized_path} Q2_K')
     return quantized_path

-# set the model path
-MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
-
-git_llama()
-MODEL_PATH = setup_model(MODEL_ID)
-
-# load the Llama model
-llm = Llama(
-    model_path=MODEL_PATH,
-    verbose=False,
-    n_threads=4,  # adjust the number of threads
-    n_ctx=32768  # context window size
-)
-
 def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
-    """调用
-    start_time = time.time()
-
+    """Call the Llama model to generate a reply."""
     messages = [{"role": "system", "content": system_prompt}]
     for user_msg, assistant_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
-
     stream = llm.create_chat_completion(
         messages=messages,
         stream=True,
@@ -80,27 +54,33 @@ def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
         max_tokens=max_tokens,
         stop=["<|im_end|>"]
     )
-
     response = ""
     for chunk in stream:
         if "choices" in chunk and chunk["choices"]:
             text = chunk["choices"][0].get("delta", {}).get("content", "")
             response += text
-        yield response
+            yield response

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+if __name__ == "__main__":
+    MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
+    setup_llama_cpp()
+    MODEL_PATH = setup_model(MODEL_ID)
+    llm = Llama(
+        model_path=MODEL_PATH,
+        verbose=False,
+        n_threads=4,
+        n_ctx=32768
+    )
+    gr.ChatInterface(
+        fn=chat_with_model,
+        title="Llama GGUF Chatbot",
+        description="使用Llama GGUF量化模型进行推理",
+        additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
+        additional_inputs=[
+            gr.Textbox("You are a helpful assistant.", label="System Prompt"),
+            gr.Slider(0, 1, 0.6, label="Temperature"),
+            gr.Slider(100, 4096, 1000, label="Max Tokens"),
+            gr.Slider(1, 100, 40, label="Top K"),
+            gr.Slider(0, 1, 0.85, label="Top P"),
+        ],
+    ).queue().launch()
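The hunks above call a run_command(command, cwd=None) helper whose body is collapsed in this diff; only its success prints are visible. A minimal sketch of what such a helper could look like, assuming it wraps subprocess.run with shell=True and fails loudly on a non-zero exit code (the actual implementation in app.py may differ):

import subprocess

def run_command(command, cwd=None):
    # Hypothetical reconstruction of the collapsed helper: run a shell command,
    # echo its output, and raise if it fails so a broken build step stops the Space early.
    result = subprocess.run(command, shell=True, cwd=cwd, capture_output=True, text=True)
    if result.returncode != 0:
        print(result.stderr)
        raise RuntimeError(f"command failed: {command}")
    print(f"命令执行成功: {command}")
    print(result.stdout)
    return result.stdout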
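setup_model leans on two llama.cpp tools: the bundled convert_hf_to_gguf.py script turns the checkpoint downloaded by snapshot_download into a GGUF file, and the llama-quantize binary produced by the cmake build rewrites it as a 2-bit Q2_K file. The same two steps as a standalone sketch, assuming llama.cpp has already been cloned and built as in setup_llama_cpp and that the checkpoint folder name matches the one in the diff:

import subprocess

LOCAL_DIR = "granite-3.1-2b-instruct"   # folder created by snapshot_download
GGUF_PATH = f"{LOCAL_DIR}.gguf"         # unquantized GGUF written by the converter
QUANT_PATH = f"{LOCAL_DIR}-Q2_K.gguf"   # quantized output the app loads

# 1) Hugging Face checkpoint -> GGUF (the converter needs llama.cpp's requirements.txt installed)
subprocess.run(
    ["python", "llama.cpp/convert_hf_to_gguf.py", LOCAL_DIR, "--outfile", GGUF_PATH],
    check=True,
)

# 2) GGUF -> Q2_K quantized GGUF (llama-quantize is produced by `cmake --build build`)
subprocess.run(
    ["./llama.cpp/build/bin/llama-quantize", GGUF_PATH, QUANT_PATH, "Q2_K"],
    check=True,
)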
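Once the quantized file exists, the new __main__ block loads it with llama-cpp-python's Llama class, and chat_with_model reads that module-level llm. A quick standalone smoke test of the same load, assuming the Q2_K file produced above sits in the working directory (n_ctx and n_threads mirror the diff and may need tuning for the Space's hardware):

from llama_cpp import Llama

llm = Llama(
    model_path="granite-3.1-2b-instruct-Q2_K.gguf",  # path setup_model returns in the diff
    n_ctx=32768,      # context window configured in the __main__ block
    n_threads=4,      # CPU threads used for generation
    verbose=False,
)

# One non-streamed completion to confirm the model loads and answers.
out = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Reply with the single word: ok"}],
    max_tokens=8,
)
print(out["choices"][0]["message"]["content"])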
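chat_with_model streams by accumulating each delta and yielding the running text, which is the shape gr.ChatInterface expects from a generator: every yield replaces the partial assistant reply in the UI. The same consumption pattern outside Gradio, assuming llm is an already-loaded llama_cpp.Llama instance as above:

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hello in one short sentence."},
]

response = ""
for chunk in llm.create_chat_completion(messages=messages, stream=True, max_tokens=64):
    if "choices" in chunk and chunk["choices"]:
        text = chunk["choices"][0].get("delta", {}).get("content", "")
        response += text
        print(text, end="", flush=True)  # tokens appear as they are generated
print()  # response now holds the full reply, like the final yield in chat_with_model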
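In the new __main__ block the widgets listed in additional_inputs are forwarded to the chat function positionally after (message, history), so their order has to match the trailing parameters of chat_with_model: system_prompt, temperature, max_tokens, top_k, top_p. A toy sketch of that wiring with a stand-in function, using the same widget definitions as the diff:

import gradio as gr

# Stand-in with the same signature as chat_with_model, just to show the positional mapping.
def chat_fn(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    yield f"T={temperature}, max={max_tokens}, k={top_k}, p={top_p} :: {message}"

gr.ChatInterface(
    fn=chat_fn,
    additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),  # -> system_prompt
        gr.Slider(0, 1, 0.6, label="Temperature"),                          # -> temperature
        gr.Slider(100, 4096, 1000, label="Max Tokens"),                     # -> max_tokens
        gr.Slider(1, 100, 40, label="Top K"),                               # -> top_k
        gr.Slider(0, 1, 0.85, label="Top P"),                               # -> top_p
    ],
).queue().launch()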