import gradio as gr
import time
import subprocess
import os
from llama_cpp import Llama
from huggingface_hub import snapshot_download

# Download the model and convert it to GGUF
def setup_model(model_id):
    local_dir = model_id.split('/')[-1]
    if not os.path.exists(local_dir):
        snapshot_download(repo_id=model_id, local_dir=local_dir)
    
    # Convert to GGUF format (requires a local llama.cpp checkout)
    gguf_path = f"{local_dir}.gguf"
    if not os.path.exists(gguf_path):
        subprocess.run(["python", "llama.cpp/convert_hf_to_gguf.py", local_dir, "--outfile", gguf_path], check=True)
    
    # Quantize to Q2_K (requires the built llama.cpp binaries)
    quantized_path = f"{local_dir}-Q2_K.gguf"
    if not os.path.exists(quantized_path):
        subprocess.run(["./llama.cpp/build/bin/llama-quantize", gguf_path, quantized_path, "Q2_K"], check=True)
    
    return quantized_path
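
# Note: a pre-quantized GGUF could also be downloaded directly, skipping the
# convert/quantize steps above (a sketch, assuming a repo that already publishes
# GGUF files; the repo id and filename below are placeholders, not verified):
#
#   from huggingface_hub import hf_hub_download
#   MODEL_PATH = hf_hub_download(repo_id="<user>/<model>-GGUF", filename="<model>-Q2_K.gguf")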

# Set the model ID and prepare the quantized weights
MODEL_ID = "ibm-granite/granite-3.1-2b-instruct"
MODEL_PATH = setup_model(MODEL_ID)

# Load the quantized model with llama-cpp-python
llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,
    n_threads=4,  # number of CPU threads; tune for your machine
    n_ctx=32768   # context window size
)
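
# If llama-cpp-python was built with GPU support, layers can be offloaded via
# n_gpu_layers (a sketch, not part of this CPU setup; -1 offloads all layers):
#
#   llm = Llama(model_path=MODEL_PATH, n_gpu_layers=-1, n_ctx=32768, verbose=False)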

def chat_with_model(message, history, system_prompt, temperature, max_tokens, top_k, top_p):
    """调用 Llama 模型生成回复"""
    start_time = time.time()
    
    messages = [{"role": "system", "content": system_prompt}]
    # history arrives as (user, assistant) pairs (Gradio's tuple chat format)
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})
    
    stream = llm.create_chat_completion(
        messages=messages,
        stream=True,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_tokens=max_tokens,
        stop=["<|im_end|>"]
    )
    
    response = ""
    for chunk in stream:
        if "choices" in chunk and chunk["choices"]:
            text = chunk["choices"][0].get("delta", {}).get("content") or ""  # delta may lack content (e.g. the initial role-only chunk)
            response += text
            yield response  # stream the accumulated text back to the UI

    print(f"生成耗时: {time.time() - start_time:.2f} 秒")

# Launch the Gradio ChatInterface
gr.ChatInterface(
    fn=chat_with_model,
    title="Llama GGUF Chatbot",
    description="使用 Llama GGUF 量化模型进行推理",
    additional_inputs_accordion=gr.Accordion(label="⚙️ 参数设置", open=False),
    additional_inputs=[
        gr.Textbox("You are a helpful assistant.", label="System Prompt"),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(100, 4096, 1000, step=1, label="Max Tokens"),
        gr.Slider(1, 100, 40, step=1, label="Top K"),
        gr.Slider(0, 1, 0.85, label="Top P"),
    ],
).queue().launch()
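
# To expose the app beyond localhost (e.g. inside a container), standard Gradio
# launch options can be passed instead (a sketch):
#
#   .queue().launch(server_name="0.0.0.0", server_port=7860)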