# NanoV / app.py
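"""Gradio control panel for a vLLM OpenAI-compatible API server.

Launches `vllm serve` as a subprocess and exposes start/stop/status/log
controls through a small Gradio UI.
"""
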
import os
import select

import gradio as gr
from subprocess import PIPE, Popen

# Configuration from environment variables
MODEL_NAME = os.environ.get("MODEL_NAME", "NousResearch/Nous-Hermes-2-Yi-9B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))

# Handle to the vLLM server subprocess (None when not running)
vllm_process = None
def start_vllm_server():
    global vllm_process
    if vllm_process is not None:
        return "vLLM server is already running"

    # Build the launch command
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--max-model-len", "2048",          # maximum model context length
        "--gpu-memory-utilization", "0.9",  # use up to 90% of GPU memory
    ]

    # Optionally require an API key
    api_key = os.environ.get("API_KEY", "")
    if api_key:
        cmd.extend(["--api-key", api_key])

    # Log the launch command
    print(f"Launch command: {' '.join(cmd)}")

    # Start the vLLM server
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True)
        return "vLLM server started! Please wait for the model to finish loading..."
    except Exception as e:
        return f"Error starting vLLM server: {e}"
def stop_vllm_server():
    global vllm_process
    if vllm_process is None:
        return "vLLM server is not running"
    vllm_process.terminate()
    vllm_process.wait()  # reap the child process
    vllm_process = None
    return "vLLM server stopped"
def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    if return_code is None:
        return "Running"
    else:
        return f"Stopped (return code: {return_code})"
def get_server_logs():
    if vllm_process is None:
        return "Server is not running; no logs to display"

    # Drain whatever output is currently available. select() is used so the
    # Gradio callback never blocks waiting for the next line.
    output = ""
    for stream, prefix in ((vllm_process.stdout, ""), (vllm_process.stderr, "[ERROR] ")):
        while select.select([stream], [], [], 0)[0]:
            line = stream.readline()
            if not line:  # EOF
                break
            output += prefix + line
    return output if output else "No new logs"
def serve_test_ui():
    """Build a simple control-panel UI."""
    with gr.Blocks(title="vLLM OpenAI-Compatible API Server") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Server Control Panel")
                with gr.Row():
                    start_btn = gr.Button("Start server", variant="primary")
                    stop_btn = gr.Button("Stop server", variant="stop")
                status_text = gr.Textbox(label="Server status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh status")
                logs_text = gr.Textbox(label="Server logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh logs")

                # API usage section
                gr.Markdown("## API Info")
                api_info = gr.Markdown(f"""
API endpoint: `http://localhost:{API_PORT}/v1/...`

Hugging Face Space public URL: see the Space details page after deployment

Currently loaded model: `{MODEL_NAME}`

API key: `{os.environ.get("API_KEY", "not set")}`

## Test command

```python
from openai import OpenAI
client = OpenAI(
    base_url="http://<your-HF-Space-URL>/v1",
    api_key="{os.environ.get("API_KEY", "None")}",
)
completion = client.chat.completions.create(
    model="{MODEL_NAME}",
    messages=[
        {{"role": "user", "content": "Hello!"}}
    ]
)
print(completion.choices[0].message)
```
""")

        # Wire up event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)

        # Start the server automatically when the page loads
        demo.load(start_vllm_server, inputs=[], outputs=status_text)

    return demo
# Launch the test UI
if __name__ == "__main__":
    # Create and launch the UI
    demo = serve_test_ui()
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)
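
# A minimal sketch for verifying that the API is reachable once the model has
# loaded; it assumes vLLM's standard OpenAI-compatible /v1/models route and
# that no API key is required:
#
#   import json, urllib.request
#   with urllib.request.urlopen(f"http://localhost:{API_PORT}/v1/models") as resp:
#       print(json.load(resp))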