File size: 4,837 Bytes
1337025
 
b248825
1337025
b248825
1337025
 
 
 
b248825
1337025
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import os
import select
import subprocess
import sys
from subprocess import Popen, PIPE

import gradio as gr

# Environment-driven configuration: which model to serve, plus the ports
# for the OpenAI-compatible API (vLLM) and the Gradio control-panel UI.
MODEL_NAME = os.environ.get("MODEL_NAME", "NousResearch/Nous-Hermes-2-Yi-9B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))

# Handle to the vLLM server subprocess; None while no server has been started.
vllm_process = None

def start_vllm_server():
    """Launch the vLLM OpenAI-compatible API server as a subprocess.

    Returns a human-readable status string for the UI. Safe to call
    repeatedly: a live server is never started twice, and a handle to a
    process that has already exited is discarded so a restart works.
    """
    global vllm_process
    # Only treat the server as "already running" when the child is alive;
    # poll() returns None while the process has not exited yet. The original
    # `is not None` check made a crashed server impossible to restart.
    if vllm_process is not None and vllm_process.poll() is None:
        return "vLLM 服务已经在运行"
    vllm_process = None  # drop a stale handle to a dead process

    # Build the launch command (list form — no shell involved).
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--max-model-len", "2048",  # maximum model context length
        "--gpu-memory-utilization", "0.9"  # use 90% of GPU memory
    ]

    # Optionally protect the API with a key.
    api_key = os.environ.get("API_KEY", "")
    if api_key:
        cmd.extend(["--api-key", api_key])

    # Log the exact command for debugging.
    print(f"启动命令: {' '.join(cmd)}")

    # Start the server; report failures (e.g. vllm not installed) as text
    # so the UI shows the error instead of crashing.
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True)
        return "vLLM 服务器已启动!请等待模型加载完成..."
    except Exception as e:
        return f"启动vLLM服务器时出错: {str(e)}"

def stop_vllm_server():
    """Stop the vLLM server subprocess, escalating to SIGKILL if needed.

    Returns a human-readable status string for the UI.
    """
    global vllm_process
    if vllm_process is None:
        return "vLLM 服务未运行"

    vllm_process.terminate()
    try:
        # Reap the child so it does not linger as a zombie process.
        vllm_process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        # Did not exit on SIGTERM within the grace period: force-kill it.
        vllm_process.kill()
        vllm_process.wait()
    vllm_process = None
    return "vLLM 服务已停止"

def check_server_status():
    """Report the vLLM subprocess state as display text for the UI."""
    if vllm_process is None:
        return "未运行"

    # poll() yields None while the child is still alive, otherwise the
    # exit code of the finished process.
    rc = vllm_process.poll()
    return "运行中" if rc is None else f"已停止 (返回码: {rc})"

def get_server_logs():
    """Drain any pending stdout/stderr output from the vLLM subprocess.

    Non-blocking: uses select() with a zero timeout so the UI does not
    hang while the server is alive but quiet (the original readline()
    loop blocked forever in that case). stderr lines are tagged with
    "[ERROR] ". Returns the collected text, or a placeholder when there
    is nothing new.

    NOTE(review): select() on pipe file objects is POSIX-only — fine for
    vLLM deployments, which are Linux-only.
    """
    if vllm_process is None:
        return "服务未运行,无日志可显示"

    chunks = []
    streams = [vllm_process.stdout, vllm_process.stderr]
    while True:
        # Zero timeout: return immediately instead of blocking the UI.
        readable, _, _ = select.select(streams, [], [], 0)
        if not readable:
            break

        progressed = False
        for stream in readable:
            line = stream.readline()
            if not line:
                continue  # this stream is at EOF
            progressed = True
            # readline() keeps the trailing newline, so don't add another
            # (the original appended "\n" and double-spaced every line).
            prefix = "[ERROR] " if stream is vllm_process.stderr else ""
            chunks.append(prefix + line)

        if not progressed:
            break  # both streams signalled readable but are at EOF

    output = "".join(chunks)
    return output if output else "暂无新日志"

def serve_test_ui():
    """Build the Gradio control-panel UI for the vLLM API server.

    Provides start/stop buttons, status and log viewers, and usage
    documentation for the OpenAI-compatible endpoint. The server is
    auto-started when the page loads. Returns the gr.Blocks app.
    """
    with gr.Blocks(title="vLLM OpenAI兼容API服务") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI 兼容API服务控制面板")

                with gr.Row():
                    start_btn = gr.Button("启动服务", variant="primary")
                    stop_btn = gr.Button("停止服务", variant="stop")

                status_text = gr.Textbox(label="服务状态", value="未运行", interactive=False)
                refresh_btn = gr.Button("刷新状态")

                logs_text = gr.Textbox(label="服务日志", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("刷新日志")

                # API usage / reference section.
                gr.Markdown("## API 信息")
                # BUGFIX: the literal JSON braces in the example below must be
                # doubled ({{ }}) inside this f-string; the original single
                # braces were parsed as a format field with an invalid format
                # spec, raising ValueError and crashing the UI at build time.
                gr.Markdown(f"""
                API地址: `http://localhost:{API_PORT}/v1/...`

                Hugging Face Space公开URL: 部署后查看Space详情获取

                当前加载模型: `{MODEL_NAME}`

                API密钥: `{os.environ.get("API_KEY", "未设置")}`

                ## 测试命令
                ```python
                from openai import OpenAI

                client = OpenAI(
                    base_url="http://你的HF_SPACE_URL/v1",
                    api_key="{os.environ.get("API_KEY", "None")}",
                )

                completion = client.chat.completions.create(
                  model="{MODEL_NAME}",
                  messages=[
                    {{"role": "user", "content": "Hello!"}}
                  ]
                )

                print(completion.choices[0].message)
                ```
                """)

        # Wire the buttons to the server-control handlers.
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)

        # Auto-start the server when the page first loads.
        demo.load(start_vllm_server, inputs=[], outputs=status_text)

    return demo

# Entry point: build the control panel and serve it publicly.
if __name__ == "__main__":
    serve_test_ui().queue().launch(
        server_name="0.0.0.0",
        server_port=GRADIO_PORT,
        share=True,
    )