zhangchenxu committed on
Commit d813b23 · 1 Parent(s): 6c7be96
Files changed (1)
  1. app.py +27 -277
app.py CHANGED
@@ -1,72 +1,48 @@
  import os
- import sys
  import gradio as gr
  from subprocess import Popen, PIPE
  import subprocess
  import logging

- # Configure logging
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
  logger = logging.getLogger(__name__)

- # Detect whether a GPU is available - a more reliable approach in the Hugging Face Spaces environment
  def check_gpu_available():
      try:
-         # Method 1: detect via the nvidia-smi command
          nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
          logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
          if nvidia_smi.returncode == 0:
              logger.info("GPU detected via nvidia-smi")
              return True
-
-         # Method 2: check the Hugging Face Space environment variable
          if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
              logger.info("GPU detected via environment variable")
              return True
-
-         # Method 3: if the methods above fail, try PyTorch
          import torch
          has_gpu = torch.cuda.is_available()
-         if has_gpu:
-             logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}")
-         else:
-             logger.info("PyTorch did not detect a GPU")
+         logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
          return has_gpu
      except Exception as e:
          logger.error(f"GPU detection failed: {str(e)}")
          return False

- # Environment variables
- MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")  # default to a smaller model
+ MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
  API_PORT = int(os.environ.get("API_PORT", 8000))
  GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
-
- # Other configuration options
  USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
  ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
-
- # Detect the GPU
  HAS_GPU = check_gpu_available()
- logger.info(f"GPU detection result: {'GPU present' if HAS_GPU else 'no GPU'}")
-
- # Force GPU mode - use this if you are sure the environment has a GPU
  FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
  if FORCE_GPU:
-     logger.info("GPU mode forced on")
      HAS_GPU = True

- # vLLM server process
  vllm_process = None

  def start_vllm_server():
      global vllm_process
      if vllm_process is not None:
          return "vLLM server is already running"
-
-     # Set an environment variable to aid debugging
+
      os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
-
-     # Build the launch command
      cmd = [
          "vllm",
          "serve",
@@ -74,78 +50,33 @@ def start_vllm_server():
          "--host", "0.0.0.0",
          "--port", str(API_PORT),
          "--dtype", "auto",
-         "--trust-remote-code",  # trust remote code; many models require this
-         "--disable-async-output-proc",  # disable async output processing to work around a NotImplementedError
+         "--trust-remote-code",
+         "--disable-async-output-proc",
      ]
-
-     # Add flags based on the configuration options
+
      if ENFORCE_EAGER:
          cmd.append("--enforce-eager")
-
      if USE_TRANSFORMERS_IMPL:
          cmd.extend(["--model-impl", "transformers"])
-
-     # Add different flags depending on GPU availability
      if HAS_GPU:
          logger.info("Starting vLLM in GPU mode")
-         cmd.extend([
-             "--device", "cuda",
-             "--max-model-len", "2048",
-             "--gpu-memory-utilization", "0.9"
-         ])
+         cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
      else:
          logger.info("Starting vLLM in CPU mode")
-         cmd.extend([
-             "--device", "cpu",
-             "--max-model-len", "1024"  # use a smaller context length in CPU mode to save memory
-         ])
-
-     # Log the launch command
-     cmd_str = " ".join(cmd)
-     logger.info(f"Launch command: {cmd_str}")
-
-     # Start the vLLM server
-     try:
-         vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True)
-         return "vLLM server started! Please wait for the model to finish loading... (this may take a few minutes)"
-     except Exception as e:
-         error_msg = f"Error starting the vLLM server: {str(e)}"
-         logger.error(error_msg)
-         return error_msg
-
-     # Log the launch command
-     cmd_str = " ".join(cmd)
-     logger.info(f"Launch command: {cmd_str}")
-
-     # Start the vLLM server
-     try:
-         vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True)
-         return "vLLM server started! Please wait for the model to finish loading... (this may take a few minutes)"
-     except Exception as e:
-         error_msg = f"Error starting the vLLM server: {str(e)}"
-         logger.error(error_msg)
-         return error_msg
-
-     # Whether to enable an API key
-     api_key = os.environ.get("API_KEY", "")
-     if api_key:
-         cmd.extend(["--api-key", api_key])
-
-     # Log the launch command
-     print(f"Launch command: {' '.join(cmd)}")
-
-     # Start the vLLM server
+         cmd.extend(["--device", "cpu", "--max-model-len", "1024"])
+
+     logger.info(f"Launch command: {' '.join(cmd)}")
      try:
          vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True)
          return "vLLM server started! Please wait for the model to finish loading..."
      except Exception as e:
+         logger.error(f"Error starting the vLLM server: {str(e)}")
          return f"Error starting the vLLM server: {str(e)}"

  def stop_vllm_server():
      global vllm_process
      if vllm_process is None:
          return "vLLM server is not running"
-
      vllm_process.terminate()
      vllm_process = None
      return "vLLM server stopped"
@@ -153,232 +84,51 @@ def stop_vllm_server():
  def check_server_status():
      if vllm_process is None:
          return "Not running"
-
      return_code = vllm_process.poll()
-     if return_code is None:
-         return "Running"
-     else:
-         return f"Stopped (return code: {return_code})"
+     return "Running" if return_code is None else f"Stopped (return code: {return_code})"

  def get_server_logs():
      if vllm_process is None:
          return "Server not running; no logs to display"
-
-     # Check whether the process is still running
      if vllm_process.poll() is not None:
          return f"Server stopped, return code: {vllm_process.poll()}"
-
+     output_lines = []
      try:
-         # Try to read output from the process without blocking
-         output_lines = []
-
-         # Read stderr (error log)
          while True:
              line = vllm_process.stderr.readline()
              if not line:
                  break
              output_lines.append(f"[ERROR] {line.strip()}")
-
-         # Read stdout (standard output)
          while True:
              line = vllm_process.stdout.readline()
              if not line:
                  break
              output_lines.append(line.strip())
-
-         if output_lines:
-             return "\n".join(output_lines)
-         else:
-             return "No new logs (server is running)"
+         return "\n".join(output_lines) if output_lines else "No new logs"
      except Exception as e:
          return f"Error reading logs: {str(e)}"

  def serve_test_ui():
-     """Provide a simple test UI"""
-     with gr.Blocks(title="vLLM OpenAI-Compatible API Service") as demo:
+     with gr.Blocks(title="vLLM Control Panel") as demo:
          with gr.Row():
              with gr.Column():
-                 gr.Markdown("# vLLM OpenAI-Compatible API Service Control Panel")
-
-                 # System info
+                 gr.Markdown("# vLLM Control Panel")
                  gpu_info = "detected" if HAS_GPU else "not detected"
-                 system_info = f"""
-                 ## System Info
-                 - GPU: {gpu_info}
-                 - Runtime: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local environment'}
-                 - Loaded model: `{MODEL_NAME}`
-                 - API key: `{os.environ.get("API_KEY", "not set")}`
-                 """
-                 gr.Markdown(system_info)
-
-                 with gr.Row():
-                     start_btn = gr.Button("Start Server", variant="primary")
-                     stop_btn = gr.Button("Stop Server", variant="stop")
-
+                 gr.Markdown(f"**GPU:** {gpu_info}  \n**Model:** `{MODEL_NAME}`")
+                 start_btn = gr.Button("Start Server")
+                 stop_btn = gr.Button("Stop Server")
                  status_text = gr.Textbox(label="Server Status", value="Not running", interactive=False)
                  refresh_btn = gr.Button("Refresh Status")
-
-                 logs_text = gr.Textbox(label="Server Logs", interactive=False, lines=15)
+                 logs_text = gr.Textbox(label="Server Logs", interactive=False, lines=10)
                  logs_refresh_btn = gr.Button("Refresh Logs")
-
-                 # Advanced options
-                 with gr.Accordion("Advanced Options", open=False):
-                     model_input = gr.Textbox(label="Model Name", value=MODEL_NAME,
-                                              placeholder="Enter a model name, e.g. TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-
-                     with gr.Row():
-                         force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
-                                                 info="Check this if auto-detection fails but you are sure a GPU is present")
-                         use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL,
-                                                        info="Use the Transformers implementation instead of vLLM's native one; possibly more stable but slightly slower")
-                         enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER,
-                                                     info="Force PyTorch eager mode to avoid CUDA-graph-related issues")
-
-                     apply_btn = gr.Button("Apply Settings", variant="primary")
-
-                 # API test section
-                 gr.Markdown("## API Info")
-                 api_key = os.environ.get("API_KEY", "not set")
-                 api_info = gr.Markdown(f"""
-                 API endpoint: `http://localhost:{API_PORT}/v1/...`
-
-                 Hugging Face Space public URL: see the Space details after deployment
-
-                 ## Test command
-                 ```python
-                 from openai import OpenAI
-
-                 client = OpenAI(
-                     base_url="http://YOUR_HF_SPACE_URL/v1",
-                     api_key="{api_key}",
-                 )
-
-                 completion = client.chat.completions.create(
-                     model="{MODEL_NAME}",
-                     messages=[
-                         {{"role": "user", "content": "Hello!"}}
-                     ]
-                 )
-
-                 print(completion.choices[0].message)
-                 ```
-                 """)
-
-         # Wire up event handlers
-         start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
-         stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
-         refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
-         logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
-
-         # Event handlers for the advanced options
-         def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
-             global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER
-
-             changed = []
-
-             if model_name.strip() and model_name != MODEL_NAME:
-                 MODEL_NAME = model_name.strip()
-                 changed.append(f"Model changed to: {MODEL_NAME}")
-
-             if force_gpu_mode != FORCE_GPU:
-                 FORCE_GPU = force_gpu_mode
-                 if FORCE_GPU:
-                     HAS_GPU = True
-                     changed.append("GPU mode forced on")
-                 else:
-                     HAS_GPU = check_gpu_available()
-                     changed.append(f"Auto-detection restored, GPU status: {'detected' if HAS_GPU else 'not detected'}")
-
-             if use_transformers_impl != USE_TRANSFORMERS_IMPL:
-                 USE_TRANSFORMERS_IMPL = use_transformers_impl
-                 changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")
-
-             if enforce_eager_mode != ENFORCE_EAGER:
-                 ENFORCE_EAGER = enforce_eager_mode
-                 changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")
-
-             if changed:
-                 return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
-             else:
-                 return "No settings were changed"
-
-         apply_btn.click(
-             apply_settings,
-             inputs=[model_input, force_gpu, use_transformers, enforce_eager],
-             outputs=status_text
-         )
-
-         # Do not auto-start the server on page load; only show system status
-         demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}", inputs=[], outputs=status_text)
-
-     return demoinyLlama-1.1B-Chat-v1.0")
-                 change_model_btn = gr.Button("Change Model")
-
-                 force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
-                                         info="Check this if auto-detection fails but you are sure a GPU is present")
-
-                 # API test section
-                 gr.Markdown("## API Info")
-                 api_key = os.environ.get("API_KEY", "not set")
-                 api_info = gr.Markdown(f"""
-                 API endpoint: `http://localhost:{API_PORT}/v1/...`
-
-                 Hugging Face Space public URL: see the Space details after deployment
-
-                 ## Test command
-                 ```python
-                 from openai import OpenAI
-
-                 client = OpenAI(
-                     base_url="http://YOUR_HF_SPACE_URL/v1",
-                     api_key="{api_key}",
-                 )
-
-                 completion = client.chat.completions.create(
-                     model="{MODEL_NAME}",
-                     messages=[
-                         {{"role": "user", "content": "Hello!"}}
-                     ]
-                 )
-
-                 print(completion.choices[0].message)
-                 ```
-                 """)
-
-         # Wire up event handlers
-         start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
-         stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
-         refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
-         logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
-
-         # Event handlers for the advanced options
-         def change_model(model_name):
-             global MODEL_NAME
-             if model_name.strip():
-                 MODEL_NAME = model_name.strip()
-                 return f"Model changed to: {MODEL_NAME}"
-             return "Model name cannot be empty"
-
-         def toggle_gpu_mode(force):
-             global HAS_GPU, FORCE_GPU
-             FORCE_GPU = force
-             if FORCE_GPU:
-                 HAS_GPU = True
-                 return "GPU mode forced on"
-             else:
-                 HAS_GPU = check_gpu_available()
-                 return f"Auto-detection restored, GPU detection result: {'detected' if HAS_GPU else 'not detected'}"
-
-         change_model_btn.click(change_model, inputs=[model_input], outputs=status_text)
-         force_gpu.change(toggle_gpu_mode, inputs=[force_gpu], outputs=status_text)
-
-         # Auto-start the server on page load
-         demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}", inputs=[], outputs=status_text)
-
+
+         start_btn.click(start_vllm_server, outputs=status_text)
+         stop_btn.click(stop_vllm_server, outputs=status_text)
+         refresh_btn.click(check_server_status, outputs=status_text)
+         logs_refresh_btn.click(get_server_logs, outputs=logs_text)
+         demo.load(lambda: f"System ready. GPU status: {'detected' if HAS_GPU else 'not detected'}", outputs=status_text)
      return demo

- # Launch the test UI
  if __name__ == "__main__":
-     # Create and launch the UI
      demo = serve_test_ui()
-     demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)
+     demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)
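
The test snippet this commit removes from the UI still applies to the server that `start_vllm_server` launches, since `vllm serve` exposes an OpenAI-compatible API on `API_PORT`. A minimal client sketch adapted from the deleted Markdown block; the base URL is the same placeholder the original used, and the `api_key` value is an assumption (match it to whatever `API_KEY` your deployment sets, if any):

```python
# Sketch of a client call against the OpenAI-compatible endpoint served by app.py,
# adapted from the test snippet removed from the UI in this commit.
from openai import OpenAI

client = OpenAI(
    base_url="http://YOUR_HF_SPACE_URL/v1",  # placeholder kept from the original snippet
    api_key="not set",  # assumption: the original interpolated os.environ.get("API_KEY", ...)
)

completion = client.chat.completions.create(
    model="zhangchenxu/TinyV-1.5B",  # the default MODEL_NAME above
    messages=[{"role": "user", "content": "Hello!"}],
)
print(completion.choices[0].message)
```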
 
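Note that `start_vllm_server` returns as soon as the subprocess spawns, while the UI message only says to wait for the model to load. A readiness probe can make that wait explicit; a sketch, assuming the server is reachable on localhost at the default `API_PORT` and using the standard OpenAI-compatible `/v1/models` route (the `requests` dependency and `wait_for_server` helper are illustrative, not part of this repo):

```python
# Illustrative readiness probe: poll the OpenAI-compatible /v1/models route
# until the server has registered the model. Not part of app.py.
import time
import requests

def wait_for_server(port: int = 8000, timeout: float = 600.0) -> bool:
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            # /v1/models answers 200 once the model is loaded and registered
            if requests.get(f"http://localhost:{port}/v1/models", timeout=5).status_code == 200:
                return True
        except requests.RequestException:
            pass  # server socket not up yet; keep waiting
        time.sleep(5)
    return False
```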