# NanoV / app.py
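"""Gradio control panel for serving an OpenAI-compatible vLLM API on a
Hugging Face Space: start/stop the `vllm serve` subprocess, stream its
logs, and adjust model/runtime settings from the browser."""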
import os
import gradio as gr
from subprocess import Popen, PIPE
import subprocess
import logging
import threading
import time
import queue
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def check_gpu_available():
    """Detect a GPU via nvidia-smi, the Space runtime env var, or PyTorch."""
    # Each probe is independent so a missing nvidia-smi binary does not
    # short-circuit the environment-variable and PyTorch fallbacks.
    try:
        nvidia_smi = subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        logger.info(f"nvidia-smi output: {nvidia_smi.stdout}")
        if nvidia_smi.returncode == 0:
            logger.info("GPU detected via nvidia-smi")
            return True
    except FileNotFoundError:
        logger.info("nvidia-smi not found, trying other checks")
    try:
        if os.environ.get("SPACE_RUNTIME_ARCH", "") == "gpu":
            logger.info("GPU detected via environment variable")
            return True
        import torch
        has_gpu = torch.cuda.is_available()
        logger.info(f"GPU detected via PyTorch, device count: {torch.cuda.device_count()}" if has_gpu else "PyTorch did not detect a GPU")
        return has_gpu
    except Exception as e:
        logger.error(f"GPU detection failed: {str(e)}")
        return False
MODEL_NAME = os.environ.get("MODEL_NAME", "zhangchenxu/TinyV-1.5B")
API_PORT = int(os.environ.get("API_PORT", 8000))
GRADIO_PORT = int(os.environ.get("GRADIO_PORT", 7860))
API_KEY = os.environ.get("API_KEY", "token-abc123")  # default API key is token-abc123
USE_TRANSFORMERS_IMPL = os.environ.get("USE_TRANSFORMERS_IMPL", "true").lower() == "true"
ENFORCE_EAGER = os.environ.get("ENFORCE_EAGER", "true").lower() == "true"
HAS_GPU = check_gpu_available()
FORCE_GPU = os.environ.get("FORCE_GPU", "false").lower() == "true"
if FORCE_GPU:
    HAS_GPU = True
vllm_process = None
log_queue = queue.Queue(maxsize=1000)  # queue holding log lines from the subprocess
log_thread = None
stop_log_thread = False
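# Design note: the reader thread below is the producer and get_server_logs()
# the consumer; the bounded queue keeps a chatty or crash-looping subprocess
# from exhausting memory.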
def log_reader_thread(process):
    """Background thread that reads subprocess output without blocking the UI."""
    global stop_log_thread
    while not stop_log_thread:
        # Drain stderr first
        if process.stderr:
            line = process.stderr.readline()
            if line:
                log_queue.put(f"[ERROR] {line.strip()}")
                continue
        # Then stdout
        if process.stdout:
            line = process.stdout.readline()
            if line:
                log_queue.put(line.strip())
                continue
        # Check whether the process is still running
        if process.poll() is not None:
            log_queue.put(f"Process exited with return code: {process.poll()}")
            break
        # Sleep briefly to keep CPU usage low
        time.sleep(0.1)
def start_vllm_server():
    global vllm_process, log_thread, stop_log_thread
    if vllm_process is not None:
        return "vLLM server is already running"
    os.environ["VLLM_LOGGING_LEVEL"] = "DEBUG"
    cmd = [
        "vllm",
        "serve",
        MODEL_NAME,
        "--host", "0.0.0.0",
        "--port", str(API_PORT),
        "--dtype", "auto",
        "--trust-remote-code",
        "--disable-async-output-proc",
        "--api-key", API_KEY,  # attach the API key
    ]
    if ENFORCE_EAGER:
        cmd.append("--enforce-eager")
    if USE_TRANSFORMERS_IMPL:
        cmd.extend(["--model-impl", "transformers"])
    if HAS_GPU:
        logger.info("Starting vLLM in GPU mode")
        cmd.extend(["--device", "cuda", "--max-model-len", "2048", "--gpu-memory-utilization", "0.9"])
    else:
        logger.info("Starting vLLM in CPU mode")
        cmd.extend(["--device", "cpu", "--max-model-len", "1024"])
    logger.info(f"Launch command: {' '.join(cmd)}")
    try:
        vllm_process = Popen(cmd, stdout=PIPE, stderr=PIPE, text=True, bufsize=1)
        # Start the log reader thread
        stop_log_thread = False
        log_thread = threading.Thread(target=log_reader_thread, args=(vllm_process,))
        log_thread.daemon = True
        log_thread.start()
        return "vLLM server started! Please wait for the model to finish loading..."
    except Exception as e:
        logger.error(f"Failed to start vLLM server: {str(e)}")
        return f"Failed to start vLLM server: {str(e)}"
def stop_vllm_server():
    global vllm_process, stop_log_thread
    if vllm_process is None:
        return "vLLM server is not running"
    # Stop the log reader thread
    stop_log_thread = True
    if log_thread and log_thread.is_alive():
        log_thread.join(timeout=2)
    # Terminate the process, escalating to kill if it does not exit in time
    vllm_process.terminate()
    try:
        vllm_process.wait(timeout=5)
    except subprocess.TimeoutExpired:
        vllm_process.kill()
    vllm_process = None
    return "vLLM server stopped"
def check_server_status():
    if vllm_process is None:
        return "Not running"
    return_code = vllm_process.poll()
    return "Running" if return_code is None else f"Stopped (return code: {return_code})"
def get_server_logs():
    """Fetch queued log lines without blocking the UI."""
    if vllm_process is None:
        return "Server is not running, no logs to show"
    # Pull log lines off the queue
    logs = []
    try:
        # Cap at 200 lines per refresh to keep the output manageable
        for _ in range(200):
            if log_queue.empty():
                break
            logs.append(log_queue.get_nowait())
            log_queue.task_done()
    except queue.Empty:
        pass
    if logs:
        return "\n".join(logs)
    else:
        # Report process state when there is nothing new
        if vllm_process.poll() is not None:
            return f"Server stopped with return code: {vllm_process.poll()}"
        return "Server is running, no new logs"
def serve_test_ui():
    with gr.Blocks(title="vLLM OpenAI-Compatible API Server") as demo:
        with gr.Row():
            with gr.Column():
                gr.Markdown("# vLLM OpenAI-Compatible API Control Panel")
                # System information
                gpu_info = "detected" if HAS_GPU else "not detected"
                system_info = f"""
## System Information
- GPU: {gpu_info}
- Environment: {'Hugging Face Space' if 'SPACE_ID' in os.environ else 'local'}
- Loaded model: `{MODEL_NAME}`
- API key: `{API_KEY}`
"""
                gr.Markdown(system_info)
                with gr.Row():
                    start_btn = gr.Button("Start server", variant="primary")
                    stop_btn = gr.Button("Stop server", variant="stop")
                status_text = gr.Textbox(label="Server status", value="Not running", interactive=False)
                refresh_btn = gr.Button("Refresh status")
                logs_text = gr.Textbox(label="Server logs", interactive=False, lines=15)
                logs_refresh_btn = gr.Button("Refresh logs")
                # Advanced options
                with gr.Accordion("Advanced options", open=False):
                    model_input = gr.Textbox(label="Model name", value=MODEL_NAME,
                                             placeholder="Enter a model name, e.g. zhangchenxu/TinyV-1.5B")
                    with gr.Row():
                        force_gpu = gr.Checkbox(label="Force GPU mode", value=FORCE_GPU,
                                                info="Check this if auto-detection fails but you are sure a GPU is present")
                        use_transformers = gr.Checkbox(label="Use Transformers implementation", value=USE_TRANSFORMERS_IMPL,
                                                       info="Use the Transformers implementation instead of vLLM's native one; possibly more stable but slightly slower")
                        enforce_eager = gr.Checkbox(label="Enforce eager mode", value=ENFORCE_EAGER,
                                                    info="Force PyTorch eager mode to avoid CUDA-graph-related issues")
                    apply_btn = gr.Button("Apply settings", variant="primary")
                # API usage instructions
                gr.Markdown("## API Usage")
                api_info = gr.Markdown(f"""
### Endpoint
On a Hugging Face Space, the endpoint is your Space URL plus the `/v1` path:
```
https://YOUR_HF_SPACE_URL/v1
```
### Supported APIs
1. **Chat Completions API** (`/v1/chat/completions`)
   - For chat generation
   - Compatible with OpenAI's Chat API
2. **Completions API** (`/v1/completions`)
   - For text generation
   - Compatible with OpenAI's Completions API
### Python example
```python
from openai import OpenAI
# Create a client
client = OpenAI(
    base_url="https://YOUR_HF_SPACE_URL/v1",
    api_key="{API_KEY}",
)
# Chat completion example
chat_completion = client.chat.completions.create(
    model="{MODEL_NAME}",  # must match the model served by vLLM
    messages=[
        {{"role": "user", "content": "Hello!"}}
    ]
)
print(chat_completion.choices[0].message.content)
# Text completion example
completion = client.completions.create(
    model="{MODEL_NAME}",
    prompt="Once upon a time",
    max_tokens=50
)
print(completion.choices[0].text)
```
### curl example
```bash
# Chat completion
curl https://YOUR_HF_SPACE_URL/v1/chat/completions \\
  -H "Content-Type: application/json" \\
  -H "Authorization: Bearer {API_KEY}" \\
  -d '{{"model": "{MODEL_NAME}", "messages": [{{"role": "user", "content": "Hello!"}}]}}'
# Text completion
curl https://YOUR_HF_SPACE_URL/v1/completions \\
  -H "Content-Type: application/json" \\
  -H "Authorization: Bearer {API_KEY}" \\
  -d '{{"model": "{MODEL_NAME}", "prompt": "Once upon a time", "max_tokens": 50}}'
```
""")
                # Auto-refresh for logs
                gr.Markdown("## Log Auto-Refresh")
                auto_refresh = gr.Checkbox(label="Enable automatic log refresh", value=False)
                # Timer driving the periodic refresh; inactive until enabled
                log_timer = gr.Timer(1, active=False)
        # Wire up event handlers
        start_btn.click(start_vllm_server, inputs=[], outputs=status_text)
        stop_btn.click(stop_vllm_server, inputs=[], outputs=status_text)
        refresh_btn.click(check_server_status, inputs=[], outputs=status_text)
        logs_refresh_btn.click(get_server_logs, inputs=[], outputs=logs_text)
        # Handler for the advanced options
        def apply_settings(model_name, force_gpu_mode, use_transformers_impl, enforce_eager_mode):
            global MODEL_NAME, FORCE_GPU, HAS_GPU, USE_TRANSFORMERS_IMPL, ENFORCE_EAGER
            changed = []
            if model_name.strip() and model_name != MODEL_NAME:
                MODEL_NAME = model_name.strip()
                changed.append(f"Model changed to: {MODEL_NAME}")
            if force_gpu_mode != FORCE_GPU:
                FORCE_GPU = force_gpu_mode
                if FORCE_GPU:
                    HAS_GPU = True
                    changed.append("GPU mode force-enabled")
                else:
                    HAS_GPU = check_gpu_available()
                    changed.append(f"Reverted to auto-detection, GPU: {'detected' if HAS_GPU else 'not detected'}")
            if use_transformers_impl != USE_TRANSFORMERS_IMPL:
                USE_TRANSFORMERS_IMPL = use_transformers_impl
                changed.append(f"Transformers implementation: {'enabled' if USE_TRANSFORMERS_IMPL else 'disabled'}")
            if enforce_eager_mode != ENFORCE_EAGER:
                ENFORCE_EAGER = enforce_eager_mode
                changed.append(f"Eager mode: {'enabled' if ENFORCE_EAGER else 'disabled'}")
            if changed:
                return "\n".join(changed) + "\n\nSettings applied. If the server is running, restart it for the changes to take effect."
            else:
                return "No settings were changed"
        # Refresh the logs on each timer tick; the checkbox activates the timer
        log_timer.tick(get_server_logs, inputs=[], outputs=logs_text)
        apply_btn.click(
            apply_settings,
            inputs=[model_input, force_gpu, use_transformers, enforce_eager],
            outputs=status_text
        )
        auto_refresh.change(
            lambda enabled: gr.Timer(active=enabled),
            inputs=[auto_refresh],
            outputs=[log_timer]
        )
        # Report readiness when the page loads
        demo.load(lambda: f"System ready. GPU: {'detected' if HAS_GPU else 'not detected'}",
                  inputs=[], outputs=status_text)
    return demo
if __name__ == "__main__":
    demo = serve_test_ui()
    demo.queue().launch(server_name="0.0.0.0", server_port=GRADIO_PORT, share=True)
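# Example invocation (values are illustrative; all settings fall back to the
# defaults above when the variables are unset):
#   MODEL_NAME=zhangchenxu/TinyV-1.5B API_KEY=token-abc123 python app.py
# On a Hugging Face Space, GRADIO_PORT=7860 matches the Space's exposed port.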