# vllm-ui / app.py
import json
import shlex

import gradio as gr


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto", tensor_parallel_size=1,
                          max_model_len=None, quantization=None, device="auto", **kwargs):
    """Build a `vllm serve` command string from the form values."""
    command = ["vllm", "serve"]

    # `vllm serve` takes the model as a positional argument, not via `--model`
    if model:
        command.append(model)

    # Optional arguments: only emit flags that differ from the defaults
    if task != "auto":
        command.extend(["--task", task])
    if tokenizer:
        command.extend(["--tokenizer", tokenizer])
    if dtype != "auto":
        command.extend(["--dtype", dtype])
    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
    if max_model_len:
        command.extend(["--max-model-len", str(max_model_len)])
    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])
    if device != "auto":
        command.extend(["--device", device])

    # Any additional arguments: True becomes a bare flag; None/False/empty are skipped
    for key, value in kwargs.items():
        if value is None or value == "" or value is False:
            continue
        # Convert underscores back to hyphens for the command line
        arg_name = f"--{key.replace('_', '-')}"
        if value is True:
            command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])

    # shlex.join quotes any values that contain spaces or shell metacharacters
    return shlex.join(command)


# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command.")
with gr.Tab("Basic Settings"):
model = gr.Textbox(label="Model Path or Name", placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
task = gr.Dropdown(
choices=["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"],
value="auto",
label="Task"
)
tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
dtype = gr.Dropdown(
choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
value="auto",
label="Data Type"
)
device = gr.Dropdown(
choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
value="auto",
label="Device"
)
with gr.Tab("Performance Settings"):
tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="Tensor Parallel Size")
max_model_len = gr.Number(label="Max Model Length", precision=0)
quantization = gr.Dropdown(
choices=[
"None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8", "fbgemm_fp8",
"modelopt", "nvfp4", "marlin", "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin",
"gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors", "bitsandbytes",
"qqq", "hqq", "experts_int8", "neuron_quant", "ipex", "quark", "moe_wna16", "torchao"
],
value="None",
label="Quantization"
)
gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="GPU Memory Utilization")
with gr.Tab("Advanced Settings"):
trust_remote_code = gr.Checkbox(label="Trust Remote Code")
max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
seed = gr.Number(label="Random Seed", precision=0)
additional_args = gr.Textbox(
label="Additional Arguments (JSON format)",
placeholder='{"arg1": "value1", "arg2": "value2"}'
)
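        # Keys pass through generate_vllm_command's kwargs: underscores become
        # hyphens and JSON true becomes a bare flag. For example (flag name
        # assumed from the vLLM CLI), {"enable_prefix_caching": true} is
        # emitted as --enable-prefix-caching.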

    output = gr.Textbox(label="Generated Command")

    def process_form(*args):
        # Map the positional Gradio inputs back to named arguments
        form_data = {
            "model": args[0],
            "task": args[1],
            "tokenizer": args[2],
            "dtype": args[3],
            "device": args[4],
            "tensor_parallel_size": args[5],
            "max_model_len": args[6],
            "quantization": args[7],
            "gpu_memory_utilization": args[8],
            "trust_remote_code": args[9],
            "max_num_batched_tokens": args[10],
            "max_num_seqs": args[11],
            "seed": args[12],
        }

        # Parse the free-form JSON arguments
        additional_args_text = args[13]
        additional_args = {}
        if additional_args_text:
            try:
                additional_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"

        # Drop unset fields so they do not emit flags; booleans pass through
        # unchanged, and generate_vllm_command turns True into a bare flag
        # (setting it to "" here, as before, would make it falsy and lose it)
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}

        # Merge with the extra JSON arguments (these win on key conflicts)
        form_data.update(additional_args)

        return generate_vllm_command(**form_data)
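
    # End-to-end example (hypothetical form values): model
    # "meta-llama/Llama-2-7b-chat-hf", dtype "bfloat16", Trust Remote Code
    # checked, everything else left at its default, yields:
    #   vllm serve meta-llama/Llama-2-7b-chat-hf --dtype bfloat16 \
    #       --gpu-memory-utilization 0.9 --trust-remote-code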

    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args,
        ],
        outputs=output,
    )

# Launch the app
if __name__ == "__main__":
    app.launch()