"""Gradio app that builds a `vllm serve` command from form inputs."""

import json

import gradio as gr


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto", tensor_parallel_size=1,
                          max_model_len=None, quantization=None, device="auto", **kwargs):
    """Assemble a `vllm serve` command string from the given options."""
    command = ["vllm", "serve"]

    # `vllm serve` takes the model as a positional argument, not via --model.
    if model:
        command.append(model)

    if task != "auto":
        command.extend(["--task", task])

    if tokenizer:
        command.extend(["--tokenizer", tokenizer])

    if dtype != "auto":
        command.extend(["--dtype", dtype])

    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(int(tensor_parallel_size))])

    if max_model_len:
        command.extend(["--max-model-len", str(int(max_model_len))])

    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])

    if device != "auto":
        command.extend(["--device", device])

    # Remaining keyword options: booleans become bare flags, everything else
    # is emitted as "--key value"; None, empty strings, and False are skipped.
    for key, value in kwargs.items():
        if value is None or value == "" or value is False:
            continue
        arg_name = f"--{key.replace('_', '-')}"
        if value is True:
            command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])

    return " ".join(command)


with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command.")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name", placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"],
            value="auto",
            label="Task",
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type",
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device",
        )

    with gr.Tab("Performance Settings"):
        tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="Tensor Parallel Size")
        max_model_len = gr.Number(label="Max Model Length", precision=0)
        quantization = gr.Dropdown(
            choices=[
                "None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8", "fbgemm_fp8",
                "modelopt", "nvfp4", "marlin", "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin",
                "gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors", "bitsandbytes",
                "qqq", "hqq", "experts_int8", "neuron_quant", "ipex", "quark", "moe_wna16", "torchao",
            ],
            value="None",
            label="Quantization",
        )
        gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="GPU Memory Utilization")

    with gr.Tab("Advanced Settings"):
        trust_remote_code = gr.Checkbox(label="Trust Remote Code")
        max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
        max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
        seed = gr.Number(label="Random Seed", precision=0)
        additional_args = gr.Textbox(
            label="Additional Arguments (JSON format)",
            placeholder='{"arg1": "value1", "arg2": "value2"}',
        )

    output = gr.Textbox(label="Generated Command")

    def process_form(model, task, tokenizer, dtype, device,
                     tensor_parallel_size, max_model_len, quantization,
                     gpu_memory_utilization, trust_remote_code,
                     max_num_batched_tokens, max_num_seqs, seed,
                     additional_args_text):
        """Collect the form values, merge in extra JSON arguments, and build the command."""
        form_data = {
            "model": model,
            "task": task,
            "tokenizer": tokenizer,
            "dtype": dtype,
            "device": device,
            "tensor_parallel_size": tensor_parallel_size,
            "max_model_len": max_model_len,
            "quantization": quantization,
            "gpu_memory_utilization": gpu_memory_utilization,
            "trust_remote_code": trust_remote_code,
            "max_num_batched_tokens": max_num_batched_tokens,
            "max_num_seqs": max_num_seqs,
            "seed": seed,
        }

        # Parse the free-form JSON field, rejecting invalid input early.
        additional_args = {}
        if additional_args_text:
            try:
                additional_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"

        # Drop unset fields so only explicitly chosen options reach the command
        # builder; booleans (e.g. trust_remote_code) pass through unchanged and
        # are rendered as bare flags by generate_vllm_command.
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}

        form_data.update(additional_args)

        return generate_vllm_command(**form_data)
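
    # Example of the JSON merge (hypothetical input): entering
    # {"enable_prefix_caching": true} in the Additional Arguments box parses to
    # Python True, so the generated command gains the bare --enable-prefix-caching
    # flag (a real vLLM option, used here only for illustration).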

    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args,
        ],
        outputs=output,
    )


if __name__ == "__main__":
    app.launch()
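    # Assumption: local-only use. Gradio serves on http://127.0.0.1:7860 by
    # default; pass server_name="0.0.0.0" (a standard gr.Blocks.launch()
    # parameter) to make the app reachable from other machines.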