import gradio as gr
import json


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto", tensor_parallel_size=1,
                          max_model_len=None, quantization=None, device="auto", **kwargs):
    """Build a `vllm serve` command string from the form values."""
    command = ["vllm", "serve"]
    # `vllm serve` takes the model as a positional argument, not via --model
    if model:
        command.append(model)
    # Add optional arguments only when they differ from the defaults
    if task != "auto":
        command.extend(["--task", task])
    if tokenizer:
        command.extend(["--tokenizer", tokenizer])
    if dtype != "auto":
        command.extend(["--dtype", dtype])
    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
    if max_model_len:
        command.extend(["--max-model-len", str(max_model_len)])
    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])
    if device != "auto":
        command.extend(["--device", device])
    # Add any additional arguments
    for key, value in kwargs.items():
        # Skip unset values (but keep legitimate zeros such as seed=0)
        if value is None or value is False or value == "":
            continue
        # Convert underscores back to hyphens for the command line
        arg_name = f"--{key.replace('_', '-')}"
        if value is True:
            # Boolean flags such as --trust-remote-code take no value
            command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])
    return " ".join(command)

# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name", placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"],
            value="auto",
            label="Task"
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type"
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device"
        )
with gr.Tab("Performance Settings"):
tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="Tensor Parallel Size")
max_model_len = gr.Number(label="Max Model Length", precision=0)
quantization = gr.Dropdown(
choices=[
"None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8", "fbgemm_fp8",
"modelopt", "nvfp4", "marlin", "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin",
"gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors", "bitsandbytes",
"qqq", "hqq", "experts_int8", "neuron_quant", "ipex", "quark", "moe_wna16", "torchao"
],
value="None",
label="Quantization"
)
gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="GPU Memory Utilization")
with gr.Tab("Advanced Settings"):
trust_remote_code = gr.Checkbox(label="Trust Remote Code")
max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
seed = gr.Number(label="Random Seed", precision=0)
additional_args = gr.Textbox(
label="Additional Arguments (JSON format)",
placeholder='{"arg1": "value1", "arg2": "value2"}'
)
output = gr.Textbox(label="Generated Command")

    def process_form(*args):
        # Map the positional form values to named arguments
        # (order matches the `inputs` list passed to submit_btn.click below)
        form_data = {
            "model": args[0],
            "task": args[1],
            "tokenizer": args[2],
            "dtype": args[3],
            "device": args[4],
            "tensor_parallel_size": args[5],
            "max_model_len": args[6],
            "quantization": args[7],
            "gpu_memory_utilization": args[8],
            "trust_remote_code": args[9],
            "max_num_batched_tokens": args[10],
            "max_num_seqs": args[11],
            "seed": args[12],
        }
        # Parse additional arguments from the JSON textbox
        additional_args_text = args[13]
        additional_args = {}
        if additional_args_text:
            try:
                additional_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"
        # Filter out empty values
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}
        # Drop the checkbox when unchecked; when checked, the True value makes
        # generate_vllm_command emit --trust-remote-code as a bare flag
        if not form_data.get("trust_remote_code"):
            form_data.pop("trust_remote_code", None)
        # Merge with additional args (these override the form values)
        form_data.update(additional_args)
        # Generate command
        return generate_vllm_command(**form_data)
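
    # Example (hypothetical input): entering {"enable_prefix_caching": true}
    # in the Additional Arguments box parses to a Python True, so the
    # generated command gains a bare --enable-prefix-caching flag.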

    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args
        ],
        outputs=output
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
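
# Note: app.launch() serves the UI locally (http://127.0.0.1:7860 by default);
# pass share=True to launch() for a temporary public link.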