import gradio as gr
import json


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto",
                          tensor_parallel_size=1, max_model_len=None,
                          quantization=None, device="auto", **kwargs):
    command = ["vllm", "serve"]

    # The model is a positional argument to `vllm serve` (not `--model`)
    if model:
        command.append(model)

    # Add optional arguments, skipping defaults
    if task != "auto":
        command.extend(["--task", task])
    if tokenizer:
        command.extend(["--tokenizer", tokenizer])
    if dtype != "auto":
        command.extend(["--dtype", dtype])
    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
    if max_model_len:
        command.extend(["--max-model-len", str(max_model_len)])
    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])
    if device != "auto":
        command.extend(["--device", device])

    # Add any additional arguments
    for key, value in kwargs.items():
        if value is None or value == "":
            continue
        # Convert underscores back to hyphens for the command line
        arg_name = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # Boolean options are bare flags: emit them only when True
            if value:
                command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])

    return " ".join(command)


# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command.")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name",
                           placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify",
                     "score", "reward", "transcription"],
            value="auto",
            label="Task"
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)",
                               placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type"
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device"
        )

    with gr.Tab("Performance Settings"):
        tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1,
                                         label="Tensor Parallel Size")
        max_model_len = gr.Number(label="Max Model Length", precision=0)
        quantization = gr.Dropdown(
            choices=[
                "None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8",
                "ptpc_fp8", "fbgemm_fp8", "modelopt", "nvfp4", "marlin",
                "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin",
                "gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors",
                "bitsandbytes", "qqq", "hqq", "experts_int8", "neuron_quant",
                "ipex", "quark", "moe_wna16", "torchao"
            ],
            value="None",
            label="Quantization"
        )
        gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9,
                                           step=0.1,
                                           label="GPU Memory Utilization")

    with gr.Tab("Advanced Settings"):
        trust_remote_code = gr.Checkbox(label="Trust Remote Code")
        max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens",
                                           precision=0)
        max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
        seed = gr.Number(label="Random Seed", precision=0)
        additional_args = gr.Textbox(
            label="Additional Arguments (JSON format)",
            placeholder='{"arg1": "value1", "arg2": "value2"}'
        )

    output = gr.Textbox(label="Generated Command")

    def process_form(*args):
        # Map the positional form values to named arguments
        form_data = {
            "model": args[0],
            "task": args[1],
            "tokenizer": args[2],
            "dtype": args[3],
            "device": args[4],
            "tensor_parallel_size": args[5],
            "max_model_len": args[6],
            "quantization": args[7],
            "gpu_memory_utilization": args[8],
            "trust_remote_code": args[9],
            "max_num_batched_tokens": args[10],
            "max_num_seqs": args[11],
            "seed": args[12],
        }

        # Parse the free-form JSON arguments, if provided
        additional_args_text = args[13]
        extra_args = {}
        if additional_args_text:
            try:
                extra_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"

        # Filter out empty values; booleans (e.g. trust_remote_code) pass
        # through unchanged and are rendered as bare flags by the generator
        form_data = {k: v for k, v in form_data.items()
                     if v is not None and v != ""}

        # Merge with additional args
        form_data.update(extra_args)

        # Generate command
        return generate_vllm_command(**form_data)

    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization,
            gpu_memory_utilization, trust_remote_code,
            max_num_batched_tokens, max_num_seqs, seed, additional_args
        ],
        outputs=output
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
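
# A quick sanity check of generate_vllm_command outside the UI (hypothetical
# values; run in a REPL). With only a model, tensor parallelism, and a context
# length set, all defaults are omitted from the generated command:
#
#   >>> generate_vllm_command(
#   ...     model="meta-llama/Llama-2-7b-chat-hf",
#   ...     tensor_parallel_size=2,
#   ...     max_model_len=4096,
#   ... )
#   'vllm serve meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 2 --max-model-len 4096'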