import gradio as gr
import json


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto", tensor_parallel_size=1,
                          max_model_len=None, quantization=None, device="auto", **kwargs):
    """Build a `vllm serve` command string from the form values."""
    command = ["vllm", "serve"]
    # `vllm serve` takes the model as a positional argument, not via --model
    if model:
        command.append(model)
    # Add optional arguments only when they differ from the defaults
    if task != "auto":
        command.extend(["--task", task])
    if tokenizer:
        command.extend(["--tokenizer", tokenizer])
    if dtype != "auto":
        command.extend(["--dtype", dtype])
    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
    if max_model_len:
        command.extend(["--max-model-len", str(max_model_len)])
    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])
    if device != "auto":
        command.extend(["--device", device])
    # Add any additional arguments
    for key, value in kwargs.items():
        # Skip unset values (but keep legitimate zeros such as seed=0)
        if value is None or value is False or value == "":
            continue
        # Convert underscores back to hyphens for the command line
        arg_name = f"--{key.replace('_', '-')}"
        if value is True:
            # Boolean flags such as --trust-remote-code take no value
            command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])
    return " ".join(command)

# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name", placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"],
            value="auto",
            label="Task"
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type"
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device"
        )
with gr.Tab("Performance Settings"):
tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="Tensor Parallel Size")
max_model_len = gr.Number(label="Max Model Length", precision=0)
quantization = gr.Dropdown(
choices=[
"None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8", "fbgemm_fp8",
"modelopt", "nvfp4", "marlin", "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin",
"gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors", "bitsandbytes",
"qqq", "hqq", "experts_int8", "neuron_quant", "ipex", "quark", "moe_wna16", "torchao"
],
value="None",
label="Quantization"
)
gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="GPU Memory Utilization")
with gr.Tab("Advanced Settings"):
trust_remote_code = gr.Checkbox(label="Trust Remote Code")
max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
seed = gr.Number(label="Random Seed", precision=0)
additional_args = gr.Textbox(
label="Additional Arguments (JSON format)",
placeholder='{"arg1": "value1", "arg2": "value2"}'
)
output = gr.Textbox(label="Generated Command")

    def process_form(*args):
        # Map the positional form values to named arguments
        # (order matches the `inputs` list passed to submit_btn.click below)
        form_data = {
            "model": args[0],
            "task": args[1],
            "tokenizer": args[2],
            "dtype": args[3],
            "device": args[4],
            "tensor_parallel_size": args[5],
            "max_model_len": args[6],
            "quantization": args[7],
            "gpu_memory_utilization": args[8],
            "trust_remote_code": args[9],
            "max_num_batched_tokens": args[10],
            "max_num_seqs": args[11],
            "seed": args[12],
        }
        # Parse additional arguments from the JSON textbox
        additional_args_text = args[13]
        additional_args = {}
        if additional_args_text:
            try:
                additional_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"
        # Filter out empty values
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}
        # Drop the checkbox when unchecked; when checked, the True value makes
        # generate_vllm_command emit --trust-remote-code as a bare flag
        if not form_data.get("trust_remote_code"):
            form_data.pop("trust_remote_code", None)
        # Merge with additional args (these override the form values)
        form_data.update(additional_args)
        # Generate command
        return generate_vllm_command(**form_data)
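
    # Example (hypothetical input): entering {"enable_prefix_caching": true}
    # in the Additional Arguments box parses to a Python True, so the
    # generated command gains a bare --enable-prefix-caching flag.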

    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args
        ],
        outputs=output
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
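
# Note: app.launch() serves the UI locally (http://127.0.0.1:7860 by default);
# pass share=True to launch() for a temporary public link.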