import gradio as gr
import json
import shlex

def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto", tensor_parallel_size=1,
                          max_model_len=None, quantization=None, device="auto", **kwargs):
    """Build a `vllm serve` command string from the given options."""
    command = ["vllm", "serve"]

    # `vllm serve` takes the model as a positional argument, not via --model
    if model:
        command.append(model)

    # Add optional arguments, skipping values that match vLLM's defaults
    if task != "auto":
        command.extend(["--task", task])

    if tokenizer:
        command.extend(["--tokenizer", tokenizer])

    if dtype != "auto":
        command.extend(["--dtype", dtype])

    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])

    if max_model_len:
        command.extend(["--max-model-len", str(max_model_len)])

    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])

    if device != "auto":
        command.extend(["--device", device])

    # Add any additional arguments
    for key, value in kwargs.items():
        # Convert underscores back to hyphens for the command line
        arg_name = f"--{key.replace('_', '-')}"
        if isinstance(value, bool):
            # Checkboxes map to bare flags: True emits the flag, False omits it
            if value:
                command.append(arg_name)
        elif value is not None and value != "":
            # Non-boolean values (including 0, e.g. --seed 0) are kept
            command.extend([arg_name, str(value)])

    # shlex.join quotes any argument containing spaces or shell metacharacters
    return shlex.join(command)

# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name", placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"],
            value="auto",
            label="Task"
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type"
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device"
        )

    with gr.Tab("Performance Settings"):
        tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="Tensor Parallel Size")
        max_model_len = gr.Number(label="Max Model Length", precision=0)
        quantization = gr.Dropdown(
            choices=[
                "None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8", "fbgemm_fp8", 
                "modelopt", "nvfp4", "marlin", "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin", 
                "gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors", "bitsandbytes", 
                "qqq", "hqq", "experts_int8", "neuron_quant", "ipex", "quark", "moe_wna16", "torchao"
            ],
            value="None",
            label="Quantization"
        )
        gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="GPU Memory Utilization")

    with gr.Tab("Advanced Settings"):
        trust_remote_code = gr.Checkbox(label="Trust Remote Code")
        max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
        max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
        seed = gr.Number(label="Random Seed", precision=0)
        additional_args = gr.Textbox(
            label="Additional Arguments (JSON format)", 
            placeholder='{"arg1": "value1", "arg2": "value2"}'
        )

    output = gr.Textbox(label="Generated Command")

    def process_form(*args):
        # Gradio passes inputs positionally; map them to named arguments
        form_data = {
            "model": args[0],
            "task": args[1],
            "tokenizer": args[2],
            "dtype": args[3],
            "device": args[4],
            "tensor_parallel_size": args[5],
            "max_model_len": args[6],
            "quantization": args[7],
            "gpu_memory_utilization": args[8],
            "trust_remote_code": args[9],
            "max_num_batched_tokens": args[10],
            "max_num_seqs": args[11],
            "seed": args[12]
        }

        # Process additional arguments
        additional_args_text = args[13]
        additional_args = {}
        if additional_args_text:
            try:
                additional_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"

        # Filter out empty values; booleans pass through unchanged so that
        # generate_vllm_command can render True as a bare flag and drop False
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}

        # Merge with additional args
        form_data.update(additional_args)

        # Generate command
        return generate_vllm_command(**form_data)
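
    # For example, entering {"swap_space": 4, "enable_prefix_caching": true}
    # in the JSON box would append `--swap-space 4 --enable-prefix-caching`
    # to the command (a JSON true becomes a bare flag; underscores become hyphens).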

    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args
        ],
        outputs=output
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
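
# Usage note: app.launch() binds to http://127.0.0.1:7860 by default; pass
# server_name="0.0.0.0" and/or server_port=... to expose it elsewhere.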