Add application file
app.py
ADDED
@@ -0,0 +1,149 @@
import json
import shlex

import gradio as gr


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto",
                          tensor_parallel_size=1, max_model_len=None,
                          quantization=None, device="auto", **kwargs):
    """Build a `vllm serve` command string from the form values."""
    command = ["vllm", "serve"]

    # `vllm serve` takes the model as a positional argument, not --model
    if model:
        command.append(model)

    # Add optional arguments, skipping defaults
    if task != "auto":
        command.extend(["--task", task])

    if tokenizer:
        command.extend(["--tokenizer", tokenizer])

    if dtype != "auto":
        command.extend(["--dtype", dtype])

    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(int(tensor_parallel_size))])

    if max_model_len:
        command.extend(["--max-model-len", str(int(max_model_len))])

    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])

    if device != "auto":
        command.extend(["--device", device])

    # Add any additional arguments: underscores become hyphens on the
    # command line, and True becomes a bare flag with no value
    for key, value in kwargs.items():
        if value is None or value == "" or value is False:
            continue
        arg_name = f"--{key.replace('_', '-')}"
        if value is True:
            command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])

    # shlex.join quotes any values that contain spaces
    return shlex.join(command)

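# For example, with the helper above (illustrative values):
#   generate_vllm_command("facebook/opt-125m", dtype="float16", trust_remote_code=True)
#   -> 'vllm serve facebook/opt-125m --dtype float16 --trust-remote-code'
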
# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command.")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name",
                           placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify",
                     "score", "reward", "transcription"],
            value="auto",
            label="Task",
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type",
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device",
        )

    with gr.Tab("Performance Settings"):
        tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1,
                                         label="Tensor Parallel Size")
        max_model_len = gr.Number(label="Max Model Length", precision=0)
        quantization = gr.Dropdown(
            choices=[
                "None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8",
                "fbgemm_fp8", "modelopt", "nvfp4", "marlin", "bitblas", "gguf",
                "gptq_marlin_24", "gptq_marlin", "gptq_bitblas", "awq_marlin", "gptq",
                "compressed-tensors", "bitsandbytes", "qqq", "hqq", "experts_int8",
                "neuron_quant", "ipex", "quark", "moe_wna16", "torchao",
            ],
            value="None",
            label="Quantization",
        )
        gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9,
                                           step=0.1, label="GPU Memory Utilization")

    with gr.Tab("Advanced Settings"):
        trust_remote_code = gr.Checkbox(label="Trust Remote Code")
        max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
        max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
        seed = gr.Number(label="Random Seed", precision=0)
        additional_args = gr.Textbox(
            label="Additional Arguments (JSON format)",
            placeholder='{"arg1": "value1", "arg2": "value2"}',
        )

    output = gr.Textbox(label="Generated Command")

    def process_form(model, task, tokenizer, dtype, device,
                     tensor_parallel_size, max_model_len, quantization,
                     gpu_memory_utilization, trust_remote_code,
                     max_num_batched_tokens, max_num_seqs, seed,
                     additional_args_text):
        # Collect the named form values
        form_data = {
            "model": model,
            "task": task,
            "tokenizer": tokenizer,
            "dtype": dtype,
            "device": device,
            "tensor_parallel_size": tensor_parallel_size,
            "max_model_len": max_model_len,
            "quantization": quantization,
            "gpu_memory_utilization": gpu_memory_utilization,
            "max_num_batched_tokens": max_num_batched_tokens,
            "max_num_seqs": max_num_seqs,
            "seed": seed,
        }

        # Parse additional arguments
        extra_args = {}
        if additional_args_text:
            try:
                extra_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"

        # Filter out empty values
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}

        # Pass the checkbox through as a boolean; generate_vllm_command
        # renders True as a bare --trust-remote-code flag
        if trust_remote_code:
            form_data["trust_remote_code"] = True

        # Merge with additional args
        form_data.update(extra_args)

        # Generate command
        return generate_vllm_command(**form_data)

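    # e.g. pasting '{"gpu_memory_utilization": 0.8, "enable_prefix_caching": true}'
    # into the Additional Arguments box (flag names are illustrative) yields
    # '--gpu-memory-utilization 0.8 --enable-prefix-caching' in the output
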
    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args,
        ],
        outputs=output,
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
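
A quick sanity check of the generator without launching the UI (a minimal sketch; the model name and flag values are placeholders, not recommendations):

    from app import generate_vllm_command

    cmd = generate_vllm_command(
        "meta-llama/Llama-2-7b-chat-hf",
        dtype="bfloat16",
        tensor_parallel_size=2,
        trust_remote_code=True,
    )
    print(cmd)
    # vllm serve meta-llama/Llama-2-7b-chat-hf --dtype bfloat16
    #   --tensor-parallel-size 2 --trust-remote-code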