thyecust committed on
Commit 8c56112 · 1 Parent(s): a7ec1ff

Add application file

Files changed (1)
  1. app.py +149 -0
app.py ADDED
@@ -0,0 +1,149 @@
+import gradio as gr
+import json
+
+def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto", tensor_parallel_size=1,
+                          max_model_len=None, quantization=None, device="auto", **kwargs):
+    """Build a `vllm serve` command string from the form values."""
+    command = ["vllm", "serve"]
+
+    # `vllm serve` takes the model as a positional argument, not via --model
+    if model:
+        command.append(model)
+
+    # Add optional arguments, skipping values that match the defaults
+    if task != "auto":
+        command.extend(["--task", task])
+
+    if tokenizer:
+        command.extend(["--tokenizer", tokenizer])
+
+    if dtype != "auto":
+        command.extend(["--dtype", dtype])
+
+    if tensor_parallel_size != 1:
+        command.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
+
+    if max_model_len:
+        command.extend(["--max-model-len", str(max_model_len)])
+
+    if quantization and quantization != "None":
+        command.extend(["--quantization", quantization])
+
+    if device != "auto":
+        command.extend(["--device", device])
+
+    # Add any additional arguments
+    for key, value in kwargs.items():
+        # Convert underscores back to hyphens for the command line
+        arg_name = f"--{key.replace('_', '-')}"
+        if value is True:
+            # Boolean flags such as --trust-remote-code take no value
+            command.append(arg_name)
+        elif value:
+            command.extend([arg_name, str(value)])
+
+    return " ".join(command)
+
+# Define the interface
+with gr.Blocks(title="VLLM Command Generator") as app:
+    gr.Markdown("# VLLM Command Generator")
+    gr.Markdown("Fill out the form to generate a `vllm serve` command.")
+
+    with gr.Tab("Basic Settings"):
+        model = gr.Textbox(label="Model Path or Name", placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
+        task = gr.Dropdown(
+            choices=["auto", "generate", "embedding", "embed", "classify", "score", "reward", "transcription"],
+            value="auto",
+            label="Task"
+        )
+        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
+        dtype = gr.Dropdown(
+            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
+            value="auto",
+            label="Data Type"
+        )
+        device = gr.Dropdown(
+            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
+            value="auto",
+            label="Device"
+        )
+
+    with gr.Tab("Performance Settings"):
+        tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1, label="Tensor Parallel Size")
+        max_model_len = gr.Number(label="Max Model Length", precision=0)
+        quantization = gr.Dropdown(
+            choices=[
+                "None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8", "fbgemm_fp8",
+                "modelopt", "nvfp4", "marlin", "bitblas", "gguf", "gptq_marlin_24", "gptq_marlin",
+                "gptq_bitblas", "awq_marlin", "gptq", "compressed-tensors", "bitsandbytes",
+                "qqq", "hqq", "experts_int8", "neuron_quant", "ipex", "quark", "moe_wna16", "torchao"
+            ],
+            value="None",
+            label="Quantization"
+        )
+        gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="GPU Memory Utilization")
+
+    with gr.Tab("Advanced Settings"):
+        trust_remote_code = gr.Checkbox(label="Trust Remote Code")
+        max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
+        max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
+        seed = gr.Number(label="Random Seed", precision=0)
+        additional_args = gr.Textbox(
+            label="Additional Arguments (JSON format)",
+            placeholder='{"arg1": "value1", "arg2": "value2"}'
+        )
+
+    output = gr.Textbox(label="Generated Command")
+
+    def process_form(*args):
+        # Map the positional form inputs to named arguments
+        form_data = {
+            "model": args[0],
+            "task": args[1],
+            "tokenizer": args[2],
+            "dtype": args[3],
+            "device": args[4],
+            "tensor_parallel_size": args[5],
+            "max_model_len": args[6],
+            "quantization": args[7],
+            "gpu_memory_utilization": args[8],
+            "trust_remote_code": args[9],
+            "max_num_batched_tokens": args[10],
+            "max_num_seqs": args[11],
+            "seed": args[12]
+        }
+
+        # Parse the free-form additional arguments
+        additional_args_text = args[13]
+        additional_args = {}
+        if additional_args_text:
+            try:
+                additional_args = json.loads(additional_args_text)
+            except json.JSONDecodeError:
+                return "Error: Additional arguments must be valid JSON"
+
+        # Filter out empty values; an unchecked checkbox (False) is dropped,
+        # a checked one (True) is rendered as a bare flag downstream
+        form_data = {k: v for k, v in form_data.items() if v not in (None, "", False)}
+
+        # Merge with additional args
+        form_data.update(additional_args)
+
+        # Generate command
+        return generate_vllm_command(**form_data)
+
+    submit_btn = gr.Button("Generate Command")
+    submit_btn.click(
+        process_form,
+        inputs=[
+            model, task, tokenizer, dtype, device,
+            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
+            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
+            additional_args
+        ],
+        outputs=output
+    )
+
+# Launch the app
+if __name__ == "__main__":
+    app.launch()
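
For reference, the generator can also be exercised directly, outside the Gradio UI. A minimal sketch with hypothetical input values (the model name is the one used as the form placeholder; which flags the server actually accepts depends on the installed vLLM version):

print(generate_vllm_command(
    model="meta-llama/Llama-2-7b-chat-hf",
    dtype="bfloat16",
    tensor_parallel_size=2,
    max_model_len=4096,
    gpu_memory_utilization=0.9,
))
# vllm serve meta-llama/Llama-2-7b-chat-hf --dtype bfloat16 --tensor-parallel-size 2 --max-model-len 4096 --gpu-memory-utilization 0.9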