Add application file
app.py
ADDED
@@ -0,0 +1,149 @@
import json
import shlex

import gradio as gr


def generate_vllm_command(model, task="auto", tokenizer=None, dtype="auto",
                          tensor_parallel_size=1, max_model_len=None,
                          quantization=None, device="auto", **kwargs):
    """Build a `vllm serve` command string from the form values."""
    command = ["vllm", "serve"]

    # `vllm serve` takes the model as a positional argument, not --model
    if model:
        command.append(model)

    # Add optional arguments, skipping defaults
    if task != "auto":
        command.extend(["--task", task])

    if tokenizer:
        command.extend(["--tokenizer", tokenizer])

    if dtype != "auto":
        command.extend(["--dtype", dtype])

    if tensor_parallel_size != 1:
        command.extend(["--tensor-parallel-size", str(int(tensor_parallel_size))])

    if max_model_len:
        command.extend(["--max-model-len", str(int(max_model_len))])

    if quantization and quantization != "None":
        command.extend(["--quantization", quantization])

    if device != "auto":
        command.extend(["--device", device])

    # Add any additional arguments: underscores become hyphens on the
    # command line, and True becomes a bare flag with no value
    for key, value in kwargs.items():
        if value is None or value == "" or value is False:
            continue
        arg_name = f"--{key.replace('_', '-')}"
        if value is True:
            command.append(arg_name)
        else:
            command.extend([arg_name, str(value)])

    # shlex.join quotes any values that contain spaces
    return shlex.join(command)

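# For example, with the helper above (illustrative values):
#   generate_vllm_command("facebook/opt-125m", dtype="float16", trust_remote_code=True)
#   -> 'vllm serve facebook/opt-125m --dtype float16 --trust-remote-code'
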
# Define the interface
with gr.Blocks(title="vLLM Command Generator") as app:
    gr.Markdown("# vLLM Command Generator")
    gr.Markdown("Fill out the form to generate a `vllm serve` command.")

    with gr.Tab("Basic Settings"):
        model = gr.Textbox(label="Model Path or Name",
                           placeholder="e.g., meta-llama/Llama-2-7b-chat-hf")
        task = gr.Dropdown(
            choices=["auto", "generate", "embedding", "embed", "classify",
                     "score", "reward", "transcription"],
            value="auto",
            label="Task",
        )
        tokenizer = gr.Textbox(label="Tokenizer (optional)", placeholder="Path to tokenizer")
        dtype = gr.Dropdown(
            choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
            value="auto",
            label="Data Type",
        )
        device = gr.Dropdown(
            choices=["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"],
            value="auto",
            label="Device",
        )

    with gr.Tab("Performance Settings"):
        tensor_parallel_size = gr.Slider(minimum=1, maximum=8, value=1, step=1,
                                         label="Tensor Parallel Size")
        max_model_len = gr.Number(label="Max Model Length", precision=0)
        quantization = gr.Dropdown(
            choices=[
                "None", "aqlm", "awq", "deepspeedfp", "tpu_int8", "fp8", "ptpc_fp8",
                "fbgemm_fp8", "modelopt", "nvfp4", "marlin", "bitblas", "gguf",
                "gptq_marlin_24", "gptq_marlin", "gptq_bitblas", "awq_marlin", "gptq",
                "compressed-tensors", "bitsandbytes", "qqq", "hqq", "experts_int8",
                "neuron_quant", "ipex", "quark", "moe_wna16", "torchao",
            ],
            value="None",
            label="Quantization",
        )
        gpu_memory_utilization = gr.Slider(minimum=0.1, maximum=1.0, value=0.9,
                                           step=0.1, label="GPU Memory Utilization")

    with gr.Tab("Advanced Settings"):
        trust_remote_code = gr.Checkbox(label="Trust Remote Code")
        max_num_batched_tokens = gr.Number(label="Max Number of Batched Tokens", precision=0)
        max_num_seqs = gr.Number(label="Max Number of Sequences", precision=0)
        seed = gr.Number(label="Random Seed", precision=0)
        additional_args = gr.Textbox(
            label="Additional Arguments (JSON format)",
            placeholder='{"arg1": "value1", "arg2": "value2"}',
        )

    output = gr.Textbox(label="Generated Command")

    def process_form(model, task, tokenizer, dtype, device,
                     tensor_parallel_size, max_model_len, quantization,
                     gpu_memory_utilization, trust_remote_code,
                     max_num_batched_tokens, max_num_seqs, seed,
                     additional_args_text):
        # Collect the named form values
        form_data = {
            "model": model,
            "task": task,
            "tokenizer": tokenizer,
            "dtype": dtype,
            "device": device,
            "tensor_parallel_size": tensor_parallel_size,
            "max_model_len": max_model_len,
            "quantization": quantization,
            "gpu_memory_utilization": gpu_memory_utilization,
            "max_num_batched_tokens": max_num_batched_tokens,
            "max_num_seqs": max_num_seqs,
            "seed": seed,
        }

        # Parse additional arguments
        extra_args = {}
        if additional_args_text:
            try:
                extra_args = json.loads(additional_args_text)
            except json.JSONDecodeError:
                return "Error: Additional arguments must be valid JSON"

        # Filter out empty values
        form_data = {k: v for k, v in form_data.items() if v is not None and v != ""}

        # Pass the checkbox through as a boolean; generate_vllm_command
        # renders True as a bare --trust-remote-code flag
        if trust_remote_code:
            form_data["trust_remote_code"] = True

        # Merge with additional args
        form_data.update(extra_args)

        # Generate command
        return generate_vllm_command(**form_data)

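    # e.g. pasting '{"gpu_memory_utilization": 0.8, "enable_prefix_caching": true}'
    # into the Additional Arguments box (flag names are illustrative) yields
    # '--gpu-memory-utilization 0.8 --enable-prefix-caching' in the output
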
    submit_btn = gr.Button("Generate Command")
    submit_btn.click(
        process_form,
        inputs=[
            model, task, tokenizer, dtype, device,
            tensor_parallel_size, max_model_len, quantization, gpu_memory_utilization,
            trust_remote_code, max_num_batched_tokens, max_num_seqs, seed,
            additional_args,
        ],
        outputs=output,
    )

# Launch the app
if __name__ == "__main__":
    app.launch()
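
A quick sanity check of the generator without launching the UI (a minimal sketch; the model name and flag values are placeholders, not recommendations):

    from app import generate_vllm_command

    cmd = generate_vllm_command(
        "meta-llama/Llama-2-7b-chat-hf",
        dtype="bfloat16",
        tensor_parallel_size=2,
        trust_remote_code=True,
    )
    print(cmd)
    # vllm serve meta-llama/Llama-2-7b-chat-hf --dtype bfloat16
    #   --tensor-parallel-size 2 --trust-remote-code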