import gradio as gr
from tabulate import tabulate
import os

# Define the environment variables
os.environ['NUM_GPU'] = '1'
os.environ['PROMPT_SZ'] = '4096'
os.environ['RESPONSE_SZ'] = '256'
os.environ['N_CONCURRENT_REQ'] = '10'
os.environ['CTX_WINDOW'] = '1024'
# Simple demo function (not wired into the interface below)
def greet(name):
    return f"Ciao, {name}!"
def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
    # GPU memory left after loading the FP16 weights (~2 GB per billion parameters),
    # divided by the KV cache footprint per token (in GB)
    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
    return result if result >= 0 else "OOM"
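# Illustrative check (not from the original app): with one A10 (24 GB), Llama-3-8B (~16 GB of FP16 weights),
# and an assumed FP16 KV cache of 2 (K and V) * 2 bytes * 32 layers * 32 heads * 128 dims ≈ 0.000524 GB per token,
# calc_kv_cache_tokens(1, 24, 8, 0.000524288) gives (24 - 16) / 0.000524288 ≈ 15,259 tokens of KV cache capacity.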
def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
    # Prefill is compute-bound: ~2 FLOPs per parameter per token, result in ms per prompt token
    result = (2 * model_params_billion / num_gpu) / fp16_tflops
    return result if result >= 0 else "OOM"

def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
    # Generation is memory-bound: ~2 bytes per parameter read per token, result in ms per generated token
    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
    return result if result >= 0 else "OOM"

def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
    if isinstance(prefill_time, str) or isinstance(generation_time, str):  # propagate "OOM"
        return "OOM"
    return (prompt_size * prefill_time + response_size * generation_time) / 1000  # convert ms to seconds
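# Illustrative check (not from the original app), assuming one A10 (125 TFLOPS FP16, 600 GB/s) running Llama-3-8B
# with the default 4096-token prompt and 256-token response:
#   prefill    ≈ (2 * 8 / 1) / 125        ≈ 0.128 ms per prompt token
#   generation ≈ (2 * 8 / 1) / 600 * 1000 ≈ 26.7 ms per generated token
#   response   ≈ (4096 * 0.128 + 256 * 26.7) / 1000 ≈ 7.4 s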
def estimate_capacity_latency(model_name, gpu_name):
    model = next(m for m in model_specs if m['name'] == model_name)
    gpu = next(g for g in gpu_specs if g['name'] == gpu_name)
    # KV cache per token in GB, assuming FP16 keys and values: 2 (K and V) * 2 bytes * n_layers * n_heads * d_head
    kv_cache_size_per_token = 2 * 2 * model['n_layers'] * model['n_heads'] * model['d_head'] / 1e9
    kv_cache_tokens = calc_kv_cache_tokens(int(os.environ['NUM_GPU']), gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
    prefill_time_per_token = calc_prefill_time_per_token(int(os.environ['NUM_GPU']), model['params_billion'], gpu['fp16_tflops'])
    generation_time_per_token = calc_generation_time_per_token(int(os.environ['NUM_GPU']), model['params_billion'], gpu['memory_bandwidth_gbps'])
    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, int(os.environ['PROMPT_SZ']), int(os.environ['RESPONSE_SZ']))
    headers = ["KV cache (tokens)", "Prefill (ms/token)", "Generation (ms/token)", "Estimated response (s)"]
    rows = [[kv_cache_tokens, prefill_time_per_token, generation_time_per_token, estimated_response_time]]
    return tabulate(rows, headers=headers, tablefmt="html", floatfmt=".3f")
# GPU and model specifications, kept at module level so the estimator above can look them up by name
gpu_specs = [
    {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
    {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
    # ... other GPUs ...
]
model_specs = [
    {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
    {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
    # ... other models ...
]

def create_gradio_interface():
    demo = gr.Interface(
        fn=estimate_capacity_latency,
        inputs=[
            gr.Dropdown(choices=[model['name'] for model in model_specs], label="Model Type"),
            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
        ],
        outputs=[
            gr.HTML(label="Capacity and Latency Table")
        ],
        title="LLM Capacity and Latency Estimator",
        description="Estimate LLM capacity and latency based on model and GPU specifications.",
        theme="minimal"
    )
    return demo
# Create the Gradio interface
gr_interface = create_gradio_interface()

# Launch the interface
gr_interface.launch()
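# Illustrative direct call (not part of the original app), bypassing the UI and using the specs defined above:
# print(estimate_capacity_latency("Llama-3-8B", "A10"))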