# LLM_Sizing / app.py
import gradio as gr
from tabulate import tabulate
import os
# Define the environment variables (sizing defaults used by the estimator)
os.environ['NUM_GPU'] = '1'
os.environ['PROMPT_SZ'] = '4096'
os.environ['RESPONSE_SZ'] = '256'
os.environ['N_CONCURRENT_REQ'] = '10'
os.environ['CTX_WINDOW'] = '1024'
def greet(name):
    return f"Hello, {name}!"
def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
    # Memory left after loading FP16 weights (~2 GB per billion parameters),
    # divided by the KV-cache footprint of one token (kv_cache_size, in GB).
    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
    return result if result >= 0 else "OOM"
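# Illustrative check, assuming a single A10 (24 GB), Llama-3-8B held in FP16 and an
# assumed KV-cache footprint of ~0.000524 GB per token (GPU/model values from the spec lists below):
# (1 * 24 - 2 * 8) / 0.000524288 ≈ 15,259 cacheable tokens,
# i.e. calc_kv_cache_tokens(1, 24, 8, 0.000524288) ≈ 15258.8.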
def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
    # Compute-bound prefill: ~2 FLOPs per parameter per token; the result is in ms.
    result = (2 * model_params_billion / num_gpu) / fp16_tflops
    return result if result >= 0 else "OOM"
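# Illustrative check, assuming a single A10 (125 FP16 TFLOPS) and Llama-3-8B:
# (2 * 8) / 125 = 0.128 ms of prefill compute per prompt token,
# i.e. calc_prefill_time_per_token(1, 8, 125) -> 0.128.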
def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
    # Memory-bound decode: ~2 bytes per parameter are streamed per generated token; the result is in ms.
    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
    return result if result >= 0 else "OOM"
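# Illustrative check, assuming a single A10 (600 GB/s) and Llama-3-8B:
# (2 * 8) / 600 * 1000 ≈ 26.7 ms per generated token,
# i.e. calc_generation_time_per_token(1, 8, 600) ≈ 26.67.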
def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
    if isinstance(prefill_time, str) or isinstance(generation_time, str):  # either stage already reported "OOM"
        return "OOM"
    return (prompt_size * prefill_time + response_size * generation_time) / 1000  # convert ms to seconds
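# Illustrative check, combining the two per-token numbers above with the defaults
# PROMPT_SZ=4096 and RESPONSE_SZ=256: (4096 * 0.128 + 256 * 26.67) / 1000 ≈ 7.4 s end-to-end,
# i.e. calc_estimated_response_time(0.128, 26.67, 4096, 256) ≈ 7.4.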
def estimate_capacity_latency(model, gpu):
    # Assumed per-token KV-cache size in GB: K and V stored in FP16 (2 bytes each) for every layer and head.
    kv_cache_size_per_token = (2 * 2 * model['n_layers'] * model['n_heads'] * model['d_head']) / 1e9
    kv_cache_tokens = calc_kv_cache_tokens(int(os.environ['NUM_GPU']), gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
    prefill_time_per_token = calc_prefill_time_per_token(int(os.environ['NUM_GPU']), model['params_billion'], gpu['fp16_tflops'])
    generation_time_per_token = calc_generation_time_per_token(int(os.environ['NUM_GPU']), model['params_billion'], gpu['memory_bandwidth_gbps'])
    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, int(os.environ['PROMPT_SZ']), int(os.environ['RESPONSE_SZ']))
    fmt = lambda value, pattern: "OOM" if isinstance(value, str) else pattern.format(value)
    return fmt(kv_cache_tokens, "{:,.0f} tokens"), fmt(prefill_time_per_token, "{:.3f} ms"), fmt(generation_time_per_token, "{:.3f} ms"), fmt(estimated_response_time, "{:.1f} s")
def create_gradio_interface():
    gpu_specs = [
        {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
        {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
        # ... other GPUs ...
    ]
    model_specs = [
        {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
        {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
        # ... other models ...
    ]
    def run_estimate(model_name, gpu_name):
        # Map the dropdown selections back to their spec dicts and render a single HTML table.
        model = next(m for m in model_specs if m['name'] == model_name)
        gpu = next(g for g in gpu_specs if g['name'] == gpu_name)
        row = estimate_capacity_latency(model, gpu)
        headers = ["KV cache capacity", "Prefill time / token", "Generation time / token", "Estimated response time"]
        return tabulate([row], headers=headers, tablefmt="html")

    demo = gr.Interface(
        fn=run_estimate,
        inputs=[
            gr.Dropdown(choices=[model['name'] for model in model_specs], label="Model Type"),
            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
        ],
        outputs=[
            gr.HTML(label="Capacity and Latency Table")
        ],
        title="LLM Capacity and Latency Estimator",
        description="Estimate LLM capacity and latency based on model and GPU specifications.",
        theme="minimal"
    )
    return demo
# Create the Gradio interface
gr_interface = create_gradio_interface()
# Launch the interface
gr_interface.launch()
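# Usage sketch (hypothetical values copied from the spec lists above; launch() blocks,
# so run this in a separate session or before launching if you want to sanity-check the numbers):
#   model = {"name": "Llama-3-8B", "params_billion": 8, "n_layers": 32, "n_heads": 32, "d_head": 128}
#   gpu = {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600}
#   print(estimate_capacity_latency(model, gpu))
#   # -> roughly ('15,259 tokens', '0.128 ms', '26.667 ms', '7.4 s')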