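"""Gradio app that estimates LLM serving capacity (KV-cache tokens) and latency
(prefill time, generation time, and end-to-end response time) for a chosen model
and GPU, based on the model and GPU specifications defined below."""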
import gradio as gr
from tabulate import tabulate
import os

# Define the environment variables (sizing assumptions for the estimator)
os.environ['NUM_GPU'] = '1'              # number of GPUs serving the model
os.environ['PROMPT_SZ'] = '4096'         # prompt length in tokens
os.environ['RESPONSE_SZ'] = '256'        # response length in tokens
os.environ['N_CONCURRENT_REQ'] = '10'    # concurrent requests (not read below)
os.environ['CTX_WINDOW'] = '1024'        # context window in tokens (not read below)

def greet(name):
    # Simple example function; not wired into the Gradio interface below.
    return f"Hello, {name}!"

def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
    # Tokens that fit in the KV cache: total GPU memory minus the fp16 weights
    # (2 bytes/param -> 2 * params_billion GB), divided by KV-cache GB per token.
    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
    return result if result >= 0 else "OOM"

def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
    # ~2 * params FLOPs per token, split across the GPUs; with params in billions
    # and compute in TFLOPS the result is in ms per token.
    result = (2 * model_params_billion / num_gpu) / fp16_tflops
    return result if result >= 0 else "OOM"

def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
    # Decoding is memory-bound: each generated token re-reads the fp16 weights
    # (2 * params_billion GB) at the given bandwidth (GB/s); converted to ms per token.
    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
    return result if result >= 0 else "OOM"

def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
    if isinstance(prefill_time, str) or isinstance(generation_time, str):  # propagate "OOM"
        return "OOM"
    return (prompt_size * prefill_time + response_size * generation_time) / 1000  # ms -> s

gpu_specs = [
    {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
    {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
    # ... other GPUs ...
]

model_specs = [
    {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
    {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
    # ... other models ...
]

def calc_kv_cache_size_per_token(n_layers, d_model):
    # fp16 K and V caches: 2 (K and V) * 2 bytes * n_layers * d_model, in GB per token.
    return 2 * 2 * n_layers * d_model / 1e9

def estimate_capacity_latency(model_name, model_type, gpu_type):
    # The dropdowns pass names (strings), so look the specs up by name.
    model = next(m for m in model_specs if m['name'] == model_type)
    gpu = next(g for g in gpu_specs if g['name'] == gpu_type)
    num_gpu = int(os.environ['NUM_GPU'])
    kv_cache_size_per_token = calc_kv_cache_size_per_token(model['n_layers'], model['d_model'])
    kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
    prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
    generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, int(os.environ['PROMPT_SZ']), int(os.environ['RESPONSE_SZ']))

    def fmt(value, pattern):
        # Keep "OOM" strings as-is, otherwise apply the numeric format.
        return value if isinstance(value, str) else pattern.format(value)

    rows = [[
        model_name or model['name'],
        gpu['name'],
        fmt(kv_cache_tokens, "{:.0f}"),
        fmt(prefill_time_per_token, "{:.3f} ms"),
        fmt(generation_time_per_token, "{:.3f} ms"),
        fmt(estimated_response_time, "{:.1f} s"),
    ]]
    headers = ["Model", "GPU", "KV cache (tokens)", "Prefill / token", "Generation / token", "Est. response time"]
    return tabulate(rows, headers=headers, tablefmt="html")

def create_gradio_interface():
    
    demo = gr.Interface(
        fn=estimate_capacity_latency,
        inputs=[
            gr.Textbox(label="Model Name"),
            gr.Dropdown(choices=[model['name'] for model in model_specs], label="Model Type"),
            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
        ],
        outputs=[
            gr.HTML(label="Capacity and Latency Table")
        ],
        title="LLM Capacity and Latency Estimator",
        description="Estimate LLM capacity and latency based on model and GPU specifications.",
        theme="minimal"
    )
    
    return demo

# Create the Gradio interface
gr_interface = create_gradio_interface()

# Launch the interface
gr_interface.launch()