farmax committed on
Commit a92af8c · verified · 1 Parent(s): e452071

Update app.py

Files changed (1)
  1. app.py +109 -71
app.py CHANGED
@@ -1,60 +1,43 @@
 import gradio as gr
+import argparse
 from tabulate import tabulate
-import os
 
-def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
-    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
-    return result if result >= 0 else "OOM"
+def main():
+    parser = argparse.ArgumentParser(description='Estimate LLM Memory Footprint')
+    parser.add_argument('--num_gpu', type=int, default=1, help='Number of GPUs')
+    parser.add_argument('--prompt_sz', type=int, default=4096, help='Prompt size in tokens')
+    parser.add_argument('--response_sz', type=int, default=256, help='Response size in tokens')
+    parser.add_argument('--n_concurrent_req', type=int, default=10, help='Number of concurrent requests')
+    parser.add_argument('--ctx_window', type=int, default=1024, help='Average context window')
 
-def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
-    result = (2 * model_params_billion / num_gpu) / fp16_tflops
-    return result if result >= 0 else "OOM"
+    args = parser.parse_args()
 
-def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
-    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
-    return result if result >= 0 else "OOM"
+    num_gpu = args.num_gpu
+    prompt_size = args.prompt_sz
+    response_size = args.response_sz
+    n_concurrent_request = args.n_concurrent_req
+    avg_context_window = args.ctx_window
 
-def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
-    if isinstance(prefill_time, str) or isinstance(generation_time, str): # Check if any are "NA"
-        return "OOM"
-    return (prompt_size * prefill_time + response_size * generation_time) / 1000 # convert ms to seconds
+    # Print input
+    print(f" num_gpu = {num_gpu}, prompt_size = {prompt_size} tokens, response_size = {response_size} tokens")
+    print(f" n_concurrent_request = {n_concurrent_request}, avg_context_window = {avg_context_window} tokens")
 
-def estimate_capacity_latency(num_gpu, gpu_dict, model, kv_cache_size_per_token):
-    # Convert gpu_dict to a dictionary if it's a string
-    if isinstance(gpu_dict, str):
-        gpu_dict = eval(gpu_dict)
-
-    # Convert model to a dictionary if it's a string
-    if isinstance(model, str):
-        model = eval(model)
-
-    # Now proceed with the calculation
-    kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu_dict['memory_gb'], model['params_billion'], kv_cache_size_per_token)
-    prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu_dict['fp16_tflops'])
-    generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu_dict['memory_bandwidth_gbps'])
-    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_sz, response_sz)
-
-    return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
-
-def create_gradio_interface():
-    global gpu_specs, model_specs
-
-    # Load GPU specs
+    # Define variables
     gpu_specs = [
         {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
-        {"name": "A30", "fp16_tflops": 330, "memory_bandwidth_gbps": 933},
-        {"name": "L40", "fp16_tflops": 181, "memory_bandwidth_gbps": 864},
-        {"name": "L40s", "fp16_tflops": 362, "memory_bandwidth_gbps": 864},
-        {"name": "A100 40 GB", "fp16_tflops": 312, "memory_bandwidth_gbps": 1555},
-        {"name": "A100 40 GB SXM", "fp16_tflops": 312, "memory_bandwidth_gbps": 1555},
-        {"name": "A100 80 GB PCIe", "fp16_tflops": 312, "memory_bandwidth_gbps": 1935},
-        {"name": "A100 80 GB SXM", "fp16_tflops": 312, "memory_bandwidth_gbps": 2039},
-        {"name": "H100 PCIe", "fp16_tflops": 1513, "memory_bandwidth_gbps": 2000},
-        {"name": "H100 SXM", "fp16_tflops": 1979, "memory_bandwidth_gbps": 3350},
-        {"name": "H100 NVL", "fp16_tflops": 3958, "memory_bandwidth_gbps": 7800}
+        {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
+        {"name": "L40", "fp16_tflops": 181, "memory_gb": 48, "memory_bandwidth_gbps": 864},
+        {"name": "L40s", "fp16_tflops": 362, "memory_gb": 48, "memory_bandwidth_gbps": 864},
+        {"name": "A100 40 GB", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
+        {"name": "A100 40 GB SXM", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
+        {"name": "A100 80 GB PCIe", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 1935},
+        {"name": "A100 80 GB SXM", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 2039},
+        {"name": "H100 PCIe", "fp16_tflops": 1513, "memory_gb": 80, "memory_bandwidth_gbps": 2000},
+        {"name": "H100 SXM", "fp16_tflops": 1979, "memory_gb": 80, "memory_bandwidth_gbps": 3350},
+        {"name": "H100 NVL", "fp16_tflops": 3958, "memory_gb": 188, "memory_bandwidth_gbps": 7800}
+        # Add or comment out GPU types as needed
     ]
-
-    # Load model specs
+
     model_specs = [
         {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
         {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
@@ -64,32 +47,87 @@ def create_gradio_interface():
         {"name": "Falcon-7B", "params_billion": 7, "d_model": 4544, "n_heads": 71, "n_layers": 32, "max_context_window": 2048, "d_head": 64},
         {"name": "Falcon-40B", "params_billion": 40, "d_model": 8192, "n_heads": 128, "n_layers": 60, "max_context_window": 2048, "d_head": 64},
         {"name": "Falcon-180B", "params_billion": 180, "d_model": 14848, "n_heads": 232, "n_layers": 80, "max_context_window": 2048, "d_head": 64}
+        # Add or comment out model specifications as needed
     ]
+
+    BYTES_IN_GB = 1_073_741_824 # 1 GB = 1,073,741,824 bytes
+
+    def estimate_llm_capacity_and_latency(model_name, gpu_name, num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window):
+        def calc_kv_cache_size_per_token(n_layers, d_model):
+            return 2 * 2 * n_layers * d_model / BYTES_IN_GB # GB/token
 
-    demo = gr.Interface(
-        fn=estimate_capacity_latency,
-        inputs=[
-            gr.Dropdown(choices=[model['name'] for model in model_specs], label="Model Type"),
-            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type"),
-
-            gr.Number(value=int(os.environ.get('NUM_GPU', '1')), label='Number of GPUs'),
-            gr.Number(value=int(os.environ.get('PROMPT_SZ', '4096')), label='Prompt Size'),
-            gr.Number(value=int(os.environ.get('RESPONSE_SZ', '256')), label='Response Size'),
-            gr.Number(value=int(os.environ.get('N_CONCURRENT_REQ', '10')), label='Concurrent Requests'),
-            gr.Number(value=int(os.environ.get('CTX_WINDOW', '1024')), label='Context Window')
-        ],
-        outputs=[
-            gr.HTML(label="Capacity and Latency Table")
-        ],
-        title="LLM Capacity and Latency Estimator",
-        description="Estimate LLM capacity and latency based on model and GPU specifications.",
-        theme="minimal"
-    )
+        def calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window):
+            kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
+            target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
+            return target_gpu_mem
 
-    return demo
+        print(f"\n******************** Estimate LLM Memory Footprint ********************")
+        memory_footprint_table = []
+        for model_spec in model_specs:
+            kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
+            memory_footprint = calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window)
+            memory_footprint_table.append([model_spec['name'], f"{kv_cache_size_per_token:.6f} GiB/token", f"{memory_footprint:.2f} GB"])
+        print(tabulate(memory_footprint_table, headers=['Model', 'KV Cache Size per Token', 'Memory Footprint'], tablefmt='orgtbl'))
+
+        def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
+            result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
+            return result if result >= 0 else "OOM"
+
+        def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
+            result = (2 * model_params_billion / num_gpu) / fp16_tflops
+            return result if result >= 0 else "OOM"
+
+        def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
+            result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
+            return result if result >= 0 else "OOM"
+
+        def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
+            if isinstance(prefill_time, str) or isinstance(generation_time, str): # Check if any are "NA"
+                return "OOM"
+            return (prompt_size * prefill_time + response_size * generation_time) / 1000 # convert ms to seconds
+
+        print(f"\n******************** Estimate LLM Capacity and Latency ******************** ")
+        capacity_latency_table = []
+        for model in model_specs:
+            # print(f"Model: {model['name']} ({model['params_billion']}B parameters)")
+            kv_cache_size = calc_kv_cache_size_per_token(model['n_layers'], model['d_model'])
+            for gpu in gpu_specs:
+                kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size)
+                prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
+                generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
+                estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
+                capacity_latency_table.append([model['name'], gpu['name'], f"{kv_cache_tokens}", f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"])
+        print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
+
+    def generate_output(model_name, gpu_name, kv_cache_tokens, prefill_time, generation_time, estimated_response_time):
+        return {
+            "Model": model_name,
+            "GPU": gpu_name,
+            "KV Cache Tokens": str(kv_cache_tokens),
+            "Prefill Time (ms)": f"{prefill_time:.3f}",
+            "Generation Time (ms)": f"{generation_time:.3f}",
+            "Estimated Response Time (s)": f"{estimated_response_time:.1f}"
+        }
+
+    with gr.Blocks() as demo:
+        gr.Markdown("# Estimate LLM Capacity and Latency")
+
+        num_gpu = gr.Number(label="Number of GPUs", value=1)
+        prompt_size = gr.Slider(minimum=1, maximum=8192, label="Prompt Size (tokens)", value=4096)
+        response_size = gr.Slider(minimum=1, maximum=8192, label="Response Size (tokens)", value=256)
+        n_concurrent_request = gr.Slider(minimum=1, maximum=100, label="Concurrent Requests", value=10)
+        avg_context_window = gr.Slider(minimum=1, maximum=131072, label="Average Context Window", value=1024)
+
+        submit_button = gr.Button("Estimate")
+
+        table = gr.Table()
 
-# Create the Gradio interface
-gr_interface = create_gradio_interface()
+        submit_button.click(
+            fn=lambda num_gpu=num_gpu, prompt_size=prompt_size, response_size=response_size,
+                      n_concurrent_request=n_concurrent_request, avg_context_window=avg_context_window:
+                estimate_llm_capacity_and_latency(None, None, num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window),
+            inputs=[num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window],
+            outputs=[table]
+        )
 
-# Launch the interface
-gr_interface.launch()
+    demo.launch()
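
A note on the arithmetic behind the new memory-footprint table: calc_kv_cache_size_per_token charges 2 bytes (FP16) for each of K and V per layer per model dimension, and calc_memory_footprint adds the KV cache for the concurrent context on top of 2 GB per billion parameters of FP16 weights. The following standalone sketch is not part of the commit; it just reruns those formulas by hand for Llama-3-8B with the script's defaults (10 concurrent requests, 1024-token average context window):

# Standalone sketch of the commit's memory formulas for Llama-3-8B with default arguments.
BYTES_IN_GB = 1_073_741_824
n_layers, d_model, params_billion = 32, 4096, 8

# 2 bytes (FP16) * 2 tensors (K and V) * layers * hidden size, per token
kv_cache_size_per_token = 2 * 2 * n_layers * d_model / BYTES_IN_GB      # 0.000488 GB/token
# KV cache for 10 concurrent 1024-token contexts + FP16 weights (2 GB per billion params)
memory_footprint = kv_cache_size_per_token * 1024 * 10 + params_billion * 2   # 5 GB + 16 GB

print(f"{kv_cache_size_per_token:.6f} GB/token, {memory_footprint:.2f} GB")   # 0.000488 GB/token, 21.00 GB

This matches the Llama-3-8B row the committed code produces when run with its default arguments.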
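The capacity/latency table rests on three closed-form estimates: how many tokens fit in the GPU memory left over after the weights, a compute-bound prefill time per token (2 FLOPs per parameter over FP16 TFLOPS), and a memory-bound generation time per token (2 bytes per parameter over memory bandwidth). A hand-worked sketch for Llama-3-8B on a single A100 40 GB, using only numbers already present in the diff (illustrative, not part of the commit):

# Llama-3-8B on one A100 40 GB (312 FP16 TFLOPS, 1555 GB/s), using the commit's formulas.
num_gpu, params_billion = 1, 8
memory_gb, fp16_tflops, bandwidth_gbps = 40, 312, 1555
kv_cache_size = 2 * 2 * 32 * 4096 / 1_073_741_824                             # 0.000488 GB/token

kv_cache_tokens = (num_gpu * memory_gb - 2 * params_billion) / kv_cache_size  # 24 GB free -> 49152 tokens
prefill_ms = (2 * params_billion / num_gpu) / fp16_tflops                     # ~0.051 ms/token
generation_ms = (2 * params_billion / num_gpu) / bandwidth_gbps * 1000        # ~10.3 ms/token
response_s = (4096 * prefill_ms + 256 * generation_ms) / 1000                 # ~2.8 s for 4096 in / 256 out

print(f"{kv_cache_tokens:.0f} tokens, {prefill_ms:.3f} ms, {generation_ms:.3f} ms, {response_s:.1f} s")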
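Because the rewrite moves the inputs from os.environ onto argparse, the defaults can now be overridden on the command line. Assuming the module also gains the usual if __name__ == "__main__": main() guard (the hunk defines main() but no call to it is shown), an invocation would look like:

python app.py --num_gpu 2 --prompt_sz 2048 --response_sz 512 --n_concurrent_req 4 --ctx_window 4096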