import gradio as gr
import argparse
from tabulate import tabulate
# Simple demo handler; not used by the estimator below.
def greet(name):
    return f"Ciao, {name}!"

# estimate_capacity_latency is defined at module level so that both main() and
# the Gradio interface can call it. It reads num_gpu, prompt_size, and
# response_size from the module-level settings defined further down.
def estimate_capacity_latency(model, gpu):
    kv_cache_size_per_token = calc_kv_cache_size_per_token(model['n_layers'], model['d_model'])
    kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
    prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
    generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
    if kv_cache_tokens == "OOM" or estimated_response_time == "OOM":
        return "OOM", "OOM", "OOM"  # the FP16 weights alone exceed the available GPU memory
    return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
def main():
    parser = argparse.ArgumentParser(description='Estimate LLM memory footprint, capacity, and latency for a set of models and GPUs')
    parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
    parser.add_argument('-p', '--prompt_sz', type=int, default=4096, help='Prompt size in tokens')
    parser.add_argument('-r', '--response_sz', type=int, default=256, help='Response size in tokens')
    parser.add_argument('-c', '--n_concurrent_req', type=int, default=10, help='Number of concurrent requests')
    parser.add_argument('-w', '-cw', '--ctx_window', type=int, default=1024, help='Average context window in tokens')
    args = parser.parse_args()

    # Publish the parsed settings as module-level globals so that
    # estimate_capacity_latency and print_summary_tables can see them.
    global num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window
    num_gpu = args.num_gpu
    prompt_size = args.prompt_sz
    response_size = args.response_sz
    n_concurrent_request = args.n_concurrent_req
    avg_context_window = args.ctx_window

    # Print the input configuration
    print(f" num_gpu = {num_gpu}, prompt_size = {prompt_size} tokens, response_size = {response_size} tokens")
    print(f" n_concurrent_request = {n_concurrent_request}, avg_context_window = {avg_context_window} tokens")

    print_summary_tables()
# Default settings; main() overrides them from the command line. They live at
# module level so that the Gradio interface works even when main() has not run.
num_gpu = 1
prompt_size = 4096
response_size = 256
n_concurrent_request = 10
avg_context_window = 1024

# GPU specifications, kept at module level so the Gradio interface can use them
gpu_specs = [
    {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
    {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
    {"name": "L40", "fp16_tflops": 181, "memory_gb": 48, "memory_bandwidth_gbps": 864},
    {"name": "L40s", "fp16_tflops": 362, "memory_gb": 48, "memory_bandwidth_gbps": 864},
    {"name": "A100 40 GB", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
    {"name": "A100 40 GB SXM", "fp16_tflops": 312, "memory_gb": 40, "memory_bandwidth_gbps": 1555},
    {"name": "A100 80 GB PCIe", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 1935},
    {"name": "A100 80 GB SXM", "fp16_tflops": 312, "memory_gb": 80, "memory_bandwidth_gbps": 2039},
    {"name": "H100 PCIe", "fp16_tflops": 1513, "memory_gb": 80, "memory_bandwidth_gbps": 2000},
    {"name": "H100 SXM", "fp16_tflops": 1979, "memory_gb": 80, "memory_bandwidth_gbps": 3350},
    {"name": "H100 NVL", "fp16_tflops": 3958, "memory_gb": 188, "memory_bandwidth_gbps": 7800}
    # Add or comment out GPU types as needed
]
model_specs = [
    {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
    {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
    {"name": "Llama-3.1-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 131072, "d_head": 128},
    {"name": "Llama-3.1-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 131072, "d_head": 128},
    {"name": "Mistral-7B-v0.3", "params_billion": 7, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 32768, "d_head": 128},
    {"name": "Falcon-7B", "params_billion": 7, "d_model": 4544, "n_heads": 71, "n_layers": 32, "max_context_window": 2048, "d_head": 64},
    {"name": "Falcon-40B", "params_billion": 40, "d_model": 8192, "n_heads": 128, "n_layers": 60, "max_context_window": 2048, "d_head": 64},
    {"name": "Falcon-180B", "params_billion": 180, "d_model": 14848, "n_heads": 232, "n_layers": 80, "max_context_window": 2048, "d_head": 64}
    # Add or comment out model specifications as needed
]
BYTES_IN_GB = 1_073_741_824  # 1 GiB = 1,073,741,824 bytes

def calc_kv_cache_size_per_token(n_layers, d_model):
    # 2 bytes per FP16 value * 2 tensors (key and value) per layer * d_model entries
    return 2 * 2 * n_layers * d_model / BYTES_IN_GB  # GiB/token
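# Rough spot-check for Llama-3-8B (n_layers=32, d_model=4096):
#   2 * 2 * 32 * 4096 = 524,288 bytes/token, i.e. about 0.000488 GiB/token.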
def calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window):
    kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
    # KV cache for all concurrent requests plus FP16 weights (~2 GB per billion parameters)
    target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
    return target_gpu_mem
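# Rough spot-check for Llama-3-8B with the defaults (10 concurrent requests,
# 1024-token average context): 0.000488 * 1024 * 10 + 8 * 2 ~= 21 GB.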
def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
    # Memory left after loading the FP16 weights, expressed as a number of cacheable tokens
    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
    return result if result >= 0 else "OOM"
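# Rough spot-check: Llama-3-8B on a single A100 80 GB leaves 80 - 16 = 64 GB
# for KV cache, i.e. roughly 64 / 0.000488 ~= 131,000 tokens.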
def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
    # ~2 FLOPs per parameter per token; billions of FLOPs divided by TFLOPS yields milliseconds
    result = (2 * model_params_billion / num_gpu) / fp16_tflops
    return result if result >= 0 else "OOM"
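# Rough spot-check: Llama-3-8B on a single A100 (312 TFLOPS FP16):
#   2 * 8 / 312 ~= 0.051 ms per prompt token.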
def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
    # Decoding is memory-bandwidth bound: ~2 bytes per parameter are read per token; GB / (GB/s) gives seconds, * 1000 gives ms
    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
    return result if result >= 0 else "OOM"
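# Rough spot-check: Llama-3-8B on a single A100 80 GB SXM (2039 GB/s):
#   2 * 8 / 2039 * 1000 ~= 7.8 ms per generated token.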
def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
    if isinstance(prefill_time, str) or isinstance(generation_time, str):  # propagate "OOM"
        return "OOM"
    return (prompt_size * prefill_time + response_size * generation_time) / 1000  # convert ms to seconds
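# Rough spot-check with the defaults (4096-token prompt, 256-token response)
# and the Llama-3-8B / A100 80 GB SXM numbers above:
#   (4096 * 0.051 + 256 * 7.8) / 1000 ~= 2.2 s.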
print(f"\n******************** Estimate LLM Memory Footprint ********************")
memory_footprint_table = []
for model_spec in model_specs:
kv_cache_size_per_token = calc_kv_cache_size_per_token(model_spec["n_layers"], model_spec["d_model"])
memory_footprint = calc_memory_footprint(model_spec, n_concurrent_request, avg_context_window)
memory_footprint_table.append([model_spec['name'], f"{kv_cache_size_per_token:.6f} GiB/token", f"{memory_footprint:.2f} GB"])
print(tabulate(memory_footprint_table, headers=['Model', 'KV Cache Size per Token', 'Memory Footprint'], tablefmt='orgtbl'))
capacity_latency_table = []
for model in model_specs:
for gpu in gpu_specs:
prefill_time, generation_time, estimated_response_time = estimate_capacity_latency(model, gpu)
capacity_latency_table.append([model['name'], gpu['name'], f"{prefill_time}", f"{generation_time}", f"{estimated_response_time}"])
print(f"\n******************** Estimate LLM Capacity and Latency ******************** ")
print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
if __name__ == '__main__':
    main()
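# Example CLI run (the script name is assumed to be app.py, the usual
# Hugging Face Spaces entry point; adjust if the file is named differently):
#   python app.py --num_gpu 2 --prompt_sz 2048 --response_sz 512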
# The Gradio interface reuses the module-level estimate_capacity_latency; the
# small wrapper below maps the user's text/selection back to the spec dictionaries.
def create_gradio_interface():
    def estimate_for_names(model_name, gpu_name):
        model = next((m for m in model_specs if m['name'].lower() == model_name.strip().lower()), None)
        gpu = next((g for g in gpu_specs if g['name'] == gpu_name), None)
        if model is None or gpu is None:
            known = ", ".join(m['name'] for m in model_specs)
            return f"Unrecognized model '{model_name}' or GPU selection. Known models: {known}"
        prefill_time, generation_time, estimated_response_time = estimate_capacity_latency(model, gpu)
        rows = [[model['name'], gpu['name'], prefill_time, generation_time, estimated_response_time]]
        headers = ['Model', 'GPU', 'Prefill Time', 'Generation Time', 'Estimated Response Time']
        return tabulate(rows, headers=headers, tablefmt='html')

    demo = gr.Interface(
        fn=estimate_for_names,
        inputs=[
            gr.Textbox(label="Model Name"),
            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
        ],
        outputs=[
            gr.HTML(label="Capacity and Latency Table")
        ],
        title="LLM Capacity and Latency Estimator",
        description="Estimate LLM capacity and latency based on model and GPU specifications.",
        theme="minimal"
    )
    return demo
# Create the Gradio interface
gr_interface = create_gradio_interface()
# Start the interface
gr_interface.launch()