farmax committed
Commit 0d2e72f · verified · 1 Parent(s): 1d13641

Update app.py

Files changed (1)
  1. app.py +44 -25
app.py CHANGED
@@ -5,6 +5,24 @@ from tabulate import tabulate
def greet(name):
    return f"Ciao, {name}!"

+ # Define all helper functions here
+ def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
+     result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
+     return result if result >= 0 else "OOM"
+
+ def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
+     result = (2 * model_params_billion / num_gpu) / fp16_tflops
+     return result if result >= 0 else "OOM"
+
+ def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
+     result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
+     return result if result >= 0 else "OOM"
+
+ def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
+     if isinstance(prefill_time, str) or isinstance(generation_time, str):  # Check if any are "NA"
+         return "OOM"
+     return (prompt_size * prefill_time + response_size * generation_time) / 1000  # convert ms to seconds
+
# Move estimate_capacity_latency outside of main()
def estimate_capacity_latency(model, gpu):
    kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
@@ -13,6 +31,29 @@ def estimate_capacity_latency(model, gpu):
    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
    return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"

+ def create_gradio_interface():
+     demo = gr.Interface(
+         fn=estimate_capacity_latency,
+         inputs=[
+             gr.Textbox(label="Model Name"),
+             gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
+         ],
+         outputs=[
+             gr.HTML(label="Capacity and Latency Table")
+         ],
+         title="LLM Capacity and Latency Estimator",
+         description="Estimate LLM capacity and latency based on model and GPU specifications.",
+         theme="minimal"
+     )
+
+     return demo
+
+ # Create the Gradio interface
+ gr_interface = create_gradio_interface()
+
+ # Start the interface
+ gr_interface.launch()
+
def main():
    parser = argparse.ArgumentParser(description='Your script description')
    parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
@@ -62,6 +103,7 @@ def main():
    ]

    BYTES_IN_GB = 1_073_741_824  # 1 GB = 1,073,741,824 bytes
+
    def calc_kv_cache_size_per_token(n_layers, d_model):
        return 2 * 2 * n_layers * d_model / BYTES_IN_GB  # GB/token

@@ -70,6 +112,7 @@ def main():
        target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
        return target_gpu_mem

+     # ... rest of the code remains the same ...
    def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
        result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
        return result if result >= 0 else "OOM"
@@ -106,28 +149,4 @@ def main():
    print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))

if __name__ == '__main__':
-     main()
-
- # Modify create_gradio_interface to use the global estimate_capacity_latency
- def create_gradio_interface():
-     demo = gr.Interface(
-         fn=estimate_capacity_latency,
-         inputs=[
-             gr.Textbox(label="Model Name"),
-             gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
-         ],
-         outputs=[
-             gr.HTML(label="Capacity and Latency Table")
-         ],
-         title="LLM Capacity and Latency Estimator",
-         description="Estimate LLM capacity and latency based on model and GPU specifications.",
-         theme="minimal"
-     )
-
-     return demo
-
- # Create the Gradio interface
- gr_interface = create_gradio_interface()
-
- # Start the interface
- gr_interface.launch()
+     main()
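
For readers who want to sanity-check the capacity and latency formulas this commit introduces, the following minimal, self-contained Python sketch exercises them end to end. The helper definitions are copied from the diff above; the model and GPU figures (a hypothetical 13B-parameter, 40-layer, d_model=5120 model on a single 80 GB GPU with 312 TFLOPS of FP16 compute and about 2,000 GB/s of memory bandwidth) are illustrative assumptions, not values taken from this repository.

# Standalone sanity check for the helpers added in this commit.
# All hardware/model numbers below are illustrative assumptions.

BYTES_IN_GB = 1_073_741_824  # 1 GB = 1,073,741,824 bytes

def calc_kv_cache_size_per_token(n_layers, d_model):
    # 2 bytes (fp16) * 2 tensors (K and V) per layer per token
    return 2 * 2 * n_layers * d_model / BYTES_IN_GB  # GB/token

def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):
    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size
    return result if result >= 0 else "OOM"

def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):
    # ~2 FLOPs per parameter per token; billions / TFLOPS works out to milliseconds
    result = (2 * model_params_billion / num_gpu) / fp16_tflops
    return result if result >= 0 else "OOM"

def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):
    # fp16 weights (~2 bytes/param) streamed per token, divided by memory bandwidth, in ms
    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000
    return result if result >= 0 else "OOM"

def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):
    if isinstance(prefill_time, str) or isinstance(generation_time, str):
        return "OOM"  # propagate an upstream "OOM" marker
    return (prompt_size * prefill_time + response_size * generation_time) / 1000  # ms -> s

if __name__ == '__main__':
    # Assumed example: 13B-parameter, 40-layer, d_model=5120 model on one 80 GB GPU
    # with 312 TFLOPS FP16 and ~2,000 GB/s memory bandwidth (A100-80GB-like figures).
    num_gpu, gpu_memory_gb, fp16_tflops, mem_bw_gbps = 1, 80, 312, 2000
    params_billion, n_layers, d_model = 13, 40, 5120
    prompt_size, response_size = 1024, 256  # tokens

    kv_per_token = calc_kv_cache_size_per_token(n_layers, d_model)
    kv_tokens = calc_kv_cache_tokens(num_gpu, gpu_memory_gb, params_billion, kv_per_token)
    prefill_ms = calc_prefill_time_per_token(num_gpu, params_billion, fp16_tflops)
    gen_ms = calc_generation_time_per_token(num_gpu, params_billion, mem_bw_gbps)
    total_s = calc_estimated_response_time(prefill_ms, gen_ms, prompt_size, response_size)

    print(f"KV cache size per token:   {kv_per_token:.6f} GB")
    print(f"KV cache capacity:         {kv_tokens:,.0f} tokens")
    print(f"Prefill time per token:    {prefill_ms:.3f} ms")
    print(f"Generation time per token: {gen_ms:.3f} ms")
    print(f"Estimated response time:   {total_s:.2f} s")

With these assumed figures the sketch reports roughly 0.08 ms of prefill per token, about 13 ms per generated token, room for around 70,000 KV-cache tokens, and an estimated response time near 3.4 s, the same kind of output that main() formats into its tabulate tables.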