Spaces:

farmax
/

LLM_Sizing

Sleeping

App Files Files Community

farmax committed on Nov 9, 2024

Commit

0d2e72f

verified ·

1 Parent(s): 1d13641

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -25

app.py CHANGED Viewed

@@ -5,6 +5,24 @@ from tabulate import tabulate
 def greet(name):  # Build an Italian-style greeting string for `name`.
     return f"Ciao, {name}!"  # e.g. name="Ada" -> "Ciao, Ada!"
 # estimate_capacity_latency is defined at module level (moved out of main())
 def estimate_capacity_latency(model, gpu):
     kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
@@ -13,6 +31,29 @@ def estimate_capacity_latency(model, gpu):
     estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
     return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
 def main():
     parser = argparse.ArgumentParser(description='Your script description')
     parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
@@ -62,6 +103,7 @@ def main():
     ]
     BYTES_IN_GB = 1_073_741_824  # 1 GB = 1,073,741,824 bytes
     def calc_kv_cache_size_per_token(n_layers, d_model):  # Per-token KV-cache size: 4 * n_layers * d_model bytes, expressed in GB
         return 2 * 2 * n_layers * d_model / BYTES_IN_GB  # GB/token; the 2 * 2 is presumably (K and V) x (2 bytes per fp16 value) — TODO confirm
@@ -70,6 +112,7 @@ def main():
         target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
         return target_gpu_mem
     def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):  # Number of tokens whose KV cache fits in the leftover GPU memory
         result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size  # (total GPU memory - 2 * params) / per-token cache size; 2 * params is presumably the fp16 weight footprint in GB — confirm
         return result if result >= 0 else "OOM"  # a negative result is reported as the string "OOM", so the return type is float or str
@@ -106,28 +149,4 @@ def main():
     print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
 if __name__ == '__main__':
-    main()
-# Modify create_gradio_interface to use the global estimate_capacity_latency
-def create_gradio_interface():
-    demo = gr.Interface(
-        fn=estimate_capacity_latency,
-        inputs=[
-            gr.Textbox(label="Model Name"),
-            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
-        ],
-        outputs=[
-            gr.HTML(label="Capacity and Latency Table")
-        ],
-        title="LLM Capacity and Latency Estimator",
-        description="Estimate LLM capacity and latency based on model and GPU specifications.",
-        theme="minimal"
-    )
-    return demo
-# Create the Gradio interface
-gr_interface = create_gradio_interface()
-# Start the interface
-gr_interface.launch()

 def greet(name):  # Build an Italian-style greeting string for `name`.
     return f"Ciao, {name}!"  # e.g. name="Ada" -> "Ciao, Ada!"
+# Define all helper functions here
+def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):  # Number of tokens whose KV cache fits in the leftover GPU memory
+    result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size  # (total GPU memory - 2 * params) / per-token cache size; 2 * params is presumably the fp16 weight footprint in GB — confirm
+    return result if result >= 0 else "OOM"  # a negative result is reported as the string "OOM", so the return type is float or str
+def calc_prefill_time_per_token(num_gpu, model_params_billion, fp16_tflops):  # Per-token prefill time estimate from compute throughput
+    result = (2 * model_params_billion / num_gpu) / fp16_tflops  # (2 * params per GPU) / TFLOPS; presumably milliseconds per token given billions of params and TFLOPS — confirm units
+    return result if result >= 0 else "OOM"  # only a negative result maps to the string "OOM"; return type is float or str
+def calc_generation_time_per_token(num_gpu, model_params_billion, memory_bandwidth_gbps):  # Per-token generation time estimate from memory bandwidth
+    result = (2 * model_params_billion / num_gpu) / memory_bandwidth_gbps * 1000  # (2 * params per GPU) / bandwidth, scaled by 1000; presumably seconds -> milliseconds — confirm units
+    return result if result >= 0 else "OOM"  # only a negative result maps to the string "OOM"; return type is float or str
+def calc_estimated_response_time(prefill_time, generation_time, prompt_size, response_size):  # Total response time in seconds, or "OOM" if either per-token time is a string
+    if isinstance(prefill_time, str) or isinstance(generation_time, str):  # a str input is a sentinel such as "OOM" from the per-token helpers, not a number
+        return "OOM"
+    return (prompt_size * prefill_time + response_size * generation_time) / 1000  # prompt tokens * prefill time + response tokens * generation time; convert ms to seconds
 # estimate_capacity_latency is defined at module level (moved out of main())
 def estimate_capacity_latency(model, gpu):
     kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
     estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
     return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
+def create_gradio_interface():  # Build and return the gr.Interface object; launching is left to the caller
+    demo = gr.Interface(
+        fn=estimate_capacity_latency,  # NOTE(review): that function indexes model['params_billion'] and gpu['memory_gb'], while the inputs below are a textbox and a dropdown of GPU names — confirm the values are mapped to spec dicts somewhere
+        inputs=[
+            gr.Textbox(label="Model Name"),
+            gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")  # gpu_specs is not defined in this function; it must be resolvable at module level when this is called
+        ],
+        outputs=[
+            gr.HTML(label="Capacity and Latency Table")  # NOTE(review): a single output component, while estimate_capacity_latency returns three strings — confirm
+        ],
+        title="LLM Capacity and Latency Estimator",
+        description="Estimate LLM capacity and latency based on model and GPU specifications.",
+        theme="minimal"
+    )
+    return demo
+# Create the Gradio interface (executed at module level, outside the __main__ guard)
+gr_interface = create_gradio_interface()
+# Start the interface
+gr_interface.launch()  # NOTE(review): runs at import time, before the main() definition and __main__ guard further down are reached — confirm launch() does not keep main() from running
 def main():
     parser = argparse.ArgumentParser(description='Your script description')
     parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
     ]
     BYTES_IN_GB = 1_073_741_824  # 1 GB = 1,073,741,824 bytes
     def calc_kv_cache_size_per_token(n_layers, d_model):  # Per-token KV-cache size: 4 * n_layers * d_model bytes, expressed in GB
         return 2 * 2 * n_layers * d_model / BYTES_IN_GB  # GB/token; the 2 * 2 is presumably (K and V) x (2 bytes per fp16 value) — TODO confirm
         target_gpu_mem = kv_cache_size_per_token * avg_context_window * n_concurrent_request + model_spec["params_billion"] * 2
         return target_gpu_mem
+    # ... rest of the code remains the same ...
     def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, model_params_billion, kv_cache_size):  # Number of tokens whose KV cache fits in the leftover GPU memory; same formula as the module-level helper of the same name
         result = (num_gpu * gpu_memory_gb - 2 * model_params_billion) / kv_cache_size  # (total GPU memory - 2 * params) / per-token cache size; 2 * params is presumably the fp16 weight footprint in GB — confirm
         return result if result >= 0 else "OOM"  # a negative result is reported as the string "OOM", so the return type is float or str
     print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
 if __name__ == '__main__':
+    main()