Update app.py
app.py CHANGED
@@ -5,6 +5,14 @@ from tabulate import tabulate
 def greet(name):
     return f"Ciao, {name}!"
 
+# Move estimate_capacity_latency outside of main()
+def estimate_capacity_latency(model, gpu):
+    kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
+    prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
+    generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
+    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
+    return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
+
 def main():
     parser = argparse.ArgumentParser(description='Your script description')
     parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
@@ -87,12 +95,6 @@ def main():
         memory_footprint_table.append([model_spec['name'], f"{kv_cache_size_per_token:.6f} GiB/token", f"{memory_footprint:.2f} GB"])
     print(tabulate(memory_footprint_table, headers=['Model', 'KV Cache Size per Token', 'Memory Footprint'], tablefmt='orgtbl'))
 
-    def estimate_capacity_latency(model, gpu):
-        kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
-        prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
-        generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
-        estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
-        return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
 
     capacity_latency_table = []
     for model in model_specs:
@@ -106,6 +108,7 @@ def main():
 if __name__ == '__main__':
     main()
 
+# Modify create_gradio_interface to use the global estimate_capacity_latency
 def create_gradio_interface():
     demo = gr.Interface(
         fn=estimate_capacity_latency,
@@ -123,8 +126,8 @@ def create_gradio_interface():
 
     return demo
 
-#
+# Create the Gradio interface
 gr_interface = create_gradio_interface()
 
-#
-gr_interface.launch()
+# Start the interface
+gr_interface.launch()
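Note on the moved function: at module level, estimate_capacity_latency still refers to names that were locals of main() (num_gpu, kv_cache_size_per_token, prompt_size, response_size), so the Gradio callback only works if those exist as globals when the interface runs. A minimal sketch of one way to make the dependency explicit is below; it is an editorial illustration, not part of the commit, the calc_* helpers are the ones already defined in app.py, and the bound default values are placeholders rather than values taken from this Space.

from functools import partial

# Sketch: take the former main() locals as explicit arguments instead of globals.
def estimate_capacity_latency(model, gpu, num_gpu, kv_cache_size_per_token,
                              prompt_size, response_size):
    # Computed but unused in the returned strings, mirroring the original function.
    kv_cache_tokens = calc_kv_cache_tokens(
        num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
    prefill_time_per_token = calc_prefill_time_per_token(
        num_gpu, model['params_billion'], gpu['fp16_tflops'])
    generation_time_per_token = calc_generation_time_per_token(
        num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
    estimated_response_time = calc_estimated_response_time(
        prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
    return (f"{prefill_time_per_token:.3f} ms",
            f"{generation_time_per_token:.3f} ms",
            f"{estimated_response_time:.1f} s")

# Bind fixed values for the Gradio callback (placeholder numbers, adjust as needed),
# then pass fn=estimate_fn to gr.Interface(...) inside create_gradio_interface().
estimate_fn = partial(estimate_capacity_latency,
                      num_gpu=1, kv_cache_size_per_token=0.000244,
                      prompt_size=512, response_size=256)

Alternatively, the four names can simply be defined at module scope before gr_interface = create_gradio_interface() executes, which is what the commit appears to rely on.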
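For reference, the calc_* helpers called in the diff are not shown in this change. The sketch below gives the usual back-of-the-envelope formulas for this kind of estimate (compute-bound prefill, bandwidth-bound generation); it is an assumption about what those helpers compute, not code from this Space, and only the names and argument order are taken from the calls in the diff.

# Assumed implementations of the helpers referenced by estimate_capacity_latency.
# Conventions: params_billion = parameters in billions, fp16 weights (2 bytes/param),
# kv_cache_size_per_token in GiB/token (roughly 2 * 2 bytes * n_layers * d_model).

def calc_kv_cache_tokens(num_gpu, gpu_memory_gb, params_billion, kv_cache_size_per_token):
    """Tokens of KV cache that fit in the memory left after loading fp16 weights."""
    free_memory_gb = num_gpu * gpu_memory_gb - 2 * params_billion
    return free_memory_gb / kv_cache_size_per_token  # GB vs GiB treated loosely, as in the table

def calc_prefill_time_per_token(num_gpu, params_billion, fp16_tflops):
    """Compute-bound estimate: ~2 FLOPs per parameter per token; result in ms."""
    return 2 * params_billion / (num_gpu * fp16_tflops)

def calc_generation_time_per_token(num_gpu, params_billion, memory_bandwidth_gbps):
    """Memory-bound estimate: read ~2 bytes per parameter per token; result in ms."""
    return 2 * params_billion / (num_gpu * memory_bandwidth_gbps) * 1000

def calc_estimated_response_time(prefill_ms, generation_ms, prompt_size, response_size):
    """Total latency in seconds: prefill the prompt, then generate the response."""
    return (prompt_size * prefill_ms + response_size * generation_ms) / 1000

With these formulas, a 13B-parameter model on a single GPU with 312 FP16 TFLOPS and roughly 2,000 GB/s of memory bandwidth works out to about 0.08 ms/token for prefill and about 13 ms/token for generation, which is the scale of numbers the capacity/latency table prints.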