Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import argparse
|
2 |
from tabulate import tabulate
|
3 |
|
@@ -96,5 +97,34 @@ def main(num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_
|
|
96 |
estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
|
97 |
capacity_latency_table.append([model['name'], gpu['name'], f"{kv_cache_tokens}", f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"])
|
98 |
print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
|
99 |
-
|
100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
import argparse
|
3 |
from tabulate import tabulate
|
4 |
|
|
|
97 |
estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
|
98 |
capacity_latency_table.append([model['name'], gpu['name'], f"{kv_cache_tokens}", f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"])
|
99 |
print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
|
100 |
+
|
101 |
+
def create_interface():
    """Build the Gradio UI that wraps ``main``.

    Returns:
        gr.Interface: an interface whose input widgets correspond, in
        order, to ``main``'s positional parameters (num_gpu, prompt_size,
        response_size, n_concurrent_request, avg_context_window).
    """
    # Input widgets, in the same positional order as main()'s signature.
    # The Number inputs get explicit defaults: without `value` they submit
    # None, which would break main()'s arithmetic; precision=0 keeps these
    # counts integral.
    elements = [
        gr.Number(label="Num GPU", value=1, precision=0),
        gr.Slider(label="Prompt Size", value=4096, minimum=1, maximum=16384),
        gr.Slider(label="Response Size", value=256, minimum=1, maximum=8192),
        gr.Number(label="N Concurrent Request", value=1, precision=0),
        gr.Slider(label="Avg Context Window", value=1024, minimum=1, maximum=65536),
    ]

    # Title shown above the interface.
    title = "LLM Memory Footprint Estimator"

    # BUG FIX: "tabulate" is not a valid Gradio output shortcut, so
    # gr.Interface raised ValueError at startup; "text" renders the
    # pre-formatted table string in a plain textbox.
    # NOTE(review): main() appears to *print* its table and return None,
    # so the output box will stay empty unless main returns the tabulate
    # string — confirm against main's definition.
    interface = gr.Interface(
        fn=main,
        inputs=elements,
        outputs="text",
        title=title,
        description="Estimate LLM Memory Footprint and Capacity",
    )

    return interface
|
124 |
+
|
125 |
+
# Module-level interface instance, built once at import time.
interface = create_interface()

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|