farmax committed on
Commit
dc16fd5
·
verified ·
1 Parent(s): 40bf649

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -2
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import argparse
2
  from tabulate import tabulate
3
 
@@ -96,5 +97,34 @@ def main(num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_
96
  estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
97
  capacity_latency_table.append([model['name'], gpu['name'], f"{kv_cache_tokens}", f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"])
98
  print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
99
- if __name__ == '__main__':
100
- main(num_gpu, prompt_size, response_size, n_concurrent_request, avg_context_window)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
  import argparse
3
  from tabulate import tabulate
4
 
 
97
  estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
98
  capacity_latency_table.append([model['name'], gpu['name'], f"{kv_cache_tokens}", f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"])
99
  print(tabulate(capacity_latency_table, headers=['Model', 'GPU', 'KV Cache Tokens', 'Prefill Time', 'Generation Time', 'Estimated Response Time'], tablefmt='orgtbl'))
100
+
101
def create_interface():
    """Build the Gradio interface that drives ``main`` from a web form.

    The five inputs are passed positionally to ``main`` in this order:
    number of GPUs, prompt size, response size, number of concurrent
    requests, average context window.

    Returns:
        gr.Interface: the configured (not yet launched) interface.
    """
    # Function-scope imports: only this wrapper needs them.
    import contextlib
    import io

    def run_estimator(*args):
        """Call ``main`` and return everything it printed as one string.

        ``main`` reports its tables via ``print`` rather than a return value,
        so stdout is captured to give the UI something to display.
        NOTE(review): ``redirect_stdout`` swaps the process-wide ``sys.stdout``;
        concurrent requests could interleave output — confirm whether the app
        needs to serve requests in parallel.
        """
        captured = io.StringIO()
        with contextlib.redirect_stdout(captured):
            main(*args)
        return captured.getvalue()

    # Input widgets. Counts and token sizes are whole numbers, so:
    #  - precision=0 makes gr.Number deliver ints (and value=1 avoids passing
    #    None to ``main`` when the field is left untouched);
    #  - step=1 keeps the sliders from snapping to a coarse range-derived step.
    elements = [
        gr.Number(label="Num GPU", value=1, precision=0),
        gr.Slider(label="Prompt Size", value=4096, minimum=1, maximum=16384, step=1),
        gr.Slider(label="Response Size", value=256, minimum=1, maximum=8192, step=1),
        gr.Number(label="N Concurrent Request", value=1, precision=0),
        gr.Slider(label="Avg Context Window", value=1024, minimum=1, maximum=65536, step=1),
    ]

    # Title shown at the top of the page.
    title = "LLM Memory Footprint Estimator"

    # "tabulate" is not a Gradio component; the tables are plain text, so a
    # multi-line textbox is used as the output.
    interface = gr.Interface(
        fn=run_estimator,
        inputs=elements,
        outputs=gr.Textbox(label="Estimate", lines=20),
        title=title,
        description="Estimate LLM Memory Footprint and Capacity",
    )

    return interface
124
+
125
# Build the interface at module level so `interface` is available to importers.
interface = create_interface()

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()