farmax committed
Commit 1d13641 · verified · 1 Parent(s): 6383b80

Update app.py

Files changed (1): app.py (+12, -9)
app.py CHANGED
@@ -5,6 +5,14 @@ from tabulate import tabulate
 def greet(name):
     return f"Ciao, {name}!"
 
+# Move estimate_capacity_latency outside of main()
+def estimate_capacity_latency(model, gpu):
+    kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
+    prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
+    generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
+    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
+    return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
+
 def main():
     parser = argparse.ArgumentParser(description='Your script description')
     parser.add_argument('-g', '--num_gpu', type=int, default=1, help='Number of GPUs')
@@ -87,12 +95,6 @@ def main():
         memory_footprint_table.append([model_spec['name'], f"{kv_cache_size_per_token:.6f} GiB/token", f"{memory_footprint:.2f} GB"])
     print(tabulate(memory_footprint_table, headers=['Model', 'KV Cache Size per Token', 'Memory Footprint'], tablefmt='orgtbl'))
 
-    def estimate_capacity_latency(model, gpu):
-        kv_cache_tokens = calc_kv_cache_tokens(num_gpu, gpu['memory_gb'], model['params_billion'], kv_cache_size_per_token)
-        prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
-        generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
-        estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
-        return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
 
     capacity_latency_table = []
     for model in model_specs:
@@ -106,6 +108,7 @@ def main():
 if __name__ == '__main__':
     main()
 
+# Modify create_gradio_interface to use the global estimate_capacity_latency
 def create_gradio_interface():
     demo = gr.Interface(
         fn=estimate_capacity_latency,
@@ -123,8 +126,8 @@ def create_gradio_interface():
 
     return demo
 
-# Creare l'interfaccia Gradio
+# Create the Gradio interface
 gr_interface = create_gradio_interface()
 
-# Avviare l'interfaccia
-gr_interface.launch()
+# Start the interface
+gr_interface.launch()
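
For context, below is a minimal, self-contained sketch of the layout this commit moves toward: the estimator is defined at module scope so that gr.Interface can reference it directly, and the values it reads that are not parameters (num_gpu, prompt_size, response_size, kv_cache_size_per_token) must likewise be resolvable at module scope when Gradio calls it. The default values, the calc_* formulas, and the input/output widgets below are illustrative assumptions rather than the actual implementations in app.py, and the KV-cache capacity calculation is omitted for brevity.

# Hypothetical sketch, not the repository's actual code.
import gradio as gr

# Assumed module-level values; in app.py these come from argparse defaults / earlier computation.
num_gpu = 1
prompt_size = 512       # prompt length in tokens (assumed)
response_size = 512     # response length in tokens (assumed)

# Placeholder specs; app.py builds full model/GPU tables elsewhere.
MODEL = {'name': 'example-7B', 'params_billion': 7.0}
GPU = {'name': 'example-80GB', 'memory_gb': 80, 'fp16_tflops': 312, 'memory_bandwidth_gbps': 2039}

# Placeholder formulas standing in for the real calc_* helpers defined earlier in app.py.
def calc_prefill_time_per_token(n_gpu, params_billion, fp16_tflops):
    # ~2 FLOPs per parameter per token; billions of params / TFLOPS gives milliseconds
    return 2 * params_billion / (n_gpu * fp16_tflops)

def calc_generation_time_per_token(n_gpu, params_billion, memory_bandwidth_gbps):
    # weights (~2 bytes/param in fp16) streamed once per generated token; GB / (GB/s) -> ms
    return 2 * params_billion / (n_gpu * memory_bandwidth_gbps) * 1000

def calc_estimated_response_time(prefill_ms, generation_ms, n_prompt, n_response):
    # total latency for one request, in seconds
    return (prefill_ms * n_prompt + generation_ms * n_response) / 1000

# Module-level estimator, mirroring the commit: gr.Interface can now see it directly.
def estimate_capacity_latency(model, gpu):
    prefill_time_per_token = calc_prefill_time_per_token(num_gpu, model['params_billion'], gpu['fp16_tflops'])
    generation_time_per_token = calc_generation_time_per_token(num_gpu, model['params_billion'], gpu['memory_bandwidth_gbps'])
    estimated_response_time = calc_estimated_response_time(prefill_time_per_token, generation_time_per_token, prompt_size, response_size)
    return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"

def create_gradio_interface():
    # Widgets are illustrative; the real interface in app.py defines its own inputs/outputs.
    demo = gr.Interface(
        fn=lambda: estimate_capacity_latency(MODEL, GPU),
        inputs=None,
        outputs=[gr.Textbox(label='Prefill time per token'),
                 gr.Textbox(label='Generation time per token'),
                 gr.Textbox(label='Estimated response time')],
    )
    return demo

if __name__ == '__main__':
    create_gradio_interface().launch()

With the estimator nested inside main(), the module-level call gr_interface = create_gradio_interface() would fail with a NameError, because fn=estimate_capacity_latency would not resolve to any global name; defining the function at module scope is what lets the Gradio wiring work.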