Update app.py
app.py CHANGED
@@ -32,7 +32,7 @@ def estimate_capacity_latency(model, gpu):
     return f"{prefill_time_per_token:.3f} ms", f"{generation_time_per_token:.3f} ms", f"{estimated_response_time:.1f} s"
 
 def create_gradio_interface():
-    #
+    # Define gpu_specs here so it is available inside the function
     gpu_specs = [
         {"name": "A10", "fp16_tflops": 125, "memory_gb": 24, "memory_bandwidth_gbps": 600},
         {"name": "A30", "fp16_tflops": 330, "memory_gb": 24, "memory_bandwidth_gbps": 933},
@@ -47,10 +47,23 @@ def create_gradio_interface():
         {"name": "H100 NVL", "fp16_tflops": 3958, "memory_gb": 188, "memory_bandwidth_gbps": 7800}
     ]
 
+    # Define model_specs here so it is available inside the function
+    model_specs = [
+        {"name": "Llama-3-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 8192, "d_head": 128},
+        {"name": "Llama-3-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 8192, "d_head": 128},
+        {"name": "Llama-3.1-8B", "params_billion": 8, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 131072, "d_head": 128},
+        {"name": "Llama-3.1-70B", "params_billion": 70, "d_model": 8192, "n_heads": 64, "n_layers": 80, "max_context_window": 131072, "d_head": 128},
+        {"name": "Mistral-7B-v0.3", "params_billion": 7, "d_model": 4096, "n_heads": 32, "n_layers": 32, "max_context_window": 32768, "d_head": 128},
+        {"name": "Falcon-7B", "params_billion": 7, "d_model": 4544, "n_heads": 71, "n_layers": 32, "max_context_window": 2048, "d_head": 64},
+        {"name": "Falcon-40B", "params_billion": 40, "d_model": 8192, "n_heads": 128, "n_layers": 60, "max_context_window": 2048, "d_head": 64},
+        {"name": "Falcon-180B", "params_billion": 180, "d_model": 14848, "n_heads": 232, "n_layers": 80, "max_context_window": 2048, "d_head": 64}
+    ]
+
     demo = gr.Interface(
         fn=estimate_capacity_latency,
         inputs=[
             gr.Textbox(label="Model Name"),
+            gr.Dropdown(choices=[model['name'] for model in model_specs], label="Model Type"),
             gr.Dropdown(choices=[gpu['name'] for gpu in gpu_specs], label="GPU Type")
         ],
         outputs=[
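
The body of estimate_capacity_latency is outside these hunks, and the hunk header still shows the old two-parameter signature; since the interface now passes three inputs, the callback presumably gains a matching third parameter. Below is a minimal sketch, not code from this commit, of how the lookup and the returned strings could fit together. The three-parameter signature, the one-entry spec lists, the output_tokens parameter, and the roofline-style formulas (compute-bound prefill at roughly 2N FLOPs per token, bandwidth-bound decoding that reads the fp16 weights once per token) are all assumptions.

# Sketch only: the spec lists here are truncated stand-ins for the full
# gpu_specs/model_specs tables added in the commit above.
model_specs = [
    {"name": "Llama-3-8B", "params_billion": 8},
]
gpu_specs = [
    {"name": "A10", "fp16_tflops": 125, "memory_bandwidth_gbps": 600},
]

def estimate_capacity_latency(model_name, model_type, gpu_type, output_tokens=256):
    # Resolve the dropdown selections to their spec entries
    model = next(m for m in model_specs if m["name"] == model_type)
    gpu = next(g for g in gpu_specs if g["name"] == gpu_type)

    # Prefill: compute-bound, ~2N FLOPs per token against peak fp16 throughput
    flops_per_token = 2 * model["params_billion"] * 1e9
    prefill_time_per_token = flops_per_token / (gpu["fp16_tflops"] * 1e12) * 1e3  # ms

    # Decoding: bandwidth-bound, read all fp16 weights (2 bytes/param) per token
    weight_bytes = 2 * model["params_billion"] * 1e9
    generation_time_per_token = weight_bytes / (gpu["memory_bandwidth_gbps"] * 1e9) * 1e3  # ms

    # End-to-end response for an assumed output length (hypothetical default)
    estimated_response_time = generation_time_per_token * output_tokens / 1e3  # s

    return (f"{prefill_time_per_token:.3f} ms",
            f"{generation_time_per_token:.3f} ms",
            f"{estimated_response_time:.1f} s")

print(estimate_capacity_latency("my-deployment", "Llama-3-8B", "A10"))
# -> ('0.128 ms', '26.667 ms', '6.8 s')

The return values mirror the formatted strings in the diff's context line, so a sketch like this would slot into gr.Interface unchanged apart from the label wiring.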