Update app.py

app.py CHANGED
@@ -4,33 +4,79 @@ import requests
 
 import gradio as gr
 
+STYLE = """
+.group-border {
+    padding: 10px;
+    border-width: 1px;
+    border-radius: 10px;
+    border-color: gray;
+    border-style: dashed;
+    box-shadow: 1px 1px 3px;
+}
+.control-label-font {
+    font-size: 13pt !important;
+}
+.control-button {
+    background: none !important;
+    border-color: #69ade2 !important;
+    border-width: 2px !important;
+    color: #69ade2 !important;
+}
+.center {
+    text-align: center;
+}
+.right {
+    text-align: right;
+}
+.no-label {
+    padding: 0px !important;
+}
+.no-label > label > span {
+    display: none;
+}
+.small-big {
+    font-size: 12pt !important;
+}
+
+"""
+
 def avaliable_providers():
     providers = []
-
+
     headers = {
         "Content-Type": "application/json",
     }
-    endpoint_url = "https://api.endpoints.huggingface.cloud/provider"
+    endpoint_url = "https://api.endpoints.huggingface.cloud/v2/provider"
     response = requests.get(endpoint_url, headers=headers)
 
-
+    providers = {}
+
+    for provider in response.json()['vendors']:
         if provider['status'] == 'available':
-
-
+            regions = {}
+
+            availability = False
+            for region in provider['regions']:
+                if region["status"] == "available":
+                    regions[region['name']] = {
+                        "label": region['label'],
+                        "computes": region['computes']
+                    }
+                    availability = True
+
+            if availability:
+                providers[provider['name']] = regions
+
     return providers
 
+providers = avaliable_providers()
+
 def update_regions(provider):
     avalialbe_regions = []
-
-    headers = {
-        "Content-Type": "application/json",
-    }
-    endpoint_url = f"https://api.endpoints.huggingface.cloud/provider/{provider}/region"
-    response = requests.get(endpoint_url, headers=headers)
+    regions = providers[provider]
 
-    for region in
-
-        avalialbe_regions.append(f"{region['region']}/{region['label']}")
+    for region, attributes in regions.items():
+        avalialbe_regions.append(f"{region}[{attributes['label']}]")
 
     return gr.Dropdown.update(
         choices=avalialbe_regions,
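The change above replaces per-dropdown API calls with a single upfront request to the v2 `/provider` endpoint, cached in a nested dict keyed by provider and region. Here is a minimal sketch of that transformation using a made-up response fixture; the field names are the ones the new code reads, while the fixture values are illustrative only:

```python
# Hypothetical v2 /provider response fixture, shaped like what the new code reads.
sample_response = {
    "vendors": [
        {
            "name": "aws",
            "status": "available",
            "regions": [
                {
                    "name": "us-east-1",
                    "label": "US East (N. Virginia)",
                    "status": "available",
                    "computes": [
                        {"status": "available", "accelerator": "gpu",
                         "numAccelerators": 1, "memoryGb": "24GB",
                         "architecture": "NVIDIA A10G",
                         "instanceType": "g5.2xlarge",
                         "instanceSize": "medium", "pricePerHour": 1.3},
                    ],
                },
            ],
        },
    ]
}

providers = {}
for vendor in sample_response["vendors"]:
    if vendor["status"] != "available":
        continue
    regions = {
        r["name"]: {"label": r["label"], "computes": r["computes"]}
        for r in vendor["regions"] if r["status"] == "available"
    }
    if regions:  # same effect as the diff's `availability` flag
        providers[vendor["name"]] = regions

# Dropdown callbacks now do dictionary lookups instead of HTTP round-trips:
print(list(providers["aws"]))                  # ['us-east-1']
print(providers["aws"]["us-east-1"]["label"])  # US East (N. Virginia)
```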
@@ -38,28 +84,22 @@ def update_regions(provider):
     )
 
 def update_compute_options(provider, region):
-    region = region.split("/")[0]
     avalialbe_compute_options = []
+    computes = providers[provider][region.split("[")[0].strip()]["computes"]
 
-    headers = {
-        "Content-Type": "application/json",
-    }
-    endpoint_url = f"https://api.endpoints.huggingface.cloud/provider/{provider}/region/{region}/compute"
-    print(endpoint_url)
-    response = requests.get(endpoint_url, headers=headers)
-
-    for compute in response.json()['items']:
+    for compute in computes:
         if compute['status'] == 'available':
             accelerator = compute['accelerator']
             numAccelerators = compute['numAccelerators']
             memoryGb = compute['memoryGb']
             architecture = compute['architecture']
             instanceType = compute['instanceType']
-
+            pricePerHour = compute['pricePerHour']
+
             type = f"{numAccelerators}vCPU {memoryGb} · {architecture}" if accelerator == "cpu" else f"{numAccelerators}x {architecture}"
-
+
             avalialbe_compute_options.append(
-                f"{compute['accelerator'].upper()} [{compute['instanceSize']}] · {type} · {instanceType}"
+                f"{compute['accelerator'].upper()} [{compute['instanceSize']}] · {type} · {instanceType} · ${pricePerHour}/hour"
            )
 
     return gr.Dropdown.update(
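This hunk drops the per-region compute API call in favor of the cached dict and appends the hourly price to each option label. One caveat: `submit()` (unchanged in the hunks below) still parses the instance type from `compute_resources[-1]`, which now appears to pick up the new price segment instead. A sketch of the label round-trip, with a hypothetical compute fixture:

```python
# Hypothetical compute entry, shaped like the cached dict's entries.
compute = {
    "accelerator": "gpu", "numAccelerators": 1, "memoryGb": "24GB",
    "architecture": "NVIDIA A10G", "instanceType": "g5.2xlarge",
    "instanceSize": "medium", "pricePerHour": 1.3,
}

kind = f"{compute['numAccelerators']}x {compute['architecture']}"
label = (f"{compute['accelerator'].upper()} [{compute['instanceSize']}] · "
         f"{kind} · {compute['instanceType']} · ${compute['pricePerHour']}/hour")
print(label)  # GPU [medium] · 1x NVIDIA A10G · g5.2xlarge · $1.3/hour

# How submit() splits the label back apart:
parts = label.split("·")
print(parts[0][:3].strip())  # 'GPU'        (read as the accelerator)
print(parts[-1].strip())     # '$1.3/hour'  (now the last segment)
print(parts[-2].strip())     # 'g5.2xlarge' (where the instance type now lives)
```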
@@ -77,9 +117,9 @@ def submit(
     task_selector,
     framework_selector,
     compute_selector,
-    min_node_selector,
-    max_node_selector,
-    security_selector
+    min_node_selector,
+    max_node_selector,
+    security_selector
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
@@ -89,7 +129,7 @@ def submit(
     size = compute_resources[0][size_l_index : size_r_index].strip()
 
     type = compute_resources[-1].strip()
-
+
     payload = {
         "accountId": hf_account_input.strip(),
         "compute": {
@@ -107,7 +147,7 @@ def submit(
             "huggingface": {}
         },
         "repository": repository_selector.lower(),
-        "revision":
+        "revision": "main",
         "task": task_selector.lower()
     },
     "name": endpoint_name_input.strip(),
@@ -117,7 +157,7 @@ def submit(
     },
     "type": security_selector.lower()
     }
-
+
     print(payload)
 
     payload = json.dumps(payload)
@@ -127,7 +167,7 @@ def submit(
         "Authorization": f"Bearer {hf_token_input.strip()}",
         "Content-Type": "application/json",
     }
-    endpoint_url = f"https://api.endpoints.huggingface.cloud/endpoint"
+    endpoint_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint"
    print(endpoint_url)
 
     response = requests.post(endpoint_url, headers=headers, data=payload)
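Taken together, the hunks above assemble the request that `submit()` sends to the v2 endpoint API. A rough sketch of that call, showing only the fields visible in this diff; the nesting of elided fields and all placeholder values are assumptions:

```python
import json
import requests

# Sketch of the request submit() ends up making. Only the fields visible in
# this diff are shown; "compute" and "provider" contents are elided there,
# and the token, account, and endpoint names below are placeholders.
payload = {
    "accountId": "my-account",
    "compute": {},  # accelerator/instance/scaling fields not shown in the hunks
    "model": {
        "huggingface": {},
        "repository": "nousresearch/nous-hermes-llama2-70b",
        "revision": "main",  # newly hard-coded in this commit
        "task": "text generation",
    },
    "name": "my-endpoint",
    "type": "public",
}
headers = {
    "Authorization": "Bearer <HF_TOKEN>",
    "Content-Type": "application/json",
}
response = requests.post(
    "https://api.endpoints.huggingface.cloud/v2/endpoint",
    headers=headers,
    data=json.dumps(payload),
)
print(response.status_code, response.text)
```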
@@ -143,215 +183,259 @@ def submit(
     else:
         return f"something went wrong {response.status_code} = {response.text}"
 
-with gr.Blocks() as hf_endpoint:
-
+with gr.Blocks(css=STYLE) as hf_endpoint:
+    with gr.Tab("🤗 Inference Endpoint"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+        with gr.Column(elem_classes=["group-border"]):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Hugging Face account ID (name)""")
+                    hf_account_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])
+
+                with gr.Column():
+                    gr.Markdown("## Hugging Face access token")
+                    hf_token_input = gr.Textbox(show_label=False, type="password", elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Target model
+
+                    Import a model from the Hugging Face hub""")
+                    repository_selector = gr.Textbox(
+                        value="NousResearch/Nous-Hermes-Llama2-70b",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Target model version(branch)
+
+                    Specify the branch name""")
+                    revision_selector = gr.Textbox(
+                        value=f"main",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+        with gr.Column(elem_classes=["group-border"]):
+            with gr.Column():
+                gr.Markdown("""## Endpoint name
+
+                Input a name for your new endpoint""")
+                endpoint_name_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Cloud Provider
+
+                    Choose between Amazon Web Services and Microsoft Azure""")
+                    provider_selector = gr.Dropdown(
+                        choices=providers.keys(),
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Cloud Region
+
+                    Choose one of the regions from each cloud provider""")
+                    region_selector = gr.Dropdown(
+                        [],
+                        value="",
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row(visible=False):
+                with gr.Column():
+                    gr.Markdown("## Task")
+                    task_selector = gr.Textbox(
+                        value="Text Generation",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("## Framework")
+                    framework_selector = gr.Textbox(
+                        value="PyTorch",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Column():
+                gr.Markdown("""## Select Compute Instance Type
+
+                Select a CPU or GPU accelerated compute option for inference""")
+                compute_selector = gr.Dropdown(
+                    [],
+                    value="",
+                    interactive=True,
+                    show_label=False,
+                    elem_classes=["no-label", "small-big"]
+                )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Min Number of Nodes
+
+                    Automatically scale the number of replicas based on load and compute usage""")
+                    min_node_selector = gr.Number(
+                        value=1,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Max Number of Nodes
+
+                    Automatically scale the number of replicas based on load and compute usage""")
+                    max_node_selector = gr.Number(
+                        value=1,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Security Level
+
+                    Choose your endpoint's level of privacy""")
+                    security_selector = gr.Radio(
+                        choices=["Protected", "Public", "Private"],
+                        value="Public",
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+        with gr.Column(elem_classes=["group-border"]):
+            with gr.Column():
+                gr.Markdown("""## Container Type
+
+                Text Generation Inference is an optimized container for text generation task""")
+                _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Custom Cuda Kernels
+
+                    TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
+                    _ = gr.Dropdown(
+                        value="Enabled",
+                        choices=["Enabled", "Disabled"],
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Quantization
+
+                    Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
+                    _ = gr.Dropdown(
+                        value="None",
+                        choices=["None", "Bitsandbytes", "GPTQ"],
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Max Input Length (per Query)
+
+                    Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
+                    _ = gr.Number(
+                        value=1024,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Max Number of Tokens (per Query)
+
+                    The larger this value, the more memory each request will consume and the less effective batching can be.""")
+                    _ = gr.Number(
+                        value=1512,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Max Batch Prefill Tokens
+
+                    Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
+                    _ = gr.Number(
+                        value=2048,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Max Batch Total Tokens
+
+                    Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
+                    _ = gr.Number(
+                        value=None,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+        submit_button = gr.Button(
+            value="Submit",
+            elem_classes=["control-label-font", "control-button"]
         )
 
-
-            value=
+        status_txt = gr.Textbox(
+            value="any status update will be displayed here",
             interactive=False,
-
-        )
-
-    with gr.Row():
-        gr.Markdown("""
-        #### Task
-        """)
-
-        gr.Markdown("""
-        #### Framework
-        """)
-
-    with gr.Row():
-        task_selector = gr.Textbox(
-            value="Custom",
-            interactive=False,
-            show_label=False,
-        )
-
-        framework_selector = gr.Textbox(
-            value="TensorFlow",
-            interactive=False,
-            show_label=False,
+            elem_classes=["no-label"]
         )
 
-
-    gr.Markdown("""
-
-    )
-
-
-        value=1,
-        interactive=True,
-        show_label=False,
-    )
-
-    security_selector = gr.Radio(
-        choices=["Protected", "Public", "Private"],
-        value="Public",
-        interactive=True,
-        show_label=False,
-    )
-
-    submit_button = gr.Button(
-        value="Submit",
-    )
-
-    status_txt = gr.Textbox(
-        value="any status update will be displayed here",
-        interactive=False
-    )
-
-    submit_button.click(
-        submit,
-        inputs=[
-            hf_account_input,
-            hf_token_input,
-            endpoint_name_input,
-            provider_selector,
-            region_selector,
-            repository_selector,
-            task_selector,
-            framework_selector,
-            compute_selector,
-            min_node_selector,
-            max_node_selector,
-            security_selector],
-        outputs=status_txt)
-
-    gr.Markdown("""
-    #### Pricing Table(CPU) - 2023/1/11
-    """)
-
-    gr.Dataframe(
-        headers=["provider", "size", "$/h", "vCPUs", "Memory", "Architecture"],
-        datatype=["str", "str", "str", "number", "str", "str"],
-        row_count=8,
-        col_count=(6, "fixed"),
-        value=[
-            ["aws", "small", "$0.06", 1, "2GB", "Intel Xeon - Ice Lake"],
-            ["aws", "medium", "$0.12", 2, "4GB", "Intel Xeon - Ice Lake"],
-            ["aws", "large", "$0.24", 4, "8GB", "Intel Xeon - Ice Lake"],
-            ["aws", "xlarge", "$0.48", 8, "16GB", "Intel Xeon - Ice Lake"],
-            ["azure", "small", "$0.06", 1, "2GB", "Intel Xeon"],
-            ["azure", "medium", "$0.12", 2, "4GB", "Intel Xeon"],
-            ["azure", "large", "$0.24", 4, "8GB", "Intel Xeon"],
-            ["azure", "xlarge", "$0.48", 8, "16GB", "Intel Xeon"],
-        ]
-    )
-
-    gr.Markdown("""
-    #### Pricing Table(GPU) - 2023/1/11
-    """)
-
-    gr.Dataframe(
-        headers=["provider", "size", "$/h", "GPUs", "Memory", "Architecture"],
-        datatype=["str", "str", "str", "number", "str", "str"],
-        row_count=6,
-        col_count=(6, "fixed"),
-        value=[
-            ["aws", "small", "$0.60", 1, "14GB", "NVIDIA T4"],
-            ["aws", "medium", "$1.30", 1, "24GB", "NVIDIA A10G"],
-            ["aws", "large", "$4.50", 4, "156B", "NVIDIA T4"],
-            ["aws", "xlarge", "$6.50", 1, "80GB", "NVIDIA A100"],
-            ["aws", "xxlarge", "$7.00", 4, "96GB", "NVIDIA A10G"],
-            ["aws", "xxxlarge", "$45.0", 8, "640GB", "NVIDIA A100"],
-        ]
-    )
-
-hf_endpoint.launch(enable_queue=True)
+        provider_selector.change(update_regions, inputs=provider_selector, outputs=region_selector)
+        region_selector.change(update_compute_options, inputs=[provider_selector, region_selector], outputs=compute_selector)
+
+        submit_button.click(
+            submit,
+            inputs=[
+                hf_account_input,
+                hf_token_input,
+                endpoint_name_input,
+                provider_selector,
+                region_selector,
+                repository_selector,
+                task_selector,
+                framework_selector,
+                compute_selector,
+                min_node_selector,
+                max_node_selector,
+                security_selector],
+            outputs=status_txt)
+
+    with gr.Tab("AWS"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+    with gr.Tab("GCP"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+    with gr.Tab("Azure"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+    with gr.Tab("Lambdalabs"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+hf_endpoint.launch(enable_queue=True, debug=True)
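The new Blocks layout wires the provider, region, and compute dropdowns into a cascade through `.change` handlers. A self-contained sketch of that pattern under the Gradio 3.x API this Space targets (the option data below is made up):

```python
import gradio as gr

# Hypothetical provider -> regions mapping standing in for the cached dict.
options = {"aws": ["us-east-1", "eu-west-1"], "azure": ["eastus"]}

def update_regions(provider):
    # Returning gr.Dropdown.update(...) repopulates the downstream dropdown.
    return gr.Dropdown.update(choices=options[provider], value=options[provider][0])

with gr.Blocks() as demo:
    provider = gr.Dropdown(choices=list(options), label="Provider")
    region = gr.Dropdown([], label="Region")
    # Changing the upstream dropdown refreshes the downstream one.
    provider.change(update_regions, inputs=provider, outputs=region)

demo.launch()
```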