chansung committed on
Commit ef87ed1 · 1 Parent(s): a61933d

Update app.py

Files changed (1)
  1. app.py +324 -240
app.py CHANGED
@@ -4,33 +4,79 @@ import requests
 
 import gradio as gr
 
 def avaliable_providers():
     providers = []
-
     headers = {
         "Content-Type": "application/json",
     }
-    endpoint_url = "https://api.endpoints.huggingface.cloud/provider"
     response = requests.get(endpoint_url, headers=headers)
 
-    for provider in response.json()['items']:
         if provider['status'] == 'available':
-            providers.append(provider['vendor'])
-
     return providers
 
 def update_regions(provider):
     avalialbe_regions = []
-
-    headers = {
-        "Content-Type": "application/json",
-    }
-    endpoint_url = f"https://api.endpoints.huggingface.cloud/provider/{provider}/region"
-    response = requests.get(endpoint_url, headers=headers)
 
-    for region in response.json()['items']:
-        if region['status'] == 'available':
-            avalialbe_regions.append(f"{region['region']}/{region['label']}")
 
     return gr.Dropdown.update(
         choices=avalialbe_regions,
@@ -38,28 +84,22 @@ def update_regions(provider):
     )
 
 def update_compute_options(provider, region):
-    region = region.split("/")[0]
     avalialbe_compute_options = []
 
-    headers = {
-        "Content-Type": "application/json",
-    }
-    endpoint_url = f"https://api.endpoints.huggingface.cloud/provider/{provider}/region/{region}/compute"
-    print(endpoint_url)
-    response = requests.get(endpoint_url, headers=headers)
-
-    for compute in response.json()['items']:
         if compute['status'] == 'available':
             accelerator = compute['accelerator']
             numAccelerators = compute['numAccelerators']
             memoryGb = compute['memoryGb']
             architecture = compute['architecture']
             instanceType = compute['instanceType']
-
             type = f"{numAccelerators}vCPU {memoryGb} · {architecture}" if accelerator == "cpu" else f"{numAccelerators}x {architecture}"
-
             avalialbe_compute_options.append(
-                f"{compute['accelerator'].upper()} [{compute['instanceSize']}] · {type} · {instanceType}"
             )
 
     return gr.Dropdown.update(
@@ -77,9 +117,9 @@ def submit(
     task_selector,
     framework_selector,
     compute_selector,
-    min_node_selector,
-    max_node_selector,
-    security_selector
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
@@ -89,7 +129,7 @@ def submit(
     size = compute_resources[0][size_l_index : size_r_index].strip()
 
     type = compute_resources[-1].strip()
-
     payload = {
         "accountId": hf_account_input.strip(),
         "compute": {
@@ -107,7 +147,7 @@ def submit(
                 "huggingface": {}
             },
             "repository": repository_selector.lower(),
-            "revision": head_sha,
             "task": task_selector.lower()
         },
         "name": endpoint_name_input.strip(),
@@ -117,7 +157,7 @@ def submit(
         },
         "type": security_selector.lower()
     }
-
     print(payload)
 
     payload = json.dumps(payload)
@@ -127,7 +167,7 @@ def submit(
         "Authorization": f"Bearer {hf_token_input.strip()}",
         "Content-Type": "application/json",
     }
-    endpoint_url = f"https://api.endpoints.huggingface.cloud/endpoint"
     print(endpoint_url)
 
     response = requests.post(endpoint_url, headers=headers, data=payload)
@@ -143,215 +183,259 @@ def submit(
     else:
         return f"something went wrong {response.status_code} = {response.text}"
 
-with gr.Blocks() as hf_endpoint:
-    providers = avaliable_providers()
-
-    gr.Markdown(
-        """
-        ## Deploy Stable Diffusion on 🤗 Endpoint
-        ---
-        """)
-
-    gr.Markdown("""
-    #### Your 🤗 Account ID(Name)
-    """)
-    hf_account_input = gr.Textbox(
-        show_label=False,
-    )
-
-    gr.Markdown("""
-    #### Your 🤗 Access Token
-    """)
-    hf_token_input = gr.Textbox(
-        show_label=False,
-        type="password"
-    )
-
-    gr.Markdown("""
-    #### Decide the Endpoint name
-    """)
-    endpoint_name_input = gr.Textbox(
-        show_label=False
-    )
-
-    with gr.Row():
-        gr.Markdown("""
-        #### Cloud Provider
-        """)
-
-        gr.Markdown("""
-        #### Cloud Region
-        """)
-
-    with gr.Row():
-        provider_selector = gr.Dropdown(
-            choices=providers,
-            interactive=True,
-            show_label=False,
-        )
-
-        region_selector = gr.Dropdown(
-            [],
-            value="",
-            interactive=True,
-            show_label=False,
-        )
-
-    provider_selector.change(update_regions, inputs=provider_selector, outputs=region_selector)
-
-    with gr.Row():
-        gr.Markdown("""
-        #### Target Model
-        """)
-
-        gr.Markdown("""
-        #### Target Model Version(branch)
-        """)
-
-    with gr.Row():
-        repository_selector = gr.Textbox(
-            value="chansung/my-kitty",
-            interactive=False,
-            show_label=False,
         )
 
-        revision_selector = gr.Textbox(
-            value=f"main",
             interactive=False,
-            show_label=False,
-        )
-
-    with gr.Row():
-        gr.Markdown("""
-        #### Task
-        """)
-
-        gr.Markdown("""
-        #### Framework
-        """)
-
-    with gr.Row():
-        task_selector = gr.Textbox(
-            value="Custom",
-            interactive=False,
-            show_label=False,
-        )
-
-        framework_selector = gr.Textbox(
-            value="TensorFlow",
-            interactive=False,
-            show_label=False,
         )
 
-    gr.Markdown("""
-
-    #### Select Compute Instance Type
-    """)
-    compute_selector = gr.Dropdown(
-        [],
-        value="",
-        interactive=True,
-        show_label=False,
-    )
-    region_selector.change(update_compute_options, inputs=[provider_selector, region_selector], outputs=compute_selector)
-
-    with gr.Row():
-        gr.Markdown("""
-        #### Min Number of Nodes
-        """)
-
-        gr.Markdown("""
-        #### Max Number of Nodes
-        """)
-
-        gr.Markdown("""
-        #### Security Level
-        """)
-
-    with gr.Row():
-        min_node_selector = gr.Number(
-            value=1,
-            interactive=True,
-            show_label=False,
-        )
-
-        max_node_selector = gr.Number(
-            value=1,
-            interactive=True,
-            show_label=False,
-        )
-
-        security_selector = gr.Radio(
-            choices=["Protected", "Public", "Private"],
-            value="Public",
-            interactive=True,
-            show_label=False,
-        )
-
-    submit_button = gr.Button(
-        value="Submit",
-    )
-
-    status_txt = gr.Textbox(
-        value="any status update will be displayed here",
-        interactive=False
-    )
-
-    submit_button.click(
-        submit,
-        inputs=[
-            hf_account_input,
-            hf_token_input,
-            endpoint_name_input,
-            provider_selector,
-            region_selector,
-            repository_selector,
-            task_selector,
-            framework_selector,
-            compute_selector,
-            min_node_selector,
-            max_node_selector,
-            security_selector],
-        outputs=status_txt)
-
-    gr.Markdown("""
-    #### Pricing Table(CPU) - 2023/1/11
-    """)
-
-    gr.Dataframe(
-        headers=["provider", "size", "$/h", "vCPUs", "Memory", "Architecture"],
-        datatype=["str", "str", "str", "number", "str", "str"],
-        row_count=8,
-        col_count=(6, "fixed"),
-        value=[
-            ["aws", "small", "$0.06", 1, "2GB", "Intel Xeon - Ice Lake"],
-            ["aws", "medium", "$0.12", 2, "4GB", "Intel Xeon - Ice Lake"],
-            ["aws", "large", "$0.24", 4, "8GB", "Intel Xeon - Ice Lake"],
-            ["aws", "xlarge", "$0.48", 8, "16GB", "Intel Xeon - Ice Lake"],
-            ["azure", "small", "$0.06", 1, "2GB", "Intel Xeon"],
-            ["azure", "medium", "$0.12", 2, "4GB", "Intel Xeon"],
-            ["azure", "large", "$0.24", 4, "8GB", "Intel Xeon"],
-            ["azure", "xlarge", "$0.48", 8, "16GB", "Intel Xeon"],
-        ]
-    )
-
-    gr.Markdown("""
-    #### Pricing Table(GPU) - 2023/1/11
-    """)
-
-    gr.Dataframe(
-        headers=["provider", "size", "$/h", "GPUs", "Memory", "Architecture"],
-        datatype=["str", "str", "str", "number", "str", "str"],
-        row_count=6,
-        col_count=(6, "fixed"),
-        value=[
-            ["aws", "small", "$0.60", 1, "14GB", "NVIDIA T4"],
-            ["aws", "medium", "$1.30", 1, "24GB", "NVIDIA A10G"],
-            ["aws", "large", "$4.50", 4, "156B", "NVIDIA T4"],
-            ["aws", "xlarge", "$6.50", 1, "80GB", "NVIDIA A100"],
-            ["aws", "xxlarge", "$7.00", 4, "96GB", "NVIDIA A10G"],
-            ["aws", "xxxlarge", "$45.0", 8, "640GB", "NVIDIA A100"],
-        ]
-    )
-
-hf_endpoint.launch(enable_queue=True)
 
 
 import gradio as gr
 
+STYLE = """
+.group-border {
+  padding: 10px;
+  border-width: 1px;
+  border-radius: 10px;
+  border-color: gray;
+  border-style: dashed;
+  box-shadow: 1px 1px 3px;
+}
+.control-label-font {
+  font-size: 13pt !important;
+}
+.control-button {
+  background: none !important;
+  border-color: #69ade2 !important;
+  border-width: 2px !important;
+  color: #69ade2 !important;
+}
+.center {
+  text-align: center;
+}
+.right {
+  text-align: right;
+}
+.no-label {
+  padding: 0px !important;
+}
+.no-label > label > span {
+  display: none;
+}
+.small-big {
+  font-size: 12pt !important;
+}
+
+"""
+
 def avaliable_providers():
     providers = []
+
     headers = {
         "Content-Type": "application/json",
     }
+    endpoint_url = "https://api.endpoints.huggingface.cloud/v2/provider"
     response = requests.get(endpoint_url, headers=headers)
 
+    providers = {}
+
+    for provider in response.json()['vendors']:
         if provider['status'] == 'available':
+            regions = {}
+
+            availability = False
+            for region in provider['regions']:
+                if region["status"] == "available":
+                    regions[region['name']] = {
+                        "label": region['label'],
+                        "computes": region['computes']
+                    }
+                    availability = True
+
+            if availability:
+                providers[provider['name']] = regions
+
     return providers
 
+providers = avaliable_providers()
+
 def update_regions(provider):
     avalialbe_regions = []
+    regions = providers[provider]
 
+    for region, attributes in regions.items():
+        avalialbe_regions.append(f"{region}[{attributes['label']}]")
 
     return gr.Dropdown.update(
         choices=avalialbe_regions,
 
     )
 
 def update_compute_options(provider, region):
     avalialbe_compute_options = []
+    computes = providers[provider][region.split("[")[0].strip()]["computes"]
 
+    for compute in computes:
         if compute['status'] == 'available':
             accelerator = compute['accelerator']
             numAccelerators = compute['numAccelerators']
             memoryGb = compute['memoryGb']
             architecture = compute['architecture']
             instanceType = compute['instanceType']
+            pricePerHour = compute['pricePerHour']
+
             type = f"{numAccelerators}vCPU {memoryGb} · {architecture}" if accelerator == "cpu" else f"{numAccelerators}x {architecture}"
+
             avalialbe_compute_options.append(
+                f"{compute['accelerator'].upper()} [{compute['instanceSize']}] · {type} · {instanceType} · ${pricePerHour}/hour"
             )
 
     return gr.Dropdown.update(
 
     task_selector,
     framework_selector,
     compute_selector,
+    min_node_selector,
+    max_node_selector,
+    security_selector
 ):
     compute_resources = compute_selector.split("·")
     accelerator = compute_resources[0][:3].strip()
 
     size = compute_resources[0][size_l_index : size_r_index].strip()
 
     type = compute_resources[-1].strip()
+
     payload = {
         "accountId": hf_account_input.strip(),
         "compute": {
 
                 "huggingface": {}
             },
             "repository": repository_selector.lower(),
+            "revision": "main",
             "task": task_selector.lower()
         },
         "name": endpoint_name_input.strip(),
 
         },
         "type": security_selector.lower()
     }
+
     print(payload)
 
     payload = json.dumps(payload)
 
         "Authorization": f"Bearer {hf_token_input.strip()}",
         "Content-Type": "application/json",
     }
+    endpoint_url = f"https://api.endpoints.huggingface.cloud/v2/endpoint"
     print(endpoint_url)
 
     response = requests.post(endpoint_url, headers=headers, data=payload)
 
     else:
         return f"something went wrong {response.status_code} = {response.text}"
 
+with gr.Blocks(css=STYLE) as hf_endpoint:
+    with gr.Tab("🤗 Inference Endpoint"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+        with gr.Column(elem_classes=["group-border"]):
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Hugging Face account ID (name)""")
+                    hf_account_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])
+
+                with gr.Column():
+                    gr.Markdown("## Hugging Face access token")
+                    hf_token_input = gr.Textbox(show_label=False, type="password", elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Target model
+
+                    Import a model from the Hugging Face hub""")
+                    repository_selector = gr.Textbox(
+                        value="NousResearch/Nous-Hermes-Llama2-70b",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Target model version(branch)
+
+                    Specify the branch name""")
+                    revision_selector = gr.Textbox(
+                        value=f"main",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+        with gr.Column(elem_classes=["group-border"]):
+            with gr.Column():
+                gr.Markdown("""## Endpoint name
+
+                Input a name for your new endpoint""")
+                endpoint_name_input = gr.Textbox(show_label=False, elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Cloud Provider
+
+                    Choose between Amazon Web Services and Microsoft Azure""")
+                    provider_selector = gr.Dropdown(
+                        choices=providers.keys(),
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Cloud Region
+
+                    Choose one of the regions from each cloud provider""")
+                    region_selector = gr.Dropdown(
+                        [],
+                        value="",
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row(visible=False):
+                with gr.Column():
+                    gr.Markdown("## Task")
+                    task_selector = gr.Textbox(
+                        value="Text Generation",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("## Framework")
+                    framework_selector = gr.Textbox(
+                        value="PyTorch",
+                        interactive=False,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Column():
+                gr.Markdown("""## Select Compute Instance Type
+
+                Select a CPU or GPU accelerated compute option for inference""")
+                compute_selector = gr.Dropdown(
+                    [],
+                    value="",
+                    interactive=True,
+                    show_label=False,
+                    elem_classes=["no-label", "small-big"]
+                )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Min Number of Nodes
+
+                    Automatically scale the number of replicas based on load and compute usage""")
+                    min_node_selector = gr.Number(
+                        value=1,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Max Number of Nodes
+
+                    Automatically scale the number of replicas based on load and compute usage""")
+                    max_node_selector = gr.Number(
+                        value=1,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Security Level
+
+                    Choose your endpoint's level of privacy""")
+                    security_selector = gr.Radio(
+                        choices=["Protected", "Public", "Private"],
+                        value="Public",
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+        with gr.Column(elem_classes=["group-border"]):
+            with gr.Column():
+                gr.Markdown("""## Container Type
+
+                Text Generation Inference is an optimized container for text generation task""")
+                _ = gr.Textbox("Text Generation Inference", show_label=False, elem_classes=["no-label", "small-big"])
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Custom Cuda Kernels
+
+                    TGI uses custom kernels to speed up inference for some models. You can try disabling them if you encounter issues.""")
+                    _ = gr.Dropdown(
+                        value="Enabled",
+                        choices=["Enabled", "Disabled"],
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Quantization
+
+                    Quantization can reduce the model size and improve latency, with little degradation in model accuracy.""")
+                    _ = gr.Dropdown(
+                        value="None",
+                        choices=["None", "Bitsandbytes", "GPTQ"],
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Max Input Length (per Query)
+
+                    Increasing this value can impact the amount of RAM required. Some models can only handle a finite range of sequences.""")
+                    _ = gr.Number(
+                        value=1024,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Max Number of Tokens (per Query)
+
+                    The larger this value, the more memory each request will consume and the less effective batching can be.""")
+                    _ = gr.Number(
+                        value=1512,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("""## Max Batch Prefill Tokens
+
+                    Number of prefill tokens used during continuous batching. It can be useful to adjust this number since the prefill operation is memory-intensive and compute-bound.""")
+                    _ = gr.Number(
+                        value=2048,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+                with gr.Column():
+                    gr.Markdown("""## Max Batch Total Tokens
+
+                    Number of tokens that can be passed before forcing waiting queries to be put on the batch. A value of 1000 can fit 10 queries of 100 tokens or a single query of 1000 tokens.""")
+                    _ = gr.Number(
+                        value=None,
+                        interactive=True,
+                        show_label=False,
+                        elem_classes=["no-label", "small-big"]
+                    )
+
+        submit_button = gr.Button(
+            value="Submit",
+            elem_classes=["control-label-font", "control-button"]
         )
 
+        status_txt = gr.Textbox(
+            value="any status update will be displayed here",
             interactive=False,
+            elem_classes=["no-label"]
         )
 
+        provider_selector.change(update_regions, inputs=provider_selector, outputs=region_selector)
+        region_selector.change(update_compute_options, inputs=[provider_selector, region_selector], outputs=compute_selector)
+
+        submit_button.click(
+            submit,
+            inputs=[
+                hf_account_input,
+                hf_token_input,
+                endpoint_name_input,
+                provider_selector,
+                region_selector,
+                repository_selector,
+                task_selector,
+                framework_selector,
+                compute_selector,
+                min_node_selector,
+                max_node_selector,
+                security_selector],
+            outputs=status_txt)
+
+    with gr.Tab("AWS"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+    with gr.Tab("GCP"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+    with gr.Tab("Azure"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+    with gr.Tab("Lambdalabs"):
+        gr.Markdown("# Deploy LLM on 🤗 Hugging Face Inference Endpoint", elem_classes=["center"])
+
+hf_endpoint.launch(enable_queue=True, debug=True)
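The reworked avaliable_providers() caches the whole provider/region/compute catalog from a single call to the v2 API, and update_regions() / update_compute_options() read from that cache instead of issuing per-selection requests. A minimal sketch of the response shape the parser assumes — field names are taken from the parsing code above, and all concrete values are illustrative, not the actual API contract:

# Hypothetical /v2/provider payload; only the fields read by avaliable_providers()
# are shown, and every value is made up for illustration.
example_response = {
    "vendors": [
        {
            "name": "aws",              # becomes a top-level key of the providers dict
            "status": "available",
            "regions": [
                {
                    "name": "us-east-1",
                    "label": "US East (N. Virginia)",
                    "status": "available",
                    "computes": [
                        {
                            "status": "available",
                            "accelerator": "gpu",
                            "numAccelerators": 1,
                            "memoryGb": "24GB",
                            "architecture": "NVIDIA A10G",
                            "instanceType": "g5.2xlarge",   # illustrative
                            "instanceSize": "medium",
                            "pricePerHour": 1.3,            # illustrative
                        }
                    ],
                }
            ],
        }
    ]
}

# Under this assumed shape, avaliable_providers() would reduce the payload to
#   {"aws": {"us-east-1": {"label": "US East (N. Virginia)", "computes": [...]}}},
# update_regions() would offer the choice "us-east-1[US East (N. Virginia)]", and
# update_compute_options() would render
#   "GPU [medium] · 1x NVIDIA A10G · g5.2xlarge · $1.3/hour".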