feat(cuda): add cuda information
main.py (CHANGED)
@@ -1,3 +1,4 @@
+import random
 import torch
 from typing import Any
 from typing import Optional
@@ -8,6 +9,26 @@ from vllm import LLM, SamplingParams, RequestOutput
 
 # Don't forget to set HF_TOKEN in the env during running
 
+cuda_num_device: int = 0
+if torch.cuda.is_available() == 'cuda':
+    random_seed = 42
+    random.seed(random_seed)
+
+    device = torch.device('cuda')
+    torch.cuda.manual_seed(random_seed)
+
+    print(f"Using device: {device}")
+    print(f"CUDA available and enabled. {torch.cuda}")
+    print(f"CUDA is available: {torch.cuda.is_available()}")
+    print(f"CUDA device count: {torch.cuda.device_count()}")
+    print(f"CUDA current device: {torch.cuda.current_device()}")
+
+    for i in range(torch.cuda.device_count()):
+        print('=================================================================')
+        print(torch.cuda.get_device_name(i))
+        print('Memory Usage:')
+        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
+        print('Cached: ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')
 app = FastAPI()
 
 # Initialize the LLM engine
@@ -22,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=cuda_num_device,
     # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
     # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
     # so that's basically 24k / .5k = 24 x 2 =~48 pages.
@@ -43,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=cuda_num_device,
     # max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
     dtype='auto',  # Use 'half' if you want half precision
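
Review note on the added CUDA check: torch.cuda.is_available() returns a bool, so comparing it to the string 'cuda' is always False and the seeding/diagnostic block above never runs. The sketch below shows what the check presumably intends, under the assumption that the goal is to seed the GPU RNG and print device diagnostics when CUDA is present; it is a hypothetical rewrite, not part of the commit.

import random
import torch

cuda_num_device: int = 0
if torch.cuda.is_available():  # is_available() returns a bool, not the string 'cuda'
    random_seed = 42
    random.seed(random_seed)

    device = torch.device('cuda')
    torch.cuda.manual_seed(random_seed)

    print(f"Using device: {device}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA current device: {torch.cuda.current_device()}")

    # Per-device name and memory diagnostics, as in the commit
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))
        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
        print('Reserved: ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')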
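Review note on tensor_parallel_size: in vLLM this parameter is the number of GPUs to shard the model across (its default is 1), while cuda_num_device is a device index initialized to 0, so passing it here is likely not the intended value. A minimal sketch of one way to derive a GPU count instead (an assumption about the intent, not what the commit does):

import torch

# Number of GPUs to use for tensor parallelism; evaluates to 1 on a single T4.
tensor_parallel_size = max(1, torch.cuda.device_count())

# engine = LLM(..., tensor_parallel_size=tensor_parallel_size)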