yusufs committed on
Commit 2457cd7 · 1 Parent(s): 0ef012d

feat(cuda): add cuda information

Files changed (1): main.py (+23, −2)
main.py CHANGED

@@ -1,3 +1,4 @@
+import random
 import torch
 from typing import Any
 from typing import Optional
@@ -8,6 +9,26 @@ from vllm import LLM, SamplingParams, RequestOutput

 # Don't forget to set HF_TOKEN in the env during running

+cuda_num_device: int = 0
+if torch.cuda.is_available() == 'cuda':
+    random_seed = 42
+    random.seed(random_seed)
+
+    device = torch.device('cuda')
+    torch.cuda.manual_seed(random_seed)
+
+    print(f"Using device: {device}")
+    print(f"CUDA available and enabled. {torch.cuda}")
+    print(f"CUDA is available: {torch.cuda.is_available()}")
+    print(f"CUDA device count: {torch.cuda.device_count()}")
+    print(f"CUDA current device: {torch.cuda.current_device()}")
+
+    for i in range(torch.cuda.device_count()):
+        print('=================================================================')
+        print(torch.cuda.get_device_name(i))
+        print('Memory Usage:')
+        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
+        print('Cached: ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')
 app = FastAPI()

 # Initialize the LLM engine
@@ -22,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    tensor_parallel_size=2,
+    tensor_parallel_size=cuda_num_device,
     # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
     # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
     # so that's basically 24k / .5k = 24 x 2 =~48 pages.
@@ -43,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    tensor_parallel_size=2,
+    tensor_parallel_size=cuda_num_device,
     # max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
     dtype='auto',  # Use 'half' if you want half precision
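
A note on the new CUDA block: torch.cuda.is_available() returns a bool, so the comparison == 'cuda' is always False and the seeding/diagnostic code above never executes. A minimal sketch of what the guard presumably intends, keeping the same seed and printed fields (this is not the committed code):

import random
import torch

cuda_num_device: int = 0
if torch.cuda.is_available():  # bool check; comparing against the string 'cuda' never matches
    random_seed = 42
    random.seed(random_seed)
    torch.cuda.manual_seed(random_seed)

    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA current device: {torch.cuda.current_device()}")
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))
        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
        print('Reserved: ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')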
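On the tensor_parallel_size change: in vLLM this parameter is the number of GPUs each model is sharded across, so it needs to be a positive count; passing cuda_num_device (a device index that stays 0, since the guard above never fires) is unlikely to give the intended single-GPU behaviour. A hedged sketch of a more conventional value for the two LLM(...) calls, assuming the T4 setup the comments describe; the model id is illustrative, not taken from the diff:

import torch
from vllm import LLM

# Shard across every visible GPU, falling back to 1 on a single-GPU or CPU-only host.
tensor_parallel_size = max(torch.cuda.device_count(), 1)

engine_llama_3_2 = LLM(
    model="meta-llama/Llama-3.2-3B-Instruct",  # illustrative model id
    max_num_batched_tokens=512,  # Reduced for T4
    max_num_seqs=16,  # Reduced for T4
    gpu_memory_utilization=0.85,
    tensor_parallel_size=tensor_parallel_size,
    enforce_eager=True,  # Disable CUDA graph
    dtype='auto',
)

On a single T4 this evaluates to tensor_parallel_size=1, i.e. the whole model stays on one GPU, which matches the "Reduced for T4" tuning in the surrounding arguments.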