feat(cuda): add cuda information
main.py (CHANGED)
@@ -1,3 +1,4 @@
+import random
 import torch
 from typing import Any
 from typing import Optional
@@ -8,6 +9,26 @@ from vllm import LLM, SamplingParams, RequestOutput
 
 # Don't forget to set HF_TOKEN in the env during running
 
+cuda_num_device: int = 0
+if torch.cuda.is_available() == 'cuda':
+    random_seed = 42
+    random.seed(random_seed)
+
+    device = torch.device('cuda')
+    torch.cuda.manual_seed(random_seed)
+
+    print(f"Using device: {device}")
+    print(f"CUDA available and enabled. {torch.cuda}")
+    print(f"CUDA is available: {torch.cuda.is_available()}")
+    print(f"CUDA device count: {torch.cuda.device_count()}")
+    print(f"CUDA current device: {torch.cuda.current_device()}")
+
+    for i in range(torch.cuda.device_count()):
+        print('=================================================================')
+        print(torch.cuda.get_device_name(i))
+        print('Memory Usage:')
+        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
+        print('Cached: ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')
 app = FastAPI()
 
 # Initialize the LLM engine
@@ -22,7 +43,7 @@ engine_llama_3_2: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=cuda_num_device,
     # Llama-3.2-3B-Instruct max context length is 131072, but we reduce it to 32k.
     # 32k tokens, 3/4 of 32k is 24k words, each page average is 500 or 0.5k words,
     # so that's basically 24k / .5k = 24 x 2 =~48 pages.
@@ -43,7 +64,7 @@ engine_sailor_chat: LLM = LLM(
     max_num_batched_tokens=512,  # Reduced for T4
     max_num_seqs=16,  # Reduced for T4
     gpu_memory_utilization=0.85,  # Slightly increased, adjust if needed
-    tensor_parallel_size=
+    tensor_parallel_size=cuda_num_device,
     # max_model_len=32768,
     enforce_eager=True,  # Disable CUDA graph
     dtype='auto',  # Use 'half' if you want half precision
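
Review note on the added CUDA check: torch.cuda.is_available() returns a bool, so comparing it to the string 'cuda' is always False and the seeding/diagnostic block above never runs. The sketch below shows what the check presumably intends, under the assumption that the goal is to seed the GPU RNG and print device diagnostics when CUDA is present; it is a hypothetical rewrite, not part of the commit.

import random
import torch

cuda_num_device: int = 0
if torch.cuda.is_available():  # is_available() returns a bool, not the string 'cuda'
    random_seed = 42
    random.seed(random_seed)

    device = torch.device('cuda')
    torch.cuda.manual_seed(random_seed)

    print(f"Using device: {device}")
    print(f"CUDA device count: {torch.cuda.device_count()}")
    print(f"CUDA current device: {torch.cuda.current_device()}")

    # Per-device name and memory diagnostics, as in the commit
    for i in range(torch.cuda.device_count()):
        print(torch.cuda.get_device_name(i))
        print('Allocated:', round(torch.cuda.memory_allocated(i) / 1024 ** 3, 1), 'GB')
        print('Reserved: ', round(torch.cuda.memory_reserved(i) / 1024 ** 3, 1), 'GB')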
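Review note on tensor_parallel_size: in vLLM this parameter is the number of GPUs to shard the model across (its default is 1), while cuda_num_device is a device index initialized to 0, so passing it here is likely not the intended value. A minimal sketch of one way to derive a GPU count instead (an assumption about the intent, not what the commit does):

import torch

# Number of GPUs to use for tensor parallelism; evaluates to 1 on a single T4.
tensor_parallel_size = max(1, torch.cuda.device_count())

# engine = LLM(..., tensor_parallel_size=tensor_parallel_size)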