Spaces:

ethanlshen
/

SuperposedDecoding

Runtime error

ethanlshen committed on Jun 25, 2024

Commit

d8add38

verified ·

1 Parent(s): 37a92ca

Remove parallel

Files changed (1) hide show

superposed/llama/superposed_generation.py CHANGED Viewed

@@ -34,29 +34,29 @@ class SuperposedLlama:
         model_parallel_size: Optional[int] = None,
         seed: int = 1,
     ):
-        if not torch.distributed.is_initialized():
-            torch.distributed.init_process_group("nccl")
-        if not model_parallel_is_initialized():
-            if model_parallel_size is None:
-                model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
-            initialize_model_parallel(model_parallel_size)
-        local_rank = int(os.environ.get("LOCAL_RANK", 0))
         if device == None:
             torch.cuda.set_device(local_rank)
             device = torch.cuda.current_device()
         torch.manual_seed(seed)
-        if local_rank > 0:
-            sys.stdout = open(os.devnull, "w")
-        start_time = time.time()
         checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
-        assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
-        assert model_parallel_size == len(
-            checkpoints
-        ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
-        ckpt_path = checkpoints[get_model_parallel_rank()]
         checkpoint = torch.load(ckpt_path, map_location="cpu")
         with open(Path(ckpt_dir) / "params.json", "r") as f:
             params = json.loads(f.read())

         model_parallel_size: Optional[int] = None,
         seed: int = 1,
     ):
+        # if not torch.distributed.is_initialized():
+        #     torch.distributed.init_process_group("nccl")
+        # if not model_parallel_is_initialized():
+        #     if model_parallel_size is None:
+        #         model_parallel_size = int(os.environ.get("WORLD_SIZE", 1))
+        #     initialize_model_parallel(model_parallel_size)
+        # local_rank = int(os.environ.get("LOCAL_RANK", 0))
         if device == None:
             torch.cuda.set_device(local_rank)
             device = torch.cuda.current_device()
         torch.manual_seed(seed)
+        # if local_rank > 0:
+        #     sys.stdout = open(os.devnull, "w")
+        # start_time = time.time()
         checkpoints = sorted(Path(ckpt_dir).glob("*.pth"))
+        # assert len(checkpoints) > 0, f"no checkpoint files found in {ckpt_dir}"
+        # assert model_parallel_size == len(
+        #     checkpoints
+        # ), f"Loading a checkpoint for MP={len(checkpoints)} but world size is {model_parallel_size}"
+        ckpt_path = checkpoints[0]
         checkpoint = torch.load(ckpt_path, map_location="cpu")
         with open(Path(ckpt_dir) / "params.json", "r") as f:
             params = json.loads(f.read())