George-API committed on
Commit
29848e1
·
verified ·
1 Parent(s): aa250a7

Upload run_cloud_training.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_cloud_training.py +5 -5
run_cloud_training.py CHANGED
@@ -571,6 +571,11 @@ def train(config_path, dataset_name, output_dir):
571
  # Initialize ds_config_path to None before checking
572
  ds_config_path = None
573
 
 
 
 
 
 
574
  # Check if DeepSpeed config is available
575
  deepspeed_config = config.get("deepspeed_config", None)
576
  if deepspeed_config:
@@ -644,11 +649,6 @@ def train(config_path, dataset_name, output_dir):
644
  reports = ["none"]
645
  logger.warning("No reporting backends available - training metrics won't be logged")
646
 
647
- # Optimize batch size for multi-GPU setup
648
- # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
649
- per_device_train_batch_size = 4 if gpu_count >= 4 else 2
650
- logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
651
-
652
  training_args_dict = {
653
  "output_dir": output_dir,
654
  "num_train_epochs": training_config.get("num_train_epochs", 3),
 
571
  # Initialize ds_config_path to None before checking
572
  ds_config_path = None
573
 
574
+ # Optimize batch size for multi-GPU setup
575
+ # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
576
+ per_device_train_batch_size = 4 if gpu_count >= 4 else 2
577
+ logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
578
+
579
  # Check if DeepSpeed config is available
580
  deepspeed_config = config.get("deepspeed_config", None)
581
  if deepspeed_config:
 
649
  reports = ["none"]
650
  logger.warning("No reporting backends available - training metrics won't be logged")
651
 
 
 
 
 
 
652
  training_args_dict = {
653
  "output_dir": output_dir,
654
  "num_train_epochs": training_config.get("num_train_epochs", 3),