Spaces:
Running
Running
Upload run_cloud_training.py with huggingface_hub
Browse files- run_cloud_training.py +5 -5
run_cloud_training.py
CHANGED
@@ -571,6 +571,11 @@ def train(config_path, dataset_name, output_dir):
     # Initialize ds_config_path to None before checking
     ds_config_path = None
 
+    # Optimize batch size for multi-GPU setup
+    # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
+    per_device_train_batch_size = 4 if gpu_count >= 4 else 2
+    logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
+
     # Check if DeepSpeed config is available
     deepspeed_config = config.get("deepspeed_config", None)
     if deepspeed_config:
@@ -644,11 +649,6 @@ def train(config_path, dataset_name, output_dir):
         reports = ["none"]
         logger.warning("No reporting backends available - training metrics won't be logged")
 
-    # Optimize batch size for multi-GPU setup
-    # For 4x L4 GPUs (24GB each), we can safely use a larger batch size
-    per_device_train_batch_size = 4 if gpu_count >= 4 else 2
-    logger.info(f"Using batch size: {per_device_train_batch_size} per device (effective batch size: {per_device_train_batch_size * gpu_count * training_config.get('gradient_accumulation_steps', 4)})")
-
     training_args_dict = {
         "output_dir": output_dir,
         "num_train_epochs": training_config.get("num_train_epochs", 3),