George-API committed
Commit d1150e7 · verified · 1 Parent(s): af30cf6

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py +118 -55
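
The commit message says the file was pushed with the `huggingface_hub` client. A minimal sketch of such an upload (the repo id, token handling, and exact call site are illustrative assumptions, not taken from this commit):

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from HF_TOKEN or the cached login
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",  # local file to push
        path_in_repo="run_cloud_training.py",     # destination path in the repo
        repo_id="George-API/example-repo",        # hypothetical repo id
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )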
run_cloud_training.py CHANGED
@@ -31,25 +31,49 @@ DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
-# Check if CUDA is available for bitsandbytes
-def is_bnb_available():
-    """Check if bitsandbytes with CUDA is available"""
+# Determine if we're running in HF Space
+def is_running_in_space():
+    """Check if we're running in a Hugging Face Space"""
+    return os.environ.get("SPACE_ID") is not None
+
+# Check if fully compatible CUDA is available for training
+def is_cuda_fully_available():
+    """
+    Check if CUDA is fully available for training with bitsandbytes.
+    More strict than torch.cuda.is_available() - requires full GPU compatibility.
+    """
+    # If running in HF Space, default to CPU mode unless explicitly overridden
+    if is_running_in_space() and os.environ.get("FORCE_GPU") != "1":
+        logger.warning("Running in Hugging Face Space - defaulting to CPU mode for stability")
+        return False
+
+    # If CUDA is not available according to PyTorch, we definitely can't use it
+    if not torch.cuda.is_available():
+        logger.warning("CUDA not available according to PyTorch")
+        return False
+
+    # Check if bitsandbytes is properly installed and compatible with our GPU
     try:
         import bitsandbytes as bnb
-        if torch.cuda.is_available():
-            # Try to create a dummy 4-bit tensor to see if it works
-            try:
-                _ = torch.zeros(1, dtype=torch.float16, device="cuda").to(bnb.nn.Linear4bit)
-                logger.info("BitsAndBytes with CUDA support is available")
-                return True
-            except Exception as e:
-                logger.warning(f"CUDA available but bitsandbytes test failed: {e}")
-                return False
-        else:
-            logger.warning("CUDA not available for bitsandbytes")
+        logger.info("BitsAndBytes package is installed")
+
+        # Try to create a dummy 4-bit computation to verify compatibility
+        try:
+            dummy = torch.zeros(1, device="cuda")
+            a = bnb.nn.Linear4bit(1, 1)
+            a.to(device="cuda")
+            result = a(dummy)
+            logger.info("BitsAndBytes with CUDA is working correctly")
+            return True
+        except Exception as e:
+            logger.warning(f"BitsAndBytes CUDA compatibility test failed: {str(e)}")
             return False
-    except (ImportError, RuntimeError) as e:
-        logger.warning(f"Error checking bitsandbytes: {e}")
+
+    except ImportError:
+        logger.warning("BitsAndBytes package not installed - cannot use 4-bit quantization")
+        return False
+    except Exception as e:
+        logger.warning(f"Unexpected error checking BitsAndBytes: {str(e)}")
         return False
 
 # Create a marker file to indicate training is active
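
The new check is gated on the Space environment before any CUDA probing. A quick sketch of how the gate behaves (the `SPACE_ID` value is illustrative; Spaces set that variable automatically):

    import os

    os.environ["SPACE_ID"] = "user/demo-space"  # simulate running inside a Space
    os.environ.pop("FORCE_GPU", None)
    assert is_running_in_space()
    assert not is_cuda_fully_available()  # Space without FORCE_GPU=1 -> CPU mode

    os.environ["FORCE_GPU"] = "1"  # explicit opt-in; only then does the real torch/bitsandbytes probe run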
@@ -282,6 +306,17 @@ def load_and_prepare_dataset(dataset_name, config):
         logger.error(f"Error loading dataset: {str(e)}")
         raise
 
+# Load a simpler, smaller model for CPU mode
+def get_small_model_name(original_model_name):
+    """Get a smaller model name for CPU mode"""
+    # If using DeepSeek-R1-Distill-Qwen-14B, use a smaller model
+    if "DeepSeek" in original_model_name and "14B" in original_model_name:
+        logger.info("Using smaller model for CPU mode")
+        return "distilgpt2"  # Much smaller model
+
+    # Otherwise just use the original model
+    return original_model_name
+
 # Main training function
 def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_name=None, private_repo=False):
     # Load environment variables
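
`get_small_model_name` is a plain substring check, so only the one model family named in its comment is swapped; a quick sketch of the behavior:

    get_small_model_name("DeepSeek-R1-Distill-Qwen-14B")  # -> "distilgpt2"
    get_small_model_name("any/other-model")               # -> "any/other-model" (unchanged)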
@@ -310,8 +345,19 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Load and prepare dataset with proper sorting
     dataset = load_and_prepare_dataset(dataset_name, config)
 
+    # Determine if we can use CUDA with bitsandbytes
+    can_use_4bit = is_cuda_fully_available()
+
     # Load model settings
-    model_name = model_config.get("model_name_or_path")
+    original_model_name = model_config.get("model_name_or_path")
+
+    # For CPU mode, use a smaller model
+    if not can_use_4bit and is_running_in_space():
+        model_name = get_small_model_name(original_model_name)
+        logger.warning(f"Using smaller model {model_name} in CPU mode for Hugging Face Space")
+    else:
+        model_name = original_model_name
+
     logger.info(f"Using model: {model_name}")
 
     # Initialize tokenizer
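
The `model_config` dict here comes from the JSON config loaded earlier in `train()`; a hypothetical fragment showing the one key this hunk reads (value illustrative):

    model_config = {
        # Any id containing both "DeepSeek" and "14B" triggers the distilgpt2 fallback above
        "model_name_or_path": "DeepSeek-R1-Distill-Qwen-14B",
    }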
@@ -325,8 +371,13 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Get quantization config
     quant_config = config.get("quantization_config", {})
 
-    # Check if bitsandbytes with CUDA is available
-    use_4bit = is_bnb_available() and quant_config.get("load_in_4bit", True)
+    # Determine if we should use 4-bit quantization
+    if can_use_4bit and quant_config.get("load_in_4bit", True):
+        use_4bit = True
+        logger.info("Using 4-bit quantization with CUDA")
+    else:
+        use_4bit = False
+        logger.warning("Using CPU mode without quantization")
 
     # Create model with proper configuration
     logger.info(f"Loading model (4-bit quantization: {use_4bit})")
@@ -354,15 +405,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         # CPU fallback (or non-quantized GPU) mode
         logger.warning("Loading model in CPU fallback mode (no 4-bit quantization)")
 
-        # Determine best dtype based on available hardware
-        if torch.cuda.is_available():
-            dtype = torch.float16
-            device_map = "auto"
-            logger.info("Using GPU with fp16")
-        else:
-            dtype = torch.float32
-            device_map = "cpu"
-            logger.info("Using CPU with fp32")
+        # Force CPU (safest option in HF Spaces)
+        device_map = "cpu"
+        dtype = torch.float32
+        logger.info("Forcing CPU mode for stability")
 
         # Load model without quantization
         model = AutoModelForCausalLM.from_pretrained(
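
The context line above is only the start of the non-quantized load; the diff truncates the argument list, but from the surrounding context lines it plausibly reads as follows (a sketch under that assumption, not the verbatim file):

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,      # torch.float32 in the forced-CPU path
        device_map=device_map,  # "cpu"
        low_cpu_mem_usage=True  # visible as context in the next hunk
    )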
@@ -374,11 +420,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
            low_cpu_mem_usage=True
        )
 
-    # Apply rope scaling if configured
-    if "rope_scaling" in model_config:
+    # Apply rope scaling if configured and available
+    if "rope_scaling" in model_config and hasattr(model.config, "rope_scaling"):
         logger.info(f"Applying rope scaling: {model_config['rope_scaling']}")
-        if hasattr(model.config, "rope_scaling"):
-            model.config.rope_scaling = model_config["rope_scaling"]
+        model.config.rope_scaling = model_config["rope_scaling"]
 
     # Create LoRA config
     logger.info("Creating LoRA configuration")
@@ -395,23 +440,35 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     model = get_peft_model(model, lora_config_obj)
     logger.info("Successfully applied LoRA")
 
-    # Determine batch size based on available hardware
-    if torch.cuda.is_available():
-        gpu_info = torch.cuda.get_device_properties(0)
-        logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
-
-        # Check if it's an L40S or high-memory GPU
-        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-            logger.info("Detected L40S GPU - optimizing for high-memory GPU")
-            per_device_train_batch_size = training_config.get("per_device_train_batch_size", 4)
+    # Always use minimal batch size for HF Space CPU
+    if is_running_in_space() and not can_use_4bit:
+        per_device_train_batch_size = 1
+        logger.warning("Using minimal batch size for CPU training in Hugging Face Space")
+    else:
+        # Determine batch size based on available hardware
+        if torch.cuda.is_available():
+            gpu_info = torch.cuda.get_device_properties(0)
+            logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+
+            # Check if it's an L40S or high-memory GPU
+            if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
+                logger.info("Detected L40S GPU - optimizing for high-memory GPU")
+                per_device_train_batch_size = training_config.get("per_device_train_batch_size", 4)
+            else:
+                # Use a smaller batch size for other GPUs
+                per_device_train_batch_size = 2
+                logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
         else:
-            # Use a smaller batch size for other GPUs
-            per_device_train_batch_size = 2
-            logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
+            # Use minimal batch size for CPU
+            per_device_train_batch_size = 1
+            logger.warning("No GPU detected - using minimal batch size for CPU training")
+
+    # For Space CPU training mode, use minimal epochs
+    if is_running_in_space() and not can_use_4bit:
+        num_train_epochs = 1
+        logger.warning("Reducing to 1 epoch for CPU training in Space")
     else:
-        # Use minimal batch size for CPU
-        per_device_train_batch_size = 1
-        logger.warning("No GPU detected - using minimal batch size for CPU training")
+        num_train_epochs = training_config.get("num_train_epochs", 3)
 
     # Configure reporting backends
     reports = training_config.get("report_to", ["tensorboard"])
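
Taken together, the two new guards resolve to the following settings (derived from the branches above):

    # Space CPU (no usable 4-bit):  per_device_train_batch_size=1, num_train_epochs=1
    # L40S or >40 GB GPU:           batch size from config (default 4), epochs from config (default 3)
    # other CUDA GPU:               batch size 2, epochs from config
    # non-Space CPU:                batch size 1, epochs from config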
@@ -420,7 +477,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     logger.info("Creating training arguments")
     training_args = TrainingArguments(
         output_dir=output_dir,
-        num_train_epochs=training_config.get("num_train_epochs", 3),
+        num_train_epochs=num_train_epochs,
         per_device_train_batch_size=per_device_train_batch_size,
         gradient_accumulation_steps=training_config.get("gradient_accumulation_steps", 4),
         learning_rate=training_config.get("learning_rate", 2e-5),
@@ -428,21 +485,20 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         warmup_ratio=training_config.get("warmup_ratio", 0.03),
         weight_decay=training_config.get("weight_decay", 0.01),
         optim=training_config.get("optim", "adamw_torch"),
-        fp16=torch.cuda.is_available() and hardware_config.get("fp16", True),
-        bf16=torch.cuda.is_available() and hardware_config.get("bf16", False),
+        fp16=False,  # Disable for stability
+        bf16=False,  # Disable for stability
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        evaluation_strategy=training_config.get("evaluation_strategy", "steps"),
-        eval_steps=training_config.get("eval_steps", 200),
-        load_best_model_at_end=training_config.get("load_best_model_at_end", True),
+        evaluation_strategy="no",  # Simplified for Space
+        load_best_model_at_end=False,  # Simplified for Space
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
-        gradient_checkpointing=torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True),
-        dataloader_num_workers=training_config.get("dataloader_num_workers", 4)
+        gradient_checkpointing=False,  # Disable for stability
+        dataloader_num_workers=0  # Simplified for Space
     )
 
     # Create trainer with pre-tokenized collator
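
Note on the evaluation changes: with `evaluation_strategy="no"`, the removed `eval_steps` would be meaningless, and keeping `load_best_model_at_end=True` (the old default here) would make `TrainingArguments` raise a `ValueError`, since it requires the save and eval strategies to match; disabling both in the same change keeps the arguments self-consistent.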
@@ -510,9 +566,16 @@ if __name__ == "__main__":
                         help="Repository name for the model on Hugging Face Hub")
     parser.add_argument("--private_repo", action="store_true",
                         help="Make the Hugging Face Hub repository private")
+    parser.add_argument("--force_cpu", action="store_true",
+                        help="Force CPU mode even if CUDA is available")
 
     args = parser.parse_args()
 
+    # Force CPU mode if requested
+    if args.force_cpu:
+        os.environ["FORCE_GPU"] = "0"
+        logger.info("Forcing CPU mode as requested")
+
     try:
         output_path = train(
             args.config,
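
A hypothetical invocation of the new flag (the config filename is illustrative; a `--config` argument is assumed, consistent with `args.config` above):

    # python run_cloud_training.py --config config.json --force_cpu

One caveat: `--force_cpu` sets `FORCE_GPU=0`, but `is_cuda_fully_available()` only consults `FORCE_GPU` after `is_running_in_space()` returns True, so outside a Space the flag has no effect on device selection.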
 