George-API committed on
Commit 0349da5 · verified · 1 parent: d1150e7

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +119 -49
run_cloud_training.py CHANGED
@@ -5,7 +5,7 @@ Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
  - Optimized for L40S GPU
  - Works with pre-tokenized datasets
  - Research training only (no inference)
- - Added CPU fallback support for Hugging Face Spaces
+ - CLOUD BASED TRAINING - Hugging Face Spaces
 """
 
 import os
@@ -24,6 +24,9 @@ from huggingface_hub import HfApi, upload_folder
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
+# Force GPU mode in Space if we're using a pre-quantized model
+os.environ["FORCE_GPU"] = "1"
+
 # Default dataset with proper namespace
 DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 
@@ -36,45 +39,77 @@ def is_running_in_space():
     """Check if we're running in a Hugging Face Space"""
     return os.environ.get("SPACE_ID") is not None
 
+# Check if a model is pre-quantized (4-bit or 8-bit)
+def is_model_pre_quantized(model_name):
+    """Check if model is already pre-quantized based on name"""
+    pre_quantized_keywords = ["bnb-4bit", "4bit", "8bit", "quantized", "unsloth"]
+    return any(keyword in model_name.lower() for keyword in pre_quantized_keywords)
+
+# Check if GPU is available
+def is_gpu_available():
+    """Simple check if CUDA is available according to PyTorch"""
+    return torch.cuda.is_available()
+
 # Check if fully compatible CUDA is available for training
-def is_cuda_fully_available():
+def is_cuda_fully_available(model_name):
     """
     Check if CUDA is fully available for training with bitsandbytes.
     More strict than torch.cuda.is_available() - requires full GPU compatibility.
     """
-    # If running in HF Space, default to CPU mode unless explicitly overridden
+    # If model is pre-quantized and we're in a Space with GPU selected, trust it
+    if is_running_in_space() and is_model_pre_quantized(model_name) and is_gpu_available():
+        logger.info("Pre-quantized model detected with GPU in Hugging Face Space - using GPU mode")
+        return True
+
+    # For non-Space environments, or non-pre-quantized models, do detailed checks
+
+    # If FORCE_GPU is set, trust that
+    if os.environ.get("FORCE_GPU") == "1":
+        logger.info("GPU mode forced by environment variable")
+        return True
+
+    # If running in Space and FORCE_GPU not explicitly set, be cautious
     if is_running_in_space() and os.environ.get("FORCE_GPU") != "1":
-        logger.warning("Running in Hugging Face Space - defaulting to CPU mode for stability")
-        return False
+        # Check if CUDA is actually available
+        if is_gpu_available():
+            logger.info("GPU detected in Hugging Face Space")
+            return True
+        else:
+            logger.warning("No GPU detected in Hugging Face Space despite hardware selection")
+            return False
 
     # If CUDA is not available according to PyTorch, we definitely can't use it
-    if not torch.cuda.is_available():
+    if not is_gpu_available():
        logger.warning("CUDA not available according to PyTorch")
        return False
 
-    # Check if bitsandbytes is properly installed and compatible with our GPU
-    try:
-        import bitsandbytes as bnb
-        logger.info("BitsAndBytes package is installed")
-
-        # Try to create a dummy 4-bit computation to verify compatibility
+    # Only test bitsandbytes if necessary (not for pre-quantized models)
+    if not is_model_pre_quantized(model_name):
         try:
-            dummy = torch.zeros(1, device="cuda")
-            a = bnb.nn.Linear4bit(1, 1)
-            a.to(device="cuda")
-            result = a(dummy)
-            logger.info("BitsAndBytes with CUDA is working correctly")
-            return True
+            import bitsandbytes as bnb
+            logger.info("BitsAndBytes package is installed")
+
+            # Try to create a dummy 4-bit computation to verify compatibility
+            try:
+                dummy = torch.zeros(1, device="cuda")
+                a = bnb.nn.Linear4bit(1, 1)
+                a.to(device="cuda")
+                result = a(dummy)
+                logger.info("BitsAndBytes with CUDA is working correctly")
+                return True
+            except Exception as e:
+                logger.warning(f"BitsAndBytes CUDA compatibility test failed: {str(e)}")
+                return False
+
+        except ImportError:
+            logger.warning("BitsAndBytes package not installed - cannot use 4-bit quantization")
+            return False
         except Exception as e:
-            logger.warning(f"BitsAndBytes CUDA compatibility test failed: {str(e)}")
+            logger.warning(f"Unexpected error checking BitsAndBytes: {str(e)}")
             return False
-
-    except ImportError:
-        logger.warning("BitsAndBytes package not installed - cannot use 4-bit quantization")
-        return False
-    except Exception as e:
-        logger.warning(f"Unexpected error checking BitsAndBytes: {str(e)}")
-        return False
+
+    # For pre-quantized models without bitsandbytes test
+    return is_gpu_available()
 
 # Create a marker file to indicate training is active
 def create_training_marker(output_dir):
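The new helpers above drive every GPU/CPU decision later in the script off the model name alone. A standalone sketch of the same keyword check, with example model ids that are illustrative only and not part of the commit:

    # Sketch of the keyword check used by is_model_pre_quantized (same logic, standalone)
    pre_quantized_keywords = ["bnb-4bit", "4bit", "8bit", "quantized", "unsloth"]

    def looks_pre_quantized(model_name: str) -> bool:
        return any(keyword in model_name.lower() for keyword in pre_quantized_keywords)

    assert looks_pre_quantized("unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit")  # matches "unsloth", "bnb-4bit"
    assert not looks_pre_quantized("deepseek-ai/DeepSeek-R1-Distill-Qwen-14B")           # no quantization keyword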
@@ -345,14 +380,19 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Load and prepare dataset with proper sorting
     dataset = load_and_prepare_dataset(dataset_name, config)
 
-    # Determine if we can use CUDA with bitsandbytes
-    can_use_4bit = is_cuda_fully_available()
-
     # Load model settings
     original_model_name = model_config.get("model_name_or_path")
+
+    # Special handling for pre-quantized models like unsloth models
+    is_pre_quantized = is_model_pre_quantized(original_model_name)
+    if is_pre_quantized:
+        logger.info(f"Detected pre-quantized model: {original_model_name}")
+
+    # Determine if we can use CUDA with bitsandbytes
+    can_use_4bit = is_cuda_fully_available(original_model_name)
 
-    # For CPU mode, use a smaller model
-    if not can_use_4bit and is_running_in_space():
+    # For CPU mode, use a smaller model (unless pre-quantized)
+    if not can_use_4bit and is_running_in_space() and not is_pre_quantized:
         model_name = get_small_model_name(original_model_name)
         logger.warning(f"Using smaller model {model_name} in CPU mode for Hugging Face Space")
     else:
@@ -372,17 +412,31 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     quant_config = config.get("quantization_config", {})
 
     # Determine if we should use 4-bit quantization
-    if can_use_4bit and quant_config.get("load_in_4bit", True):
+    # Pre-quantized models always use their built-in quantization
+    if is_pre_quantized:
+        use_4bit = True
+        logger.info("Using pre-quantized model with built-in quantization")
+    elif can_use_4bit and quant_config.get("load_in_4bit", True):
         use_4bit = True
         logger.info("Using 4-bit quantization with CUDA")
     else:
         use_4bit = False
         logger.warning("Using CPU mode without quantization")
 
-    # Create model with proper configuration
-    logger.info(f"Loading model (4-bit quantization: {use_4bit})")
-
-    if use_4bit:
+    # For pre-quantized models, always use device_map="auto"
+    if is_pre_quantized and is_gpu_available():
+        logger.info("Loading pre-quantized model with GPU support")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map="auto",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            trust_remote_code=True,
+            use_cache=model_config.get("use_cache", False)
+        )
+    # Create model with proper configuration for non-pre-quantized models
+    elif use_4bit and not is_pre_quantized:
+        logger.info(f"Loading model with 4-bit quantization")
+
         # Create quantization config for GPU
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
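The hunk above ends inside the BitsAndBytesConfig(...) call. For reference, a typical 4-bit load with transformers looks roughly like the sketch below; the quant-type and compute-dtype values are common defaults, not values taken from this commit or the repo's config:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # Sketch of a typical 4-bit quantized load (values are common defaults, not from this commit)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"  # example id; the script resolves this from its config
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
    )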
@@ -441,7 +495,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     logger.info("Successfully applied LoRA")
 
     # Always use minimal batch size for HF Space CPU
-    if is_running_in_space() and not can_use_4bit:
+    if is_running_in_space() and not can_use_4bit and not is_pre_quantized:
         per_device_train_batch_size = 1
         logger.warning("Using minimal batch size for CPU training in Hugging Face Space")
     else:
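The batch-size branch above combines with gradient accumulation set further down; the effective batch size per optimizer step works out as in this sketch (the numbers are illustrative, assuming a single L40S, and are not read from the repo's config):

    # Effective batch size arithmetic (illustrative values)
    per_device_train_batch_size = 2   # example value; the script forces 1 in CPU mode
    gradient_accumulation_steps = 4   # the script's default when the config omits it
    num_devices = 1                   # a single L40S GPU
    effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_devices
    print(effective_batch_size)       # 8 sequences per optimizer step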
@@ -463,12 +517,28 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         per_device_train_batch_size = 1
         logger.warning("No GPU detected - using minimal batch size for CPU training")
 
-    # For Space CPU training mode, use minimal epochs
-    if is_running_in_space() and not can_use_4bit:
-        num_train_epochs = 1
-        logger.warning("Reducing to 1 epoch for CPU training in Space")
-    else:
+    # Use full training parameters for pre-quantized models or GPU mode
+    if is_pre_quantized or can_use_4bit or not is_running_in_space():
         num_train_epochs = training_config.get("num_train_epochs", 3)
+        gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 4)
+        fp16 = torch.cuda.is_available() and hardware_config.get("fp16", True)
+        bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
+        gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True)
+        dataloader_workers = training_config.get("dataloader_num_workers", 4)
+        evaluation_strategy = training_config.get("evaluation_strategy", "steps")
+        load_best_model_at_end = training_config.get("load_best_model_at_end", True)
+        logger.info("Using full training parameters for GPU mode")
+    else:
+        # For Space CPU training mode, use minimal parameters
+        num_train_epochs = 1
+        gradient_accumulation_steps = 1
+        fp16 = False
+        bf16 = False
+        gradient_checkpointing = False
+        dataloader_workers = 0
+        evaluation_strategy = "no"
+        load_best_model_at_end = False
+        logger.warning("Using minimal parameters for CPU training in Space")
 
     # Configure reporting backends
     reports = training_config.get("report_to", ["tensorboard"])
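The .get() calls above imply the rough shape of the hardware and training sections of the config the script loads. A guess at that layout as Python dicts; the key names come from the code, but the grouping and values only mirror the in-code defaults and may not match the repo's actual config file:

    # Assumed shape of the config sections read above (values mirror the in-code defaults)
    hardware_config = {
        "fp16": True,
        "bf16": False,
        "gradient_checkpointing": True,
    }
    training_config = {
        "num_train_epochs": 3,
        "gradient_accumulation_steps": 4,
        "dataloader_num_workers": 4,
        "evaluation_strategy": "steps",
        "load_best_model_at_end": True,
    }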
@@ -479,26 +549,26 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         output_dir=output_dir,
         num_train_epochs=num_train_epochs,
         per_device_train_batch_size=per_device_train_batch_size,
-        gradient_accumulation_steps=training_config.get("gradient_accumulation_steps", 4),
+        gradient_accumulation_steps=gradient_accumulation_steps,
         learning_rate=training_config.get("learning_rate", 2e-5),
         lr_scheduler_type=training_config.get("lr_scheduler_type", "cosine"),
         warmup_ratio=training_config.get("warmup_ratio", 0.03),
         weight_decay=training_config.get("weight_decay", 0.01),
         optim=training_config.get("optim", "adamw_torch"),
-        fp16=False, # Disable for stability
-        bf16=False, # Disable for stability
+        fp16=fp16,
+        bf16=bf16,
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        evaluation_strategy="no", # Simplified for Space
-        load_best_model_at_end=False, # Simplified for Space
+        evaluation_strategy=evaluation_strategy,
+        load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
-        gradient_checkpointing=False, # Disable for stability
-        dataloader_num_workers=0 # Simplified for Space
+        gradient_checkpointing=gradient_checkpointing,
+        dataloader_num_workers=dataloader_workers
     )
 
     # Create trainer with pre-tokenized collator
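The unchanged code that follows builds the trainer from these arguments. With a pre-tokenized dataset the collator only has to stack existing tensors; a sketch of that wiring, using the script's model, training_args, and dataset objects and the stock transformers collator (this is not the script's actual collator):

    from transformers import Trainer, default_data_collator

    # Sketch: default_data_collator stacks pre-tokenized input_ids / attention_mask / labels
    # columns into batch tensors, so no tokenizer is needed at collation time.
    trainer = Trainer(
        model=model,                        # built earlier in train()
        args=training_args,
        train_dataset=dataset,
        data_collator=default_data_collator,
    )
    trainer.train()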
 