George-API committed on
Commit b571ee2 · verified · 1 Parent(s): 3e18b42

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +76 -22
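
The commit message indicates the file was pushed programmatically. As a rough sketch, an upload like this one can be reproduced with huggingface_hub's HfApi; the repo id and token handling below are illustrative assumptions, not taken from this commit:

    from huggingface_hub import HfApi

    # Upload a single file to an existing repo; assumes HF_TOKEN is set
    # in the environment (or pass token=... explicitly).
    api = HfApi()
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",
        path_in_repo="run_cloud_training.py",
        repo_id="George-API/your-space",  # illustrative repo id
        repo_type="space",                # assumption: the target is a Space
    )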
run_cloud_training.py CHANGED
@@ -5,6 +5,7 @@ Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
 - Optimized for L40S GPU
 - Works with pre-tokenized datasets
 - Research training only (no inference)
+- Added CPU fallback support for Hugging Face Spaces
 """
 
 import os
@@ -30,6 +31,27 @@ DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# Check if CUDA is available for bitsandbytes
+def is_bnb_available():
+    """Check if bitsandbytes with CUDA is available"""
+    try:
+        import bitsandbytes as bnb
+        if torch.cuda.is_available():
+            # Instantiate a small 4-bit layer and move it to the GPU to verify the CUDA kernels work
+            try:
+                _ = bnb.nn.Linear4bit(16, 16, compute_dtype=torch.float16).to("cuda")
+                logger.info("BitsAndBytes with CUDA support is available")
+                return True
+            except Exception as e:
+                logger.warning(f"CUDA available but bitsandbytes test failed: {e}")
+                return False
+        else:
+            logger.warning("CUDA not available for bitsandbytes")
+            return False
+    except (ImportError, RuntimeError) as e:
+        logger.warning(f"Error checking bitsandbytes: {e}")
+        return False
+
 # Create a marker file to indicate training is active
 def create_training_marker(output_dir):
     os.makedirs(output_dir, exist_ok=True)
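
Note: the probe above is stricter than a plain import check on purpose; bitsandbytes imports cleanly on CPU-only machines and only fails once its CUDA kernels are exercised, which moving a Linear4bit module to the GPU does. Recent transformers releases also expose a lighter-weight check that could serve as a coarse first pass (version availability is an assumption, and it does not exercise the kernels):

    from transformers.utils import is_bitsandbytes_available

    # Coarse check: import plus CUDA visibility, no kernel test.
    use_bnb = is_bitsandbytes_available()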
@@ -300,26 +322,57 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     )
     tokenizer.pad_token = tokenizer.eos_token
 
-    # Create quantization config
+    # Get quantization config
     quant_config = config.get("quantization_config", {})
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=quant_config.get("load_in_4bit", True),
-        bnb_4bit_compute_dtype=torch.float16,
-        bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
-        bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
-    )
+
+    # Check if bitsandbytes with CUDA is available
+    use_4bit = is_bnb_available() and quant_config.get("load_in_4bit", True)
 
     # Create model with proper configuration
-    logger.info("Loading pre-quantized model")
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        quantization_config=bnb_config,
-        device_map="auto",
-        torch_dtype=torch.float16,
-        trust_remote_code=True,
-        use_cache=model_config.get("use_cache", False),
-        attn_implementation=hardware_config.get("attn_implementation", "eager")
-    )
+    logger.info(f"Loading model (4-bit quantization: {use_4bit})")
+
+    if use_4bit:
+        # Create quantization config for GPU
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_quant_type=quant_config.get("bnb_4bit_quant_type", "nf4"),
+            bnb_4bit_use_double_quant=quant_config.get("bnb_4bit_use_double_quant", True)
+        )
+
+        # Load 4-bit quantized model for GPU
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=bnb_config,
+            device_map="auto",
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+            use_cache=model_config.get("use_cache", False),
+            attn_implementation=hardware_config.get("attn_implementation", "eager")
+        )
+    else:
+        # CPU fallback (or non-quantized GPU) mode
+        logger.warning("Loading model in CPU fallback mode (no 4-bit quantization)")
+
+        # Determine best dtype based on available hardware
+        if torch.cuda.is_available():
+            dtype = torch.float16
+            device_map = "auto"
+            logger.info("Using GPU with fp16")
+        else:
+            dtype = torch.float32
+            device_map = "cpu"
+            logger.info("Using CPU with fp32")
+
+        # Load model without quantization
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map=device_map,
+            torch_dtype=dtype,
+            trust_remote_code=True,
+            use_cache=model_config.get("use_cache", False),
+            low_cpu_mem_usage=True
+        )
 
     # Apply rope scaling if configured
     if "rope_scaling" in model_config:
@@ -342,7 +395,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     model = get_peft_model(model, lora_config_obj)
     logger.info("Successfully applied LoRA")
 
-    # Check for L40S GPU and optimize batch size
+    # Determine batch size based on available hardware
    if torch.cuda.is_available():
        gpu_info = torch.cuda.get_device_properties(0)
        logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
@@ -356,8 +409,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         per_device_train_batch_size = 2
         logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
     else:
+        # Use minimal batch size for CPU
         per_device_train_batch_size = 1
-        logger.warning("No GPU detected - using minimal batch size")
+        logger.warning("No GPU detected - using minimal batch size for CPU training")
 
     # Configure reporting backends
     reports = training_config.get("report_to", ["tensorboard"])
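
With the CPU path pinned to a per-device batch size of 1, gradient accumulation is the standard way to preserve the effective batch size; gradient_accumulation_steps is a stock TrainingArguments field, though this diff does not show it being set:

    # Effective batch = per_device_train_batch_size * gradient_accumulation_steps.
    per_device_train_batch_size = 1
    gradient_accumulation_steps = 16  # illustrative value
    effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps  # 16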
@@ -374,8 +428,8 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         warmup_ratio=training_config.get("warmup_ratio", 0.03),
         weight_decay=training_config.get("weight_decay", 0.01),
         optim=training_config.get("optim", "adamw_torch"),
-        fp16=hardware_config.get("fp16", True),
-        bf16=hardware_config.get("bf16", False),
+        fp16=torch.cuda.is_available() and hardware_config.get("fp16", True),
+        bf16=torch.cuda.is_available() and hardware_config.get("bf16", False),
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
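
Gating fp16/bf16 on torch.cuda.is_available() keeps TrainingArguments from rejecting mixed-precision flags on a CPU-only host at construction time. One extra wrinkle the diff does not cover: CUDA availability alone does not guarantee bf16 support, which torch can report directly:

    import torch

    # fp16 needs any CUDA device; bf16 additionally needs Ampere-class or newer hardware.
    fp16_ok = torch.cuda.is_available()
    bf16_ok = torch.cuda.is_available() and torch.cuda.is_bf16_supported()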
@@ -387,7 +441,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
-        gradient_checkpointing=hardware_config.get("gradient_checkpointing", True),
+        gradient_checkpointing=torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True),
         dataloader_num_workers=training_config.get("dataloader_num_workers", 4)
     )
 
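Taken together, the changes let the same entry point run on an L40S Space or fall back to CPU-only hardware. A hypothetical invocation, using the signature from the hunk headers and the DEFAULT_DATASET constant visible above (paths are illustrative):

    # Hypothetical call; parameter names and the dataset id come from the diff.
    train(
        config_path="transformers_config.json",  # illustrative path
        dataset_name="George-API/phi4-cognitive-dataset",  # DEFAULT_DATASET
        output_dir="./results",  # illustrative
    )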