Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py CHANGED (+119 -49)
@@ -5,7 +5,7 @@ Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
 - Optimized for L40S GPU
 - Works with pre-tokenized datasets
 - Research training only (no inference)
-
+- CLOUD BASED TRAINING - Hugging Face Spaces
 """

 import os
@@ -24,6 +24,9 @@ from huggingface_hub import HfApi, upload_folder
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"

+# Force GPU mode in Space if we're using a pre-quantized model
+os.environ["FORCE_GPU"] = "1"
+
 # Default dataset with proper namespace
 DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"

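A quick aside on the FORCE_GPU flag added above: environment variables are plain strings, so the later checks compare against the literal "1". The snippet below is an illustrative sketch, not part of the commit itself.

import os

os.environ["FORCE_GPU"] = "1"
print(os.environ.get("FORCE_GPU") == "1")        # True -> later checks treat GPU mode as forced
print(os.environ.get("SPACE_ID") is not None)    # True only when running inside a Hugging Face Space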
@@ -36,45 +39,77 @@ def is_running_in_space():
     """Check if we're running in a Hugging Face Space"""
     return os.environ.get("SPACE_ID") is not None

+# Check if a model is pre-quantized (4-bit or 8-bit)
+def is_model_pre_quantized(model_name):
+    """Check if model is already pre-quantized based on name"""
+    pre_quantized_keywords = ["bnb-4bit", "4bit", "8bit", "quantized", "unsloth"]
+    return any(keyword in model_name.lower() for keyword in pre_quantized_keywords)
+
+# Check if GPU is available
+def is_gpu_available():
+    """Simple check if CUDA is available according to PyTorch"""
+    return torch.cuda.is_available()
+
 # Check if fully compatible CUDA is available for training
-def is_cuda_fully_available():
+def is_cuda_fully_available(model_name):
     """
     Check if CUDA is fully available for training with bitsandbytes.
     More strict than torch.cuda.is_available() - requires full GPU compatibility.
     """
-    # If
+    # If model is pre-quantized and we're in a Space with GPU selected, trust it
+    if is_running_in_space() and is_model_pre_quantized(model_name) and is_gpu_available():
+        logger.info("Pre-quantized model detected with GPU in Hugging Face Space - using GPU mode")
+        return True
+
+    # For non-Space environments, or non-pre-quantized models, do detailed checks
+
+    # If FORCE_GPU is set, trust that
+    if os.environ.get("FORCE_GPU") == "1":
+        logger.info("GPU mode forced by environment variable")
+        return True
+
+    # If running in Space and FORCE_GPU not explicitly set, be cautious
     if is_running_in_space() and os.environ.get("FORCE_GPU") != "1":
-
-
+        # Check if CUDA is actually available
+        if is_gpu_available():
+            logger.info("GPU detected in Hugging Face Space")
+            return True
+        else:
+            logger.warning("No GPU detected in Hugging Face Space despite hardware selection")
+            return False

     # If CUDA is not available according to PyTorch, we definitely can't use it
-    if not
+    if not is_gpu_available():
         logger.warning("CUDA not available according to PyTorch")
         return False

-    #
-
-    import bitsandbytes as bnb
-    logger.info("BitsAndBytes package is installed")
-
-    # Try to create a dummy 4-bit computation to verify compatibility
+    # Only test bitsandbytes if necessary (not for pre-quantized models)
+    if not is_model_pre_quantized(model_name):
         try:
-
-
-
-
-
-
+            import bitsandbytes as bnb
+            logger.info("BitsAndBytes package is installed")
+
+            # Try to create a dummy 4-bit computation to verify compatibility
+            try:
+                dummy = torch.zeros(1, device="cuda")
+                a = bnb.nn.Linear4bit(1, 1)
+                a.to(device="cuda")
+                result = a(dummy)
+                logger.info("BitsAndBytes with CUDA is working correctly")
+                return True
+            except Exception as e:
+                logger.warning(f"BitsAndBytes CUDA compatibility test failed: {str(e)}")
+                return False
+
+        except ImportError:
+            logger.warning("BitsAndBytes package not installed - cannot use 4-bit quantization")
+            return False
         except Exception as e:
-            logger.warning(f"
+            logger.warning(f"Unexpected error checking BitsAndBytes: {str(e)}")
             return False
-
-
-
-        return False
-    except Exception as e:
-        logger.warning(f"Unexpected error checking BitsAndBytes: {str(e)}")
-        return False
+
+    # For pre-quantized models without bitsandbytes test
+    return is_gpu_available()

 # Create a marker file to indicate training is active
 def create_training_marker(output_dir):
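For illustration, here is a standalone sketch of how the new keyword heuristic classifies model names. The helper body is copied from the diff above; the example model IDs are assumptions used only to show the expected results.

def is_model_pre_quantized(model_name):
    """Check if model is already pre-quantized based on name"""
    pre_quantized_keywords = ["bnb-4bit", "4bit", "8bit", "quantized", "unsloth"]
    return any(keyword in model_name.lower() for keyword in pre_quantized_keywords)

print(is_model_pre_quantized("unsloth/DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit"))  # True
print(is_model_pre_quantized("Qwen/Qwen2.5-14B"))                                       # False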
@@ -345,14 +380,19 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Load and prepare dataset with proper sorting
     dataset = load_and_prepare_dataset(dataset_name, config)

-    # Determine if we can use CUDA with bitsandbytes
-    can_use_4bit = is_cuda_fully_available()
-
     # Load model settings
     original_model_name = model_config.get("model_name_or_path")
+
+    # Special handling for pre-quantized models like unsloth models
+    is_pre_quantized = is_model_pre_quantized(original_model_name)
+    if is_pre_quantized:
+        logger.info(f"Detected pre-quantized model: {original_model_name}")
+
+    # Determine if we can use CUDA with bitsandbytes
+    can_use_4bit = is_cuda_fully_available(original_model_name)

-    # For CPU mode, use a smaller model
-    if not can_use_4bit and is_running_in_space():
+    # For CPU mode, use a smaller model (unless pre-quantized)
+    if not can_use_4bit and is_running_in_space() and not is_pre_quantized:
         model_name = get_small_model_name(original_model_name)
         logger.warning(f"Using smaller model {model_name} in CPU mode for Hugging Face Space")
     else:
@@ -372,17 +412,31 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     quant_config = config.get("quantization_config", {})

     # Determine if we should use 4-bit quantization
-
+    # Pre-quantized models always use their built-in quantization
+    if is_pre_quantized:
+        use_4bit = True
+        logger.info("Using pre-quantized model with built-in quantization")
+    elif can_use_4bit and quant_config.get("load_in_4bit", True):
         use_4bit = True
         logger.info("Using 4-bit quantization with CUDA")
     else:
         use_4bit = False
         logger.warning("Using CPU mode without quantization")

-    #
-
-
-
+    # For pre-quantized models, always use device_map="auto"
+    if is_pre_quantized and is_gpu_available():
+        logger.info("Loading pre-quantized model with GPU support")
+        model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            device_map="auto",
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+            trust_remote_code=True,
+            use_cache=model_config.get("use_cache", False)
+        )
+    # Create model with proper configuration for non-pre-quantized models
+    elif use_4bit and not is_pre_quantized:
+        logger.info(f"Loading model with 4-bit quantization")
+
         # Create quantization config for GPU
         bnb_config = BitsAndBytesConfig(
             load_in_4bit=True,
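The elif branch above goes on to build a BitsAndBytesConfig (the diff only shows its first argument). As a hedged sketch, a typical 4-bit config for the non-pre-quantized path looks like the following; the remaining field values are illustrative defaults, not necessarily what this script sets.

import torch
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",               # assumed; a common choice for QLoRA-style training
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)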
@@ -441,7 +495,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     logger.info("Successfully applied LoRA")

     # Always use minimal batch size for HF Space CPU
-    if is_running_in_space() and not can_use_4bit:
+    if is_running_in_space() and not can_use_4bit and not is_pre_quantized:
         per_device_train_batch_size = 1
         logger.warning("Using minimal batch size for CPU training in Hugging Face Space")
     else:
@@ -463,12 +517,28 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         per_device_train_batch_size = 1
         logger.warning("No GPU detected - using minimal batch size for CPU training")

-    #
-    if
-        num_train_epochs = 1
-        logger.warning("Reducing to 1 epoch for CPU training in Space")
-    else:
+    # Use full training parameters for pre-quantized models or GPU mode
+    if is_pre_quantized or can_use_4bit or not is_running_in_space():
         num_train_epochs = training_config.get("num_train_epochs", 3)
+        gradient_accumulation_steps = training_config.get("gradient_accumulation_steps", 4)
+        fp16 = torch.cuda.is_available() and hardware_config.get("fp16", True)
+        bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
+        gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True)
+        dataloader_workers = training_config.get("dataloader_num_workers", 4)
+        evaluation_strategy = training_config.get("evaluation_strategy", "steps")
+        load_best_model_at_end = training_config.get("load_best_model_at_end", True)
+        logger.info("Using full training parameters for GPU mode")
+    else:
+        # For Space CPU training mode, use minimal parameters
+        num_train_epochs = 1
+        gradient_accumulation_steps = 1
+        fp16 = False
+        bf16 = False
+        gradient_checkpointing = False
+        dataloader_workers = 0
+        evaluation_strategy = "no"
+        load_best_model_at_end = False
+        logger.warning("Using minimal parameters for CPU training in Space")

     # Configure reporting backends
     reports = training_config.get("report_to", ["tensorboard"])
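The training_config.get(...) and hardware_config.get(...) lookups above imply a config file roughly shaped like the dictionaries below. This is a hypothetical example of the expected keys and defaults, not the repository's actual config.

training_config = {
    "num_train_epochs": 3,
    "gradient_accumulation_steps": 4,
    "learning_rate": 2e-5,
    "lr_scheduler_type": "cosine",
    "dataloader_num_workers": 4,
    "evaluation_strategy": "steps",
    "load_best_model_at_end": True,
    "report_to": ["tensorboard"],
}
hardware_config = {
    "fp16": True,
    "bf16": False,
    "gradient_checkpointing": True,
}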
@@ -479,26 +549,26 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         output_dir=output_dir,
         num_train_epochs=num_train_epochs,
         per_device_train_batch_size=per_device_train_batch_size,
-        gradient_accumulation_steps=
+        gradient_accumulation_steps=gradient_accumulation_steps,
         learning_rate=training_config.get("learning_rate", 2e-5),
         lr_scheduler_type=training_config.get("lr_scheduler_type", "cosine"),
         warmup_ratio=training_config.get("warmup_ratio", 0.03),
         weight_decay=training_config.get("weight_decay", 0.01),
         optim=training_config.get("optim", "adamw_torch"),
-        fp16=
-        bf16=
+        fp16=fp16,
+        bf16=bf16,
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        evaluation_strategy=
-        load_best_model_at_end=
+        evaluation_strategy=evaluation_strategy,
+        load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
-        gradient_checkpointing=
-        dataloader_num_workers=
+        gradient_checkpointing=gradient_checkpointing,
+        dataloader_num_workers=dataloader_workers
     )

     # Create trainer with pre-tokenized collator
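One practical note on the arguments above: load_best_model_at_end=True only takes effect when evaluation actually runs, which is why the CPU fallback pairs evaluation_strategy="no" with load_best_model_at_end=False. A minimal standalone sketch of that fallback configuration (output path and values assumed, not taken from the repository):

from transformers import TrainingArguments

cpu_fallback_args = TrainingArguments(
    output_dir="./results",            # placeholder path
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    fp16=False,
    bf16=False,
    evaluation_strategy="no",
    load_best_model_at_end=False,
    gradient_checkpointing=False,
    dataloader_num_workers=0,
    report_to=["tensorboard"],
)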