George-API committed
Commit d1150e7 · verified · 1 Parent(s): af30cf6

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py +118 -55
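
The commit message says the file was pushed with the `huggingface_hub` client. A minimal sketch of such an upload (the repo id, token handling, and exact call site are illustrative assumptions, not taken from this commit):

    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token from HF_TOKEN or the cached login
    api.upload_file(
        path_or_fileobj="run_cloud_training.py",  # local file to push
        path_in_repo="run_cloud_training.py",     # destination path in the repo
        repo_id="George-API/example-repo",        # hypothetical repo id
        commit_message="Upload run_cloud_training.py with huggingface_hub",
    )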
run_cloud_training.py CHANGED
@@ -31,25 +31,49 @@ DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
-# Check if CUDA is available for bitsandbytes
-def is_bnb_available():
-    """Check if bitsandbytes with CUDA is available"""
+# Determine if we're running in HF Space
+def is_running_in_space():
+    """Check if we're running in a Hugging Face Space"""
+    return os.environ.get("SPACE_ID") is not None
+
+# Check if fully compatible CUDA is available for training
+def is_cuda_fully_available():
+    """
+    Check if CUDA is fully available for training with bitsandbytes.
+    More strict than torch.cuda.is_available() - requires full GPU compatibility.
+    """
+    # If running in HF Space, default to CPU mode unless explicitly overridden
+    if is_running_in_space() and os.environ.get("FORCE_GPU") != "1":
+        logger.warning("Running in Hugging Face Space - defaulting to CPU mode for stability")
+        return False
+
+    # If CUDA is not available according to PyTorch, we definitely can't use it
+    if not torch.cuda.is_available():
+        logger.warning("CUDA not available according to PyTorch")
+        return False
+
+    # Check if bitsandbytes is properly installed and compatible with our GPU
     try:
         import bitsandbytes as bnb
-        if torch.cuda.is_available():
-            # Try to create a dummy 4-bit tensor to see if it works
-            try:
-                _ = torch.zeros(1, dtype=torch.float16, device="cuda").to(bnb.nn.Linear4bit)
-                logger.info("BitsAndBytes with CUDA support is available")
-                return True
-            except Exception as e:
-                logger.warning(f"CUDA available but bitsandbytes test failed: {e}")
-                return False
-        else:
-            logger.warning("CUDA not available for bitsandbytes")
+        logger.info("BitsAndBytes package is installed")
+
+        # Try to create a dummy 4-bit computation to verify compatibility
+        try:
+            dummy = torch.zeros(1, device="cuda")
+            a = bnb.nn.Linear4bit(1, 1)
+            a.to(device="cuda")
+            result = a(dummy)
+            logger.info("BitsAndBytes with CUDA is working correctly")
+            return True
+        except Exception as e:
+            logger.warning(f"BitsAndBytes CUDA compatibility test failed: {str(e)}")
             return False
-    except (ImportError, RuntimeError) as e:
-        logger.warning(f"Error checking bitsandbytes: {e}")
+
+    except ImportError:
+        logger.warning("BitsAndBytes package not installed - cannot use 4-bit quantization")
+        return False
+    except Exception as e:
+        logger.warning(f"Unexpected error checking BitsAndBytes: {str(e)}")
         return False
 
 # Create a marker file to indicate training is active
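
The new check is gated on the Space environment before any CUDA probing. A quick sketch of how the gate behaves (the `SPACE_ID` value is illustrative; Spaces set that variable automatically):

    import os

    os.environ["SPACE_ID"] = "user/demo-space"  # simulate running inside a Space
    os.environ.pop("FORCE_GPU", None)
    assert is_running_in_space()
    assert not is_cuda_fully_available()  # Space without FORCE_GPU=1 -> CPU mode

    os.environ["FORCE_GPU"] = "1"  # explicit opt-in; only then does the real torch/bitsandbytes probe run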
@@ -282,6 +306,17 @@ def load_and_prepare_dataset(dataset_name, config):
         logger.error(f"Error loading dataset: {str(e)}")
         raise
 
+# Load a simpler, smaller model for CPU mode
+def get_small_model_name(original_model_name):
+    """Get a smaller model name for CPU mode"""
+    # If using DeepSeek-R1-Distill-Qwen-14B, use a smaller model
+    if "DeepSeek" in original_model_name and "14B" in original_model_name:
+        logger.info("Using smaller model for CPU mode")
+        return "distilgpt2"  # Much smaller model
+
+    # Otherwise just use the original model
+    return original_model_name
+
 # Main training function
 def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_name=None, private_repo=False):
     # Load environment variables
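
`get_small_model_name` is a plain substring check, so only the one model family named in its comment is swapped; a quick sketch of the behavior:

    get_small_model_name("DeepSeek-R1-Distill-Qwen-14B")  # -> "distilgpt2"
    get_small_model_name("any/other-model")               # -> "any/other-model" (unchanged)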
@@ -310,8 +345,19 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Load and prepare dataset with proper sorting
     dataset = load_and_prepare_dataset(dataset_name, config)
 
+    # Determine if we can use CUDA with bitsandbytes
+    can_use_4bit = is_cuda_fully_available()
+
     # Load model settings
-    model_name = model_config.get("model_name_or_path")
+    original_model_name = model_config.get("model_name_or_path")
+
+    # For CPU mode, use a smaller model
+    if not can_use_4bit and is_running_in_space():
+        model_name = get_small_model_name(original_model_name)
+        logger.warning(f"Using smaller model {model_name} in CPU mode for Hugging Face Space")
+    else:
+        model_name = original_model_name
+
     logger.info(f"Using model: {model_name}")
 
     # Initialize tokenizer
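
The `model_config` dict here comes from the JSON config loaded earlier in `train()`; a hypothetical fragment showing the one key this hunk reads (value illustrative):

    model_config = {
        # Any id containing both "DeepSeek" and "14B" triggers the distilgpt2 fallback above
        "model_name_or_path": "DeepSeek-R1-Distill-Qwen-14B",
    }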
@@ -325,8 +371,13 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     # Get quantization config
     quant_config = config.get("quantization_config", {})
 
-    # Check if bitsandbytes with CUDA is available
-    use_4bit = is_bnb_available() and quant_config.get("load_in_4bit", True)
+    # Determine if we should use 4-bit quantization
+    if can_use_4bit and quant_config.get("load_in_4bit", True):
+        use_4bit = True
+        logger.info("Using 4-bit quantization with CUDA")
+    else:
+        use_4bit = False
+        logger.warning("Using CPU mode without quantization")
 
     # Create model with proper configuration
     logger.info(f"Loading model (4-bit quantization: {use_4bit})")
@@ -354,15 +405,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         # CPU fallback (or non-quantized GPU) mode
         logger.warning("Loading model in CPU fallback mode (no 4-bit quantization)")
 
-        # Determine best dtype based on available hardware
-        if torch.cuda.is_available():
-            dtype = torch.float16
-            device_map = "auto"
-            logger.info("Using GPU with fp16")
-        else:
-            dtype = torch.float32
-            device_map = "cpu"
-            logger.info("Using CPU with fp32")
+        # Force CPU (safest option in HF Spaces)
+        device_map = "cpu"
+        dtype = torch.float32
+        logger.info("Forcing CPU mode for stability")
 
         # Load model without quantization
         model = AutoModelForCausalLM.from_pretrained(
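
The context line above is only the start of the non-quantized load; the diff truncates the argument list, but from the surrounding context lines it plausibly reads as follows (a sketch under that assumption, not the verbatim file):

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=dtype,      # torch.float32 in the forced-CPU path
        device_map=device_map,  # "cpu"
        low_cpu_mem_usage=True  # visible as context in the next hunk
    )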
@@ -374,11 +420,10 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
            low_cpu_mem_usage=True
        )
 
-    # Apply rope scaling if configured
-    if "rope_scaling" in model_config:
+    # Apply rope scaling if configured and available
+    if "rope_scaling" in model_config and hasattr(model.config, "rope_scaling"):
         logger.info(f"Applying rope scaling: {model_config['rope_scaling']}")
-        if hasattr(model.config, "rope_scaling"):
-            model.config.rope_scaling = model_config["rope_scaling"]
+        model.config.rope_scaling = model_config["rope_scaling"]
 
     # Create LoRA config
     logger.info("Creating LoRA configuration")
@@ -395,23 +440,35 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     model = get_peft_model(model, lora_config_obj)
     logger.info("Successfully applied LoRA")
 
-    # Determine batch size based on available hardware
-    if torch.cuda.is_available():
-        gpu_info = torch.cuda.get_device_properties(0)
-        logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
-
-        # Check if it's an L40S or high-memory GPU
-        if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
-            logger.info("Detected L40S GPU - optimizing for high-memory GPU")
-            per_device_train_batch_size = training_config.get("per_device_train_batch_size", 4)
+    # Always use minimal batch size for HF Space CPU
+    if is_running_in_space() and not can_use_4bit:
+        per_device_train_batch_size = 1
+        logger.warning("Using minimal batch size for CPU training in Hugging Face Space")
+    else:
+        # Determine batch size based on available hardware
+        if torch.cuda.is_available():
+            gpu_info = torch.cuda.get_device_properties(0)
+            logger.info(f"GPU: {gpu_info.name}, VRAM: {gpu_info.total_memory / 1e9:.2f} GB")
+
+            # Check if it's an L40S or high-memory GPU
+            if "L40S" in gpu_info.name or gpu_info.total_memory > 40e9:
+                logger.info("Detected L40S GPU - optimizing for high-memory GPU")
+                per_device_train_batch_size = training_config.get("per_device_train_batch_size", 4)
+            else:
+                # Use a smaller batch size for other GPUs
+                per_device_train_batch_size = 2
+                logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
         else:
-            # Use a smaller batch size for other GPUs
-            per_device_train_batch_size = 2
-            logger.info(f"Using conservative batch size for non-L40S GPU: {per_device_train_batch_size}")
+            # Use minimal batch size for CPU
+            per_device_train_batch_size = 1
+            logger.warning("No GPU detected - using minimal batch size for CPU training")
+
+    # For Space CPU training mode, use minimal epochs
+    if is_running_in_space() and not can_use_4bit:
+        num_train_epochs = 1
+        logger.warning("Reducing to 1 epoch for CPU training in Space")
     else:
-        # Use minimal batch size for CPU
-        per_device_train_batch_size = 1
-        logger.warning("No GPU detected - using minimal batch size for CPU training")
+        num_train_epochs = training_config.get("num_train_epochs", 3)
 
     # Configure reporting backends
     reports = training_config.get("report_to", ["tensorboard"])
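
Taken together, the two new guards resolve to the following settings (derived from the branches above):

    # Space CPU (no usable 4-bit):  per_device_train_batch_size=1, num_train_epochs=1
    # L40S or >40 GB GPU:           batch size from config (default 4), epochs from config (default 3)
    # other CUDA GPU:               batch size 2, epochs from config
    # non-Space CPU:                batch size 1, epochs from config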
@@ -420,7 +477,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     logger.info("Creating training arguments")
     training_args = TrainingArguments(
         output_dir=output_dir,
-        num_train_epochs=training_config.get("num_train_epochs", 3),
+        num_train_epochs=num_train_epochs,
         per_device_train_batch_size=per_device_train_batch_size,
         gradient_accumulation_steps=training_config.get("gradient_accumulation_steps", 4),
         learning_rate=training_config.get("learning_rate", 2e-5),
@@ -428,21 +485,20 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         warmup_ratio=training_config.get("warmup_ratio", 0.03),
         weight_decay=training_config.get("weight_decay", 0.01),
         optim=training_config.get("optim", "adamw_torch"),
-        fp16=torch.cuda.is_available() and hardware_config.get("fp16", True),
-        bf16=torch.cuda.is_available() and hardware_config.get("bf16", False),
+        fp16=False,  # Disable for stability
+        bf16=False,  # Disable for stability
         max_grad_norm=training_config.get("max_grad_norm", 0.3),
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-        evaluation_strategy=training_config.get("evaluation_strategy", "steps"),
-        eval_steps=training_config.get("eval_steps", 200),
-        load_best_model_at_end=training_config.get("load_best_model_at_end", True),
+        evaluation_strategy="no",  # Simplified for Space
+        load_best_model_at_end=False,  # Simplified for Space
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
         disable_tqdm=training_config.get("disable_tqdm", False),
         remove_unused_columns=False,
-        gradient_checkpointing=torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True),
-        dataloader_num_workers=training_config.get("dataloader_num_workers", 4)
+        gradient_checkpointing=False,  # Disable for stability
+        dataloader_num_workers=0  # Simplified for Space
     )
 
     # Create trainer with pre-tokenized collator
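
Note on the evaluation changes: with `evaluation_strategy="no"`, the removed `eval_steps` would be meaningless, and keeping `load_best_model_at_end=True` (the old default here) would make `TrainingArguments` raise a `ValueError`, since it requires the save and eval strategies to match; disabling both in the same change keeps the arguments self-consistent.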
@@ -510,9 +566,16 @@ if __name__ == "__main__":
                         help="Repository name for the model on Hugging Face Hub")
     parser.add_argument("--private_repo", action="store_true",
                         help="Make the Hugging Face Hub repository private")
+    parser.add_argument("--force_cpu", action="store_true",
+                        help="Force CPU mode even if CUDA is available")
 
     args = parser.parse_args()
 
+    # Force CPU mode if requested
+    if args.force_cpu:
+        os.environ["FORCE_GPU"] = "0"
+        logger.info("Forcing CPU mode as requested")
+
     try:
         output_path = train(
             args.config,
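
A hypothetical invocation of the new flag (the config filename is illustrative; a `--config` argument is assumed, consistent with `args.config` above):

    # python run_cloud_training.py --config config.json --force_cpu

One caveat: `--force_cpu` sets `FORCE_GPU=0`, but `is_cuda_fully_available()` only consults `FORCE_GPU` after `is_running_in_space()` returns True, so outside a Space the flag has no effect on device selection.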
 