George-API committed on
Commit 1faea13 · verified · 1 Parent(s): 0349da5

Upload run_cloud_training.py with huggingface_hub

Files changed (1)
  1. run_cloud_training.py +57 -9
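The upload itself is typically a single huggingface_hub call rather than a git push. A minimal sketch of that workflow is shown below; the repo_id and repo_type are placeholders, not details taken from this commit.

from huggingface_hub import HfApi

api = HfApi()
api.upload_file(
    path_or_fileobj="run_cloud_training.py",   # local file to upload
    path_in_repo="run_cloud_training.py",      # destination path in the repo
    repo_id="George-API/<space-name>",         # placeholder: the actual repo is not shown on this page
    repo_type="space",                         # assumption: the target is a Hugging Face Space
    commit_message="Upload run_cloud_training.py with huggingface_hub",
)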
run_cloud_training.py CHANGED
@@ -2,8 +2,7 @@

  """
  Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
- - Optimized for L40S GPU
- - Works with pre-tokenized datasets
+ - Optimized for L40S GPU with pre-tokenized datasets
  - Research training only (no inference)
  - CLOUD BASED TRAINING - Hugging Face Spaces
  """
@@ -13,6 +12,8 @@ import logging
  import json
  import torch
  import argparse
+ import shutil
+ from pathlib import Path
  from datasets import load_dataset
  from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, AutoConfig, BitsAndBytesConfig
  from transformers.data.data_collator import DataCollatorMixin
@@ -27,6 +28,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
  # Force GPU mode in Space if we're using a pre-quantized model
  os.environ["FORCE_GPU"] = "1"

+ # Create triton directory to avoid warning
+ os.makedirs(os.path.expanduser("~/.triton/autotune"), exist_ok=True)
+
  # Default dataset with proper namespace
  DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"

@@ -294,8 +298,43 @@ class PreTokenizedCollator(DataCollatorMixin):

          return batch

+ # Preprocess dataset to ensure all entries are pre-tokenized
+ def preprocess_dataset(dataset, tokenizer):
+     """Ensure dataset is fully pre-tokenized to avoid tokenization during training"""
+     logger.info("Pre-processing dataset to ensure all entries are tokenized")
+
+     def process_example(example):
+         # If already has input_ids as a list of integers, keep as is
+         if 'input_ids' in example and isinstance(example['input_ids'], list) and all(isinstance(x, int) for x in example['input_ids']):
+             return example
+
+         # If has conversations with a content field
+         if 'conversations' in example:
+             conversations = example['conversations']
+             if isinstance(conversations, list) and len(conversations) > 0:
+                 # If conversations has a content field, tokenize it
+                 if isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                     content = conversations[0]['content']
+                     if isinstance(content, str):
+                         example['input_ids'] = tokenizer.encode(content, add_special_tokens=False)
+                 return example
+
+         # For any other format, try to extract text and tokenize
+         text = None
+         if 'text' in example:
+             text = example['text']
+         elif 'content' in example:
+             text = example['content']
+
+         if text and isinstance(text, str):
+             example['input_ids'] = tokenizer.encode(text, add_special_tokens=False)
+
+         return example
+
+     return dataset.map(process_example)
+
  # Load and prepare dataset with proper sorting
- def load_and_prepare_dataset(dataset_name, config):
+ def load_and_prepare_dataset(dataset_name, config, tokenizer=None):
      """Load and prepare the dataset for fine-tuning with proper sorting"""
      # Use the default dataset if the provided one matches the default name without namespace
      if dataset_name == "phi4-cognitive-dataset":
@@ -323,6 +362,10 @@ def load_and_prepare_dataset(dataset_name, config):
      dataset_config = config.get("dataset_config", {})
      sort_field = dataset_config.get("sort_by_field", "prompt_number")

+     # Preprocess dataset to ensure all entries are pre-tokenized
+     if tokenizer is not None:
+         dataset = preprocess_dataset(dataset, tokenizer)
+
      # Sort in ascending order by specified field
      logger.info(f"Sorting dataset by {sort_field} in ascending order")
      dataset = dataset.sort(sort_field)
@@ -377,9 +420,6 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
      if dataset_name != DEFAULT_DATASET and "phi4-cognitive-dataset" in dataset_name:
          logger.warning(f"Dataset name may need namespace prefix. Current: {dataset_name}")

-     # Load and prepare dataset with proper sorting
-     dataset = load_and_prepare_dataset(dataset_name, config)
-
      # Load model settings
      original_model_name = model_config.get("model_name_or_path")

@@ -408,6 +448,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
      )
      tokenizer.pad_token = tokenizer.eos_token

+     # Load and prepare dataset with proper sorting
+     dataset = load_and_prepare_dataset(dataset_name, config, tokenizer)
+
      # Get quantization config
      quant_config = config.get("quantization_config", {})

@@ -525,7 +568,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
          bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
          gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True)
          dataloader_workers = training_config.get("dataloader_num_workers", 4)
-         evaluation_strategy = training_config.get("evaluation_strategy", "steps")
+         eval_strategy = training_config.get("eval_strategy", "steps")  # Updated from evaluation_strategy
          load_best_model_at_end = training_config.get("load_best_model_at_end", True)
          logger.info("Using full training parameters for GPU mode")
      else:
@@ -536,7 +579,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
          bf16 = False
          gradient_checkpointing = False
          dataloader_workers = 0
-         evaluation_strategy = "no"
+         eval_strategy = "no"
          load_best_model_at_end = False
          logger.warning("Using minimal parameters for CPU training in Space")

@@ -561,7 +604,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
          logging_steps=training_config.get("logging_steps", 10),
          save_steps=training_config.get("save_steps", 200),
          save_total_limit=training_config.get("save_total_limit", 3),
-         evaluation_strategy=evaluation_strategy,
+         eval_strategy=eval_strategy,  # Updated from evaluation_strategy
          load_best_model_at_end=load_best_model_at_end,
          report_to=reports,
          logging_first_step=training_config.get("logging_first_step", True),
@@ -581,6 +624,11 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
              pad_token_id=tokenizer.pad_token_id,
              tokenizer=tokenizer
          ),
+         # Add label_names to avoid warning
+         compute_metrics=None,
+         tokenizer=tokenizer,  # Provide tokenizer for proper padding
+         # Define label_names to fix warning
+         label_names=["labels"]
      )

      # Start training
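For context on the main change, below is a minimal sketch (not part of the commit) of how the new preprocess_dataset step behaves: a toy dataset with a conversations entry and a plain text entry both come out with an input_ids column. The gpt2 tokenizer is a stand-in for illustration, the toy rows are invented, and the import assumes run_cloud_training.py is importable as a module.

from datasets import Dataset
from transformers import AutoTokenizer

from run_cloud_training import preprocess_dataset  # assumption: the script is importable

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # stand-in tokenizer for illustration
toy = Dataset.from_list([
    {"prompt_number": 1, "text": "", "conversations": [{"content": "Example prompt text"}]},
    {"prompt_number": 2, "text": "Plain text fallback example", "conversations": []},
])

toy = preprocess_dataset(toy, tokenizer)  # adds input_ids where they are missing
print(toy[0]["input_ids"][:5])            # token ids, encoded without special tokens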