George-API committed
Commit 3e18b42 · verified · 1 Parent(s): 467f05c

Upload run_cloud_training.py with huggingface_hub

Files changed (1):
  1. run_cloud_training.py +129 -4
run_cloud_training.py CHANGED
 
@@ -17,11 +17,15 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments,
 from transformers.data.data_collator import DataCollatorMixin
 from peft import LoraConfig, get_peft_model
 from dotenv import load_dotenv
+from huggingface_hub import HfApi, upload_folder
 
 # Basic environment setup for L40S
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"
 os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 
+# Default dataset with proper namespace
+DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"
+
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
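The new HfApi import comes with a hard requirement on an HF_TOKEN environment variable (upload_to_huggingface below raises ValueError without it). A small pre-flight check is a cheap way to catch a missing or invalid token before a long training run; this is a sketch, assuming the token is exported or stored in a .env file as the script expects:

# Pre-flight auth check (a sketch; assumes HF_TOKEN is exported or lives in
# a .env file next to the script).
import os

from dotenv import load_dotenv
from huggingface_hub import HfApi

load_dotenv()
token = os.environ.get("HF_TOKEN")
if token:
    # whoami() validates the token and returns account info, including "name"
    print(f"Authenticated as: {HfApi(token=token).whoami()['name']}")
else:
    print("HF_TOKEN not set - the upload step will raise ValueError")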
 
@@ -41,6 +45,84 @@ def remove_training_marker():
         os.remove("TRAINING_ACTIVE")
         logger.info("Removed training active marker")
 
+# Function to upload model to Hugging Face Hub
+def upload_to_huggingface(output_dir, repo_name=None, private=False):
+    """
+    Upload the trained model to Hugging Face Hub
+
+    Args:
+        output_dir: Directory containing the model files
+        repo_name: Name of the repository on HF Hub (default: derived from output_dir)
+        private: Whether the repository should be private (default: False)
+
+    Returns:
+        str: URL of the uploaded model on HF Hub
+    """
+    logger.info(f"Uploading model from {output_dir} to Hugging Face Hub")
+
+    # Get HF token from environment
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        logger.error("HF_TOKEN environment variable not set. Please set it to upload to Hugging Face Hub.")
+        logger.error("You can get a token from https://huggingface.co/settings/tokens")
+        raise ValueError("HF_TOKEN not set")
+
+    # Get or create repo name
+    if not repo_name:
+        # Use the output directory name as the repository name
+        repo_name = os.path.basename(os.path.normpath(output_dir))
+        logger.info(f"Using repository name: {repo_name}")
+
+    # Get HF username
+    api = HfApi(token=token)
+    user_info = api.whoami()
+    username = user_info["name"]
+
+    # Create full repository name
+    full_repo_name = f"{username}/{repo_name}"
+    logger.info(f"Creating repository: {full_repo_name}")
+
+    # Create repository if it doesn't exist
+    api.create_repo(
+        repo_id=full_repo_name,
+        exist_ok=True,
+        private=private
+    )
+
+    # Upload model files
+    logger.info(f"Uploading files from {output_dir} to {full_repo_name}")
+    api.upload_folder(
+        folder_path=output_dir,
+        repo_id=full_repo_name,
+        commit_message="Upload model files"
+    )
+
+    # Create model card
+    model_card = f"""
+# {repo_name}
+
+This model was fine-tuned using the script at https://github.com/George-API/phi4-cognitive-dataset.
+
+## Model details
+- Base model: DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
+- Dataset: {DEFAULT_DATASET}
+- Training: Research only
+"""
+
+    with open(os.path.join(output_dir, "README.md"), "w") as f:
+        f.write(model_card)
+
+    # Upload the model card
+    api.upload_file(
+        path_or_fileobj=os.path.join(output_dir, "README.md"),
+        path_in_repo="README.md",
+        repo_id=full_repo_name,
+        commit_message="Add model card"
+    )
+
+    logger.info(f"Model successfully uploaded to https://huggingface.co/{full_repo_name}")
+    return f"https://huggingface.co/{full_repo_name}"
+
 # Custom data collator for pre-tokenized data
 class PreTokenizedCollator(DataCollatorMixin):
     def __init__(self, pad_token_id=0, tokenizer=None):
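The helper can also be used on its own to publish an already-finished run. A usage sketch, assuming run_cloud_training.py is importable from the working directory and fine_tuned_model/ holds the saved files:

# Publish an existing output directory (a sketch; assumes HF_TOKEN is set).
from run_cloud_training import upload_to_huggingface

url = upload_to_huggingface(
    output_dir="fine_tuned_model",  # basename becomes the repo name
    repo_name=None,                 # or pass an explicit name
    private=False,
)
print(f"Model available at: {url}")

Since create_repo is called with exist_ok=True, re-running the upload against the same repository is safe and simply adds new commits.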
 
@@ -134,11 +216,23 @@ class PreTokenizedCollator(DataCollatorMixin):
 # Load and prepare dataset with proper sorting
 def load_and_prepare_dataset(dataset_name, config):
     """Load and prepare the dataset for fine-tuning with proper sorting"""
+    # Use the default dataset if the provided one matches the default name without namespace
+    if dataset_name == "phi4-cognitive-dataset":
+        dataset_name = DEFAULT_DATASET
+        logger.info(f"Using full dataset path: {dataset_name}")
+
     logger.info(f"Loading dataset: {dataset_name}")
 
     try:
         # Load dataset
-        dataset = load_dataset(dataset_name)
+        try:
+            dataset = load_dataset(dataset_name)
+        except Exception as e:
+            if "doesn't exist on the Hub or cannot be accessed" in str(e):
+                logger.error(f"Dataset '{dataset_name}' not found. Make sure it exists and is accessible.")
+                logger.error(f"If using a private dataset, check your HF_TOKEN is set in your environment.")
+                logger.error(f"If missing namespace, try using the full path: 'George-API/phi4-cognitive-dataset'")
+            raise
 
         # Extract the split we want to use (usually 'train')
         if 'train' in dataset:
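Note the fallback fires only on an exact match of the bare name; anything already namespaced passes through unchanged. A standalone mirror of the check (resolve_dataset_name is illustrative, not part of the script):

# Mirrors the namespace fallback in load_and_prepare_dataset (illustrative).
DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"

def resolve_dataset_name(name):
    # Only the exact bare name is rewritten; anything else passes through.
    return DEFAULT_DATASET if name == "phi4-cognitive-dataset" else name

assert resolve_dataset_name("phi4-cognitive-dataset") == DEFAULT_DATASET
assert resolve_dataset_name("George-API/phi4-cognitive-dataset") == DEFAULT_DATASET
assert resolve_dataset_name("other-user/their-dataset") == "other-user/their-dataset"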
 
@@ -167,7 +261,7 @@ def load_and_prepare_dataset(dataset_name, config):
         raise
 
 # Main training function
-def train(config_path, dataset_name, output_dir):
+def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_name=None, private_repo=False):
     # Load environment variables
     load_dotenv()
 
 
@@ -186,6 +280,11 @@ def train(config_path, dataset_name, output_dir):
     lora_config = config.get("lora_config", {})
     dataset_config = config.get("dataset_config", {})
 
+    # Log dataset info before loading
+    logger.info(f"Will load dataset: {dataset_name}")
+    if dataset_name != DEFAULT_DATASET and "phi4-cognitive-dataset" in dataset_name:
+        logger.warning(f"Dataset name may need namespace prefix. Current: {dataset_name}")
+
     # Load and prepare dataset with proper sorting
     dataset = load_and_prepare_dataset(dataset_name, config)
 
 
@@ -327,6 +426,16 @@ def train(config_path, dataset_name, output_dir):
             json.dump(config, f, indent=2)
 
         logger.info("Training complete - RESEARCH PHASE ONLY")
+
+        # Upload to Hugging Face Hub if requested
+        if upload_to_hub:
+            hub_url = upload_to_huggingface(
+                output_dir=output_dir,
+                repo_name=hub_repo_name,
+                private=private_repo
+            )
+            logger.info(f"Model uploaded to Hugging Face Hub: {hub_url}")
+
         return output_dir
 
     finally:
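This conditional block is what the new train() keyword arguments feed, so the same flow can be driven programmatically as well as from the CLI. A sketch, assuming the script and its default config file sit in the working directory:

# Programmatic equivalent of the new CLI flags (a sketch).
from run_cloud_training import train

output_dir = train(
    "transformers_config.json",           # config_path
    "George-API/phi4-cognitive-dataset",  # dataset_name
    "fine_tuned_model",                   # output_dir
    upload_to_hub=True,   # runs upload_to_huggingface() after training
    hub_repo_name=None,   # None -> derived from the output_dir basename
    private_repo=False,
)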
 
@@ -337,16 +446,32 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Fine-tune DeepSeek model (Research Only)")
     parser.add_argument("--config", type=str, default="transformers_config.json",
                         help="Path to the configuration file")
-    parser.add_argument("--dataset", type=str, default="phi4-cognitive-dataset",
+    parser.add_argument("--dataset", type=str, default=DEFAULT_DATASET,
                         help="Dataset name or path")
     parser.add_argument("--output_dir", type=str, default="fine_tuned_model",
                         help="Output directory for the fine-tuned model")
+    parser.add_argument("--upload_to_hub", action="store_true",
+                        help="Upload the model to Hugging Face Hub after training")
+    parser.add_argument("--hub_repo_name", type=str, default=None,
+                        help="Repository name for the model on Hugging Face Hub")
+    parser.add_argument("--private_repo", action="store_true",
+                        help="Make the Hugging Face Hub repository private")
 
     args = parser.parse_args()
 
     try:
-        output_path = train(args.config, args.dataset, args.output_dir)
+        output_path = train(
+            args.config,
+            args.dataset,
+            args.output_dir,
+            upload_to_hub=args.upload_to_hub,
+            hub_repo_name=args.hub_repo_name,
+            private_repo=args.private_repo
+        )
         print(f"Research training completed. Model saved to: {output_path}")
+
+        if args.upload_to_hub:
+            print("Model was also uploaded to Hugging Face Hub.")
     except Exception as e:
         logging.error(f"Training failed: {str(e)}")
         remove_training_marker()  # Clean up marker if training fails
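Put together, a run that trains and then publishes to the Hub can be launched like this (a sketch; the hub repo name is hypothetical):

# End-to-end launch with the new Hub flags (hub repo name is hypothetical).
import subprocess

subprocess.run(
    [
        "python", "run_cloud_training.py",
        "--config", "transformers_config.json",
        "--dataset", "George-API/phi4-cognitive-dataset",
        "--output_dir", "fine_tuned_model",
        "--upload_to_hub",
        "--hub_repo_name", "phi4-research-model",
        "--private_repo",
    ],
    check=True,
)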