Upload run_cloud_training.py with huggingface_hub
run_cloud_training.py  CHANGED  (+57 -9)
@@ -2,8 +2,7 @@

 """
 Simplified fine-tuning script for DeepSeek-R1-Distill-Qwen-14B-unsloth-bnb-4bit
-- Optimized for L40S GPU
-- Works with pre-tokenized datasets
+- Optimized for L40S GPU with pre-tokenized datasets
 - Research training only (no inference)
 - CLOUD BASED TRAINING - Hugging Face Spaces
 """
@@ -13,6 +12,8 @@ import logging
 import json
 import torch
 import argparse
+import shutil
+from pathlib import Path
 from datasets import load_dataset
 from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, AutoConfig, BitsAndBytesConfig
 from transformers.data.data_collator import DataCollatorMixin
@@ -27,6 +28,9 @@ os.environ["TRANSFORMERS_NO_FLASH_ATTENTION"] = "1"
 # Force GPU mode in Space if we're using a pre-quantized model
 os.environ["FORCE_GPU"] = "1"

+# Create triton directory to avoid warning
+os.makedirs(os.path.expanduser("~/.triton/autotune"), exist_ok=True)
+
 # Default dataset with proper namespace
 DEFAULT_DATASET = "George-API/phi4-cognitive-dataset"

@@ -294,8 +298,43 @@ class PreTokenizedCollator(DataCollatorMixin):

         return batch

+# Preprocess dataset to ensure all entries are pre-tokenized
+def preprocess_dataset(dataset, tokenizer):
+    """Ensure dataset is fully pre-tokenized to avoid tokenization during training"""
+    logger.info("Pre-processing dataset to ensure all entries are tokenized")
+
+    def process_example(example):
+        # If already has input_ids as list of integers, keep as is
+        if 'input_ids' in example and isinstance(example['input_ids'], list) and all(isinstance(x, int) for x in example['input_ids']):
+            return example
+
+        # If has conversations with content field
+        if 'conversations' in example:
+            conversations = example['conversations']
+            if isinstance(conversations, list) and len(conversations) > 0:
+                # If conversations has content field, tokenize it
+                if isinstance(conversations[0], dict) and 'content' in conversations[0]:
+                    content = conversations[0]['content']
+                    if isinstance(content, str):
+                        example['input_ids'] = tokenizer.encode(content, add_special_tokens=False)
+                        return example
+
+        # For any other format, try to extract text and tokenize
+        text = None
+        if 'text' in example:
+            text = example['text']
+        elif 'content' in example:
+            text = example['content']
+
+        if text and isinstance(text, str):
+            example['input_ids'] = tokenizer.encode(text, add_special_tokens=False)
+
+        return example
+
+    return dataset.map(process_example)
+
 # Load and prepare dataset with proper sorting
-def load_and_prepare_dataset(dataset_name, config):
+def load_and_prepare_dataset(dataset_name, config, tokenizer=None):
     """Load and prepare the dataset for fine-tuning with proper sorting"""
     # Use the default dataset if the provided one matches the default name without namespace
     if dataset_name == "phi4-cognitive-dataset":
@@ -323,6 +362,10 @@ def load_and_prepare_dataset(dataset_name, config):
     dataset_config = config.get("dataset_config", {})
     sort_field = dataset_config.get("sort_by_field", "prompt_number")

+    # Preprocess dataset to ensure all entries are pre-tokenized
+    if tokenizer is not None:
+        dataset = preprocess_dataset(dataset, tokenizer)
+
     # Sort in ascending order by specified field
     logger.info(f"Sorting dataset by {sort_field} in ascending order")
     dataset = dataset.sort(sort_field)
@@ -377,9 +420,6 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     if dataset_name != DEFAULT_DATASET and "phi4-cognitive-dataset" in dataset_name:
         logger.warning(f"Dataset name may need namespace prefix. Current: {dataset_name}")

-    # Load and prepare dataset with proper sorting
-    dataset = load_and_prepare_dataset(dataset_name, config)
-
     # Load model settings
     original_model_name = model_config.get("model_name_or_path")

@@ -408,6 +448,9 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
     )
     tokenizer.pad_token = tokenizer.eos_token

+    # Load and prepare dataset with proper sorting
+    dataset = load_and_prepare_dataset(dataset_name, config, tokenizer)
+
     # Get quantization config
     quant_config = config.get("quantization_config", {})

@@ -525,7 +568,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         bf16 = torch.cuda.is_available() and hardware_config.get("bf16", False)
         gradient_checkpointing = torch.cuda.is_available() and hardware_config.get("gradient_checkpointing", True)
         dataloader_workers = training_config.get("dataloader_num_workers", 4)
-
+        eval_strategy = training_config.get("eval_strategy", "steps")  # Updated from evaluation_strategy
         load_best_model_at_end = training_config.get("load_best_model_at_end", True)
         logger.info("Using full training parameters for GPU mode")
     else:
@@ -536,7 +579,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         bf16 = False
         gradient_checkpointing = False
         dataloader_workers = 0
-
+        eval_strategy = "no"
         load_best_model_at_end = False
         logger.warning("Using minimal parameters for CPU training in Space")

@@ -561,7 +604,7 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
         logging_steps=training_config.get("logging_steps", 10),
         save_steps=training_config.get("save_steps", 200),
         save_total_limit=training_config.get("save_total_limit", 3),
-
+        eval_strategy=eval_strategy,  # Updated from evaluation_strategy
        load_best_model_at_end=load_best_model_at_end,
         report_to=reports,
         logging_first_step=training_config.get("logging_first_step", True),
@@ -581,6 +624,11 @@ def train(config_path, dataset_name, output_dir, upload_to_hub=False, hub_repo_n
             pad_token_id=tokenizer.pad_token_id,
             tokenizer=tokenizer
         ),
+        # Add label_names to avoid warning
+        compute_metrics=None,
+        tokenizer=tokenizer,  # Provide tokenizer for proper padding
+        # Define label_names to fix warning
+        label_names=["labels"]
     )

     # Start training
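
For context on the main addition, below is a minimal usage sketch of the new preprocess_dataset helper. It is illustrative only: it assumes run_cloud_training can be imported without starting a training run, uses gpt2 purely as a stand-in tokenizer, and the toy records are invented to cover the three input shapes the helper handles (pre-tokenized input_ids, a conversations/content entry, and plain text).

from datasets import Dataset
from transformers import AutoTokenizer

from run_cloud_training import preprocess_dataset  # helper added in this commit

# Toy records covering the three branches of process_example (invented for illustration)
toy = Dataset.from_list([
    {"prompt_number": 1, "input_ids": [101, 102, 103]},                   # already tokenized, kept as is
    {"prompt_number": 2, "conversations": [{"content": "hello world"}]},  # conversations[0]["content"] gets tokenized
    {"prompt_number": 3, "text": "plain text entry"},                     # falls back to the "text" field
])

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any Hugging Face tokenizer works for this sketch
tokenized = preprocess_dataset(toy, tokenizer)

for row in tokenized:
    print(row["prompt_number"], row["input_ids"][:5])  # every row now carries integer input_ids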