from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    PreTrainedTokenizer,
)
from peft import LoraConfig, get_peft_model, TaskType
import torch


def initialize_deepseek_model(model, device, tokenizer, train_dataset, val_dataset, MODEL_DIR):
    # Wrap the base model with LoRA adapters so that only a small set of
    # low-rank matrices is trained instead of the full model weights.
    lora_config = LoraConfig(
        r=16,               # Rank of LoRA matrices (adjust for memory vs. accuracy)
        lora_alpha=32,      # Scaling factor
        lora_dropout=0.0,   # Dropout for regularization
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
    )
    model = get_peft_model(model, lora_config)
    model = model.to(device)

    training_args = TrainingArguments(
        output_dir=MODEL_DIR,
        eval_strategy="epoch",           # Evaluate at the end of each epoch
        save_strategy="epoch",           # Save a checkpoint every epoch
        per_device_train_batch_size=1,   # Small per-device batch; effective batch size is 1 * 16 via accumulation
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=16,
        num_train_epochs=10,             # Upper bound; early stopping usually ends training sooner
        learning_rate=5e-5,              # Higher LR is acceptable since only the LoRA layers are trained
        weight_decay=0.001,
        logging_steps=50,                # Log loss every 50 steps
        save_total_limit=2,              # Keep only the last 2 checkpoints
        bf16=torch.cuda.is_available(),  # Use bfloat16 when a GPU is available
        push_to_hub=False,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    )
    return model, trainer
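

# --- Usage sketch -------------------------------------------------------------
# A minimal, hedged example of calling initialize_deepseek_model(). The
# checkpoint name, output directory, and toy training texts below are
# illustrative assumptions, not part of the function above; replace them with
# your own model and pre-tokenized datasets.
if __name__ == "__main__":
    from datasets import Dataset

    MODEL_NAME = "deepseek-ai/deepseek-llm-7b-base"  # assumed DeepSeek base checkpoint
    MODEL_DIR = "./deepseek-lora-checkpoints"        # assumed output directory
    device = "cuda" if torch.cuda.is_available() else "cpu"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token    # causal LMs often ship without a pad token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    )

    # The Trainer expects tokenized examples with input_ids, attention_mask,
    # and labels columns; pad positions are masked with -100 so they are
    # ignored by the loss.
    def tokenize(batch):
        tokens = tokenizer(batch["text"], truncation=True, max_length=512, padding="max_length")
        tokens["labels"] = [
            [(tok if tok != tokenizer.pad_token_id else -100) for tok in ids]
            for ids in tokens["input_ids"]
        ]
        return tokens

    raw_train = Dataset.from_dict({"text": ["Example instruction and response ..."] * 8})
    raw_val = Dataset.from_dict({"text": ["Held-out example ..."] * 2})
    train_dataset = raw_train.map(tokenize, batched=True, remove_columns=["text"])
    val_dataset = raw_val.map(tokenize, batched=True, remove_columns=["text"])

    model, trainer = initialize_deepseek_model(
        model, device, tokenizer, train_dataset, val_dataset, MODEL_DIR
    )
    trainer.train()
    trainer.save_model(MODEL_DIR)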