nicoleathy committed on
Commit f11a0ff · verified · 1 Parent(s): 391d684

Delete competition/llama.py

Files changed (1):
  1. competition/llama.py  +0 -139
competition/llama.py DELETED
@@ -1,139 +0,0 @@
- from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
- from datasets import Dataset
- import pandas as pd
- from sklearn.model_selection import train_test_split
- from peft import get_peft_model, LoraConfig, TaskType
- import evaluate
- import numpy as np
- from tqdm import tqdm  # unused here; Trainer manages its own progress bars
-
- # Load the dataset
- file_path = 'train_en.csv'
- dataset = pd.read_csv(file_path)
-
- # Map labels to expected responses
- label_mapping = {
-     "Yes": 0,
-     "No": 1,
-     "It doesn't matter": 2,
-     "Unimportant": 2,
-     "Incorrect questioning": 3,
-     "Correct answers": 4
- }
-
- # Apply label mapping
- dataset['label'] = dataset['label'].map(label_mapping)
-
- # Handle NaN values: Drop rows where label is NaN
- dataset = dataset.dropna(subset=['label'])
-
- # Ensure labels are integers
- dataset['label'] = dataset['label'].astype(int)
-
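- # Note: two label strings ("It doesn't matter" / "Unimportant") share class 2,
- # so the mapping yields 5 distinct classes (0-4), matching num_labels=5 below.
-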
- # Combine "text" and "puzzle" columns
- dataset['combined_text'] = dataset['text'] + " " + dataset['puzzle']
-
- # Split the dataset into training and validation sets
- train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)
-
- # Convert the dataframes to datasets
- train_dataset = Dataset.from_pandas(train_df)
- val_dataset = Dataset.from_pandas(val_df)
-
- # Load the tokenizer and model
- model_name = "meta-llama/Meta-Llama-3-8B"  # Gated checkpoint; requires accepted access on the Hub
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
-
- # Add a padding token if it's not already present
- if tokenizer.pad_token is None:
-     tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
-     model.resize_token_embeddings(len(tokenizer))
-     tokenizer.pad_token = tokenizer.eos_token  # Set the padding token explicitly
-
- # Ensure the padding token is set correctly in the model configuration
- model.config.pad_token_id = tokenizer.pad_token_id
-
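- # (Llama tokenizers ship without a pad token, so batched sequence
- # classification needs one; reusing EOS is a common choice.)
-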
- # Tokenize the data
- def tokenize_function(examples):
-     return tokenizer(examples['combined_text'], truncation=True, padding='max_length', max_length=128)
-
- train_dataset = train_dataset.map(tokenize_function, batched=True, num_proc=4)  # Use multiprocessing
- val_dataset = val_dataset.map(tokenize_function, batched=True, num_proc=4)
-
- # Set the format for PyTorch
- train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
- val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
-
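- # (Trainer's default collator renames the 'label' column to the 'labels'
- # argument expected by the model's forward pass.)
-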
- # Define LoRA configuration
- lora_config = LoraConfig(
-     task_type=TaskType.SEQ_CLS,
-     r=16,
-     lora_alpha=16,
-     target_modules=["q_proj", "v_proj"],
-     lora_dropout=0.05,
-     bias="none"
- )
-
- # Apply LoRA to the model
- model = get_peft_model(model, lora_config)
- model.print_trainable_parameters()
-
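- # (q_proj/v_proj are the attention query/value projections in each Llama
- # block; with TaskType.SEQ_CLS, PEFT also keeps the new classification
- # head trainable alongside the adapters.)
-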
- # Training arguments
- training_args = TrainingArguments(
-     output_dir='./results',
-     learning_rate=1e-4,
-     lr_scheduler_type="linear",
-     warmup_ratio=0.1,
-     max_grad_norm=0.3,
-     per_device_train_batch_size=8,  # Increase batch size if memory allows
-     per_device_eval_batch_size=8,
-     num_train_epochs=3,
-     weight_decay=0.001,
-     evaluation_strategy="epoch",
-     save_strategy="epoch",
-     load_best_model_at_end=True,
-     report_to="wandb",
-     fp16=True,
-     gradient_checkpointing=True,
-     gradient_accumulation_steps=2,  # Adjust based on memory constraints
-     dataloader_num_workers=4,
-     logging_steps=100,
-     save_total_limit=2,
- )
-
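- # (Effective train batch size is 8 x 2 = 16 per device via gradient
- # accumulation; evaluation and save both run per epoch, so the best
- # checkpoint can be restored at the end of training.)
-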
- def compute_metrics(eval_pred):
-     precision_metric = evaluate.load("precision")
-     recall_metric = evaluate.load("recall")
-     f1_metric = evaluate.load("f1")
-     accuracy_metric = evaluate.load("accuracy")
-
-     logits, labels = eval_pred
-     predictions = np.argmax(logits, axis=-1)
-
-     precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")["precision"]
-     recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")["recall"]
-     f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")["f1"]
-     accuracy = accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"]
-
-     return {"precision": precision, "recall": recall, "f1-score": f1, "accuracy": accuracy}
-
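- # (average="weighted" weights each class's score by its support, so
- # majority classes dominate the reported precision/recall/F1.)
-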
- # Initialize the Trainer
- trainer = Trainer(
-     model=model,
-     args=training_args,
-     train_dataset=train_dataset,
-     eval_dataset=val_dataset,
-     compute_metrics=compute_metrics
- )
-
- # Train the model (Trainer displays its own progress bar)
- trainer.train()
-
- # Save the LoRA adapter and classification head (not the full base model) plus the tokenizer
- model.save_pretrained('trained_llama_model')
- tokenizer.save_pretrained('trained_llama_model')
-
- # Evaluate the best checkpoint on the validation set
- eval_results = trainer.evaluate()
- print(eval_results)
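
For reference, a minimal sketch (not part of this commit) of how the adapter directory saved by the deleted script could be reloaded for inference. The names used ('trained_llama_model', the base checkpoint, max_length=128) follow the script above; the exact calls assume compatible transformers/peft versions:

    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from peft import PeftModel
    import torch

    # Rebuild the base model with the same 5-way head, then attach the saved LoRA adapter.
    tokenizer = AutoTokenizer.from_pretrained('trained_llama_model')
    base = AutoModelForSequenceClassification.from_pretrained(
        "meta-llama/Meta-Llama-3-8B", num_labels=5
    )
    base.config.pad_token_id = tokenizer.pad_token_id
    model = PeftModel.from_pretrained(base, 'trained_llama_model').eval()

    # Classify one combined "text + puzzle" string, mirroring the training input format.
    inputs = tokenizer("example text example puzzle", truncation=True,
                       max_length=128, return_tensors="pt")
    with torch.no_grad():
        predicted_class = model(**inputs).logits.argmax(dim=-1).item()
    print(predicted_class)  # integer class 0-4, per label_mapping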