# File size: 2,940 Bytes
# 5cbc1e9
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures
# Load the competition data from disk.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
# Split the training frame into target and predictors; "id" is a row
# identifier, not a feature, so it is dropped from both frames.
y = train_data["Age"]
X = train_data.drop(columns=["Age", "id"])
test_X = test_data.drop(columns=["id"])
# Generate polynomial features for selected columns and ensure unique feature names by adding a prefix
def generate_poly_features(
    df, feature_names, degree=2, include_bias=False, prefix="poly_"
):
    """Expand the given columns of ``df`` into polynomial terms.

    Returns a new DataFrame containing only the expanded columns; each
    column name produced by sklearn is prefixed with ``prefix`` so the
    result never collides with the original column names.
    """
    transformer = PolynomialFeatures(degree=degree, include_bias=include_bias)
    expanded = transformer.fit_transform(df[feature_names])
    prefixed_names = [
        f"{prefix}{col}" for col in transformer.get_feature_names_out(feature_names)
    ]
    return pd.DataFrame(expanded, columns=prefixed_names)
# Apply polynomial feature generation to both train and test datasets.
size_columns = ["Length", "Diameter", "Height"]
poly_features_train = generate_poly_features(X, size_columns)
poly_features_test = generate_poly_features(test_X, size_columns)
# Attach the polynomial columns alongside the originals; indexes are
# reset so the column-wise concat lines up rows positionally.
X_poly = pd.concat([X.reset_index(drop=True), poly_features_train], axis=1)
test_X_poly = pd.concat([test_X.reset_index(drop=True), poly_features_test], axis=1)
# Categorical column handled natively by CatBoost.
cat_features = ["Sex"]
# 10-fold cross-validation, shuffled with a fixed seed for reproducibility.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Per-fold validation MAE scores are collected here.
mae_scores = []
# Hyperparameters shared by every fold's model.
hyperparams = dict(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    loss_function="MAE",
    cat_features=cat_features,
    verbose=0,
)
# Cross-validate: train one model per fold, record each fold's validation
# MAE, and collect test-set predictions from every fold. The original code
# predicted the test set with only the LAST fold's model; averaging the
# per-fold predictions uses all 10 trained models (standard CV ensembling).
fold_test_predictions = []
for train_index, val_index in kf.split(X_poly):
    X_train, X_val = X_poly.iloc[train_index], X_poly.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    # Initialize CatBoostRegressor with the shared hyperparameters.
    # NOTE: cat_features is already supplied via `hyperparams`, so it is
    # not passed to fit() again (doing both is redundant).
    model = CatBoostRegressor(**hyperparams)
    # Train with early stopping monitored on this fold's validation split.
    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=100,
        verbose=0,
    )
    # Validation MAE for this fold.
    predictions = model.predict(X_val)
    mae_scores.append(mean_absolute_error(y_val, predictions))
    # Test predictions from this fold's model, kept for later averaging.
    fold_test_predictions.append(model.predict(test_X_poly))
# Print the average MAE across all folds
print(f"Average MAE across all folds: {sum(mae_scores) / len(mae_scores)}")
# Ensemble the folds: element-wise mean of the per-fold prediction arrays
# (CatBoost's predict returns numpy arrays, so sum()/len() averages them).
test_predictions = sum(fold_test_predictions) / len(fold_test_predictions)
# Prepare submission file
submission_df = pd.DataFrame({"id": test_data["id"], "Age": test_predictions})
submission_df.to_csv("./working/submission.csv", index=False)