"""Linear-regression baseline for the ``x_e_out [-]`` column.

Rows of ``./input/data.csv`` whose target is present are used for training;
rows whose target is missing are the ones to predict. The script prints a
10-fold cross-validated RMSE, refits on every labelled row, and writes the
predictions to ``./working/submission.csv`` in the layout of
``./input/sample_submission.csv``.
"""

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline

TARGET = "x_e_out [-]"
ID_COLUMN = "id"

data = pd.read_csv("./input/data.csv")

# Features are every numeric column except the target.
# NOTE(review): if `id` is numeric it is used as a feature too (unchanged from
# the previous version of this script) -- confirm that this is intended.
numeric_columns = data.select_dtypes(include=[np.number]).columns
feature_columns = numeric_columns.drop(TARGET)

# Split on target availability. The unlabelled part is copied because a new
# column is assigned into it below; writing into a slice of `data` would
# trigger pandas' SettingWithCopy behaviour.
has_target = data[TARGET].notna()
train_data = data.loc[has_target]
nan_data = data.loc[~has_target].copy()

X_train = train_data[feature_columns]
y_train = train_data[TARGET]

# Imputer and regressor live in one pipeline so that, during cross-validation,
# the imputation means are learned from each training fold only and the
# held-out fold does not leak into them.
imputer = SimpleImputer(strategy="mean")
model = make_pipeline(imputer, LinearRegression())

kf = KFold(n_splits=10, shuffle=True, random_state=1)
rmse_scores = cross_val_score(
    model, X_train, y_train, scoring="neg_root_mean_squared_error", cv=kf
)
# The scorer returns negated RMSE, hence the sign flip on the mean.
print(f"10-fold CV RMSE: {-np.mean(rmse_scores):.4f} (+/- {np.std(rmse_scores):.4f})")

# Final model: refit imputer + regressor on all labelled rows.
model.fit(X_train, y_train)
nan_data[TARGET] = model.predict(nan_data[feature_columns])

test_data = pd.read_csv("./input/sample_submission.csv")

# The sample submission presumably already carries a placeholder target
# column. Merging with it in place would yield suffixed "<target>_x" /
# "<target>_y" columns instead of a single target column, so any existing
# placeholder is dropped first. The left join keeps the sample file's rows
# and their order.
test_data = test_data.drop(columns=[TARGET], errors="ignore").merge(
    nan_data[[ID_COLUMN, TARGET]], on=ID_COLUMN, how="left"
)

test_data.to_csv("./working/submission.csv", index=False)