"""Mean-impute missing values in the input data and write a submission file.

Steps performed, in order:

1. Read ``./input/data.csv`` and ``./input/sample_submission.csv``.
2. Replace missing entries in every column that has any with the column mean.
3. For each ``row-col`` key of the sample submission, look up the
   corresponding (imputed) cell of the data.
4. Print the RMSE between the sample submission's ``value`` column and the
   looked-up values.
5. Write the ``row-col`` / ``value`` pairs to ``./working/submission.csv``.
"""

from math import sqrt

import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

data = pd.read_csv("./input/data.csv")

# Only columns that actually contain missing values are passed to the imputer.
cols_with_missing = [col for col in data.columns if data[col].isnull().any()]

imputer = SimpleImputer(strategy="mean")
# Guard the no-missing-values case: fitting on an empty column selection fails.
if cols_with_missing:
    data[cols_with_missing] = imputer.fit_transform(data[cols_with_missing])

sample_submission = pd.read_csv("./input/sample_submission.csv")

# Keys look like "<row>-<column>". Splitting on the first hyphen only (n=1)
# always produces exactly two parts, even if a column name contains a hyphen.
sample_submission[["row_id", "column"]] = sample_submission["row-col"].str.split(
    "-", n=1, expand=True
)
sample_submission["row_id"] = sample_submission["row_id"].astype(int)

# Iterate the two key columns directly instead of DataFrame.iterrows(), which
# builds a Series per row. `data.at` is a label lookup, so an unknown row label
# or column name still raises KeyError.
# NOTE(review): the lookup assumes `row_id` matches the index labels of `data`
# (the default index created by read_csv) -- confirm.
original_values = sample_submission["value"].tolist()
imputed_values = [
    data.at[row_id, column]
    for row_id, column in zip(
        sample_submission["row_id"], sample_submission["column"]
    )
]

# NOTE(review): this compares against the sample submission's `value` column;
# if that column holds placeholders rather than ground truth, the number
# printed here is not a real validation score -- confirm.
rmse = sqrt(mean_squared_error(original_values, imputed_values))
print(f"Validation RMSE: {rmse}")

sample_submission["value"] = imputed_values
sample_submission[["row-col", "value"]].to_csv("./working/submission.csv", index=False)
|