import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Identifier columns present in both the train and test measurement files;
# they index rows/series and are not used as model inputs.
ID_COLUMNS = ["row_id", "series_id", "measurement_number"]

# Columns that only exist after merging the labels onto the measurements.
LABEL_COLUMNS = ["group_id", "surface"]


def predict_surface_per_series(series_ids, class_proba, classes):
    """Collapse per-row class probabilities into one prediction per series.

    The probabilities of all rows sharing a ``series_id`` are averaged and
    the class with the highest mean probability is chosen.

    Args:
        series_ids: Series id of every row (any sequence or ``pd.Series``;
            only the values are used, its index is ignored).
        class_proba: 2-D array-like of shape (n_rows, n_classes), row-aligned
            with ``series_ids``.
        classes: Class label for each column of ``class_proba``.

    Returns:
        A ``pd.Series`` named ``"surface"`` whose index (named
        ``"series_id"``) holds the distinct series ids in sorted order.
    """
    proba = pd.DataFrame(class_proba, columns=list(classes))
    # Group by raw values so that a non-default index on `series_ids`
    # cannot misalign it with the freshly built probability frame.
    mean_proba = proba.groupby(pd.Series(series_ids).to_numpy()).mean()
    winners = mean_proba.idxmax(axis=1)
    winners.index.name = "series_id"
    winners.name = "surface"
    return winners


def main():
    """Train a random forest on the measurements and write a submission.

    Reads ``./input/X_train.csv``, ``./input/y_train.csv`` and
    ``./input/X_test.csv``, prints a validation accuracy, and writes
    ``./working/submission.csv`` with columns ``series_id`` and ``surface``.
    """
    X_train = pd.read_csv("./input/X_train.csv")
    y_train = pd.read_csv("./input/y_train.csv")
    X_test = pd.read_csv("./input/X_test.csv")

    # Labels are keyed by series_id, so every measurement row of a series
    # receives that series' surface label.
    train_data = X_train.merge(y_train, on="series_id", how="inner")
    features = train_data.drop(ID_COLUMNS + LABEL_COLUMNS, axis=1)
    labels = train_data["surface"]
    series_ids = train_data["series_id"]

    # Hold out whole series rather than individual rows: rows of one series
    # share a label, so a row-level split would put near-duplicates of the
    # validation rows into the training set and inflate the score.
    # NOTE(review): group_id may describe an even coarser correlation
    # (e.g. recording session) — consider splitting on it instead.
    _, val_ids = train_test_split(
        series_ids.unique(), test_size=0.2, random_state=42
    )
    val_mask = series_ids.isin(val_ids).to_numpy()

    # Fit the scaler on training rows only so that validation statistics do
    # not leak into preprocessing.
    scaler = StandardScaler()
    X_fit = scaler.fit_transform(features[~val_mask])
    X_val = scaler.transform(features[val_mask])

    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X_fit, labels[~val_mask])

    # Score at series level, the same granularity as the submission.
    val_pred = predict_surface_per_series(
        series_ids[val_mask], rf.predict_proba(X_val), rf.classes_
    )
    val_true = labels[val_mask].groupby(series_ids[val_mask]).first()
    accuracy = accuracy_score(val_true.loc[val_pred.index], val_pred)
    print(f"Validation Accuracy: {accuracy}")

    test_features = scaler.transform(X_test.drop(ID_COLUMNS, axis=1))
    test_pred = predict_surface_per_series(
        X_test["series_id"], rf.predict_proba(test_features), rf.classes_
    )

    # One row per series_id instead of one row per measurement.
    # Assumes the expected submission format is one label per series, as the
    # per-series label file suggests — TODO confirm against the sample
    # submission.
    submission = test_pred.reset_index()
    submission.to_csv("./working/submission.csv", index=False)


if __name__ == "__main__":
    main()