import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
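# (Paths follow the layout the AIDE runner appears to use: competition files under
# ./input, outputs such as submission.csv under ./working.)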
# Features and target
X = train_data.drop(["Survived", "PassengerId", "Name", "Ticket", "Cabin"], axis=1)
y = train_data["Survived"]
# Preprocessing for numerical data
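# Median imputation fills any missing values in Age and Fare, the two columns the
# ColumnTransformer below routes through this imputer.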
numerical_transformer = SimpleImputer(strategy="median")
# Preprocessing for categorical data
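# Sex and Embarked are the only string-typed columns left after the drops above;
# handle_unknown="ignore" makes the encoder map categories seen only at predict
# time to all-zero vectors instead of raising an error.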
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# Bundle preprocessing for numerical and categorical data
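# ColumnTransformer drops any column not listed below (remainder defaults to
# "drop"), so Pclass, SibSp and Parch never reach the model.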
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, ["Age", "Fare"]),
        ("cat", categorical_transformer, categorical_cols),
    ]
)
# Define the model
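# A 100-tree random forest with a fixed random_state so results are reproducible.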
model = RandomForestClassifier(n_estimators=100, random_state=0)
# Bundle preprocessing and modeling code in a pipeline
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
# Cross-validation scores
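# cross_val_score clones the whole pipeline for each of the 10 folds, so the
# imputers and encoder are re-fit on every training split and nothing leaks
# from the held-out fold.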
scores = cross_val_score(clf, X, y, cv=10, scoring="accuracy")
print(f"Average cross-validation score: {scores.mean():.4f}")
# Fit the pipeline on the full training data, then preprocess and predict on the test set
clf.fit(X, y)
test_X = test_data.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
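# predict() re-applies the preprocessing fitted on the training data to test_X
# before passing it to the random forest.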
test_preds = clf.predict(test_X)
# Save test predictions to file
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": test_preds})
output.to_csv("./working/submission.csv", index=False)