|
import pandas as pd |
|
from sklearn.cluster import KMeans |
|
from sklearn.metrics import silhouette_score |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
|
|
# Cluster the rows of ./input/data.csv with KMeans, choosing the number of
# clusters (2 through 10) that maximizes the mean silhouette score, then write
# the resulting cluster labels to ./working/submission.csv.

# Load the input table; the "id" column is dropped so it is not used as a
# clustering feature.
data = pd.read_csv("./input/data.csv")
data.drop("id", axis=1, inplace=True)

# Standardize every remaining column before clustering.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Evaluate each candidate cluster count once, remembering the best model,
# its labels and its score so nothing has to be re-fitted afterwards.
range_n_clusters = list(range(2, 11))
silhouette_scores = []
optimal_n_clusters = None
kmeans = None
best_labels = None
final_silhouette_score = float("-inf")
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(data_scaled)
    silhouette_avg = silhouette_score(data_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    # Strict ">" keeps the smallest k on ties, which matches picking the
    # first occurrence of the maximum score.
    if silhouette_avg > final_silhouette_score:
        optimal_n_clusters = n_clusters
        kmeans = clusterer
        best_labels = cluster_labels
        final_silhouette_score = silhouette_avg

# The winning model was fitted on the same data with the same seed a refit
# would use, so its labels and score are reused directly.
data["Predicted"] = best_labels

print(
    f"Silhouette Score for optimal clusters {optimal_n_clusters}: {final_silhouette_score}"
)

# Write the labels into the sample submission. The raw label array is
# assigned positionally (not an index-aligned Series), so a row-count
# mismatch with the sample file raises instead of silently producing NaN.
submission = pd.read_csv("./input/sample_submission.csv")
submission["Predicted"] = best_labels
submission.to_csv("./working/submission.csv", index=False)
|
|