zhengyao jiang committed on
Commit
5cbc1e9
·
1 Parent(s): 60701db

update readme and include example scripts

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +72 -0
  2. examples/bike-sharing-demand.py +58 -0
  3. examples/career-con-2019.py +54 -0
  4. examples/cat-in-the-dat-ii.py +78 -0
  5. examples/cat-in-the-dat.py +58 -0
  6. examples/ciphertext-challenge-ii.py +59 -0
  7. examples/ciphertext-challenge-iii.py +31 -0
  8. examples/competitive-data-science-predict-future-sales.py +77 -0
  9. examples/digit-recognizer.py +79 -0
  10. examples/dont-overfit-ii.py +45 -0
  11. examples/facial-keypoints-detection.py +106 -0
  12. examples/forest-cover-type-prediction.py +33 -0
  13. examples/godaddy-microbusiness-density-forecasting.py +77 -0
  14. examples/home-data-for-ml-course.py +64 -0
  15. examples/house-prices-advanced-regression-techniques.py +79 -0
  16. examples/icr-identify-age-related-conditions.py +66 -0
  17. examples/jigsaw-toxic-comment-classification-challenge.py +34 -0
  18. examples/new-york-city-taxi-fare-prediction.py +116 -0
  19. examples/nlp-getting-started.py +36 -0
  20. examples/optiver-trading-at-the-close.py +58 -0
  21. examples/playground-series-s3e1.py +55 -0
  22. examples/playground-series-s3e11.py +33 -0
  23. examples/playground-series-s3e13.py +69 -0
  24. examples/playground-series-s3e14.py +61 -0
  25. examples/playground-series-s3e15.py +51 -0
  26. examples/playground-series-s3e16.py +90 -0
  27. examples/playground-series-s3e17.py +66 -0
  28. examples/playground-series-s3e18.py +91 -0
  29. examples/playground-series-s3e19.py +83 -0
  30. examples/playground-series-s3e20.py +60 -0
  31. examples/playground-series-s3e22.py +64 -0
  32. examples/playground-series-s3e23.py +70 -0
  33. examples/playground-series-s3e24.py +103 -0
  34. examples/playground-series-s3e25.py +63 -0
  35. examples/playground-series-s3e26.py +65 -0
  36. examples/playground-series-s3e3.py +65 -0
  37. examples/playground-series-s3e5.py +39 -0
  38. examples/playground-series-s3e7.py +33 -0
  39. examples/playground-series-s3e9.py +35 -0
  40. examples/playground-series-s4e1.py +58 -0
  41. examples/playground-series-s4e2.py +65 -0
  42. examples/santa-2019-revenge-of-the-accountants.py +112 -0
  43. examples/scrabble-player-rating.py +54 -0
  44. examples/sentiment-analysis-on-movie-reviews.py +55 -0
  45. examples/spaceship-titanic.py +78 -0
  46. examples/tabular-playground-series-apr-2021.py +54 -0
  47. examples/tabular-playground-series-apr-2022.py +43 -0
  48. examples/tabular-playground-series-aug-2021.py +46 -0
  49. examples/tabular-playground-series-aug-2022.py +69 -0
  50. examples/tabular-playground-series-dec-2021.py +31 -0
README.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AIDE: Autonomous AI for Data Science
2
+ Welcome to the official repository for AIDE, an AI system that can automatically solve data science tasks at a human level and performs even better with human input. We believe that giving developers and researchers direct access to AIDE locally, running on their own compute and with their own LLM keys, is the most straightforward way to make it useful. That is why we will open-source it; the tentative timeline is before the end of April. For now, this repository serves as a gallery showcasing AIDE's solutions to the 60+ Kaggle competitions we tested it on.
3
+
4
+ ## About AIDE
5
+ AIDE is an AI-powered data science assistant that can autonomously understand task requirements and then design and implement solutions. By leveraging large language models and innovative agent architectures, such as the Solution Space Tree Search algorithm, AIDE has achieved human-level performance on a wide range of data science tasks, outperforming over 50% of human data scientists on Kaggle competitions.
6
+
7
+ ## Gallery
8
+ | task_name | top%_mean | file_link | competition_link |
9
+ |:----------------------------------------------|------------:|:--------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------|
10
+ | bike-sharing-demand | 0.24 | [bike-sharing-demand.py](examples/bike-sharing-demand.py) | [competition link](https://www.kaggle.com/competitions/bike-sharing-demand/overview) |
11
+ | tabular-playground-series-jul-2021 | 0.16 | [tabular-playground-series-jul-2021.py](examples/tabular-playground-series-jul-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-jul-2021/overview) |
12
+ | tabular-playground-series-jan-2022 | 0.48 | [tabular-playground-series-jan-2022.py](examples/tabular-playground-series-jan-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/overview) |
13
+ | tabular-playground-series-feb-2022 | 0.25 | [tabular-playground-series-feb-2022.py](examples/tabular-playground-series-feb-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-feb-2022/overview) |
14
+ | tabular-playground-series-feb-2021 | 0.71 | [tabular-playground-series-feb-2021.py](examples/tabular-playground-series-feb-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-feb-2021/overview) |
15
+ | tabular-playground-series-aug-2022 | 0.59 | [tabular-playground-series-aug-2022.py](examples/tabular-playground-series-aug-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-aug-2022/overview) |
16
+ | playground-series-s3e24 | 0.55 | [playground-series-s3e24.py](examples/playground-series-s3e24.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e24/overview) |
17
+ | playground-series-s3e23 | 0.2 | [playground-series-s3e23.py](examples/playground-series-s3e23.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e23/overview) |
18
+ | playground-series-s3e22 | 0.87 | [playground-series-s3e22.py](examples/playground-series-s3e22.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e22/overview) |
19
+ | playground-series-s3e19 | 0.18 | [playground-series-s3e19.py](examples/playground-series-s3e19.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e19/overview) |
20
+ | playground-series-s3e18 | 0.26 | [playground-series-s3e18.py](examples/playground-series-s3e18.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e18/overview) |
21
+ | playground-series-s3e17 | 0.46 | [playground-series-s3e17.py](examples/playground-series-s3e17.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e17/overview) |
22
+ | playground-series-s3e16 | 0.64 | [playground-series-s3e16.py](examples/playground-series-s3e16.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e16/overview) |
23
+ | playground-series-s3e14 | 0.71 | [playground-series-s3e14.py](examples/playground-series-s3e14.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e14/overview) |
24
+ | optiver-trading-at-the-close | 0.99 | [optiver-trading-at-the-close.py](examples/optiver-trading-at-the-close.py) | [competition link](https://www.kaggle.com/competitions/optiver-trading-at-the-close/overview) |
25
+ | new-york-city-taxi-fare-prediction | 1 | [new-york-city-taxi-fare-prediction.py](examples/new-york-city-taxi-fare-prediction.py) | [competition link](https://www.kaggle.com/competitions/new-york-city-taxi-fare-prediction/overview) |
26
+ | playground-series-s3e25 | 0.79 | [playground-series-s3e25.py](examples/playground-series-s3e25.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e25/overview) |
27
+ | tmdb-box-office-prediction | 0.68 | [tmdb-box-office-prediction.py](examples/tmdb-box-office-prediction.py) | [competition link](https://www.kaggle.com/competitions/tmdb-box-office-prediction/overview) |
28
+ | icr-identify-age-related-conditions | 0.91 | [icr-identify-age-related-conditions.py](examples/icr-identify-age-related-conditions.py) | [competition link](https://www.kaggle.com/competitions/icr-identify-age-related-conditions/overview) |
29
+ | house-prices-advanced-regression-techniques | 0.41 | [house-prices-advanced-regression-techniques.py](examples/house-prices-advanced-regression-techniques.py) | [competition link](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/overview) |
30
+ | godaddy-microbusiness-density-forecasting | 0.65 | [godaddy-microbusiness-density-forecasting.py](examples/godaddy-microbusiness-density-forecasting.py) | [competition link](https://www.kaggle.com/competitions/godaddy-microbusiness-density-forecasting/overview) |
31
+ | cat-in-the-dat | 0.69 | [cat-in-the-dat.py](examples/cat-in-the-dat.py) | [competition link](https://www.kaggle.com/competitions/cat-in-the-dat/overview) |
32
+ | tabular-playground-series-apr-2021 | 0.77 | [tabular-playground-series-apr-2021.py](examples/tabular-playground-series-apr-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-apr-2021/overview) |
33
+ | tabular-playground-series-apr-2022 | 0.51 | [tabular-playground-series-apr-2022.py](examples/tabular-playground-series-apr-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-apr-2022/overview) |
34
+ | tabular-playground-series-aug-2021 | 0.16 | [tabular-playground-series-aug-2021.py](examples/tabular-playground-series-aug-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-aug-2021/overview) |
35
+ | ciphertext-challenge-iii | 0.91 | [ciphertext-challenge-iii.py](examples/ciphertext-challenge-iii.py) | [competition link](https://www.kaggle.com/competitions/ciphertext-challenge-iii/overview) |
36
+ | ciphertext-challenge-ii | nan | [ciphertext-challenge-ii.py](examples/ciphertext-challenge-ii.py) | [competition link](https://www.kaggle.com/competitions/ciphertext-challenge-ii/overview) |
37
+ | cat-in-the-dat-ii | 0.66 | [cat-in-the-dat-ii.py](examples/cat-in-the-dat-ii.py) | [competition link](https://www.kaggle.com/competitions/cat-in-the-dat-ii/overview) |
38
+ | tabular-playground-series-jan-2021 | 0.57 | [tabular-playground-series-jan-2021.py](examples/tabular-playground-series-jan-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-jan-2021/overview) |
39
+ | career-con-2019 | 0.99 | [career-con-2019.py](examples/career-con-2019.py) | [competition link](https://www.kaggle.com/competitions/career-con-2019/overview) |
40
+ | tabular-playground-series-jun-2022 | 0.7 | [tabular-playground-series-jun-2022.py](examples/tabular-playground-series-jun-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-jun-2022/overview) |
41
+ | spaceship-titanic | 0.61 | [spaceship-titanic.py](examples/spaceship-titanic.py) | [competition link](https://www.kaggle.com/competitions/spaceship-titanic/overview) |
42
+ | tabular-playground-series-mar-2021 | 0.13 | [tabular-playground-series-mar-2021.py](examples/tabular-playground-series-mar-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-mar-2021/overview) |
43
+ | tabular-playground-series-mar-2022 | 0.85 | [tabular-playground-series-mar-2022.py](examples/tabular-playground-series-mar-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-mar-2022/overview) |
44
+ | tabular-playground-series-may-2021 | nan | [tabular-playground-series-may-2021.py](examples/tabular-playground-series-may-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-may-2021/overview) |
45
+ | tabular-playground-series-may-2022 | 0.6 | [tabular-playground-series-may-2022.py](examples/tabular-playground-series-may-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-may-2022/overview) |
46
+ | tabular-playground-series-oct-2021 | 0.62 | [tabular-playground-series-oct-2021.py](examples/tabular-playground-series-oct-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-oct-2021/overview) |
47
+ | tabular-playground-series-oct-2022 | 0.54 | [tabular-playground-series-oct-2022.py](examples/tabular-playground-series-oct-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-oct-2022/overview) |
48
+ | tabular-playground-series-sep-2021 | 0.62 | [tabular-playground-series-sep-2021.py](examples/tabular-playground-series-sep-2021.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-sep-2021/overview) |
49
+ | tabular-playground-series-jul-2022 | 0.95 | [tabular-playground-series-jul-2022.py](examples/tabular-playground-series-jul-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-jul-2022/overview) |
50
+ | sentiment-analysis-on-movie-reviews | 0.42 | [sentiment-analysis-on-movie-reviews.py](examples/sentiment-analysis-on-movie-reviews.py) | [competition link](https://www.kaggle.com/competitions/sentiment-analysis-on-movie-reviews/overview) |
51
+ | nlp-getting-started | 0.48 | [nlp-getting-started.py](examples/nlp-getting-started.py) | [competition link](https://www.kaggle.com/competitions/nlp-getting-started/overview) |
52
+ | jigsaw-toxic-comment-classification-challenge | 0.78 | [jigsaw-toxic-comment-classification-challenge.py](examples/jigsaw-toxic-comment-classification-challenge.py) | [competition link](https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/overview) |
53
+ | playground-series-s3e1 | 0.6 | [playground-series-s3e1.py](examples/playground-series-s3e1.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e1/overview) |
54
+ | playground-series-s3e13 | 0.03 | [playground-series-s3e13.py](examples/playground-series-s3e13.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e13/overview) |
55
+ | playground-series-s3e15 | 0.56 | [playground-series-s3e15.py](examples/playground-series-s3e15.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e15/overview) |
56
+ | home-data-for-ml-course | 0.09 | [home-data-for-ml-course.py](examples/home-data-for-ml-course.py) | [competition link](https://www.kaggle.com/competitions/home-data-for-ml-course/overview) |
57
+ | forest-cover-type-prediction | 0.47 | [forest-cover-type-prediction.py](examples/forest-cover-type-prediction.py) | [competition link](https://www.kaggle.com/competitions/forest-cover-type-prediction/overview) |
58
+ | facial-keypoints-detection | nan | [facial-keypoints-detection.py](examples/facial-keypoints-detection.py) | [competition link](https://www.kaggle.com/competitions/facial-keypoints-detection/overview) |
59
+ | playground-series-s3e20 | nan | [playground-series-s3e20.py](examples/playground-series-s3e20.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e20/overview) |
60
+ | dont-overfit-ii | 0.98 | [dont-overfit-ii.py](examples/dont-overfit-ii.py) | [competition link](https://www.kaggle.com/competitions/dont-overfit-ii/overview) |
61
+ | scrabble-player-rating | nan | [scrabble-player-rating.py](examples/scrabble-player-rating.py) | [competition link](https://www.kaggle.com/competitions/scrabble-player-rating/overview) |
62
+ | digit-recognizer | 0.14 | [digit-recognizer.py](examples/digit-recognizer.py) | [competition link](https://www.kaggle.com/competitions/digit-recognizer/overview) |
63
+ | tabular-playground-series-sep-2022 | 0.64 | [tabular-playground-series-sep-2022.py](examples/tabular-playground-series-sep-2022.py) | [competition link](https://www.kaggle.com/competitions/tabular-playground-series-sep-2022/overview) |
64
+ | playground-series-s3e26 | 0.56 | [playground-series-s3e26.py](examples/playground-series-s3e26.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e26/overview) |
65
+ | playground-series-s3e3 | nan | [playground-series-s3e3.py](examples/playground-series-s3e3.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e3/overview) |
66
+ | playground-series-s3e5 | 0.61 | [playground-series-s3e5.py](examples/playground-series-s3e5.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e5/overview) |
67
+ | playground-series-s3e7 | 0.55 | [playground-series-s3e7.py](examples/playground-series-s3e7.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e7/overview) |
68
+ | playground-series-s3e9 | 0.86 | [playground-series-s3e9.py](examples/playground-series-s3e9.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s3e9/overview) |
69
+ | playground-series-s4e1 | 0.24 | [playground-series-s4e1.py](examples/playground-series-s4e1.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s4e1/overview) |
70
+ | playground-series-s4e2 | 0.64 | [playground-series-s4e2.py](examples/playground-series-s4e2.py) | [competition link](https://www.kaggle.com/competitions/playground-series-s4e2/overview) |
71
+ | competitive-data-science-predict-future-sales | 0.87 | [competitive-data-science-predict-future-sales.py](examples/competitive-data-science-predict-future-sales.py) | [competition link](https://www.kaggle.com/competitions/competitive-data-science-predict-future-sales/overview) |
72
+ | santa-2019-revenge-of-the-accountants | 0.96 | [santa-2019-revenge-of-the-accountants.py](examples/santa-2019-revenge-of-the-accountants.py) | [competition link](https://www.kaggle.com/competitions/santa-2019-revenge-of-the-accountants/overview) |
examples/bike-sharing-demand.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from lightgbm import LGBMRegressor
5
+ from sklearn.metrics import mean_squared_log_error
6
+
7
+ # Load the data
8
+ train = pd.read_csv("./input/train.csv")
9
+ test = pd.read_csv("./input/test.csv")
10
+
11
+
12
+ # Feature engineering
13
def preprocess_data(data):
    """Derive calendar, interaction and cyclic features from ``datetime``.

    The input frame is copied first, so the caller's DataFrame is never
    modified (the previous version wrote every new column into it).

    Args:
        data: DataFrame with a ``datetime`` column that ``pd.to_datetime``
            can parse and a numeric ``workingday`` column.

    Returns:
        A new DataFrame holding the original columns plus the engineered
        ones, with ``datetime``, ``casual`` and ``registered`` removed
        (each is dropped only if present).
    """
    data = data.copy()
    data["datetime"] = pd.to_datetime(data["datetime"])
    stamp = data["datetime"].dt

    # Plain calendar parts
    data["hour"] = stamp.hour
    data["day_of_week"] = stamp.dayofweek
    data["month"] = stamp.month
    data["year"] = stamp.year
    data["day"] = stamp.day
    data["hour_workingday_interaction"] = data["hour"] * data["workingday"]

    # Cyclic encodings: map each periodic value onto the unit circle so the
    # last value of a period sits next to the first one.
    two_pi = 2.0 * np.pi
    data["hour_sin"] = np.sin(data["hour"] * (two_pi / 24))
    data["hour_cos"] = np.cos(data["hour"] * (two_pi / 24))
    data["day_of_week_sin"] = np.sin(data["day_of_week"] * (two_pi / 7))
    data["day_of_week_cos"] = np.cos(data["day_of_week"] * (two_pi / 7))
    # Months are 1-based, hence the shift to 0..11 before scaling.
    data["month_sin"] = np.sin((data["month"] - 1) * (two_pi / 12))
    data["month_cos"] = np.cos((data["month"] - 1) * (two_pi / 12))

    return data.drop(["datetime", "casual", "registered"], axis=1, errors="ignore")
31
+
32
+
33
# Keep the raw timestamps for the submission before preprocessing drops the
# "datetime" column (previously test.csv was read a second time for this).
test_datetimes = test["datetime"].copy()

train = preprocess_data(train)
test = preprocess_data(test)

# Hold out 20% of the training rows for validation. The target is modelled
# in log1p space and mapped back with expm1 before scoring and submitting.
X = train.drop(["count"], axis=1)
y = np.log1p(train["count"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LGBMRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
model.fit(X_train, y_train)

# Prediction and evaluation.
# Clip at zero in log space: a negative prediction would become a negative
# count after expm1, which mean_squared_log_error refuses to score and which
# makes no sense as a rental count.
y_pred = np.clip(model.predict(X_val), 0, None)
rmsle = np.sqrt(mean_squared_log_error(np.expm1(y_val), np.expm1(y_pred)))
print(f"RMSLE with cyclic features: {rmsle}")

# Prepare submission
test_pred = np.clip(model.predict(test), 0, None)
submission = pd.DataFrame(
    {
        "datetime": test_datetimes,
        "count": np.expm1(test_pred),
    }
)
submission.to_csv("./working/submission.csv", index=False)
examples/career-con-2019.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import accuracy_score
5
+ from sklearn.preprocessing import StandardScaler
6
+
7
+ # Load the data
8
# Load the data
X_train = pd.read_csv("./input/X_train.csv")
y_train = pd.read_csv("./input/y_train.csv")
X_test = pd.read_csv("./input/X_test.csv")

# Attach the per-series surface label to every measurement row of that series
train_data = X_train.merge(y_train, on="series_id", how="inner")

# Identifier columns carry no sensor signal and are excluded from the features
id_columns = ["row_id", "series_id", "measurement_number"]
features = train_data.drop(id_columns + ["group_id", "surface"], axis=1)
labels = train_data["surface"]

# Normalize the feature data
scaler = StandardScaler()
features = scaler.fit_transform(features)

# Split the rows into training and validation sets.
# NOTE(review): this is a row-level split, so rows of one series can land on
# both sides; the reported accuracy is probably optimistic - confirm whether a
# split grouped by series_id is wanted.
X_fit, X_val, y_fit, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

# Train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_fit, y_fit)

# Validation accuracy (per measurement row)
accuracy = accuracy_score(y_val, rf.predict(X_val))
print(f"Validation Accuracy: {accuracy}")

# Predict every test measurement row
test_features = scaler.transform(X_test.drop(id_columns, axis=1))
row_votes = pd.Series(rf.predict(test_features), index=X_test["series_id"].to_numpy())

# Collapse row-level predictions to one label per series by majority vote.
# The previous version wrote one submission row per measurement, repeating
# each series_id many times. Ties resolve to the first label in sorted order.
series_surface = row_votes.groupby(level=0).agg(lambda votes: votes.mode().iloc[0])

# Save the predictions to a CSV file
submission = pd.DataFrame(
    {"series_id": series_surface.index, "surface": series_surface.to_numpy()}
)
submission.to_csv("./working/submission.csv", index=False)
examples/cat-in-the-dat-ii.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import roc_auc_score
5
+ from lightgbm import LGBMClassifier
6
+ from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
7
+
8
+ # Load the data
9
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Separate target from predictors
y = train_data["target"]
X = train_data.drop(["target", "id"], axis=1)
X_test = test_data.drop("id", axis=1)

# Group the columns by the naming scheme of the dataset
binary_cols = [col for col in X.columns if "bin" in col]
ordinal_cols = [col for col in X.columns if "ord" in col]
nominal_cols = [col for col in X.columns if "nom" in col]
cyclical_cols = ["day", "month"]  # left untouched; passed to the model as-is

# Ordinal encoding for binary and ordinal features
ordinal_encoder = OrdinalEncoder()
ordinal_like_cols = binary_cols + ordinal_cols
X[ordinal_like_cols] = ordinal_encoder.fit_transform(X[ordinal_like_cols])
X_test[ordinal_like_cols] = ordinal_encoder.transform(X_test[ordinal_like_cols])

# Split nominal columns by cardinality (measured on the training data)
low_cardinality_nom_cols = [col for col in nominal_cols if X[col].nunique() < 10]
high_cardinality_nom_cols = [col for col in nominal_cols if X[col].nunique() >= 10]

# One-hot encoding for low-cardinality nominal features.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=False` keyword used before was renamed to
# `sparse_output` and later removed from scikit-learn, so this form runs on
# both old and new releases.
one_hot_encoder = OneHotEncoder(handle_unknown="ignore")
train_one_hot = one_hot_encoder.fit_transform(X[low_cardinality_nom_cols]).toarray()
test_one_hot = one_hot_encoder.transform(X_test[low_cardinality_nom_cols]).toarray()

# Give the dummy columns string names so the final frame does not mix
# integer and string column labels.
one_hot_names = [f"onehot_{i}" for i in range(train_one_hot.shape[1])]
X_low_card_nom = pd.DataFrame(train_one_hot, columns=one_hot_names, index=X.index)
X_test_low_card_nom = pd.DataFrame(
    test_one_hot, columns=one_hot_names, index=X_test.index
)

# Frequency encoding for high-cardinality nominal features; categories that
# never occur in the training data map to NaN in the test set.
for col in high_cardinality_nom_cols:
    freq_encoder = X[col].value_counts(normalize=True)
    X[col] = X[col].map(freq_encoder)
    X_test[col] = X_test[col].map(freq_encoder)

# Combine all features
X = pd.concat([X, X_low_card_nom], axis=1).drop(low_cardinality_nom_cols, axis=1)
X_test = pd.concat([X_test, X_test_low_card_nom], axis=1).drop(
    low_cardinality_nom_cols, axis=1
)

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Define and train the model
model = LGBMClassifier()
model.fit(X_train, y_train)

# Evaluate on the validation set
valid_preds = model.predict_proba(X_valid)[:, 1]
roc_auc = roc_auc_score(y_valid, valid_preds)
print(f"Validation ROC AUC Score: {roc_auc}")

# Predict on the test set and save the predictions to a CSV file
test_preds = model.predict_proba(X_test)[:, 1]
output = pd.DataFrame({"id": test_data.id, "target": test_preds})
output.to_csv("./working/submission.csv", index=False)
examples/cat-in-the-dat.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from catboost import CatBoostClassifier
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import roc_auc_score
5
+
6
+ # Load the data
7
# Read the competition files
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Target, predictors and the matching test predictors
y = train_data["target"]
X = train_data.drop(["id", "target"], axis=1)
X_test = test_data.drop(["id"], axis=1)

# Every object-typed column is handed to CatBoost as a categorical feature
cat_features = [name for name, dtype in X.dtypes.items() if dtype == "object"]

# 80/20 hold-out split for a quick quality estimate
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Settings shared by the quick validation model and the final model.
# NOTE(review): early_stopping_rounds is set but neither fit() call receives
# an eval_set - confirm whether early stopping is actually meant to apply.
shared_params = {
    "learning_rate": 0.1,
    "depth": 4,
    "loss_function": "Logloss",
    "early_stopping_rounds": 10,
    "verbose": False,
}

# Short run (100 iterations) fitted on the training part only
model = CatBoostClassifier(iterations=100, **shared_params)
model.fit(X_train, y_train, cat_features=cat_features)

# Score the hold-out part with ROC AUC
val_pred = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_pred)
print(f"Validation AUC: {val_auc}")

# Longer run (1000 iterations) fitted on all labelled rows
final_model = CatBoostClassifier(iterations=1000, **shared_params)
final_model.fit(X, y, cat_features=cat_features)

# Probability of the positive class for each test row
test_pred = final_model.predict_proba(X_test)[:, 1]

# Write the submission file
submission = pd.DataFrame({"id": test_data["id"], "target": test_pred})
submission.to_csv("./working/submission.csv", index=False)
examples/ciphertext-challenge-ii.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.metrics import accuracy_score
4
+ import re
5
+
6
+ # Load the data
7
train_df = pd.read_csv("./input/training.csv")
test_df = pd.read_csv("./input/test.csv")

# Split the training data into training and validation sets
train, val = train_test_split(train_df, test_size=0.1, random_state=42)

# Give the validation frame a 'ciphertext' column. No real encryption is
# applied yet, so it is simply the plaintext. assign() builds a new frame
# instead of writing a column into the subset returned by train_test_split,
# which avoids pandas' SettingWithCopyWarning.
val = val.assign(ciphertext=val["text"])
15
+
16
+
17
+ # Function to perform frequency analysis on a given text
18
def frequency_analysis(text):
    """Return ``text`` reduced to its ASCII letters, upper-cased.

    Despite the name, no letter counts are computed here: the result is
    just the cleaned string (everything outside A-Z / a-z is removed).
    """
    letters_only = "".join(ch for ch in text if ch.isascii() and ch.isalpha())
    return letters_only.upper()
23
+
24
+
25
+ # Function to decrypt a simple substitution cipher using frequency analysis
26
def decrypt_substitution_cipher(ciphertext, frequency_map):
    """Stub for substitution-cipher decryption.

    No decryption is implemented: ``ciphertext`` is returned unchanged and
    ``frequency_map`` is accepted but never read.
    """
    # Placeholder for actual decryption
    return ciphertext
29
+
30
+
31
+ # Perform frequency analysis on the validation set plaintext to create a frequency map
32
# Run the "frequency analysis" helper over all validation plaintext joined
# into one string; its result is what gets passed to the decryption helper.
validation_plaintext = "".join(val["text"])
frequency_map = frequency_analysis(validation_plaintext)

# Decrypt each validation ciphertext, keeping the row order of `val`
val["predicted_text"] = [
    decrypt_substitution_cipher(cipher, frequency_map) for cipher in val["ciphertext"]
]
38
+
39
+
40
+ # Find the corresponding 'index' from the training set where the decrypted text matches the plaintext
41
def find_index(predicted_text, train_df):
    """Look up the ``index`` value of the first row whose text matches.

    Args:
        predicted_text: Text to search for in the ``text`` column.
        train_df: DataFrame with ``text`` and ``index`` columns.

    Returns:
        The ``index`` value of the first row (in frame order) whose ``text``
        equals ``predicted_text``, or ``None`` when no row matches.
    """
    # One vectorised comparison instead of a Python-level iterrows() scan.
    matches = train_df.loc[train_df["text"] == predicted_text, "index"]
    if matches.empty:
        return None
    return matches.iloc[0]
46
+
47
+
48
# Map every decrypted validation text back to a plaintext index
val["predicted_index"] = val["predicted_text"].apply(lambda x: find_index(x, train_df))

# Calculate the accuracy of the predicted index
accuracy = accuracy_score(val["index"], val["predicted_index"])
print(f"Validation Accuracy: {accuracy}")

# Test set: decrypt first, then look the decrypted text up exactly as is done
# for validation. Previously the decrypted text itself was stored under
# "predicted_index" and submitted, so the file never contained an index.
test_df["predicted_text"] = test_df["ciphertext"].apply(
    lambda x: decrypt_substitution_cipher(x, frequency_map)
)
# Texts without a match in train_df yield None and end up empty in the CSV.
test_df["index"] = test_df["predicted_text"].apply(lambda x: find_index(x, train_df))

# NOTE(review): the header is assumed to be ciphertext_id,index - verify
# against the competition's sample submission.
submission = test_df[["ciphertext_id", "index"]]
submission.to_csv("./working/submission.csv", index=False)
examples/ciphertext-challenge-iii.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.metrics import accuracy_score
4
+
5
+ # Load the data
6
# Read the competition files (the sample submission is loaded but not used)
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")
sample_submission = pd.read_csv("./input/sample_submission.csv")

# Hold out 10% of the labelled rows for validation
train_texts, val_texts, train_indices, val_indices = train_test_split(
    train_df["text"],
    train_df["index"],
    test_size=0.1,
    random_state=42,
)

# Baseline only: no decryption is implemented, every row is predicted as 0.
# TODO: Implement the decryption algorithm here
val_predictions = [0 for _ in range(len(val_texts))]

# Accuracy of the constant baseline on the hold-out rows
accuracy = accuracy_score(val_indices, val_predictions)
print(f"Validation accuracy: {accuracy}")

# Submission: the same constant prediction for every test ciphertext
submission = test_df[["ciphertext_id"]].assign(index=0)
submission.to_csv("./working/submission.csv", index=False)
examples/competitive-data-science-predict-future-sales.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.metrics import mean_squared_error
4
+ from lightgbm import LGBMRegressor
5
+ from sklearn.model_selection import train_test_split
6
+
7
# Load data
sales = pd.read_csv("./input/sales_train.csv")
test = pd.read_csv("./input/test.csv")

KEY_COLUMNS = ["shop_id", "item_id"]
LAGS = [1, 2, 3]
LAG_COLUMNS = [f"item_cnt_month_lag_{lag}" for lag in LAGS]
FEATURE_COLUMNS = KEY_COLUMNS + LAG_COLUMNS + ["item_mean_cnt", "shop_mean_cnt"]

# Convert date to datetime and derive a single running month index.
# Calendar months are 1-based, so ``year * 12 + (month - 1)`` yields consecutive
# integers across year boundaries; shifting by a lag is then a plain addition
# (the previous ``month % 12`` arithmetic turned December into month 0).
sales["date"] = pd.to_datetime(sales["date"], format="%d.%m.%Y")
sales["year"] = sales["date"].dt.year
sales["month"] = sales["date"].dt.month
sales["month_idx"] = sales["year"] * 12 + (sales["month"] - 1)

# Aggregate data to monthly level: one row per (month, shop, item).
monthly_sales = (
    sales.groupby(["month_idx"] + KEY_COLUMNS)
    .agg({"item_cnt_day": "sum"})
    .reset_index()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)


def add_lag_features(frame, history, lags):
    """Return ``frame`` with one ``item_cnt_month_lag_<k>`` column per lag.

    ``frame`` must contain ``month_idx``, ``shop_id`` and ``item_id``.
    ``history`` supplies the monthly counts; only its key columns and
    ``item_cnt_month`` are used, so lag columns already present on it are not
    carried into the merge (which previously produced ``_x``/``_y`` duplicates).
    """
    base = history[["month_idx"] + KEY_COLUMNS + ["item_cnt_month"]]
    for lag in lags:
        # Sales of month m become the lag-k feature of month m + k.
        shifted = base.assign(month_idx=base["month_idx"] + lag).rename(
            columns={"item_cnt_month": f"item_cnt_month_lag_{lag}"}
        )
        frame = frame.merge(shifted, on=["month_idx"] + KEY_COLUMNS, how="left")
    return frame


# Create lag features. A missing (month, shop, item) row means there was no
# aggregated sale for that month, so the lag is filled with 0 -- the same
# treatment the test set receives below.
monthly_sales = add_lag_features(monthly_sales, monthly_sales, LAGS)
monthly_sales[LAG_COLUMNS] = monthly_sales[LAG_COLUMNS].fillna(0)

# Mean encoded features
# NOTE(review): these means are computed over all rows, including the ones
# that end up in the validation split below, so the validation RMSE is
# optimistic.
item_mean = monthly_sales.groupby("item_id")["item_cnt_month"].mean().reset_index()
item_mean.rename(columns={"item_cnt_month": "item_mean_cnt"}, inplace=True)
shop_mean = monthly_sales.groupby("shop_id")["item_cnt_month"].mean().reset_index()
shop_mean.rename(columns={"item_cnt_month": "shop_mean_cnt"}, inplace=True)

monthly_sales = pd.merge(monthly_sales, item_mean, on="item_id", how="left")
monthly_sales = pd.merge(monthly_sales, shop_mean, on="shop_id", how="left")

# Prepare training data
X = monthly_sales[FEATURE_COLUMNS]
y = monthly_sales["item_cnt_month"].clip(0, 20)

# Train/test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LGBMRegressor()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_val).clip(0, 20)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")

# Prepare test set: every test row describes the month right after the last
# month in the training data. Building the lags for that single month keeps
# exactly one row per test ID (merging on shop/item alone returned one row
# per historical month and duplicated IDs in the submission).
test["month_idx"] = monthly_sales["month_idx"].max() + 1
test = add_lag_features(test, monthly_sales, LAGS)
test = test.merge(item_mean, on="item_id", how="left")
test = test.merge(shop_mean, on="shop_id", how="left")
test[FEATURE_COLUMNS] = test[FEATURE_COLUMNS].fillna(0)

# Make predictions on test set, using the same columns in the same order as
# during training.
test["item_cnt_month"] = model.predict(test[FEATURE_COLUMNS]).clip(0, 20)

# Save submission
test[["ID", "item_cnt_month"]].to_csv("./working/submission.csv", index=False)
examples/digit-recognizer.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torch.optim as optim
7
+ from sklearn.model_selection import train_test_split
8
+ from torch.utils.data import DataLoader, TensorDataset
9
+ from torchvision import transforms
10
+
11
# Read the labelled digits.
train_df = pd.read_csv("./input/train.csv")

# Reshape the flat pixel columns into single-channel 28x28 images and scale
# the intensities to [0, 1].
pixels = train_df.drop("label", axis=1).values
X = pixels.reshape(-1, 1, 28, 28).astype("float32") / 255.0
y = train_df["label"].values

# Keep 20% of the images aside for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Wrap the arrays as tensors; labels are int64 as required by the loss.
train_dataset = TensorDataset(
    torch.tensor(X_train),
    torch.tensor(y_train, dtype=torch.long),
)
val_dataset = TensorDataset(
    torch.tensor(X_val),
    torch.tensor(y_val, dtype=torch.long),
)

# Only the training batches are shuffled.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
33
+
34
+
35
+ # Define the CNN model
36
class Net(nn.Module):
    """Small CNN for 1x28x28 digit images producing 10 class scores.

    The forward pass returns raw logits. ``nn.CrossEntropyLoss`` applies
    log-softmax itself, so normalising here as well was redundant; ``argmax``
    over the logits gives the same predicted class as over log-probabilities.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        # 28 -> conv3 -> 26 -> pool -> 13 -> conv3 -> 11 -> pool -> 5
        self.fc1 = nn.Linear(64 * 5 * 5, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Return logits of shape (batch, 10) for input of shape (batch, 1, 28, 28)."""
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2(x), 2))
        # Flatten everything except the batch dimension so a wrongly sized
        # input fails in fc1 instead of being silently re-batched by view(-1, ...).
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)
51
+
52
+
53
# Build the network together with its loss and optimiser.
model = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Optimise for a fixed number of passes over the training loader.
num_epochs = 5
for _ in range(num_epochs):
    model.train()
    for batch_images, batch_labels in train_loader:
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_labels)
        batch_loss.backward()
        optimizer.step()

# Count correct predictions on the validation loader without tracking gradients.
model.eval()
correct = 0
with torch.no_grad():
    for batch_images, batch_labels in val_loader:
        predicted = model(batch_images).argmax(dim=1)
        correct += (predicted == batch_labels).sum().item()

accuracy = correct / len(val_loader.dataset)
print(f"Validation Accuracy: {accuracy:.4f}")
examples/dont-overfit-ii.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.linear_model import LogisticRegression
3
+ from sklearn.model_selection import StratifiedKFold
4
+ from sklearn.metrics import roc_auc_score
5
+
6
# Read the training table and separate the target from the features.
train_data = pd.read_csv("./input/train.csv")
y_train = train_data["target"]
X_train = train_data.drop(["id", "target"], axis=1)

# L1-penalised logistic regression (liblinear supports the L1 penalty).
model = LogisticRegression(penalty="l1", solver="liblinear", random_state=42)

# Stratified 10-fold cross-validation, scored with ROC AUC on each fold.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
auc_scores = []
for fit_rows, holdout_rows in cv.split(X_train, y_train):
    model.fit(X_train.iloc[fit_rows], y_train.iloc[fit_rows])
    holdout_prob = model.predict_proba(X_train.iloc[holdout_rows])[:, 1]
    auc_scores.append(roc_auc_score(y_train.iloc[holdout_rows], holdout_prob))

# Report the mean AUC over the folds.
average_auc_score = sum(auc_scores) / len(auc_scores)
print(f"Average AUC-ROC score: {average_auc_score}")

# Refit on every training row, then score the test set.
model.fit(X_train, y_train)
test_data = pd.read_csv("./input/test.csv")
X_test = test_data.drop("id", axis=1)
test_data["target"] = model.predict_proba(X_test)[:, 1]

# Write the id/probability pairs as the submission.
submission_file = "./working/submission.csv"
test_data[["id", "target"]].to_csv(submission_file, index=False)
examples/facial-keypoints-detection.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.utils.data import Dataset, DataLoader
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import mean_squared_error
8
+
9
# Read the training table and keep only fully annotated rows.
train_df = pd.read_csv("./input/training/training.csv")
train_df.dropna(inplace=True)  # Remove missing values for simplicity

# Each "Image" cell is a space-separated pixel string; parse every row, stack
# the rows into one matrix, scale to [0, 1] and reshape to 96x96x1 images.
pixel_rows = train_df["Image"].apply(lambda x: np.fromstring(x, sep=" "))
X = np.vstack(pixel_rows.values) / 255.0
X = X.reshape(-1, 96, 96, 1)

# Every remaining column is a keypoint coordinate.
y = train_df.drop(["Image"], axis=1).values

# Hold out 20% of the faces for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
23
+
24
+
25
+ # Define dataset
26
class FacesDataset(Dataset):
    """Pair channel-last face images with their keypoint vectors.

    Args:
        images: indexable collection of (height, width, channels) arrays.
        keypoints: indexable collection of keypoint vectors, one per image.

    Raises:
        ValueError: if ``images`` and ``keypoints`` differ in length. Without
            this check the mismatch only surfaced as an IndexError in the
            middle of an epoch.
    """

    def __init__(self, images, keypoints):
        if len(images) != len(keypoints):
            raise ValueError(
                f"images and keypoints must have the same length, "
                f"got {len(images)} and {len(keypoints)}"
            )
        self.images = images
        self.keypoints = keypoints

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, keypoint)`` as float32 tensors; image is channel-first."""
        # permute(2, 0, 1) moves the trailing channel axis to the front,
        # which is the layout nn.Conv2d expects.
        image = torch.tensor(self.images[idx], dtype=torch.float32).permute(2, 0, 1)
        keypoint = torch.tensor(self.keypoints[idx], dtype=torch.float32)
        return image, keypoint
38
+
39
+
40
+ # Define model
41
class KeypointModel(nn.Module):
    """One-conv-layer regressor mapping a grayscale square image to keypoints.

    Args:
        image_size: side length of the square input image. Defaults to 96,
            the size that was previously hard-coded.
        num_keypoints: number of regression outputs. Defaults to 30, the
            value that was previously hard-coded.
    """

    def __init__(self, image_size=96, num_keypoints=30):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        # padding=1 keeps the spatial size through the conv; the 2x2 pool
        # halves it, so fc1 sees 32 feature maps of (image_size // 2) squared.
        pooled = image_size // 2
        self.fc1 = nn.Linear(32 * pooled * pooled, 1000)
        self.fc2 = nn.Linear(1000, num_keypoints)

    def forward(self, x):
        """Return predictions of shape (batch, num_keypoints)."""
        x = self.pool(torch.relu(self.conv1(x)))
        x = x.view(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
55
+
56
+
57
+ # Training
58
def train(model, criterion, optimizer, train_loader, val_loader, epochs=10):
    """Train ``model`` and report the mean batch loss per epoch.

    Args:
        model: module called as ``model(images)``.
        criterion: loss called as ``criterion(outputs, keypoints)``.
        optimizer: optimiser stepping the model's parameters.
        train_loader: sized iterable of ``(images, keypoints)`` batches.
        val_loader: sized iterable of ``(images, keypoints)`` batches.
        epochs: number of passes over ``train_loader``.

    Returns:
        A list with one ``(train_loss, val_loss)`` tuple per epoch. Earlier
        versions returned ``None``, so callers that ignore the result are
        unaffected.
    """
    history = []
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for images, keypoints in train_loader:
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, keypoints)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for images, keypoints in val_loader:
                outputs = model(images)
                loss = criterion(outputs, keypoints)
                val_loss += loss.item()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError;
        # the accumulated loss is 0.0 in that case, so the mean is 0.0.
        train_mean = running_loss / max(len(train_loader), 1)
        val_mean = val_loss / max(len(val_loader), 1)
        history.append((train_mean, val_mean))

        print(
            f"Epoch {epoch+1}, Train Loss: {train_mean}, Val Loss: {val_mean}"
        )
    return history
81
+
82
+
83
# Wrap both splits as datasets; only the training batches are shuffled.
train_dataset = FacesDataset(X_train, y_train)
val_dataset = FacesDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Model, mean-squared-error loss and Adam optimiser.
model = KeypointModel()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Run the training loop.
train(model, criterion, optimizer, train_loader, val_loader, epochs=10)

# Collect validation predictions and targets batch by batch, then compute
# the RMSE over all keypoint coordinates.
model.eval()
predicted_batches = []
target_batches = []
with torch.no_grad():
    for batch_images, batch_keypoints in val_loader:
        predicted_batches.append(model(batch_images).numpy())
        target_batches.append(batch_keypoints.numpy())

predictions = np.vstack(predicted_batches)
ground_truths = np.vstack(target_batches)
rmse = np.sqrt(mean_squared_error(ground_truths, predictions))
print(f"Validation RMSE: {rmse}")
examples/forest-cover-type-prediction.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import accuracy_score
5
+
6
# Read both competition tables.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Target is Cover_Type; every column except Id and the target is a feature.
y = train_data["Cover_Type"]
X = train_data.drop(["Id", "Cover_Type"], axis=1)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest on the training split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Accuracy on the 20% hold-out.
accuracy = accuracy_score(y_val, rf.predict(X_val))
print(f"Validation Accuracy: {accuracy}")

# Keep the ids for the submission and predict on the remaining columns.
test_ids = test_data["Id"]
test_data = test_data.drop("Id", axis=1)
test_predictions = rf.predict(test_data)

# Write one predicted cover type per test id.
submission = pd.DataFrame({"Id": test_ids, "Cover_Type": test_predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/godaddy-microbusiness-density-forecasting.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestRegressor
3
+ from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
4
+ from sklearn.metrics import make_scorer
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.pipeline import Pipeline
8
+ from sklearn.impute import SimpleImputer
9
+
10
# Load the data
train_data = pd.read_csv("./input/train.csv")
census_data = pd.read_csv("./input/census_starter.csv")
test_data = pd.read_csv("./input/test.csv")

# Merge train and test data with census data
train_data = train_data.merge(census_data, on="cfips", how="left")
test_data = test_data.merge(census_data, on="cfips", how="left")

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="median")

# Columns to be used as features: numeric training columns other than the
# target. The same columns are later selected from test_data, so anything that
# exists only in the training table is dropped here; otherwise the script
# would fail with a KeyError at prediction time, after the whole grid search.
numeric_train_columns = train_data.select_dtypes(
    exclude=["object", "datetime"]
).columns.drop("microbusiness_density")
feature_columns = [
    column for column in numeric_train_columns if column in test_data.columns
]

# Bundle preprocessing (numerical features only; no categorical columns are used)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, feature_columns),
    ]
)

# Define the model
model = RandomForestRegressor(random_state=0)

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])
39
+
40
+
41
+ # Define SMAPE function
42
def smape(actual, predicted):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|predicted - actual| / ((|actual| + |predicted|) / 2)``;
    terms whose denominator is zero (both values are 0) count as 0.

    Args:
        actual: array-like of true values (list, ndarray or Series).
        predicted: array-like of predictions with the same length.

    Returns:
        ``100 * mean(terms)`` as a float.
    """
    # Imported locally because this script does not import numpy at module level.
    import numpy as np

    # Converting to plain arrays accepts lists and ignores any pandas index,
    # so values are always compared position by position.
    actual = np.atleast_1d(np.asarray(actual, dtype=float))
    predicted = np.atleast_1d(np.asarray(predicted, dtype=float))

    denominator = (np.abs(actual) + np.abs(predicted)) / 2.0
    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0.0 from ``out``. This avoids the divide-by-zero RuntimeWarning that
    # dividing first and masking afterwards produced.
    diff = np.divide(
        np.abs(predicted - actual),
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 100 * diff.mean()
47
+
48
+
49
# SMAPE is an error, so lower is better; make_scorer negates it for the search.
smape_scorer = make_scorer(smape, greater_is_better=False)

# Define the grid of hyperparameters to search
param_grid = {
    "model__n_estimators": [50, 100, 150],
    "model__max_depth": [None, 10, 20, 30],
    "model__min_samples_split": [2, 5, 10],
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    my_pipeline, param_grid=param_grid, cv=3, scoring=smape_scorer, n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(train_data[feature_columns], train_data["microbusiness_density"])

# Print the best parameters and the corresponding SMAPE score
# (best_score_ holds the negated SMAPE, hence the sign flip).
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best SMAPE score: {-grid_search.best_score_}")

# With the default refit=True, GridSearchCV has already refitted the best
# pipeline on the full training data, so it can be used for prediction directly
# instead of being fitted a second time.
best_pipeline = grid_search.best_estimator_
test_preds = best_pipeline.predict(test_data[feature_columns])

# Save test predictions to file
output = pd.DataFrame({"row_id": test_data.row_id, "microbusiness_density": test_preds})
output.to_csv("./working/submission.csv", index=False)
examples/home-data-for-ml-course.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import GradientBoostingRegressor
4
+ from sklearn.metrics import mean_squared_error
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.pipeline import Pipeline
8
+ from sklearn.impute import SimpleImputer
9
+ import numpy as np
10
+
11
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Separate target from predictors
y = train_data.SalePrice
X = train_data.drop(["SalePrice"], axis=1)

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for categorical data
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, X.select_dtypes(exclude=["object"]).columns),
        ("cat", categorical_transformer, X.select_dtypes(include=["object"]).columns),
    ]
)

# Define the model
model = GradientBoostingRegressor()

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Split data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Preprocessing of training data, fit model on the log of the sale price
my_pipeline.fit(X_train, np.log(y_train))

# Preprocessing of validation data, get predictions (in log space)
preds = my_pipeline.predict(X_valid)

# Evaluate the model: RMSE between log prices. The square root is taken
# explicitly because the ``squared=False`` argument of mean_squared_error was
# deprecated and later removed from scikit-learn.
score = np.sqrt(mean_squared_error(np.log(y_valid), preds))
print("RMSE:", score)

# Preprocessing of test data, get predictions (the pipeline is not refitted)
test_preds = my_pipeline.predict(test_data)

# Save test predictions to file, mapping log prices back to prices
output = pd.DataFrame({"Id": test_data.Id, "SalePrice": np.exp(test_preds)})
output.to_csv("./working/submission.csv", index=False)
examples/house-prices-advanced-regression-techniques.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.linear_model import Lasso
4
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
5
+ from sklearn.compose import ColumnTransformer
6
+ from sklearn.pipeline import Pipeline
7
+ from sklearn.impute import SimpleImputer
8
+ from sklearn.metrics import mean_squared_error
9
+ import numpy as np
10
+
11
# Load the data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Identify key features for interaction terms based on domain knowledge
key_features = ["OverallQual", "GrLivArea", "TotalBsmtSF", "GarageCars"]

# Create one product column per unordered pair of key features, for both the
# train and test datasets so they keep the same columns.
for i, first in enumerate(key_features):
    for second in key_features[i + 1 :]:
        name = first + "_X_" + second
        train[name] = train[first] * train[second]
        test[name] = test[first] * test[second]

# Separate features and target variable
X = train.drop(["SalePrice", "Id"], axis=1)
y = np.log(train["SalePrice"])  # Log transformation
test_ids = test["Id"]
test = test.drop(["Id"], axis=1)

# Preprocessing for numerical data
numerical_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

# Preprocessing for categorical data
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, X.select_dtypes(exclude=["object"]).columns),
        ("cat", categorical_transformer, X.select_dtypes(include=["object"]).columns),
    ]
)

# Define model
model = Pipeline(
    steps=[("preprocessor", preprocessor), ("regressor", Lasso(alpha=0.001))]
)

# Split data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Train the model
model.fit(X_train, y_train)

# Predict on validation set
preds_valid = model.predict(X_valid)

# Evaluate the model: RMSE in log space. The square root is taken explicitly
# because the ``squared=False`` argument of mean_squared_error was deprecated
# and later removed from scikit-learn.
score = np.sqrt(mean_squared_error(y_valid, preds_valid))
print(f"Validation RMSE: {score}")

# Predict on test data
test_preds = model.predict(test)

# Save test predictions to file
output = pd.DataFrame(
    {"Id": test_ids, "SalePrice": np.exp(test_preds)}
)  # Re-transform to original scale
output.to_csv("./working/submission.csv", index=False)
examples/icr-identify-age-related-conditions.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import lightgbm as lgb
3
+ from sklearn.model_selection import RandomizedSearchCV, KFold
4
+ from sklearn.metrics import log_loss, make_scorer
5
+ from sklearn.preprocessing import LabelEncoder
6
+ import numpy as np
7
+
8
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Encode categorical features: the encoder is fitted on the training values
# and reused for the test set so both share the same integer codes.
le = LabelEncoder()
train_data["EJ"] = le.fit_transform(train_data["EJ"])
test_data["EJ"] = le.transform(test_data["EJ"])

# Prepare the data
X = train_data.drop(["Id", "Class"], axis=1)
y = train_data["Class"]
X_test = test_data.drop("Id", axis=1)

# Define the model parameters and parameter grid for randomized search
model = lgb.LGBMClassifier(objective="binary", boosting_type="gbdt", is_unbalance=True)
param_grid = {
    "learning_rate": [0.01, 0.05, 0.1],
    "num_leaves": [15, 31, 63],
    "max_depth": [-1, 5, 10],
    "min_child_samples": [10, 20, 30],
    "max_bin": [255, 300],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.3, 0.5, 0.7],
}

# Score with scikit-learn's built-in negated log loss. It evaluates
# predict_proba output exactly like the previous
# make_scorer(log_loss, greater_is_better=False, needs_proba=True), whose
# ``needs_proba`` argument has been deprecated and removed in newer releases.
log_loss_scorer = "neg_log_loss"

# Perform randomized search with cross-validation
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_grid,
    n_iter=10,
    scoring=log_loss_scorer,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    random_state=42,
    verbose=1,
)

random_search.fit(X, y)

# Best model and log loss (best_score_ is the negated loss, hence the sign flip)
best_model = random_search.best_estimator_
best_score = -random_search.best_score_
print(f"Best Log Loss: {best_score}")

# Predict on test set with the best model; column 1 is the positive class
test_predictions = best_model.predict_proba(X_test)[:, 1]

# Create a submission file
submission = pd.DataFrame(
    {
        "Id": test_data["Id"],
        "class_0": 1 - test_predictions,
        "class_1": test_predictions,
    }
)
submission.to_csv("./working/submission.csv", index=False)
examples/jigsaw-toxic-comment-classification-challenge.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import roc_auc_score
6
+
7
# Read the labelled comments.
train_data = pd.read_csv("./input/train.csv")

# The comment text is the only input; every column from the third onwards is
# used as a label column.
X = train_data["comment_text"]
y = train_data.iloc[:, 2:]

# Hold out 20% of the comments for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the TF-IDF vocabulary on the training split only, then reuse it.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words="english")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

# One independent logistic regression per label column, each scored by ROC AUC.
scores = []
for label in y.columns:
    classifier = LogisticRegression(C=1.0, solver="liblinear")
    classifier.fit(X_train_tfidf, y_train[label])
    positive_prob = classifier.predict_proba(X_val_tfidf)[:, 1]
    label_auc = roc_auc_score(y_val[label], positive_prob)
    scores.append(label_auc)
    print(f"ROC AUC for {label}: {label_auc}")

# Average the per-label AUCs.
mean_auc = sum(scores) / len(scores)
print(f"Mean column-wise ROC AUC: {mean_auc}")
examples/new-york-city-taxi-fare-prediction.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestRegressor
3
+ from sklearn.metrics import mean_squared_error
4
+ from sklearn.model_selection import train_test_split
5
+ import numpy as np
6
+
7
# Read the first 500,000 training rows and drop any row with a missing value.
train_df = pd.read_csv("./input/train.csv", nrows=500000)
train_df = train_df.dropna(how="any", axis="rows")

# Build one boolean mask per sanity rule and apply them together:
#   - fare between 2.5 and 500 (inclusive),
#   - 1 to 6 passengers,
#   - pickup and drop-off points that are not both-zero coordinates.
fare_ok = (train_df.fare_amount >= 2.5) & (train_df.fare_amount <= 500)
passengers_ok = (train_df.passenger_count > 0) & (train_df.passenger_count <= 6)
pickup_ok = (train_df["pickup_latitude"] != 0) | (train_df["pickup_longitude"] != 0)
dropoff_ok = (train_df["dropoff_latitude"] != 0) | (train_df["dropoff_longitude"] != 0)
train_df = train_df[fare_ok & passengers_ok & pickup_ok & dropoff_ok]
20
+
21
+
22
+ # Feature engineering
23
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lat1, lon1: latitude/longitude of the first point, in degrees.
        lat2, lon2: latitude/longitude of the second point, in degrees.
            Scalars, NumPy arrays and pandas Series are all accepted and are
            combined element-wise.

    Returns:
        The haversine distance on a sphere of radius 6371 km, with the same
        container type as the inputs.
    """
    R = 6371  # radius of Earth in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = (
        np.sin(delta_phi / 2) ** 2
        + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    )
    # Mathematically 0 <= a <= 1, but rounding can push it slightly above 1 for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
36
+
37
+
38
COORDINATE_COLUMNS = [
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
]


def add_pickup_time_features(frame):
    """Parse ``pickup_datetime`` and add year/month/day/hour/weekday columns.

    The frame is modified in place and also returned, so the same code path
    serves both the training and the test table.
    """
    frame["pickup_datetime"] = pd.to_datetime(frame["pickup_datetime"])
    stamp = frame["pickup_datetime"].dt
    frame["year"] = stamp.year
    frame["month"] = stamp.month
    frame["day"] = stamp.day
    frame["hour"] = stamp.hour
    frame["weekday"] = stamp.weekday
    return frame


def add_trip_distance(frame):
    """Add a ``distance`` column computed from pickup and drop-off coordinates."""
    frame["distance"] = haversine_distance(
        frame["pickup_latitude"],
        frame["pickup_longitude"],
        frame["dropoff_latitude"],
        frame["dropoff_longitude"],
    )
    return frame


train_df = add_pickup_time_features(train_df)
train_df = add_trip_distance(train_df)

# Select features and target variable
features = [
    "year",
    "month",
    "day",
    "hour",
    "weekday",
    "passenger_count",
    "pickup_latitude",
    "pickup_longitude",
    "dropoff_latitude",
    "dropoff_longitude",
    "distance",
]
target = "fare_amount"

X = train_df[features]
y = train_df[target]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
rf = RandomForestRegressor(n_estimators=50, max_depth=25, random_state=42)
rf.fit(X_train, y_train)

# Predict on validation set
y_pred = rf.predict(X_val)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")

# Prepare the test set
test_df = pd.read_csv("./input/test.csv")
test_df = add_pickup_time_features(test_df)

# Impute NaN values in the test set using median from the training set.
# The result is assigned back to the column: calling fillna(inplace=True) on
# ``test_df[feature]`` operates on a temporary object and leaves test_df
# unchanged when pandas Copy-on-Write is active.
for feature in COORDINATE_COLUMNS:
    median_value = train_df[feature].median()
    test_df[feature] = test_df[feature].fillna(median_value)

test_df = add_trip_distance(test_df)

# Predict on test set
X_test = test_df[features]
test_df["fare_amount"] = rf.predict(X_test)

# Save predictions
submission = test_df[["key", "fare_amount"]]
submission.to_csv("./working/submission.csv", index=False)
examples/nlp-getting-started.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import f1_score
6
+
7
# Read both competition tables.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Hold out 20% of the labelled tweets for validation.
X_train, X_val, y_train, y_val = train_test_split(
    train_data["text"],
    train_data["target"],
    test_size=0.2,
    random_state=42,
)

# Learn the TF-IDF vocabulary from the training split only.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Fit a default logistic regression on the TF-IDF features.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Score hard predictions on the hold-out with F1.
val_predictions = model.predict(X_val_tfidf)
f1 = f1_score(y_val, val_predictions)
print(f"F1 Score on the validation set: {f1}")

# Vectorise the test tweets with the fitted vocabulary and write the submission.
test_predictions = model.predict(vectorizer.transform(test_data["text"]))
submission = pd.DataFrame({"id": test_data["id"], "target": test_predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/optiver-trading-at-the-close.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Fill missing values in the numeric training columns with each column's median
numeric_columns_train = train_data.select_dtypes(include=[np.number]).columns
train_data[numeric_columns_train] = train_data[numeric_columns_train].fillna(
    train_data[numeric_columns_train].median()
)

# Same treatment for the numeric test columns, using the test set's own medians
numeric_columns_test = test_data.select_dtypes(include=[np.number]).columns
test_data[numeric_columns_test] = test_data[numeric_columns_test].fillna(
    test_data[numeric_columns_test].median()
)

# Prepare features and target
X = train_data.drop(["row_id", "target"], axis=1)
y = train_data["target"]

# Initialize LightGBM regressor
model = LGBMRegressor()

# Prepare cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
mae_scores = []

# Perform 10-fold cross-validation
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Train on this fold and score on its held-out part
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mae_scores.append(mean_absolute_error(y_val, y_pred))

# Print the average MAE across all folds
print(f"Average MAE: {np.mean(mae_scores)}")

# Refit on every training row before predicting: after the loop `model` only
# holds the last fold's fit, which has seen 90% of the data.
model.fit(X, y)

# Select exactly the training feature columns, in training order, so that any
# extra column in the test file cannot shift or break the feature matrix.
test_features = test_data[X.columns]
test_data["target"] = model.predict(test_features)

# Save predictions to submission.csv
submission = test_data[["row_id", "target"]]
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e1.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")
# Loaded as in the original script; not used further below.
sample_submission = pd.read_csv("./input/sample_submission.csv")

# Prepare the data
X = train_data.drop(["MedHouseVal", "id"], axis=1)
y = train_data["MedHouseVal"]
X_test = test_data.drop("id", axis=1)

# Training parameters are shared by the CV models and the final model, so they
# are defined once here instead of inside the loop body.
params = {"objective": "regression", "metric": "rmse", "verbosity": -1}

# Prepare cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
rmse_scores = []

# Perform 10-fold cross-validation
for train_index, val_index in kf.split(X):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Create LightGBM datasets
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)

    # Train the model. Per-iteration evaluation logging is silenced through
    # the log_evaluation callback; the former `verbose_eval` keyword is not
    # accepted by newer LightGBM releases.
    model = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        callbacks=[lgb.log_evaluation(period=0)],
    )

    # Predict on validation set
    y_pred = model.predict(X_val, num_iteration=model.best_iteration)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_scores.append(rmse)

# Print the average RMSE across the folds
print(f"Average RMSE: {np.mean(rmse_scores)}")

# Train the model on the full dataset (no validation set, so nothing to log)
full_train_set = lgb.Dataset(X, label=y)
final_model = lgb.train(params, full_train_set)

# Predict on the test set
predictions = final_model.predict(X_test, num_iteration=final_model.best_iteration)

# Prepare the submission file
submission = pd.DataFrame({"id": test_data["id"], "MedHouseVal": predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e11.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
import numpy as np

# Read the training and test tables.
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# "cost" is the target; "id" is dropped from the features on both sides.
features = train_df.drop(columns=["id", "cost"])
target = train_df["cost"]
test_features = test_df.drop(columns="id")

# Keep 20% of the training rows aside for validation (fixed seed).
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Fit a default LightGBM regressor on the fitting split.
regressor = lgb.LGBMRegressor(random_state=42)
regressor.fit(X_fit, y_fit)

# Predict for the hold-out rows and for the test rows.
holdout_pred = regressor.predict(X_holdout)
test_pred = regressor.predict(test_features)

# Root mean squared logarithmic error on the hold-out rows.
rmsle = np.sqrt(mean_squared_log_error(y_holdout, holdout_pred))
print(f"Validation RMSLE: {rmsle}")

# Write the submission file.
pd.DataFrame({"id": test_df["id"], "cost": test_pred}).to_csv(
    "./working/submission.csv", index=False
)
examples/playground-series-s3e13.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelEncoder

# Read the training and test tables.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "prognosis" is the target; "id" is kept aside for the submission file.
y = train_data["prognosis"]
X = train_data.drop(columns=["id", "prognosis"])
test_ids = test_data["id"]
X_test = test_data.drop(columns=["id"])

# Map the target labels to integer codes.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Hold out 20% of the rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# One-vs-rest wrapper around a 100-tree random forest.
base_forest = RandomForestClassifier(n_estimators=100, random_state=42)
model = OneVsRestClassifier(base_forest)
model.fit(X_train, y_train)

# Class scores for the validation rows.
y_val_pred_proba = model.predict_proba(X_val)


def _top3_val_labels(score_row):
    """Return the labels of the three highest-scoring classes, best first."""
    best_first = score_row.argsort()[-3:][::-1]
    return label_encoder.inverse_transform(best_first)


# One array of three labels per validation row.
top3_preds = pd.DataFrame(y_val_pred_proba).apply(_top3_val_labels, axis=1)
37
+
38
+
39
+ # Evaluate the model using MPA@3
40
def mpa_at_k(y_true, y_pred, k=3):
    """Return the mean reciprocal-rank score of the true label within the top k.

    Each sample contributes ``1 / rank`` when its true label appears among the
    first ``k`` entries of its prediction list (rank counted from 1), and 0
    otherwise. The contributions are averaged over all samples.

    Args:
        y_true: Sequence of true labels, one per sample.
        y_pred: Iterable of ranked prediction sequences (best first), one per
            sample, in the same order as ``y_true``.
        k: Number of leading predictions that count. Earlier versions accepted
            this argument but never applied it; predictions longer than ``k``
            are now truncated before scoring.

    Returns:
        The averaged score as a float; 0.0 when ``y_true`` is empty.
    """
    y_true = list(y_true)
    if not y_true:
        # Avoid dividing by zero on an empty evaluation set.
        return 0.0

    score = 0.0
    for true, pred in zip(y_true, y_pred):
        top_k = list(pred)[:k]
        if true in top_k:
            score += 1.0 / (top_k.index(true) + 1)
    return score / len(y_true)
49
+
50
+
51
# Score the validation predictions with MPA@3, comparing original labels.
y_val_true = label_encoder.inverse_transform(y_val)
mpa_score = mpa_at_k(y_val_true, top3_preds)
print(f"MPA@3 score on the validation set: {mpa_score}")


def _top3_test_labels(score_row):
    """Return the labels of the three highest-scoring classes, best first."""
    best_first = score_row.argsort()[-3:][::-1]
    return label_encoder.inverse_transform(best_first)


# Class scores for the test rows, reduced to three labels per row.
y_test_pred_proba = model.predict_proba(X_test)
top3_test_preds = pd.DataFrame(y_test_pred_proba).apply(_top3_test_labels, axis=1)

# Each submission row lists its three labels separated by single spaces.
prognosis_strings = [
    " ".join(str(label) for label in labels) for labels in top3_test_preds
]
submission = pd.DataFrame({"id": test_ids, "prognosis": prognosis_strings})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e14.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

# Read the training and test tables.
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# "yield" is the target; "id" is dropped from the features on both sides.
features = train_df.drop(columns=["id", "yield"])
target = train_df["yield"]
test_features = test_df.drop(columns="id")

# Hold out 20% of the rows for validation (fixed seed).
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# The scaler is fitted on the fitting split only and reused everywhere else.
scaler = StandardScaler()
X_fit_scaled = scaler.fit_transform(X_fit)
X_holdout_scaled = scaler.transform(X_holdout)
test_scaled = scaler.transform(test_features)

# Base learners of the stack.
boosted = GradientBoostingRegressor(
    n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42
)
forest = RandomForestRegressor(n_estimators=200, random_state=42)
linear = LinearRegression()

# Stack the three base learners under a RidgeCV final estimator.
stack = StackingRegressor(
    estimators=[("gbr", boosted), ("rf", forest), ("lr", linear)],
    final_estimator=RidgeCV(),
)

# Fit on the fitting split and report MAE on the hold-out split.
stack.fit(X_fit_scaled, y_fit)
holdout_mae = mean_absolute_error(y_holdout, stack.predict(X_holdout_scaled))
print(f"Mean Absolute Error on validation set with StackingRegressor: {holdout_mae}")

# Refit on all training rows (scaled with the already-fitted scaler), then
# predict the test rows.
stack.fit(scaler.transform(features), target)
test_pred = stack.predict(test_scaled)

# Write the submission file.
pd.DataFrame({"id": test_df["id"], "yield": test_pred}).to_csv(
    "./working/submission.csv", index=False
)
examples/playground-series-s3e15.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, KFold
from sklearn.impute import SimpleImputer
import numpy as np

TARGET = "x_e_out [-]"

# Load the data
data = pd.read_csv("./input/data.csv")

# Every numeric column except the target is used as a feature.
numeric_columns = data.select_dtypes(include=[np.number]).columns
feature_columns = numeric_columns.drop(TARGET)

# Rows with a known target are used for training; rows with a missing target
# are the ones to predict. `.copy()` makes nan_data an independent frame so
# the prediction column can be assigned to it safely further down.
train_data = data.dropna(subset=[TARGET])
nan_data = data[data[TARGET].isna()].copy()

# Impute missing values in features with the training-row means
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(train_data[feature_columns])
y_train = train_data[TARGET]

# Evaluate the linear regression model with 10-fold cross-validation
model = LinearRegression()
kf = KFold(n_splits=10, shuffle=True, random_state=1)
rmse_scores = cross_val_score(
    model, X_train_imputed, y_train, scoring="neg_root_mean_squared_error", cv=kf
)
print(f"10-fold CV RMSE: {-np.mean(rmse_scores):.4f} (+/- {np.std(rmse_scores):.4f})")

# Fit the model on the entire training set
model.fit(X_train_imputed, y_train)

# The sample submission provides the ids that must appear in the output
test_data = pd.read_csv("./input/sample_submission.csv")

# Predict the missing target values
X_test_imputed = imputer.transform(nan_data[feature_columns])
nan_data[TARGET] = model.predict(X_test_imputed)

# Merge predictions onto the submission ids. Only the "id" column of the
# sample file is kept: if that file already carries a target column, merging
# the full frame would produce suffixed "_x"/"_y" columns instead of a single
# target column.
test_data = test_data[["id"]].merge(nan_data[["id", TARGET]], on="id", how="left")

# Save the submission file
test_data.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e16.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import PolynomialFeatures

# Read the training and test tables.
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "Age" is the target; "id" is dropped from the features on both sides.
y = train_data["Age"]
X = train_data.drop(columns=["Age", "id"])
test_X = test_data.drop(columns=["id"])
15
+
16
+
17
+ # Generate polynomial features for selected columns and ensure unique feature names by adding a prefix
18
def generate_poly_features(
    df, feature_names, degree=2, include_bias=False, prefix="poly_"
):
    """Build polynomial terms for the selected columns of ``df``.

    Args:
        df: Source DataFrame.
        feature_names: Columns of ``df`` to expand.
        degree: Degree passed to ``PolynomialFeatures``.
        include_bias: Whether ``PolynomialFeatures`` adds its bias column.
        prefix: String prepended to every generated column name so the new
            columns do not clash with the existing ones.

    Returns:
        A new DataFrame holding the generated terms. It is built from the raw
        transformed array, so it carries a fresh default index rather than the
        index of ``df``.
    """
    transformer = PolynomialFeatures(degree=degree, include_bias=include_bias)
    expanded = transformer.fit_transform(df[feature_names])
    generated_names = transformer.get_feature_names_out(feature_names)
    prefixed_names = [prefix + term for term in generated_names]
    return pd.DataFrame(expanded, columns=prefixed_names)
28
+
29
+
30
+ # Apply polynomial feature generation to both train and test datasets
31
# Apply polynomial feature generation to both train and test datasets
poly_features_train = generate_poly_features(X, ["Length", "Diameter", "Height"])
poly_features_test = generate_poly_features(test_X, ["Length", "Diameter", "Height"])

# Concatenate the polynomial features with the original dataset. The indexes
# are reset so both frames line up row by row.
X_poly = pd.concat([X.reset_index(drop=True), poly_features_train], axis=1)
test_X_poly = pd.concat([test_X.reset_index(drop=True), poly_features_test], axis=1)

# Specify categorical features
cat_features = ["Sex"]

# Initialize KFold
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold validation MAE and per-fold test predictions
mae_scores = []
fold_test_predictions = []

# Define hyperparameters
hyperparams = {
    "iterations": 1500,
    "learning_rate": 0.05,
    "depth": 8,
    "loss_function": "MAE",
    "cat_features": cat_features,
    "verbose": 0,
}

# Loop over each fold
for train_index, test_index in kf.split(X_poly):
    X_train, X_val = X_poly.iloc[train_index], X_poly.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # Initialize CatBoostRegressor with hyperparameters
    model = CatBoostRegressor(**hyperparams)

    # Train the model, stopping early on this fold's validation part
    model.fit(
        X_train,
        y_train,
        cat_features=cat_features,
        eval_set=(X_val, y_val),
        early_stopping_rounds=100,
        verbose=0,
    )

    # Score this fold
    predictions = model.predict(X_val)
    mae_scores.append(mean_absolute_error(y_val, predictions))

    # Keep this fold's test predictions so the submission can use every fold
    # model instead of only the one left over from the final iteration.
    fold_test_predictions.append(model.predict(test_X_poly))

# Print the average MAE across all folds
print(f"Average MAE across all folds: {sum(mae_scores) / len(mae_scores)}")

# Average the test predictions of all fold models
test_predictions = sum(fold_test_predictions) / len(fold_test_predictions)

# Prepare submission file
submission_df = pd.DataFrame({"id": test_data["id"], "Age": test_predictions})
submission_df.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e17.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# One-hot encode categorical features. The encoder is left at its default
# (sparse) output and densified with .toarray(): the `sparse=False` keyword is
# not accepted by newer scikit-learn releases, while this form works on old
# and new versions alike.
categorical_columns = ["Product ID", "Type"]
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_features = encoder.fit_transform(train_data[categorical_columns]).toarray()
encoded_test_features = encoder.transform(test_data[categorical_columns]).toarray()
encoded_columns = encoder.get_feature_names_out()

# Add encoded features back to the dataframes. The encoded frames reuse the
# source index so that the index-based join pairs each row with itself.
encoded_df = pd.DataFrame(
    encoded_features, columns=encoded_columns, index=train_data.index
)
train_data = train_data.join(encoded_df).drop(categorical_columns, axis=1)

encoded_test_df = pd.DataFrame(
    encoded_test_features, columns=encoded_columns, index=test_data.index
)
test_data = test_data.join(encoded_test_df).drop(categorical_columns, axis=1)

# Split the data into features and target
X = train_data.drop(["Machine failure", "id"], axis=1)
y = train_data["Machine failure"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
    "min_samples_leaf": [1, 2],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=rf, param_grid=param_grid, cv=3, scoring="roc_auc", n_jobs=-1
)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best estimator
best_rf = grid_search.best_estimator_

# Predict on the validation set using the best estimator
y_pred_proba = best_rf.predict_proba(X_val)[:, 1]

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_val, y_pred_proba)
print(f"AUC-ROC score: {auc_roc}")

# Predict on the test set using the best estimator
test_predictions = best_rf.predict_proba(test_data.drop("id", axis=1))[:, 1]

# Create the submission file
submission = pd.DataFrame({"id": test_data["id"], "Machine failure": test_predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e18.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
    GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Identify the features present in both files, excluding "id". The list is
# built in training-column order: intersecting two sets yields an arbitrary
# order that can change between runs, which would make the fitted models
# differ from run to run despite the fixed random seeds.
test_columns = set(test_data.columns)
common_features = [
    col for col in train_data.columns if col in test_columns and col != "id"
]

# Prepare the data
X_train = train_data[common_features]
y_train_EC1 = train_data["EC1"]
y_train_EC2 = train_data["EC2"]
X_test = test_data[common_features]

# Initialize StratifiedKFold for cross-validation
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Initialize individual models
model1_EC1 = LGBMClassifier(random_state=42)
model1_EC2 = LGBMClassifier(random_state=42)
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model3 = LogisticRegression(max_iter=1000)
model4 = GradientBoostingClassifier(random_state=42)

# Define the parameter grid for GradientBoostingClassifier
param_grid = {
    "n_estimators": [100, 200],
    "learning_rate": [0.05, 0.1],
    "max_depth": [3, 5],
}

# Tune GradientBoostingClassifier on EC1 only; the parameters found there are
# reused for the EC2 ensemble as well.
grid_search = GridSearchCV(model4, param_grid, cv=skf, scoring="roc_auc")
grid_search.fit(X_train, y_train_EC1)
best_params = grid_search.best_params_

# Update the GradientBoostingClassifier with the best parameters
model4 = GradientBoostingClassifier(random_state=42, **best_params)

# Combine models into a VotingClassifier with soft voting
voting_clf_EC1 = VotingClassifier(
    estimators=[("lgbm", model1_EC1), ("rf", model2), ("lr", model3), ("gbc", model4)],
    voting="soft",
)
voting_clf_EC2 = VotingClassifier(
    estimators=[("lgbm", model1_EC2), ("rf", model2), ("lr", model3), ("gbc", model4)],
    voting="soft",
)

# Cross-validate the ensemble for EC1
cv_scores_EC1 = cross_val_score(
    voting_clf_EC1, X_train, y_train_EC1, cv=skf, scoring="roc_auc"
)
auc_EC1 = np.mean(cv_scores_EC1)

# Cross-validate the ensemble for EC2
cv_scores_EC2 = cross_val_score(
    voting_clf_EC2, X_train, y_train_EC2, cv=skf, scoring="roc_auc"
)
auc_EC2 = np.mean(cv_scores_EC2)

# Print the evaluation metric for each target
print(f"Validation AUC for EC1: {auc_EC1}")
print(f"Validation AUC for EC2: {auc_EC2}")
print(f"Average Validation AUC: {(auc_EC1 + auc_EC2) / 2}")

# Fit the ensemble models on the entire training set
voting_clf_EC1.fit(X_train, y_train_EC1)
voting_clf_EC2.fit(X_train, y_train_EC2)

# Predict probabilities for the test set
test_data["EC1"] = voting_clf_EC1.predict_proba(X_test)[:, 1]
test_data["EC2"] = voting_clf_EC2.predict_proba(X_test)[:, 1]

# Prepare the submission file
submission = test_data[["id", "EC1", "EC2"]]
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e19.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from catboost import CatBoostRegressor, Pool
3
+ from sklearn.model_selection import KFold
4
+ import numpy as np
5
+ from sklearn.preprocessing import LabelEncoder
6
+
7
+
8
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    elements where both values are zero contribute 0. The contributions are
    averaged and multiplied by 100.

    Args:
        y_true: Array-like of true values (list, NumPy array or pandas Series).
        y_pred: Array-like of predictions with a matching shape.

    Returns:
        The SMAPE value as a float.
    """
    # Work on plain float arrays so lists are accepted and pandas inputs are
    # compared by position rather than by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining positions
    # keep the 0.0 they were initialised with, so no 0/0 warning is raised.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return 100 * np.mean(diff)
13
+
14
+
15
+ # Load data
16
# Load data
train = pd.read_csv("./input/train.csv")
test = pd.read_csv("./input/test.csv")

# Convert date to datetime and extract features (both frames are modified in place)
for df in [train, test]:
    df["date"] = pd.to_datetime(df["date"])
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.month
    df["day"] = df["date"].dt.day
    df["day_of_week"] = df["date"].dt.dayofweek  # Extract day of the week
    # Create interaction terms
    df["country_store"] = df["country"] + "_" + df["store"]
    df["country_product"] = df["country"] + "_" + df["product"]
    df["store_product"] = df["store"] + "_" + df["product"]

# Encode new categorical features. Each encoder is fitted on the training
# values only, so a combination that appears only in the test set makes
# `transform` raise.
label_encoders = {}
for col in ["country_store", "country_product", "store_product"]:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# Define categorical features including new interaction terms
cat_features = [
    "country",
    "store",
    "product",
    "country_store",
    "country_product",
    "store_product",
]

# Prepare data for training and prediction
X = train.drop(["num_sold", "date", "id"], axis=1)
y = train["num_sold"]
test_data = test.drop(["date", "id"], axis=1)

# Cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)
smape_scores = []
fold_test_predictions = []

for train_index, test_index in kf.split(X):
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    model = CatBoostRegressor(
        iterations=2000,
        learning_rate=0.05,
        depth=8,
        loss_function="MAE",
        cat_features=cat_features,
        verbose=200,
    )
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True)

    preds = model.predict(X_valid)
    score = smape(y_valid, preds)
    smape_scores.append(score)

    # Keep this fold's test predictions so the submission can use every fold
    # model instead of only the one left over from the final iteration.
    fold_test_predictions.append(model.predict(test_data))

print(f"Average SMAPE: {np.mean(smape_scores)}")

# Average the test predictions of all fold models
predictions = np.mean(fold_test_predictions, axis=0)

# Save submission
submission = pd.DataFrame({"id": test["id"], "num_sold": predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e20.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Split the composite key on "_" into its five parts
key_parts = ["ID", "latitude", "longitude", "year", "week_no"]
train_data[key_parts] = train_data["ID_LAT_LON_YEAR_WEEK"].str.split("_", expand=True)
test_data[key_parts] = test_data["ID_LAT_LON_YEAR_WEEK"].str.split("_", expand=True)

# Convert to numeric types
for col in ["latitude", "longitude", "year", "week_no"]:
    train_data[col] = pd.to_numeric(train_data[col])
    test_data[col] = pd.to_numeric(test_data[col])

# One-hot encoding for 'week_no'
train_data = pd.get_dummies(train_data, columns=["week_no"])
test_data = pd.get_dummies(test_data, columns=["week_no"])

# Align test_data columns with train_data; columns missing from the test
# frame (including "emission") are created and filled with 0.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

# Prepare the data for training
X = train_data.drop(columns=["emission", "ID_LAT_LON_YEAR_WEEK", "ID"])
y = train_data["emission"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the LightGBM model
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

params = {"objective": "regression", "metric": "rmse", "verbose": -1}

# Early stopping is requested through the callback; the former
# `early_stopping_rounds` keyword of lgb.train is not accepted by newer
# LightGBM releases.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Predict on validation set
y_pred = gbm.predict(X_val, num_iteration=gbm.best_iteration)

# Evaluate the model. The square root is taken explicitly because the
# `squared=False` keyword is not accepted by newer scikit-learn releases.
rmse = mean_squared_error(y_val, y_pred) ** 0.5
print(f"Validation RMSE: {rmse}")

# Predict on test set and save submission
test_features = test_data.drop(columns=["ID_LAT_LON_YEAR_WEEK", "ID", "emission"])
test_data["emission"] = gbm.predict(test_features, num_iteration=gbm.best_iteration)
submission = test_data[["ID_LAT_LON_YEAR_WEEK", "emission"]]
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e22.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Identify object-typed training columns that also exist in the test file
categorical_features = train_data.select_dtypes(include=["object"]).columns
common_categorical_features = [
    feature for feature in categorical_features if feature in test_data.columns
]

# Convert common categorical columns to 'category' data type
for feature in common_categorical_features:
    train_data[feature] = train_data[feature].astype("category")
    test_data[feature] = test_data[feature].astype("category")

# Separate features and target
X = train_data.drop(["id", "outcome"], axis=1)
y = train_data["outcome"]
X_test = test_data.drop(["id"], axis=1)

# Prepare for cross-validation
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
predictions = pd.DataFrame()
scores = []

# Perform 10-fold cross-validation
for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    # Train the model. Early stopping and silent evaluation are requested via
    # callbacks; the former `early_stopping_rounds` / `verbose` keywords of
    # fit() are not accepted by newer LightGBM releases.
    model = lgb.LGBMClassifier(objective="multiclass", random_state=42)
    model.fit(
        X_train,
        y_train,
        categorical_feature=common_categorical_features,
        eval_set=[(X_valid, y_valid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50, verbose=False),
            lgb.log_evaluation(period=0),
        ],
    )

    # Make predictions
    y_pred = model.predict(X_valid)

    # Calculate the F1 score
    score = f1_score(y_valid, y_pred, average="micro")
    scores.append(score)

    # Predict on test set; one column of predicted labels per fold
    predictions[f"fold_{fold_n}"] = model.predict(X_test)

# Print the average F1 score across all folds
print(f"Average F1-Score: {sum(scores) / len(scores)}")

# Prepare submission file: the most frequent label across the fold columns
# (first mode when several labels tie).
submission = pd.DataFrame()
submission["id"] = test_data["id"]
submission["outcome"] = predictions.mode(axis=1)[0]
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e23.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import RFECV
import lightgbm as lgb
from bayes_opt import BayesianOptimization

# Read the training and test tables.
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# "defects" is the target; "id" is kept aside for the submission file.
features = train_df.drop(columns=["id", "defects"])
labels = train_df["defects"]
test_features = test_df.drop(columns="id")
test_ids = test_df["id"]

# Fixed LightGBM settings used for selection, the final fit and evaluation.
lgbm_params = dict(
    num_leaves=31,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    max_depth=15,
    reg_alpha=0.5,
    reg_lambda=0.5,
    objective="binary",
    metric="auc",
    verbosity=-1,
    n_jobs=-1,
    random_state=42,
)
classifier = lgb.LGBMClassifier(**lgbm_params)

# Recursive feature elimination, dropping one feature per step and scoring
# each candidate set by ROC AUC over 10 folds.
selector = RFECV(
    estimator=classifier, step=1, cv=KFold(n_splits=10), scoring="roc_auc", n_jobs=-1
)
selector.fit(features, labels)
print(f"Optimal number of features: {selector.n_features_}")

# Keep only the selected features on both sides.
features_selected = selector.transform(features)
test_selected = selector.transform(test_features)

# Fit on all training rows and write the submission before the evaluation
# loop below refits the same estimator on fold subsets.
classifier.fit(features_selected, labels)
test_proba = classifier.predict_proba(test_selected)[:, 1]
pd.DataFrame({"id": test_ids, "defects": test_proba}).to_csv(
    "./working/submission.csv", index=False
)

# Shuffled 10-fold evaluation of the model on the selected features.
fold_aucs = []
splitter = KFold(n_splits=10, shuffle=True, random_state=42)
for fit_idx, holdout_idx in splitter.split(features_selected):
    classifier.fit(features_selected[fit_idx], labels.iloc[fit_idx])
    holdout_proba = classifier.predict_proba(features_selected[holdout_idx])[:, 1]
    fold_aucs.append(roc_auc_score(labels.iloc[holdout_idx], holdout_proba))

# Report the mean AUC over the folds.
mean_auc_score = np.mean(fold_aucs)
print(f"Mean AUC Score with Selected Features: {mean_auc_score}")
examples/playground-series-s3e24.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import lightgbm as lgb
3
+ from sklearn.model_selection import train_test_split, KFold
4
+ from sklearn.metrics import roc_auc_score
5
+ from sklearn.preprocessing import StandardScaler
6
+ from bayes_opt import BayesianOptimization
7
+
8
# Read the competition tables from the input directory
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Target is the "smoking" column; features are everything except it and the row id
y = train_data["smoking"]
X = train_data.drop(columns=["id", "smoking"])
X_test = test_data.drop(columns="id")

# Hold out 20% of the labelled rows for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise every split with statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
25
+
26
+
27
+ # Define the LightGBM cross-validation function
28
def lgb_cv(
    learning_rate,
    num_leaves,
    min_child_samples,
    subsample,
    colsample_bytree,
    max_depth,
    reg_alpha,
    reg_lambda,
    n_estimators,
):
    """Return the best mean cross-validated AUC for one hyper-parameter set.

    This is the objective handed to ``BayesianOptimization``. Integer-valued
    parameters are truncated with ``int`` and bounded ones are clamped before
    they are given to LightGBM. The score comes from a 10-fold stratified
    ``lgb.cv`` run on the module-level ``X_train_scaled`` / ``y_train``.

    Returns:
        The maximum of the per-iteration mean AUC values reported by ``lgb.cv``.
    """
    params = {
        "objective": "binary",
        "metric": "auc",
        "boosting_type": "gbdt",
        "learning_rate": max(min(learning_rate, 1), 0),
        "verbose": -1,
        "num_leaves": int(num_leaves),
        "min_child_samples": int(min_child_samples),
        # NOTE(review): LightGBM only applies row subsampling when a bagging
        # frequency is also set -- confirm whether `subsample` has any effect here.
        "subsample": max(min(subsample, 1), 0),
        "colsample_bytree": max(min(colsample_bytree, 1), 0),
        "max_depth": int(max_depth),
        "reg_alpha": max(reg_alpha, 0),
        "reg_lambda": max(reg_lambda, 0),
    }
    cv_result = lgb.cv(
        params,
        lgb.Dataset(X_train_scaled, label=y_train),
        # Passed explicitly instead of through the `n_estimators` alias in params.
        num_boost_round=int(n_estimators),
        nfold=10,
        seed=42,
        stratified=True,
        metrics=["auc"],
        # Replaces the `verbose_eval=200` argument, which newer LightGBM removed.
        callbacks=[lgb.log_evaluation(period=200)],
    )
    # The result key is "auc-mean" or "valid auc-mean" depending on the
    # LightGBM version, so look it up by suffix instead of hard-coding it.
    auc_key = next(key for key in cv_result if key.endswith("auc-mean"))
    return max(cv_result[auc_key])
64
+
65
+
66
# Search space explored by the Bayesian optimiser (lower bound, upper bound)
param_bounds = {
    "learning_rate": (0.01, 0.2),
    "num_leaves": (20, 60),
    "min_child_samples": (5, 50),
    "subsample": (0.6, 1.0),
    "colsample_bytree": (0.6, 1.0),
    "max_depth": (5, 15),
    "reg_alpha": (0, 1),
    "reg_lambda": (0, 1),
    "n_estimators": (100, 1000),  # Increased range for n_estimators
}

# Perform Bayesian optimization with increased initial points and iterations
optimizer = BayesianOptimization(f=lgb_cv, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=10, n_iter=50)

# Retrieve the best parameters; the optimiser reports floats, so the
# integer-valued ones are truncated before they reach the classifier.
best_params = dict(optimizer.max["params"])
for int_param in ("num_leaves", "min_child_samples", "max_depth", "n_estimators"):
    best_params[int_param] = int(best_params[int_param])

# Train and validate the model with the best parameters
final_gbm = lgb.LGBMClassifier(**best_params)
final_gbm.fit(X_train_scaled, y_train)
val_predictions = final_gbm.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, val_predictions)
print(f"Validation AUC score: {val_auc}")

# Refit on the full labelled dataset. The scaler is re-fitted on all of X, so
# the test features must be transformed again with the re-fitted scaler;
# otherwise they would use statistics different from those the final model
# was trained with.
X_full_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)
final_gbm.fit(X_full_scaled, y)
predictions = final_gbm.predict_proba(X_test_scaled)[:, 1]

# Prepare the submission file
submission = pd.DataFrame({"id": test_data["id"], "smoking": predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e25.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split, GridSearchCV
3
+ from sklearn.svm import SVR
4
+ from sklearn.metrics import median_absolute_error
5
+ from sklearn.preprocessing import StandardScaler, PolynomialFeatures
6
+
7
# Read the labelled data and split off the "Hardness" target
train_data = pd.read_csv("./input/train.csv")
y = train_data["Hardness"]
X = train_data.drop(columns=["id", "Hardness"])

# 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Pairwise interaction terms followed by standardisation; both transformers
# are fitted on the training split and only applied to the validation split.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(poly.fit_transform(X_train))
X_val_scaled = scaler.transform(poly.transform(X_val))

# Grid of RBF-SVR hyper-parameters to search exhaustively
param_grid = {
    "C": [0.1, 0.5, 1, 1.5, 2, 2.5, 3],
    "gamma": [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3],
    "epsilon": [0.001, 0.005, 0.01, 0.015, 0.02, 0.025, 0.03],
}

# 5-fold grid search scored by (negated) median absolute error
grid_search = GridSearchCV(
    SVR(kernel="rbf"),
    param_grid,
    cv=5,
    scoring="neg_median_absolute_error",
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_scaled, y_train)
print(f"Best parameters: {grid_search.best_params_}")

# Score the refitted best estimator on the held-out split
best_model = grid_search.best_estimator_
medae = median_absolute_error(y_val, best_model.predict(X_val_scaled))
print(f"Median Absolute Error: {medae}")

# Apply the same feature pipeline to the test set and write the submission
test_data = pd.read_csv("./input/test.csv")
X_test_scaled = scaler.transform(poly.transform(test_data.drop(columns=["id"])))
submission = pd.DataFrame(
    {"id": test_data["id"], "Hardness": best_model.predict(X_test_scaled)}
)
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e26.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import log_loss
5
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.pipeline import Pipeline
8
+
9
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Separate features and target
X = train_data.drop(["Status", "id"], axis=1)
y = train_data["Status"]
X_test = test_data.drop("id", axis=1)

# Numeric columns are standardised, string columns are one-hot encoded
# (categories unseen during fitting are encoded as all zeros).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), X.select_dtypes(exclude=["object"]).columns),
        (
            "cat",
            OneHotEncoder(handle_unknown="ignore"),
            X.select_dtypes(include=["object"]).columns,
        ),
    ]
)

# Bundle preprocessing and the random forest in a single pipeline
model = RandomForestClassifier(n_estimators=100, random_state=0)
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Split data into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Fit on the training split and predict class probabilities for validation
clf.fit(X_train, y_train)
preds = clf.predict_proba(X_valid)

# Evaluate the model. Passing the fitted class labels keeps the probability
# columns aligned with the targets even if some class does not occur in the
# validation split.
score = log_loss(y_valid, preds, labels=clf.classes_)
print("Log Loss:", score)

# Predict class probabilities for the test set
test_preds = clf.predict_proba(X_test)

# Generate submission file. Column names are derived from the order of
# `clf.classes_` (the column order of `predict_proba`) instead of assuming
# fixed positions for each status.
output = pd.DataFrame(
    test_preds, columns=[f"Status_{label}" for label in clf.classes_]
)
output.insert(0, "id", test_data["id"].to_numpy())
output.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e3.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.metrics import roc_auc_score
5
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.pipeline import Pipeline
8
+
9
# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Target column is "Attrition"; the row id is not used as a feature
y = train_data["Attrition"]
X = train_data.drop(columns=["Attrition", "id"])
X_test = test_data.drop(columns="id")

# Column groups, chosen by dtype
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# Scale numeric columns, one-hot encode string columns, then fit a
# liblinear logistic regression on the combined features.
model = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                transformers=[
                    (
                        "num",
                        Pipeline(steps=[("scaler", StandardScaler())]),
                        numerical_cols,
                    ),
                    (
                        "cat",
                        Pipeline(
                            steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
                        ),
                        categorical_cols,
                    ),
                ]
            ),
        ),
        ("classifier", LogisticRegression(solver="liblinear")),
    ]
)

# 80/20 train/validation split, then fit on the training part
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Validation AUC from the positive-class probabilities
auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(f"Validation AUC: {auc}")

# Positive-class probabilities for the test set, written as the submission
submission = pd.DataFrame(
    {
        "EmployeeNumber": test_data["id"],
        "Attrition": model.predict_proba(X_test)[:, 1],
    }
)
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e5.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import cohen_kappa_score
5
+ import lightgbm as lgb
6
+
7
def _to_quality_label(raw_predictions):
    """Round regression outputs to the nearest integer quality label."""
    return np.round(raw_predictions).astype(int)


# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "quality" is the target; "Id" is only an identifier
y = train_data["quality"]
X = train_data.drop(columns=["Id", "quality"])
X_test = test_data.drop(columns="Id")

# 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a LightGBM regressor and turn its outputs into integer labels
model = lgb.LGBMRegressor(random_state=42)
model.fit(X_train, y_train)
val_predictions = _to_quality_label(model.predict(X_val))

# Agreement with the true labels, measured by quadratic weighted kappa
kappa_score = cohen_kappa_score(y_val, val_predictions, weights="quadratic")
print(f"Quadratic Weighted Kappa score on validation set: {kappa_score}")

# Same rounding for the test predictions, then write the submission
test_predictions = _to_quality_label(model.predict(X_test))
submission = pd.DataFrame({"Id": test_data["Id"], "quality": test_predictions})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e7.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.metrics import roc_auc_score
5
+
6
# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "booking_status" is the target; "id" is only an identifier
y = train_data["booking_status"]
X = train_data.drop(columns=["id", "booking_status"])
X_test = test_data.drop(columns="id")

# 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic regression fitted on the training split
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Validation ROC AUC from the positive-class probabilities
val_roc_auc = roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
print(f"Validation ROC AUC: {val_roc_auc}")

# Refit on all labelled rows before scoring the test set
model.fit(X, y)
submission = pd.DataFrame(
    {"id": test_data["id"], "booking_status": model.predict_proba(X_test)[:, 1]}
)
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s3e9.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import GradientBoostingRegressor
4
+ from sklearn.metrics import mean_squared_error
5
+ import numpy as np
6
+
7
# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "Strength" is the regression target; "id" is only an identifier
y = train_data["Strength"]
X = train_data.drop(columns=["id", "Strength"])
X_test = test_data.drop(columns="id")

# 80/20 train/validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting regressor fitted on the training split
model = GradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Root mean squared error on the held-out split
validation_mse = mean_squared_error(y_val, model.predict(X_val))
rmse = np.sqrt(validation_mse)
print(f"Validation RMSE: {rmse}")

# Predict the test set and write the submission
submission = pd.DataFrame({"id": test_data["id"], "Strength": model.predict(X_test)})
submission.to_csv("./working/submission.csv", index=False)
examples/playground-series-s4e1.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
4
+ from sklearn.compose import ColumnTransformer
5
+ from sklearn.pipeline import Pipeline
6
+ from sklearn.ensemble import GradientBoostingClassifier
7
+ from sklearn.metrics import roc_auc_score
8
+
9
# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Identifier-like columns that are excluded from the features
id_like_columns = ["id", "CustomerId", "Surname"]

# "Exited" is the target
y = train_data["Exited"]
X = train_data.drop(columns=["Exited"] + id_like_columns)
X_test = test_data.drop(columns=id_like_columns)

# Standardise non-string columns and one-hot encode string columns
numeric_columns = X.select_dtypes(exclude=["object"]).columns
string_columns = X.select_dtypes(include=["object"]).columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_columns),
        ("cat", OneHotEncoder(handle_unknown="ignore"), string_columns),
    ]
)

# Preprocessing and gradient boosting classifier in one pipeline
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("model", GradientBoostingClassifier())]
)

# 80/20 train/validation split, then fit on the training part
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=0
)
clf.fit(X_train, y_train)

# Validation ROC AUC from the positive-class probabilities
score = roc_auc_score(y_valid, clf.predict_proba(X_valid)[:, 1])
print(f"ROC AUC score: {score}")

# Positive-class probabilities for the test set, written as the submission
output = pd.DataFrame(
    {"id": test_data.id, "Exited": clf.predict_proba(X_test)[:, 1]}
)
output.to_csv("./working/submission.csv", index=False)
examples/playground-series-s4e2.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.metrics import accuracy_score
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.pipeline import Pipeline
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "NObeyesdad" is the target; "id" is only an identifier
y = train_data["NObeyesdad"]
X = train_data.drop(columns=["NObeyesdad", "id"])
X_test = test_data.drop(columns="id")

# Group the feature columns by dtype in a single pass over the columns
categorical_cols = []
numerical_cols = []
for cname in X.columns:
    if X[cname].dtype == "object":
        categorical_cols.append(cname)
    elif X[cname].dtype in ["int64", "float64"]:
        numerical_cols.append(cname)

# Standardise numeric columns, one-hot encode string columns
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ]
)

# Preprocessing and random forest in one pipeline
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(n_estimators=100, random_state=0)),
    ]
)

# 80/20 train/validation split, then fit on the training part
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
clf.fit(X_train, y_train)

# Accuracy of the predicted labels on the held-out split
score = accuracy_score(y_valid, clf.predict(X_valid))
print("Accuracy:", score)

# Predicted labels for the test set, written as the submission
output = pd.DataFrame({"id": test_data.id, "NObeyesdad": clf.predict(X_test)})
output.to_csv("./working/submission.csv", index=False)
examples/santa-2019-revenge-of-the-accountants.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from math import exp
4
+
5
# Input tables: one row per family, plus the starting assignment
family_data = pd.read_csv("./input/family_data.csv")
sample_submission = pd.read_csv("./input/sample_submission.csv")

# Problem dimensions
N_DAYS = 100
N_FAMILY = family_data.shape[0]

# Allowed number of people per day (inclusive bounds)
MIN_OCCUPANCY = 125
MAX_OCCUPANCY = 300
14
+
15
+
16
+ # Cost function components
17
def preference_cost_matrix(family_data):
    """Build the table of preference costs for every (family, day) pair.

    Args:
        family_data: DataFrame with an ``n_people`` column and the ten
            ``choice_0`` .. ``choice_9`` day columns, one row per family.

    Returns:
        An ``int64`` array of shape ``(len(family_data), N_DAYS + 1)`` where
        entry ``[i, d]`` is the cost of assigning family ``i`` to day ``d``.
        Column 0 does not correspond to a listed choice and holds the same
        "none of the choices" cost that the original code stored there.
    """
    n_people = family_data["n_people"].to_numpy(dtype=np.int64)
    rows = np.arange(len(family_data))

    # (fixed part, per-person part) of the cost for choice_0 .. choice_9;
    # the last entry is 36 + 199 per person.
    choice_costs = [
        (0, 0),
        (50, 0),
        (50, 9),
        (100, 9),
        (200, 9),
        (200, 18),
        (300, 18),
        (300, 36),
        (400, 36),
        (500, 36 + 199),
    ]

    # Every day starts at the "not one of the choices" cost. Previously only
    # column 0 received it, so any non-preferred day was free (cost 0).
    cost_matrix = np.empty((len(family_data), N_DAYS + 1), dtype=np.int64)
    cost_matrix[:] = (500 + (36 + 398) * n_people)[:, None]

    # Overwrite the chosen days in the same order as before (choice_0 first).
    for rank, (fixed, per_person) in enumerate(choice_costs):
        days = family_data[f"choice_{rank}"].to_numpy()
        cost_matrix[rows, days] = fixed + per_person * n_people
    return cost_matrix
46
+
47
+
48
def accounting_penalty(occupancy):
    """Return the total accounting penalty for a daily occupancy vector.

    Args:
        occupancy: array-like of length ``N_DAYS + 1``; entries ``1..N_DAYS``
            are the number of people on each day. Entry 0 is ignored here.

    Returns:
        The sum over days ``1..N_DAYS`` of
        ``max(0, (N_d - 125) / 400 * N_d ** (0.5 + |N_d - N_{d+1}| / 50))``,
        where day ``N_DAYS`` is compared with itself because it has no
        following day.
    """
    # NOTE(review): this is the single-day-lookahead formula that the original
    # code used -- confirm it matches the scoring rule of the target competition.
    occ = np.asarray(occupancy, dtype=np.float64)
    today = occ[1 : N_DAYS + 1]
    # Occupancy of day d + 1 for d = 1..N_DAYS; the last day is paired with
    # itself. The previous loop ran over indices 0..99, which skipped day
    # N_DAYS entirely and added a spurious term for index 0.
    following = np.append(occ[2 : N_DAYS + 1], occ[N_DAYS])
    terms = (today - 125) / 400 * today ** (0.5 + np.abs(today - following) / 50)
    return np.maximum(terms, 0).sum()
55
+
56
+
57
+ # Simulated annealing
58
def simulated_annealing(family_data, sample_submission, cost_matrix):
    """Improve a day assignment by simulated annealing over single-family moves.

    Each step moves one random family to one random day. Moves that push any
    day's occupancy outside ``[MIN_OCCUPANCY, MAX_OCCUPANCY]`` are rejected
    outright; otherwise the move is accepted if it lowers the score, or with
    probability ``exp(-delta / temperature)`` if it does not.

    Args:
        family_data: DataFrame with an ``n_people`` column, one row per family.
        sample_submission: DataFrame whose ``assigned_day`` column is the
            starting assignment. It is not modified.
        cost_matrix: preference costs indexed as ``[family, day]``.

    Returns:
        ``(best, best_score)``: the lowest-scoring assignment seen during the
        run and its score.
    """
    n_people = family_data["n_people"].to_numpy()
    family_ids = np.arange(N_FAMILY)
    candidate_days = np.arange(1, N_DAYS + 1)

    # Work on a copy: the original mutated the array backing the caller's
    # DataFrame, which also fails when that array is read-only.
    current = sample_submission["assigned_day"].to_numpy().copy()

    occupancy = np.zeros(N_DAYS + 1, dtype=int)
    for day, size in zip(current, n_people):
        occupancy[day] += size
    occupancy[0] = occupancy[N_DAYS]  # Occupancy for the "zeroth" day

    current_pref = cost_matrix[family_ids, current].sum()
    current_score = current_pref + accounting_penalty(occupancy)
    # Accepted moves may be worse than earlier states, so the best state is
    # tracked separately from the current one.
    best, best_score = current.copy(), current_score

    temperature = 1.0
    alpha = 0.99
    for _ in range(10000):
        # Create new candidate solution
        family_id = np.random.choice(family_ids)
        old_day = current[family_id]
        new_day = np.random.choice(candidate_days)

        new_occupancy = occupancy.copy()
        new_occupancy[old_day] -= n_people[family_id]
        new_occupancy[new_day] += n_people[family_id]
        new_occupancy[0] = new_occupancy[N_DAYS]  # Occupancy for the "zeroth" day
        out_of_bounds = (new_occupancy < MIN_OCCUPANCY) | (
            new_occupancy > MAX_OCCUPANCY
        )
        if out_of_bounds.any():
            # Infeasible move: skip it (as before, without cooling down)
            continue

        # Only one family moved, so the preference cost is updated
        # incrementally instead of re-summing over all families.
        new_pref = (
            current_pref
            - cost_matrix[family_id, old_day]
            + cost_matrix[family_id, new_day]
        )
        new_score = new_pref + accounting_penalty(new_occupancy)

        # Acceptance probability
        delta = new_score - current_score
        if delta < 0 or np.random.rand() < exp(-delta / temperature):
            current[family_id] = new_day
            occupancy = new_occupancy
            current_pref, current_score = new_pref, new_score
            if current_score < best_score:
                best, best_score = current.copy(), current_score

        # Cool down
        temperature *= alpha

    return best, best_score
101
+
102
+
103
# Build the preference costs, then search for a schedule starting from the
# sample submission's assignment
cost_matrix = preference_cost_matrix(family_data)
annealing_result = simulated_annealing(family_data, sample_submission, cost_matrix)
best_schedule, best_score = annealing_result

# Report the score and write the schedule as the submission
print(f"Best score: {best_score}")
sample_submission["assigned_day"] = best_schedule
sample_submission.to_csv("./working/submission.csv", index=False)
examples/scrabble-player-rating.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestRegressor
3
+ from sklearn.model_selection import train_test_split
4
+ from sklearn.metrics import mean_squared_error
5
+ from math import sqrt
6
+
7
# Load the data
games = pd.read_csv("./input/games.csv")
turns = pd.read_csv("./input/turns.csv")
train = pd.read_csv("./input/train.csv")

# Columns used as model inputs
feature_columns = ["game_duration_seconds", "winner", "points"]

# Total points per game, computed once and reused for train and test
game_points = turns.groupby("game_id").agg({"points": "sum"}).reset_index()


def _add_game_features(rows):
    """Attach the per-game columns and the per-game point totals to *rows*."""
    with_games = pd.merge(rows, games, on="game_id")
    return pd.merge(with_games, game_points, on="game_id")


# Merge the datasets on game_id
merged_data = _add_game_features(train)

# Prepare the features and target variable
X = merged_data[feature_columns]
y = merged_data["rating"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the validation set and calculate the RMSE
y_pred = model.predict(X_val)
rmse = sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")

# Prepare the test set
test = pd.read_csv("./input/test.csv")
test_merged = _add_game_features(test)
X_test = test_merged[feature_columns]

# Predict on the test set. The predictions are stored on the merged frame
# they were computed from: writing them onto `test` only works if the inner
# merges dropped no rows, and raises a length mismatch otherwise.
test_merged["rating"] = model.predict(X_test)

# Save the predictions to a CSV file
test_merged[["game_id", "rating"]].to_csv("./working/submission.csv", index=False)
examples/sentiment-analysis-on-movie-reviews.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.metrics import accuracy_score
6
+
7
def _clean_phrases(phrases):
    """Return *phrases* as strings, with missing values as empty strings."""
    return phrases.fillna("").astype(str)


# Load the training data
train_data = pd.read_csv("./input/train.tsv", sep="\t")

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_data["Phrase"], train_data["Sentiment"], test_size=0.2, random_state=42
)

# Initialize a TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))

# Fit on the training phrases and transform the validation phrases. Missing
# phrases are cleaned the same way as for the test set below; previously they
# became the literal token "nan" here but "" in the test data.
X_train_tfidf = tfidf_vectorizer.fit_transform(_clean_phrases(X_train))
X_val_tfidf = tfidf_vectorizer.transform(_clean_phrases(X_val))

# Initialize and train the Logistic Regression model
logistic_regression_model = LogisticRegression(random_state=42)
logistic_regression_model.fit(X_train_tfidf, y_train)

# Predict the sentiments on the validation set and calculate the accuracy
y_val_pred = logistic_regression_model.predict(X_val_tfidf)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy}")

# Load the test data
test_data = pd.read_csv("./input/test.tsv", sep="\t")

# Preprocess the test data by filling NaN values with an empty string
test_data["Phrase"] = _clean_phrases(test_data["Phrase"])

# Transform the test data using the same vectorizer
X_test_tfidf = tfidf_vectorizer.transform(test_data["Phrase"])

# Predict the sentiments on the test set
test_predictions = logistic_regression_model.predict(X_test_tfidf)

# Prepare the submission file
submission = pd.DataFrame(
    {"PhraseId": test_data["PhraseId"], "Sentiment": test_predictions}
)

# Save the submission file
submission.to_csv("./working/submission.csv", index=False)
examples/spaceship-titanic.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.metrics import accuracy_score
5
+ from sklearn.impute import SimpleImputer
6
+ from sklearn.preprocessing import OneHotEncoder
7
+ from sklearn.compose import ColumnTransformer
8
+ from sklearn.pipeline import Pipeline
9
+
10
# Read the train and test tables
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# "Transported" is the target; every other column is a candidate predictor
y = train_data["Transported"]
X = train_data.drop(columns=["Transported"])

# String columns with fewer than 10 distinct values are treated as categorical
categorical_cols = []
for cname in X.columns:
    if X[cname].nunique() < 10 and X[cname].dtype == "object":
        categorical_cols.append(cname)

# int64/float64 columns are treated as numerical
numerical_cols = []
for cname in X.columns:
    if X[cname].dtype in ["int64", "float64"]:
        numerical_cols.append(cname)

# Numerical columns: median imputation.
# Categorical columns: most-frequent imputation followed by one-hot encoding.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), numerical_cols),
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_cols,
        ),
    ]
)

# Preprocessing and random forest in one pipeline
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(n_estimators=100, random_state=0)),
    ]
)

# 80/20 train/validation split, then fit on the training part
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)
clf.fit(X_train, y_train)

# Accuracy of the predicted labels on the held-out split
score = accuracy_score(y_valid, clf.predict(X_valid))
print("Accuracy:", score)

# Run the test rows through the fitted preprocessor and model in one call
test_preds = clf.predict(test_data)

# Save test predictions to file
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Transported": test_preds})
output.to_csv("./working/submission.csv", index=False)
examples/tabular-playground-series-apr-2021.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from sklearn.model_selection import cross_val_score
4
+ from sklearn.impute import SimpleImputer
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from sklearn.compose import ColumnTransformer
7
+ from sklearn.pipeline import Pipeline
8
+
9
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Features and target. The same non-feature columns are removed from the
# training and test frames so both end up with an identical column set.
non_feature_cols = ["PassengerId", "Name", "Ticket", "Cabin"]
X = train_data.drop(["Survived"] + non_feature_cols, axis=1)
y = train_data["Survived"]

# Route every remaining column through exactly one transformer.
# ColumnTransformer discards any column that is not listed (its default is
# remainder="drop"), so a hard-coded numeric list such as ["Age", "Fare"]
# silently throws away the other numeric features left in X.
categorical_cols = [cname for cname in X.columns if X[cname].dtype == "object"]
numerical_cols = [cname for cname in X.columns if cname not in categorical_cols]

# Preprocessing for numerical data: fill gaps with the column median
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for categorical data: fill gaps with the most frequent level,
# then one-hot encode (levels unseen during fit are encoded as all zeros)
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a pipeline so the imputers and
# encoder are re-fitted inside every cross-validation fold
clf = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Cross-validation scores
scores = cross_val_score(clf, X, y, cv=10, scoring="accuracy")
print(f"Average cross-validation score: {scores.mean():.4f}")

# Refit on the full training set, then predict the test set
clf.fit(X, y)
test_X = test_data.drop(non_feature_cols, axis=1)
test_preds = clf.predict(test_X)

# Save test predictions to file
output = pd.DataFrame({"PassengerId": test_data.PassengerId, "Survived": test_preds})
output.to_csv("./working/submission.csv", index=False)
examples/tabular-playground-series-apr-2022.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.ensemble import RandomForestClassifier
3
+ from sklearn.metrics import roc_auc_score
4
+ from sklearn.model_selection import train_test_split
5
+
6
# Load the data
train_data = pd.read_csv("./input/train.csv")
train_labels = pd.read_csv("./input/train_labels.csv")
test_data = pd.read_csv("./input/test.csv")

# Aggregate features for each sequence: every column is summarised by the
# statistics below, giving one row per sequence id (the group key becomes the
# index of the resulting frame).
agg_funcs = ["mean", "std", "min", "max"]
train_features = train_data.groupby("sequence").agg(agg_funcs)
test_features = test_data.groupby("sequence").agg(agg_funcs)

# Flatten multi-level columns: (column, statistic) -> "column_statistic"
train_features.columns = [
    "_".join(col).strip() for col in train_features.columns.values
]
test_features.columns = ["_".join(col).strip() for col in test_features.columns.values]

# Align the labels with the aggregated features by sequence id instead of
# relying on both tables happening to share the same row order. A sequence
# without a label raises a KeyError here rather than silently mis-pairing rows.
# NOTE(review): assumes train_labels.csv carries a "sequence" column - confirm.
y = train_labels.set_index("sequence")["state"].loc[train_features.index]

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    train_features, y, test_size=0.2, random_state=42
)

# Initialize and train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict probabilities for the validation set (column 1 = second class)
val_probs = rf.predict_proba(X_val)[:, 1]

# Calculate the AUC-ROC score
auc_score = roc_auc_score(y_val, val_probs)
print(f"AUC-ROC score: {auc_score}")

# Predict probabilities for the test set
test_probs = rf.predict_proba(test_features)[:, 1]

# Create the submission file; the sequence ids come from the group-by index
submission = pd.DataFrame({"sequence": test_features.index, "state": test_probs})
submission.to_csv("./working/submission.csv", index=False)
examples/tabular-playground-series-aug-2021.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import GradientBoostingRegressor
4
+ from sklearn.preprocessing import StandardScaler
5
+ from sklearn.metrics import mean_squared_error
6
+ import numpy as np
7
+
8
# Read the competition files
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# Target is the "loss" column; every other column except the row id is a feature
target = train_df["loss"]
features = train_df.drop(columns=["id", "loss"])

# Hold out 20% of the rows for validation
fit_X, holdout_X, fit_y, holdout_y = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Standardise features using statistics learned from the training split only
scaler = StandardScaler()
scaler.fit(fit_X)
fit_X_scaled = scaler.transform(fit_X)
holdout_X_scaled = scaler.transform(holdout_X)

# Train the gradient-boosting regressor
regressor = GradientBoostingRegressor(random_state=42)
regressor.fit(fit_X_scaled, fit_y)

# Score the held-out split with root-mean-squared error
holdout_pred = regressor.predict(holdout_X_scaled)
rmse = np.sqrt(mean_squared_error(holdout_y, holdout_pred))
print(f"Validation RMSE: {rmse}")

# Apply the same scaling to the test features and predict
test_X_scaled = scaler.transform(test_df.drop(columns="id"))
test_pred = regressor.predict(test_X_scaled)

# Write the submission file
submission = pd.DataFrame({"id": test_df["id"], "loss": test_pred})
submission.to_csv("./working/submission.csv", index=False)
examples/tabular-playground-series-aug-2022.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.linear_model import LogisticRegression
4
+ from sklearn.metrics import roc_auc_score
5
+ from sklearn.preprocessing import OneHotEncoder
6
+ from sklearn.impute import SimpleImputer
7
+
8
# Load the data
train_data = pd.read_csv("./input/train.csv")
test_data = pd.read_csv("./input/test.csv")

# Preprocess the data
features = train_data.columns.drop(["id", "failure"])
# Take explicit copies: columns are assigned into these frames below, and
# writing into a frame sliced from another DataFrame triggers pandas'
# chained-assignment warning.
X = train_data[features].copy()
y = train_data["failure"]
X_test = test_data[features].copy()

# Fill missing values with median for numerical columns (medians are learned
# from the training data and reused for the test data)
num_cols = X.select_dtypes(exclude="object").columns
imputer = SimpleImputer(strategy="median")
X[num_cols] = imputer.fit_transform(X[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])

# One-hot encode categorical features. The encoder keeps its default sparse
# output and the result is densified with .toarray(); this avoids the
# `sparse=` / `sparse_output=` keyword, whose name differs between
# scikit-learn releases.
cat_cols = X.select_dtypes(include="object").columns
encoder = OneHotEncoder(handle_unknown="ignore")
encoded_names = encoder.fit(X[cat_cols]).get_feature_names_out(cat_cols)
# Reuse the source index so the concat below lines rows up correctly
X_encoded = pd.DataFrame(
    encoder.transform(X[cat_cols]).toarray(), columns=encoded_names, index=X.index
)
X_test_encoded = pd.DataFrame(
    encoder.transform(X_test[cat_cols]).toarray(),
    columns=encoded_names,
    index=X_test.index,
)

# Remove categorical columns (will replace with one-hot encoding)
num_X = X.drop(cat_cols, axis=1)
num_X_test = X_test.drop(cat_cols, axis=1)

# Add one-hot encoded columns to numerical features
X_preprocessed = pd.concat([num_X, X_encoded], axis=1)
X_test_preprocessed = pd.concat([num_X_test, X_test_encoded], axis=1)

# Convert all feature names to strings to avoid TypeError
X_preprocessed.columns = X_preprocessed.columns.astype(str)
X_test_preprocessed.columns = X_test_preprocessed.columns.astype(str)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=0
)

# Train the Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model on the predicted probability of the positive class
val_predictions = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_predictions)
print(f"Validation ROC AUC Score: {val_auc}")

# Predict on test data
test_predictions = model.predict_proba(X_test_preprocessed)[:, 1]

# Save the predictions to a CSV file
output = pd.DataFrame({"id": test_data.id, "failure": test_predictions})
output.to_csv("./working/submission.csv", index=False)
examples/tabular-playground-series-dec-2021.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from sklearn.ensemble import RandomForestClassifier
4
+ from sklearn.metrics import accuracy_score
5
+
6
# Read the competition files
train_df = pd.read_csv("./input/train.csv")
test_df = pd.read_csv("./input/test.csv")

# Target is "Cover_Type"; the row id is not used as a feature
target = train_df["Cover_Type"]
features = train_df.drop(columns=["Id", "Cover_Type"])

# Hold out 20% of the rows for validation
fit_X, holdout_X, fit_y, holdout_y = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Train a random forest with default hyper-parameters
forest = RandomForestClassifier(random_state=42)
forest.fit(fit_X, fit_y)

# Report accuracy on the held-out split
holdout_pred = forest.predict(holdout_X)
accuracy = accuracy_score(holdout_y, holdout_pred)
print(f"Validation Accuracy: {accuracy}")

# Predict the test set using the same feature columns as training
test_features = test_df.drop(columns=["Id"])
test_pred = forest.predict(test_features)

# Write the submission file
submission = pd.DataFrame({"Id": test_df["Id"], "Cover_Type": test_pred})
submission.to_csv("./working/submission.csv", index=False)