File size: 10,721 Bytes
ef02bce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import numpy as np\n",
    "import pickle\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Based on the experiments in `prediction_tests.ipynb`, the models use the following features:\n",
    " - Temperature\n",
    " - Altimeter_Pressure\n",
    " - Visibility\n",
    " - Wind_Speed\n",
    " - Precipitation\n",
    "\n",
    "The dataset is heavily imbalanced because most flights are on time. I tried two remedies: altering the class weights of the random forest, and oversampling with `SMOTE`. Altering the class weights produced the higher recall for the minority class. Minority-class recall is my most important metric, so the models below use class weights.\n",
    "\n",
    "We will create three models for each airport:\n",
    " -  Flight Delay Boolean Classification\n",
    " -  Flight Cancellation Boolean Classification\n",
    " -  Flight Delay Regression\n",
    "\n",
    "The first two models will be random forests, and the third will be a linear regression.\n",
    "\n",
    "Each model is then saved to disk with `joblib`, in a folder named after its airport."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed used by the notebook; it is also exported as PYTHONHASHSEED.\n",
    "# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it\n",
    "# here presumably only affects processes started later - confirm the intent.\n",
    "seed_value = 0\n",
    "os.environ['PYTHONHASHSEED'] = f'{seed_value}'\n",
    "\n",
    "# Folder holding one CSV file per origin airport (e.g. ATL.csv).\n",
    "origins_folder = os.path.join(os.pardir, \"data\", \"origins\")\n",
    "\n",
    "# Display names of the three model types.\n",
    "models = [\n",
    "    \"Flight Cancellation Boolean Classification\",\n",
    "    \"Flight Delay Boolean Classification\",\n",
    "    \"Flight Delay Regression\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_cancellation_classification(data_frame_copy, origin_name):\n",
    "    \"\"\"Train and save a weather-cancellation classifier for one origin.\n",
    "\n",
    "    A class-weighted random forest is fitted on an 80% training split to\n",
    "    predict the `WeatherCancellation` column, then written with joblib to\n",
    "    `<origin>/weather_cancellation_classification.pkl`.\n",
    "\n",
    "    Args:\n",
    "        data_frame_copy: Frame holding the feature columns plus the\n",
    "            `WeatherCancellation`, `WeatherOrNasDelay`, `WeatherDelay` and\n",
    "            `NASDelay` columns.\n",
    "        origin_name: File name of the origin (e.g. `ATL.csv`). The name\n",
    "            without its extension is the output folder, which is expected\n",
    "            to exist already.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the training split contains fewer than two classes,\n",
    "            so no weight for the second class can be computed.\n",
    "    \"\"\"\n",
    "    # Label and delay columns are removed; the remaining columns are the features.\n",
    "    X = data_frame_copy.drop(columns=['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])\n",
    "    y = data_frame_copy['WeatherCancellation']\n",
    "\n",
    "    # 80/20 split; the held-out 20% is not used inside this function.\n",
    "    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=0)\n",
    "\n",
    "    # Fail with a clear message instead of an IndexError on class_weights[1].\n",
    "    classes = np.unique(y_train)\n",
    "    if len(classes) < 2:\n",
    "        raise ValueError(f\"{origin_name}: training split contains a single class {classes.tolist()}\")\n",
    "\n",
    "    # 'balanced' weights are inversely proportional to the class frequencies.\n",
    "    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)\n",
    "\n",
    "    # A fixed random_state makes repeated runs build the same forest.\n",
    "    model = RandomForestClassifier(\n",
    "        class_weight={0: class_weights[0], 1: class_weights[1]},\n",
    "        random_state=0,\n",
    "    )\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_cancellation_classification.pkl\")\n",
    "\n",
    "    joblib.dump(model, model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_delay_classification(data_frame_copy, origin_name):\n",
    "    \"\"\"Train and save a weather/NAS-delay classifier for one origin.\n",
    "\n",
    "    A class-weighted random forest is fitted on an 80% training split to\n",
    "    predict the `WeatherOrNasDelay` column, then written with joblib to\n",
    "    `<origin>/weather_delay_classification.pkl`.\n",
    "\n",
    "    Args:\n",
    "        data_frame_copy: Frame holding the feature columns plus the\n",
    "            `WeatherCancellation`, `WeatherOrNasDelay`, `WeatherDelay` and\n",
    "            `NASDelay` columns.\n",
    "        origin_name: File name of the origin (e.g. `ATL.csv`). The name\n",
    "            without its extension is the output folder, which is expected\n",
    "            to exist already.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the training split contains fewer than two classes,\n",
    "            so no weight for the second class can be computed.\n",
    "    \"\"\"\n",
    "    # Label and delay columns are removed; the remaining columns are the features.\n",
    "    X = data_frame_copy.drop(columns=['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])\n",
    "    y = data_frame_copy['WeatherOrNasDelay']\n",
    "\n",
    "    # 80/20 split; the held-out 20% is not used inside this function.\n",
    "    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=0)\n",
    "\n",
    "    # Fail with a clear message instead of an IndexError on class_weights[1].\n",
    "    classes = np.unique(y_train)\n",
    "    if len(classes) < 2:\n",
    "        raise ValueError(f\"{origin_name}: training split contains a single class {classes.tolist()}\")\n",
    "\n",
    "    # 'balanced' weights are inversely proportional to the class frequencies.\n",
    "    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)\n",
    "\n",
    "    # A fixed random_state makes repeated runs build the same forest.\n",
    "    model = RandomForestClassifier(\n",
    "        class_weight={0: class_weights[0], 1: class_weights[1]},\n",
    "        random_state=0,\n",
    "    )\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_delay_classification.pkl\")\n",
    "\n",
    "    joblib.dump(model, model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_delay_regression(data_frame_copy, origin_name):\n",
    "    \"\"\"Fit a linear regression on `WeatherAndNasDelay` and save it.\n",
    "\n",
    "    The target column and the other label/delay columns are dropped; the\n",
    "    columns that remain are the features. The model is trained on the 80%\n",
    "    part of an 80/20 split and written with joblib to\n",
    "    `<origin>/weather_delay_regression.pkl`, where `<origin>` is\n",
    "    `origin_name` without its extension.\n",
    "    \"\"\"\n",
    "    target_column = 'WeatherAndNasDelay'\n",
    "    non_feature_columns = [target_column, 'WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay']\n",
    "\n",
    "    features = data_frame_copy.drop(columns=non_feature_columns)\n",
    "    target = data_frame_copy[target_column]\n",
    "\n",
    "    # Only the training portion of the split is used below.\n",
    "    features_train, _, target_train, _ = train_test_split(\n",
    "        features, target, test_size=0.2, random_state=42\n",
    "    )\n",
    "\n",
    "    # fit() returns the estimator itself, so it can be chained.\n",
    "    regressor = LinearRegression().fit(features_train, target_train)\n",
    "\n",
    "    output_folder = os.path.splitext(origin_name)[0]\n",
    "    joblib.dump(regressor, os.path.join(output_folder, \"weather_delay_regression.pkl\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_frame(origin):\n",
    "    \"\"\"Load one origin's CSV and reduce it to model inputs and labels.\n",
    "\n",
    "    Args:\n",
    "        origin: CSV file name inside `origins_folder` (e.g. `ATL.csv`).\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame with the boolean columns `WeatherCancellation` and\n",
    "        `WeatherOrNasDelay` added and the unused columns removed.\n",
    "    \"\"\"\n",
    "    # low_memory=False infers each column's dtype from the whole file, which\n",
    "    # avoids the mixed-type DtypeWarning pandas reported for some origins.\n",
    "    data_frame = pd.read_csv(os.path.join(origins_folder, origin), low_memory=False)\n",
    "\n",
    "    data_frame['WeatherCancellation'] = data_frame['CancellationReason'] == 'Weather'\n",
    "    # A missing (NaN) delay compares as False, so it counts as no delay.\n",
    "    data_frame['WeatherOrNasDelay'] = (data_frame['WeatherDelay'] > 0) | (data_frame['NASDelay'] > 0)\n",
    "\n",
    "    unused_columns = [\n",
    "        'Time', 'Origin', 'Dest', 'Carrier', 'Cancelled', 'CancellationReason', 'CarrierDelay',\n",
    "        'Sea_Level_Pressure', 'SecurityDelay', 'LateAircraftDelay', 'Feels_Like_Temperature',\n",
    "        'Wind_Gust', 'Delayed', 'Ice_Accretion_3hr', 'DepDelayMinutes',\n",
    "    ]\n",
    "\n",
    "    # Return a new frame instead of mutating in place.\n",
    "    return data_frame.drop(columns=unused_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WeatherDelay           float64\n",
      "NASDelay               float64\n",
      "Temperature            float64\n",
      "Altimeter_Pressure     float64\n",
      "Visibility             float64\n",
      "Wind_Speed             float64\n",
      "Precipitation          float64\n",
      "WeatherCancellation       bool\n",
      "WeatherOrNasDelay         bool\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "# Sanity check: show which columns (and dtypes) remain after preprocessing one origin.\n",
    "print(get_data_frame(origin=\"ATL.csv\").dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Origin: JFK Flight Delay Regression:  50%|█████     | 45/90 [00:16<00:13,  3.38it/s]C:\\Users\\wipar\\AppData\\Local\\Temp\\ipykernel_3440\\2046165627.py:2: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data_frame = pd.read_csv(os.path.join(origins_folder, origin))\n",
      "Origin: TPA Flight Delay Regression: 100%|██████████| 90/90 [00:30<00:00,  2.93it/s]\n"
     ]
    }
   ],
   "source": [
    "# Train and save the three models for every origin CSV in `origins_folder`.\n",
    "# The folder is listed once so the progress total always matches the loop.\n",
    "origin_files = os.listdir(origins_folder)\n",
    "total_iterations = len(origin_files) * 3  # three models per origin\n",
    "\n",
    "# The context manager closes the bar even if the run is interrupted.\n",
    "with tqdm(total=total_iterations, position=0) as progress_bar:\n",
    "    for origin in origin_files:\n",
    "        origin_code = os.path.splitext(origin)[0]\n",
    "\n",
    "        # Each origin gets its own output folder in the working directory.\n",
    "        os.makedirs(origin_code, exist_ok=True)\n",
    "\n",
    "        # Classifiers are only trained when their saved file is missing.\n",
    "        if not os.path.exists(os.path.join(origin_code, \"weather_cancellation_classification.pkl\")):\n",
    "            progress_bar.set_description(f\"Origin: {origin_code} Flight Cancellation Boolean Classification\")\n",
    "            try:\n",
    "                weather_cancellation_classification(get_data_frame(origin), origin)\n",
    "            except Exception as error:\n",
    "                # Report the cause and keep going with the remaining models.\n",
    "                print(f\"Error in {origin_code} weather_cancellation_classification: {error!r}\")\n",
    "        progress_bar.update(1)\n",
    "\n",
    "        if not os.path.exists(os.path.join(origin_code, \"weather_delay_classification.pkl\")):\n",
    "            progress_bar.set_description(f\"Origin: {origin_code} Flight Delay Boolean Classification\")\n",
    "            try:\n",
    "                weather_delay_classification(get_data_frame(origin), origin)\n",
    "            except Exception as error:\n",
    "                print(f\"Error in {origin_code} weather_delay_classification: {error!r}\")\n",
    "        progress_bar.update(1)\n",
    "\n",
    "        # NOTE(review): unlike the classifiers, the regression is retrained on\n",
    "        # every run (no existence check) - confirm this is intended.\n",
    "        progress_bar.set_description(f\"Origin: {origin_code} Flight Delay Regression\")\n",
    "        try:\n",
    "            data_frame = get_data_frame(origin)\n",
    "\n",
    "            data_frame['WeatherDelay'] = data_frame['WeatherDelay'].fillna(0)\n",
    "            data_frame['NASDelay'] = data_frame['NASDelay'].fillna(0)\n",
    "\n",
    "            # NOTE(review): keeps only rows where BOTH delays are non-zero;\n",
    "            # confirm that `&` rather than `|` is intended.\n",
    "            both_delayed = (data_frame['WeatherDelay'] != 0) & (data_frame['NASDelay'] != 0)\n",
    "            # .copy() so the new column is added to an independent frame, not a slice.\n",
    "            data_frame = data_frame[both_delayed].copy()\n",
    "\n",
    "            data_frame['WeatherAndNasDelay'] = data_frame['WeatherDelay'] + data_frame['NASDelay']\n",
    "\n",
    "            weather_delay_regression(data_frame, origin)\n",
    "        except Exception as error:\n",
    "            print(f\"Error in {origin_code} weather_delay_regression: {error!r}\")\n",
    "        progress_bar.update(1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}