{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import numpy as np\n",
    "import pickle\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From the `prediction_tests.ipynb` file, I found that I want to use the following features:\n",
    " - Temperature\n",
    " - Altimeter_Pressure\n",
    " - Visibility\n",
    " - Wind_Speed\n",
    " - Precipitation\n",
    "\n",
    "The dataset is heavily imbalanced, as most flights are on time. I tried two remedies: re-weighting the classes in the random forest, and oversampling with `SMOTE`. Re-weighting the classes produced a higher recall for the minority class. Recall on the minority class is my most important metric, so I will use class weighting.\n",
    "\n",
    "We will create three models for each airport:\n",
    " - Flight Delay Boolean Classification\n",
    " - Flight Cancellation Boolean Classification\n",
    " - Flight Delay Regression\n",
    "\n",
    "The first two models will be random forests, and the third will be a linear regression.\n",
    "\n",
    "The models will then be saved to disk with `joblib`, in one folder per airport."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "origins_folder = os.path.join(\"..\", \"data\", \"origins\")\n",
    "\n",
    "models = [\n",
    "    \"Flight Cancellation Boolean Classification\",\n",
    "    \"Flight Delay Boolean Classification\",\n",
    "    \"Flight Delay Regression\",]\n",
    "\n",
    "# Outcome-derived columns: always removed from the feature matrix so the\n",
    "# models cannot see the answer they are asked to predict.\n",
    "label_columns = ['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay']\n",
    "\n",
    "seed_value = 0\n",
    "# PYTHONHASHSEED is read when an interpreter starts, so setting it here only\n",
    "# affects child processes. The models are seeded explicitly via `random_state`.\n",
    "os.environ['PYTHONHASHSEED'] = str(seed_value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def _train_weighted_forest(data_frame, target_column, model_path):\n",
    "    \"\"\"Fit a class-weighted random forest on one boolean target and save it.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data_frame : pandas.DataFrame\n",
    "        Frame returned by `get_data_frame`; every column in `label_columns`\n",
    "        is dropped from the features.\n",
    "    target_column : str\n",
    "        Name of the boolean column to predict.\n",
    "    model_path : str\n",
    "        Destination of the `joblib` dump.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If the training split contains fewer than two classes, in which case\n",
    "        no balanced weighting (and no useful classifier) exists.\n",
    "    \"\"\"\n",
    "    X = data_frame.drop(columns=label_columns)\n",
    "    y = data_frame[target_column]\n",
    "\n",
    "    # Only the 80% training split is used; the held-out 20% is not evaluated here.\n",
    "    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=seed_value)\n",
    "\n",
    "    classes = np.unique(y_train)\n",
    "    if len(classes) < 2:\n",
    "        raise ValueError(\n",
    "            f\"'{target_column}' has a single class {classes.tolist()} in the training split\"\n",
    "        )\n",
    "    class_weights = compute_class_weight('balanced', classes=classes, y=y_train)\n",
    "\n",
    "    # Seeding the forest makes the saved model reproducible between runs.\n",
    "    model = RandomForestClassifier(\n",
    "        class_weight={0: class_weights[0], 1: class_weights[1]},\n",
    "        random_state=seed_value,\n",
    "    )\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    joblib.dump(model, model_path)\n",
    "\n",
    "\n",
    "def weather_cancellation_classification(data_frame_copy, origin_name):\n",
    "    \"\"\"Train and save the weather-cancellation classifier for one origin.\n",
    "\n",
    "    The model is written to `<origin>/weather_cancellation_classification.pkl`,\n",
    "    where `<origin>` is `origin_name` without its file extension.\n",
    "    \"\"\"\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_cancellation_classification.pkl\")\n",
    "    _train_weighted_forest(data_frame_copy, 'WeatherCancellation', model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_delay_classification(data_frame_copy, origin_name):\n",
    "    \"\"\"Train and save the weather/NAS-delay classifier for one origin.\n",
    "\n",
    "    The model is written to `<origin>/weather_delay_classification.pkl`,\n",
    "    where `<origin>` is `origin_name` without its file extension.\n",
    "    \"\"\"\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_delay_classification.pkl\")\n",
    "    _train_weighted_forest(data_frame_copy, 'WeatherOrNasDelay', model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_delay_regression(data_frame_copy, origin_name):\n",
    "    \"\"\"Train and save the delay-duration linear regression for one origin.\n",
    "\n",
    "    `data_frame_copy` must contain a `WeatherAndNasDelay` column (the target)\n",
    "    in addition to the columns produced by `get_data_frame`. The model is\n",
    "    written to `<origin>/weather_delay_regression.pkl`.\n",
    "    \"\"\"\n",
    "    X = data_frame_copy.drop(columns=['WeatherAndNasDelay'] + label_columns)\n",
    "    y = data_frame_copy['WeatherAndNasDelay']\n",
    "\n",
    "    # NOTE(review): this split uses seed 42 while the classifiers use `seed_value`;\n",
    "    # kept as-is so the fitted model does not change.\n",
    "    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "    model = LinearRegression()\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_delay_regression.pkl\")\n",
    "\n",
    "    joblib.dump(model, model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_frame(origin):\n",
    "    \"\"\"Load one origin CSV and reduce it to weather features plus labels.\n",
    "\n",
    "    Adds two boolean columns, `WeatherCancellation` and `WeatherOrNasDelay`,\n",
    "    then drops every column that is neither a selected feature nor a label.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    origin : str\n",
    "        File name (including extension) inside `origins_folder`.\n",
    "    \"\"\"\n",
    "    # low_memory=False makes pandas infer each column's dtype from the whole\n",
    "    # file, which avoids the mixed-type DtypeWarning some origins trigger.\n",
    "    raw_frame = pd.read_csv(os.path.join(origins_folder, origin), low_memory=False)\n",
    "\n",
    "    labelled_frame = raw_frame.assign(\n",
    "        WeatherCancellation=raw_frame['CancellationReason'] == 'Weather',\n",
    "        WeatherOrNasDelay=(raw_frame['WeatherDelay'] > 0) | (raw_frame['NASDelay'] > 0),\n",
    "    )\n",
    "\n",
    "    return labelled_frame.drop(columns=[\n",
    "        'Time', 'Origin', 'Dest', 'Carrier', 'Cancelled', 'CancellationReason', 'CarrierDelay',\n",
    "        'Sea_Level_Pressure', 'SecurityDelay', 'LateAircraftDelay', 'Feels_Like_Temperature',\n",
    "        'Wind_Gust', 'Delayed', 'Ice_Accretion_3hr', 'DepDelayMinutes',\n",
    "    ])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WeatherDelay float64\n",
      "NASDelay float64\n",
      "Temperature float64\n",
      "Altimeter_Pressure float64\n",
      "Visibility float64\n",
      "Wind_Speed float64\n",
      "Precipitation float64\n",
      "WeatherCancellation bool\n",
      "WeatherOrNasDelay bool\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(get_data_frame(\"ATL.csv\").dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "origin_files = sorted(os.listdir(origins_folder))\n",
    "\n",
    "# Each classifier: (pickle stem, progress-bar label, training function).\n",
    "classification_steps = [\n",
    "    ('weather_cancellation_classification', 'Flight Cancellation Boolean Classification', weather_cancellation_classification),\n",
    "    ('weather_delay_classification', 'Flight Delay Boolean Classification', weather_delay_classification),\n",
    "]\n",
    "\n",
    "# Three models per origin, hence three progress ticks per origin.\n",
    "with tqdm(total=3 * len(origin_files), position=0) as progress_bar:\n",
    "    for origin in origin_files:\n",
    "        origin_code = os.path.splitext(origin)[0]\n",
    "        os.makedirs(origin_code, exist_ok=True)\n",
    "\n",
    "        # Read the CSV once per origin and share it between the three models.\n",
    "        try:\n",
    "            origin_frame = get_data_frame(origin)\n",
    "        except Exception as exc:\n",
    "            print(f\"Error in {origin_code} loading data: {exc!r}\")\n",
    "            progress_bar.update(3)\n",
    "            continue\n",
    "\n",
    "        # Classifiers are only trained when their pickle does not exist yet.\n",
    "        for step_name, label, train in classification_steps:\n",
    "            if not os.path.exists(os.path.join(origin_code, f\"{step_name}.pkl\")):\n",
    "                progress_bar.set_description(f\"Origin: {origin_code} {label}\")\n",
    "                try:\n",
    "                    train(origin_frame, origin)\n",
    "                except Exception as exc:\n",
    "                    # Best effort: report the failure and move on to the next model.\n",
    "                    print(f\"Error in {origin_code} {step_name}: {exc!r}\")\n",
    "            progress_bar.update(1)\n",
    "\n",
    "        # The regression has no existence guard, so it is retrained on every run.\n",
    "        progress_bar.set_description(f\"Origin: {origin_code} Flight Delay Regression\")\n",
    "        try:\n",
    "            delays = origin_frame.assign(\n",
    "                WeatherDelay=origin_frame['WeatherDelay'].fillna(0),\n",
    "                NASDelay=origin_frame['NASDelay'].fillna(0),\n",
    "            )\n",
    "            # NOTE(review): `&` keeps only flights with BOTH a weather and a NAS delay,\n",
    "            # whereas the classifier label uses `|`; confirm this is intended.\n",
    "            delays = delays[(delays['WeatherDelay'] != 0) & (delays['NASDelay'] != 0)]\n",
    "            delays = delays.assign(WeatherAndNasDelay=delays['WeatherDelay'] + delays['NASDelay'])\n",
    "\n",
    "            weather_delay_regression(delays, origin)\n",
    "        except Exception as exc:\n",
    "            print(f\"Error in {origin_code} weather_delay_regression: {exc!r}\")\n",
    "        progress_bar.update(1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}