Spaces:

Jaspann
/

FlightSure

Sleeping

File size: 10,721 Bytes

ef02bce

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from tqdm import tqdm\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.linear_model import LinearRegression\n",
    "import numpy as np\n",
    "import pickle\n",
    "import joblib"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "From the `prediction_tests.ipynb` file, I found that I want to use the following features:\n",
    " - Temperature\n",
    " - Altimeter_Pressure\n",
    " - Visibility\n",
    " - Wind_Speed\n",
    " - Precipitation\n",
    "\n",
    "The dataset is heavily imbalanced, as most flights are on time. I tried altering the weights of the classes in random forest and `SMOTE`. I found that altering the weights produced a higher recall for the minority class. This is my most important metric, so I will use this. \n",
    "\n",
    "We will create three models for each airport:\n",
    " -  Flight Delay Boolean Classification\n",
    " -  Flight Cancellation Boolean Classification\n",
    " -  Flight Delay Regression\n",
    "\n",
    "The first two models will be random forests, and the third will be a linear regression.\n",
    "\n",
    "The models will then be saved to `joblib`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "origins_folder = os.path.join(\"..\", \"data\", \"origins\")\n",
    "\n",
    "models = [\n",
    "    \"Flight Cancellation Boolean Classification\",\n",
    "    \"Flight Delay Boolean Classification\",\n",
    "    \"Flight Delay Regression\",]\n",
    "\n",
    "seed_value = 0\n",
    "os.environ['PYTHONHASHSEED'] = str(seed_value)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_cancellation_classification(data_frame_copy, origin_name):\n",
    "    # Split X and y\n",
    "    X = data_frame_copy.drop(columns=['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])\n",
    "    y = data_frame_copy['WeatherCancellation']\n",
    "\n",
    "    # Split data into training and testing sets\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
    "\n",
    "    # Compute class weights\n",
    "    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)\n",
    "\n",
    "    # Train model\n",
    "    model = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_cancellation_classification.pkl\")\n",
    "\n",
    "    joblib.dump(model, model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_delay_classification(data_frame_copy, origin_name):\n",
    "    # Split X and y\n",
    "    X = data_frame_copy.drop(columns=['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])\n",
    "    y = data_frame_copy['WeatherOrNasDelay']\n",
    "\n",
    "    # Split data into training and testing sets\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
    "\n",
    "    # Compute class weights\n",
    "    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)\n",
    "\n",
    "    # Train model\n",
    "    model = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_delay_classification.pkl\")\n",
    "\n",
    "    joblib.dump(model, model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def weather_delay_regression(data_frame_copy, origin_name):\n",
    "    # Split X and y\n",
    "    X = data_frame_copy.drop(columns=['WeatherAndNasDelay', 'WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])\n",
    "    y = data_frame_copy['WeatherAndNasDelay']\n",
    "\n",
    "    # Split the data into training and testing sets\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "    # Train model\n",
    "    model = LinearRegression()\n",
    "    model.fit(X_train, y_train)\n",
    "\n",
    "    model_path = os.path.join(os.path.splitext(origin_name)[0], \"weather_delay_regression.pkl\")\n",
    "\n",
    "    joblib.dump(model, model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_frame(origin):\n",
    "    data_frame = pd.read_csv(os.path.join(origins_folder, origin))\n",
    "\n",
    "    data_frame['WeatherCancellation'] = data_frame['CancellationReason'] == 'Weather'\n",
    "    data_frame['WeatherOrNasDelay'] = (data_frame['WeatherDelay'] > 0) | (data_frame['NASDelay'] > 0)\n",
    "\n",
    "    data_frame.drop(columns=['Time','Origin','Dest','Carrier','Cancelled','CancellationReason','CarrierDelay', 'Sea_Level_Pressure','SecurityDelay',\n",
    "                   'LateAircraftDelay', 'Feels_Like_Temperature', 'Wind_Gust', 'Delayed', 'Ice_Accretion_3hr', 'DepDelayMinutes'], inplace=True)\n",
    "    \n",
    "    return data_frame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WeatherDelay           float64\n",
      "NASDelay               float64\n",
      "Temperature            float64\n",
      "Altimeter_Pressure     float64\n",
      "Visibility             float64\n",
      "Wind_Speed             float64\n",
      "Precipitation          float64\n",
      "WeatherCancellation       bool\n",
      "WeatherOrNasDelay         bool\n",
      "dtype: object\n"
     ]
    }
   ],
   "source": [
    "print(get_data_frame(\"ATL.csv\").dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Origin: JFK Flight Delay Regression:  50%|█████     | 45/90 [00:16<00:13,  3.38it/s]C:\\Users\\wipar\\AppData\\Local\\Temp\\ipykernel_3440\\2046165627.py:2: DtypeWarning: Columns (5) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data_frame = pd.read_csv(os.path.join(origins_folder, origin))\n",
      "Origin: TPA Flight Delay Regression: 100%|██████████| 90/90 [00:30<00:00,  2.93it/s]\n"
     ]
    }
   ],
   "source": [
    "total_iterations = len(os.listdir(origins_folder) * 3)\n",
    "progress_bar = tqdm(total=total_iterations, position=0)\n",
    "\n",
    "for origin in os.listdir(origins_folder):\n",
    "\n",
    "    if not os.path.exists(os.path.splitext(origin)[0]):\n",
    "        os.makedirs(os.path.splitext(origin)[0])\n",
    "    \n",
    "    # progress_bar.set_description(f\"Origin: {os.path.splitext(origin)[0]} Loading Data...\")\n",
    "    # data_frame = pd.read_csv(os.path.join(origins_folder, origin))\n",
    "    # progress_bar.update(1)\n",
    "\n",
    "    # data_frame['WeatherCancellation'] = data_frame['CancellationReason'] == 'Weather'\n",
    "    # data_frame['WeatherOrNasDelay'] = (data_frame['WeatherDelay'] > 0) | (data_frame['NASDelay'] > 0)\n",
    "\n",
    "    # data_frame.drop(columns=['Time','Origin','Dest','Carrier','Cancelled','CancellationReason','CarrierDelay', 'Sea_Level_Pressure','SecurityDelay',\n",
    "    #                'LateAircraftDelay', 'Feels_Like_Temperature', 'Wind_Gust', 'Delayed', 'Ice_Accretion_3hr', 'DepDelayMinutes', 'WeatherDelay', 'NASDelay'], inplace=True)\n",
    "\n",
    "\n",
    "    if not os.path.exists(os.path.join(os.path.splitext(origin)[0], \"weather_cancellation_classification.pkl\")):\n",
    "        progress_bar.set_description(f\"Origin: {os.path.splitext(origin)[0]} Flight Cancellation Boolean Classification\")\n",
    "        try:\n",
    "            weather_cancellation_classification(get_data_frame(origin), origin)\n",
    "        except:\n",
    "            print(f\"Error in {os.path.splitext(origin)[0]} weather_cancellation_classification\")\n",
    "    progress_bar.update(1)\n",
    "\n",
    "\n",
    "    if not os.path.exists(os.path.join(os.path.splitext(origin)[0], \"weather_delay_classification.pkl\")):\n",
    "        progress_bar.set_description(f\"Origin: {os.path.splitext(origin)[0]} Flight Delay Boolean Classification\")\n",
    "        try:\n",
    "            weather_delay_classification(get_data_frame(origin), origin)\n",
    "        except:\n",
    "            print(f\"Error in {os.path.splitext(origin)[0]} weather_delay_classification\")\n",
    "    progress_bar.update(1)\n",
    "\n",
    "    # if not os.path.exists(os.path.join(os.path.splitext(origin)[0], \"weather_delay_regression.pkl\")):\n",
    "    try:\n",
    "        data_frame = get_data_frame(origin)\n",
    "\n",
    "        data_frame['WeatherDelay'] = data_frame['WeatherDelay'].fillna(0)\n",
    "        data_frame['NASDelay'] = data_frame['NASDelay'].fillna(0)\n",
    "        data_frame = data_frame[(data_frame['WeatherDelay'] != 0) & (data_frame['NASDelay'] != 0)]\n",
    "\n",
    "        data_frame['WeatherAndNasDelay'] = data_frame['WeatherDelay'] + data_frame['NASDelay'] \n",
    "\n",
    "        progress_bar.set_description(f\"Origin: {os.path.splitext(origin)[0]} Flight Delay Regression\")\n",
    "        weather_delay_regression(data_frame, origin)\n",
    "    except:\n",
    "        print(f\"Error in {os.path.splitext(origin)[0]} weather_delay_regression\")\n",
    "    progress_bar.update(1)\n",
    "\n",
    "\n",
    "progress_bar.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}