In [5]:
import os
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import numpy as np
import pickle
import joblib

From the `prediction_tests.ipynb` file, I found that I want to use the following features:
 - Temperature
 - Altimeter_Pressure
 - Visibility
 - Wind_Speed
 - Precipitation

The dataset is heavily imbalanced, as most flights are on time. I tried altering the weights of the classes in random forest and `SMOTE`. I found that altering the weights produced a higher recall for the minority class. This is my most important metric, so I will use this. 

We will create three models for each airport:
 - Flight Delay Boolean Classification
 - Flight Cancellation Boolean Classification
 - Flight Delay Regression

The first two models will be random forests, and the third will be a linear regression.

The models will then be saved to `joblib`

In [6]:
origins_folder = os.path.join("..", "data", "origins")

models = [
 "Flight Cancellation Boolean Classification",
 "Flight Delay Boolean Classification",
 "Flight Delay Regression",]

seed_value = 0
os.environ['PYTHONHASHSEED'] = str(seed_value)

In [7]:
def weather_cancellation_classification(data_frame_copy, origin_name):
 # Split X and y
 X = data_frame_copy.drop(columns=['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])
 y = data_frame_copy['WeatherCancellation']

 # Split data into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

 # Compute class weights
 class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

 # Train model
 model = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})
 model.fit(X_train, y_train)

 model_path = os.path.join(os.path.splitext(origin_name)[0], "weather_cancellation_classification.pkl")

 joblib.dump(model, model_path)

In [8]:
def weather_delay_classification(data_frame_copy, origin_name):
 # Split X and y
 X = data_frame_copy.drop(columns=['WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])
 y = data_frame_copy['WeatherOrNasDelay']

 # Split data into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

 # Compute class weights
 class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

 # Train model
 model = RandomForestClassifier(class_weight={0: class_weights[0], 1: class_weights[1]})
 model.fit(X_train, y_train)

 model_path = os.path.join(os.path.splitext(origin_name)[0], "weather_delay_classification.pkl")

 joblib.dump(model, model_path)

In [9]:
def weather_delay_regression(data_frame_copy, origin_name):
 # Split X and y
 X = data_frame_copy.drop(columns=['WeatherAndNasDelay', 'WeatherCancellation', 'WeatherOrNasDelay', 'WeatherDelay', 'NASDelay'])
 y = data_frame_copy['WeatherAndNasDelay']

 # Split the data into training and testing sets
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 # Train model
 model = LinearRegression()
 model.fit(X_train, y_train)

 model_path = os.path.join(os.path.splitext(origin_name)[0], "weather_delay_regression.pkl")

 joblib.dump(model, model_path)

In [10]:
def get_data_frame(origin):
 data_frame = pd.read_csv(os.path.join(origins_folder, origin))

 data_frame['WeatherCancellation'] = data_frame['CancellationReason'] == 'Weather'
 data_frame['WeatherOrNasDelay'] = (data_frame['WeatherDelay'] > 0) | (data_frame['NASDelay'] > 0)

 data_frame.drop(columns=['Time','Origin','Dest','Carrier','Cancelled','CancellationReason','CarrierDelay', 'Sea_Level_Pressure','SecurityDelay',
 'LateAircraftDelay', 'Feels_Like_Temperature', 'Wind_Gust', 'Delayed', 'Ice_Accretion_3hr', 'DepDelayMinutes'], inplace=True)
 
 return data_frame

In [11]:
print(get_data_frame("ATL.csv").dtypes)

WeatherDelay float64
NASDelay float64
Temperature float64
Altimeter_Pressure float64
Visibility float64
Wind_Speed float64
Precipitation float64
WeatherCancellation bool
WeatherOrNasDelay bool
dtype: object


In [12]:
total_iterations = len(os.listdir(origins_folder) * 3)
progress_bar = tqdm(total=total_iterations, position=0)

for origin in os.listdir(origins_folder):

 if not os.path.exists(os.path.splitext(origin)[0]):
 os.makedirs(os.path.splitext(origin)[0])
 
 # progress_bar.set_description(f"Origin: {os.path.splitext(origin)[0]} Loading Data...")
 # data_frame = pd.read_csv(os.path.join(origins_folder, origin))
 # progress_bar.update(1)

 # data_frame['WeatherCancellation'] = data_frame['CancellationReason'] == 'Weather'
 # data_frame['WeatherOrNasDelay'] = (data_frame['WeatherDelay'] > 0) | (data_frame['NASDelay'] > 0)

 # data_frame.drop(columns=['Time','Origin','Dest','Carrier','Cancelled','CancellationReason','CarrierDelay', 'Sea_Level_Pressure','SecurityDelay',
 # 'LateAircraftDelay', 'Feels_Like_Temperature', 'Wind_Gust', 'Delayed', 'Ice_Accretion_3hr', 'DepDelayMinutes', 'WeatherDelay', 'NASDelay'], inplace=True)


 if not os.path.exists(os.path.join(os.path.splitext(origin)[0], "weather_cancellation_classification.pkl")):
 progress_bar.set_description(f"Origin: {os.path.splitext(origin)[0]} Flight Cancellation Boolean Classification")
 try:
 weather_cancellation_classification(get_data_frame(origin), origin)
 except:
 print(f"Error in {os.path.splitext(origin)[0]} weather_cancellation_classification")
 progress_bar.update(1)


 if not os.path.exists(os.path.join(os.path.splitext(origin)[0], "weather_delay_classification.pkl")):
 progress_bar.set_description(f"Origin: {os.path.splitext(origin)[0]} Flight Delay Boolean Classification")
 try:
 weather_delay_classification(get_data_frame(origin), origin)
 except:
 print(f"Error in {os.path.splitext(origin)[0]} weather_delay_classification")
 progress_bar.update(1)

 # if not os.path.exists(os.path.join(os.path.splitext(origin)[0], "weather_delay_regression.pkl")):
 try:
 data_frame = get_data_frame(origin)

 data_frame['WeatherDelay'] = data_frame['WeatherDelay'].fillna(0)
 data_frame['NASDelay'] = data_frame['NASDelay'].fillna(0)
 data_frame = data_frame[(data_frame['WeatherDelay'] != 0) & (data_frame['NASDelay'] != 0)]

 data_frame['WeatherAndNasDelay'] = data_frame['WeatherDelay'] + data_frame['NASDelay'] 

 progress_bar.set_description(f"Origin: {os.path.splitext(origin)[0]} Flight Delay Regression")
 weather_delay_regression(data_frame, origin)
 except:
 print(f"Error in {os.path.splitext(origin)[0]} weather_delay_regression")
 progress_bar.update(1)


progress_bar.close()

 data_frame = pd.read_csv(os.path.join(origins_folder, origin))
Origin: TPA Flight Delay Regression: 100%|██████████| 90/90 [00:30<00:00, 2.93it/s]
