|
import streamlit as st |
|
from data_utils import * |
|
import xarray as xr |
|
import numpy as np |
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import pickle |
|
import glob, os |
|
import re |
|
import tensorflow as tf |
|
import netCDF4 |
|
import copy |
|
import string |
|
import h5py |
|
from tqdm import tqdm |
|
|
|
|
|
# Page banner: notebook title, repository link, and the Step 1 import snippet.
st.title('A _Quickstart Notebook_ for :blue[ClimSim]:')
st.link_button(
    "Go to ClimSim Github Repository",
    "https://github.com/leap-stc/ClimSim/tree/main",
    use_container_width=True,
)
st.header('**Step 1:** Import data_utils')
# Display-only snippet shown to the reader.
step1_snippet = '''from data_utils import *'''
st.code(step1_snippet, language='python')
|
|
|
|
|
|
|
# --- Step 2 UI: links to the grid/normalization source files plus the
# instantiation snippet shown to the user. The same statements are executed
# for real further down in this script.
st.header('**Step 2:** Instantiate class')

st.link_button("Go to original grid_info", "https://github.com/leap-stc/ClimSim/tree/main/grid_info",use_container_width=True)

st.link_button("Go to original input_mean input_max input_min output_scale", "https://github.com/leap-stc/ClimSim/tree/main/preprocessing/normalizations",use_container_width=True)

# Display-only snippet (not executed here).
st.code('''#Change the path to your own

grid_info = xr.open_dataset('ClimSim_low-res_grid-info.nc')

input_mean = xr.open_dataset('input_mean.nc')

input_max = xr.open_dataset('input_max.nc')

input_min = xr.open_dataset('input_min.nc')

output_scale = xr.open_dataset('output_scale.nc')



data = data_utils(grid_info = grid_info,

                  input_mean = input_mean,

                  input_max = input_max,

                  input_min = input_min,

                  output_scale = output_scale)



# set variables to V1 subset

data.set_to_v1_vars()''',language='python')
|
|
|
# Execute Step 2 for real: open the grid/normalization datasets and build the
# data_utils helper object used by the rest of the script.
# NOTE(review): paths are relative to the working directory — confirm the
# .nc files are deployed alongside this app.
grid_info = xr.open_dataset('ClimSim_low-res_grid-info.nc')
input_mean = xr.open_dataset('input_mean.nc')
input_max = xr.open_dataset('input_max.nc')
input_min = xr.open_dataset('input_min.nc')
output_scale = xr.open_dataset('output_scale.nc')

data = data_utils(
    grid_info=grid_info,
    input_mean=input_mean,
    input_max=input_max,
    input_min=input_min,
    output_scale=output_scale,
)

# Restrict inputs/outputs to the V1 variable subset.
data.set_to_v1_vars()
|
|
|
|
|
|
|
# --- Step 3 UI: dataset link plus the load commands shown to the user;
# the same statements are executed for real below. ---
st.header('**Step 3:** Load training and validation data')

st.link_button("Go to Original Dataset", "https://huggingface.co/datasets/LEAP/subsampled_low_res/tree/main",use_container_width=True)

# Display-only snippet (not executed here).
st.code('''data.input_train = data.load_npy_file('train_input_small.npy')

data.target_train = data.load_npy_file('train_target_small.npy')

data.input_val = data.load_npy_file('val_input_small.npy')

data.target_val = data.load_npy_file('val_target_small.npy')''',language='python')
|
|
|
# Execute Step 3: attach the pre-subsampled training/validation arrays to
# the data_utils object, one attribute per .npy file.
_npy_files = (
    ('input_train', 'train_input_small.npy'),
    ('target_train', 'train_target_small.npy'),
    ('input_val', 'val_input_small.npy'),
    ('target_val', 'val_target_small.npy'),
)
for _attr, _fname in _npy_files:
    setattr(data, _attr, data.load_npy_file(_fname))
|
|
|
|
|
|
|
# --- Step 4: baseline models. The constant model predicts the training-set
# mean of each output, i.e. y_hat = E[y_train]. ---
st.header('**Step 4:** Train models')
st.subheader('Train constant prediction model')
st.latex(r'''\hat{y}=E[y_{train}]''')
const_snippet = '''const_model = data.target_train.mean(axis = 0)'''
st.code(const_snippet, language='python')

# Per-output mean over all training samples (axis 0 = samples).
const_model = data.target_train.mean(axis=0)
|
|
|
|
|
|
|
# --- Multiple linear regression UI: the closed-form normal-equation formula
# and the bias-augmentation snippet shown to the user (executed below). ---
st.subheader('Train multiple linear regression model')

st.latex(r'''\beta=(X^{T}_{train} X_{train})^{-1} X^{T}_{train} y_{train} \\

\hat{y}=X^{T}_{input} \beta \\

\text{where } X_{train} \text{ and } X_{input} \text{ correspond to the training data and the input data you would like to inference on, respectively.} \\

X_{train} \text{ and } X_{input} \text{ both have a column of ones concatenated to the feature space for the bias.}''')

st.text('adding bias unit')

# Display-only snippet (not executed here).
st.code('''X = data.input_train

bias_vector = np.ones((X.shape[0], 1))

X = np.concatenate((X, bias_vector), axis=1)''',language='python')
|
|
|
# Execute: augment the training inputs with a trailing column of ones so the
# regression learns a bias term. hstack along axis 1 is equivalent to the
# concatenate(axis=1) form shown in the snippet.
X = data.input_train
X = np.hstack((X, np.ones((X.shape[0], 1))))
|
|
|
|
|
|
|
st.text('create model')
mlr_snippet = '''mlr_weights = np.linalg.inv(X.transpose()@X)@X.transpose()@data.target_train'''
st.code(mlr_snippet, language='python')

# Closed-form least squares via the normal equations: beta = (X^T X)^{-1} X^T y.
# NOTE(review): np.linalg.lstsq would be more numerically stable, but the
# executed code intentionally mirrors the displayed snippet.
Xt = X.transpose()
mlr_weights = np.linalg.inv(Xt @ X) @ Xt @ data.target_train
|
|
|
|
|
|
|
# Placeholder section where readers drop in their own model-training code.
st.subheader('Train your models here')
placeholder_snippet = '''###

# train your model here

###'''
st.code(placeholder_snippet, language='python')
|
|
|
|
|
# --- Step 5: evaluation on the validation split. The pressure grid is
# required for the dp/g vertical weighting applied later. ---
st.header('**Step 5:** Evaluate on validation data')
st.subheader('Set pressure grid')
pressure_snippet = '''data.set_pressure_grid(data_split = 'val')'''
st.code(pressure_snippet, language='python')

data.set_pressure_grid(data_split='val')
|
|
|
|
|
|
|
# Snippet showing how validation predictions are generated and registered on
# the data_utils object; the same statements are executed below.
st.subheader('Load predictions')

# Display-only snippet (not executed here).
st.code('''# Constant Prediction

const_pred_val = np.repeat(const_model[np.newaxis, :], data.target_val.shape[0], axis = 0)

print(const_pred_val.shape)



# Multiple Linear Regression

X_val = data.input_val

bias_vector_val = np.ones((X_val.shape[0], 1))

X_val = np.concatenate((X_val, bias_vector_val), axis=1)

mlr_pred_val = X_val@mlr_weights

print(mlr_pred_val.shape)



# Load your prediction here



# Load predictions into data_utils object

data.model_names = ['const', 'mlr'] # add names of your models here

preds = [const_pred_val, mlr_pred_val] # add your custom predictions here

data.preds_val = dict(zip(data.model_names, preds))''',language='python')
|
|
|
|
|
# Execute: build validation predictions for both baselines and register them
# on the data_utils object keyed by model name.

# Constant baseline: tile the training-mean row once per validation sample.
const_pred_val = np.repeat(const_model[np.newaxis, :], data.target_val.shape[0], axis=0)
print(const_pred_val.shape)

# MLR baseline: append the bias column, then apply the learned weights.
X_val = data.input_val
X_val = np.hstack((X_val, np.ones((X_val.shape[0], 1))))
mlr_pred_val = X_val @ mlr_weights
print(mlr_pred_val.shape)

# Register predictions; extend both lists to add custom models.
data.model_names = ['const', 'mlr']
preds = [const_pred_val, mlr_pred_val]
data.preds_val = dict(zip(data.model_names, preds))
|
|
|
|
|
|
|
# Describes the weighting applied to both targets and predictions before
# metrics are computed; the snippet is executed for real below.
st.subheader('Weight predictions and target')

st.text('''1.Undo output scaling

2.Weight vertical levels by dp/g

3.Weight horizontal area of each grid cell by a[x]/mean(a[x])

4.Convert units to a common energy unit''')

# Display-only snippet (not executed here).
st.code('''data.reweight_target(data_split = 'val')

data.reweight_preds(data_split = 'val')''',language='python')
|
|
|
# Apply the weighting to the validation targets first, then to every
# registered model's predictions (same order as the displayed snippet).
for _reweight in (data.reweight_target, data.reweight_preds):
    _reweight(data_split='val')
|
|
|
|
|
|
|
# Metric-selection snippet shown to the user; executed for real below.
st.subheader('Set and calculate metrics')

st.code('''data.metrics_names = ['MAE', 'RMSE', 'R2', 'bias']

data.create_metrics_df(data_split = 'val')''',language='python')
|
|
|
# Compute the MAE/RMSE/R2/bias tables for the validation split.
data.metrics_names = 'MAE RMSE R2 bias'.split()
data.create_metrics_df(data_split='val')
|
|
|
|
|
|
|
st.subheader('Create plots')

# Display-only snippet. The `%config` line is an IPython magic carried over
# from the original notebook; it is only rendered here, never executed.
st.code('''# set plotting settings

%config InlineBackend.figure_format = 'retina'

letters = string.ascii_lowercase



# create custom dictionary for plotting

dict_var = data.metrics_var_val

plot_df_byvar = {}

for metric in data.metrics_names:

    plot_df_byvar[metric] = pd.DataFrame([dict_var[model][metric] for model in data.model_names],

                                         index=data.model_names)

    plot_df_byvar[metric] = plot_df_byvar[metric].rename(columns = data.var_short_names).transpose()



# plot figure

fig, axes = plt.subplots(nrows = len(data.metrics_names), sharex = True)

for i in range(len(data.metrics_names)):

    plot_df_byvar[data.metrics_names[i]].plot.bar(

        legend = False,

        ax = axes[i])

    if data.metrics_names[i] != 'R2':

        axes[i].set_ylabel('$W/m^2$')

    else:

        axes[i].set_ylim(0,1)



    axes[i].set_title(f'({letters[i]}) {data.metrics_names[i]}')

    axes[i].set_xlabel('Output variable')

    axes[i].set_xticklabels(plot_df_byvar[data.metrics_names[i]].index, \

                            rotation=0, ha='center')



axes[0].legend(columnspacing = .9,

               labelspacing = .3,

               handleheight = .07,

               handlelength = 1.5,

               handletextpad = .2,

               borderpad = .2,

               ncol = 3,

               loc = 'upper right')

fig.set_size_inches(7,8)

fig.tight_layout()''',language='python')
|
|
|
# Execute: one bar-chart panel per metric, labelled (a), (b), ..., sharing
# the output-variable x-axis, then render the figure in the app.
letters = string.ascii_lowercase

# Reshape the nested metrics dict into one DataFrame per metric:
# rows = output variables (short names), columns = models.
dict_var = data.metrics_var_val
plot_df_byvar = {}
for metric in data.metrics_names:
    by_model = pd.DataFrame(
        [dict_var[model][metric] for model in data.model_names],
        index=data.model_names,
    )
    plot_df_byvar[metric] = by_model.rename(columns=data.var_short_names).transpose()

fig, axes = plt.subplots(nrows=len(data.metrics_names), sharex=True)
for i, metric in enumerate(data.metrics_names):
    panel = axes[i]
    plot_df_byvar[metric].plot.bar(legend=False, ax=panel)
    # R2 is dimensionless and bounded in [0, 1]; the others are energy fluxes.
    if metric == 'R2':
        panel.set_ylim(0, 1)
    else:
        panel.set_ylabel('$W/m^2$')
    panel.set_title(f'({letters[i]}) {metric}')
    panel.set_xlabel('Output variable')
    panel.set_xticklabels(plot_df_byvar[metric].index, rotation=0, ha='center')

# Single legend on the top panel only.
axes[0].legend(columnspacing=.9,
               labelspacing=.3,
               handleheight=.07,
               handlelength=1.5,
               handletextpad=.2,
               borderpad=.2,
               ncol=3,
               loc='upper right')
fig.set_size_inches(7, 8)
fig.tight_layout()

st.pyplot(fig)
st.text('If you trained models with different hyperparameters, use the ones that performed the best on validation data for evaluation on scoring data.')
|
|
|
|
|
# --- Step 6: final evaluation on the held-out scoring split. ---
st.header('**Step 6:** Evaluate on scoring data')
# Fixed grammar in the user-facing caption ("finished tuned" -> "finished tuning").
st.subheader('Do this at the VERY END (when you have finished tuning the hyperparameters for your model and are seeking a final evaluation)')
st.subheader('Load scoring data')
# Display-only snippet (not executed here).
st.code('''data.input_scoring = np.load('scoring_input_small.npy')

data.target_scoring = np.load('scoring_target_small.npy')

''',language='python')

# Execute: attach the scoring inputs/targets to the data_utils object.
data.input_scoring = np.load('scoring_input_small.npy')
data.target_scoring = np.load('scoring_target_small.npy')
|
|
|
|
|
|
|
# Pressure grid for the scoring split (needed for the dp/g weighting below).
st.subheader('Set pressure grid')
scoring_pressure_snippet = '''data.set_pressure_grid(data_split = 'scoring')'''
st.code(scoring_pressure_snippet, language='python')

data.set_pressure_grid(data_split='scoring')
|
|
|
|
|
|
|
# Snippet showing how scoring predictions are generated and registered;
# the same statements are executed below.
st.subheader('Load predictions')

# Display-only snippet (not executed here).
st.code('''# constant prediction

const_pred_scoring = np.repeat(const_model[np.newaxis, :], data.target_scoring.shape[0], axis = 0)

print(const_pred_scoring.shape)



# multiple linear regression

X_scoring = data.input_scoring

bias_vector_scoring = np.ones((X_scoring.shape[0], 1))

X_scoring = np.concatenate((X_scoring, bias_vector_scoring), axis=1)

mlr_pred_scoring = X_scoring@mlr_weights

print(mlr_pred_scoring.shape)



# Your model prediction here



# Load predictions into object

data.model_names = ['const', 'mlr'] # model name here

preds = [const_pred_scoring, mlr_pred_scoring] # add prediction here

data.preds_scoring = dict(zip(data.model_names, preds))''',language='python')
|
|
|
# Execute: build scoring predictions for both baselines and register them
# on the data_utils object keyed by model name.

# Constant baseline: tile the training-mean row once per scoring sample.
const_pred_scoring = np.repeat(const_model[np.newaxis, :], data.target_scoring.shape[0], axis=0)
print(const_pred_scoring.shape)

# MLR baseline: append the bias column, then apply the learned weights.
X_scoring = data.input_scoring
X_scoring = np.hstack((X_scoring, np.ones((X_scoring.shape[0], 1))))
mlr_pred_scoring = X_scoring @ mlr_weights
print(mlr_pred_scoring.shape)

# Register predictions; extend both lists to add custom models.
data.model_names = ['const', 'mlr']
preds = [const_pred_scoring, mlr_pred_scoring]
data.preds_scoring = dict(zip(data.model_names, preds))
|
|
|
|
|
# Same weighting steps as Step 5, now described for the scoring split;
# the snippet is executed for real below.
st.subheader('Weight predictions and target')

st.text('''1.Undo output scaling

2.Weight vertical levels by dp/g

3.Weight horizontal area of each grid cell by a[x]/mean(a[x])

4.Convert units to a common energy unit''')

# Display-only snippet (not executed here).
st.code('''# weight predictions and target

data.reweight_target(data_split = 'scoring')

data.reweight_preds(data_split = 'scoring')



# set and calculate metrics

data.metrics_names = ['MAE', 'RMSE', 'R2', 'bias']

data.create_metrics_df(data_split = 'scoring')''',language='python')
|
|
|
|
|
# Apply the weighting to scoring targets then predictions, and compute the
# final MAE/RMSE/R2/bias tables for the scoring split.
for _reweight in (data.reweight_target, data.reweight_preds):
    _reweight(data_split='scoring')

data.metrics_names = 'MAE RMSE R2 bias'.split()
data.create_metrics_df(data_split='scoring')
|
|
|
|
|
|
|
|
|
st.subheader('Create plots')

# Display-only snippet (the `%config` line is a notebook magic, rendered only).
# Fix: pass language='python' so this snippet is syntax-highlighted like every
# other st.code call in this script (it was the only one missing it).
st.code('''# set plotting settings

%config InlineBackend.figure_format = 'retina'

letters = string.ascii_lowercase



# create custom dictionary for plotting

dict_var = data.metrics_var_scoring

plot_df_byvar = {}

for metric in data.metrics_names:

    plot_df_byvar[metric] = pd.DataFrame([dict_var[model][metric] for model in data.model_names],

                                         index=data.model_names)

    plot_df_byvar[metric] = plot_df_byvar[metric].rename(columns = data.var_short_names).transpose()



# plot figure

fig, axes = plt.subplots(nrows = len(data.metrics_names), sharex = True)

for i in range(len(data.metrics_names)):

    plot_df_byvar[data.metrics_names[i]].plot.bar(

        legend = False,

        ax = axes[i])

    if data.metrics_names[i] != 'R2':

        axes[i].set_ylabel('$W/m^2$')

    else:

        axes[i].set_ylim(0,1)



    axes[i].set_title(f'({letters[i]}) {data.metrics_names[i]}')

    axes[i].set_xlabel('Output variable')

    axes[i].set_xticklabels(plot_df_byvar[data.metrics_names[i]].index, \

                            rotation=0, ha='center')



axes[0].legend(columnspacing = .9,

               labelspacing = .3,

               handleheight = .07,

               handlelength = 1.5,

               handletextpad = .2,

               borderpad = .2,

               ncol = 3,

               loc = 'upper right')

fig.set_size_inches(7,8)

fig.tight_layout()''',language='python')
|
|
|
# Execute: same multi-panel bar chart as the validation section, now driven
# by the scoring-split metrics, rendered into the app at the end.
letters = string.ascii_lowercase

# Reshape the nested metrics dict into one DataFrame per metric:
# rows = output variables (short names), columns = models.
dict_var = data.metrics_var_scoring
plot_df_byvar = {}
for metric in data.metrics_names:
    by_model = pd.DataFrame(
        [dict_var[model][metric] for model in data.model_names],
        index=data.model_names,
    )
    plot_df_byvar[metric] = by_model.rename(columns=data.var_short_names).transpose()

fig, axes = plt.subplots(nrows=len(data.metrics_names), sharex=True)
for i, metric in enumerate(data.metrics_names):
    panel = axes[i]
    plot_df_byvar[metric].plot.bar(legend=False, ax=panel)
    # R2 is dimensionless and bounded in [0, 1]; the others are energy fluxes.
    if metric == 'R2':
        panel.set_ylim(0, 1)
    else:
        panel.set_ylabel('$W/m^2$')
    panel.set_title(f'({letters[i]}) {metric}')
    panel.set_xlabel('Output variable')
    panel.set_xticklabels(plot_df_byvar[metric].index, rotation=0, ha='center')

# Single legend on the top panel only.
axes[0].legend(columnspacing=.9,
               labelspacing=.3,
               handleheight=.07,
               handlelength=1.5,
               handletextpad=.2,
               borderpad=.2,
               ncol=3,
               loc='upper right')
fig.set_size_inches(7, 8)
fig.tight_layout()

st.pyplot(fig)
|
|