Comparing Two Climate Models¶
In this notebook, I will be comparing two climate reanalysis models:
- NCEP-DOE Reanalysis 2: Surface
- ERA5
I will be looking at the following variables:
- Surface Pressure
- Mean Sea Level Pressure
- Total Column Water
The idea is simple: these two models should have very similar properties. I will be trying to use RBIG in order to assess how similar these models are. I'll be looking at the following IT measures:
- Entropy
- Total Correlation
- Mutual Information
If these climate models are that similar, then they should exhibit similar IT measures.
Data - Climate Models¶
import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')
import numpy as np
# Data Loaders
from src.data.climate.amip import DataDownloader, DataLoader
from src.data.climate.era5 import get_era5_data
from src.data.climate.ncep import get_ncep_data
from src.features.climate.build_features import (
get_time_overlap, check_time_coords, regrid_2_lower_res, get_spatial_cubes, normalize_data)
from src.experiments.climate.amip_global import (
experiment_loop_comparative,
experiment_loop_individual
)
# Stat Tools
from src.models.train_models import run_rbig_models
from scipy import stats
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Data and output locations (machine-specific absolute paths).
# NOTE: plain string literals — the original `f`-prefixes had no placeholders.
amip_data_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
era5_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/era5/"
ncep_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/ncep/"
# Where experiment result CSVs are written
results_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/results/"
# Where figures are saved
fig_path = "/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/"
Demo Experiment¶
Experimental Params¶
class DataArgs:
    """Filesystem locations for the AMIP experiment."""
    # Raw AMIP climate data
    data_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
    # Where experiment results are written
    results_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip"
class CMIPArgs:
    """Parameter grid for the AMIP global comparison experiments."""
    # Fixed Params
    # Spatial window sizes used when building density cubes
    spatial_windows = [
        1, 2,  # Spatial Window for Density Cubes
        3, 4, 5, 6, 7, 8, 9, 10
    ]
    # Free Params
    variables = [
        'psl'  # mean sea level pressure (CMIP short name)
    ]
    # CMIP5 models to compare against the reanalysis baselines
    cmip_models = [
        "inmcm4",
        "access1_0",
        "bcc_csm1_1",
        "bcc_csm1_1_m",
        "bnu_esm",
        "giss_e2_r",
        "cnrm_cm5",
        "ipsl_cm5a_lr",
        "ipsl_cm5a_mr",
        "ipsl_cm5b_lr",
        "mpi_esm_lr",
        "mpi_esm_mr",
        "noresm1_m",
    ]
    # Reanalysis baseline models
    base_models = [
        'ncep',
        "era5"
    ]
def run_exp():
    """Print every (base model, variable, CMIP model) combination from CMIPArgs."""
    for base_model in CMIPArgs.base_models:
        print(f"Base Model: {base_model}")
        for variable in CMIPArgs.variables:
            print(f"Variable: {variable}")
            for cmip_model in CMIPArgs.cmip_models:
                print(f"CMIP Model: {cmip_model}")
# Load the first baseline reanalysis dataset for the first variable.
from src.data.climate.amip import get_base_model
base_dat = get_base_model(CMIPArgs.base_models[0], CMIPArgs.variables[0])
# Notebook display of the loaded dataset
base_dat
# Load the first CMIP5 model for the same variable.
from src.data.climate.cmip5 import get_cmip5_model
cmip_dat = get_cmip5_model(CMIPArgs.cmip_models[0], CMIPArgs.variables[0])
# Notebook display
cmip_dat
def run_exp():
    """Print each base model / variable / CMIP model combination, one per line."""
    for base_model in CMIPArgs.base_models:
        for variable in CMIPArgs.variables:
            for cmip_model in CMIPArgs.cmip_models:
                print(base_model)
                print(variable)
                print(cmip_model)
Part II - Regrid Data¶
# Regrid both datasets onto the lower of the two spatial resolutions so
# they share the same grid.
base_dat, cmip_dat = regrid_2_lower_res(base_dat, cmip_dat)
# Sanity check: lat/lon dimensions must now match.
assert(base_dat.shape[1] == cmip_dat.shape[1])
assert(base_dat.shape[2] == cmip_dat.shape[2])
# Notebook display
base_dat
Get Features Loop¶
# Helper that loads and prepares the (base, cmip) pair in one call.
from src.experiments.climate.amip_local import get_features_loop
base_dat, cmip_dat = get_features_loop('ncep', 'access1_0', 'psl', 'test')
# Notebook display of the resulting shapes
base_dat.shape, cmip_dat.shape
Generate Temporal Data Loop¶
from typing import Optional
def generate_temporal_data(base_dat, cmip_dat, time: Optional[str] = "month"):
    """Yield paired temporal slices of the two datasets.

    Parameters
    ----------
    base_dat, cmip_dat :
        Datasets with a ``time`` coordinate supporting ``isel`` /
        ``groupby`` / ``expand_dims`` (xarray-style — assumed from usage,
        confirm with callers).
    time : str, optional
        ``"month"`` yields one time step at a time with a singleton time
        dimension restored; ``"year"`` yields year-by-year groups.
        Any other value yields nothing.

    Yields
    ------
    (base_slice, cmip_slice) pairs, one per time step or year.
    """
    if time == "month":
        # Only iterate over the time range both datasets cover.
        time_stamps = min(len(base_dat.time), len(cmip_dat.time))
        for itime in range(time_stamps):
            ibase_dat = base_dat.isel(time=itime)
            icmip_dat = cmip_dat.isel(time=itime)
            # Restore the singleton time dimension dropped by isel.
            ibase_dat = ibase_dat.expand_dims({"time": 1})
            icmip_dat = icmip_dat.expand_dims({"time": 1})
            yield ibase_dat, icmip_dat
    elif time == "year":
        base_dat = base_dat.groupby('time.year')
        cmip_dat = cmip_dat.groupby('time.year')
        # groupby yields (year, group) pairs; keep only the group data.
        for ibase_dat, icmip_dat in zip(base_dat, cmip_dat):
            yield ibase_dat[1], icmip_dat[1]
# Smoke-test the generator: pull one year-pair and inspect its shapes.
for (ibase_dat, icmip_dat) in generate_temporal_data(base_dat, cmip_dat, 'year'):
    print(ibase_dat.shape, icmip_dat.shape)
    break
Part IV - Get Density Cubes¶
# Build density cubes; spatial_windows[3] == 4, i.e. a 4x4 spatial window.
base_df = get_spatial_cubes(ibase_dat, CMIPArgs.spatial_windows[3])
cmip_df = get_spatial_cubes(icmip_dat, CMIPArgs.spatial_windows[3])
# Notebook display of the resulting shapes
base_df.shape, cmip_df.shape
Test Individual Loop¶
# Parameters for a single run of the individual experiment loop.
test_base_model = 'ncep'
test_cmip_model = 'inmcm4'
test_variable = 'psl'
test_spatial_window = 7
subsamples = 1_000
res = experiment_loop_individual(
    test_base_model,
    test_cmip_model,
    test_variable,
    test_spatial_window,
    subsamples
)
# Notebook display of the result
res
Test Comparative Loop¶
# Parameters for a single run of the comparative experiment loop.
test_base_model = 'ncep'
test_cmip_model = 'inmcm4'
test_variable = 'psl'
test_spatial_window = 7
subsamples = 1_000
res = experiment_loop_comparative(
    test_base_model,
    test_cmip_model,
    test_variable,
    test_spatial_window,
    subsamples
)
Part IV - Groupby time stamp¶
# Iterate over the overlapping time steps of the two datasets
# (exploratory cell — breaks after the first step).
time_stamps = min(len(base_dat.time), len(cmip_dat.time))
with tqdm(range(time_stamps)) as progress_bar:
    for itime in progress_bar:
        print(base_dat.isel(time=itime))
        ibase_dat = base_dat.isel(time=itime, drop=False)
        icmip_dat = cmip_dat.isel(time=itime)
        # print(ibase_dat)
        # print(icmip_dat)
        break
Part IV - Get Density Cubes¶
Part V - Normalize¶
# Normalize the density cubes before estimating IT measures.
base_norm = normalize_data(base_df)
cmip_norm = normalize_data(cmip_df)
# RBIG total correlation + entropy on a 1000-sample subset of each model.
base_tc, base_h, t1 = run_rbig_models(base_norm[:1_000], measure='t', verbose=None)
cmip_tc, cmip_h, t2 = run_rbig_models(cmip_norm[:1_000], measure='t', verbose=None)
# Per-dimension values; 16 is presumably the cube dimensionality
# (4x4 spatial window) — TODO confirm.
base_tc/16, cmip_tc/16
base_h/16, cmip_h/16
# Inspect the type of the returned timing/aux value.
type(t1)
Mutual Information¶
# RBIG mutual information between the two normalized 1000-sample subsets.
mi, t_ = run_rbig_models(base_norm[:1_000], cmip_norm[:1_000], measure='mi', verbose=None)
# Notebook display
mi
# Classical correlation measures between the two normalized model samples
# (first 1000 rows, flattened to 1-D).
from scipy import stats
pears = stats.pearsonr(base_norm[:1_000].ravel(), cmip_norm[:1_000].ravel())
spears = stats.spearmanr(base_norm[:1_000].ravel(), cmip_norm[:1_000].ravel())
kend = stats.kendalltau(base_norm[:1_000].ravel(), cmip_norm[:1_000].ravel())
# Display only the statistic from each (statistic, p-value) pair.
pears[0], spears[0], kend[0]
# NOTE(review): removed a dangling `from` fragment (incomplete import left
# over from editing) that was a syntax error.
Experiment I - Comparing Climate Models¶
Mean Sea Level Pressure¶
ERA5 vs NCEP¶
# Experiment class
class ClimateEntropy:
    """Per-year RBIG information measures for a pair of climate models.

    For every overlapping year, computes entropy and total correlation for
    each model individually and (optionally) the mutual information between
    them, appending one row per model per year to ``results_df``. When
    ``save_path`` is given, the accumulated results are written to CSV after
    every update.
    """

    def __init__(self, save_path: Optional[str], variable: str = 'mslp',
                 save_name: Optional[str] = None, mi: bool = True):
        self.variable = variable          # name of the data variable to analyse
        self.results_path = save_path     # CSV output path prefix (None disables saving)
        self.results_df = pd.DataFrame()  # accumulated results, one row per (model, year)
        self.save_name = save_name        # suffix used in the CSV filename
        self.mi = mi                      # whether to compute mutual information

    def run_experiment(self, climate_model1, climate_model2) -> None:
        """Training loop that goes through each year and calculates the entropy,
        total correlation and mutual information between the two models.

        Both arguments are datasets with a ``time`` coordinate, a
        ``model_id`` attribute, and item access by variable name
        (xarray-style — assumed from usage, confirm with callers).
        """
        time_length = len(climate_model1.groupby('time.year'))
        # Normalize BEFORE the individual per-year calculations.
        climate_model1[self.variable] = normalize_temporal(climate_model1[self.variable])
        model1_id = climate_model1.attrs['model_id']
        model2_id = climate_model2.attrs['model_id']
        climate_model2[self.variable] = normalize_temporal(climate_model2[self.variable])
        with tqdm(zip(
            climate_model1.groupby('time.year'),
            climate_model2.groupby('time.year')
        ), total=time_length) as progress_bar:
            # Each element is a (year, group) pair from groupby.
            for imodel1, imodel2 in progress_bar:
                # Transform to dataframe, removing spatial dimensions so
                # that time is the only feature axis.
                X1 = self._get_time_features(imodel1[1][self.variable])
                X2 = self._get_time_features(imodel2[1][self.variable])
                # Standardize inputs (per-feature zero mean, unit variance).
                scaler = preprocessing.StandardScaler()
                X1 = scaler.fit_transform(X1.values)
                X2 = scaler.fit_transform(X2.values)
                dims = X1.shape[1]
                # =============================
                # Calculate Mutual Information
                # =============================
                if self.mi is False:
                    mi_ = None
                    mi_t_ = None
                else:
                    mi_, mi_t_ = run_rbig_models(X1, X2, measure='mi', verbose=None)
                # ========================================
                # Calculate Entropy and Total Correlation
                # ========================================
                # Model I
                tc1_, h1_, h_t1_ = run_rbig_models(X1, measure='t', verbose=None)
                self._update_results(
                    model=model1_id,
                    year=imodel1[0],
                    h_time=h_t1_,
                    tc=tc1_,
                    h=h1_,
                    mi=mi_,
                    mi_time=mi_t_,
                    dims=dims,
                )
                # Model II
                tc2_, h2_, h_t2_ = run_rbig_models(X2, measure='t', verbose=None)
                self._update_results(
                    model=model2_id,
                    year=imodel2[0],
                    h_time=h_t2_,
                    tc=tc2_,
                    h=h2_,
                    mi=mi_,
                    mi_time=mi_t_,
                    dims=dims,
                )
                # Show the latest measures in the progress bar.
                postfix = dict(
                    year=imodel1[0],
                    mi=f"{mi_:.3f}" if self.mi is True else None,
                    h1=f"{h1_:.3f}",
                    tc1=f"{tc1_:.3f}",
                    h2=f"{h2_:.3f}",
                    tc2=f"{tc2_:.3f}",
                )
                progress_bar.set_postfix(postfix)
        return None

    def _get_time_features(self, data_df) -> pd.DataFrame:
        """Collapse the spatial dimensions as pivots so that only time remains
        as the input feature.

        ``data_df`` supports ``.to_dataframe()`` (an xarray DataArray —
        assumed from usage; the original ``pd.DataFrame`` annotation was
        incorrect and has been removed).
        """
        return data_df.to_dataframe().unstack(level=0).reset_index().drop(columns=['lat', 'lon']).dropna()

    def _update_results(self, model, year, tc, h, h_time, mi, mi_time, dims):
        """Append one result row to ``results_df``; saves to CSV when a
        results path is set. Returns ``self``.

        Uses ``pd.concat`` — ``DataFrame.append`` was deprecated in pandas
        1.4 and removed in 2.0.
        """
        row = pd.DataFrame([{
            'model': model,
            'year': year,
            'tc': tc,
            'h': h,
            'h_time': h_time,
            'mi': mi,
            'mi_time': mi_time,
            'dims': dims,
        }])
        self.results_df = pd.concat([self.results_df, row], ignore_index=True)
        if self.results_path is not None:
            self._save_results()
        return self

    def _save_results(self):
        """Save the results dataframe to ``<results_path><variable>_<save_name>.csv``."""
        self.results_df.to_csv(f"{self.results_path}{self.variable}_{self.save_name}.csv")
        return None
# ERA5 vs NCEP: per-year entropy/TC/MI, saved as mslp_era_ncep.csv.
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}",
    variable='mslp',
    save_name='era_ncep'
)
# run experiment (shorter decade)
short_decade_exp.run_experiment(era5_data_regrid, ncep_data)
# extract results
results_df = short_decade_exp.results_df
ERA5 vs CMIP5¶
Time ranges (presumably CMIP5: 2006-01-16 to 2025-12-16; reanalysis: 1979-01-01 to 2019-07-01); the overlapping period 2006-01-16 to 2019-07-01 is used below.
# ERA5 vs CMIP5 over the overlapping period, saved as mslp_era_cmip.csv.
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", variable='mslp', save_name='era_cmip',
    mi=True
)
# run experiment (shorter decade)
short_decade_exp.run_experiment(
    era5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01')),
    cmip5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01'))
)
NCEP vs CMIP5¶
# NCEP vs CMIP5 over the overlapping period, saved as mslp_ncep_cmip.csv.
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", variable='mslp', save_name='ncep_cmip',
    mi=True
)
# run experiment (shorter decade)
short_decade_exp.run_experiment(
    ncep_data.sel(time=slice('2006-01-16', '2019-07-01')),
    cmip5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01'))
)