Skip to content

Comparing Two Climate Models

In this notebook, I will be comparing two climate reanalysis models:

  • NCEP-DOE Reanalysis 2: Surface
  • ERA5

I will be looking at the following variables:

  • Surface Pressure
  • Mean Sea Level Pressure
  • Total Column Water

The idea is simple: these two models should have very similar properties. I will be trying to user RBIG in order to assess how similar these models are. I'll be looking at the following IT measures

  • Entropy
  • Total Correlation
  • Mutual Information

If these climate models are that similar, then they should exhibit similar IT measures.

Data - Climate Models

import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')

import numpy as np

# Data Loaders
from src.data.climate.amip import DataDownloader, DataLoader

from src.data.climate.era5 import get_era5_data
from src.data.climate.ncep import get_ncep_data
from src.features.climate.build_features import (
    get_time_overlap, check_time_coords, regrid_2_lower_res, get_spatial_cubes, normalize_data)

from src.experiments.climate.amip_global import (
    experiment_loop_comparative, 
    experiment_loop_individual
)
# Stat Tools
from src.models.train_models import run_rbig_models
from scipy import stats

import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
amip_data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
era5_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/era5/"
ncep_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/ncep/"
results_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/"

Demo Experiment

Experimental Paams

class DataArgs:
    data_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
    results_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip"

class CMIPArgs:


    # Fixed Params
    spatial_windows = [
        1, 2,                # Spatial Window for Density Cubes
        3,4,5,6,7,8,9,10
    ]

    # Free Params
    variables = [
        'psl'               # Mean Surface Pressure
    ]
    cmip_models = [
        "inmcm4",
        "access1_0",
        "bcc_csm1_1",
        "bcc_csm1_1_m",
        "bnu_esm",
        "giss_e2_r",
        "cnrm_cm5",
        "ipsl_cm5a_lr",
        "ipsl_cm5a_mr",
        "ipsl_cm5b_lr",
        "mpi_esm_lr",
        "mpi_esm_mr",
        "noresm1_m",
    ]

    base_models = [
        'ncep',
        "era5"
    ]
def run_exp():

    for ibase in CMIPArgs.base_models:
        print('Base Model:', ibase)
        for ivariable in CMIPArgs.variables:
            print('Variable:', ivariable)
            for icmip in CMIPArgs.cmip_models:
                print("CMIP Model:", icmip)
run_exp()
Base Model: ncep
Variable: psl
CMIP Model: inmcm4
CMIP Model: access1_0
CMIP Model: bcc_csm1_1
CMIP Model: bcc_csm1_1_m
CMIP Model: bnu_esm
CMIP Model: giss_e2_r
CMIP Model: cnrm_cm5
CMIP Model: ipsl_cm5a_lr
CMIP Model: ipsl_cm5a_mr
CMIP Model: ipsl_cm5b_lr
CMIP Model: mpi_esm_lr
CMIP Model: mpi_esm_mr
CMIP Model: noresm1_m
Base Model: era5
Variable: psl
CMIP Model: inmcm4
CMIP Model: access1_0
CMIP Model: bcc_csm1_1
CMIP Model: bcc_csm1_1_m
CMIP Model: bnu_esm
CMIP Model: giss_e2_r
CMIP Model: cnrm_cm5
CMIP Model: ipsl_cm5a_lr
CMIP Model: ipsl_cm5a_mr
CMIP Model: ipsl_cm5b_lr
CMIP Model: mpi_esm_lr
CMIP Model: mpi_esm_mr
CMIP Model: noresm1_m

Part I - Grab Data

from src.data.climate.amip import get_base_model

base_dat = get_base_model(CMIPArgs.base_models[0], CMIPArgs.variables[0])
base_dat
<xarray.DataArray 'psl' (time: 489, lat: 73, lon: 144)>
dask.array<open_dataset-0ace0936f02ce97f3d79321b6d5f6a55mslp, shape=(489, 73, 144), dtype=float32, chunksize=(489, 73, 144), chunktype=numpy.ndarray>
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * time     (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2019-09-01
Attributes:
    long_name:             Monthly Mean Sea Level Pressure
    valid_range:           [-32766  15234]
    unpacked_valid_range:  [ 77000. 125000.]
    actual_range:          [ 95644. 105703.]
    units:                 Pascals
    precision:             0
    GRIB_id:               2
    GRIB_name:             PRMSL
    var_desc:              Mean Sea Level Pressure
    dataset:               NCEP/DOE AMIP-II Reanalysis (Reanalysis-2) Monthly...
    level_desc:            Sea Level
    statistic:             Individual Obs
    parent_stat:           Other
    standard_name:         pressure
    cell_methods:          time: mean (monthly from 6-hourly values)
    model_id:              ncep
from src.data.climate.cmip5 import get_cmip5_model


cmip_dat = get_cmip5_model(CMIPArgs.cmip_models[0], CMIPArgs.variables[0])
cmip_dat
<xarray.DataArray 'psl' (time: 360, lat: 120, lon: 180)>
dask.array<open_dataset-0a95b28dbc539d100c816bea9e2db9e7psl, shape=(360, 120, 180), dtype=float32, chunksize=(360, 120, 180), chunktype=numpy.ndarray>
Coordinates:
  * time     (time) object 1979-01-16 12:00:00 ... 2008-12-16 12:00:00
  * lat      (lat) float64 -89.25 -87.75 -86.25 -84.75 ... 86.25 87.75 89.25
  * lon      (lon) float64 0.0 2.0 4.0 6.0 8.0 ... 350.0 352.0 354.0 356.0 358.0
Attributes:
    standard_name:     air_pressure_at_sea_level
    long_name:         Sea Level Pressure
    comment:           not, in general, the same as surface pressure
    units:             Pa
    original_name:     psl
    cell_methods:      time: mean (interval: 1 month)
    history:           2011-02-07T11:53:19Z altered by CMOR: Reordered dimens...
    associated_files:  baseURL: http://cmip-pcmdi.llnl.gov/CMIP5/dataLocation...
    model_id:          inmcm4
def run_exp():

    for ibase in CMIPArgs.base_models:
        for ivariable in CMIPArgs.variables:
            for icmip in CMIPArgs.cmip_models:
                print(ibase)
                print(ivariable)
                print(icmip)

Part II - Regrid Data

base_dat, cmip_dat = regrid_2_lower_res(base_dat, cmip_dat)

assert(base_dat.shape[1] == cmip_dat.shape[1])
assert(base_dat.shape[2] == cmip_dat.shape[2])
base_dat
Create weight file: nearest_s2d_120x180_73x144.nc
Remove file nearest_s2d_120x180_73x144.nc
<xarray.DataArray 'psl' (time: 489, lat: 73, lon: 144)>
dask.array<open_dataset-0ace0936f02ce97f3d79321b6d5f6a55mslp, shape=(489, 73, 144), dtype=float32, chunksize=(489, 73, 144), chunktype=numpy.ndarray>
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * time     (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2019-09-01
Attributes:
    long_name:             Monthly Mean Sea Level Pressure
    valid_range:           [-32766  15234]
    unpacked_valid_range:  [ 77000. 125000.]
    actual_range:          [ 95644. 105703.]
    units:                 Pascals
    precision:             0
    GRIB_id:               2
    GRIB_name:             PRMSL
    var_desc:              Mean Sea Level Pressure
    dataset:               NCEP/DOE AMIP-II Reanalysis (Reanalysis-2) Monthly...
    level_desc:            Sea Level
    statistic:             Individual Obs
    parent_stat:           Other
    standard_name:         pressure
    cell_methods:          time: mean (monthly from 6-hourly values)
    model_id:              ncep

Part III - Find Overlapping Times

base_dat.shape, cmip_dat.shape
((489, 73, 144), (360, 73, 144))
base_dat, cmip_dat = get_time_overlap(base_dat, cmip_dat)

Part IV - Get Density Cubes

base_df = get_spatial_cubes(base_dat, CMIPArgs.spatial_windows[3])
cmip_df = get_spatial_cubes(cmip_dat, CMIPArgs.spatial_windows[3])
base_df.shape
(4826430, 16)

Test Individual Loop

test_base_model = 'ncep'
test_cmip_model = 'inmcm4'
test_variable = 'psl'
test_spatial_window = 7
subsamples = 1_000

res = experiment_loop_individual(
    test_base_model,
    test_cmip_model,
    test_variable,
    test_spatial_window,
    subsamples
)
Create weight file: nearest_s2d_120x180_73x144.nc
Remove file nearest_s2d_120x180_73x144.nc
res
{'h_base': -149.66922579994676,
 'tc_base': 184.67797240499547,
 'h_cmip': -112.08752658960748,
 'tc_cmip': 124.29975263830998,
 't_base': 12.304942846298218,
 't_cmip': 13.265496253967285}

Test Comparative Loop

test_base_model = 'ncep'
test_cmip_model = 'inmcm4'
test_variable = 'psl'
test_spatial_window = 7
subsamples = 1_000

res = experiment_loop_comparative(
    test_base_model,
    test_cmip_model,
    test_variable,
    test_spatial_window,
    subsamples
)
Create weight file: nearest_s2d_120x180_73x144.nc
Remove file nearest_s2d_120x180_73x144.nc
(3319314, 49) (3328560, 49)
res
{'mi': 12.93477259679793,
 'time_mi': 116.64088153839111,
 'pearson': 0.6832584799235359,
 'spearman': 0.6455462181832066,
 'kendelltau': 0.45468733761829255}

Experimental Loop


Part IV - Groupby time stamp

time_stamps = min(len(base_dat.time), len(cmip_dat.time))

with tqdm(range(time_stamps)) as progress_bar:
    for itime in progress_bar:
        ibase_dat = base_dat.isel(time=itime, drop=False)
        icmip_dat = cmip_dat.isel(time=itime)
        print(ibase_dat)
        print(icmip_dat)
        break
  0%|          | 0/359 [00:00<?, ?it/s]
<xarray.DataArray 'psl' (lat: 73, lon: 144)>
dask.array<getitem, shape=(73, 144), dtype=float32, chunksize=(73, 144), chunktype=numpy.ndarray>
Coordinates:
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
    time     datetime64[ns] 1979-02-01
Attributes:
    long_name:             Monthly Mean Sea Level Pressure
    valid_range:           [-32766  15234]
    unpacked_valid_range:  [ 77000. 125000.]
    actual_range:          [ 95644. 105703.]
    units:                 Pascals
    precision:             0
    GRIB_id:               2
    GRIB_name:             PRMSL
    var_desc:              Mean Sea Level Pressure
    dataset:               NCEP/DOE AMIP-II Reanalysis (Reanalysis-2) Monthly...
    level_desc:            Sea Level
    statistic:             Individual Obs
    parent_stat:           Other
    standard_name:         pressure
    cell_methods:          time: mean (monthly from 6-hourly values)
    model_id:              ncep
<xarray.DataArray 'psl' (lat: 73, lon: 144)>
dask.array<getitem, shape=(73, 144), dtype=float64, chunksize=(73, 144), chunktype=numpy.ndarray>
Coordinates:
    time     datetime64[ns] 1979-01-16T12:00:00
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
Attributes:
    regrid_method:  nearest_s2d
    model_id:       inmcm4

Part IV - Get Density Cubes


Part V - Normalize

base_norm = normalize_data(base_df)
cmip_norm = normalize_data(cmip_df)
base_norm[:None].shape
(4826430, 16)

Part VI - RBIG Algorithm

Entropy, Total Correlation

base_tc, base_h, t1 = run_rbig_models(base_norm[:1_000], measure='t', verbose=None)
cmip_tc, cmip_h, t2 = run_rbig_models(cmip_norm[:1_000], measure='t', verbose=None)
base_tc/16, cmip_tc/16
(2.8645279873342417, 2.6572342136429885)
base_h/16, cmip_h/16
(-2.7045988055470094, -2.0434281745463228)
type(t1)
float

Mutual Information

mi, t_ = run_rbig_models(base_norm[:1_000], cmip_norm[:1_000], measure='mi', verbose=None)
mi
0.9339957437091543
from scipy import stats
pears = stats.pearsonr(base_norm[:1_000].ravel(), cmip_norm[:1_000].ravel())
spears = stats.spearmanr(base_norm[:1_000].ravel(), cmip_norm[:1_000].ravel())
kend = stats.kendalltau(base_norm[:1_000].ravel(), cmip_norm[:1_000].ravel())
pears[0], spears[0], kend[0]
(0.19924053221759874, 0.18606468822908886, 0.1297966352157317)
from 

Experiment I - Comparing Climate Models

Mean Sea Level Pressure

ERA5 vs NCEP

# Experiment class
class ClimateEntropy:
    def __init__(self, save_path: None, variable: str='mslp', save_name=None, mi: bool=True):

        self.variable = variable

        self.results_path = save_path
        self.results_df = pd.DataFrame()
        self.save_name = save_name
        self.mi = mi

    def run_experiment(self, climate_model1: pd.DataFrame, climate_model2: pd.DataFrame) -> None:
        """Training loop that goes through each year and calculates the entropy,
        total correlation and mutual information between the two models."""
        time_length = len(climate_model1.groupby('time.year'))

        # Normalize BEFORE the individual calculations
        climate_model1[self.variable] = normalize_temporal(climate_model1[self.variable])

        model1_id = climate_model1.attrs['model_id']
        model2_id = climate_model2.attrs['model_id']
        climate_model2[self.variable] = normalize_temporal(climate_model2[self.variable])

        with tqdm(zip(
            climate_model1.groupby('time.year'), 
            climate_model2.groupby('time.year')
        ), total=time_length) as progress_bar:

            for imodel1, imodel2 in progress_bar:

                # Update params in progress bar

                # Transform to dataframe, remove spatial dimensions
                X1 = self._get_time_features(imodel1[1][self.variable])
                X2 = self._get_time_features(imodel2[1][self.variable])

                # Normalize inputs
                min_max_scaler = preprocessing.StandardScaler()
                X1 = min_max_scaler.fit_transform(X1.values)
                X2 = min_max_scaler.fit_transform(X2.values)

                dims = X1.shape[1]

                # =============================
                # Calculate Mutual Information
                # =============================
                if self.mi == False:
                    mi_ = None
                    mi_t_ = None
                else:
                    mi_, mi_t_ = run_rbig_models(X1, X2, measure='mi', verbose=None)


                # Update params in progress bar
                postfix = dict(

                )

                # ========================================
                # Calculate Entropy and Total Correlation
                # ========================================

                # Model I
                tc1_, h1_, h_t1_ = run_rbig_models(X1, measure='t', verbose=None)

                self._update_results(
                    model=model1_id,
                    year=imodel1[0],
                    h_time=h_t1_,
                    tc=tc1_,
                    h=h1_,
                    mi=mi_,
                    mi_time=mi_t_,
                    dims=dims,

                )


                # Model II
                tc2_, h2_, h_t2_ = run_rbig_models(X2, measure='t', verbose=None)
                self._update_results(
                    model=model2_id,
                    year=imodel2[0],
                    h_time=h_t2_,
                    tc=tc2_,
                    h=h2_,
                    mi=mi_,
                    mi_time=mi_t_,
                    dims=dims,

                )

                # Update params in progress bar
                postfix = dict(
                    year=imodel1[0],
                    mi=f"{mi_:.3f}" if self.mi is True else None,
                    h1=f"{h1_:.3f}",
                    tc1=f"{tc1_:.3f}",
                    h2=f"{h2_:.3f}",
                    tc2=f"{tc2_:.3f}",
                )



                progress_bar.set_postfix(postfix)


        return None

    def _get_time_features(self, data_df: pd.DataFrame)-> pd.DataFrame:
        """This function collapses the spatial dimensions as pivots. This allows
        us to only consider time as the input feature."""
        return data_df.to_dataframe().unstack(level=0).reset_index().drop(columns=['lat', 'lon']).dropna()

    def _update_results(self, model, year, tc, h, h_time, mi, mi_time, dims):
        """appends new values to the results dataframe."""
        self.results_df = self.results_df.append({
            'model': model,
            'year': year,
            'tc': tc,
            'h': h,
            'h_time': h_time,
            'mi': mi,
            'mi_time': mi_time,
            'dims': dims,
        }, ignore_index=True
        )

        if self.results_path is not None:
            self._save_results()
        return self

    def _save_results(self):
        """Saves the dataframe to the assigned results path."""
        self.results_df.to_csv(f"{self.results_path}{self.variable}_{self.save_name}.csv")
        return None
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", 
    variable='mslp',
    save_name='era_ncep'
)

# run experiment (shorter decade)
short_decade_exp.run_experiment(era5_data_regrid, ncep_data)
100%|██████████| 41/41 [1:43:03<00:00, 150.83s/it, year=2019, mi=4.766, h1=-1.313, tc1=9.551, h2=-3.049, tc2=13.532] 
# extract results
results_df = short_decade_exp.results_df

ERA5 vs CMIP5

2006-01-16, 2025-12-16, 1979-01-01, 2019-07-01
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", variable='mslp', save_name='era_cmip',
    mi=True
)

# run experiment (shorter decade)
short_decade_exp.run_experiment(
    era5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01')), 
    cmip5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01'))
)
100%|██████████| 14/14 [35:08<00:00, 150.61s/it, year=2019, mi=3.290, h1=-1.509, tc1=9.747, h2=-0.807, tc2=8.142]  

NCEP vs CMIP5

# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", variable='mslp', save_name='ncep_cmip',
    mi=True
)

# run experiment (shorter decade)
short_decade_exp.run_experiment(
    ncep_data.sel(time=slice('2006-01-16', '2019-07-01')), 
    cmip5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01'))
)
100%|██████████| 14/14 [35:04<00:00, 150.30s/it, year=2019, mi=3.552, h1=-1.495, tc1=9.760, h2=-0.807, tc2=8.142]