Skip to content

Comparing Two Climate Models

In this notebook, I will be comparing two climate reanalysis models:

  • NCEP-DOE Reanalysis 2: Surface
  • ERA5

I will be looking at the following variables:

  • Surface Pressure
  • Mean Sea Level Pressure
  • Total Column Water

The idea is simple: these two models should have very similar properties. I will be trying to user RBIG in order to assess how similar these models are. I'll be looking at the following IT measures

  • Entropy
  • Total Correlation
  • Mutual Information

If these climate models are that similar, then they should exhibit similar IT measures.

Data - Climate Models

import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../"
sys.path.insert(0, f'{source_path}')

import numpy as np

# Import RBIG Helper
from src.models.train_models import run_rbig_models

# ESDC tools
sys.path.insert(0, f'/home/emmanuel/code/py_esdc')
from esdc.standardize import normalize_temporal
from esdc.transform import regrid_data

import cdsapi
from zipfile import ZipFile
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

%load_ext autoreload
%autoreload 2
data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/"
results_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/"

ERA5

era5_data = xr.open_dataset(f"{data_path}ERA5.nc")
era5_data = era5_data.rename({'msl': 'mslp', 'latitude': 'lat', 'longitude': 'lon'})
# era5_data = era5_data.rename({'latitude': 'lat'})
# era5_data.attrs['model_id'] = 'era5'
# rescale model from 0.25 to 2.5 degrees
# era5_data = era5_data.coarsen(lat=10, lon=10, boundary='pad').mean()
era5_data.attrs['model_id'] = 'era5'
era5_data
<xarray.Dataset>
Dimensions:  (lat: 721, lon: 1440, time: 487)
Coordinates:
  * lon      (lon) float32 0.0 0.25 0.5 0.75 1.0 ... 359.0 359.25 359.5 359.75
  * lat      (lat) float32 90.0 89.75 89.5 89.25 ... -89.25 -89.5 -89.75 -90.0
  * time     (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2019-07-01
Data variables:
    mslp     (time, lat, lon) float32 ...
    sp       (time, lat, lon) float32 ...
Attributes:
    Conventions:  CF-1.6
    history:      2019-10-07 09:20:10 GMT by grib_to_netcdf-2.10.0: /opt/ecmw...
    model_id:     era5

NCAR-NCEP-DOE-II

ncep_data = xr.open_mfdataset(f"{data_path}*mon.mean.nc")
ncep_data = ncep_data.rename({'pres': 'sp'})
ncep_data.attrs['model_id'] = 'ncar_ncep_doe_2'
ncep_data
<xarray.Dataset>
Dimensions:    (lat: 73, lon: 144, nbnds: 2, time: 489)
Coordinates:
  * lat        (lat) float32 90.0 87.5 85.0 82.5 ... -82.5 -85.0 -87.5 -90.0
  * lon        (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * time       (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2019-09-01
Dimensions without coordinates: nbnds
Data variables:
    mslp       (time, lat, lon) float32 dask.array<chunksize=(489, 73, 144), meta=np.ndarray>
    time_bnds  (time, nbnds) datetime64[ns] dask.array<chunksize=(489, 2), meta=np.ndarray>
    pr_wtr     (time, lat, lon) float32 dask.array<chunksize=(489, 73, 144), meta=np.ndarray>
    sp         (time, lat, lon) float32 dask.array<chunksize=(489, 73, 144), meta=np.ndarray>
Attributes:
    Conventions:    CF-1.0
    title:          Monthly NCEP/DOE Reanalysis 2
    history:        created 2002/03 by Hoop (netCDF2.3)
    comments:       Data is from \nNCEP/DOE AMIP-II Reanalysis (Reanalysis-2)...
    platform:       Model
    source:         NCEP/DOE AMIP-II Reanalysis (Reanalysis-2) Model
    institution:    National Centers for Environmental Prediction
    dataset_title:  NCEP-DOE AMIP-II Reanalysis
    References:     https://www.esrl.noaa.gov/psd/data/gridded/data.ncep.rean...
    source_url:     http://www.cpc.ncep.noaa.gov/products/wesley/reanalysis2/
    model_id:       ncar_ncep_doe_2

Regridding

era5_data_regrid = regrid_data(ncep_data, era5_data)
Create weight file: nearest_s2d_721x1440_73x144.nc
Remove file nearest_s2d_721x1440_73x144.nc
era5_data_regrid.attrs = era5_data.attrs
era5_data_regrid = xr.Dataset()
era5_data_regrid = xr.Dataset()
era5_data_regrid['sp'] = era5_regrid
era5_data_regrid.attrs = era5_data.attrs
era5_data_regrid
<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144, time: 487)
Coordinates:
  * time     (time) datetime64[ns] 1979-01-01 1979-02-01 ... 2019-07-01
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
Data variables:
    sp       (time, lat, lon) float64 1.027e+05 1.027e+05 ... 6.859e+04
Attributes:
    Conventions:  CF-1.6
    history:      2019-10-07 09:20:10 GMT by grib_to_netcdf-2.10.0: /opt/ecmw...
    model_id:     era5

CMIP5

cmip5_data = xr.open_dataset(f"{data_path}CMIP5.nc")
cmip5_data = cmip5_data.rename({'psl': 'mslp'})

# rescale model from 0.25 to 2.5 degrees
# cmip5_data = cmip5_data.coarsen(lat=1, boundary='pad').mean()
cmip5_data.attrs['model_id'] = 'cmip5'
cmip5_data
<xarray.Dataset>
Dimensions:    (bnds: 2, lat: 90, lon: 144, time: 240)
Coordinates:
  * time       (time) object 2006-01-16 12:00:00 ... 2025-12-16 12:00:00
  * lat        (lat) float64 -89.0 -87.0 -85.0 -83.0 ... 83.0 85.0 87.0 89.0
  * lon        (lon) float64 1.25 3.75 6.25 8.75 ... 351.2 353.8 356.2 358.8
Dimensions without coordinates: bnds
Data variables:
    time_bnds  (time, bnds) object ...
    lat_bnds   (lat, bnds) float64 ...
    lon_bnds   (lon, bnds) float64 ...
    mslp       (time, lat, lon) float32 ...
Attributes:
    institution:            NASA/GISS (Goddard Institute for Space Studies) N...
    institute_id:           NASA-GISS
    experiment_id:          rcp85
    source:                 GISS-E2-R-E135RCP85aF40oQ32 Atmosphere: GISS-E2; ...
    model_id:               cmip5
    forcing:                GHG, LU, Sl, Vl, BC, OC, SA, Oz (also includes or...
    parent_experiment_id:   historical
    parent_experiment_rip:  r1i1p1
    branch_time:            2006.0
    contact:                Kenneth Lo (cdkkl@giss.nasa.gov)
    references:             www.giss.nasa.gov/research/modeling
    initialization_method:  1
    physics_version:        1
    tracking_id:            71ff3d6b-02eb-470f-a25d-5e79c1b8c1b5
    product:                output
    experiment:             RCP8.5
    frequency:              mon
    creation_date:          2011-08-30T18:58:55Z
    history:                2011-08-30T18:58:55Z CMOR rewrote data to comply ...
    Conventions:            CF-1.4
    project_id:             CMIP5
    table_id:               Table Amon (31 January 2011) 53b766a395ac41696af4...
    title:                  GISS-E2-R model output prepared for CMIP5 RCP8.5
    parent_experiment:      historical
    modeling_realm:         atmos
    realization:            1
    cmor_version:           2.5.7
cmip5_regrid = regrid_data(ncep_data.mslp, cmip5_data.mslp)
cmip5_data_regrid = xr.Dataset()
cmip5_data_regrid['mslp'] = cmip5_regrid
cmip5_data_regrid.attrs = cmip5_data.attrs
cmip5_data_regrid
Reuse existing file: nearest_s2d_90x144_73x144.nc
Remove file nearest_s2d_90x144_73x144.nc
<xarray.Dataset>
Dimensions:  (lat: 73, lon: 144, time: 240)
Coordinates:
  * time     (time) object 2006-01-16 12:00:00 ... 2025-12-16 12:00:00
  * lon      (lon) float32 0.0 2.5 5.0 7.5 10.0 ... 350.0 352.5 355.0 357.5
  * lat      (lat) float32 90.0 87.5 85.0 82.5 80.0 ... -82.5 -85.0 -87.5 -90.0
Data variables:
    mslp     (time, lat, lon) float64 9.993e+04 9.993e+04 ... 9.985e+04
Attributes:
    institution:            NASA/GISS (Goddard Institute for Space Studies) N...
    institute_id:           NASA-GISS
    experiment_id:          rcp85
    source:                 GISS-E2-R-E135RCP85aF40oQ32 Atmosphere: GISS-E2; ...
    model_id:               cmip5
    forcing:                GHG, LU, Sl, Vl, BC, OC, SA, Oz (also includes or...
    parent_experiment_id:   historical
    parent_experiment_rip:  r1i1p1
    branch_time:            2006.0
    contact:                Kenneth Lo (cdkkl@giss.nasa.gov)
    references:             www.giss.nasa.gov/research/modeling
    initialization_method:  1
    physics_version:        1
    tracking_id:            71ff3d6b-02eb-470f-a25d-5e79c1b8c1b5
    product:                output
    experiment:             RCP8.5
    frequency:              mon
    creation_date:          2011-08-30T18:58:55Z
    history:                2011-08-30T18:58:55Z CMOR rewrote data to comply ...
    Conventions:            CF-1.4
    project_id:             CMIP5
    table_id:               Table Amon (31 January 2011) 53b766a395ac41696af4...
    title:                  GISS-E2-R model output prepared for CMIP5 RCP8.5
    parent_experiment:      historical
    modeling_realm:         atmos
    realization:            1
    cmor_version:           2.5.7

Experiment I - Comparing Climate Models

Mean Sea Level Pressure

ERA5 vs NCEP

# Experiment class
class ClimateEntropy:
    def __init__(self, save_path: None, variable: str='mslp', save_name=None, mi: bool=True):

        self.variable = variable

        self.results_path = save_path
        self.results_df = pd.DataFrame()
        self.save_name = save_name
        self.mi = mi

    def run_experiment(self, climate_model1: pd.DataFrame, climate_model2: pd.DataFrame) -> None:
        """Training loop that goes through each year and calculates the entropy,
        total correlation and mutual information between the two models."""
        time_length = len(climate_model1.groupby('time.year'))

        # Normalize BEFORE the individual calculations
        climate_model1[self.variable] = normalize_temporal(climate_model1[self.variable])

        model1_id = climate_model1.attrs['model_id']
        model2_id = climate_model2.attrs['model_id']
        climate_model2[self.variable] = normalize_temporal(climate_model2[self.variable])

        with tqdm(zip(
            climate_model1.groupby('time.year'), 
            climate_model2.groupby('time.year')
        ), total=time_length) as progress_bar:

            for imodel1, imodel2 in progress_bar:

                # Update params in progress bar

                # Transform to dataframe, remove spatial dimensions
                X1 = self._get_time_features(imodel1[1][self.variable])
                X2 = self._get_time_features(imodel2[1][self.variable])

                # Normalize inputs
                min_max_scaler = preprocessing.StandardScaler()
                X1 = min_max_scaler.fit_transform(X1.values)
                X2 = min_max_scaler.fit_transform(X2.values)

                dims = X1.shape[1]

                # =============================
                # Calculate Mutual Information
                # =============================
                if self.mi == False:
                    mi_ = None
                    mi_t_ = None
                else:
                    mi_, mi_t_ = run_rbig_models(X1, X2, measure='mi', verbose=None)


                # Update params in progress bar
                postfix = dict(

                )

                # ========================================
                # Calculate Entropy and Total Correlation
                # ========================================

                # Model I
                tc1_, h1_, h_t1_ = run_rbig_models(X1, measure='t', verbose=None)

                self._update_results(
                    model=model1_id,
                    year=imodel1[0],
                    h_time=h_t1_,
                    tc=tc1_,
                    h=h1_,
                    mi=mi_,
                    mi_time=mi_t_,
                    dims=dims,

                )


                # Model II
                tc2_, h2_, h_t2_ = run_rbig_models(X2, measure='t', verbose=None)
                self._update_results(
                    model=model2_id,
                    year=imodel2[0],
                    h_time=h_t2_,
                    tc=tc2_,
                    h=h2_,
                    mi=mi_,
                    mi_time=mi_t_,
                    dims=dims,

                )

                # Update params in progress bar
                postfix = dict(
                    year=imodel1[0],
                    mi=f"{mi_:.3f}" if self.mi is True else None,
                    h1=f"{h1_:.3f}",
                    tc1=f"{tc1_:.3f}",
                    h2=f"{h2_:.3f}",
                    tc2=f"{tc2_:.3f}",
                )



                progress_bar.set_postfix(postfix)


        return None

    def _get_time_features(self, data_df: pd.DataFrame)-> pd.DataFrame:
        """This function collapses the spatial dimensions as pivots. This allows
        us to only consider time as the input feature."""
        return data_df.to_dataframe().unstack(level=0).reset_index().drop(columns=['lat', 'lon']).dropna()

    def _update_results(self, model, year, tc, h, h_time, mi, mi_time, dims):
        """appends new values to the results dataframe."""
        self.results_df = self.results_df.append({
            'model': model,
            'year': year,
            'tc': tc,
            'h': h,
            'h_time': h_time,
            'mi': mi,
            'mi_time': mi_time,
            'dims': dims,
        }, ignore_index=True
        )

        if self.results_path is not None:
            self._save_results()
        return self

    def _save_results(self):
        """Saves the dataframe to the assigned results path."""
        self.results_df.to_csv(f"{self.results_path}{self.variable}_{self.save_name}.csv")
        return None
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", 
    variable='mslp',
    save_name='era_ncep'
)

# run experiment (shorter decade)
short_decade_exp.run_experiment(era5_data_regrid, ncep_data)
100%|██████████| 41/41 [1:43:03<00:00, 150.83s/it, year=2019, mi=4.766, h1=-1.313, tc1=9.551, h2=-3.049, tc2=13.532] 
# extract results
results_df = short_decade_exp.results_df

ERA5 vs CMIP5

2006-01-16, 2025-12-16, 1979-01-01, 2019-07-01
# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", variable='mslp', save_name='era_cmip',
    mi=True
)

# run experiment (shorter decade)
short_decade_exp.run_experiment(
    era5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01')), 
    cmip5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01'))
)
100%|██████████| 14/14 [35:08<00:00, 150.61s/it, year=2019, mi=3.290, h1=-1.509, tc1=9.747, h2=-0.807, tc2=8.142]  

NCEP vs CMIP5

# Initialize experiment
short_decade_exp = ClimateEntropy(
    save_path=f"{results_path}", variable='mslp', save_name='ncep_cmip',
    mi=True
)

# run experiment (shorter decade)
short_decade_exp.run_experiment(
    ncep_data.sel(time=slice('2006-01-16', '2019-07-01')), 
    cmip5_data_regrid.sel(time=slice('2006-01-16', '2019-07-01'))
)
100%|██████████| 14/14 [35:04<00:00, 150.30s/it, year=2019, mi=3.552, h1=-1.495, tc1=9.760, h2=-0.807, tc2=8.142]