Visually Comparing Climate Models¶

Summary¶

In this notebook, I will be comparing three climate reanalysis models:

NCEP-DOE Reanalysis 2: Surface
ERA5
CMIP5

I will be looking at the following variables:

Mean Sea Level Pressure (CMIP5, ERA5, NCEP)
Surface Pressure (ERA5, NCEP)

I will be trying to user RBIG in order to assess how similar these models are. I'll be looking at the following IT measures. If these climate models are that similar, then they should exhibit similar IT measures.

Preprocessing Steps¶

Regridded Spatially

The ERA5 had the coarsest spatial resolution (2.5 x 2.5).
I regridded the NCEP from (0.25 x 0.25) to (2.5 x 2.5).
I regridded the CMIP5 from (2 x 2.5) to (2.5 x 2.5).

Temporal Resolution

ERA5 and NCEP go from 1980-2019
CMIP5 goes from 2006-2018
For comparing ERA5 vs CMIP5 and NCEP vs CMIPF, I found the same time components

Measures¶

I'm measuring the following:

Entropy - expected uncertainty
Total Correlation - amount of redundant information between features
Mutual Information - amount of information shared between variables

Data¶

Inputs

I'm taking each year as is. Each spatial location is a sample and each year is a feature. My inputs are:

$X \in \mathbb{R}^{\text{spatial } \times \text{ month}}$

Outputs

All my information theory measures are in nats. They are scalars.

Hypothesis¶

Simple: The ERA5 and the NCEP model should be more similar than the CMIP5 model compared to each of them.

Data - Climate Models¶

import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')


# ESDC tools
sys.path.insert(0, f'/home/emmanuel/code/py_esdc')
# from esdc.preprocessing import normalize_temporal

import cdsapi
from zipfile import ZipFile
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing

# Visualization Tools
from src.data.climate.loader import ResultsLoader
from src.visualization.climate.global_it import plot_global_entropy, plot_global_diff_entropy, plot_global_mutual_info, plot_global_diff_mutual_info
# from src.visualization.climate import PlotResults

import seaborn as sns
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
plt.style.use(['seaborn-notebook', 'fivethirtyeight'])

%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/"
results_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/"

Experiment I - Global Entropy¶

from pathlib import Path
data_path = f"/home/emmanuel/projects/projects/2020_rbig_rs/data/climate/results/amip/local/compare/"

def get_results_files(base_model: str, trials: bool)-> pd.DataFrame:

    path = Path(data_path)

    if base_model == 'ncep':
        base_pattern = 'ncep'

    elif base_model == 'era5':
        base_pattern = 'era5'

    else:
        raise ValueError('Unrecognized base model:', base_model)
    trials_ext = 'tr100_v1'
#     if trials == True:
#         trials_ext = 'v1'
#     elif trials == False:
#         trials_ext = 'v2'
#     else:
#         raise ValueError("Unrecognized trials extentions:", trials)

    filename_pattern = base_pattern + '*' + trials_ext + '.csv'

    df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
    results_df   = pd.concat(df_from_each_file, ignore_index=True)
    return results_df

def post_processing_compare(df: pd.DataFrame)-> pd.DataFrame:

    # divide by the spatial resolution
    df['mi'] = df['mi'] / (df['spatial'] ** 2)

    return df

!ls /home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip/

compare_trial_v1.csv         spatial_compare_v3.csv
global                spatial_compare_v4.csv
global_compare_v2.csv         spatial_compare_v5.csv
global_compare_v3.csv         spatial_individual_trial_v1.csv
global_individual_v2.csv      spatial_individual_v2.csv
global_individual_v3.csv      spatial_individual_v3.csv
global_individual_v4.csv      spatial_individual_v4.csv
individual_trial_v1.csv       spatial_individual_v5.csv
local                 spatial_individual_v6.csv
spatial_compare_trial_v1.csv  v1.csv
spatial_compare_v2.csv        v2.csv

Entropy¶

df = pd.read_csv('/home/emmanuel/projects/2020_rbig_rs/data/climate/interim/amip/global/individual/ncep_bnu_esm_tr100_v1.csv')
df.head()

	Unnamed: 0	base	cmip	h_base	h_cmip	spatial	subsample	t_base	t_cmip	trial	variable
0	0	ncep	bnu_esm	1.230567	1.290014	1.0	50000.0	1.301512	1.309189	0.0	psl
1	1	ncep	bnu_esm	1.220025	1.284430	1.0	50000.0	1.288438	1.305716	1.0	psl
2	2	ncep	bnu_esm	1.221952	1.291945	1.0	50000.0	1.292614	1.298469	2.0	psl
3	3	ncep	bnu_esm	1.222894	1.289754	1.0	50000.0	1.295369	1.305580	3.0	psl
4	4	ncep	bnu_esm	1.228588	1.290732	1.0	50000.0	1.302234	1.308249	4.0	psl

data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/interim/amip/global/individual/"

results_df = get_results_files('era5', False)
results_df.head()

	base	cmip	h_base	h_cmip	spatial	subsample	t_base	t_cmip	trial	variable
0	era5	bcc_csm1_1	1.218948	1.314152	1.0	50000.0	1.496089	1.480152	0.0	psl
1	era5	bcc_csm1_1	1.208673	1.308177	1.0	50000.0	1.498710	1.504606	1.0	psl
2	era5	bcc_csm1_1	1.214323	1.305756	1.0	50000.0	1.498986	1.505039	2.0	psl
3	era5	bcc_csm1_1	1.215485	1.309741	1.0	50000.0	1.543764	1.508892	3.0	psl
4	era5	bcc_csm1_1	1.217624	1.315335	1.0	50000.0	1.544796	1.531270	4.0	psl

results_df.cmip.unique().tolist()

['bcc_csm1_1',
 'access1_0',
 'mpi_esm_lr',
 'ipsl_cm5a_lr',
 'noresm1_m',
 'giss_e2_r',
 'cnrm_cm5',
 'bnu_esm']

SAVEPATH = '/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/amip/global/entropy/'
def plot_global_entropy(results_df: pd.DataFrame, base: str, cmip: str, normalized=True, log_mi=True, save=True)-> None:

    if normalized == True:
        results_df['h_cmip'] = results_df['h_cmip'] / results_df['spatial']**2
        results_df['h_base'] = results_df['h_base'] / results_df['spatial']**2
    results_df = results_df[results_df['base'] == base]
    results_df = results_df[results_df['cmip'] == cmip]
    fig, ax = plt.subplots()
    sns.scatterplot(data=results_df, x='spatial', y='h_base', label=f"{base}", ax=ax)
    sns.scatterplot(data=results_df, x='spatial', y='h_cmip', label=f"{cmip}", ax=ax)
    ax.set_title(f'{base.upper()} vs CMIP: {cmip.upper()}')
    ax.set_xlabel('Spatial Features')
#     ax.set_xlim([2, 6])
    ax.set_ylabel('Entropy, H')
    ax.legend()

    if save:
        savename = f"global_h_{base}_{cmip}.png"
        fig.savefig(SAVEPATH + savename, )
    else:
        plt.show()
    return None

Example¶

results_df.cmip.unique().tolist()

['bcc_csm1_1',
 'access1_0',
 'mpi_esm_lr',
 'ipsl_cm5a_lr',
 'noresm1_m',
 'giss_e2_r',
 'cnrm_cm5',
 'bnu_esm']

results_df = get_results_files('era5', False)
plot_global_entropy(results_df, 'era5', 'ipsl_cm5a_lr', normalized=True, save=True)

for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('era5', False)
    plot_global_entropy(results_df, 'era5', imodel, normalized=True, save=True)

# plot_global_entropy(results, 'ncep', 'inmcm4', normalized=False)

for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('ncep', False)
    plot_global_entropy(results_df, 'ncep', imodel, normalized=True,)

Difference in Entropy¶

import numpy as np

def plot_global_diff_entropy(results_df: pd.DataFrame, base: str, normalized=True, log_mi=False)-> None:

    results_copy = results_df[results_df['base'] == base]

    if normalized == True:
        results_copy['h_base'] = results_copy['h_base'] / results_copy['spatial']**2
        results_copy['h_cmip'] = results_copy['h_cmip'] / results_copy['spatial']**2

    results_copy = results_df[results_df['base'] == base]

    # calculate difference
    results_copy['h_diff'] = np.abs(results_copy['h_cmip'] - results_copy['h_base'])


    if log_mi == True:
        results_copy['h_diff'] = np.log(results_copy['h_diff'])

    fig, ax = plt.subplots()
#     sns.scatterplot(ax=ax, data=results_copy, x='spatial', y='h_diff', hue='base', color='black')
    sns.lineplot(ax=ax, data=results_copy, x='spatial', y='h_diff', hue='cmip')

    plt.title(f'')
    plt.xlabel('Spatial Features')
    plt.ylabel('Entropy, H')
    plt.legend(ncol=2, bbox_to_anchor=(2.05, 1), fontsize=16)
#     plt.tight_layout()
    plt.show()

results_df = get_results_files('era5', False)
plot_global_diff_entropy(results_df, 'era5', normalized=True, save=True)

results_df = get_results_files('ncep', False)
plot_global_diff_entropy(results_df, 'ncep', normalized=True, save=True)

Mutual Information¶

from pathlib import Path
data_path = f"/home/emmanuel/projects/projects/2020_rbig_rs/data/climate/results/amip/global/compare/"

def get_results_files(base_model: str, trials: bool)-> pd.DataFrame:

    path = Path(data_path)

    if base_model == 'ncep':
        base_pattern = 'ncep'

    elif base_model == 'era5':
        base_pattern = 'era5'

    else:
        raise ValueError('Unrecognized base model:', base_model)

    if trials == True:
        trials_ext = 'v1'
    elif trials == False:
        trials_ext = 'v2'
    else:
        raise ValueError("Unrecognized trials extentions:", trials)

    filename_pattern = base_pattern + '*' + trials_ext + '.csv'
#     [print(f) for f in path.rglob(filename_pattern)]
#     [print(pd.read_csv(f, index_col=0).columns.shape) for f in path.rglob(filename_pattern)]
    df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
    results_df   = pd.concat(df_from_each_file, ignore_index=True)
    return results_df

def post_processing_compare(df: pd.DataFrame)-> pd.DataFrame:

    # divide by the spatial resolution
    df['mi'] = df['mi'] / (df['spatial'] ** 2)

    return df

data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip/global/compare/"

results_df = get_results_files('ncep', True)
results_df.tail()

	base	cmip	kendelltau	mi	pearson	spatial	spearman	subsample	time_mi	trial	variable
4310	ncep	bcc_csm1_1	0.003454	0.023890	0.000746	3.0	0.005176	50000.0	86.576983	5.0	psl
4311	ncep	bcc_csm1_1	-0.003573	0.028074	-0.007091	3.0	-0.005361	50000.0	76.669153	6.0	psl
4312	ncep	bcc_csm1_1	0.003528	0.027675	0.008871	3.0	0.005295	50000.0	85.649547	7.0	psl
4313	ncep	bcc_csm1_1	0.004418	0.066879	0.012918	3.0	0.006640	50000.0	149.052748	8.0	psl
4314	ncep	bcc_csm1_1	0.002212	0.025614	0.002713	3.0	0.003324	50000.0	100.513506	9.0	psl

results_df.cmip.unique().tolist()

['access1_0',
 'ipsl_cm5a_lr',
 'giss_e2_r',
 'cnrm_cm5',
 'mpi_esm_lr',
 'inmcm4',
 'bnu_esm',
 'bcc_csm1_1_m',
 'noresm1_m',
 'bcc_csm1_1']

results_df = get_results_files('ncep', True)
plot_global_mutual_info(results_df, base='ncep', cmip='giss_e2_r', measure='mi', normalized=True, log_mi=True)

results_df = get_results_files('ncep', True)
plot_global_mutual_info(results_df, base='ncep', cmip=None, measure='mi', normalized=True, )

results_df = get_results_files('era5', True)
plot_global_mutual_info(results_df, base='era5', cmip=None, measure='mi', normalized=True, log_mi=True)

for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('ncep', True)
    plot_global_mutual_info(results_df, base='ncep', cmip=imodel, measure='mi', normalized=True, log_mi=True)

for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('era5', True)
    plot_global_mutual_info(results_df, base='era5', cmip=imodel, measure='mi', normalized=True, log_mi=False)

def plot_global_mi(results_df: pd.DataFrame, base: str, cmip: str, measure='mi', normalized=True)-> None:

    results_df = results_df[results_df['base'] == base]
    results_df = results_df[results_df['cmip'] == cmip]

    if normalized and measure == 'mi':
        results_df[measure] = results_df[measure] / results_df['spatial']
    sns.scatterplot(data=results_df, x='spatial', y=measure, label=f"{base}")
    sns.scatterplot(data=results_df, x='spatial', y=measure, label=f"{cmip}")
    plt.title(f'{base.upper()}, CMIP: {cmip.upper()}')
    plt.xlabel('Spatial Features')
    plt.ylabel(f'{measure.upper()}')
    plt.legend()
    plt.show()

plot_global_mi(results_df, 'ncep', 'access1_0', 'mi')
# plot_global_mi(results_df, 'era5', 'noresm1_m', 'pearson')
# plot_global_mi(results_df, 'era5', 'noresm1_m', 'spearman')
# plot_global_mi(results_df, 'era5', 'noresm1_m', 'kendelltau')

plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'mi')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'pearson')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'spearman')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'kendelltau')

def plot_global_diff_mi(results_df: pd.DataFrame, base: str, measure='mi', normalized=True, log_mi=True)-> None:


    results_copy = results_df[results_df['base'] == base]


    if normalized == True:
        results_copy[measure] = results_copy[measure] / results_copy['spatial']

    if log_mi == True and measure == 'mi':
        results_copy[measure] = np.log(results_copy[measure])

    fig, ax = plt.subplots()
#     sns.scatterplot(ax=ax, data=results_copy, x='spatial', y='h_diff', hue='base', color='black')
    sns.scatterplot(ax=ax, data=results_copy, x='spatial', y=measure, hue='cmip')

    plt.title(f'')
    plt.xlabel('Spatial Features')
    plt.ylabel(f'{measure.upper()}')
    plt.legend(ncol=2, bbox_to_anchor=(2.05, 1), fontsize=16)
#     plt.tight_layout()
    plt.show()

giss_e2_r

plot_global_diff_mi(results_df, 'ncep', 'mi', True, True)
plot_global_diff_mi(results_df, 'ncep', 'pearson', True, True)
plot_global_diff_mi(results_df, 'ncep', 'spearman', True, True)
plot_global_diff_mi(results_df, 'ncep', 'kendelltau', True, True)