Skip to content

Visually Comparing Climate Models


Summary

In this notebook, I will be comparing three climate reanalysis models:

  • NCEP-DOE Reanalysis 2: Surface
  • ERA5
  • CMIP5

I will be looking at the following variables:

  • Mean Sea Level Pressure (CMIP5, ERA5, NCEP)
  • Surface Pressure (ERA5, NCEP)

I will be trying to user RBIG in order to assess how similar these models are. I'll be looking at the following IT measures. If these climate models are that similar, then they should exhibit similar IT measures.


Preprocessing Steps

Regridded Spatially

  • The ERA5 had the coarsest spatial resolution (2.5 x 2.5).
  • I regridded the NCEP from (0.25 x 0.25) to (2.5 x 2.5).
  • I regridded the CMIP5 from (2 x 2.5) to (2.5 x 2.5).

Temporal Resolution

  • ERA5 and NCEP go from 1980-2019
  • CMIP5 goes from 2006-2018
  • For comparing ERA5 vs CMIP5 and NCEP vs CMIPF, I found the same time components


Measures

I'm measuring the following:

  • Entropy - expected uncertainty
  • Total Correlation - amount of redundant information between features
  • Mutual Information - amount of information shared between variables


Data

Inputs

I'm taking each year as is. Each spatial location is a sample and each year is a feature. My inputs are:

X \in \mathbb{R}^{\text{spatial } \times \text{ month}}

Outputs

All my information theory measures are in nats. They are scalars.


Hypothesis

Simple: The ERA5 and the NCEP model should be more similar than the CMIP5 model compared to each of them.


Data - Climate Models

import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')


# ESDC tools
sys.path.insert(0, f'/home/emmanuel/code/py_esdc')
# from esdc.preprocessing import normalize_temporal

import cdsapi
from zipfile import ZipFile
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing

# Visualization Tools
from src.data.climate.loader import ResultsLoader
from src.visualization.climate.global_it import plot_global_entropy, plot_global_diff_entropy, plot_global_mutual_info, plot_global_diff_mutual_info
# from src.visualization.climate import PlotResults

import seaborn as sns
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
plt.style.use(['seaborn-notebook', 'fivethirtyeight'])

%matplotlib inline

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/"
results_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/"

Experiment I - Global Entropy

from pathlib import Path
data_path = f"/home/emmanuel/projects/projects/2020_rbig_rs/data/climate/results/amip/local/compare/"

def get_results_files(base_model: str, trials: bool)-> pd.DataFrame:

    path = Path(data_path)

    if base_model == 'ncep':
        base_pattern = 'ncep'

    elif base_model == 'era5':
        base_pattern = 'era5'

    else:
        raise ValueError('Unrecognized base model:', base_model)
    trials_ext = 'tr100_v1'
#     if trials == True:
#         trials_ext = 'v1'
#     elif trials == False:
#         trials_ext = 'v2'
#     else:
#         raise ValueError("Unrecognized trials extentions:", trials)

    filename_pattern = base_pattern + '*' + trials_ext + '.csv'

    df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
    results_df   = pd.concat(df_from_each_file, ignore_index=True)
    return results_df

def post_processing_compare(df: pd.DataFrame)-> pd.DataFrame:

    # divide by the spatial resolution
    df['mi'] = df['mi'] / (df['spatial'] ** 2)

    return df
!ls /home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip/
compare_trial_v1.csv         spatial_compare_v3.csv
global                spatial_compare_v4.csv
global_compare_v2.csv         spatial_compare_v5.csv
global_compare_v3.csv         spatial_individual_trial_v1.csv
global_individual_v2.csv      spatial_individual_v2.csv
global_individual_v3.csv      spatial_individual_v3.csv
global_individual_v4.csv      spatial_individual_v4.csv
individual_trial_v1.csv       spatial_individual_v5.csv
local                 spatial_individual_v6.csv
spatial_compare_trial_v1.csv  v1.csv
spatial_compare_v2.csv        v2.csv

Entropy

df = pd.read_csv('/home/emmanuel/projects/2020_rbig_rs/data/climate/interim/amip/global/individual/ncep_bnu_esm_tr100_v1.csv')
df.head()
Unnamed: 0 base cmip h_base h_cmip spatial subsample t_base t_cmip tc_base tc_cmip trial variable
0 0 ncep bnu_esm 1.230567 1.290014 1.0 50000.0 1.301512 1.309189 0.0 0.0 0.0 psl
1 1 ncep bnu_esm 1.220025 1.284430 1.0 50000.0 1.288438 1.305716 0.0 0.0 1.0 psl
2 2 ncep bnu_esm 1.221952 1.291945 1.0 50000.0 1.292614 1.298469 0.0 0.0 2.0 psl
3 3 ncep bnu_esm 1.222894 1.289754 1.0 50000.0 1.295369 1.305580 0.0 0.0 3.0 psl
4 4 ncep bnu_esm 1.228588 1.290732 1.0 50000.0 1.302234 1.308249 0.0 0.0 4.0 psl
data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/interim/amip/global/individual/"

results_df = get_results_files('era5', False)
results_df.head()
base cmip h_base h_cmip spatial subsample t_base t_cmip tc_base tc_cmip trial variable
0 era5 bcc_csm1_1 1.218948 1.314152 1.0 50000.0 1.496089 1.480152 0.0 0.0 0.0 psl
1 era5 bcc_csm1_1 1.208673 1.308177 1.0 50000.0 1.498710 1.504606 0.0 0.0 1.0 psl
2 era5 bcc_csm1_1 1.214323 1.305756 1.0 50000.0 1.498986 1.505039 0.0 0.0 2.0 psl
3 era5 bcc_csm1_1 1.215485 1.309741 1.0 50000.0 1.543764 1.508892 0.0 0.0 3.0 psl
4 era5 bcc_csm1_1 1.217624 1.315335 1.0 50000.0 1.544796 1.531270 0.0 0.0 4.0 psl
results_df.cmip.unique().tolist()
['bcc_csm1_1',
 'access1_0',
 'mpi_esm_lr',
 'ipsl_cm5a_lr',
 'noresm1_m',
 'giss_e2_r',
 'cnrm_cm5',
 'bnu_esm']
SAVEPATH = '/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/amip/global/entropy/'
def plot_global_entropy(results_df: pd.DataFrame, base: str, cmip: str, normalized=True, log_mi=True, save=True)-> None:

    if normalized == True:
        results_df['h_cmip'] = results_df['h_cmip'] / results_df['spatial']**2
        results_df['h_base'] = results_df['h_base'] / results_df['spatial']**2
    results_df = results_df[results_df['base'] == base]
    results_df = results_df[results_df['cmip'] == cmip]
    fig, ax = plt.subplots()
    sns.scatterplot(data=results_df, x='spatial', y='h_base', label=f"{base}", ax=ax)
    sns.scatterplot(data=results_df, x='spatial', y='h_cmip', label=f"{cmip}", ax=ax)
    ax.set_title(f'{base.upper()} vs CMIP: {cmip.upper()}')
    ax.set_xlabel('Spatial Features')
#     ax.set_xlim([2, 6])
    ax.set_ylabel('Entropy, H')
    ax.legend()

    if save:
        savename = f"global_h_{base}_{cmip}.png"
        fig.savefig(SAVEPATH + savename, )
    else:
        plt.show()
    return None

Example

results_df.cmip.unique().tolist()
['bcc_csm1_1',
 'access1_0',
 'mpi_esm_lr',
 'ipsl_cm5a_lr',
 'noresm1_m',
 'giss_e2_r',
 'cnrm_cm5',
 'bnu_esm']
results_df = get_results_files('era5', False)
plot_global_entropy(results_df, 'era5', 'ipsl_cm5a_lr', normalized=True, save=True)
for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('era5', False)
    plot_global_entropy(results_df, 'era5', imodel, normalized=True, save=True)

# plot_global_entropy(results, 'ncep', 'inmcm4', normalized=False)
for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('ncep', False)
    plot_global_entropy(results_df, 'ncep', imodel, normalized=True,)

Difference in Entropy

import numpy as np

def plot_global_diff_entropy(results_df: pd.DataFrame, base: str, normalized=True, log_mi=False)-> None:

    results_copy = results_df[results_df['base'] == base]

    if normalized == True:
        results_copy['h_base'] = results_copy['h_base'] / results_copy['spatial']**2
        results_copy['h_cmip'] = results_copy['h_cmip'] / results_copy['spatial']**2

    results_copy = results_df[results_df['base'] == base]

    # calculate difference
    results_copy['h_diff'] = np.abs(results_copy['h_cmip'] - results_copy['h_base'])


    if log_mi == True:
        results_copy['h_diff'] = np.log(results_copy['h_diff'])

    fig, ax = plt.subplots()
#     sns.scatterplot(ax=ax, data=results_copy, x='spatial', y='h_diff', hue='base', color='black')
    sns.lineplot(ax=ax, data=results_copy, x='spatial', y='h_diff', hue='cmip')

    plt.title(f'')
    plt.xlabel('Spatial Features')
    plt.ylabel('Entropy, H')
    plt.legend(ncol=2, bbox_to_anchor=(2.05, 1), fontsize=16)
#     plt.tight_layout()
    plt.show()
results_df = get_results_files('era5', False)
plot_global_diff_entropy(results_df, 'era5', normalized=True, save=True)
results_df = get_results_files('ncep', False)
plot_global_diff_entropy(results_df, 'ncep', normalized=True, save=True)

Mutual Information

from pathlib import Path
data_path = f"/home/emmanuel/projects/projects/2020_rbig_rs/data/climate/results/amip/global/compare/"

def get_results_files(base_model: str, trials: bool)-> pd.DataFrame:

    path = Path(data_path)

    if base_model == 'ncep':
        base_pattern = 'ncep'

    elif base_model == 'era5':
        base_pattern = 'era5'

    else:
        raise ValueError('Unrecognized base model:', base_model)

    if trials == True:
        trials_ext = 'v1'
    elif trials == False:
        trials_ext = 'v2'
    else:
        raise ValueError("Unrecognized trials extentions:", trials)

    filename_pattern = base_pattern + '*' + trials_ext + '.csv'
#     [print(f) for f in path.rglob(filename_pattern)]
#     [print(pd.read_csv(f, index_col=0).columns.shape) for f in path.rglob(filename_pattern)]
    df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
    results_df   = pd.concat(df_from_each_file, ignore_index=True)
    return results_df

def post_processing_compare(df: pd.DataFrame)-> pd.DataFrame:

    # divide by the spatial resolution
    df['mi'] = df['mi'] / (df['spatial'] ** 2)

    return df
data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip/global/compare/"

results_df = get_results_files('ncep', True)
results_df.tail()
base cmip kendelltau mi pearson spatial spearman subsample time_mi trial variable
4310 ncep bcc_csm1_1 0.003454 0.023890 0.000746 3.0 0.005176 50000.0 86.576983 5.0 psl
4311 ncep bcc_csm1_1 -0.003573 0.028074 -0.007091 3.0 -0.005361 50000.0 76.669153 6.0 psl
4312 ncep bcc_csm1_1 0.003528 0.027675 0.008871 3.0 0.005295 50000.0 85.649547 7.0 psl
4313 ncep bcc_csm1_1 0.004418 0.066879 0.012918 3.0 0.006640 50000.0 149.052748 8.0 psl
4314 ncep bcc_csm1_1 0.002212 0.025614 0.002713 3.0 0.003324 50000.0 100.513506 9.0 psl
results_df.cmip.unique().tolist()
['access1_0',
 'ipsl_cm5a_lr',
 'giss_e2_r',
 'cnrm_cm5',
 'mpi_esm_lr',
 'inmcm4',
 'bnu_esm',
 'bcc_csm1_1_m',
 'noresm1_m',
 'bcc_csm1_1']
results_df = get_results_files('ncep', True)
plot_global_mutual_info(results_df, base='ncep', cmip='giss_e2_r', measure='mi', normalized=True, log_mi=True)
results_df = get_results_files('ncep', True)
plot_global_mutual_info(results_df, base='ncep', cmip=None, measure='mi', normalized=True, )
results_df = get_results_files('era5', True)
plot_global_mutual_info(results_df, base='era5', cmip=None, measure='mi', normalized=True, log_mi=True)
for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('ncep', True)
    plot_global_mutual_info(results_df, base='ncep', cmip=imodel, measure='mi', normalized=True, log_mi=True)
for imodel in results_df.cmip.unique().tolist():
    results_df = get_results_files('era5', True)
    plot_global_mutual_info(results_df, base='era5', cmip=imodel, measure='mi', normalized=True, log_mi=False)
def plot_global_mi(results_df: pd.DataFrame, base: str, cmip: str, measure='mi', normalized=True)-> None:

    results_df = results_df[results_df['base'] == base]
    results_df = results_df[results_df['cmip'] == cmip]

    if normalized and measure == 'mi':
        results_df[measure] = results_df[measure] / results_df['spatial']
    sns.scatterplot(data=results_df, x='spatial', y=measure, label=f"{base}")
    sns.scatterplot(data=results_df, x='spatial', y=measure, label=f"{cmip}")
    plt.title(f'{base.upper()}, CMIP: {cmip.upper()}')
    plt.xlabel('Spatial Features')
    plt.ylabel(f'{measure.upper()}')
    plt.legend()
    plt.show()
plot_global_mi(results_df, 'ncep', 'access1_0', 'mi')
# plot_global_mi(results_df, 'era5', 'noresm1_m', 'pearson')
# plot_global_mi(results_df, 'era5', 'noresm1_m', 'spearman')
# plot_global_mi(results_df, 'era5', 'noresm1_m', 'kendelltau')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'mi')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'pearson')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'spearman')
plot_global_mi(results_df, 'ncep', 'giss_e2_r', 'kendelltau')
def plot_global_diff_mi(results_df: pd.DataFrame, base: str, measure='mi', normalized=True, log_mi=True)-> None:


    results_copy = results_df[results_df['base'] == base]


    if normalized == True:
        results_copy[measure] = results_copy[measure] / results_copy['spatial']

    if log_mi == True and measure == 'mi':
        results_copy[measure] = np.log(results_copy[measure])

    fig, ax = plt.subplots()
#     sns.scatterplot(ax=ax, data=results_copy, x='spatial', y='h_diff', hue='base', color='black')
    sns.scatterplot(ax=ax, data=results_copy, x='spatial', y=measure, hue='cmip')

    plt.title(f'')
    plt.xlabel('Spatial Features')
    plt.ylabel(f'{measure.upper()}')
    plt.legend(ncol=2, bbox_to_anchor=(2.05, 1), fontsize=16)
#     plt.tight_layout()
    plt.show()
giss_e2_r
plot_global_diff_mi(results_df, 'ncep', 'mi', True, True)
plot_global_diff_mi(results_df, 'ncep', 'pearson', True, True)
plot_global_diff_mi(results_df, 'ncep', 'spearman', True, True)
plot_global_diff_mi(results_df, 'ncep', 'kendelltau', True, True)