Visually Comparing Climate Models¶

Summary¶

In this notebook, I will be comparing three climate reanalysis models:

NCEP-DOE Reanalysis 2: Surface
ERA5
CMIP5

I will be looking at the following variables:

Mean Sea Level Pressure (CMIP5, ERA5, NCEP)
Surface Pressure (ERA5, NCEP)

I will be trying to user RBIG in order to assess how similar these models are. I'll be looking at the following IT measures. If these climate models are that similar, then they should exhibit similar IT measures.

Preprocessing Steps¶

Regridded Spatially

The ERA5 had the coarsest spatial resolution (2.5 x 2.5).
I regridded the NCEP from (0.25 x 0.25) to (2.5 x 2.5).
I regridded the CMIP5 from (2 x 2.5) to (2.5 x 2.5).

Temporal Resolution

ERA5 and NCEP go from 1980-2019
CMIP5 goes from 2006-2018
For comparing ERA5 vs CMIP5 and NCEP vs CMIPF, I found the same time components

Measures¶

I'm measuring the following:

Entropy - expected uncertainty
Total Correlation - amount of redundant information between features
Mutual Information - amount of information shared between variables

Data¶

Inputs

I'm taking each year as is. Each spatial location is a sample and each year is a feature. My inputs are:

$X \in \mathbb{R}^{\text{spatial } \times \text{ month}}$

Outputs

All my information theory measures are in nats. They are scalars.

Hypothesis¶

Simple: The ERA5 and the NCEP model should be more similar than the CMIP5 model compared to each of them.

Data - Climate Models¶

!ls /home/emmanuel/projects/2020_rbig_rs/notebooks/climate/../../

LICENSE    data            environment_dev.yml  notebooks   reports   src
README.md  environment.yml  environment_gpu.yml  references  setup.py

import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')

# Import RBIG Helper
# from src.models.train_models import run_rbig_models

# ESDC tools
sys.path.insert(0, f'/home/emmanuel/code/py_esdc')
# from esdc.preprocessing import normalize_temporal
from pathlib import Path


import cdsapi
from zipfile import ZipFile
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing

# Visualization Tools
from src.data.climate.loader import ResultsLoader
# from src.visualization.climate import PlotResults
from src.visualization.climate.local import plot_diff, plot_individual, plot_individual_diff, plot_individual_all

import seaborn as sns
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
plt.style.use(['seaborn-poster', 'fivethirtyeight'])

%matplotlib inline

%load_ext autoreload
%autoreload 2

Entropy¶

def get_entropy_results(base_model: str, results: str='interim', model: str='rcp')-> pd.DataFrame:
    base_path = '/home/emmanuel/projects/2020_rbig_rs/data/climate/'

    # CHECK: RCP or AMIP
    if model == 'rcp':
        data_path = f"/rcp/local/individual/"
        ext = 'v3'
    elif model == 'amip':
        data_path = f"/amip/local/individual/"
        ext = 'v4'
    else:
        raise ValueError('Unrecognized model: ', model)

#     print(base_path + results + data_path)
    path = Path(base_path + results + data_path)

    if base_model == 'ncep':
        filename_pattern = 'ncep_*_'

    elif base_model == 'era5':
        filename_pattern = 'era5_*_'

    else:
        raise ValueError('Unrecognized base model:', base_model)

    df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern + ext + '.csv')]
    results_df   = pd.concat(df_from_each_file, ignore_index=True)
    return results_df

from typing import List

def post_processing_entropy(df: pd.DataFrame, model: str='amip', exclude: List[str]=['inmcm4'], normalized: bool=False)-> pd.DataFrame:

    # exclude models
    df = df[~df['cmip'].isin(exclude)]

    # divide by the spatial resolution

    if model == 'amip':
        banned_cmip_dates = [
            '1979-01-16 12:00:00', 
            '2009-01-16 12:00:00', 
            '2010-01-16 12:00:00'
        ]
        banned_base_dates = ['1979-01-01', '2010-01-01']
        banned_out_dates = [
         '2006-02-01',
         '2011-01-01',
         '2012-01-01',
         '2013-01-01',
         '2014-01-01',
         '2015-01-01',
         '2016-01-01',
         '2017-01-01',
         '2018-01-01',
         '2019-01-01'
        ]
    elif model == 'rcp':
        banned_cmip_dates = []
        banned_base_dates = []
        banned_out_dates = []
    elif model == 'none':
        banned_out_dates = []
        banned_cmip_dates = []
        banned_base_dates = []
    else:
        raise ValueError('Unrecognized dataset')
    # divide by the spatial resolution
    if normalized:
        df['h_base'] = df['h_base'] / (df['spatial'] ** 2)
        df['h_cmip'] = df['h_cmip'] / (df['spatial'] ** 2)

    df = df[~df.cmip_time.isin(banned_cmip_dates)]
    df = df[~df.base_time.isin(banned_base_dates)]
    df = df[~df.base_time.isin(banned_out_dates)]
    return df

NCEP - Individual IT Measures¶

data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/rcp/local/individual/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/rcp/local/individual/"

# extract results
results_df = get_entropy_results('ncep', 'interim', 'amip')

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1'
]
results_df = post_processing_entropy(results_df, exclude=exclude)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

results_df.cmip.unique().tolist()

['mpi_esm_lr', 'noresm1_m', 'mpi_esm_mr', 'access1_0', 'ipsl_cm5a_mr']

results_df.head()

	base	base_time	cmip	cmip_time	h_base	h_cmip	spatial	subsample	t_base	t_cmip	trial	variable
0	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.173002	1.263623	1.0	50000.0	1.319296	1.451097	0.0	psl
1	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.177404	1.269881	1.0	50000.0	1.351144	1.327829	1.0	psl
2	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.186812	1.277439	1.0	50000.0	1.282283	1.448870	2.0	psl
3	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.183485	1.273517	1.0	50000.0	1.317563	1.319149	3.0	psl
4	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.193985	1.280803	1.0	50000.0	1.315317	1.527269	4.0	psl

results_df[results_df['cmip'] == 'inmcm4'].unique()

	base	base_time	cmip	cmip_time	h_base	h_cmip	spatial	subsample	t_base	t_cmip	trial	variable
0	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.173002	1.263623	1.0	50000.0	1.319296	1.451097	0.0	psl
1	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.177404	1.269881	1.0	50000.0	1.351144	1.327829	1.0	psl
2	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.186812	1.277439	1.0	50000.0	1.282283	1.448870	2.0	psl
3	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.183485	1.273517	1.0	50000.0	1.317563	1.319149	3.0	psl
4	ncep	1979-01-01	inmcm4	1979-01-16 12:00:00	1.193985	1.280803	1.0	50000.0	1.315317	1.527269	4.0	psl

AMIP¶

# extract results
base_model = 'ncep'
exp = 'amip'
results_df = get_entropy_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
results_df = post_processing(results_df, 'amip')

results_df[results_df['cmip'] == 'inmcm4'].base_time.unique().tolist()

['2006-02-01',
 '2007-01-01',
 '2008-01-01',
 '2011-01-01',
 '2012-01-01',
 '2013-01-01',
 '2014-01-01',
 '2015-01-01',
 '2016-01-01',
 '2017-01-01',
 '2018-01-01',
 '2019-01-01']

results_df.base_time.unique().tolist()

['1980-01-01',
 '1981-01-01',
 '1982-01-01',
 '1983-01-01',
 '1984-01-01',
 '1985-01-01',
 '1986-01-01',
 '1987-01-01',
 '1988-01-01',
 '1989-01-01',
 '1990-01-01',
 '1991-01-01',
 '1992-01-01',
 '1993-01-01',
 '1994-01-01',
 '1995-01-01',
 '1996-01-01',
 '1997-01-01',
 '1998-01-01',
 '1999-01-01',
 '2000-01-01',
 '2001-01-01',
 '2002-01-01',
 '2003-01-01',
 '2004-01-01',
 '2005-01-01',
 '2006-01-01',
 '2007-01-01',
 '2008-01-01']

# extract results
base_model = 'ncep'
exp = 'amip'
results_df = get_entropy_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1'
]
normalized = True
results_df = post_processing_entropy(results_df, model=exp, exclude=exclude, normalized=normalized)

for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

RCP¶

We want to extract the different CMIP models. The other parameters are constant for now.

# extract results
base_model = 'ncep'
exp = 'rcp'
results_df = get_entropy_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1',
    'access1_3'
]

normalized = True
results_df = post_processing_entropy(results_df, model=exp, exclude=exclude, normalized=normalized)

for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:42: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

ERA5 - Individual IT Measures¶

AMIP¶

# extract results
base_model = 'era5'
exp = 'amip'
results_df = get_entropy_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1'
]

results_df = post_processing_entropy(results_df, model=exp, exclude=exclude)

for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

RCP¶

# extract results
base_model = 'era5'
exp = 'rcp'
results_df = get_entropy_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1',
    'access1_3'
]

results_df = post_processing_entropy(results_df, model=exp, exclude=exclude)

for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:40: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:41: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Mutual Information¶

def get_mutual_info_results(base_model: str, results: str='interim', model: str='rcp')-> pd.DataFrame:
    base_path = '/home/emmanuel/projects/2020_rbig_rs/data/climate/'



    # CHECK: RCP or AMIP
    if model == 'rcp':
        data_path = f"/rcp/local/compare/"
    elif model == 'amip':
        data_path = f"/amip/local/compare/"
    else:
        raise ValueError('Unrecognized model: ', model)

#     print(base_path + results + data_path)
    path = Path(base_path + results + data_path)

    if base_model == 'ncep':
        filename_pattern = 'ncep_*_v3.csv'

    elif base_model == 'era5':
        filename_pattern = 'era5_*_v3.csv'

    else:
        raise ValueError('Unrecognized base model:', base_model)

    df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
    results_df   = pd.concat(df_from_each_file, ignore_index=True)
    return results_df

# def get_results_files_mi(base_model: str)-> pd.DataFrame:

#     path = Path(data_path)

#     if base_model == 'ncep':
#         filename_pattern = 'ncep_*_v3.csv'

#     elif base_model == 'era5':
#         filename_pattern = 'era5_*_v3.csv'

#     else:
#         raise ValueError('Unrecognized base model:', base_model)

#     df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
#     results_df   = pd.concat(df_from_each_file, ignore_index=True)
#     return results_df

from typing import List

def post_processing_mi(df: pd.DataFrame, info: str='h', exp: str='amip', exclude: List[str]=['inmcm4'], normalized: bool=True)-> pd.DataFrame:

#     # subset models
#     cmip_models = [
#         'mpi_esm_lr',
#         'noresm1_m',
# #         'inmcm4',
#         'mpi_esm_mr',
#         'access1_0',
#         'ipsl_cm5a_mr'
#     ]
    df = df[~df['cmip'].isin(exclude)]

    # divide by the spatial resolution
    if exp == 'amip':
        banned_cmip_dates = ['1979-01-16 12:00:00', '2009-01-16 12:00:00', '2010-01-16 12:00:00']
        banned_base_dates = ['1979-01-01', '2010-01-01']
    elif exp == 'rcp':
        banned_cmip_dates = ['2019-01-16 12:00:00']
        banned_base_dates = []
    else:
        raise ValueError('Unrecognized exp:', exp)

    if info == 'h':
        if normalized:
            df['h_base'] = df['h_base'] / (df['spatial'] ** 2)
            df['h_cmip'] = df['h_cmip'] / (df['spatial'] ** 2)
        else:
            pass

    elif info == 'mi':
        if normalized:
            df['mi'] = df['mi'] / (df['spatial'] ** 2)
        else:
            pass
        df = df[~df.cmip_time.isin(banned_cmip_dates)]
        df = df[~df.base_time.isin(banned_base_dates)]

    return df

data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/interim/amip/local/compare/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/amip/local/compare/"

results_df.cmip.unique().tolist()

['mpi_esm_lr', 'access1_0']

NCEP - Comparative IT Measures¶

AMIP¶

results_df.cmip.unique().tolist()

['mpi_esm_lr', 'ipsl_cm5a_lr', 'noresm1_m', 'mpi_esm_mr', 'access1_0']

results_df.head()

	base	base_time	cmip	cmip_time	kendelltau	mi	pearson	spatial	spearman	subsample	time_mi	trial	variable
3220	ncep	1980-01-01	mpi_esm_lr	1980-01-16 12:00:00	0.599945	0.746869	0.791773	1	0.748195	10000	1.75668	0	psl
3221	ncep	1980-01-01	mpi_esm_lr	1980-01-16 12:00:00	0.591253	0.694736	0.787028	1	0.740457	10000	1.83795	1	psl
3222	ncep	1980-01-01	mpi_esm_lr	1980-01-16 12:00:00	0.584682	0.71889	0.780007	1	0.729812	10000	2.21417	2	psl
3223	ncep	1980-01-01	mpi_esm_lr	1980-01-16 12:00:00	0.598413	0.716736	0.783515	1	0.745998	10000	1.94358	3	psl
3224	ncep	1980-01-01	mpi_esm_lr	1980-01-16 12:00:00	0.59382	0.706751	0.787425	1	0.743049	10000	1.97115	4	psl

# extract results
base_model = 'ncep'
exp = 'amip'
results_df = get_mutual_info_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, normalized=normalized)
results_df.cmip[results_df['cmip'] == 'cnrm_cm5'] = 'mpi_esm_mr'


for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi',  model=exp, save=True)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

RCP¶

# extract results
base_model = 'ncep'
exp = 'rcp'
results_df = get_mutual_info_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1',
    'access1_3'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, exp=exp, normalized=normalized)
results_df.cmip[results_df['cmip'] == 'cnrm_cm5'] = 'mpi_esm_mr'


for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi',  model=exp, save=True)

/home/emmanuel/.conda/envs/2019_rbig_ad/lib/python3.6/site-packages/ipykernel_launcher.py:20: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

ERA5 - Comparative IT Measures¶

AMIP¶

# extract results
base_model = 'era5'
exp = 'amip'
results_df = get_mutual_info_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, normalized=normalized)

for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi',  model=exp, save=True)

RCP¶

# extract results
base_model = 'era5'
exp = 'rcp'
results_df = get_mutual_info_results(
    base_model, 
    results='results', 
    model=exp
)

# post processing
exclude = [
    'inmcm4',
    'giss_e2_r',
    'bnu_esm',
    'bcc_csm1_1',
    'access1_3'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, normalized=normalized, exp=exp)

for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:

    plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi',  model=exp, save=True)