Visually Comparing Climate Models¶
Summary¶
In this notebook, I will be comparing three climate reanalysis models:
- NCEP-DOE Reanalysis 2: Surface
- ERA5
- CMIP5
I will be looking at the following variables:
- Mean Sea Level Pressure (CMIP5, ERA5, NCEP)
- Surface Pressure (ERA5, NCEP)
I will be trying to user RBIG in order to assess how similar these models are. I'll be looking at the following IT measures. If these climate models are that similar, then they should exhibit similar IT measures.
Preprocessing Steps¶
Regridded Spatially
- The ERA5 had the coarsest spatial resolution (2.5 x 2.5).
- I regridded the NCEP from (0.25 x 0.25) to (2.5 x 2.5).
- I regridded the CMIP5 from (2 x 2.5) to (2.5 x 2.5).
Temporal Resolution
- ERA5 and NCEP go from 1980-2019
- CMIP5 goes from 2006-2018
- For comparing ERA5 vs CMIP5 and NCEP vs CMIPF, I found the same time components
Measures¶
I'm measuring the following:
- Entropy - expected uncertainty
- Total Correlation - amount of redundant information between features
- Mutual Information - amount of information shared between variables
Data¶
Inputs
I'm taking each year as is. Each spatial location is a sample and each year is a feature. My inputs are:
X \in \mathbb{R}^{\text{spatial } \times \text{ month}}
Outputs
All my information theory measures are in nats. They are scalars.
Hypothesis¶
Simple: The ERA5 and the NCEP model should be more similar than the CMIP5 model compared to each of them.
Data - Climate Models¶
!ls /home/emmanuel/projects/2020_rbig_rs/notebooks/climate/../../
import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')
# Import RBIG Helper
# from src.models.train_models import run_rbig_models
# ESDC tools
sys.path.insert(0, f'/home/emmanuel/code/py_esdc')
# from esdc.preprocessing import normalize_temporal
from pathlib import Path
import cdsapi
from zipfile import ZipFile
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing
# Visualization Tools
from src.data.climate.loader import ResultsLoader
# from src.visualization.climate import PlotResults
from src.visualization.climate.local import plot_diff, plot_individual, plot_individual_diff, plot_individual_all
import seaborn as sns
import matplotlib.pyplot as plt
# plt.style.use('ggplot')
plt.style.use(['seaborn-poster', 'fivethirtyeight'])
%matplotlib inline
%load_ext autoreload
%autoreload 2
Entropy¶
def get_entropy_results(base_model: str, results: str='interim', model: str='rcp')-> pd.DataFrame:
base_path = '/home/emmanuel/projects/2020_rbig_rs/data/climate/'
# CHECK: RCP or AMIP
if model == 'rcp':
data_path = f"/rcp/local/individual/"
ext = 'v3'
elif model == 'amip':
data_path = f"/amip/local/individual/"
ext = 'v4'
else:
raise ValueError('Unrecognized model: ', model)
# print(base_path + results + data_path)
path = Path(base_path + results + data_path)
if base_model == 'ncep':
filename_pattern = 'ncep_*_'
elif base_model == 'era5':
filename_pattern = 'era5_*_'
else:
raise ValueError('Unrecognized base model:', base_model)
df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern + ext + '.csv')]
results_df = pd.concat(df_from_each_file, ignore_index=True)
return results_df
from typing import List
def post_processing_entropy(df: pd.DataFrame, model: str='amip', exclude: List[str]=['inmcm4'], normalized: bool=False)-> pd.DataFrame:
# exclude models
df = df[~df['cmip'].isin(exclude)]
# divide by the spatial resolution
if model == 'amip':
banned_cmip_dates = [
'1979-01-16 12:00:00',
'2009-01-16 12:00:00',
'2010-01-16 12:00:00'
]
banned_base_dates = ['1979-01-01', '2010-01-01']
banned_out_dates = [
'2006-02-01',
'2011-01-01',
'2012-01-01',
'2013-01-01',
'2014-01-01',
'2015-01-01',
'2016-01-01',
'2017-01-01',
'2018-01-01',
'2019-01-01'
]
elif model == 'rcp':
banned_cmip_dates = []
banned_base_dates = []
banned_out_dates = []
elif model == 'none':
banned_out_dates = []
banned_cmip_dates = []
banned_base_dates = []
else:
raise ValueError('Unrecognized dataset')
# divide by the spatial resolution
if normalized:
df['h_base'] = df['h_base'] / (df['spatial'] ** 2)
df['h_cmip'] = df['h_cmip'] / (df['spatial'] ** 2)
df = df[~df.cmip_time.isin(banned_cmip_dates)]
df = df[~df.base_time.isin(banned_base_dates)]
df = df[~df.base_time.isin(banned_out_dates)]
return df
NCEP - Individual IT Measures¶
data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/rcp/local/individual/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/rcp/local/individual/"
# extract results
results_df = get_entropy_results('ncep', 'interim', 'amip')
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1'
]
results_df = post_processing_entropy(results_df, exclude=exclude)
results_df.cmip.unique().tolist()
results_df.head()
results_df[results_df['cmip'] == 'inmcm4'].unique()
AMIP¶
# extract results
base_model = 'ncep'
exp = 'amip'
results_df = get_entropy_results(
base_model,
results='results',
model=exp
)
# post processing
results_df = post_processing(results_df, 'amip')
results_df[results_df['cmip'] == 'inmcm4'].base_time.unique().tolist()
results_df.base_time.unique().tolist()
# extract results
base_model = 'ncep'
exp = 'amip'
results_df = get_entropy_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1'
]
normalized = True
results_df = post_processing_entropy(results_df, model=exp, exclude=exclude, normalized=normalized)
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)
RCP¶
We want to extract the different CMIP models. The other parameters are constant for now.
# extract results
base_model = 'ncep'
exp = 'rcp'
results_df = get_entropy_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1',
'access1_3'
]
normalized = True
results_df = post_processing_entropy(results_df, model=exp, exclude=exclude, normalized=normalized)
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)
ERA5 - Individual IT Measures¶
AMIP¶
# extract results
base_model = 'era5'
exp = 'amip'
results_df = get_entropy_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1'
]
results_df = post_processing_entropy(results_df, model=exp, exclude=exclude)
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)
RCP¶
# extract results
base_model = 'era5'
exp = 'rcp'
results_df = get_entropy_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1',
'access1_3'
]
results_df = post_processing_entropy(results_df, model=exp, exclude=exclude)
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, ispatial, model=exp, save=True)
Mutual Information¶
def get_mutual_info_results(base_model: str, results: str='interim', model: str='rcp')-> pd.DataFrame:
base_path = '/home/emmanuel/projects/2020_rbig_rs/data/climate/'
# CHECK: RCP or AMIP
if model == 'rcp':
data_path = f"/rcp/local/compare/"
elif model == 'amip':
data_path = f"/amip/local/compare/"
else:
raise ValueError('Unrecognized model: ', model)
# print(base_path + results + data_path)
path = Path(base_path + results + data_path)
if base_model == 'ncep':
filename_pattern = 'ncep_*_v3.csv'
elif base_model == 'era5':
filename_pattern = 'era5_*_v3.csv'
else:
raise ValueError('Unrecognized base model:', base_model)
df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
results_df = pd.concat(df_from_each_file, ignore_index=True)
return results_df
# def get_results_files_mi(base_model: str)-> pd.DataFrame:
# path = Path(data_path)
# if base_model == 'ncep':
# filename_pattern = 'ncep_*_v3.csv'
# elif base_model == 'era5':
# filename_pattern = 'era5_*_v3.csv'
# else:
# raise ValueError('Unrecognized base model:', base_model)
# df_from_each_file = [pd.read_csv(f, index_col=0) for f in path.rglob(filename_pattern)]
# results_df = pd.concat(df_from_each_file, ignore_index=True)
# return results_df
from typing import List
def post_processing_mi(df: pd.DataFrame, info: str='h', exp: str='amip', exclude: List[str]=['inmcm4'], normalized: bool=True)-> pd.DataFrame:
# # subset models
# cmip_models = [
# 'mpi_esm_lr',
# 'noresm1_m',
# # 'inmcm4',
# 'mpi_esm_mr',
# 'access1_0',
# 'ipsl_cm5a_mr'
# ]
df = df[~df['cmip'].isin(exclude)]
# divide by the spatial resolution
if exp == 'amip':
banned_cmip_dates = ['1979-01-16 12:00:00', '2009-01-16 12:00:00', '2010-01-16 12:00:00']
banned_base_dates = ['1979-01-01', '2010-01-01']
elif exp == 'rcp':
banned_cmip_dates = ['2019-01-16 12:00:00']
banned_base_dates = []
else:
raise ValueError('Unrecognized exp:', exp)
if info == 'h':
if normalized:
df['h_base'] = df['h_base'] / (df['spatial'] ** 2)
df['h_cmip'] = df['h_cmip'] / (df['spatial'] ** 2)
else:
pass
elif info == 'mi':
if normalized:
df['mi'] = df['mi'] / (df['spatial'] ** 2)
else:
pass
df = df[~df.cmip_time.isin(banned_cmip_dates)]
df = df[~df.base_time.isin(banned_base_dates)]
return df
data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/interim/amip/local/compare/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/amip/local/compare/"
results_df.cmip.unique().tolist()
NCEP - Comparative IT Measures¶
AMIP¶
results_df.cmip.unique().tolist()
results_df.head()
# extract results
base_model = 'ncep'
exp = 'amip'
results_df = get_mutual_info_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, normalized=normalized)
results_df.cmip[results_df['cmip'] == 'cnrm_cm5'] = 'mpi_esm_mr'
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi', model=exp, save=True)
RCP¶
# extract results
base_model = 'ncep'
exp = 'rcp'
results_df = get_mutual_info_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1',
'access1_3'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, exp=exp, normalized=normalized)
results_df.cmip[results_df['cmip'] == 'cnrm_cm5'] = 'mpi_esm_mr'
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi', model=exp, save=True)
ERA5 - Comparative IT Measures¶
AMIP¶
# extract results
base_model = 'era5'
exp = 'amip'
results_df = get_mutual_info_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, normalized=normalized)
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi', model=exp, save=True)
RCP¶
# extract results
base_model = 'era5'
exp = 'rcp'
results_df = get_mutual_info_results(
base_model,
results='results',
model=exp
)
# post processing
exclude = [
'inmcm4',
'giss_e2_r',
'bnu_esm',
'bcc_csm1_1',
'access1_3'
]
normalized = False
results_df = post_processing_mi(results_df, info='mi', exclude=exclude, normalized=normalized, exp=exp)
for ispatial in [1.0, 2.0, 3.0, 4.0, 5.0]:
plot_individual_all(results_df, base_model, spatial_res=ispatial, info='mi', model=exp, save=True)