Information Theory Measures¶
In this notebook, I will be demonstrating some of the aspects of information theory measures.
Data - Climate Models¶
import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')
import numpy as np
# Data Loaders
from src.data.climate.amip import DataDownloader, DataLoader
from src.data.climate.era5 import get_era5_data
from src.data.climate.ncep import get_ncep_data
from src.features.climate.build_features import (
get_time_overlap, check_time_coords, regrid_2_lower_res, get_spatial_cubes, normalize_data)
from src.experiments.climate.amip_global import (
experiment_loop_comparative,
experiment_loop_individual
)
# Stat Tools
from src.models.information.entropy import RBIGEstimator as RBIGENTEST
from src.models.information.mutual_information import RBIGEstimator as RBIGMIEST
from scipy import stats
import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
%load_ext autoreload
%autoreload 2
# Project data/results/figure locations.
# Fix: the original strings carried an `f` prefix with no placeholders;
# the shared machine-specific root is now defined once so it can be
# changed (or read from config) in a single place.
project_root = "/home/emmanuel/projects/2020_rbig_rs"

amip_data_path = f"{project_root}/data/climate/raw/amip/"
era5_path = f"{project_root}/data/climate/raw/era5/"
ncep_path = f"{project_root}/data/climate/raw/ncep/"
results_path = f"{project_root}/data/climate/results/"
fig_path = f"{project_root}/reports/figures/climate/"
Demo Experiment¶
Experimental Params¶
class DataArgs:
    """Filesystem locations for the AMIP experiment.

    NOTE: absolute, machine-specific paths.
    """

    # Raw AMIP climate-model output.
    data_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
    # Where AMIP experiment results are written.
    results_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip"
class CMIPArgs:
    """Parameter grid for the AMIP/CMIP comparison experiments."""

    # Fixed: side lengths of the spatial window used for density cubes.
    spatial_windows = list(range(1, 11))

    # Free: climate variable(s) under study ("psl" = mean surface pressure).
    variables = ["psl"]

    # Free: CMIP5 model runs to compare against the base datasets.
    cmip_models = [
        "inmcm4",
        "access1_0",
        "bcc_csm1_1",
        "bcc_csm1_1_m",
        "bnu_esm",
        "giss_e2_r",
        "cnrm_cm5",
        "ipsl_cm5a_lr",
        "ipsl_cm5a_mr",
        "ipsl_cm5b_lr",
        "mpi_esm_lr",
        "mpi_esm_mr",
        "noresm1_m",
    ]

    # Free: reference (reanalysis) datasets.
    base_models = ["ncep", "era5"]
Part I - Grab Data¶
# Load the reference ("base") dataset: first base model (ncep) for the
# first variable (psl).
from src.data.climate.amip import get_base_model
base_dat = get_base_model(CMIPArgs.base_models[0], CMIPArgs.variables[0])
# base_dat
# Load the corresponding CMIP5 simulation: first model (inmcm4), same variable.
from src.data.climate.cmip5 import get_cmip5_model
cmip_dat = get_cmip5_model(CMIPArgs.cmip_models[0], CMIPArgs.variables[0])
# cmip_dat
Part II - Regrid Data¶
# Put both datasets on the coarser of the two grids so they share spatial
# resolution (presumably axes 1 and 2 are lat/lon — TODO confirm).
base_dat, cmip_dat = regrid_2_lower_res(base_dat, cmip_dat)
# Sanity check: spatial dimensions must now match.
# NOTE(review): `assert` is stripped under `python -O`; acceptable in a notebook.
assert(base_dat.shape[1] == cmip_dat.shape[1])
assert(base_dat.shape[2] == cmip_dat.shape[2])
# base_dat
Part III - Find Overlapping Times¶
# Show shapes before trimming (notebook cell output).
base_dat.shape, cmip_dat.shape
# Restrict both datasets to their overlapping time period.
base_dat, cmip_dat = get_time_overlap(base_dat, cmip_dat)
Part IV - Get Density Cubes¶
# Build "density cubes" — samples of spatial neighbourhoods — using
# spatial_windows[3] == 4 as the window size.
base_df = get_spatial_cubes(base_dat, CMIPArgs.spatial_windows[3])
cmip_df = get_spatial_cubes(cmip_dat, CMIPArgs.spatial_windows[3])
Information Theory Measures¶
Entropy, H(X)¶
# Entropy H(X) of the base density cubes: 10k subsample, single fit,
# no bootstrap resampling.
subsample = 10_000
batch_size = None  # None => fit on all (subsampled) points at once
bootstrap = False  # single point estimate, no resampling
ent_est = RBIGENTEST(
    batch_size=batch_size,
    bootstrap=bootstrap,
)
ent_est.fit(base_df[:subsample])
h = ent_est.score(base_df[:subsample])
h
with Bootstrap¶
# Bootstrapped entropy: 100 resamples of 10k points each, fit on the
# full base_df.
batch_size = 10_000
bootstrap = True
n_iterations = 100
ent_est = RBIGENTEST(
    batch_size=batch_size,
    bootstrap=bootstrap,
    n_iterations=n_iterations
)
ent_est.fit(base_df)
h = ent_est.score(base_df)
h
# Histogram of the per-bootstrap entropy estimates.
# NOTE(review): the next line is an exact duplicate of the one above —
# looks like a copy-paste leftover; one of the two can be removed.
plt.hist(ent_est.raw_scores)
plt.hist(ent_est.raw_scores)
W. Batches¶
# Entropy with mini-batches: 40k subsample processed in 10k batches.
subsample = 40_000
ent_est = RBIGENTEST(batch_size=10_000)
ent_est.fit(base_df[:subsample])
h = ent_est.score(base_df[:subsample])
h
# Per-batch entropy estimates.
ent_est.raw_scores
Total Correlation, TC(X)¶
# Total correlation TC(X) of the base density cubes, computed in one
# shot (no batching).
subsample = 40_000
tc_est = RBIGMIEST(batch_size=None)
tc_est.fit(base_df[:subsample])
tc = tc_est.score(base_df[:subsample])
tc
w. Batches¶
# Total correlation with 10k mini-batches.
subsample = 40_000
tc_est = RBIGMIEST(batch_size=10_000)
tc_est.fit(base_df[:subsample])
tc = tc_est.score(base_df[:subsample])
tc
# Per-batch TC estimates.
tc_est.raw_scores
Mutual Information, MI(X,Y)¶
# Mutual information between the base and CMIP density cubes (no batching).
subsample = 100_000
mi_est = RBIGMIEST(batch_size=None)
mi_est.fit(
    base_df[:subsample],
    cmip_df[:subsample]
)
# NOTE(review): score() receives only base_df although fit() saw both
# datasets — confirm against RBIGEstimator.score whether this is intended.
mi = mi_est.score(base_df[:subsample])
mi
w. Batches¶
# Mutual information with 50k mini-batches.
subsample = 100_000
mi_est = RBIGMIEST(batch_size=50_000)
mi_est.fit(
    base_df[:subsample],
    cmip_df[:subsample]
)
mi = mi_est.score(base_df[:subsample])
mi
# Per-batch MI values.
# NOTE(review): attribute is `raw_values` here but `raw_scores` on the
# entropy/TC estimators above — verify which name the MI estimator exposes.
mi_est.raw_values
Mutual Information II, H(X) + H(Y) - H(X,Y)¶
# Estimate MI via the identity MI(X,Y) = H(X) + H(Y) - H(X,Y).
# NOTE(review): earlier cells index base_df directly (base_df[:subsample])
# while this section uses base_df.values — confirm whether base_df is a
# DataFrame or an ndarray; only one of the two accessors can be right.
subsample = 100_000
batch_size = 25_000
# H(X)
print('H(X)')
x_ent_est = RBIGENTEST(batch_size=batch_size)
x_ent_est.fit(base_df.values[:subsample])
h_x = x_ent_est.score(base_df.values[:subsample])
# H(Y)
print('H(Y)')
y_ent_est = RBIGENTEST(batch_size=batch_size)
y_ent_est.fit(cmip_df.values[:subsample])
h_y = y_ent_est.score(cmip_df.values[:subsample])
# H(X,Y)
print('H(X,Y)')
xy_ent_est = RBIGENTEST(batch_size=50_000)
xy_ent_est.fit(
    np.hstack(
        (
            base_df.values[:subsample],
            cmip_df.values[:subsample]
        )
    ),
)
# NOTE(review): the estimator was fit on the stacked (X,Y) array but is
# scored on base_df alone — presumably score() should also receive the
# stacked array; verify against RBIGEstimator.score.
h_xy = xy_ent_est.score(base_df.values[:subsample])
# H(X,Y)
# NOTE(review): this second H(X,Y) cell is an exact duplicate of the one
# above; it simply recomputes and overwrites h_xy.
print('H(X,Y)')
xy_ent_est = RBIGENTEST(batch_size=50_000)
xy_ent_est.fit(
    np.hstack(
        (
            base_df.values[:subsample],
            cmip_df.values[:subsample]
        )
    ),
)
h_xy = xy_ent_est.score(base_df.values[:subsample])
h_xy
# MI via the entropy decomposition.
h_x, h_y, h_xy, h_x + h_y - h_xy
Correlation: Pearson, Spearman, KendallTau¶
# Classical correlation baselines between the flattened cube samples.
# NOTE(review): .ravel() is used here while the previous section needed
# .values — confirm the type of base_df/cmip_df supports both.
pear = stats.pearsonr(
    base_df[:subsample].ravel(),
    cmip_df[:subsample].ravel(),
)
spear = stats.spearmanr(
    base_df[:subsample].ravel(),
    cmip_df[:subsample].ravel(),
)
kend = stats.kendalltau(
    base_df[:subsample].ravel(),
    cmip_df[:subsample].ravel(),
)
# Coefficients only (index 0); index 1 of each result is the p-value.
pear[0], spear[0], kend[0]