Information Theory Measures¶

In this notebook, I will be demonstrating some of the aspects of information theory measures.

Data - Climate Models¶

import os, sys
cwd = os.getcwd()
source_path = f"{cwd}/../../../"
sys.path.insert(0, f'{source_path}')

import numpy as np

# Data Loaders
from src.data.climate.amip import DataDownloader, DataLoader

from src.data.climate.era5 import get_era5_data
from src.data.climate.ncep import get_ncep_data
from src.features.climate.build_features import (
    get_time_overlap, check_time_coords, regrid_2_lower_res, get_spatial_cubes, normalize_data)

from src.experiments.climate.amip_global import (
    experiment_loop_comparative, 
    experiment_loop_individual
)
# Stat Tools
from src.models.information.entropy import RBIGEstimator as RBIGENTEST
from src.models.information.mutual_information import RBIGEstimator as RBIGMIEST

from scipy import stats

import pandas as pd
import xarray as xr
from tqdm import tqdm
from sklearn import preprocessing

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

%load_ext autoreload
%autoreload 2

amip_data_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
era5_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/era5/"
ncep_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/ncep/"
results_path = f"/home/emmanuel/projects/2020_rbig_rs/data/climate/results/"
fig_path = f"/home/emmanuel/projects/2020_rbig_rs/reports/figures/climate/"

Demo Experiment¶

Experimental Paams¶

class DataArgs:
    data_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/raw/amip/"
    results_path = "/home/emmanuel/projects/2020_rbig_rs/data/climate/results/amip"

class CMIPArgs:


    # Fixed Params
    spatial_windows = [
        1, 2,                # Spatial Window for Density Cubes
        3,4,5,6,7,8,9,10
    ]

    # Free Params
    variables = [
        'psl'               # Mean Surface Pressure
    ]
    cmip_models = [
        "inmcm4",
        "access1_0",
        "bcc_csm1_1",
        "bcc_csm1_1_m",
        "bnu_esm",
        "giss_e2_r",
        "cnrm_cm5",
        "ipsl_cm5a_lr",
        "ipsl_cm5a_mr",
        "ipsl_cm5b_lr",
        "mpi_esm_lr",
        "mpi_esm_mr",
        "noresm1_m",
    ]

    base_models = [
        'ncep',
        "era5"
    ]

Part I - Grab Data¶

from src.data.climate.amip import get_base_model

base_dat = get_base_model(CMIPArgs.base_models[0], CMIPArgs.variables[0])
# base_dat

from src.data.climate.cmip5 import get_cmip5_model


cmip_dat = get_cmip5_model(CMIPArgs.cmip_models[0], CMIPArgs.variables[0])
# cmip_dat

Part II - Regrid Data¶

base_dat, cmip_dat = regrid_2_lower_res(base_dat, cmip_dat)

assert(base_dat.shape[1] == cmip_dat.shape[1])
assert(base_dat.shape[2] == cmip_dat.shape[2])
# base_dat

Create weight file: nearest_s2d_120x180_73x144.nc
Remove file nearest_s2d_120x180_73x144.nc

Part III - Find Overlapping Times¶

base_dat.shape, cmip_dat.shape

((489, 73, 144), (360, 73, 144))

base_dat, cmip_dat = get_time_overlap(base_dat, cmip_dat)

Part IV - Get Density Cubes¶

base_df = get_spatial_cubes(base_dat, CMIPArgs.spatial_windows[3])
cmip_df = get_spatial_cubes(cmip_dat, CMIPArgs.spatial_windows[3])

base_df.shape

(3543330, 16)

Normalize¶

base_df = normalize_data(base_df)
cmip_df = normalize_data(cmip_df)

Information Theory Measures¶

Entropy, H( $X$ )¶

subsample = 10_000
batch_size = None
bootstrap = False
ent_est = RBIGENTEST(
    batch_size=batch_size,
    bootstrap=bootstrap,
)

ent_est.fit(base_df[:subsample])

h = ent_est.score(base_df[:subsample])

h

-33.116467738349236

with Bootstrap¶

batch_size = 10_000
bootstrap = True
n_iterations = 100

ent_est = RBIGENTEST(
    batch_size=batch_size,
    bootstrap=bootstrap,
    n_iterations=n_iterations
)

ent_est.fit(base_df)

h = ent_est.score(base_df)

h

-31.881844520814997

plt.hist(ent_est.raw_scores)

-28.48503649185888

plt.hist(ent_est.raw_scores)

(array([ 1.,  0.,  1.,  0.,  9., 11., 26., 28., 14., 10.]),
 array([-30.9162801 , -30.56606644, -30.21585278, -29.86563913,
        -29.51542547, -29.16521182, -28.81499816, -28.46478451,
        -28.11457085, -27.76435719, -27.41414354]),
 <a list of 10 Patch objects>)

W. Batches¶

subsample = 40_000

ent_est = RBIGENTEST(batch_size=10_000)

ent_est.fit(base_df[:subsample])

h = ent_est.score(base_df[:subsample])

h

-31.84759524855099

ent_est.raw_scores

[-32.17903374504498,
 -31.753140917432507,
 -31.67399995592763,
 -31.784206375798846]

Total Correlation, TC( $X$ )¶

subsample = 40_000

tc_est = RBIGMIEST(batch_size=None)

tc_est.fit(base_df[:subsample])

tc = tc_est.score(base_df[:subsample])

tc

51.735384060195784

w. Batches¶

subsample = 40_000

tc_est = RBIGMIEST(batch_size=10_000)

tc_est.fit(base_df[:subsample])

tc = tc_est.score(base_df[:subsample])

tc

50.6219155716329

tc_est.raw_scores

[50.29844313632438, 51.03391865505402, 50.72933249988033, 50.425967995272856]

Mutual Information, MI( $X$ )¶

subsample = 100_000

mi_est = RBIGMIEST(batch_size=None)

mi_est.fit(
    base_df[:subsample],
    cmip_df[:subsample]
)

mi = mi_est.score(base_df[:subsample])

mi

1.2438747143982896

w. Batches¶

subsample = 100_000

mi_est = RBIGMIEST(batch_size=50_000)

mi_est.fit(
    base_df[:subsample],
    cmip_df[:subsample]
)

mi = mi_est.score(base_df[:subsample])

mi

1.215228412628969

mi_est.raw_values

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-84-f0a0474b33b1> in <module>
----> 1 mi_est.raw_values

AttributeError: 'RBIGEstimator' object has no attribute 'raw_values'

Mutual Information II, H(X) + H(Y) - H(X,Y)¶

subsample = 100_000
batch_size = 25_000

# H(X)
print('H(X)')
x_ent_est = RBIGENTEST(batch_size=batch_size)

x_ent_est.fit(base_df.values[:subsample])

h_x = x_ent_est.score(base_df.values[:subsample])

# H(Y)
print('H(Y)')
y_ent_est = RBIGENTEST(batch_size=batch_size)

y_ent_est.fit(cmip_df.values[:subsample])

h_y = y_ent_est.score(cmip_df.values[:subsample])

# H(X,Y)
print('H(X,Y)')
xy_ent_est = RBIGENTEST(batch_size=50_000)

xy_ent_est.fit(
    np.hstack(
        (
            base_df.values[:subsample],
            cmip_df.values[:subsample]
        )
    ),

)

h_xy = xy_ent_est.score(base_df.values[:subsample])

H(X)
H(Y)
H(X,Y)

# H(X,Y)
print('H(X,Y)')
xy_ent_est = RBIGENTEST(batch_size=50_000)

xy_ent_est.fit(
    np.hstack(
        (
            base_df.values[:subsample],
            cmip_df.values[:subsample]
        )
    ),

)

h_xy = xy_ent_est.score(base_df.values[:subsample])
h_xy

H(X,Y)

165.23978712025018

h_x, h_y, h_xy, h_x + h_y - h_xy

(79.10616714936484, 87.19046271977632, 165.45410606367204, 0.8425238054691135)

0.4360788203771051

Correlation: Pearson, Spearman, KendallTau¶

pear = stats.pearsonr(
    base_df[:subsample].ravel(),
    cmip_df[:subsample].ravel(),
)

spear = stats.spearmanr(
    base_df[:subsample].ravel(),
    cmip_df[:subsample].ravel(),
)

kend = stats.kendalltau(
    base_df[:subsample].ravel(),
    cmip_df[:subsample].ravel(),
)

pear[0], spear[0], kend[0]

Information Theory Measures¶

Data - Climate Models¶

Demo Experiment¶

Experimental Paams¶

Part I - Grab Data¶

Part II - Regrid Data¶

Part III - Find Overlapping Times¶

Part IV - Get Density Cubes¶

Normalize¶

Information Theory Measures¶

Entropy, H(XX)¶

with Bootstrap¶

W. Batches¶

Total Correlation, TC(XX)¶

w. Batches¶

Mutual Information, MI(XX)¶

w. Batches¶

Mutual Information II, H(X) + H(Y) - H(X,Y)¶

Correlation: Pearson, Spearman, KendallTau¶

Entropy, H( $X$ )¶

Total Correlation, TC( $X$ )¶

Mutual Information, MI( $X$ )¶