Results - Prelim. Spa-Temp Experiment¶
In this notebook, I walk through how we can evaluate different similarity and information measures on the density cubes that we derive.
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))
import pathlib
FIG_PATH = pathlib.Path(str(here())).joinpath('reports/figures/spa_temp/demos')
# standard python packages
import xarray as xr
import pandas as pd
import numpy as np
#
# Experiment Functions
from src.data.esdc import get_dataset
from src.features import Metrics
from src.features.temporal import select_period, get_smoke_test_time
from src.features.spatial import select_region, get_europe
from src.models.train_models import get_similarity_scores
from src.experiments.utils import dict_product, run_parallel_step
from src.features.density import get_density_cubes
from src.features.preprocessing import standardizer_data, get_reference_cube, get_common_indices
from src.models.similarity import cka_coefficient, rv_coefficient, rbig_it_measures
# # esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
# from esdc.transform import DensityCubes
from typing import List, Dict
import xarray as xr
from tqdm import tqdm
import cartopy
import cartopy.crs as ccrs
# NUMPY SETTINGS
import numpy as onp
onp.set_printoptions(precision=3, suppress=True)
# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")
# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
level=logging.INFO,
stream=sys.stdout,
format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
%load_ext autoreload
%autoreload 2
Results Plot¶
For this first set of results, I'm looking at the following:
- Entropy of the different spatial-temporal density cube compositions (a sketch of how these compositions are formed is included below).
- MI between the samples with no spatial-temporal features and the density cubes with different spatial-temporal compositions.
Hypothesis
- Less mutual information with more features.
- I'm not sure what to expect for the entropy.
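To make the compositions concrete, here is a minimal, self-contained sketch of how I think of a spatial-temporal density cube: for each pixel and time step, a spatial neighbourhood of size spatial x spatial and a temporal window of length temporal get stacked into one feature vector. The helper below is purely illustrative (it is not the get_density_cubes implementation from src.features.density), and the exact dimensionality bookkeeping is an assumption on my part.
def toy_density_cube(data: np.ndarray, spatial: int = 3, temporal: int = 2) -> np.ndarray:
    """Illustrative only: stack a (spatial x spatial) neighbourhood and
    temporal time steps into one feature vector per sample.
    NOT the project's get_density_cubes; just a toy for intuition."""
    n_time, n_lat, n_lon = data.shape
    r = spatial // 2
    samples = []
    for t in range(temporal - 1, n_time):
        for i in range(r, n_lat - r):
            for j in range(r, n_lon - r):
                patch = data[t - temporal + 1 : t + 1, i - r : i + r + 1, j - r : j + r + 1]
                samples.append(patch.ravel())
    return np.asarray(samples)
# e.g. a 3x3 neighbourhood over 2 time steps -> 3 * 3 * 2 = 18 features per sample
toy = toy_density_cube(np.random.randn(10, 20, 20), spatial=3, temporal=2)
print(toy.shape)  # (2916, 18)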
!ls /home/emmanuel/projects/2020_rbig_rs/data/spa_temp/trial_experiment/
RES_PATH = pathlib.Path(str(root)).joinpath(
"data/spa_temp/trial_experiment/v1_gpp_spain.csv"
)
results = pd.read_csv(str(RES_PATH))
# results['ratio'] = results['spatial'] / results['temporal']
variable = 'gpp'
results.tail()
Entropy¶
sub_df = results.copy()
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
# sub_df['dimensions'] = sub_df['spatial'] ** 2 + sub_df['temporal']
# sub_df['nH'] = (sub_df['rbig_H_y'] / sub_df['dimensions']) #* np.log(2)
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.scatterplot(x='ratio', y='rbig_H_y', hue='n_dimensions', data=sub_df, ax=ax, label='Entropy',)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(xscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='Entropy'
)
plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='rbig_H_y', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='Entropy'
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_entropy.png'))
plt.show()
Normalized¶
We can normalize the entropy by adding a bias term, $\tilde{H} = H + |\min H|$, which shifts all of the entropy values to be non-negative before plotting:
sub_df = results.copy()
# find the min,max values
min_H = sub_df[['rbig_H_y', 'rbig_H_x']].min().min()
max_H = sub_df[['rbig_H_y', 'rbig_H_x']].max().max()
# Shift by |min H| so that all entropy values are non-negative
def logbias(x):
    return x + np.abs(min_H)
print(min_H, max_H)
print(sub_df['rbig_H_y'].min(), sub_df['rbig_H_y'].max())
sub_df['H_y_bias'] = sub_df['rbig_H_y'].apply(logbias) # /sub_df["n_dimensions"]
sub_df['H_x_bias'] = sub_df['rbig_H_x'].apply(logbias)
print(sub_df['H_y_bias'].min(), sub_df['H_y_bias'].max())
# Plot stuff
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='H_y_bias', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='symlog',
xlabel='Spatial / Temporal Ratio',
ylabel='Entropy'
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_n_entropy.png'))
plt.show()
Mutual Information¶
sub_df = results.copy()
# Plot stuff
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
# sub_df['rbig_I_xy'] = sub_df['rbig_I_xy']# * np.log(2)
# # Figure
# fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
# sns.scatterplot(x='ratio', y='rbig_I_xy', hue='n_dimensions', data=sub_df, ax=ax, label='Entropy',)
# # sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
# ax.set(xscale='log',
# xlabel='Spatial / Temporal Ratio',
# ylabel='Mutual Information'
# )
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='rbig_I_xy', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='symlog',
xlabel='Spatial / Temporal Ratio',
ylabel='Mutual Information',
# ylim=[-0.05, 0.6]
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_mi.png'))
plt.show()
Normalized Mutual Information¶
$$ \text{nI}(\mathbf{X,Y}) = \frac{I(\mathbf{X,Y})}{\sqrt{H(\mathbf{X})H(\mathbf{Y})}} $$
# ==============
# Entropy Bias
# ==============
sub_df = results.copy()
# find the min,max values
min_H = sub_df[['rbig_H_y', 'rbig_H_x']].min().min()
max_H = sub_df[['rbig_H_y', 'rbig_H_x']].max().max()
# Shift by |min H| so that all entropy values are non-negative
def logbias(x):
    return x + np.abs(min_H)
sub_df['H_y_bias'] = sub_df['rbig_H_y'].apply(logbias) # /sub_df["n_dimensions"]
sub_df['H_x_bias'] = sub_df['rbig_H_x'].apply(logbias)
# ==============
# Normalized MI
# ==============
sub_df['I_norm'] = sub_df['rbig_I_xy'] / np.sqrt(sub_df['H_x_bias']) / np.sqrt(sub_df['H_y_bias'])
# Plot stuff
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='I_norm', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='Mutual Information'
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_nmi.png'))
plt.show()
RV Coefficient¶
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='rv_coef', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='RV Coefficient',
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_rv.png'))
plt.show()
CKA¶
# Plot stuff
sub_df = results.copy()
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
sub_df["cka_xy_norm"] *= (1 / (10_000**2))
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='cka_coeff', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='cKernel Alignment',
# ylim=[0.0, 0.0015]
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_cka.png'))
plt.show()
Variation of Information¶
There are a few ways to calculate VI and I will outline how we can calculate them and their normalized variants.
Method I - Variation of Information¶
Firstly, the VI measure was developed by Meila, 2007 [pg 879, eq 19] and is written as:
$$ \text{VI}(\mathbf{X,Y}) = H(\mathbf{X}) + H(\mathbf{Y}) - 2I(\mathbf{X,Y}) $$
To get the normalized variant, we can use the measure developed by Meila, 2007 [pg 886, sec 5.1], which is called the Adjusted VI distance. Knowing that the VI measure is upper bounded by $\log N$ (Meila, 2007 [pg 881, eq. 24]), they simply divide the VI measure by $\log N$ to get a measure between 0 and 1. So we get
$$ \text{nVI}(\mathbf{X,Y}) = \frac{\text{VI}(\mathbf{X,Y})}{\log N} $$
Note: in theory, this should be bounded between 0 and 1, but I'm not sure what happens with negative values...
# get number of samples
n_samples = sub_df['rbig_H_x'].shape[0]
# calculate variation of information
sub_df['vi_joint'] = sub_df['rbig_H_x'] + sub_df['rbig_H_y'] - 2 * sub_df['rbig_I_xy']
sub_df['vi_joint_norm'] = (sub_df['vi_joint']) / np.log(n_samples)
# Plot Unnormalized
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='vi_joint', data=sub_df, hue='n_dimensions', ax=ax[0], label='VI')
sns.scatterplot(x='ratio', y='vi_joint', data=sub_df, hue='n_dimensions', ax=ax[1], label='VI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[0].set_yscale('symlog')
ax[0].set_xscale('log')
ax[0].set_ylabel('VI (Meila 2007)')
ax[0].set_xlabel('Spatial / Temporal Ratio')
ax[1].set_yscale('symlog')
ax[1].set_xscale('log')
ax[1].set_ylabel('')
ax[1].set_xlabel('Spatial / Temporal Ratio')
# Plot Normalized
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='vi_joint_norm', data=sub_df, hue='n_dimensions', ax=ax[0], label='nVI')
sns.scatterplot(x='ratio', y='vi_joint_norm', data=sub_df, hue='n_dimensions', ax=ax[1], label='nVI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[0].set_yscale('symlog')
ax[0].set_xscale('log')
ax[0].set_ylabel('Normalized VI (Meila 2007)')
ax[0].set_xlabel('Spatial / Temporal Ratio')
ax[1].set_yscale('symlog')
ax[1].set_xscale('log')
ax[1].set_ylabel('')
ax[1].set_xlabel('Spatial / Temporal Ratio')
Method II - Variation of Information (Joint)¶
This is analogous to the measure above, but the normalization is done differently. I found this one in this paper [pg. 2842, table 3].
Note: Below I show how the quantity on the right is equivalent to the middle term:
$$ \begin{aligned} 1 - \frac{I(X;Y)}{H(X,Y)} &= 1 - \frac{I(X;Y)}{H(X) + H(Y) - I(X;Y)} \\ &= \frac{H(X) + H(Y) - I(X;Y)}{H(X) + H(Y) - I(X;Y)} - \frac{I(X;Y)}{H(X) + H(Y) - I(X;Y)} \\ &= \frac{H(X) + H(Y) - 2I(X;Y)}{H(X) + H(Y) - I(X;Y)} \\ &= \frac{VI(X;Y)}{H(X,Y)} \end{aligned} $$
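As a quick numerical sanity check of the identity above (the numbers are arbitrary and purely illustrative):
# verify that 1 - I/H(X,Y) equals VI/H(X,Y) for some arbitrary valid values
H_x, H_y, I_xy = 3.2, 2.7, 1.1
H_xy = H_x + H_y - I_xy       # joint entropy, H(X,Y) = H(X) + H(Y) - I(X;Y)
vi = H_x + H_y - 2 * I_xy     # variation of information
print(1 - I_xy / H_xy, vi / H_xy)  # both ~= 0.7708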
# get number of samples
n_samples = sub_df['rbig_H_x'].shape[0]
# calculate variation of information
sub_df['vi'] = sub_df['rbig_H_x'] + sub_df['rbig_H_y'] - 2 * sub_df['rbig_I_xy']
sub_df['H_xy'] = sub_df['rbig_H_x'] + sub_df['rbig_H_y'] - sub_df['rbig_I_xy']
sub_df['nVI_joint'] = sub_df['vi'] / sub_df['H_xy']
# sub_df['d_joint'] = 1 - (sub_df['rbig_I_xy'] / sub_df['H_xy'])
# Plot Unnormalized
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='nVI_joint', data=sub_df, hue='n_dimensions', ax=ax[0], label='nVI')
sns.scatterplot(x='ratio', y='nVI_joint', data=sub_df, hue='n_dimensions', ax=ax[1], label='nVI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[0].set_yscale('log')
# ax[0].set_ylim([0.9, 1.1])
ax[0].set_xscale('log')
ax[0].set_ylabel('nVI (Joint) (Kraskov 2005)')
ax[0].set_xlabel('Spatial / Temporal Ratio')
ax[1].set_yscale('log')
ax[1].set_xscale('log')
ax[1].set_ylabel('')
ax[1].set_xlabel('Spatial / Temporal Ratio')
Furthermore, I didn't estimate the joint entropy $H(\mathbf{X,Y})$ directly with RBIG, so we could be introducing errors through the manipulations used to recover the joint entropy (a direct estimate is sketched below).
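One sanity check would be to estimate the joint entropy directly on the concatenated features instead of recovering it from $H(\mathbf{X}) + H(\mathbf{Y}) - I(\mathbf{X,Y})$. The sketch below is only illustrative: it uses a stand-in Gaussian entropy estimator (an assumption on my part), since the exact RBIG entropy API behind rbig_it_measures isn't exposed in this notebook.
def gaussian_entropy(data):
    """Stand-in joint entropy estimator under a Gaussian assumption (in nats).
    NOT RBIG; only here to illustrate the direct route."""
    d = data.shape[1]
    cov = np.atleast_2d(np.cov(data, rowvar=False))
    _, logdet = np.linalg.slogdet(cov)
    return 0.5 * (d * np.log(2 * np.pi * np.e) + logdet)

rng = np.random.RandomState(123)
X = rng.randn(1_000, 3)
Y = X @ rng.randn(3, 3) + 0.1 * rng.randn(1_000, 3)

# direct route: fit one estimator on the stacked features
H_xy_direct = gaussian_entropy(np.hstack([X, Y]))
# indirect route used in this notebook: H(X) + H(Y) - I(X,Y), where each term
# comes from a separate (RBIG) fit and carries its own estimation error
print(H_xy_direct)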
Method III - Variation of Information (Max)¶
This is the last variation of the VI method found in this paper [pg. 2842, table 3]:
$$ \text{nVI}_{\max}(\mathbf{X,Y}) = 1 - \frac{I(\mathbf{X,Y})}{\max\left\{H(\mathbf{X}), H(\mathbf{Y})\right\}} $$
Apparently this is the tightest bound on the VI measure. It also goes by the name Normalized Information Distance.
sub_df['nVI_max'] = 1 - (sub_df['rbig_I_xy'] / np.maximum(sub_df['rbig_H_x'], sub_df['rbig_H_y']))
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='nVI_max', data=sub_df, hue='n_dimensions', ax=ax[0], label='nVI')
sns.scatterplot(x='ratio', y='nVI_max', data=sub_df, hue='n_dimensions', ax=ax[1], label='nVI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
# ax[0].set_yscale('log')
# ax[0].set_ylim([0.9, 1.1])
ax[0].set(
xscale='log',
ylabel='nVI (Max) (Kraskov 2005)',
xlabel='Spatial / Temporal Ratio'
)
ax[1].set(
xscale='log',
ylabel='',
xlabel='Spatial / Temporal Ratio'
)
plt.tight_layout()
plt.show()
Later, I will have to figure out how this ties into the Taylor Diagram, but for now I'm quite happy.
from sklearn.preprocessing import MinMaxScaler
# find the min,max of all entropy values
min_H = sub_df[['rbig_H_x', 'rbig_H_y']].min().min()
max_H = sub_df[['rbig_H_x', 'rbig_H_y']].max().max()
# Scale between 0 and the max value?
sub_df['rbig_H_xs'] = MinMaxScaler((0, max_H)).fit_transform(sub_df['rbig_H_x'].values[:, None])
sub_df['rbig_H_ys'] = MinMaxScaler((0, max_H)).fit_transform(sub_df['rbig_H_y'].values[:, None])
# calculate normalized MI with the scaled versions
sub_df['d_max'] = (sub_df['rbig_I_xy'] / np.sqrt(sub_df['rbig_H_xs']) / np.sqrt(sub_df['rbig_H_ys'])) #* np.log(2)
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(10, 10))
sns.scatterplot(x='ratio', y='d_max', data=sub_df, hue='n_dimensions', ax=ax[0,0], label='nMI')
sns.scatterplot(x='ratio', y='d_max', data=sub_df, hue='n_dimensions', ax=ax[0,1], label='nMI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
# ax[0,0].set_yscale('log')
ax[0,0].set_xscale('log')
ax[0,0].set_ylabel('Normalized Mutual Information')
ax[0,0].set_xlabel('Spatial / Temporal Ratio')
# ax[0, 1].set_yscale('log')
ax[0, 1].set_xscale('log')
ax[0, 1].set_ylabel('')
ax[0, 1].set_xlabel('Spatial / Temporal Ratio')
sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, hue='n_dimensions', ax=ax[1,0], label='RV')
sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, hue='n_dimensions', ax=ax[1,1], label='RV')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[1,0].set_yscale('log')
ax[1,0].set_xscale('log')
ax[1,0].set_ylabel('RV Coefficient')
ax[1,0].set_xlabel('Spatial / Temporal Ratio')
ax[1, 1].set_yscale('log')
ax[1, 1].set_xscale('log')
ax[1, 1].set_ylabel('')
ax[1, 1].set_xlabel('Spatial / Temporal Ratio')
plt.show()
So right away, we see that the RV coefficient doesn't fluctuate at all (or at least it is rather random and I don't see a clear pattern).
Mutual Information
MI clearly has a pattern, which is good: it's picking up something about the feature representations. The only problem is that it's difficult to interpret because it isn't bounded between 0 and 1, so I'm not sure how we're supposed to use it in a Taylor Diagram (one option is sketched below).
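One option (my own assumption, not something used elsewhere in this project) is to squash MI onto $[0, 1)$ with the informational coefficient of correlation, $r = \sqrt{1 - e^{-2I}}$, which reduces to the Pearson correlation for jointly Gaussian variables and could then sit on a Taylor-style diagram:
# Hypothetical post-processing: squash MI onto [0, 1) via the informational
# coefficient of correlation, r = sqrt(1 - exp(-2 I)), assuming I is in nats.
I_xy = np.clip(sub_df['rbig_I_xy'].values, 0.0, None)  # guard against small negative estimates
sub_df['info_corr'] = np.sqrt(1.0 - np.exp(-2.0 * I_xy))

fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='info_corr', hue='n_dimensions', data=sub_df, ax=ax)
ax.set(
    xscale='log',
    xlabel='Spatial / Temporal Ratio',
    ylabel='Informational Correlation',
)
plt.tight_layout()
plt.show()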