Results - Prelim. Spa-Temp Experiment¶
In this notebook, I walk through how we can evaluate different methods on the density cubes that we derive.
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))
import pathlib
FIG_PATH = pathlib.Path(str(here())).joinpath('reports/figures/spa_temp/demos')
# standard python packages
import xarray as xr
import pandas as pd
import numpy as np
#
# Experiment Functions
from src.data.esdc import get_dataset
from src.features import Metrics
from src.features.temporal import select_period, get_smoke_test_time
from src.features.spatial import select_region, get_europe
from src.models.train_models import get_similarity_scores
from src.experiments.utils import dict_product, run_parallel_step
from src.features.density import get_density_cubes
from src.features.preprocessing import standardizer_data, get_reference_cube, get_common_indices
from src.models.similarity import cka_coefficient, rv_coefficient, rbig_it_measures
# # esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
# from esdc.transform import DensityCubes
from typing import List, Dict
import xarray as xr
from tqdm import tqdm
import cartopy
import cartopy.crs as ccrs
# NUMPY SETTINGS
import numpy as onp
onp.set_printoptions(precision=3, suppress=True)
# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")
# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
level=logging.INFO,
stream=sys.stdout,
format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
%load_ext autoreload
%autoreload 2
Results Plot¶
For this first set of results, I'm looking at the following:
- Entropy of the different spatial-temporal density cube compositions.
- MI between the samples with no spatial-temporal features and the density cubes with different spatial-temporal compositions (a short sketch of these compositions follows the hypotheses below).
Hypothesis
- Less mutual information with more features
- I'm not sure about entropy.
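To make the compositions concrete, here is a minimal sketch of how the (spatial, temporal) window pairs and their ratio could be enumerated. It only uses itertools and pandas; the window sizes below are made up for illustration and are not necessarily the ones used in the experiment scripts (which build the grid with helpers like dict_product).
from itertools import product
import pandas as pd

# hypothetical window sizes, for illustration only
spatial_windows = [1, 2, 3, 5, 7]
temporal_windows = [1, 2, 3, 6, 12]

grid = pd.DataFrame(
    list(product(spatial_windows, temporal_windows)),
    columns=["spatial", "temporal"],
)
# the spatial / temporal ratio used on the x-axis of the plots below
grid["ratio"] = grid["spatial"] / grid["temporal"]
grid.head()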
!ls /home/emmanuel/projects/2020_rbig_rs/data/spa_temp/trial_experiment/
RES_PATH = pathlib.Path(str(root)).joinpath(
"data/spa_temp/trial_experiment/v1_reg_gpp_spain.csv"
)
results = pd.read_csv(str(RES_PATH))
# results['ratio'] = results['spatial'] / results['temporal']
variable = 'gpp'
results.tail()
Entropy¶
sub_df = results.copy()
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
# sub_df['dimensions'] = sub_df['spatial'] ** 2 + sub_df['temporal']
# sub_df['nH'] = (sub_df['rbig_H_y'] / sub_df['dimensions']) #* np.log(2)
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
# fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
# sns.scatterplot(x='ratio', y='rf_rmse', hue='n_dimensions', data=sub_df, ax=ax,)
# # sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
# ax.set(xscale='log',
# xlabel='Spatial / Temporal Ratio',
# ylabel='Entropy'
# )
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='rf_rmse', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='RMSE'
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_rf_rmse.png'))
plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='rlr_rmse', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='RMSE'
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_rlr_rmse.png'))
plt.show()
Variation of Information¶
There are a few ways to calculate the variation of information (VI), and I will outline how we can calculate them and their normalized variants.
Method I - Variation of Information¶
Firstly, the VI measure was developed by Meila, 2007 [pg 879, eq 19] and is written as:
$$ VI(X;Y) = H(X) + H(Y) - 2I(X;Y) $$
To get the normalized variant, we can use the measure developed by Meila, 2007 [pg 886, sec 5.1], which is called the Adjusted VI distance. Knowing that the VI measure is upper bounded by $\log N$ (Meila, 2007 [pg 881, eq. 24]), where $N$ is the number of samples, they simply divide the VI measure by $\log N$ to get a measure between 0 and 1. So we get
$$ nVI(X;Y) = \frac{VI(X;Y)}{\log N} $$
Note: In theory, this should be bounded between 0 and 1, but I'm not sure what happens with negative entropy values...
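As a quick sanity check of the two formulas above, here is a tiny sketch with made-up entropy and mutual-information values (not taken from the experiment):
import numpy as np

# toy values, for illustration only
H_x, H_y, I_xy, N = 2.5, 3.0, 1.2, 1000

vi = H_x + H_y - 2 * I_xy        # VI (Meila 2007, eq. 19)
nvi = vi / np.log(N)             # adjusted VI, divided by log N

print(f"VI = {vi:.3f}, normalized VI = {nvi:.3f}")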
!ls /home/emmanuel/projects/2020_rbig_rs/data/spa_temp/trial_experiment/
RES_PATH = pathlib.Path(str(root)).joinpath(
"data/spa_temp/trial_experiment/v5_gpp_spain.csv"
)
results = pd.read_csv(str(RES_PATH))
# results['ratio'] = results['spatial'] / results['temporal']
variable = 'gpp'
results.tail()
def variation_of_info(Hx, Hy, Ixy, bias=True):
    # shift all entropies by the magnitude of the smallest one so that
    # negative (differential) entropies become non-negative
    # note: the `bias` flag is currently unused
    min_H = np.minimum(Hx, Hy).min()
    def add_bias(x):
        return x + np.abs(min_H)
    Hx = add_bias(Hx)
    Hy = add_bias(Hy)
    # normalized mutual information
    Ixy_norm = Ixy / np.sqrt(Hx) / np.sqrt(Hy)
    # variation of information (algebraically Hx + Hy - 2 * Ixy)
    vi_value = Hx + Hy - 2 * np.sqrt(Hx) * np.sqrt(Hy) * Ixy_norm
    return vi_value, Ixy_norm, Hx, Hy
# ==============
# Entropy Bias
# ==============
sub_df = results.copy()
# calculate new values
vi, i_norm, hx_norm, hy_norm = variation_of_info(
sub_df['rbig_H_x'],
sub_df['rbig_H_y'],
sub_df['rbig_I_xy'],
)
sub_df['vi'] = vi
# Plot stuff
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='vi', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='Variation of Information'
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_vi.png'))
plt.show()
RV - RMSE¶
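The helper below computes an RMSE-like distance from the RV coefficient, in the same spirit as the law-of-cosines decomposition behind the Taylor diagram (assuming rv_x_norm and rv_y_norm are norm-like magnitudes of the two datasets):
$$ d_{RV}^2(X,Y) = x_{norm}^2 + y_{norm}^2 - 2\,x_{norm}\,y_{norm}\,\rho_{RV}(X,Y) $$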
def calculate_rv_rmse(x_norm, y_norm, rv_coeff, bias=True):
    # RMSE-like distance: squared magnitudes minus twice the product
    # scaled by the RV coefficient
    rmse_value = x_norm**2 + y_norm**2 - 2 * x_norm * y_norm * rv_coeff
    return rmse_value
sub_df = results.copy()
# calculate new values
rv_rmse = calculate_rv_rmse(
sub_df['rv_x_norm'],
sub_df['rv_y_norm'],
sub_df['rv_coeff'],
)
sub_df['rv_rmse'] = rv_rmse
# Plot stuff
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='rv_rmse', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='log',
xlabel='Spatial / Temporal Ratio',
ylabel='RV RMSE',
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_rv_rmse.png'))
plt.show()
CKA¶
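The CKA distance below is built the same way, except the coefficient is reconstructed from the normalization terms; since coeff = xy_norm / (x_norm · y_norm), the expression simplifies:
$$ d_{CKA}(X,Y) = x_{norm} + y_{norm} - 2\,x_{norm}\,y_{norm}\,\frac{xy_{norm}}{x_{norm}\,y_{norm}} = x_{norm} + y_{norm} - 2\,xy_{norm} $$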
def calculate_cka_dist(x_norm, y_norm, xy_norm, bias=True):
    # reconstruct the CKA coefficient from the normalization terms
    coeff = xy_norm / x_norm / y_norm
    # CKA distance (algebraically x_norm + y_norm - 2 * xy_norm)
    cka_dist_value = x_norm + y_norm - 2 * x_norm * y_norm * coeff
    return cka_dist_value
sub_df = results.copy()
# calculate new values
cka_dist = calculate_cka_dist(
    sub_df['cka_x_norm'],
    sub_df['cka_y_norm'],
    sub_df['cka_xy_norm'],
)
sub_df['cka_dist'] = cka_dist
# Plot stuff
sub_df['ratio'] = sub_df['spatial'] / sub_df['temporal']
sub_df["spatial"] = sub_df["spatial"].astype('category')
sub_df["temporal"] = sub_df["temporal"].astype('category')
sub_df["n_dimensions"] = sub_df["n_dimensions"].astype('category')
# plt.show()
fig, ax = plt.subplots(ncols=1, figsize=(7, 5))
sns.lineplot(x='ratio', y='cka_dist', hue='n_dimensions', data=sub_df, ax=ax,)
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax, label='RV')
ax.set(
xscale='log',
# yscale='symlog',
xlabel='Spatial / Temporal Ratio',
ylabel='CKA RMSE',
)
plt.tight_layout()
fig.savefig(FIG_PATH.joinpath(f'{variable}_cka_rmse.png'))
plt.show()
# get number of samples
n_samples = sub_df['rbig_H_x'].shape[0]
# calculate variation of information
sub_df['vi_joint'] = sub_df['rbig_H_x'] + sub_df['rbig_H_y'] - 2 * sub_df['rbig_I_xy']
sub_df['vi_joint_norm'] = (sub_df['vi_joint']) / np.log(n_samples)
# Plot Unnormalized
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='vi_joint', data=sub_df, hue='n_dimensions', ax=ax[0], label='VI')
sns.scatterplot(x='ratio', y='vi_joint', data=sub_df, hue='n_dimensions', ax=ax[1], label='VI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[0].set_yscale('symlog')
ax[0].set_xscale('log')
ax[0].set_ylabel('VI (Meila 2007)')
ax[0].set_xlabel('Spatial / Temporal Ratio')
ax[1].set_yscale('symlog')
ax[1].set_xscale('log')
ax[1].set_ylabel('')
ax[1].set_xlabel('Spatial / Temporal Ratio')
# Plot Normalized
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='vi_joint_norm', data=sub_df, hue='n_dimensions', ax=ax[0], label='nVI')
sns.scatterplot(x='ratio', y='vi_joint_norm', data=sub_df, hue='n_dimensions', ax=ax[1], label='nVI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[0].set_yscale('symlog')
ax[0].set_xscale('log')
ax[0].set_ylabel('Normalized VI (Meila 2007)')
ax[0].set_xlabel('Spatial / Temporal Ratio')
ax[1].set_yscale('symlog')
ax[1].set_xscale('log')
ax[1].set_ylabel('')
ax[1].set_xlabel('Spatial / Temporal Ratio')
Method II - Variation of Information (Joint)¶
This is analogous to the measure above, but the normalization is different. I found this one in this paper [pg. 2842, table 3].
Note: Below I show how $1 - \frac{I(X;Y)}{H(X,Y)}$ is equivalent to $\frac{VI(X;Y)}{H(X,Y)}$:
$$ \begin{aligned} 1 - \frac{I(X;Y)}{H(X,Y)} &= 1 - \frac{I(X;Y)}{H(X) + H(Y) - I(X;Y)} \\ &= \frac{H(X) + H(Y) - I(X;Y)}{H(X) + H(Y) - I(X;Y)} - \frac{I(X;Y)}{H(X) + H(Y) - I(X;Y)} \\ &= \frac{H(X) + H(Y) - 2I(X;Y)}{H(X) + H(Y) - I(X;Y)} \\ &= \frac{VI(X;Y)}{H(X,Y)} \end{aligned} $$
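A quick numeric check of this identity with made-up values (not from the experiment):
import numpy as np

# toy values, for illustration only
H_x, H_y, I_xy = 2.5, 3.0, 1.2

H_xy = H_x + H_y - I_xy            # joint entropy from the chain rule
vi = H_x + H_y - 2 * I_xy          # variation of information

print(1 - I_xy / H_xy, vi / H_xy)  # both give the same value
assert np.isclose(1 - I_xy / H_xy, vi / H_xy)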
# get number of samples
n_samples = sub_df['rbig_H_x'].shape[0]
# calculate variation of information
sub_df['vi'] = sub_df['rbig_H_x'] + sub_df['rbig_H_y'] - 2 * sub_df['rbig_I_xy']
sub_df['H_xy'] = sub_df['rbig_H_x'] + sub_df['rbig_H_y'] - sub_df['rbig_I_xy']
sub_df['nVI_joint'] = sub_df['vi'] / sub_df['H_xy']
# sub_df['d_joint'] = 1 - (sub_df['rbig_I_xy'] / sub_df['H_xy'])
# Plot Unnormalized
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='nVI_joint', data=sub_df, hue='n_dimensions', ax=ax[0], label='nVI (joint)')
sns.scatterplot(x='ratio', y='nVI_joint', data=sub_df, hue='n_dimensions', ax=ax[1], label='nVI (joint)')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[0].set_yscale('log')
# ax[0].set_ylim([0.9, 1.1])
ax[0].set_xscale('log')
ax[0].set_ylabel('nVI (Joint) (Kraskov 2005)')
ax[0].set_xlabel('Spatial / Temporal Ratio')
ax[1].set_yscale('log')
ax[1].set_xscale('log')
ax[1].set_ylabel('')
ax[1].set_xlabel('Spatial / Temporal Ratio')
Furthermore, I didn't estimate the joint entropy H(X,Y) directly with RBIG, so we could be introducing errors through all of the manipulations needed to acquire it.
Method III - Variation of Information (Max)¶
This is the last variation of the VI method found in this paper [pg. 2842, table 3].
Apparently this is the tightest bound on the VI measure. It is also known as the Normalized Information Distance.
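In formula form, the quantity computed below is:
$$ nVI_{max}(X;Y) = 1 - \frac{I(X;Y)}{\max\left(H(X), H(Y)\right)} $$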
sub_df['nVI_max'] = 1 - (sub_df['rbig_I_xy'] / np.maximum(sub_df['rbig_H_x'], sub_df['rbig_H_y']))
fig, ax = plt.subplots(ncols=2, figsize=(10, 5))
sns.scatterplot(x='ratio', y='nVI_max', data=sub_df, hue='n_dimensions', ax=ax[0], label='nVI (max)')
sns.scatterplot(x='ratio', y='nVI_max', data=sub_df, hue='n_dimensions', ax=ax[1], label='nVI (max)')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
# ax[0].set_yscale('log')
# ax[0].set_ylim([0.9, 1.1])
ax[0].set(
xscale='log',
ylabel='nVI (Max) (Kraskov 2005)',
xlabel='Spatial / Temporal Ratio'
)
ax[1].set(
xscale='log',
ylabel='',
xlabel='Spatial / Temporal Ratio'
)
plt.tight_layout()
plt.show()
Later, I will have to figure out how this ties into the Taylor Diagram, but for now, I'm quite happy.
from sklearn.preprocessing import MinMaxScaler
# find the min,max of all entropy values
min_H = sub_df[['rbig_H_x', 'rbig_H_y']].min().min()
max_H = sub_df[['rbig_H_x', 'rbig_H_y']].max().max()
# Scale between 0 and the max value?
sub_df['rbig_H_xs'] = MinMaxScaler((0, max_H)).fit_transform(sub_df['rbig_H_x'].values[:, None])
sub_df['rbig_H_ys'] = MinMaxScaler((0, max_H)).fit_transform(sub_df['rbig_H_y'].values[:, None])
# calculate normalized MI with the scaled versions
sub_df['d_max'] = (sub_df['rbig_I_xy'] / np.sqrt(sub_df['rbig_H_xs']) / np.sqrt(sub_df['rbig_H_ys'])) #* np.log(2)
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(10, 10))
sns.scatterplot(x='ratio', y='d_max', data=sub_df, hue='n_dimensions', ax=ax[0,0], label='nMI')
sns.scatterplot(x='ratio', y='d_max', data=sub_df, hue='n_dimensions', ax=ax[0,1], label='nMI')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
# ax[0,0].set_yscale('log')
ax[0,0].set_xscale('log')
ax[0,0].set_ylabel('Normalized Mutual Information')
ax[0,0].set_xlabel('Spatial / Temporal Ratio')
# ax[0, 1].set_yscale('log')
ax[0, 1].set_xscale('log')
ax[0, 1].set_ylabel('')
ax[0, 1].set_xlabel('Spatial / Temporal Ratio')
sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, hue='n_dimensions', ax=ax[1,0], label='RV')
sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, hue='n_dimensions', ax=ax[1,1], label='RV')
# sns.scatterplot(x='ratio', y='rv_coeff', data=sub_df, ax=ax[1], label='RV')
ax[1,0].set_yscale('log')
ax[1,0].set_xscale('log')
ax[1,0].set_ylabel('Coefficient')
ax[1,0].set_xlabel('Spatial / Temporal Ratio')
ax[1, 1].set_yscale('log')
ax[1, 1].set_xscale('log')
ax[1, 1].set_ylabel('')
ax[1, 1].set_xlabel('Spatial / Temporal Ratio')
plt.show()
So right away, we see that the RV coefficient doesn't fluctuate at all (or at least it is rather random and I don't see a clear pattern).
Mutual Information
MI clearly has a pattern. That's good: it's picking up something about the feature representations. The only problem is that it's difficult to interpret because it isn't bounded between 0 and 1, so I'm not sure how we're supposed to use it in a Taylor Diagram.