Code Review V - Fitting MI¶

In this code review, we're going to be reviewing how we can try to fit a curve for the relationship between mutual information and the centered kernel alignment (CKA) scorer.

Code Preamble¶

# toy datasets
import sys
from pyprojroot import here
sys.path.insert(0, str(here()))


import warnings
from typing import Optional, Tuple
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state

# toy datasets
from src.data.distribution import DataParams, Inputs

# Kernel Dependency measure
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import RBF
from src.models.dependence import HSICModel

# RBIG IT measures
from src.features.utils import df_query, subset_dataframe
# Plotting
from src.visualization.distribution import plot_scorer, plot_score_vs_mi

# experiment helpers
from src.experiments.utils import dict_product, run_parallel_step
from tqdm import tqdm

# Plotting Procedures
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
# restore the default rc parameters before applying the notebook's plot style
sns.reset_defaults()
# sns.set_style('whitegrid')
#sns.set_context('talk')
# 'poster' context with the font elements scaled by 0.7 and a sans-serif family
sns.set_context(context='poster',font_scale=0.7, rc={'font.family': 'sans-serif'})
# sns.set(font='sans-serif')
%matplotlib inline

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Query Data¶

# folder (relative to the project root) holding the mutual information results
DATA_PATH = "data/results/distributions/mutual_info/"

# Stack the two result tables row-wise (axis=0). The queries below filter on a
# single 'dataset' column, so the rows of both files have to live in one table;
# axis=1 would instead glue the files side by side and duplicate every column.
results_df = pd.concat([
    pd.read_csv(here() / f"{DATA_PATH}v5_gauss.csv"),
    pd.read_csv(here() / f"{DATA_PATH}v5_tstudent.csv")
], axis=0)

# drop every column whose name starts with 'Unnamed'
results_df = results_df.loc[:, ~results_df.columns.str.match('Unnamed')]

# replace missing values by the string 'None' (requires object columns)
results_df = results_df.astype(object).replace(np.nan, 'None')

/home/emmanuel/.conda/envs/hsic_align/lib/python3.8/site-packages/IPython/core/interactiveshell.py:3062: DtypeWarning: Columns (12,13) have mixed types.Specify dtype option on import or set low_memory=False.
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,

Gaussian Distribution¶

# values to keep for each column of the results table
dataset_methods = ['gauss']           # dataset
sigma_methods = ['median']            # sigma_method
sigma_percents = [40., 50., 60.]      # sigma_percent
dimension_query = [False]             # per_dimension
scorer_query = ['cka']                # scorer

# one query per (column, accepted values) pair, built in the order listed
queries = [
    df_query(column, accepted)
    for column, accepted in (
        ('dataset', dataset_methods),
        ('sigma_method', sigma_methods),
        ('sigma_percent', sigma_percents),
        ('per_dimension', dimension_query),
        ('scorer', scorer_query),
    )
]

# apply all of the queries to the full results table
sub_df = subset_dataframe(results_df, queries)



# # plot - score vs mi
# plot_score_vs_mi(sub_df,  scorer='cka', compare='dimension');

sub_df.head(3)

	dataset	trial	std	nu	samples	dimensions	standardize	per_dimension	separate_scales	sigma_method	sigma_percent	sigma_X	sigma_Y	scorer	score	mutual_info
16500	gauss	1	1	1	50	2	True	False	True	median	40	1.9787121467565072	1.9476870941963393	cka	0.066137	0.000000
16501	gauss	1	2	1	50	2	True	False	True	median	40	2.0194286080222534	2.0089767519975203	cka	0.052168	0.002053
16502	gauss	1	3	1	50	2	True	False	True	median	40	2.0088368919994264	2.006548619075144	cka	0.046326	0.007718

Extreme Values¶

So there are a few extreme values (i.e. values that appear to fall outside of the trend). I would like to highlight in what settings they were found.

# necessary columns for plotting
columns = ['score', 'mutual_info', 'dimensions', 'samples']

# columns used to colour the points; they are converted to categoricals
ind_cols = [
    'samples',
    'dimensions'
]

# Select the columns and cast the grouping columns in a single step. `astype`
# returns a new DataFrame, so nothing is written into a slice of the original
# table (no SettingWithCopyWarning) and each listed column gets the category
# dtype explicitly.
sub_df = sub_df[columns].astype({col: 'category' for col in ind_cols})

# Plot: one panel per grouping column, same axes and scaling in both
fig, ax = plt.subplots(ncols=2, figsize=(12, 5))

panels = (
    ('samples', "Comparing Samples"),
    ('dimensions', "Comparing Dimensions"),
)

for panel_ax, (hue_col, panel_title) in zip(ax, panels):
    sns.scatterplot(
        ax=panel_ax, x='score', y='mutual_info',
        data=sub_df,
        marker='.',
        hue=hue_col,
    )
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('CKA Score')
    panel_ax.set_ylabel('Mutual Information')
    # symmetric log scale on the mutual information axis
    panel_ax.set_yscale('symlog')

plt.tight_layout()
plt.show()

So it appears that our estimation is at its worst when we have a low number of samples and a high number of dimensions, in the regime where there is a low amount of mutual information.

Note: I find this a bit funny because kernels are known for being good for situations with a high number of samples and a low number of dimensions.

Exact Relation¶

So there is a formula that describes the exact relationship between mutual information and the linear kernel for a Gaussian distribution. It's:

$I(\mathbf{X;Y}) = - \frac{1}{2} \log(1-\rho)$

where $\rho = 1 - \frac{|C|}{|C_{XX}||C_{YY}|}$, with $C$ the joint covariance matrix of $(\mathbf{X}, \mathbf{Y})$ and $C_{XX}$, $C_{YY}$ the covariance matrices of $\mathbf{X}$ and $\mathbf{Y}$. This is essentially the closed form solution for the MI between two Gaussian distributions. And $\rho$ is the score that we should obtain. I didn't actually calculate the closed-form solution (although I could in the future). But I would like to see if the score that I estimated approximates the true score that we should obtain if we were to assume a Gaussian. So I'll solve this equation for $\rho$ and then plot my estimated $\hat{\rho}$ against it.

$$ \rho = 1 - e^{-2 I} $$

# calculate the real score based on the MI (rho = 1 - exp(-2 I))
sub_df['score_real'] = 1 - np.exp(- 2 * sub_df['mutual_info'])

# calculate the pearson, spearman between our estimate and the real score
# (both functions return the coefficient as the first element)
from scipy import stats
p_score = stats.pearsonr(
    sub_df['score'],
    sub_df['score_real']
)[0]

sp_score = stats.spearmanr(
    sub_df['score'],
    sub_df['score_real']
)[0]

# Plot
fig, ax = plt.subplots(ncols=1, figsize=(7, 7))

sns.regplot(
    ax=ax, x='score_real', y='score', 
    data=sub_df, 
    marker='.',
    color='black',
    scatter_kws={'color': 'lightblue', 'label': 'Points'}
)
ax.set_title("Approximate Relationship")
# the labels follow the plotted columns: x is 'score_real' (the score implied
# by the MI) and y is 'score' (the estimated CKA score)
ax.set_xlabel('True Score')
ax.set_ylabel('CKA Score')
# ax.set_ylim([0.0, 8])

# Plot I
# ax.plot(np.sort(sub_df['score']), sub_df['mi_kernel'],
#        linewidth=3, color='black', label='Fitted Curve')

ax.legend(['Regression Line', 'Points'])
# the annotation position is given in data coordinates
ax.annotate(f"Pearson: {p_score:.2f}\nSpearman: {sp_score:.2f}", (-0.025, .75), fontsize=15)
plt.show()

So, there is clearly a relationship between the two curves. And you won't find any other curve with any other score. So for approximating mutual information, this would be the estimate that you would want to use.