Code Review V - Fitting MI¶
In this code review, we look at how to fit a curve describing the relationship between mutual information (MI) and the centered kernel alignment (CKA) scorer.
Code Preamble¶
# make the project root importable
import sys
from pyprojroot import here
sys.path.insert(0, str(here()))
import warnings
from typing import Optional, Tuple
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state
# toy datasets
from src.data.distribution import DataParams, Inputs
# Kernel Dependency measure
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import RBF
from src.models.dependence import HSICModel
# RBIG IT measures
from src.features.utils import df_query, subset_dataframe
# Plotting
from src.visualization.distribution import plot_scorer, plot_score_vs_mi
# experiment helpers
from src.experiments.utils import dict_product, run_parallel_step
# Plotting Procedures
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
sns.reset_defaults()
# sns.set_style('whitegrid')
#sns.set_context('talk')
sns.set_context(context='poster',font_scale=0.7, rc={'font.family': 'sans-serif'})
# sns.set(font='sans-serif')
%matplotlib inline
%load_ext autoreload
%autoreload 2
Query Data¶
DATA_PATH = "data/results/distributions/mutual_info/"
# stack the Gaussian and T-Student results row-wise
results_df = pd.concat([
    pd.read_csv(here() / f"{DATA_PATH}v5_gauss.csv"),
    pd.read_csv(here() / f"{DATA_PATH}v5_tstudent.csv")
], axis=0, ignore_index=True)
# drop the unnamed index columns written by to_csv
results_df = results_df.loc[:, ~results_df.columns.str.match('Unnamed')]
# represent missing entries as the string 'None' so they can be queried
results_df = results_df.astype(object).replace(np.nan, 'None')
Gaussian Distribution¶
# initialize list of queries
queries = []
# query dataframe for the Gaussian dataset
dataset_methods = ['gauss']
queries.append(df_query('dataset', dataset_methods))
# query dataframe for the median sigma heuristic
sigma_methods = ['median']
queries.append(df_query('sigma_method', sigma_methods))
# query dataframe for sigma percentages 40, 50, 60
sigma_percents = [40., 50., 60.]
queries.append(df_query('sigma_percent', sigma_percents))
# query dataframe for a single length scale (not per dimension)
dimension_query = [False]
queries.append(df_query('per_dimension', dimension_query))
# query dataframe for the CKA scorer
scorer_query = ['cka']
queries.append(df_query('scorer', scorer_query))
sub_df = subset_dataframe(results_df, queries)
# # plot - score vs mi
# plot_score_vs_mi(sub_df, scorer='cka', compare='dimension');
sub_df.head(3)
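For readers without the project helpers, here is a minimal pandas-only sketch of what this subsetting presumably amounts to. It assumes df_query simply records a column and a list of allowed values, and that subset_dataframe keeps the rows satisfying all of those membership tests; subset_dataframe_plain and the dictionary below are hypothetical stand-ins for illustration, not the actual helpers from src.features.utils.

import pandas as pd

def subset_dataframe_plain(df: pd.DataFrame, conditions: dict) -> pd.DataFrame:
    """Keep only the rows whose column values fall in the allowed lists."""
    mask = pd.Series(True, index=df.index)
    for column, allowed in conditions.items():
        mask &= df[column].isin(allowed)
    return df[mask]

# hypothetical equivalent of the queries above, for illustration only
sub_df_plain = subset_dataframe_plain(results_df, {
    'dataset': ['gauss'],
    'sigma_method': ['median'],
    'sigma_percent': [40., 50., 60.],
    'per_dimension': [False],
    'scorer': ['cka'],
})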
Extreme Values¶
So there are a few extreme values (i.e. values that appear to fall outside of the trend). I would like to highlight the settings in which they occur.
# necessary columns for plotting (copy to avoid SettingWithCopyWarning)
columns = ['score', 'mutual_info', 'dimensions', 'samples']
sub_df = sub_df[columns].copy()
# change column types to categorical for plotting
ind_cols = [
    'samples',
    'dimensions',
]
sub_df[ind_cols] = sub_df[ind_cols].astype('category')
# Plot
fig, ax = plt.subplots(ncols=2, figsize=(12, 5))
sns.scatterplot(
    ax=ax[0], x='score', y='mutual_info',
    data=sub_df,
    marker='.',
    hue='samples',
)
ax[0].set_title("Comparing Samples")
ax[0].set_xlabel('CKA Score')
ax[0].set_ylabel('Mutual Information')
ax[0].set_yscale('symlog')
sns.scatterplot(
    ax=ax[1], x='score', y='mutual_info',
    data=sub_df,
    marker='.',
    hue='dimensions',
)
ax[1].set_title("Comparing Dimensions")
ax[1].set_xlabel('CKA Score')
ax[1].set_ylabel('Mutual Information')
ax[1].set_yscale('symlog')
plt.tight_layout()
plt.show()
Note: I find this a bit funny because kernels are known to work well in settings with many samples and few dimensions.
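To make the "in what settings" question concrete, here is a rough sketch of my own (not part of the original notebook): flag the points that sit far from the Gaussian closed-form curve used in the next section and tabulate their samples/dimensions settings. The 0.1 residual cutoff is an arbitrary illustrative choice.

# flag points that deviate strongly from the Gaussian closed-form curve
# score ~ 1 - exp(-2 * MI)  (see the next section); 0.1 is an arbitrary cutoff
score = pd.to_numeric(sub_df['score'], errors='coerce')
mi = pd.to_numeric(sub_df['mutual_info'], errors='coerce')
expected = 1 - np.exp(-2 * mi)
extreme = sub_df[(score - expected).abs() > 0.1]
# count the extreme values per (samples, dimensions) setting
print(extreme.groupby(['samples', 'dimensions'], observed=True).size())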
Exact Relation¶
So there is a formula that describes the exact relationship between mutual information and the linear kernel for a Gaussian distribution:
$$ I(X;Y) = -\frac{1}{2} \log \left( \frac{|C|}{|C_{XX}|\,|C_{YY}|} \right) $$
where $C$ is the joint covariance matrix with marginal blocks $C_{XX}$ and $C_{YY}$. This is essentially the closed-form solution for the MI between two jointly Gaussian variables, and the normalized score we should obtain is $\rho = 1 - \frac{|C|}{|C_{XX}||C_{YY}|}$. I didn't actually calculate this closed-form score from the covariances (although I could in the future). But I would like to see if the score that I estimated approximates the true score that we should obtain if we were to assume a Gaussian. So I'll solve the equation above for $\rho$ and then plot my estimate $\hat{\rho}$ against it.
$$ \rho = 1 - e^{-2 I} $$
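As a quick sanity check on this relation (my addition, not from the original notebook), the following snippet computes the closed-form MI for a bivariate Gaussian with correlation r and confirms that 1 - e^{-2I} recovers r².

import numpy as np

# bivariate Gaussian with unit variances and correlation r (illustrative value)
r = 0.8
C = np.array([[1.0, r],
              [r, 1.0]])

# closed-form MI for jointly Gaussian (X, Y):
# I(X;Y) = -0.5 * log(|C| / (|C_XX| * |C_YY|))
det_ratio = np.linalg.det(C) / (C[0, 0] * C[1, 1])
mi = -0.5 * np.log(det_ratio)

# the normalized score rho = 1 - |C| / (|C_XX||C_YY|) equals r**2 here
rho = 1 - np.exp(-2 * mi)
print(mi, rho, r**2)   # rho == r**2 up to floating point error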
# calculate the real score based on the MI
sub_df['score_real'] = 1 - np.exp(- 2 * sub_df['mutual_info'])
# calculate the Pearson and Spearman correlations between our estimate and the real score
from scipy import stats
p_score = stats.pearsonr(
    sub_df['score'],
    sub_df['score_real']
)[0]
sp_score = stats.spearmanr(
    sub_df['score'],
    sub_df['score_real']
)[0]
# Plot
fig, ax = plt.subplots(ncols=1, figsize=(7, 7))
sns.regplot(
    ax=ax, x='score_real', y='score',
    data=sub_df,
    marker='.',
    color='black',
    scatter_kws={'color': 'lightblue', 'label': 'Points'}
)
ax.set_title("Approximate Relationship")
ax.set_xlabel('True Score')
ax.set_ylabel('CKA Score')
# ax.set_ylim([0.0, 8])
# Plot I
# ax.plot(np.sort(sub_df['score']), sub_df['mi_kernel'],
#         linewidth=3, color='black', label='Fitted Curve')
ax.legend(['Regression Line', 'Points'])
ax.annotate(f"Pearson: {p_score:.2f}\nSpearman: {sp_score:.2f}", (-0.025, .75), fontsize=15)
plt.show()
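The cell above only reports correlations; it doesn't actually fit the curve promised in the introduction. As a sketch of how that could be done (my own addition, with an assumed parametric form), one option is to fit score = 1 - e^{-a * MI} with scipy.optimize.curve_fit and compare the fitted exponent a to the Gaussian value of 2.

from scipy.optimize import curve_fit

# parametric form suggested by the Gaussian closed-form relation
def score_curve(mi, a):
    return 1 - np.exp(-a * mi)

mi = pd.to_numeric(sub_df['mutual_info'], errors='coerce').to_numpy()
score = pd.to_numeric(sub_df['score'], errors='coerce').to_numpy()
mask = np.isfinite(mi) & np.isfinite(score)

# fit the exponent; under the Gaussian assumption we expect a ~= 2
(a_hat,), _ = curve_fit(score_curve, mi[mask], score[mask], p0=[2.0])
print(f"fitted exponent: {a_hat:.3f}")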