Distribution Experiment - Walkthrough¶
import sys, os
# insert path to the model source directory
cwd = os.getcwd()
path = f"{cwd}/../../src"
sys.path.insert(0, path)
import warnings
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state
# toy datasets
from data.it_data import MIData
# Kernel Dependency measure
from models.dependence import HSIC
from models.kernel import estimate_sigma, sigma_to_gamma, gamma_to_sigma, get_param_grid
# RBIG IT measures
from models.ite_algorithms import run_rbig_models
# experiment helpers
from tqdm import tqdm
import prefect
# Plotting Procedures
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
sns.set_style("dark")
sns.set_context("poster")
warnings.filterwarnings('ignore') # get rid of annoying warnings
%matplotlib inline
%load_ext autoreload
%autoreload 2
Datasets¶
- Samples - [500, 1K, 5K, 10K, 30K, 50K]
- Dimensions - [2, 3, 10, 50, 100]
- Trials - 1-5
- IT measures - Mutual Information
- Distributions - [Gaussian, T-Student]
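For reference, this grid can be written down as plain Python and iterated with itertools.product; a minimal sketch (the constant names and the 'tstudent' dataset key are illustrative, only 'gauss' appears later in this walkthrough):
from itertools import product

# illustrative experiment grid (the 'tstudent' key is an assumption)
SAMPLES = [500, 1_000, 5_000, 10_000, 30_000, 50_000]
DIMENSIONS = [2, 3, 10, 50, 100]
TRIALS = list(range(1, 6))            # trials 1-5
DATASETS = ["gauss", "tstudent"]      # Gaussian, T-Student

experiment_grid = list(product(DATASETS, SAMPLES, DIMENSIONS, TRIALS))
print(f"total configurations: {len(experiment_grid)}")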
Example Gaussian Distribution: 2D¶
# dataloader params
dataset = 'gauss'
# initialize dataloader
dataloader = MIData(dataset)
# dataset params
samples = 100
dimensions = 2
std = 1
trial = 1
# extract dataset
X, Y, mi_val = dataloader.get_data(samples=samples, dimensions=dimensions, std=std, trial=trial)
from prefect import task, Task, Flow, Parameter
@task
def get_data(dataset: str, samples: int, dimensions: int, dof: int, trial: int) -> dict:
    # initialize dataloader
    dataloader = MIData(dataset)

    # extract dataset (dof is the std for the Gaussian dataset
    # and the degrees of freedom for the T-Student dataset)
    X, Y, mi_val = dataloader.get_data(
        samples=samples,
        dimensions=dimensions,
        std=dof,
        trial=trial
    )
    return {"X": X, "Y": Y, "mi_val": mi_val}
class GetData(Task):
    def __init__(
        self,
        dataset: str='gauss',
        samples: int=100,
        dimensions: int=2,
        std: int=1,
        trial: int=1,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.dataset = dataset
        self.samples = samples
        self.dimensions = dimensions
        self.std = std
        self.trial = trial

    def run(self) -> dict:
        # extract dataset using the stored parameters
        X, Y, mi_val = MIData(self.dataset).get_data(
            samples=self.samples,
            dimensions=self.dimensions,
            std=self.std,
            trial=self.trial
        )
        return {"X": X, "Y": Y, "mi_val": mi_val}
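For completeness, the class-based task can be wired into a flow much like the functional get_data task; a minimal usage sketch, assuming the Prefect 1.x Task API (flow and variable names here are illustrative):
# hypothetical usage of the class-based task
with Flow("Run experiment (class-based)") as class_flow:
    data_task = GetData(dataset='gauss', samples=100, dimensions=2, std=1, trial=1)()

class_state = class_flow.run()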
with Flow("Run experiment") as flow:
# Data Params
dataset = Parameter("dataset", default='gauss')
samples = Parameter("samples", default=100)
dimensions = Parameter("dimensions", default=2)
std = Parameter("std", default=1)
trial = Parameter("trial", default=1)
# Load Data
data = get_data(dataset, samples, dimensions, std, trial)
flow.run()
flow.tasks
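Because the flow runs locally, the per-task return values can be recovered from the state object that flow.run() returns; a minimal sketch, assuming the Prefect 1.x local-run API:
# run the flow and capture the final state
state = flow.run()

# state.result maps each task (e.g. the `data` task bound above) to its own state;
# the task's return value sits on that state's .result attribute
data_out = state.result[data].result
print(data_out["mi_val"], data_out["X"].shape, data_out["Y"].shape)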
class ExpParams:
# dataset params
samples = 100
dimensions = 2
std = 1
trial = 1
# plot data
g = sns.jointplot(
    x=X[:, 0],
    y=X[:, 1],
    height=10,
)
plt.suptitle('X')
plt.show()
# plot data
g = sns.jointplot(
    x=Y[:, 0],
    y=Y[:, 1],
    height=10,
)
plt.suptitle('Y')
plt.show()
HSIC Algorithms¶
HSIC¶
algorithm path: src/models/dependence.py
1. Initialize Gamma¶
# sigma initialization params
percent = .2
method = 'belkin'
# initialize sigma
sigma_init_X = estimate_sigma(X, method=method, percent=percent)
sigma_init_Y = estimate_sigma(Y, method=method, percent=percent)
print(f"Sigma_x: {sigma_init_X}")
print(f"Sigma_y: {sigma_init_Y}")
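estimate_sigma is the repo's helper in src/models/kernel.py; its exact implementation isn't shown here, but as a rough guide the 'median' and 'belkin' heuristics are commonly defined as below (the *_sketch names are hypothetical and the repo's version may differ):
import numpy as np
from scipy.spatial.distance import pdist, squareform

def median_sigma_sketch(X: np.ndarray) -> float:
    # median of all pairwise Euclidean distances
    return float(np.median(pdist(X, metric="euclidean")))

def belkin_sigma_sketch(X: np.ndarray, percent: float = 0.2) -> float:
    # mean distance to the k-th nearest neighbour, with k a fraction of n
    dists = np.sort(squareform(pdist(X, metric="euclidean")), axis=1)
    k = max(1, int(percent * X.shape[0]))
    return float(np.mean(dists[:, k]))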
1.1 Mean of Initial sigmas¶
sigma_init = np.mean([sigma_init_X, sigma_init_Y])
print(f"Sigma_init (Belkin): {sigma_init}")
1.3 Convert Sigma to Gamma¶
The standard kernel function is:
K(x,y)= \exp(-\frac{||x-y||^2}{2\sigma^2})
Sklearn uses the following RBF kernel function:
K(x,y)= \exp(-\gamma||x-y||^2)
So the relationship between the two parameterizations is:
\gamma = \frac{1}{2\sigma^2}
# convert sigma to gamma
gamma_init = sigma_to_gamma(sigma_init)
# check that the conversion matches the relationship above
assert np.isclose(gamma_init, 1 / (2 * sigma_init ** 2))
print('Gamma_init (Belkin):', gamma_init)
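For reference, sigma_to_gamma and gamma_to_sigma presumably just apply this relationship; a minimal sketch under that assumption (the *_sketch functions are illustrative, not the repo's code):
import numpy as np

def sigma_to_gamma_sketch(sigma: float) -> float:
    # gamma = 1 / (2 * sigma^2)
    return 1.0 / (2.0 * sigma ** 2)

def gamma_to_sigma_sketch(gamma: float) -> float:
    # sigma = 1 / sqrt(2 * gamma)
    return 1.0 / np.sqrt(2.0 * gamma)

# round trip: converting back and forth recovers the original sigma
assert np.isclose(gamma_to_sigma_sketch(sigma_to_gamma_sketch(2.5)), 2.5)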
1.4 Create Function¶
from typing import Optional
@task
def get_gamma_init(data: dict, method: str, percent: Optional[float]=None) -> dict:
    """Get the gamma initializer.

    Parameters
    ----------
    data : dict
        dictionary with the 'X' and 'Y' arrays
    method : str
        the sigma initialization method
    percent : float, optional
        if using the Belkin method, the percentage
        of the kth nearest neighbour
    Returns
    -------
    results : dict
        dictionary with the initial gamma value under 'gamma_init'
    """
    # initialize sigma for each input
    sigma_init_X = estimate_sigma(data["X"], method=method, percent=percent)
    sigma_init_Y = estimate_sigma(data["Y"], method=method, percent=percent)

    # mean of the two
    sigma_init = np.mean([sigma_init_X, sigma_init_Y])

    # convert sigma to gamma
    gamma_init = sigma_to_gamma(sigma_init)

    # return initial gamma value
    return {"gamma_init": gamma_init}
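Since this is now a Prefect task, it can't be called like a normal function inside a notebook cell; outside of a flow context it can be exercised through its .run() method, which invokes the underlying function (this is also how the experiment loops further down call it). For example:
# call the task's underlying function directly (no flow context needed)
gamma_out = get_gamma_init.run({"X": X, "Y": Y}, method='belkin', percent=0.2)
print(gamma_out["gamma_init"])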
data
with Flow("Run experiment") as flow:
# Data Params
dataset = Parameter("dataset", default='gauss')
samples = Parameter("samples", default=100)
dimensions = Parameter("dimensions", default=2)
std = Parameter("std", default=1)
trial = Parameter("trial", default=1)
# Load Data
data = get_data(dataset, samples, dimensions, std, trial)
# Gamma Parameters
method = Parameter("method", default='median')
percent = Parameter("percent", default=0.2)
# get gamma
gamma_init = get_gamma_init(data, method, percent)
flow.run()
4. Calculate HSIC¶
# hsic parameters
kernel = 'rbf'
scorer = 'hsic'
subsample = None
bias = True
# initialize HSIC model
clf_hsic = HSIC(
gamma=gamma_init,
kernel=kernel,
scorer=scorer,
subsample=subsample,
bias=bias
)
# fit model to data
clf_hsic.fit(X, Y)
# get hsic value
hsic_value = clf_hsic.score(X)
print('HSIC: ', hsic_value)
@task
def get_hsic(data: dict, scorer: str, gamma_init: dict) -> dict:
    """Compute the HSIC value for the given scorer.

    Parameters
    ----------
    data : dict
        dictionary with the 'X' and 'Y' arrays,
        each of shape (n_samples, d_dimensions)
    scorer : str
        the scorer used to calculate the dependence value
        * hsic - HSIC method
        * tka - kernel tangent alignment
        * ctka - centered kernel tangent alignment
    gamma_init : dict
        dictionary with the initial gamma parameter under 'gamma_init'
    Returns
    -------
    results : dict
        dictionary with the hsic value under 'hsic_value'
    """
    # hsic parameters
    kernel = 'rbf'
    subsample = None
    bias = True

    # initialize HSIC model
    clf_hsic = HSIC(
        gamma=gamma_init['gamma_init'],
        kernel=kernel,
        scorer=scorer,
        subsample=subsample,
        bias=bias
    )

    # fit model to data
    clf_hsic.fit(data['X'], data['Y'])

    # get hsic value
    hsic_value = clf_hsic.score(data['X'])

    return {"hsic_value": hsic_value}
with Flow("Run experiment") as flow:
# Data Params
dataset = Parameter("dataset", default='gauss')
samples = Parameter("samples", default=100)
dimensions = Parameter("dimensions", default=2)
std = Parameter("std", default=1)
trial = Parameter("trial", default=1)
# Load Data
data = get_data(dataset, samples, dimensions, std, trial)
# Gamma Parameters
method = Parameter("method", default='median')
percent = Parameter("percent", default=0.2)
# get gamma
gamma_init = get_gamma_init(data, method, percent)
# HSIC Params
scorer = Parameter("scorer", default='hsic')
# Get HSIC value
hsic_value = get_hsic(data, scorer, gamma_init)
# run the flow for a couple of sample sizes
for isamples in [50, 100]:
    flow.run(parameters={'samples': isamples})
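The same parameters override extends to the rest of the grid from the Datasets section; a sketch with illustrative values:
# sweep several parameters at once
for isamples in [500, 1_000]:
    for idimensions in [2, 3]:
        flow.run(parameters={
            "samples": isamples,
            "dimensions": idimensions,
        })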
Experiment I - Different Scorers¶
We are looking at different "HSIC scorers". They are:
HSIC
HSIC = \frac{1}{n(n-1)}\langle K_xH,K_yH \rangle_F
Notice: we have the centered kernels, K_xH and no normalization.
TKA
TKA = \frac{\langle K_x,K_y \rangle_F}{||K_x||_F||K_y||_F}
Notice: We have the uncentered kernels and a normalization factor.
cTKA
cTKA = \frac{\langle K_xH,K_yH \rangle_F}{||K_xH||_F||K_yH||_F}
Notice: We have the centered kernels and a normalization factor.
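To make the three definitions concrete, here is a small NumPy sketch that evaluates them directly from RBF kernel matrices. It is a literal transcription of the formulas above; the HSIC class in src/models/dependence.py may differ in details such as subsampling and the bias term.
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

def scorer_sketch(X, Y, gamma, scorer="hsic"):
    n = X.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n          # centering matrix
    K_x = rbf_kernel(X, gamma=gamma)
    K_y = rbf_kernel(Y, gamma=gamma)
    K_xc, K_yc = K_x @ H, K_y @ H                # centered kernels (K H), as written above

    if scorer == "hsic":
        # centered kernels, no normalization
        return np.sum(K_xc * K_yc) / (n * (n - 1))
    elif scorer == "tka":
        # uncentered kernels, normalized by the Frobenius norms
        return np.sum(K_x * K_y) / (np.linalg.norm(K_x) * np.linalg.norm(K_y))
    elif scorer == "ctka":
        # centered kernels, normalized by the Frobenius norms
        return np.sum(K_xc * K_yc) / (np.linalg.norm(K_xc) * np.linalg.norm(K_yc))
    raise ValueError(f"Unknown scorer: {scorer}")
Each scorer can then be compared against the HSIC class output for the same gamma value.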
# experimental parameters
method = 'belkin'
percent = 0.2

# wrap the arrays into the dictionary format the tasks expect
data = {"X": X, "Y": Y}

hsic_values = dict()

for iscorer in ['hsic', 'tka', 'ctka']:
    # get initial gamma (call the task's underlying function via .run())
    gamma_init = get_gamma_init.run(data, method, percent)
    # get HSIC value
    hsic_values[iscorer] = get_hsic.run(data, iscorer, gamma_init)["hsic_value"]

print(hsic_values)
Experiment II - Different Scorers, Initializers¶
# dataset params
dataset = 'gauss'
samples = 100
dimensions = 2
std = 1
trial = 1
# extract dataset
X, Y, mi_val = MIData(dataset).get_data(samples=samples, dimensions=dimensions, std=std, trial=trial)
# experimental parameters
scorers = ['hsic', 'tka', 'ctka']
gamma_methods = [
('silverman',None),
('scott', None),
('median', None),
('belkin', 0.1),
('belkin', 0.2),
('belkin', 0.4),
('belkin', 0.8),
]
# results dataframe
results_df = pd.DataFrame()
# run experiment
data = {"X": X, "Y": Y}  # dictionary format the tasks expect

for iscorer in scorers:
    for imethod in gamma_methods:
        # initialize gamma
        gamma_dict = get_gamma_init.run(data, imethod[0], imethod[1])
        # get hsic value
        hsic_value = get_hsic.run(data, iscorer, gamma_dict)["hsic_value"]
        # append results to dataframe
        results_df = results_df.append({
            'scorer': iscorer,
            'gamma_method': f"{imethod[0]}_{imethod[1]}" if imethod[1] is not None else f"{imethod[0]}",
            'gamma_init': gamma_dict["gamma_init"],
            'hsic_value': hsic_value
        }, ignore_index=True)
results_df.head()
# plot the results
def plot_scorer(scorer: str) -> None:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5,5))
sns.scatterplot(
x='gamma_init',
y='hsic_value',
hue='gamma_method',
data=results_df[results_df['scorer'] == scorer],
ax=ax
)
ax.set_ylabel('Score')
ax.set_xlabel('Gamma Initialization')
ax.legend(prop={'size':9})
ax.set_title(scorer.upper())
plt.show()
plot_scorer('hsic')
plot_scorer('tka')
plot_scorer('ctka')
Experiment III - Different Scorers, Initializations and Degrees of Freedom¶
In this experiment, we'll look at how the HSIC values change depending on the gamma initialization as well as the degree-of-freedom parameter we choose. For the Gaussian distribution this is the standard deviation, \sigma, and for the T-Student distribution it is the \nu parameter.
from tqdm import trange, tqdm
# dataset params
dataset = 'gauss'
samples = 100
dimensions = 2
std = 1
trial = 1
# experimental parameters
scorers = ['hsic', 'tka', 'ctka']
gamma_methods = [
('silverman',None),
('scott', None),
('median', None),
('belkin', 0.1),
('belkin', 0.2),
('belkin', 0.4),
('belkin', 0.8),
('max', None)
]
dof_params = np.linspace(1,11, 11, endpoint=True)
# results dataframe
results_df = pd.DataFrame()
# run experiment
with tqdm(gamma_methods) as gamma_bar:
    for imethod in gamma_bar:
        for iscorer in scorers:
            for idof in dof_params:
                # extract dataset
                X, Y, mi_val = MIData(dataset).get_data(
                    samples=samples, dimensions=dimensions, std=int(idof), trial=trial
                )
                data = {"X": X, "Y": Y}
                # initialize gamma and compute the hsic value
                if imethod[0] == 'max':
                    # train_rbf_hsic is assumed to be defined/imported elsewhere (not shown in this notebook)
                    clf_hsic = train_rbf_hsic(X, Y, iscorer, 50, 1, 'median')
                    hsic_value = clf_hsic.score(X)
                    gamma_init = np.nan  # gamma is chosen internally by the training routine
                else:
                    gamma_dict = get_gamma_init.run(data, imethod[0], imethod[1])
                    gamma_init = gamma_dict["gamma_init"]
                    hsic_value = get_hsic.run(data, iscorer, gamma_dict)["hsic_value"]
                # append results to dataframe
                results_df = results_df.append({
                    'scorer': iscorer,
                    'gamma_method': f"{imethod[0]}_{imethod[1]}" if imethod[1] is not None else f"{imethod[0]}",
                    'gamma_init': gamma_init,
                    'hsic_value': hsic_value,
                    'std': idof,
                    'mi_value': mi_val,
                }, ignore_index=True)
results_df.head()
# plot the results
def plot_scorer_mi(df: pd.DataFrame, scorer: str) -> None:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(5,5))
sns.scatterplot(
x='hsic_value',
y='mi_value',
hue='gamma_method',
data=df[df['scorer'] == scorer],
ax=ax
)
ax.set_ylabel('Mutual Information')
ax.set_xlabel('Score')
ax.legend(prop={'size':9})
ax.set_title(scorer.upper())
plt.show()
plot_scorer_mi(results_df, 'hsic')
plot_scorer_mi(results_df, 'tka')
plot_scorer_mi(results_df, 'ctka')
Same Experiment, but with a higher number of samples and dimensions¶
# dataset params
dataset = 'gauss'
samples = [1_000]
dimensions = [50]
trials = [1]
# experimental parameters
scorers = ['hsic', 'tka', 'ctka']
gamma_methods = [
('silverman',None),
('scott', None),
('median', None),
('belkin', 0.1),
('belkin', 0.2),
('belkin', 0.4),
('belkin', 0.8),
('max', None)
]
dof_params = np.linspace(1,11, 11, endpoint=True)
# results dataframe
results_df = pd.DataFrame()
# run experiment
with tqdm(gamma_methods) as gamma_bar:
    for imethod in gamma_bar:
        for isample in samples:
            for idim in dimensions:
                for itrial in trials:
                    for iscorer in scorers:
                        for idof in dof_params:
                            # extract dataset
                            X, Y, mi_val = MIData(dataset).get_data(
                                samples=isample, dimensions=idim, std=int(idof), trial=itrial
                            )
                            data = {"X": X, "Y": Y}
                            print(imethod[0], imethod[1])
                            # initialize gamma and compute the hsic value
                            if imethod[0] == 'max':
                                # train_rbf_hsic is assumed to be defined/imported elsewhere (not shown in this notebook)
                                clf_hsic = train_rbf_hsic(X, Y, iscorer, 50, 1, 'median')
                                hsic_value = clf_hsic.score(X)
                                gamma_init = np.nan  # gamma is chosen internally by the training routine
                            else:
                                gamma_dict = get_gamma_init.run(data, imethod[0], imethod[1])
                                gamma_init = gamma_dict["gamma_init"]
                                hsic_value = get_hsic.run(data, iscorer, gamma_dict)["hsic_value"]
                            # append results to dataframe
                            results_df = results_df.append({
                                'samples': isample,
                                'dimensions': idim,
                                'trial': itrial,
                                'scorer': iscorer,
                                'gamma_method': f"{imethod[0]}_{imethod[1]}" if imethod[1] is not None else f"{imethod[0]}",
                                'gamma_init': gamma_init,
                                'hsic_value': hsic_value,
                                'std': idof,
                                'mi_value': mi_val,
                            }, ignore_index=True)
results_df.head()
res_high_df = results_df.copy()
plot_scorer_mi(res_high_df, 'hsic')
plot_scorer_mi(res_high_df, 'tka')
plot_scorer_mi(res_high_df, 'ctka')