Distribution Experiment - Walkthrough¶

import sys, os
# Insert the path to the model (src) directory.
cwd = os.getcwd()
path = f"{cwd}/../../src"
sys.path.insert(0, path)

import warnings
import tqdm
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state

# toy datasets
from data.it_data import MIData

# Kernel Dependency measure
from models.dependence import HSIC
from models.kernel import estimate_sigma, sigma_to_gamma, gamma_to_sigma, get_param_grid

# RBIG IT measures
from models.ite_algorithms import run_rbig_models

# experiment helpers
from tqdm import tqdm
import prefect

# Plotting Procedures
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
sns.set_style("dark")
sns.set_context("poster")
warnings.filterwarnings('ignore') # get rid of annoying warnings
%matplotlib inline

warnings.filterwarnings('ignore') # get rid of annoying warnings

%load_ext autoreload
%autoreload 2

Datasets¶

Samples - [500, 1K, 5K, 10K, 30K, 50K]
Dimensions - [ 2, 3, 10, 50, 100]
trials - 1:5
IT measures - Mutual Information
Distributions - [Gaussian, T-Student]

Example Gaussian Distribution: 2D¶

# Build the loader for the Gaussian toy dataset.
dataset = 'gauss'
dataloader = MIData(dataset)

# Settings of the example to draw: sample count, dimensions, std and trial.
samples, dimensions, std, trial = 100, 2, 1, 1

# Fetch both variables together with the value the loader returns as mi_val.
X, Y, mi_val = dataloader.get_data(
    samples=samples,
    dimensions=dimensions,
    std=std,
    trial=trial,
)

from prefect import task, Flow, Parameter

@task
def get_data(dataset: str, samples: int, dimensions: int, dof: int, trial: int):
    """Load one toy dataset and package it for downstream tasks.

    Parameters
    ----------
    dataset : str
        name handed to the ``MIData`` loader

    samples : int
        forwarded to the loader as ``samples``

    dimensions : int
        forwarded to the loader as ``dimensions``

    dof : int
        forwarded to the loader as its ``std`` argument

    trial : int
        forwarded to the loader as ``trial``

    Returns
    -------
    data : dict
        the three values returned by the loader, stored under the
        keys ``'X'``, ``'Y'`` and ``'mi_val'``
    """
    loader = MIData(dataset)

    # the loader hands back exactly three values
    X, Y, mi_val = loader.get_data(
        samples=samples, dimensions=dimensions, std=dof, trial=trial
    )

    return dict(X=X, Y=Y, mi_val=mi_val)

class GetData(prefect.Task):
    """Class-based Prefect task that loads one toy dataset.

    Counterpart of the ``get_data`` function task: the dataset settings
    are stored on the task instance at construction time and used when
    the task runs.

    Parameters
    ----------
    dataset : str
        name handed to the ``MIData`` loader

    samples : int
        forwarded to the loader as ``samples``

    dimensions : int
        forwarded to the loader as ``dimensions``

    std : int
        forwarded to the loader as ``std``

    trial : int
        forwarded to the loader as ``trial``

    *args, **kwargs
        passed through unchanged to ``prefect.Task.__init__``
    """

    def __init__(
        self,
        dataset: str='gauss',
        samples: int=100,
        dimensions: int=2,
        std: int=1,
        trial: int=1,
        *args,
        **kwargs
    ):
        super().__init__(*args, **kwargs)
        self.dataset = dataset
        self.samples = samples
        self.dimensions = dimensions
        self.std = std
        self.trial = trial

    def run(self):
        """Load the configured dataset.

        Returns
        -------
        data : dict
            the loader output under the keys ``'X'``, ``'Y'`` and
            ``'mi_val'`` (same layout as the ``get_data`` function task)
        """
        X, Y, mi_val = MIData(self.dataset).get_data(
            samples=self.samples,
            dimensions=self.dimensions,
            std=self.std,
            trial=self.trial,
        )

        return {"X": X, "Y": Y, "mi_val": mi_val}

# Build a flow whose only work is loading the dataset.
# NOTE: the assignments below rebind the module-level names (dataset,
# samples, dimensions, std, trial) to Parameter tasks, and `data` is the
# get_data Task handle, not the loaded arrays.
with Flow("Run experiment") as flow:

    # Data Params
    # each Parameter is a flow input; `default` is used when flow.run()
    # is called without an override for it
    dataset = Parameter("dataset", default='gauss')
    samples = Parameter("samples", default=100)
    dimensions = Parameter("dimensions", default=2)
    std = Parameter("std", default=1)
    trial = Parameter("trial", default=1)

    # Load Data
    # `std` is bound to the task's `dof` argument (arguments are positional)
    data = get_data(dataset, samples, dimensions, std, trial)


# execute the flow with every parameter at its default
flow.run()

[2019-10-22 14:16:39,783] INFO - prefect.FlowRunner | Beginning Flow run for 'Run experiment'
[2019-10-22 14:16:39,787] INFO - prefect.FlowRunner | Starting flow run.
[2019-10-22 14:16:39,800] INFO - prefect.TaskRunner | Task 'dimensions': Starting task run...
[2019-10-22 14:16:39,806] INFO - prefect.TaskRunner | Task 'dimensions': finished task run for task with final state: 'Success'
[2019-10-22 14:16:39,816] INFO - prefect.TaskRunner | Task 'trial': Starting task run...
[2019-10-22 14:16:39,821] INFO - prefect.TaskRunner | Task 'trial': finished task run for task with final state: 'Success'
[2019-10-22 14:16:39,831] INFO - prefect.TaskRunner | Task 'dataset': Starting task run...
[2019-10-22 14:16:39,836] INFO - prefect.TaskRunner | Task 'dataset': finished task run for task with final state: 'Success'
[2019-10-22 14:16:39,847] INFO - prefect.TaskRunner | Task 'samples': Starting task run...
[2019-10-22 14:16:39,852] INFO - prefect.TaskRunner | Task 'samples': finished task run for task with final state: 'Success'
[2019-10-22 14:16:39,862] INFO - prefect.TaskRunner | Task 'std': Starting task run...
[2019-10-22 14:16:39,867] INFO - prefect.TaskRunner | Task 'std': finished task run for task with final state: 'Success'
[2019-10-22 14:16:39,876] INFO - prefect.TaskRunner | Task 'get_data': Starting task run...
[2019-10-22 14:16:39,885] INFO - prefect.TaskRunner | Task 'get_data': finished task run for task with final state: 'Success'
[2019-10-22 14:16:39,888] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded

<Success: "All reference tasks succeeded.">

flow.tasks

{<Parameter: dataset>,
 <Parameter: dimensions>,
 <Parameter: samples>,
 <Parameter: std>,
 <Parameter: trial>,
 <Task: get_data>}

class ExpParams:
    """Default dataset settings for a single experiment run."""

    # samples, dimensions, std and trial, in that order
    samples, dimensions, std, trial = 100, 2, 1, 1

# plot data
# sns.jointplot creates its own figure, so opening a plt.figure() first only
# leaves an empty figure behind. Size the grid through `height` instead, and
# title the whole figure rather than whichever axes happens to be current.
for name, arr in (('X', X), ('Y', Y)):
    g = sns.jointplot(
        x=arr[:, 0],
        y=arr[:, 1],
        height=10,
    )
    g.fig.suptitle(name)
    plt.show()

<Figure size 720x720 with 0 Axes>

<Figure size 720x720 with 0 Axes>

HSIC Algorithms¶

HSIC¶

algorithm path: src/models/dependence.py

1. Initialize Gamma¶

# sigma initialization params
# settings passed to the sigma estimator
method = 'belkin'
percent = .2

# one sigma estimate per variable, X first and then Y
sigma_init_X, sigma_init_Y = (
    estimate_sigma(arr, method=method, percent=percent) for arr in (X, Y)
)

print(f'Sigma_x: ', sigma_init_X)
print(f'Sigma_y: ', sigma_init_Y)

Sigma_x:  0.9583709525127224
Sigma_y:  0.8719893561231888

1.1 Mean of Initial sigmas¶

# average the two per-variable estimates into a single shared sigma
sigma_init = np.mean((sigma_init_X, sigma_init_Y))

print(f'Sigma_init (Belkin): ', sigma_init)

Sigma_init (Belkin):  0.9151801543179556

1.3 Convert Sigma to Gamma¶

The standard kernel function is:

$K(x,y)= \exp(-\frac{||x-y||^2}{2\sigma^2})$

Sklearn uses the following RBF kernel function:

$K(x,y)= \exp(-\gamma||x-y||^2)$

So the two parameterizations are related by:

$\gamma = \frac{1}{2\sigma^2}$

# convert sigma to gamma
# convert sigma to gamma
gamma_init = sigma_to_gamma(sigma_init)

# sanity check against gamma = 1 / (2 * sigma^2); compare with a tolerance,
# since exact float equality breaks if the helper rounds differently
assert np.isclose(gamma_init, 1 / (2 * sigma_init ** 2))

print('Gamma_init (Belkin):', gamma_init)

Gamma_init (Belkin): 0.5969759242357159

1.4 Create Function¶

from typing import Optional

@task
def get_gamma_init(data, method: str, percent: Optional[float]=None) -> dict:
    """Get Gamma initializer

    A sigma is estimated separately for X and for Y, the two estimates
    are averaged, and the mean sigma is converted to gamma.

    Parameters
    ----------
    data : dict
        mapping holding the two input arrays under the keys ``'X'``
        and ``'Y'`` (the layout returned by ``get_data``)

    method : str,
        the initialization method, forwarded to ``estimate_sigma``

    percent : float, optional
        if using the Belkin method, this uses a percentage
        of the kth nearest neighbour

    Returns
    -------
    result : dict
        ``{"gamma_init": gamma_init}`` where ``gamma_init`` is the
        initial gamma value
    """

    # initialize sigma
    sigma_init_X = estimate_sigma(data["X"], method=method, percent=percent)
    sigma_init_Y = estimate_sigma(data["Y"], method=method, percent=percent)

    # mean of the two
    sigma_init = np.mean([sigma_init_X, sigma_init_Y])

    # convert sigma to gamma
    gamma_init = sigma_to_gamma(sigma_init)

    # wrap the initial gamma value in a dict, the form get_hsic reads
    return {"gamma_init": gamma_init}

data

<Task: get_data>

# Rebuild the flow: load the dataset, then estimate the initial gamma from it.
# NOTE: this rebinds the module-level names used below (including `method`,
# `percent` and `gamma_init`) to Prefect tasks.
with Flow("Run experiment") as flow:

    # Data Params
    # each Parameter is a flow input; `default` is used when flow.run()
    # is called without an override for it
    dataset = Parameter("dataset", default='gauss')
    samples = Parameter("samples", default=100)
    dimensions = Parameter("dimensions", default=2)
    std = Parameter("std", default=1)
    trial = Parameter("trial", default=1)

    # Load Data
    data = get_data(dataset, samples, dimensions, std, trial)

    # Gamma Parameters
    method = Parameter("method", default='median')
    percent = Parameter("percent", default=0.2)

    # get gamma
    # the output of get_data is passed in as the task's `data` argument
    gamma_init = get_gamma_init(data, method, percent)



# execute the flow with every parameter at its default
flow.run()

[2019-10-22 14:17:31,762] INFO - prefect.FlowRunner | Beginning Flow run for 'Run experiment'
[2019-10-22 14:17:31,766] INFO - prefect.FlowRunner | Starting flow run.
[2019-10-22 14:17:31,780] INFO - prefect.TaskRunner | Task 'dimensions': Starting task run...
[2019-10-22 14:17:31,786] INFO - prefect.TaskRunner | Task 'dimensions': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,797] INFO - prefect.TaskRunner | Task 'dataset': Starting task run...
[2019-10-22 14:17:31,803] INFO - prefect.TaskRunner | Task 'dataset': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,813] INFO - prefect.TaskRunner | Task 'std': Starting task run...
[2019-10-22 14:17:31,818] INFO - prefect.TaskRunner | Task 'std': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,828] INFO - prefect.TaskRunner | Task 'trial': Starting task run...
[2019-10-22 14:17:31,833] INFO - prefect.TaskRunner | Task 'trial': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,843] INFO - prefect.TaskRunner | Task 'method': Starting task run...
[2019-10-22 14:17:31,848] INFO - prefect.TaskRunner | Task 'method': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,857] INFO - prefect.TaskRunner | Task 'percent': Starting task run...
[2019-10-22 14:17:31,862] INFO - prefect.TaskRunner | Task 'percent': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,872] INFO - prefect.TaskRunner | Task 'samples': Starting task run...
[2019-10-22 14:17:31,877] INFO - prefect.TaskRunner | Task 'samples': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,887] INFO - prefect.TaskRunner | Task 'get_data': Starting task run...
[2019-10-22 14:17:31,895] INFO - prefect.TaskRunner | Task 'get_data': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,910] INFO - prefect.TaskRunner | Task 'get_gamma_init': Starting task run...
[2019-10-22 14:17:31,918] INFO - prefect.TaskRunner | Task 'get_gamma_init': finished task run for task with final state: 'Success'
[2019-10-22 14:17:31,921] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded

<Task: get_gamma_init>

<Success: "All reference tasks succeeded.">

4. Calculate HSIC¶

# hsic parameters
# hsic parameters
kernel = 'rbf'
scorer = 'hsic'
subsample = None
bias = True

# Recompute gamma from the mean sigma: the Flow cell above rebinds
# `gamma_init` to a Prefect Task, which cannot be used as a kernel parameter.
gamma_init = sigma_to_gamma(sigma_init)

# initialize HSIC model
clf_hsic = HSIC(
    gamma=gamma_init,
    kernel=kernel,
    scorer=scorer,
    subsample=subsample,
    bias=bias
)

# fit model to data
clf_hsic.fit(X, Y)

# get hsic value
hsic_value = clf_hsic.score(X)

print('HSIC: ', hsic_value)

HSIC:  0.004420621559606378

@task
def get_hsic(data, scorer: str, gamma_init) -> dict:
    """Fit an RBF-kernel HSIC model on X and Y and return its score.

    Parameters
    ----------
    data : dict
        mapping holding the two input arrays under the keys ``'X'``
        and ``'Y'`` (the layout returned by ``get_data``)

    scorer : str, 
        the scorer to calculate the hsic
        * hsic - HSIC method
        * tka  - target kernel alignment
        * ctka - centered target kernel alignment

    gamma_init : dict
        mapping holding the initial gamma parameter under the key
        ``'gamma_init'`` (the layout returned by ``get_gamma_init``)

    Returns
    -------
    result : dict
        ``{"hsic_value": hsic_value}`` where ``hsic_value`` is the
        value calculated from the scorer
    """
    # hsic parameters
    kernel = 'rbf'
    subsample = None 
    bias = True

    # initialize HSIC model
    clf_hsic = HSIC(
        gamma=gamma_init['gamma_init'],
        kernel=kernel,
        scorer=scorer,
        subsample=subsample,
        bias=bias
    )

    # fit model to data
    clf_hsic.fit(data['X'], data['Y'])

    # get hsic value
    hsic_value = clf_hsic.score(data['X'])
    return {"hsic_value":hsic_value}

# Full pipeline: load the dataset, estimate the initial gamma, score with HSIC.
# NOTE: this rebinds the module-level names used below (including `scorer`,
# `gamma_init` and `hsic_value`) to Prefect tasks.
with Flow("Run experiment") as flow:

    # Data Params
    # each Parameter is a flow input; `default` is used when flow.run()
    # is called without an override for it
    dataset = Parameter("dataset", default='gauss')
    samples = Parameter("samples", default=100)
    dimensions = Parameter("dimensions", default=2)
    std = Parameter("std", default=1)
    trial = Parameter("trial", default=1)

    # Load Data
    data = get_data(dataset, samples, dimensions, std, trial)

    # Gamma Parameters
    method = Parameter("method", default='median')
    percent = Parameter("percent", default=0.2)

    # get gamma
    gamma_init = get_gamma_init(data, method, percent)

    # HSIC Params
    scorer = Parameter("scorer", default='hsic')

    # Get HSIC value
    # consumes the outputs of both upstream tasks
    hsic_value = get_hsic(data, scorer, gamma_init)

    # Save data


# run the flow once per sample size, overriding only the 'samples' parameter
for isamples in [50, 100]:
    flow.run(parameters={'samples': isamples})

[2019-10-22 14:24:44,522] INFO - prefect.FlowRunner | Beginning Flow run for 'Run experiment'
[2019-10-22 14:24:44,526] INFO - prefect.FlowRunner | Starting flow run.
[2019-10-22 14:24:44,541] INFO - prefect.TaskRunner | Task 'samples': Starting task run...
[2019-10-22 14:24:44,547] INFO - prefect.TaskRunner | Task 'samples': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,558] INFO - prefect.TaskRunner | Task 'std': Starting task run...
[2019-10-22 14:24:44,564] INFO - prefect.TaskRunner | Task 'std': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,574] INFO - prefect.TaskRunner | Task 'trial': Starting task run...
[2019-10-22 14:24:44,579] INFO - prefect.TaskRunner | Task 'trial': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,586] INFO - prefect.TaskRunner | Task 'scorer': Starting task run...
[2019-10-22 14:24:44,590] INFO - prefect.TaskRunner | Task 'scorer': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,597] INFO - prefect.TaskRunner | Task 'method': Starting task run...
[2019-10-22 14:24:44,601] INFO - prefect.TaskRunner | Task 'method': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,607] INFO - prefect.TaskRunner | Task 'dataset': Starting task run...
[2019-10-22 14:24:44,612] INFO - prefect.TaskRunner | Task 'dataset': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,618] INFO - prefect.TaskRunner | Task 'percent': Starting task run...
[2019-10-22 14:24:44,622] INFO - prefect.TaskRunner | Task 'percent': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,629] INFO - prefect.TaskRunner | Task 'dimensions': Starting task run...
[2019-10-22 14:24:44,633] INFO - prefect.TaskRunner | Task 'dimensions': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,639] INFO - prefect.TaskRunner | Task 'get_data': Starting task run...
[2019-10-22 14:24:44,648] INFO - prefect.TaskRunner | Task 'get_data': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,663] INFO - prefect.TaskRunner | Task 'get_gamma_init': Starting task run...
[2019-10-22 14:24:44,670] INFO - prefect.TaskRunner | Task 'get_gamma_init': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,683] INFO - prefect.TaskRunner | Task 'get_hsic': Starting task run...
[2019-10-22 14:24:44,691] INFO - prefect.TaskRunner | Task 'get_hsic': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,693] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded
[2019-10-22 14:24:44,696] INFO - prefect.FlowRunner | Beginning Flow run for 'Run experiment'
[2019-10-22 14:24:44,699] INFO - prefect.FlowRunner | Starting flow run.
[2019-10-22 14:24:44,710] INFO - prefect.TaskRunner | Task 'samples': Starting task run...
[2019-10-22 14:24:44,715] INFO - prefect.TaskRunner | Task 'samples': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,725] INFO - prefect.TaskRunner | Task 'std': Starting task run...
[2019-10-22 14:24:44,730] INFO - prefect.TaskRunner | Task 'std': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,740] INFO - prefect.TaskRunner | Task 'trial': Starting task run...
[2019-10-22 14:24:44,745] INFO - prefect.TaskRunner | Task 'trial': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,755] INFO - prefect.TaskRunner | Task 'scorer': Starting task run...
[2019-10-22 14:24:44,760] INFO - prefect.TaskRunner | Task 'scorer': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,769] INFO - prefect.TaskRunner | Task 'method': Starting task run...
[2019-10-22 14:24:44,775] INFO - prefect.TaskRunner | Task 'method': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,784] INFO - prefect.TaskRunner | Task 'dataset': Starting task run...
[2019-10-22 14:24:44,789] INFO - prefect.TaskRunner | Task 'dataset': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,799] INFO - prefect.TaskRunner | Task 'percent': Starting task run...
[2019-10-22 14:24:44,804] INFO - prefect.TaskRunner | Task 'percent': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,813] INFO - prefect.TaskRunner | Task 'dimensions': Starting task run...
[2019-10-22 14:24:44,818] INFO - prefect.TaskRunner | Task 'dimensions': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,828] INFO - prefect.TaskRunner | Task 'get_data': Starting task run...
[2019-10-22 14:24:44,836] INFO - prefect.TaskRunner | Task 'get_data': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,851] INFO - prefect.TaskRunner | Task 'get_gamma_init': Starting task run...
[2019-10-22 14:24:44,859] INFO - prefect.TaskRunner | Task 'get_gamma_init': finished task run for task with final state: 'Success'
[2019-10-22 14:24:44,873] INFO - prefect.TaskRunner | Task 'get_hsic': Starting task run...
[2019-10-22 14:24:44,886] INFO - prefect.TaskRunner | Task 'get_hsic': finished task run for task with final state: 'Success'

0.002959198175196714
0.0011819445258014604

[2019-10-22 14:24:44,889] INFO - prefect.FlowRunner | Flow run SUCCESS: all reference tasks succeeded

Experiment I - Different Scorers¶

We are looking at different "HSIC scorers". They are:

HSIC

$HSIC = \frac{1}{n(n-1)}\langle K_xH,K_yH \rangle_F$

Notice: we have the centered kernels, $K_xH$ and $K_yH$, and no normalization.

TKA

$TKA = \frac{\langle K_x,K_y \rangle_F}{||K_x||_F||K_y||_F}$

Notice: We have the uncentered kernels and a normalization factor.

cTKA

$cTKA = \frac{\langle K_xH,K_yH \rangle_F}{||K_xH||_F||K_yH||_F}$

Notice: We have the centered kernels and a normalization factor.

# experimental parameters
# experimental parameters
method = 'belkin'
percent = 0.2

# The tasks take the dataset as a single dict, and outside of a Flow they
# have to be executed through `.run`, which calls the wrapped function.
xy_data = {'X': X, 'Y': Y}

hsic_values = dict()


for iscorer in ['hsic', 'tka', 'ctka']:

    # get initial gamma (returned as {'gamma_init': value})
    gamma_out = get_gamma_init.run(xy_data, method, percent)

    # get HSIC value (returned as {'hsic_value': value})
    hsic_values[iscorer] = get_hsic.run(xy_data, iscorer, gamma_out)['hsic_value']

print(hsic_values)

{'hsic': 0.004420621559606378, 'tka': 0.543442521262865, 'ctka': 0.06751049509744558}

Experiment II - Different Scorers, Initializers¶

# dataset params
dataset = 'gauss'
samples = 100
dimensions = 2
std = 1
trial = 1

# extract dataset
X, Y, mi_val = MIData(dataset).get_data(samples=samples, dimensions=dimensions, std=std, trial=trial)

# The tasks take the dataset as a single dict, and outside of a Flow they
# have to be executed through `.run`, which calls the wrapped function.
xy_data = {'X': X, 'Y': Y}

# experimental parameters
scorers = ['hsic', 'tka', 'ctka']
gamma_methods = [
    ('silverman',None),
    ('scott', None),
    ('median', None),
    ('belkin', 0.1),
    ('belkin', 0.2),
    ('belkin', 0.4),
    ('belkin', 0.8),
]

# Collect one record per run and build the frame once at the end:
# DataFrame.append copies the whole frame on every call and no longer
# exists in pandas 2.0.
records = []


# run experiment
for iscorer in scorers:
    for imethod in gamma_methods:

        # initialize gamma (task output is {'gamma_init': value})
        gamma_out = get_gamma_init.run(xy_data, imethod[0], imethod[1])
        gamma_init = gamma_out['gamma_init']

        # get hsic_value (task output is {'hsic_value': value})
        hsic_value = get_hsic.run(xy_data, iscorer, gamma_out)['hsic_value']

        # store the result of this run
        records.append({
            'scorer': iscorer,
            'gamma_method': f"{imethod[0]}_{imethod[1]}" if imethod[1] is not None else f"{imethod[0]}",
            'gamma_init': gamma_init,
            'hsic_value': hsic_value
        })

# results dataframe
results_df = pd.DataFrame(records)

results_df.head()

	gamma_init	gamma_method	hsic_value	scorer
0	2.320794	silverman	0.008088	hsic
1	2.320794	scott	0.008088	hsic
2	0.185138	median	0.001182	hsic
3	1.239664	belkin_0.1	0.006772	hsic
4	0.596976	belkin_0.2	0.004421	hsic

# plot the results

def plot_scorer(scorer:str)-> None:
    """Scatter the score against the initial gamma for one scorer.

    Reads the module-level ``results_df``, keeps the rows whose
    ``'scorer'`` column equals ``scorer`` and colours the points by
    ``'gamma_method'``.

    Parameters
    ----------
    scorer : str
        value of the ``'scorer'`` column to plot; its upper-cased form
        is used as the title
    """
    # rows belonging to the requested scorer
    subset = results_df[results_df['scorer'] == scorer]

    # a single 5x5 inch axes
    fig, ax = plt.subplots(figsize=(5, 5))

    sns.scatterplot(
        data=subset,
        x='gamma_init',
        y='hsic_value',
        hue='gamma_method',
        ax=ax,
    )

    ax.set_xlabel('Gamma Initialization')
    ax.set_ylabel('Score')
    ax.set_title(scorer.upper())
    ax.legend(prop={'size':9})
    plt.show()

plot_scorer('hsic')
plot_scorer('tka')
plot_scorer('ctka')

Experiment III - Different Scorers, Initializations and Degrees of Freedom¶

In this experiment, we look at how the HSIC values change depending on the gamma initialization as well as the degree of freedom we choose. For the Gaussian distribution, this is the standard deviation, $\sigma$; for the T-Student distribution, it is the $\nu$ parameter.

from tqdm import trange, tqdm

# dataset params
dataset = 'gauss'
samples = 100
dimensions = 2
std = 1
trial = 1



# experimental parameters
scorers = ['hsic', 'tka', 'ctka']
gamma_methods = [
    ('silverman',None),
    ('scott', None),
    ('median', None),
    ('belkin', 0.1),
    ('belkin', 0.2),
    ('belkin', 0.4),
    ('belkin', 0.8),
    ('max', None)
]
dof_params = np.linspace(1,11, 11, endpoint=True)

# Collect one record per run and build the frame once at the end:
# DataFrame.append copies the whole frame on every call and no longer
# exists in pandas 2.0.
records = []


# run experiment
with tqdm(gamma_methods) as gamma_bar:
    for imethod in gamma_bar:
        for iscorer in scorers:
            for idof in dof_params:

                # extract dataset
                X, Y, mi_val = MIData(dataset).get_data(samples=samples, dimensions=dimensions, std=int(idof), trial=trial)

                # The tasks take the dataset as a single dict, and outside
                # of a Flow they have to be executed through `.run`.
                xy_data = {'X': X, 'Y': Y}

                # initialize gamma
                if imethod[0] == 'max':
                    # NOTE(review): train_rbf_hsic is neither defined nor
                    # imported in the visible part of this file - confirm
                    # where it comes from.
                    clf_hsic = train_rbf_hsic(X, Y, iscorer, 50, 1, 'median')
                    hsic_value = clf_hsic.score(X)
                    # no initial gamma is computed on this branch; record
                    # NaN instead of the stale value of the previous run
                    gamma_init = np.nan
                else:
                    gamma_out = get_gamma_init.run(xy_data, imethod[0], imethod[1])
                    gamma_init = gamma_out['gamma_init']
                    hsic_value = get_hsic.run(xy_data, iscorer, gamma_out)['hsic_value']

                # store the result of this run
                records.append({
                    'scorer': iscorer,
                    'gamma_method': f"{imethod[0]}_{imethod[1]}" if imethod[1] is not None else f"{imethod[0]}",
                    'gamma_init': gamma_init,
                    'hsic_value': hsic_value,
                    'std': idof,
                    'mi_value': mi_val,
                })

# results dataframe
results_df = pd.DataFrame(records)

results_df.head()

100%|██████████| 8/8 [00:11<00:00,  1.44s/it]

	gamma_init	gamma_method	hsic_value	mi_value	scorer	std
0	2.320794	silverman	0.008088	0.000000	hsic	1.0
1	2.320794	silverman	0.008068	0.002053	hsic	2.0
2	2.320794	silverman	0.008096	0.007718	hsic	3.0
3	2.320794	silverman	0.008163	0.016467	hsic	4.0
4	2.320794	silverman	0.008259	0.027999	hsic	5.0

# plot the results

def plot_scorer_mi(df: pd.DataFrame, scorer:str)-> None:
    """Scatter mutual information against the score for one scorer.

    Keeps the rows of ``df`` whose ``'scorer'`` column equals ``scorer``
    and colours the points by ``'gamma_method'``.

    Parameters
    ----------
    df : pd.DataFrame
        results table; the columns ``'scorer'``, ``'hsic_value'``,
        ``'mi_value'`` and ``'gamma_method'`` are read

    scorer : str
        value of the ``'scorer'`` column to plot; its upper-cased form
        is used as the title
    """
    # rows belonging to the requested scorer
    subset = df[df['scorer'] == scorer]

    # a single 5x5 inch axes
    fig, ax = plt.subplots(figsize=(5, 5))

    sns.scatterplot(
        data=subset,
        x='hsic_value',
        y='mi_value',
        hue='gamma_method',
        ax=ax,
    )

    ax.set_xlabel('Score')
    ax.set_ylabel('Mutual Information')
    ax.set_title(scorer.upper())
    ax.legend(prop={'size':9})
    plt.show()

plot_scorer_mi(results_df, 'hsic')
plot_scorer_mi(results_df, 'tka')
plot_scorer_mi(results_df, 'ctka')

Same Experiment, but with a higher number of samples and dimensions¶

from itertools import product

# dataset params
dataset = 'gauss'
samples = [1_000]
dimensions = [50]
trials = [1]



# experimental parameters
scorers = ['hsic', 'tka', 'ctka']
gamma_methods = [
    ('silverman',None),
    ('scott', None),
    ('median', None),
    ('belkin', 0.1),
    ('belkin', 0.2),
    ('belkin', 0.4),
    ('belkin', 0.8),
    ('max', None)
]
dof_params = np.linspace(1,11, 11, endpoint=True)

# Collect one record per run and build the frame once at the end:
# DataFrame.append copies the whole frame on every call and no longer
# exists in pandas 2.0.
records = []


# run experiment
with tqdm(gamma_methods) as gamma_bar:
    for imethod in gamma_bar:
        # product() walks the grid in the same order as nested loops would
        # (rightmost argument varies fastest)
        for isample, idim, itrial, iscorer, idof in product(
            samples, dimensions, trials, scorers, dof_params
        ):

            # extract dataset
            X, Y, mi_val = MIData(dataset).get_data(samples=isample, dimensions=idim, std=int(idof), trial=itrial)

            # The tasks take the dataset as a single dict, and outside of
            # a Flow they have to be executed through `.run`.
            xy_data = {'X': X, 'Y': Y}

            # initialize gamma
            print(imethod[0], imethod[1])
            if imethod[0] == 'max':
                # NOTE(review): train_rbf_hsic is neither defined nor
                # imported in the visible part of this file - confirm
                # where it comes from.
                clf_hsic = train_rbf_hsic(X, Y, iscorer, 50, 1, 'median')
                hsic_value = clf_hsic.score(X)
                # no initial gamma is computed on this branch; record NaN
                # instead of the stale value of the previous run
                gamma_init = np.nan
            else:
                gamma_out = get_gamma_init.run(xy_data, imethod[0], imethod[1])
                gamma_init = gamma_out['gamma_init']
                hsic_value = get_hsic.run(xy_data, iscorer, gamma_out)['hsic_value']

            # store the result of this run
            records.append({
                'samples': isample,
                'dimensions': idim,
                'trial': itrial,
                'scorer': iscorer,
                'gamma_method': f"{imethod[0]}_{imethod[1]}" if imethod[1] is not None else f"{imethod[0]}",
                'gamma_init': gamma_init,
                'hsic_value': hsic_value,
                'std': idof,
                'mi_value': mi_val,
            })

# results dataframe
results_df = pd.DataFrame(records)

results_df.head()

  0%|          | 0/8 [00:00<?, ?it/s]

silverman None

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-87-b149632f687f> in <module>
     44                             else:
     45 
---> 46                                 gamma_init = get_gamma_init(X, Y, imethod[0], imethod[1])
     47                                 hsic_value = get_hsic(X, Y, iscorer, gamma_init)
     48 

~/.conda/envs/it4dnn/lib/python3.6/site-packages/prefect/core/task.py in __call__(self, mapped, task_args, upstream_tasks, flow, *args, **kwargs)
    362         new = self.copy(**(task_args or {}))
    363         new.bind(
--> 364             *args, mapped=mapped, upstream_tasks=upstream_tasks, flow=flow, **kwargs
    365         )
    366         return new

~/.conda/envs/it4dnn/lib/python3.6/site-packages/prefect/core/task.py in bind(self, mapped, upstream_tasks, flow, *args, **kwargs)
    402         # this will raise an error if callargs weren't all provided
    403         signature = inspect.signature(self.run)
--> 404         callargs = dict(signature.bind(*args, **kwargs).arguments)  # type: Dict
    405 
    406         # bind() compresses all variable keyword arguments under the ** argument name,

~/.conda/envs/it4dnn/lib/python3.6/inspect.py in bind(*args, **kwargs)
   2995         if the passed arguments can not be bound.
   2996         """
-> 2997         return args[0]._bind(args[1:], kwargs)
   2998 
   2999     def bind_partial(*args, **kwargs):

~/.conda/envs/it4dnn/lib/python3.6/inspect.py in _bind(self, args, kwargs, partial)
   2916                     param = next(parameters)
   2917                 except StopIteration:
-> 2918                     raise TypeError('too many positional arguments') from None
   2919                 else:
   2920                     if param.kind in (_VAR_KEYWORD, _KEYWORD_ONLY):

TypeError: too many positional arguments

res_high_df = results_df.copy()

plot_scorer_mi(res_high_df, 'hsic')
plot_scorer_mi(res_high_df, 'tka')
plot_scorer_mi(res_high_df, 'ctka')