Skip to content

Experiment III - Samples vs Dimensions

import sys, os
import warnings
import tqdm
import random
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

# Make the project's `src` directory importable. The path is built relative
# to the current working directory (two levels up, then `src`) and is put at
# the front of `sys.path`.
cwd = os.getcwd()
path = f"{cwd}/../../src"
sys.path.insert(0, path)

# toy datasets
from data.toy import RBIGData

# Kernel Dependency measure
from models.dependence import HSIC, train_rbf_hsic
from models.kernel import estimate_sigma, sigma_to_gamma, gamma_to_sigma, get_param_grid

# RBIG IT measures
from models.ite_algorithms import run_rbig_models

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

warnings.filterwarnings('ignore') # get rid of annoying warnings

# IPython magics: load the `autoreload` extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2
# NOTE(review): `dataset` and `info_meas` are first assigned further down in
# this file, so on a fresh top-to-bottom run this line would raise NameError.
# It looks like a leftover cell (the same construction is repeated after the
# parameters are defined) -- confirm and consider removing.
clf_rbigdata = RBIGData(dataset=dataset, info_meas=info_meas)

Datasets

* Samples - [500, 1K, 5K, 10K, 30K, 50K]
* Dimensions - [2, 3, 10, 50, 100]
* Trials - [1,5]
* IT measures - [TC, H, MI, KLD]
* Distributions - [Linear, Gaussian, T-Student]

# Which toy dataset to load and which information measure it belongs to.
dataset, info_meas = 'gauss', 'mi'

# Dataset generator for that (dataset, measure) pair.
clf_rbigdata = RBIGData(dataset=dataset, info_meas=info_meas)

# Configuration of the single sample set pulled below.
d_dimensions, n_samples, t_trials = 2, 500, 2
nu = mu = None

# Fetch the data for that configuration and unpack the two variables.
data = clf_rbigdata.get_data(
    d_dimensions=d_dimensions, n_samples=n_samples, t_trials=t_trials, nu=nu, mu=mu
)
X = data['X']
Y = data['Y']
# Configuration whose stored results are requested from the generator.
d_dimensions, n_samples, t_trials = 2, 500, 2
nu, mu = 1, 4

# Ask the dataset generator for the results of that configuration.
results = clf_rbigdata.get_results(
    d_dimensions=d_dimensions, n_samples=n_samples, t_trials=t_trials, nu=nu, mu=mu
)
DATA_MI_tstu_nd_2_Ns_500_tryal_2_nu_1.mat

HSIC Measures

# Settings for the HSIC estimator itself.
kernel, subsample, bias = 'rbf', None, True
scorers = ['hsic', 'tka', 'ctka']

# Settings forwarded to the gamma search in `train_rbf_hsic`.
n_gamma, sigma_est, factor = 1000, 'mean', 1
verbose, n_jobs, cv = 1, -1, 2

# Run the same search once per scorer on the X, Y loaded above.
for iscorer in scorers:
    print(f'Scorer: {iscorer}')

    # fresh estimator configured for this scorer
    clf_hsic = HSIC(kernel=kernel, scorer=iscorer, subsample=subsample, bias=bias)

    # search over gamma; the fitted estimator replaces the fresh one
    clf_hsic = train_rbf_hsic(
        X,
        Y,
        clf_hsic,
        n_gamma=n_gamma,
        factor=factor,
        sigma_est=sigma_est,
        verbose=verbose,
        n_jobs=n_jobs,
        cv=cv,
    )

#     # hsic value and kernel alignment score
#     hsic_val = clf_hsic.hsic_value
Scorer: hsic
Fitting 2 folds for each of 1000 candidates, totalling 2000 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 151 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 28 concurrent workers.
Best HSIC score: 0.00515
gamma: 0.384
Scorer: tka
Fitting 2 folds for each of 1000 candidates, totalling 2000 fits
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 28 concurrent workers.
Best TKA score: 0.99683
gamma: 0.005
Scorer: ctka
Fitting 2 folds for each of 1000 candidates, totalling 2000 fits
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:    0.4s
Best CTKA score: 0.70761
gamma: 50.000
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed:    1.3s finished

Experiment Class

class LargeScaleKTA:
    """Grid experiment scoring toy datasets with several HSIC-style scorers.

    For every combination of dataset, sample count, dimensionality and trial
    (plus ``nu`` for the ``'tstudent'`` dataset) the data are fetched through
    ``RBIGData``, each scorer in ``self.scorers`` is fitted with
    ``train_rbf_hsic``, and one row per scorer is added to ``self.results_df``.
    The table is written to CSV after every added row.

    Parameters
    ----------
    seed : int, default=123
        Stored on the instance; not used by any method of this class.
    n_gamma : int, default=100
        Forwarded to ``train_rbf_hsic`` as ``n_gamma``.
    factor : int, default=1
        Forwarded to ``train_rbf_hsic`` as ``factor``.
    sigma_est : str, default='silverman'
        Forwarded to ``train_rbf_hsic`` as ``sigma_est``.
    save_path : str or None, default=None
        Directory the CSV is written to. ``None`` writes relative to the
        current working directory.
    save_name : str, default='scale_test'
        CSV file name without the ``.csv`` extension.
    """

    def __init__(
        self,
        seed=123,
        n_gamma=100,
        factor=1,
        sigma_est='silverman',
        save_path=None,
        save_name='scale_test'
    ):
        # fixed experimental params
        self.seed = seed
        self.n_gamma = n_gamma
        self.factor = factor
        self.sigma_est = sigma_est
        self.save_path = save_path
        self.save_name = save_name

        # experiment grid
        self.datasets = ['gauss', 'tstudent']
        self.nus = [1, 2, 3]
        self.trials = [1, 2, 3, 4, 5]
        # larger sizes (30_000, 50_000) are currently left out of the grid
        self.n_samples = [500, 1_000, 5_000, 10_000]
        self.d_dimensions = [2, 3, 10, 50, 100]

        # free experimental params
        self.scorers = ['hsic', 'tka', 'ctka']

    def run_experiment(self):
        """Run the full grid and return ``self``.

        ``self.results_df`` is re-created empty at the start and filled as the
        grid is traversed.

        Raises
        ------
        ValueError
            If ``self.datasets`` contains a name other than ``'gauss'`` or
            ``'tstudent'``.
        """
        # initialize results dataframe
        self.results_df = self.generate_results_df()

        for idataset in self.datasets:
            print(f"Function: {idataset}")

            # Only the t-Student data has a `nu` axis; the Gaussian data is
            # requested with nu=None and recorded with nu=NaN.
            if idataset == 'tstudent':
                nus = self.nus
            elif idataset == 'gauss':
                nus = [None]
            else:
                raise ValueError(f"Unrecognized dataset: {idataset}")

            for isample in self.n_samples:
                for idimension in self.d_dimensions:
                    for itrial in self.trials:
                        for inu in nus:
                            self._run_configuration(
                                idataset, isample, idimension, itrial, inu
                            )

        return self

    def _run_configuration(self, dataset, n_samples, d_dimensions, trial, nu):
        """Score one grid point with every scorer and record the results."""
        X, Y = self._get_data(dataset, 'mi', d_dimensions, n_samples, trial, nu)
        recorded_nu = np.nan if nu is None else nu

        for hsic_method in self.scorers:
            hsic_score, gamma = self._get_hsic(X, Y, hsic_method)

            self.results_df = self.append_results(
                self.results_df,
                dataset,
                trial,
                n_samples,
                d_dimensions,
                recorded_nu,
                gamma,
                hsic_method,
                hsic_score,
            )

            # checkpoint after every row so partial runs are still on disk
            self.save_data(self.results_df)

    def _get_data(self, dataset, info_meas, dimensions, samples, trials, nu):
        """Return the ``(X, Y)`` pair ``RBIGData`` provides for one setting."""
        clf_rbigdata = RBIGData(dataset=dataset, info_meas=info_meas)

        data = clf_rbigdata.get_data(
            d_dimensions=dimensions,
            n_samples=samples,
            t_trials=trials,
            nu=nu,
        )
        return data['X'], data['Y']

    def _get_hsic(self, X, Y, scorer):
        """Fit an RBF ``HSIC`` estimator and return ``(hsic_value, gamma)``."""
        clf_hsic = HSIC(
            kernel='rbf',
            scorer=scorer,
            subsample=None,
            bias=True
        )

        clf_hsic = train_rbf_hsic(
            X, Y,
            clf_hsic,
            n_gamma=self.n_gamma,
            factor=self.factor,
            sigma_est=self.sigma_est,
            verbose=0,
            n_jobs=-1,
            cv=3
        )
        return clf_hsic.hsic_value, clf_hsic.gamma

    def generate_results_df(self):
        """Return an empty results table with the experiment's columns."""
        return pd.DataFrame(columns=[
            'dataset',
            'trial',
            'n_samples',
            'd_dimensions',
            'nu',
            'gamma',
            'scorer',
            'value',
        ])

    def append_results(
        self,
        results_df,
        dataset,
        trial,
        n_samples,
        d_dimensions,
        nu,
        gamma,
        hsic_method,
        hsic_score,
    ):
        """Return a new table equal to ``results_df`` plus one result row.

        ``results_df`` itself is not modified, and the returned table has a
        fresh 0..n-1 index.
        """
        new_row = pd.DataFrame([{
            'dataset': dataset,
            'trial': trial,
            'n_samples': n_samples,
            'd_dimensions': d_dimensions,
            'nu': nu,
            'gamma': gamma,
            'scorer': hsic_method,
            'value': hsic_score,
        }])

        if len(results_df) == 0:
            # Nothing to concatenate with: keep the table's column order and
            # let the first row define the dtypes.
            extra = [c for c in new_row.columns if c not in results_df.columns]
            return new_row.reindex(columns=list(results_df.columns) + extra)

        # DataFrame.append was removed in pandas 2.0; concat is its replacement.
        return pd.concat([results_df, new_row], ignore_index=True)

    def load_data(self):
        """Placeholder; loading saved results is not implemented."""
        pass

    def save_data(self, results_df):
        """Write ``results_df`` to ``<save_path>/<save_name>.csv``."""
        # Joining (instead of string concatenation) keeps a None save_path
        # from ending up in the file name as the literal text "None".
        directory = "" if self.save_path is None else self.save_path
        results_df.to_csv(os.path.join(directory, f"{self.save_name}.csv"))
# Settings for the large-scale run.
seed, n_gamma, factor = 123, 100, 1    # reproducibility, gamma grid size, log factor for the grid bounds
sigma_est = 'mean'                     # sigma initialization
save_path, save_name = f'{cwd}/../../results/hsic/', 'trial_large_v1'

# Build the experiment with those settings.
clf_exp = LargeScaleKTA(
    seed=seed,
    n_gamma=n_gamma,
    factor=factor,
    sigma_est=sigma_est,
    save_path=save_path,
    save_name=save_name,
)

# Sweep the whole grid.
clf_exp.run_experiment()
Function: gauss