Skip to content

Walkthrough - Gamma in the Distribution Sapce

import sys, os

# Insert path to model directory,.
cwd = os.getcwd()
path = f"{cwd}/../../src"
sys.path.insert(0, path)

# Insert path to package,.
pysim_path = f"/home/emmanuel/code/pysim/"
sys.path.insert(0, pysim_path)

import warnings
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state

# toy datasets
from data.distribution import DataParams

from features.utils import dict_product


# Kernel Dependency measure
from pysim.kernel.hsic import HSIC
from pysim.kernel.utils import estimate_gamma, GammaParam, SigmaParam

# RBIG IT measures
# from models.ite_algorithms import run_rbig_models
from models.dependence import HSICModel

# Plotting
from visualization.distribution import plot_scorer

# experiment helpers
from tqdm import tqdm

# Plotting Procedures
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['seaborn-paper'])
%matplotlib inline

warnings.filterwarnings('ignore') # get rid of annoying warnings

%load_ext autoreload
%autoreload 2
FIG_PATH = "/home/emmanuel/projects/2019_hsic_align/results/figures/distribution_experiment/gamma_space/"
RES_PATH = "/home/emmanuel/projects/2019_hsic_align/data/results/distributions/gamma_space/"

Datasets

  • Samples - [500, 1K, 5K, 10K, 30K, 50K]
  • Dimensions - [ 2, 3, 10, 50, 100]
  • trials - 1:5
  • IT measures - Mutual Information
  • Distributions - [Gaussian, T-Student]

Functions

# HSIC FUNCTION
def get_hsic(X, Y, scorer: str, sigma_X, sigma_Y):

    # init hsic model class
    hsic_model = HSICModel()

    # hsic model params
    hsic_model.kernel_X = RBF(sigma_X)
    hsic_model.kernel_Y = RBF(sigma_Y)

    # get hsic score
    hsic_val = hsic_model.get_score(inputs.X, inputs.Y, scorer)

    return hsic_val

Example Gaussian Distribution: 2D

def plot_data(X):
    # plot data
    fig  = plt.figure()

    g = sns.jointplot(
        x=X[:, 0],
        y=X[:, 1],
    )
    plt.show()

Param I: Standardize or Non-Standardized Data

\bar{x} = \frac{x - \mu_x}{\sigma_x}

a) Not Standardized Data

# initialize Data Params class
example_params = DataParams(standardize=False)

# generate data from params
inputs_nstd = example_params.generate_data()

plot_data(inputs_nstd.X)
plt.gcf()
plt.savefig(f"{FIG_PATH}1_dataX_nstd.png")
plot_data(inputs_nstd.Y)
plt.gcf()
plt.savefig(f"{FIG_PATH}1_dataY_nstd.png")
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

b) Standardized Data

# initialize Data Params class
example_params = DataParams(standardize=True)

# generate data from params
inputs_std = example_params.generate_data()

plot_data(inputs_std.X)
plt.gcf()
plt.savefig(f"{FIG_PATH}1_dataX_std.png")
plot_data(inputs_std.Y)
plt.gcf()
plt.savefig(f"{FIG_PATH}1_dataY_std.png")
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

HSIC Algorithms

  • Hilbert-Schmidt Independence Criterion (HSIC)
  • Kernel Alignment (KA)
  • Centered Kernel Alignment (CKA)

HSIC

algorithm path: src/models/dependence.py

2. Estimate HSIC

2.1 - Same Length Scale

# init gamma estimator
sigma_estimator = SigmaParam(method='median', percent=0.5, per_dimension=False)
# estimate sigma
sigma_X = sigma_estimator.estimate_sigma(X=inputs.X, )
sigma_Y = sigma_estimator.estimate_sigma(X=inputs.Y, )

print(f'Sigma_x: ', sigma_X)
print(f'Sigma_y: ', sigma_Y)

# init hsic model class
hsic_model = HSICModel()

# hsic model params
hsic_model.kernel_X = RBF(sigma_X)
hsic_model.kernel_Y = RBF(sigma_Y)

# get hsic score
hsic_val = hsic_model.get_score(inputs.X, inputs.Y, 'hsic')

print(f"HSIC: {hsic_val:.5f}")
Sigma_x:  1.6287986984984644
Sigma_y:  1.6423135950442704
HSIC: 0.00136

2.2 - Different Length Scale

from sklearn.gaussian_process.kernels import RBF
# init gamma estimator
sigma_estimator = SigmaParam(method='median', percent=0.5, per_dimension=True)
# estimate sigma
sigma_X = sigma_estimator.estimate_sigma(X=inputs.X, )
sigma_Y = sigma_estimator.estimate_sigma(X=inputs.Y, )

print(f'Sigma_x: ', sigma_X)
print(f'Sigma_y: ', sigma_Y)

# init hsic model class
hsic_model = HSICModel(kernel_X = RBF(sigma_X), kernel_Y = RBF(sigma_Y))

# get hsic score
hsic_val = hsic_model.get_score(inputs.X, inputs.Y, 'hsic')

print(f"HSIC: {hsic_val:.5f}")
Sigma_x:  [0.90645763 0.79282285]
Sigma_y:  [0.8172308  0.89065151]
HSIC: 0.00512

Experiment I - Different Scorers

We are looking at different "HSIC scorers". They are:


HSIC

HSIC = \frac{1}{n(n-1)}\langle K_xH,K_yH \rangle_F

Notice: we have the centered kernels, K_xH and no normalization.


TKA

TKA = \frac{\langle K_x,K_y \rangle_F}{||K_x||_F||K_y||_F}

Notice: We have the uncentered kernels and a normalization factor.


cTKA

cTKA = \frac{\langle K_xH,K_yH \rangle_F}{||K_xH||_F||K_yH||_F}

Notice: We have the centered kernels and a normalization factor.

# init gamma estimator
sigma_estimator = SigmaParam(method='median', percent=0.5, per_dimension=True)
# estimate sigma
sigma_X = sigma_estimator.estimate_sigma(X=inputs.X, )
sigma_Y = sigma_estimator.estimate_sigma(X=inputs.Y, )

print(f'Sigma_x: ', sigma_X)
print(f'Sigma_y: ', sigma_Y)


# experimental parameters
hsic_model = HSICModel()

# init hsic model class
hsic_model = HSICModel()

# hsic model params
hsic_model.kernel_X = RBF(sigma_X)
hsic_model.kernel_Y = RBF(sigma_Y)




scores = dict()

# loop through scorers
for iscorer in ['hsic', 'ka', 'cka']:

    # calculate hsic score
    iscore = hsic_model.get_score(inputs_std.X, inputs_std.Y, iscorer)

    # get HSIC value
    print(f'{iscorer.upper()}: {iscore:.4f}')
Sigma_x:  [0.90645763 0.79282285]
Sigma_y:  [0.8172308  0.89065151]
HSIC: 0.0051
KA: 0.4938
CKA: 0.0792

There is an obvious difference between each of them even though the distribution hasn't changed at all. It's clear that each of the methods have a slightly different form. So perhaps it's something to do with the parameters we've chosen. We need to investigate things further.

Experiment II - Gamma Space

Method 2 - Multiprocessing

from experiments.utils import dict_product, run_parallel_step
# Data Parameters
example_data = DataParams()
example_data.std = 10
inputs = example_data.generate_data()


# experimental parameters
n_grid_points = 20
gamma_grid = np.logspace(-3, 3, n_grid_points)
parameters = {
    "scorer": ['hsic', 'ka', 'cka'],
    "gamma_X": np.copy(gamma_grid),
    "gamma_Y": np.copy(gamma_grid),
}

# create a list of all param combinations
parameters = list(dict_product(parameters))
n_params = len(parameters)
print('# of Params:', n_params)
# of Params: 1200
from typing import Dict

# define step function
def step(params: Dict, inputs):

    # init hsic model class
    hsic_model = HSICModel()

    # hsic model params
    hsic_model.gamma_X = params['gamma_X']
    hsic_model.gamma_Y = params['gamma_Y']

    # get hsic score
    score = hsic_model.get_score(inputs.X, inputs.Y, params['scorer'])

    results_df = pd.DataFrame({
        'scorer': [params['scorer']],
        'gamma_X': [params['gamma_X']],
        'gamma_Y': [params['gamma_Y']],
        'score': score,
    },)

    return results_df
verbose = 1
n_jobs = 2

results = run_parallel_step(
    exp_step=step, 
    parameters=parameters,
    n_jobs=n_jobs,
    verbose=verbose,
    inputs=inputs
)
[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 956 tasks      | elapsed:    2.0s
[Parallel(n_jobs=2)]: Done 1200 out of 1200 | elapsed:    2.5s finished
results_full_df = pd.concat(results, ignore_index=True)
results_full_df.tail()
scorer gamma_X gamma_Y score
1195 cka 1000.0 54.555948 0.875085
1196 cka 1000.0 112.883789 0.930452
1197 cka 1000.0 233.572147 0.963310
1198 cka 1000.0 483.293024 0.981687
1199 cka 1000.0 1000.000000 0.991891

Viz - Heatmap of Values

from typing import Optional

def plot_gamma_grid(grid_df: pd.DataFrame, scorer: str, ax: Optional=None):

    if ax is None:
        fig, ax = plt.subplots(figsize=(8,6.5))

    # ===========================================
    # Plot Gridded DataFrame
    # ===========================================


    # subset hsic method
    grid_df_ = grid_df[grid_df['scorer'] == scorer].drop('scorer', axis=1)

    # create a heatmap
    grid_df_ = pd.pivot_table(grid_df_, values='score', index=['gamma_Y'], columns='gamma_X')
#     print(grid_df_)
    # min max
    if scorer == 'hsic':
        vmax = 0.11
    else:
        vmax = 1.0

    # heatmap_data.columns = np.sqrt(1 / 2 * heatmap_data.columns.values)
    # heatmap_data.index = np.sqrt(1 / 2 * heatmap_data.index.values)
    pts = sns.heatmap(
        data=grid_df_,
        xticklabels=grid_df_.columns.values.round(decimals=2),
        yticklabels=grid_df_.index.values.round(decimals=2),
        vmin=0, vmax=vmax, ax=ax
    )


    return plt.gcf(), ax
# # =======================
# # HSIC
# # =======================
# scorer = 'hsic'

# fig, ax = plot_gamma_grid(param_df, scorer=scorer)

# # ax.legend(ncol=1, fontsize=15)
# ax.set_xlabel(r'$\gamma_X$')
# ax.set_ylabel(r'$\gamma_Y$')
# ax.set_title(f'{scorer.upper()} Score, MI: {inputs.mutual_info:.2f}')
# plt.tight_layout()
# plt.show()

# # =======================
# # Kernel Alignment
# # =======================
# scorer = 'ka'

# fig, ax = plot_gamma_grid(param_df, scorer=scorer)

# # ax.legend(ncol=1, fontsize=15)
# ax.set_xlabel(r'$\gamma_X$')
# ax.set_ylabel(r'$\gamma_Y$')
# ax.set_title(f'{scorer.upper()} Score, MI: {inputs.mutual_info:.2f}')
# plt.tight_layout()
# plt.show()

# ==========================
# Centered Kernel Alignment
# ==========================
scorer = 'cka'

fig, ax = plot_gamma_grid(results_full_df, scorer=scorer)

# ax.legend(ncol=1, fontsize=15)
ax.set_xlabel(r'$\gamma_X$')
ax.set_ylabel(r'$\gamma_Y$')
ax.set_title(f'{scorer.upper()} Score, MI: {inputs.mutual_info:.2f}')
plt.tight_layout()
plt.show()

Experiment III - Gamma Estimators

# Data Parameters
example_data = DataParams()
example_data.std = 10
inputs = example_data.generate_data()


# experimental parameters
gamma_estimators = [
    ('silverman',None, None),
    ('scott', None, None),
    *[('median', x, None) for x in np.arange(0.1, 1.0, 0.1, dtype=np.float64)]
]
parameters = {
    "scorer": ['hsic', 'ka', 'cka'],
    "gamma_est": gamma_estimators
}

# create a list of all param combinations
parameters = list(dict_product(parameters))
n_params = len(parameters)
print('# of Params:', n_params)
# of Params: 33
from tqdm import tqdm


# init hsic model class
hsic_model = HSICModel()

# results dataframe
gamma_ests = list()


# run experiment
for iparams in tqdm(parameters):

    # estimate gamma
    gamma_X = estimate_gamma(
        X=inputs.X, 
        method=iparams['gamma_est'][0], 
        percent=iparams['gamma_est'][1], 
        scale=iparams['gamma_est'][2]
    )
    gamma_Y = estimate_gamma(
        X=inputs.Y, 
        method=iparams['gamma_est'][0], 
        percent=iparams['gamma_est'][1], 
        scale=iparams['gamma_est'][2]
    )




    # hsic model params
    hsic_model.gamma_X = gamma_X
    hsic_model.gamma_Y = gamma_Y

    # get hsic score
    score = hsic_model.get_score(inputs.X, inputs.Y, iparams['scorer'])


#         # init gamma estimator
#         gamma_est = GammaParam(*imethod)

#         # initialize gamma
#         gamma_init = get_gamma_init(X, Y, imethod[0], imethod[1])

#         # get hsic_value
#         hsic_value = get_hsic(X, Y, iscorer, gamma_init, maximum=False)

        # append results to dataframe
    gamma_ests.append(
        pd.DataFrame({
        'scorer': [iparams['scorer']],
        'gamma_method': [iparams['gamma_est'][0]],
        'gamma_scale': [iparams['gamma_est'][2]],
        'gamma_percent': [iparams['gamma_est'][1]],
        'gamma_X': [gamma_X],
        'gamma_Y': [gamma_Y],
        'score': [score],
    }))

# results_df.head()
100%|██████████| 33/33 [00:00<00:00, 139.93it/s]
gamma_ests_df = pd.concat(gamma_ests, ignore_index=True)
gamma_ests_df.tail()
scorer gamma_method gamma_scale gamma_percent gamma_X gamma_Y score
28 cka median None 0.5 0.201849 0.240391 0.144038
29 cka median None 0.6 0.149283 0.183536 0.154599
30 cka median None 0.7 0.112474 0.142939 0.164639
31 cka median None 0.8 0.087687 0.116909 0.172534
32 cka median None 0.9 0.062305 0.082550 0.183174
from typing import List

def plot_estimated_params(
    ax, 
    gamma_est_df: pd.DataFrame, 
    gamma_estimators: List, 
    scorer: str
):
    # subset hsic method
    df_ = gamma_est_df[gamma_est_df['scorer'] == scorer]

    # subset gamma estimators
    for iestimator in gamma_estimators:
        # subsets
        sub_df = df_[df_['gamma_method'] == iestimator[0]]
        if iestimator[1] is not None:
            sub_df = sub_df[sub_df['gamma_percent'] == iestimator[1]]
        if iestimator[2] is not None:
            sub_df = sub_df[sub_df['gamma_scale'] == iestimator[2]]

        name = list(filter(None, iestimator))
        name = '_'.join(str(i) for i in name)
        ax.scatter(
            sub_df.gamma_X, 
            sub_df.gamma_Y,
            s=500, label=f"{name}", zorder=3, marker='.')


    return ax
# ==========================
# Centered Kernel Alignment
# ==========================
scorer = 'cka'

# Plot Grid
fig, ax = plot_gamma_grid(results_full_df, scorer=scorer)

# Plot points
ax = plot_estimated_params(ax, gamma_ests_df, gamma_estimators, scorer, )

ax.legend(ncol=1, fontsize=8)
ax.set_xlabel(r'$\gamma_X$')
ax.set_ylabel(r'$\gamma_Y$')
ax.set_title(f'{scorer.upper()} Score, MI: {inputs.mutual_info:.2f}')
plt.tight_layout()
plt.show()

Viz - Combined Function

def plot_dist_params(param_df, gamma_ests_df, gamma_estimators, scorer: str):

    # Plot Grid
    fig, ax = plot_gamma_grid(param_df, scorer=scorer)

    # Plot points
    ax = plot_estimated_params(ax, gamma_ests_df, gamma_estimators, scorer, )

    ax.legend(ncol=1, fontsize=8)
    ax.set_xlabel(r'$\gamma_X$')
    ax.set_ylabel(r'$\gamma_Y$')
    ax.set_title(f'{scorer.upper()} Score, MI: {inputs.mutual_info:.2f}')
    plt.tight_layout()
    plt.show()
plot_dist_params(results_full_df, gamma_ests_df, gamma_estimators, 'hsic')
plot_dist_params(results_full_df, gamma_ests_df, gamma_estimators, 'ka')
plot_dist_params(results_full_df, gamma_ests_df, gamma_estimators, 'cka')