Skip to content
Source

Parameter Space - 1D

I've broken down the components so that it's easier to pass appropriate parameters. The main functions as as follows:

  • Standardize Data
    • yes or no
  • Get Sigma
    • Handles the ways to estimate the parameter
    • per dimension
    • per dataset (x and/or y)
    • estimator (median, mean, silverman, scott, etc)
  • Get HSIC scorer
    • Handles the HSIC method (hsic, ka, cka)
Code Blocks
import sys, os

# Insert path to model directory,.
cwd = os.getcwd()
path = f"{cwd}/../../src"
sys.path.insert(0, path)

# Insert path to package,.
pysim_path = f"/home/emmanuel/code/pysim/"
sys.path.insert(0, pysim_path)

import warnings
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state

# toy datasets
from data.toy import generate_dependence_data
from data.distribution import DataParams
from features.utils import dict_product


# Kernel Dependency measure
from sklearn.gaussian_process.kernels import RBF
from pysim.kernel.hsic import HSIC
from pysim.kernel.utils import estimate_sigma, get_sigma_grid#GammaParam, SigmaParam

# RBIG IT measures
from models.dependence import HSICModel

# Plotting
from visualization.distribution import plot_scorer

# experiment helpers
from tqdm import tqdm
from experiments.utils import dict_product, run_parallel_step

# Plotting Procedures
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['seaborn-paper'])
warnings.filterwarnings('ignore') # get rid of annoying warnings
%matplotlib inline

%load_ext autoreload
%autoreload 2
from typing import Optional, Tuple

def standardize_data(
    X: np.ndarray, 
    Y: np.ndarray, 
    standardize: bool=False
) -> Tuple[np.ndarray, np.ndarray]:
    X = StandardScaler().fit_transform(X)
    Y = StandardScaler().fit_transform(Y)
    return X, Y
def get_sigma(
    X: np.ndarray, 
    Y: np.ndarray, 
    method: str='silverman', 
    percent: Optional[float]=None,
    per_dimension: bool=False, 
    separate_scales: bool=False
) -> Tuple[np.ndarray, np.ndarray]:
    # sigma parameters
    subsample = None
    random_state = 123

    sigma_X = estimate_sigma(
        X, 
        subsample=subsample,
        method=method,
        percent=percent,
        random_state=random_state,
        per_dimension=per_dimension
    )

    sigma_Y = estimate_sigma(
        Y, 
        subsample=subsample,
        method=method,
        percent=percent,
        random_state=random_state,
        per_dimension=per_dimension
    )

    if separate_scales:
        sigma_X = np.mean([sigma_X, sigma_Y])
        sigma_Y = np.mean([sigma_X, sigma_Y])
    return sigma_X, sigma_Y
def get_hsic(
    X: np.ndarray, 
    Y: np.ndarray, 
    scorer: str, 
    sigma_X: Optional[float]=None, 
    sigma_Y: Optional[float]=None
) -> float:

    # init hsic model class

    hsic_model = HSICModel()
    # hsic model params
    if sigma_X is not None:

        hsic_model.kernel_X = RBF(sigma_X)
        hsic_model.kernel_Y = RBF(sigma_Y)

    # get hsic score
    hsic_val = hsic_model.get_score(X, Y, scorer)

    return hsic_val

def plot_toy_data(X, Y, subsample: Optional[int]=None):

    # plot
    fig, ax = plt.subplots()

    ax.scatter(X[:subsample,:], Y[:subsample,:])
    return fig, ax

Datasets

For this experiment, we will be looking at 4 simple 1D datasets:

  • Line
  • Sine
  • Circle
  • Random
Code Block
datasets = ['line', 'sine', 'circle', 'random']
num_points = 1_000
seed = 123
noise = 0.1

for idataset in datasets:
    # get dataset
    X, Y = generate_dependence_data(
        dataset=idataset,
        num_points=num_points,
        seed=seed,
        noise_x=noise,
        noise_y=noise
    )

    fig, ax = plot_toy_data(X, Y, 100)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    plt.tight_layout()
    fig.savefig(FIG_PATH + f"demo_{idataset}.png")
    plt.show()

png

Research Questions

  • Which Algorithm?
  • Which Parameter Estimator?
  • Standardize or Not?

Part I - The Sigma Space

Experiment

For this first part, we want to look a the entire \sigma space for the RBF kernel. We will vary the \sigma parameter and use 20 grid poinnts for both X and Y. Since we're dealing with 1D data, we will not have to worry about per dimension estimates.

Free Parameters

  • Dataset (sine, line, circle, random)
  • Scorer (hsic, cka, ka)
  • Sigma X,Y (grid space)

We fix all other parameters as they are not necessary for this first step.

Code Blocks
def sigma_grid(sigma_X, factor=2, n_grid_points=20):
    return np.logspace(
        np.log10(sigma_X * 10**(-factor)),
        np.log10(sigma_X * 10**(factor)),
        n_grid_points
    )

This made a list of dictionary values with every possible combination of the parameters we listed. Now if we call the first element of this list, we can pass these parameters into our HSIC function to calculate the score. This allows us to do the calculations in parallel instead of looping through every single combination.

# experimental parameters
n_grid_points = 40
# sigma_grid = np.logspace(-3, 3, n_grid_points)

# initialize sigma (use the median)
sigma_X, sigma_Y = get_sigma(X,Y, method='mean')
print(sigma_X, sigma_Y)

# create a grid
sigma_X_grid = sigma_grid(sigma_X, factor=3, n_grid_points=n_grid_points)
sigma_Y_grid = sigma_grid(sigma_Y, factor=3, n_grid_points=n_grid_points)

# create a parameter grid
parameters = {
    "dataset": ['sine', 'line', 'random', 'circle'],
    "scorer": ['hsic', 'ka', 'cka'],
    "sigma_X": np.copy(sigma_X_grid),
    "sigma_Y": np.copy(sigma_Y_grid),
}

# Get a list of all parameters
parameters = list(dict_product(parameters))

# check # of parameters
n_params = len(parameters)
print(f"Number of params: {n_params}")
print(f"First set of params:\n{parameters[0]}")
0.32917944341805494 0.3439255265652333
Number of params: 19200
First set of params:
{'dataset': 'sine', 'scorer': 'hsic', 'sigma_X': 0.00032917944341805485, 'sigma_Y': 0.00034392552656523323}

Now, we need to make an experimental step function. This function will be the HSIC function that called within the parallel loop. I want it to also return a pd.DataFrame with the columns holding the parameters. This will make things easier for us to keep track of things as well as plot our results.

from typing import Dict

def step(params: Dict, X: np.ndarray, Y: np.ndarray)-> pd.DataFrame:

    # get dataset
    X, Y = generate_dependence_data(
        dataset=params['dataset'],
        num_points=1_000,
        seed=123,
        noise_x=0.1,
        noise_y=0.1
    )

    # calculate the hsic value
    score = get_hsic(X, Y, params['scorer'], params['sigma_X'], params['sigma_Y'])

    # create a dataframe with the results and params
    results_df = pd.DataFrame({
        'dataset': [params['dataset']],
        'scorer': [params['scorer']],
        'sigma_X': [params['sigma_X']],
        'sigma_Y': [params['sigma_Y']],
        'score': score,
    },)

    return results_df

Now we can loop through and calculate the hsic value for each of the \sigma-parameters that we have enlisted. And we will do it in parallel to save time. I'm on a server with 28 cores free so best believe I will be using all of them...

# test the result
res_test = step(parameters[0], X, Y)

# quick test
res_keys = ['dataset', 'scorer', 'sigma_X', 'sigma_Y', 'score']
assert res_keys == res_test.columns.tolist()

# print out results
res_test.head().to_markdown()
dataset scorer sigma_X sigma_Y score
0 sine hsic 0.000329179 0.000343926 0.000997738
verbose = 1
n_jobs = -1

results = run_parallel_step(
    exp_step=step, 
    parameters=parameters,
    n_jobs=n_jobs,
    verbose=verbose,
    X=X,
    Y=Y
)
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 28 concurrent workers.
[Parallel(n_jobs=-1)]: Done 232 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 732 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 1432 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 2332 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 3432 tasks      | elapsed:   20.7s
[Parallel(n_jobs=-1)]: Done 4732 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 6232 tasks      | elapsed:   40.6s
[Parallel(n_jobs=-1)]: Done 7932 tasks      | elapsed:   48.7s
[Parallel(n_jobs=-1)]: Done 9832 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 11932 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 14232 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 16732 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 19145 out of 19200 | elapsed:  2.0min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 19200 out of 19200 | elapsed:  2.0min finished
# test (number of results = n parameters)
assert n_params == len(results)

results_df = pd.concat(results, ignore_index=True)
results_df.head().to_markdown()
dataset scorer sigma_X sigma_Y score
0 sine hsic 0.000329179 0.000343926 0.000997738
1 sine hsic 0.000329179 0.000490129 0.000997927
2 sine hsic 0.000329179 0.000698484 0.000998668
3 sine hsic 0.000329179 0.000995412 0.000999668
4 sine hsic 0.000329179 0.00141856 0.00100054

Part II - Specific Methods

In the above section, we showed the full parameter space. But what happens if we just look at specific ways to estimate the sigma? For th

Free Parameters:

  • Dataset (sine, line, circle, random)
  • Scorer (hsic, cka, ka)
  • Sigma Estimator (mean, median, silverman, scott)

Experiment

Code Block
# initialize sigma (use the median)
sigma_X, sigma_Y = get_sigma(X,Y, method='mean')

# create a parameter grid
parameters = {
    "dataset": ['sine', 'line', 'random', 'circle'],
    "scorer": ['hsic', 'ka', 'cka'],
    "estimator": ['mean', 'median', 'mean']
}

# Get a list of all parameters
parameters = list(dict_product(parameters))

# check # of parameters
n_params = len(parameters)
print(f"Number of params: {n_params}")
print(f"First set of params:\n{parameters[0]}")
Number of params: 36
First set of params:
{'dataset': 'sine', 'scorer': 'hsic', 'estimator': 'mean'}
from typing import Dict

def step(params: Dict, X: np.ndarray, Y: np.ndarray)-> pd.DataFrame:

    # get dataset
    X, Y = generate_dependence_data(
        dataset=params['dataset'],
        num_points=1_000,
        seed=123,
        noise_x=0.1,
        noise_y=0.1
    )
    # estimate sigma
    sigma_X, sigma_Y = get_sigma(X, Y, method=params['estimator'])

    # calculate the hsic value
    score = get_hsic(X, Y, params['scorer'], sigma_X, sigma_Y)

    # create a dataframe with the results and params
    results_df = pd.DataFrame({
        'dataset': [params['dataset']],
        'scorer': [params['scorer']],
        'estimator': [params['estimator']],
        'sigma_X': [sigma_X],
        'sigma_Y': [sigma_Y],
        'score': score,
    },)

    return results_df

# test the result
res_test = step(parameters[0], X, Y)

# quick test
res_keys = ['dataset', 'scorer', 'estimator', 'sigma_X', 'sigma_Y', 'score']
assert res_keys == res_test.columns.tolist(), f'Not true:{res_test.columns.tolist()}'

# print out results
res_test.head().to_markdown()
dataset scorer estimator sigma_X sigma_Y score
0 sine hsic mean 0.329179 0.820625 0.0661783
verbose = 1
n_jobs = 1

results = run_parallel_step(
    exp_step=step, 
    parameters=parameters,
    n_jobs=n_jobs,
    verbose=verbose,
    X=X,
    Y=Y
)
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:    4.3s finished
# test (number of results = n parameters)
assert n_params == len(results)

results_est_df = pd.concat(results, ignore_index=True)
results_est_df.head().to_markdown()
dataset scorer estimator sigma_X sigma_Y score
0 sine hsic mean 0.329179 0.820625 0.0661783
1 sine hsic median 0.289021 0.72562 0.0770415
2 sine hsic mean 0.329179 0.820625 0.0661783
3 sine ka mean 0.329179 0.820625 0.91219
4 sine ka median 0.289021 0.72562 0.896967

Visualization - HeatMaps

Code Block
def plot_all_params(
    grid_df: pd.DataFrame, 
    params_df: Optional[pd.DataFrame]=None,
    scorer: str='hsc', 
    dataset: str='sine', 
):

    fig, ax = plt.subplots(figsize=(6, 5))

    # ===========================================
    # Plot Gridded DataFrame
    # ===========================================


    # subset hsic method
    grid_df_ = grid_df[grid_df['scorer'] == scorer].drop('scorer', axis=1)

    # subset dataset
    grid_df_ = grid_df_[grid_df_['dataset'] == dataset].drop('dataset', axis=1)

    # create a heatmap
    grid_df_ = pd.pivot_table(grid_df_, values='score', index=['sigma_Y'], columns='sigma_X')
#     print(grid_df_)
    # min max
    if scorer == 'hsic':
        vmax = 0.11
        vmin = grid_df_.values.min()
    else:
        vmax = 1.0
        vmin = grid_df_.values.min()
#     print(vmin)

    # heatmap_data.columns = np.sqrt(1 / 2 * heatmap_data.columns.values)
    # heatmap_data.index = np.sqrt(1 / 2 * heatmap_data.index.values)
    X, Y = np.meshgrid(grid_df_.index.values, grid_df_.columns.values, )
    pts = ax.pcolormesh(
        X, Y, grid_df_.values, #vmin=0, vmax=vmax, 
        cmap='Reds',vmin=0, vmax=vmax,
#         norm=colors.LogNorm(vmin=vmin, vmax=vmax)
    )

    # colorbar
    fig.colorbar(pts, ax=ax)
#     ax = sns.heatmap(
#         data=grid_df_,
#         xticklabels=grid_df_.columns.values.round(decimals=2),
#         yticklabels=grid_df_.index.values.round(decimals=2),
#         vmin=0, vmax=vmax
#     )
    # ===========================================
    # Plot Params
    # ===========================================    
    if params_df is not None:
        params_df_ = params_df[params_df['dataset'] == dataset]

        # subset hsic method
        params_df_ = params_df_[params_df_['scorer'] == scorer]

        # plot X
        estimators = [
            ('median', 'black'),
            ('mean', 'green'),
            ('silverman', 'blue'),
            ('scott', 'red'),
        ]


        for iest, icolor in estimators:

            # Plot X
#             print(
#                 params_df_[params_df_['estimator'] == iest].sigma_X,
#                 params_df_[params_df_['estimator'] == iest].sigma_Y
#             )
            ax.scatter(
                params_df_[params_df_['estimator'] == iest].sigma_X, 
                params_df_[params_df_['estimator'] == iest].sigma_Y,
                s=500, c=icolor, label=f"{iest.capitalize()} X", zorder=3, marker='.')
            # Plot Y
# #             ax.scatter(
# #                 , 
# #                 params_df[params_df['estimator'] == iest].score,
# #                 s=300, c=icolor, label=f"{iest.capitalize()} Y", zorder=3, marker='.')
    ax.legend()
    return fig, ax

Line Dataset

Code Block
scorers = results_df['scorer'].unique().tolist()
idataset = 'line'

for iscorer in scorers:

    fig, ax = plot_all_params(results_df, results_est_df, scorer=iscorer, dataset=idataset)

    # ax.legend(ncol=1, fontsize=15)
    plt.xscale('log', basex=10)
    plt.yscale('log', basey=10)
    ax.set_xlabel(r'$\sigma_X$', fontsize=20)
    ax.set_ylabel(r'$\sigma_Y$', fontsize=20)
    ax.set_title(f'{iscorer.upper()} Score, {idataset.capitalize()}', fontsize=20)
    plt.tight_layout()
    plt.show()

png

png

png

Sine Dataset

Code Block
scorers = results_df['scorer'].unique().tolist()
idataset = 'sine'

for iscorer in scorers:

    fig, ax = plot_all_params(results_df, results_est_df, scorer=iscorer, dataset=idataset)

    # ax.legend(ncol=1, fontsize=15)
    plt.xscale('log', basex=10)
    plt.yscale('log', basey=10)
    ax.set_xlabel(r'$\sigma_X$', fontsize=20)
    ax.set_ylabel(r'$\sigma_Y$', fontsize=20)
    ax.set_title(f'{iscorer.upper()} Score, {idataset.capitalize()}', fontsize=20)
    plt.tight_layout()
    plt.show()

png

![png](pics/param_grid/output_47_1.png)

png

Circle Dataset

Code Block
scorers = results_df['scorer'].unique().tolist()
idataset = 'circle'

for iscorer in scorers:

    fig, ax = plot_all_params(results_df, results_est_df, scorer=iscorer, dataset=idataset)

    # ax.legend(ncol=1, fontsize=15)
    plt.xscale('log', basex=10)
    plt.yscale('log', basey=10)
    ax.set_xlabel(r'$\sigma_X$', fontsize=20)
    ax.set_ylabel(r'$\sigma_Y$', fontsize=20)
    ax.set_title(f'{iscorer.upper()} Score, {idataset.capitalize()}', fontsize=20)
    plt.tight_layout()
    plt.show()

png

png

png

Random Dataset

Code Block
scorers = results_df['scorer'].unique().tolist()
idataset = 'random'

for iscorer in scorers:

    fig, ax = plot_all_params(results_df, results_est_df, scorer=iscorer, dataset=idataset)

    # ax.legend(ncol=1, fontsize=15)
    plt.xscale('log', basex=10)
    plt.yscale('log', basey=10)
    ax.set_xlabel(r'$\sigma_X$', fontsize=20)
    ax.set_ylabel(r'$\sigma_Y$', fontsize=20)
    ax.set_title(f'{iscorer.upper()} Score, {idataset.capitalize()}', fontsize=20)
    plt.tight_layout()
    plt.show()

png

png

png