Demo I - Different Cases¶
In this notebook, I look at the motivation behind this study: how the choice of dependence score, kernel-parameter estimator, and preprocessing affects the results, and why this is worth pursuing further.
import sys, os
import warnings
import tqdm
import random
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# insert path to the model directory
cwd = os.getcwd()
path = f"{cwd}/../../src"
sys.path.insert(0, path)
# insert path to the pysim package
pysim_path = f"/home/emmanuel/code/pysim/"
sys.path.insert(0, pysim_path)
# toy datasets
from data.toy import generate_dependence_data
from data.distribution import DataParams
from dataclasses import dataclass
# Kernel Dependency measure
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import RBF
from models.dependence import HSICModel
from pysim.kernel.utils import get_init_gammas, get_gamma_grid, estimate_sigma
import seaborn as sns
plt.style.use(['seaborn-talk'])
%matplotlib inline
warnings.filterwarnings('ignore') # get rid of annoying warnings
%load_ext autoreload
%autoreload 2
FIG_PATH = "/home/emmanuel/projects/2019_hsic_align/results/figures/1d_dataset/demo/"
Estimating Sigma & HSIC¶
def standardize_data(X, Y, standardize: bool=False):
    # optionally remove the mean and scale each feature to unit variance
    if standardize:
        X = StandardScaler().fit_transform(X)
        Y = StandardScaler().fit_transform(Y)
    return X, Y
def get_sigma(X, Y, method: str='silverman', per_dimension: bool=False, separate_scales: bool=False):
    # sigma estimator parameters
    subsample = None
    percent = 0.20
    random_state = 123

    sigma_X = estimate_sigma(
        X,
        subsample=subsample,
        method=method,
        percent=percent,
        random_state=random_state,
        per_dimension=per_dimension
    )
    sigma_Y = estimate_sigma(
        Y,
        subsample=subsample,
        method=method,
        percent=percent,
        random_state=random_state,
        per_dimension=per_dimension
    )

    # if a single shared length scale is requested, average the two estimates
    if not separate_scales:
        sigma_XY = np.mean([sigma_X, sigma_Y])
        sigma_X, sigma_Y = sigma_XY, sigma_XY

    return sigma_X, sigma_Y
def get_hsic(X, Y, scorer: str, sigma_X=None, sigma_Y=None):
    # initialize the HSIC model class
    hsic_model = HSICModel()

    # set the RBF kernel parameters if length scales are provided
    if sigma_X is not None:
        hsic_model.kernel_X = RBF(sigma_X)
        hsic_model.kernel_Y = RBF(sigma_Y)

    # compute the dependence score ('hsic', 'ka' or 'cka')
    hsic_val = hsic_model.get_score(X, Y, scorer)
    return hsic_val
Data I - 1D Dataset¶
# data params
dataset = 'sine'
num_points = 1000
seed = 123
noise = 0.1
# get dataset
X, Y = generate_dependence_data(
dataset=dataset,
num_points=num_points,
seed=seed,
noise_x=noise,
noise_y=noise
)
# plot
fig, ax = plt.subplots()
ax.scatter(X[:100,:], Y[:100,:])
plt.tight_layout()
fig.savefig(FIG_PATH + f"demo_{dataset}.png")
plt.show()
Question I - Which Algorithm?¶
results_df = pd.DataFrame()
method = 'scott'
per_dimension = False
separate_scales = False
# sigma_X, sigma_y = get_sigma(
# X, Y,
# method=method,
# per_dimension=per_dimension,
# separate_scales=separate_scales
# )
method = 'default'
sigma_X, sigma_Y = None, None
scorer = 'hsic'
results_df = results_df.append(pd.DataFrame({
"hsic": [get_hsic(X, Y, 'hsic', sigma_X, sigma_Y)], # Estimate HSIC
"ka": [get_hsic(X, Y, 'ka', sigma_X, sigma_Y)], # Estimate KA
"cka": [get_hsic(X, Y, 'cka', sigma_X, sigma_Y)], # Estimate CKA
},index=['Q1']),)
print(results_df.to_markdown())
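For reference, these are the standard definitions of the three scores (I am quoting the usual formulations rather than reading them off the HSICModel implementation, so treat the exact normalization constants as an assumption). With Gram matrices $K$ on $X$, $L$ on $Y$, and centering matrix $H = I - \frac{1}{n}\mathbf{1}\mathbf{1}^\top$:

$$
\begin{aligned}
\operatorname{HSIC}(X, Y) &= \frac{1}{(n-1)^2}\,\operatorname{tr}(K H L H), \\
\operatorname{KA}(K, L) &= \frac{\langle K, L \rangle_F}{\lVert K \rVert_F\, \lVert L \rVert_F}, \\
\operatorname{CKA}(K, L) &= \frac{\langle H K H, H L H \rangle_F}{\lVert H K H \rVert_F\, \lVert H L H \rVert_F}.
\end{aligned}
$$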
So each method has a slightly different formulation, but they are mostly the same. The next question is: how do we estimate the parameters of the kernel? The default is simply $\sigma=1.0$, but we know this won't do, because the score depends heavily on the kernel parameters. In this case we are using the most common kernel, the Radial Basis Function (RBF). Since this is a 1D example, I will use some generic estimators, the "Silverman rule" and the "Scott rule". These are commonly found in packages like scipy.stats.gaussian_kde or statsmodels.nonparametric.bandwidth, and they are mostly used for Kernel Density Estimation (KDE), where we need a reasonable bandwidth to get a decent density estimate.
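For reference, both rules reduce to simple functions of the sample size and dimension, multiplied by a spread estimate of the data. A minimal sketch, mirroring the factors used by scipy.stats.gaussian_kde (the helper names here are mine, not from pysim):

import numpy as np

def scott_factor(n_samples: int, n_dims: int) -> float:
    # Scott's rule: n^(-1 / (d + 4))
    return n_samples ** (-1.0 / (n_dims + 4))

def silverman_factor(n_samples: int, n_dims: int) -> float:
    # Silverman's rule: (n * (d + 2) / 4)^(-1 / (d + 4))
    return (n_samples * (n_dims + 2) / 4.0) ** (-1.0 / (n_dims + 4))

# bandwidth ~ factor * standard deviation of the data
x = np.random.randn(1000, 1)
sigma_scott = scott_factor(*x.shape) * x.std(ddof=1)
sigma_silverman = silverman_factor(*x.shape) * x.std(ddof=1)
print(sigma_scott, sigma_silverman)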
So what happens when we apply these estimators and compare the results?
Question II - Which Parameter Estimator?¶
methods = ['scott', 'silverman', 'median']
per_dimension = False
separate_scales = True
results_df = pd.DataFrame()
for imethod in methods:
    sigma_X, sigma_Y = get_sigma(
        X, Y,
        method=imethod,
        per_dimension=per_dimension,
        separate_scales=separate_scales
    )
    results_df = results_df.append(pd.DataFrame({
        # "sigma_x": [sigma_X],
        # "sigma_y": [sigma_Y],
        'Estimator': [imethod],
        "hsic": [get_hsic(X, Y, 'hsic', sigma_X, sigma_Y)],  # Estimate HSIC
        "ka": [get_hsic(X, Y, 'ka', sigma_X, sigma_Y)],      # Estimate KA
        "cka": [get_hsic(X, Y, 'cka', sigma_X, sigma_Y)],    # Estimate CKA
    }, index=['Q2']),)
print(results_df.to_markdown())
Question III - How do we estimate the length scale?¶
- Use the same length scale?
- Use different length scales?
- Use a length scale per dimension (D > 1)? (See the sketch below.)
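To make the three options concrete for an RBF kernel: sklearn's RBF accepts either a scalar length scale or one length scale per dimension. A rough sketch (the sigma values below are made up for illustration):

import numpy as np
from sklearn.gaussian_process.kernels import RBF

# 1) the same length scale for both kernels
sigma_shared = 1.5                                  # illustrative value
kernel_X, kernel_Y = RBF(sigma_shared), RBF(sigma_shared)

# 2) a separate length scale for X and for Y
kernel_X, kernel_Y = RBF(1.2), RBF(2.3)             # illustrative values

# 3) one length scale per dimension (anisotropic RBF, only meaningful for D > 1)
kernel_X = RBF(np.array([1.2, 0.8]))
kernel_Y = RBF(np.array([2.3, 1.1]))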
methods = ['scott', 'silverman', 'median']
per_dimension = False
separate_scales = [True, False]
results_df = pd.DataFrame()
for iscaler in separate_scales:
    for imethod in methods:
        sigma_X, sigma_Y = get_sigma(
            X, Y,
            method=imethod,
            per_dimension=per_dimension,
            separate_scales=iscaler
        )
        results_df = results_df.append(pd.DataFrame({
            # "sigma_x": [sigma_X],
            "separate": [iscaler],
            'Estimator': [imethod],
            "hsic": [get_hsic(X, Y, 'hsic', sigma_X, sigma_Y)],  # Estimate HSIC
            "ka": [get_hsic(X, Y, 'ka', sigma_X, sigma_Y)],      # Estimate KA
            "cka": [get_hsic(X, Y, 'cka', sigma_X, sigma_Y)],    # Estimate CKA
        }, index=['Q3']),)
print(results_df.to_markdown())
Question IV - Standardize Data?¶
We could also standardize our data. This changes the scale of each feature (zero mean, unit variance), which could eliminate the need for separate length scales.
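As a quick illustration of why (the arrays below are synthetic, not the demo data): standardization puts every feature on roughly unit scale, so a single shared length scale becomes less of a compromise.

import numpy as np
from sklearn.preprocessing import StandardScaler

# two features on very different scales
rng = np.random.RandomState(123)
X_raw = np.column_stack([rng.randn(500), 100.0 * rng.randn(500)])
print(X_raw.std(axis=0))    # roughly [1, 100] -> very different per-feature scales

X_std = StandardScaler().fit_transform(X_raw)
print(X_std.std(axis=0))    # roughly [1, 1] -> one shared length scale is more reasonable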
standardize = [True, False]
methods = ['scott', 'silverman', 'median']
per_dimension = False
separate_scales = [True, False]
results_df = pd.DataFrame()
for istandard in standardize:
    X_, Y_ = standardize_data(X, Y, istandard)
    for iscaler in separate_scales:
        for imethod in methods:
            sigma_X, sigma_Y = get_sigma(
                X_, Y_,
                method=imethod,
                per_dimension=per_dimension,
                separate_scales=iscaler
            )
            results_df = results_df.append(pd.DataFrame({
                "standardize": [istandard],
                "separate": [iscaler],
                'Estimator': [imethod],
                "hsic": [get_hsic(X_, Y_, 'hsic', sigma_X, sigma_Y)],  # Estimate HSIC
                "ka": [get_hsic(X_, Y_, 'ka', sigma_X, sigma_Y)],      # Estimate KA
                "cka": [get_hsic(X_, Y_, 'cka', sigma_X, sigma_Y)],    # Estimate CKA
            }, index=['Q4']),)
print(results_df.to_markdown())
Verdict¶
Well, it is hard to say, as it depends on the parameters. Every researcher I've met who works with kernel methods has a suggestion they swear by, but I never know who to follow. My view is that we should use dedicated sigma values per dataset; however, that still leaves other methods we may want to try. So we're going to repeat the same experiment with a 2D dataset, and we will see that the difficulty increases again.
2D Example¶
# initialize Data Params class
dataset = 'tstudent'
samples = 1_000
dimensions = 2
std = 5
nu = 8
trial = 1
standardize = False
# initialize params
example_params = DataParams(
dataset=dataset,
samples=samples,
dimensions=dimensions,
std=std,
nu=nu,
trial=trial,
standardize=standardize
)
# generate some parameters
inputs = example_params.generate_data()
sns.jointplot(x=inputs.X, y=inputs.Y)
plt.tight_layout()
plt.savefig(FIG_PATH + f"demo_{dataset}.png")
Question III (Revisited) - Different Length Scales?¶
Now we can revisit this question, because with more than one dimension we can estimate a different length scale per dimension. One problem with Scott's or Silverman's rule as used above is that it produces a single estimate for the entire dataset instead of one estimate per feature.
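As a sketch of the difference (the silverman_sigma helper below is mine for illustration, not the estimate_sigma from pysim):

import numpy as np

def silverman_sigma(x: np.ndarray) -> float:
    # Silverman factor times the sample standard deviation
    n, d = x.shape
    return (n * (d + 2) / 4.0) ** (-1.0 / (d + 4)) * x.std(ddof=1)

rng = np.random.RandomState(123)
X2d = rng.randn(1000, 2) * np.array([1.0, 10.0])    # features on very different scales

# one estimate over the whole dataset vs. one estimate per feature
sigma_global = silverman_sigma(X2d)
sigma_per_dim = np.array([silverman_sigma(X2d[:, [j]]) for j in range(X2d.shape[1])])
print(sigma_global, sigma_per_dim)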
methods = ['scott', 'silverman', 'median']
per_dimension = False
separate_scales = [True, False]
separate_dimensions = [True, False]
results_df = pd.DataFrame()
for iscaler in separate_scales:
    for idim in separate_dimensions:
        for imethod in methods:
            sigma_X, sigma_Y = get_sigma(
                inputs.X, inputs.Y,
                method=imethod,
                per_dimension=idim,
                separate_scales=iscaler
            )
            results_df = results_df.append(pd.DataFrame({
                "Separate Dimensions": [idim],
                "Separate Length Scales": [iscaler],
                'Param Estimator': [imethod],
                "HSIC": [get_hsic(inputs.X, inputs.Y, 'hsic', sigma_X, sigma_Y)],  # Estimate HSIC
                "KA": [get_hsic(inputs.X, inputs.Y, 'ka', sigma_X, sigma_Y)],      # Estimate KA
                "CKA": [get_hsic(inputs.X, inputs.Y, 'cka', sigma_X, sigma_Y)],    # Estimate CKA
            }, index=['Q3']),)
print(results_df.to_markdown())
Q1-Q4¶
So now, let's look at all of the questions for the 2D data distribution.
standardize = [True, False]
methods = ['scott', 'silverman', 'median']
per_dimension = False
separate_scales = [True, False]
separate_dimensions = [True, False]
results_df = pd.DataFrame()
for istandard in standardize:
    X_, Y_ = standardize_data(inputs.X, inputs.Y, istandard)
    for iscaler in separate_scales:
        for idim in separate_dimensions:
            for imethod in methods:
                sigma_X, sigma_Y = get_sigma(
                    X_, Y_,
                    method=imethod,
                    per_dimension=idim,
                    separate_scales=iscaler
                )
                results_df = results_df.append(pd.DataFrame({
                    "standardize": [istandard],
                    "Separate Dimensions": [idim],
                    "Separate Length Scales": [iscaler],
                    'Param Estimator': [imethod],
                    "HSIC": [get_hsic(X_, Y_, 'hsic', sigma_X, sigma_Y)],  # Estimate HSIC
                    "KA": [get_hsic(X_, Y_, 'ka', sigma_X, sigma_Y)],      # Estimate KA
                    "CKA": [get_hsic(X_, Y_, 'cka', sigma_X, sigma_Y)],    # Estimate CKA
                }, index=['Q4']),)
print(results_df.to_markdown())
Verdict¶
For these distributions the results seem a little more consistent, but with higher dimensions and more samples these estimators start to fail. And we still don't have good alternative estimators.
What Now?¶
I will be looking at the following:
| Options | Choices |
|---|---|
| Standardize | Yes / No |
| Parameter Estimator | Mean, Median, Silverman, etc. |
| Center Kernel | Yes / No |
| Normalized Score | Yes / No |