Code Review IV - Best Parameters¶
This is a follow-up to the previous notebook. I want a dedicated section where I look at the plots for the best parameter settings.
Code Preamble¶
# make the project root importable
import sys
from pyprojroot import here
sys.path.insert(0, str(here()))
import warnings
from typing import Optional, Tuple
from tqdm import tqdm
import random
import pandas as pd
import numpy as np
import argparse
from sklearn.utils import check_random_state
# toy datasets
from src.data.distribution import DataParams, Inputs
# Kernel Dependency measure
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process.kernels import RBF
from src.models.dependence import HSICModel
# RBIG IT measures
from src.features.utils import df_query, subset_dataframe
# Plotting
from src.visualization.distribution import plot_scorer, plot_score_vs_mi
# experiment helpers
from src.experiments.utils import dict_product, run_parallel_step
# Plotting Procedures
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
sns.reset_defaults()
# sns.set_style('whitegrid')
#sns.set_context('talk')
sns.set_context(context='poster', font_scale=0.7, rc={'font.family': 'sans-serif'})
# sns.set(font='sans-serif')
%matplotlib inline
%load_ext autoreload
%autoreload 2
Query Data¶
DATA_PATH = "data/results/distributions/mutual_info/"
# stack result files row-wise so both datasets share the same columns
results_df = pd.concat([
    pd.read_csv(here() / f"{DATA_PATH}v5_gauss.csv"),
    # pd.read_csv(here() / f"{DATA_PATH}v5_tstudent.csv")
], axis=0)
# drop the unnamed index columns written by to_csv
results_df = results_df.loc[:, ~results_df.columns.str.match('Unnamed')]
# replace NaNs with the string 'None' so missing parameters can be queried directly
results_df = results_df.astype(object).replace(np.nan, 'None')
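As a quick sanity check, I can confirm that the columns queried throughout this notebook are actually present in the loaded results. This is only a minimal sketch; the column names are the ones used in the queries below.
# sanity check (sketch): the columns used in the queries below should exist
expected_cols = ['dataset', 'scorer', 'sigma_method', 'sigma_percent', 'per_dimension']
missing_cols = [icol for icol in expected_cols if icol not in results_df.columns]
assert not missing_cols, f"Missing columns in results: {missing_cols}"
results_df.head(3)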
Gaussian Distribution¶
# initialize list of queries
queries = []
# query dataframe for the gauss dataset
dataset_methods = ['gauss']
queries.append(df_query('dataset', dataset_methods))
# # query dataframe for median
# sigma_methods = ['median']
# queries.append(df_query('sigma_method', sigma_methods))
# # query dataframe for scott and silverman methods
# sigma_percents = [40., 50., 60.]
# queries.append(df_query('sigma_percent', sigma_percents))
# # query dataframe for RBF Kernel
# dimension_query = [False]
# queries.append(df_query('per_dimension', dimension_query))
# # query dataframe for HSIC
# scorer_query = ['cka']
# queries.append(df_query('scorer', scorer_query))
results_df = subset_dataframe(results_df, queries)
# # plot - score vs mi
# plot_score_vs_mi(sub_df, scorer='cka', compare='dimension');
results_df.tail(3)
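For reference, the query helpers are only used through their call sites here, so the subset above should be equivalent to a plain pandas filter along these lines (a sketch, assuming df_query builds a column/values membership filter):
# hypothetical plain-pandas equivalent of the df_query / subset_dataframe call above
gauss_df = results_df[results_df['dataset'].isin(['gauss'])]
# inspect which length-scale estimators are available before subsetting further
gauss_df['sigma_method'].value_counts()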
Case I - Standardize or Not¶
Scott | Silverman¶
# initialize list of queries
queries = []
# # query dataframe for hsic
# scorers = ['hsic',]
# queries.append(df_query('scorer', scorers))
# query dataframe for scott and silverman methods
sigma_methods = ['scott', 'silverman']
queries.append(df_query('sigma_method', sigma_methods))
sub_df = subset_dataframe(results_df, queries)
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='standard');
plot_score_vs_mi(sub_df, scorer='cka', compare='standard');
plot_score_vs_mi(sub_df, scorer='ka', compare='standard');
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='dimension');
plot_score_vs_mi(sub_df, scorer='cka', compare='dimension');
plot_score_vs_mi(sub_df, scorer='ka', compare='dimension');
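Each subset below is plotted with the same three scorers and the same two comparisons. A small loop could replace each block of six calls (a sketch, assuming plot_score_vs_mi takes the scorer and compare keywords exactly as used above):
# loop over scorers and comparison variables instead of six explicit calls
for icompare in ['standard', 'dimension']:
    for iscorer in ['hsic', 'cka', 'ka']:
        plot_score_vs_mi(sub_df, scorer=iscorer, compare=icompare)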
Median - All Percentages¶
# initialize list of queries
queries = []
# query dataframe for the median method
sigma_methods = ['median']
queries.append(df_query('sigma_method', sigma_methods))
# # query dataframe for scott and silverman methods
# sigma_percents = ['None']
# queries.append(df_query('sigma_percent', sigma_percents))
# # query dataframe for hsic
# scorers = ['hsic',]
# queries.append(df_query('scorer', scorers))
sub_df = subset_dataframe(results_df, queries)
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='standard');
plot_score_vs_mi(sub_df, scorer='cka', compare='standard');
plot_score_vs_mi(sub_df, scorer='ka', compare='standard');
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='dimension');
plot_score_vs_mi(sub_df, scorer='cka', compare='dimension');
plot_score_vs_mi(sub_df, scorer='ka', compare='dimension');
Reasonable Percentages (40, 50, 60)¶
# initialize list of queries
queries = []
# query dataframe for the median method
sigma_methods = ['median']
queries.append(df_query('sigma_method', sigma_methods))
# query dataframe for mid-range sigma percentages
sigma_percents = [40., 50., 60.]
queries.append(df_query('sigma_percent', sigma_percents))
# # query dataframe for hsic
# scorers = ['hsic',]
# queries.append(df_query('scorer', scorers))
sub_df = subset_dataframe(results_df, queries)
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='standard');
plot_score_vs_mi(sub_df, scorer='cka', compare='standard');
plot_score_vs_mi(sub_df, scorer='ka', compare='standard');
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='dimension');
plot_score_vs_mi(sub_df, scorer='cka', compare='dimension');
plot_score_vs_mi(sub_df, scorer='ka', compare='dimension');
Extreme Percentages¶
# initialize list of queries
queries = []
# query dataframe for the median method
sigma_methods = ['median']
queries.append(df_query('sigma_method', sigma_methods))
# query dataframe for extreme sigma percentages
sigma_percents = [10., 20., 80., 90.]
queries.append(df_query('sigma_percent', sigma_percents))
# # query dataframe for hsic
# scorers = ['hsic',]
# queries.append(df_query('scorer', scorers))
sub_df = subset_dataframe(results_df, queries)
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='standard');
plot_score_vs_mi(sub_df, scorer='cka', compare='standard');
plot_score_vs_mi(sub_df, scorer='ka', compare='standard');
# plot - score vs mi
plot_score_vs_mi(sub_df, scorer='hsic', compare='dimension');
plot_score_vs_mi(sub_df, scorer='cka', compare='dimension');
plot_score_vs_mi(sub_df, scorer='ka', compare='dimension');