
Methods

In this notebook, I will walk through how we can estimate different similarity measures based on the density cubes that we derive.

import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))

import pathlib

# standard python packages
import xarray as xr
import pandas as pd
import numpy as np

# 
# Experiment Functions
from src.data.esdc import get_dataset
from src.features import Metrics
from src.features.temporal import select_period, get_smoke_test_time, TimePeriod
from src.features.spatial import select_region, get_europe, get_spain
from src.models.train_models import get_similarity_scores
from src.experiments.utils import dict_product, run_parallel_step

from src.features import Metrics
from src.features.density import get_density_cubes
from src.features.preprocessing import standardizer_data, get_reference_cube, get_common_indices
from src.models.similarity import cka_coefficient, rv_coefficient, rbig_it_measures

# # esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
# from esdc.transform import DensityCubes

from typing import List, Dict
import xarray as xr

from tqdm import tqdm

import cartopy
import cartopy.crs as ccrs

# NUMPY SETTINGS
import numpy as onp
onp.set_printoptions(precision=3, suppress=True)

# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")

# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
    level=logging.INFO, 
    stream=sys.stdout,
    format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

1. Get DataCubes

!ls /media/disk/databases/ESDC/
Cube_2019highColombiaCube_184x120x120.zarr
Cube_2019highColombiaCube_1x3360x2760.zarr
esdc-8d-0.083deg-184x270x270-2.0.0.zarr
esdc-8d-0.083deg-1x2160x4320-2.0.0.zarr
esdc-8d-0.25deg-184x90x90-2.0.0.zarr
esdc-8d-0.25deg-1x720x1440-2.0.0.zarr
esdc = get_dataset(['gross_primary_productivity'])
esdc
<xarray.Dataset>
Dimensions:                     (lat: 720, lon: 1440, time: 1702)
Coordinates:
  * lon                         (lon) float32 -179.875 -179.625 ... 179.875
  * lat                         (lat) float32 89.875 89.625 ... -89.875
  * time                        (time) datetime64[ns] 1980-01-05 ... 2016-12-30
Data variables:
    gross_primary_productivity  (time, lat, lon) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>
        long_name:              Gross Primary Productivity (FLUXCOM), units: gC m-2 day-1
        time_coverage:          2001-01-05 to 2015-12-31, 8-day resolution (P8D)
Attributes (abridged):
    title:                      Earth System Data Cube (ESDC v2.0.0), 1/4 deg, 8-day, 1980-01-05 to 2016-12-30
    institution:                Brockmann Consult GmbH & Max Planck Institute for Biogeochemistry, ESA Earth System Data Lab

2. Select Region

from src.features.spatial import get_europe, get_spain, select_region
# from src.features.spatial import 
# subset with bounding box
datacube = select_region(esdc, bbox=get_europe())

3. Remove Climatology

from src.features.temporal import remove_climatology

# remove the climatology
datacube_ = remove_climatology(datacube)
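
Here, remove_climatology subtracts the mean seasonal cycle from every pixel. A minimal xarray sketch of that idea, assuming a monthly climatology (the actual helper in src.features.temporal may differ in detail):

# sketch: remove a monthly climatology with xarray groupby
# (illustrative only; src.features.temporal.remove_climatology may differ)
climatology = datacube.groupby('time.month').mean('time')
anomalies = datacube.groupby('time.month') - climatology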

4. Resample - Mean Values per Month

datacube__ = datacube.resample(time='1MS').mean()
datacube__
<xarray.Dataset>
Dimensions:                     (lat: 144, lon: 232, time: 444)
Coordinates:
  * time                        (time) datetime64[ns] 1980-01-01 ... 2016-12-01
  * lat                         (lat) float32 71.625 71.375 ... 36.125 35.875
  * lon                         (lon) float32 -17.875 -17.625 ... 39.625 39.875
Data variables:
    gross_primary_productivity  (time, lat, lon) float32 dask.array<chunksize=(1, 144, 232), meta=np.ndarray>

5. Select Region and Time Period

For this task, we are going to do something simple: work with only Europe and a segment of Eurasia. I have outlined a region described by its latitude and longitude coordinates. With these coordinates, we can subset a section of the cube and continue working with that region only.
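
Under the hood, select_region presumably reduces to a label-based selection over the bounding box. A minimal xarray sketch (the latitude/longitude limits below are illustrative, roughly matching the coordinates shown above; get_europe() returns the exact bounding box used by the project):

# sketch: bounding-box subset with xarray label-based selection
# (illustrative lat/lon limits; get_europe() provides the exact values)
lat_min, lat_max = 35.75, 71.75
lon_min, lon_max = -18.0, 40.0

europe_sketch = esdc.sel(
    lat=slice(lat_max, lat_min),   # latitude is stored in descending order
    lon=slice(lon_min, lon_max),
)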

from src.features.temporal import select_period
from src.features.spatial import select_region
# get european bounding box
europe_bbox = get_europe()
time_period = ('July-2010', 'July-2010')

# subset region
europe_datacube = select_region(datacube, bbox=europe_bbox)

# subset time period
europe_datacube_201007 = select_period(europe_datacube, time_period)

europe_datacube_201007
europe_datacube_201007.gross_primary_productivity.mean('time').plot(vmin=0, robust=True)
/home/emmanuel/.conda/envs/rbig_eo/lib/python3.8/site-packages/dask/array/numpy_compat.py:40: RuntimeWarning: invalid value encountered in true_divide
  x = np.divide(x1, x2, out)
<matplotlib.collections.QuadMesh at 0x7f5113a06070>

6. Get Density Cubes

Now, we are going to create some density cubes. Instead of treating every pixel as an independent sample, we are going to construct features from the neighbouring pixels in a spatio-temporal sense. For this demonstration, we will assume that the pixels within a small spatio-temporal window carry information about the centre pixel.
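
To make the construction concrete, here is a toy sketch for a single pixel time series (hypothetical values): with spatial=1 and temporal=3, each sample simply stacks the pixel value at three consecutive time steps as features, which is exactly the var_x0 ... var_x2 layout shown below.

# toy sketch of a purely temporal density cube for one pixel
# (illustrative only; DensityCubes handles the general spatio-temporal case)
pixel_ts = pd.Series(
    [1.75, 1.07, 1.33, 1.20],                                  # hypothetical GPP values
    index=pd.date_range("2010-07-08", periods=4, freq="8D"),
    name="gpp",
)

# var_x0 = value two steps back, var_x1 = one step back, var_x2 = current step
lagged = pd.concat(
    {f"var_x{i}": pixel_ts.shift(2 - i) for i in range(3)}, axis=1
).dropna()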

from typing import Iterator, Tuple

from src.features.preprocessing import DensityCubes


def get_density_cubes(data: xr.Dataset, spatial: int, temporal: int) -> Iterator[Tuple[str, pd.DataFrame]]:
    """Wrapper function yielding (variable name, density-cube DataFrame) pairs."""
    for ikey, idata in data.items():
        yield ikey, DensityCubes(
            spatial_window=spatial,
            time_window=temporal
        ).get_minicubes(idata)
# All samples
europe_df = europe_datacube_201007.to_dataframe().dropna()

# reorder index
levels = ['time', 'lon', 'lat']
europe_df = europe_df.reorder_levels(levels)

# europe_df = europe_df[indx]
europe_df.head()
gross_primary_productivity
time lon lat
2010-07-08 51.625 71.625 1.747494
2010-07-16 51.625 71.625 1.073725
2010-07-24 51.625 71.625 1.334421
2010-07-08 51.875 71.625 1.201953
2010-07-16 51.875 71.625 0.647814
spatial = 1
temporal = 3

ivar, europe_temp_df = next(get_density_cubes(
    europe_datacube_201007,
    spatial=spatial,
    temporal=temporal
))

levels = ['time', 'lon', 'lat']
europe_temp_df = europe_temp_df.reorder_levels(levels)

europe_temp_df.head()
var_x0 var_x1 var_x2
time lon lat
2010-07-24 51.625 71.625 1.747494 1.073725 1.334421
51.875 71.625 1.201953 0.647814 0.841013
52.125 71.625 1.158344 0.755085 0.847133
52.375 71.625 1.320013 0.837758 0.829512
52.625 71.625 1.087877 0.642634 0.499693
levels = ['time', 'lon', 'lat']


idx = europe_temp_df.index.intersection(europe_df.index)
idx.shape, 
((26694,),)
X_df = europe_df.loc[idx,:]
Y_df = europe_temp_df.loc[idx,:]
X_df.shape, Y_df.shape
((26694, 1), (26694, 3))

7. Models Framework

7.1 Preprocessing

7.1.1 - Inputs and Outputs

europe_df.head()
gross_primary_productivity
time lon lat
2010-07-08 51.625 71.625 1.747494
2010-07-16 51.625 71.625 1.073725
2010-07-24 51.625 71.625 1.334421
2010-07-08 51.875 71.625 1.201953
2010-07-16 51.875 71.625 0.647814
y = europe_df.iloc[:, 0][:, np.newaxis]
X = europe_df.iloc[:, 1:]

d_dimensions = X.shape[1]

7.1.2 - Train-Test Split

from sklearn.model_selection import train_test_split


train_size = 2_000
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=train_size, random_state=random_state)

test_size = xtest.shape[0]

7.1.3 - Normalize

from sklearn.preprocessing import StandardScaler

# center inputs (remove the mean only)
x_normalizer = StandardScaler(with_mean=True, with_std=False)

xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)

# remove mean outputs
y_normalizer = StandardScaler(with_std=False)

ytrain_norm = y_normalizer.fit_transform(ytrain)
ytest_norm = y_normalizer.transform(ytest)

7.2 - Training

# from src.models.gp import SparseGPR
import GPy
from scipy.cluster.vq import kmeans2
# Kernel Function (RBF)
n_dims = xtrain_norm.shape[1]
kernel = GPy.kern.RBF(input_dim=n_dims, ARD=False)

# Inducing Points (k-means centroids of the training inputs)
n_inducing = 100
z = kmeans2(xtrain_norm, n_inducing, minit="points")[0]

# Initialize GP Model on the (centered) training data
gp_model = GPy.models.SparseGPRegression(xtrain_norm, ytrain_norm, kernel=kernel, Z=z)

# choose VFE inference method
gp_model.inference_method = (GPy.inference.latent_function_inference.VarDTC())

# fix variance to be low in the beginning
gp_model.Gaussian_noise.variance = 0.01
2020-04-30 19:40:34,207:INFO:initializing Y
2020-04-30 19:40:34,209:INFO:initializing inference method
2020-04-30 19:40:34,209:INFO:adding kernel and likelihood as parameters
2020-04-30 19:40:34,211:INFO:Adding Z as parameter
# optimize GP Model
n_restarts = 0
verbose = 1
max_iters = 1_000

# optimize
gp_model.optimize(
    optimizer='scg',
    messages=verbose,
    max_iters=max_iters,
);
KeyboardInterrupt: the SCG optimization was stopped manually before completion (the interrupt also surfaced a secondary TypeError: unsupported format string passed to NoneType.__format__ from GPy's verbose progress printout).

7.3 - Testing

ypred = gp_model.predict(xtest_norm, )[0]
ypred.shape, ytest_norm.shape
((67414, 1), (67414, 1))
stats = Metrics().get_all(ypred.squeeze(), ytest_norm.squeeze())
stats
mae mse rmse r2
0 1.584058 4.35029 2.085735 0.325761
def _predict(model, Xs, batch_size):
    """Predict in small batches to keep memory usage manageable."""
    ms = []
    n = max(len(Xs) // batch_size, 1)  # number of batches
    with tqdm(np.array_split(Xs, n)) as bar:
        for xs in bar:
            # GPy's predict returns (mean, variance); keep the mean only
            m = model.predict(xs)[0]
            ms.append(m)

    return np.vstack(ms)
batch_size = 5_000
ms = []
n = max(len(xtest_norm) // batch_size, 1)  # predict in small batches
with tqdm(np.array_split(xtest_norm, n)) as bar:
    for xs in bar:
        m = gp_model.predict(xs)[0]   # keep the predictive mean
        ms.append(m)
100%|██████████| 598/598 [00:51<00:00, 11.56it/s]

8. Direct Measurements

8.1 - \rho V Coefficient

Now, we will explore the simplest linear method: a multi-dimensional extension of the Pearson correlation coefficient called the \rho V coefficient (\rho-Vector coefficient). Most people are familiar with the correlation coefficient:

\rho(X,Y) = \frac{\text{Cov}(X,Y)}{\sqrt{\text{Var}(X)}\sqrt{\text{Var}(Y)}}

This is well known in the literature, but it does not directly apply to multi-dimensional data: for multivariate X and Y the covariance is a matrix rather than a single number, so we need a way to summarize all of that information in one scalar. One extension is to build the pairwise similarity matrices (i.e. Gram matrices) \mathbf{XX^\top} and \mathbf{YY^\top} for each variable and then compare them with the Frobenius (Hilbert-Schmidt) inner product of the cross term, normalized by the Frobenius norms of the individual terms:

\rho V (\mathbf{X,Y}) = \frac{\left\langle \mathbf{XX^\top, YY^\top} \right\rangle_\mathbf{F}} {\left\| \mathbf{XX^\top} \right\|_\mathbf{F} \left\| \mathbf{YY^\top} \right\|_\mathbf{F}}

Note: this is very similar to HSIC and Centered Kernel Alignment (CKA), but the \rho V coefficient predates them. CKA generalizes this method by allowing non-linear kernel functions and related distance-based measures. We will explore this in the next section.

To code this up, we will return all of the components of this equation because we will need them later.

from typing import Dict
from sklearn.preprocessing import KernelCenterer
from sklearn.metrics.pairwise import linear_kernel


def rv_coefficient(X: np.ndarray, Y: np.ndarray) -> Dict:
    """simple function to calculate the rv coefficient"""
    # calculate the kernel matrices
    X_gram = linear_kernel(X)
    Y_gram = linear_kernel(Y)

    # center the kernels
    X_gram = KernelCenterer().fit_transform(X_gram)
    Y_gram = KernelCenterer().fit_transform(Y_gram)

    # normalizing coefficients (denominator)
    x_norm = np.linalg.norm(X_gram)
    y_norm = np.linalg.norm(Y_gram)

    # Frobenius inner product of the cross term (numerator)
    xy_norm = np.sum(X_gram * Y_gram)

    # rv coefficient
    rv_coeff = xy_norm / x_norm / y_norm

    return {
        'coefficient': rv_coeff,
        'x_norm': x_norm,
        'y_norm': y_norm,
        'xy_norm': xy_norm
    }
X_samples = europe_temp_df.iloc[:, 0][:, np.newaxis]
Y_samples = europe_temp_df

logging.info(f" Size of X_samples: {X_samples.shape}, {Y_samples.shape}")

d_dimensions = X.shape[1]
2020-04-30 21:10:39,726:INFO: Size of X_samples: (26694, 1), (26694, 3)
# check that the coefficient is 1 if the data is the same
rv_coeff = rv_coefficient(X_samples[:100], X_samples[:100])
np.testing.assert_almost_equal(rv_coeff['coefficient'], 1)
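
As an additional sanity check (not part of the original pipeline), for one-dimensional data the \rho V coefficient reduces to the squared Pearson correlation, which we can verify on synthetic data:

# extra check on synthetic 1-D data: rho-V equals the squared Pearson correlation
rng = np.random.RandomState(123)
x_1d = rng.randn(500, 1)
y_1d = 0.7 * x_1d + 0.3 * rng.randn(500, 1)

rv_1d = rv_coefficient(x_1d, y_1d)['coefficient']
pearson_sq = np.corrcoef(x_1d.squeeze(), y_1d.squeeze())[0, 1] ** 2

np.testing.assert_almost_equal(rv_1d, pearson_sq, decimal=6)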

So now, let's try it when there is some difference between the variables.

%%time 
rv_coeff = rv_coefficient(X_samples[:], Y_samples[:])
rv_coeff
CPU times: user 23.8 s, sys: 4.9 s, total: 28.7 s
Wall time: 13.3 s
{'coefficient': 0.9696304,
 'x_norm': 191155.58,
 'y_norm': 510949.53,
 'xy_norm': 94704630000.0}

8.2 - Non-Linear Kernel

An extension we can make is to replace the linear kernel with a non-linear (RBF) kernel, which turns the \rho V coefficient into the centered kernel alignment (CKA) score; the kernel length scales are estimated from the data with a median-distance heuristic.
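
Concretely, with centered RBF kernel matrices \tilde{\mathbf{K}}_x and \tilde{\mathbf{K}}_y, the coefficient computed below is

\text{CKA}(\mathbf{X,Y}) = \frac{\left\langle \tilde{\mathbf{K}}_x, \tilde{\mathbf{K}}_y \right\rangle_\mathbf{F}}{\left\| \tilde{\mathbf{K}}_x \right\|_\mathbf{F} \left\| \tilde{\mathbf{K}}_y \right\|_\mathbf{F}}, \qquad (\mathbf{K}_x)_{ij} = \exp\left(-\frac{\|x_i - x_j\|^2}{2\sigma_x^2}\right)

where \sigma_x and \sigma_y are the estimated length scales.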

from typing import Optional
from scipy.spatial.distance import pdist, squareform

def estimate_sigma(
    X: np.ndarray, 
    method: str = 'median', 
    percent: Optional[int] = None, 
    heuristic: bool = False
) -> float:
    """Estimate an RBF kernel length scale from pairwise squared euclidean distances.

    NOTE: the 'silverman' and 'scott' options rely on silvermans_factor /
    scotts_factor helpers assumed to be defined elsewhere (e.g. in src)."""
    # rule-of-thumb estimators
    if method == 'silverman':
        return silvermans_factor(X)
    elif method == 'scott':
        return scotts_factor(X)
    # distance to the k-th nearest sample (k set by `percent`)
    elif percent is not None:
        kth_sample = int((percent / 100) * X.shape[0])
        dists = np.sort(squareform(pdist(X, 'sqeuclidean')))[:, kth_sample]
    # all pairwise squared euclidean distances
    else:
        dists = np.sort(pdist(X, 'sqeuclidean'))

    if method == 'median':
        sigma = np.median(dists)
    elif method == 'mean':
        sigma = np.mean(dists)
    else:
        raise ValueError(f"Unrecognized method: {method}")

    if heuristic:
        sigma = np.sqrt(sigma / 2)
    return sigma
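
For example, on a small synthetic sample (hypothetical data) the median heuristic gives a data-driven length scale:

# quick usage example of the sigma estimator on synthetic data
rng = np.random.RandomState(0)
sigma_demo = estimate_sigma(rng.randn(200, 3), method='median', percent=50)
sigma_demo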
from typing import Dict
from sklearn.preprocessing import KernelCenterer
from sklearn.gaussian_process.kernels import RBF


def cka_coefficient(X: np.ndarray, Y: np.ndarray) -> Dict:
    """simple function to calculate the (RBF-kernel) CKA coefficient"""

    # estimate sigmas
    sigma_X = estimate_sigma(X, method='median', percent=50)
    sigma_Y = estimate_sigma(Y, method='median', percent=50)
    # calculate the kernel matrices
    X_gram = RBF(sigma_X)(X)
    Y_gram = RBF(sigma_Y)(Y)

    # center the kernels
    X_gram = KernelCenterer().fit_transform(X_gram)
    Y_gram = KernelCenterer().fit_transform(Y_gram)

    # normalizing coefficients (denominator)
    x_norm = np.linalg.norm(X_gram)
    y_norm = np.linalg.norm(Y_gram)

    # Frobenius inner product of the cross term (numerator)
    xy_norm = np.sum(X_gram * Y_gram)

    # cka coefficient
    cka_coeff = xy_norm / x_norm / y_norm

    return {
        'coefficient': cka_coeff,
        'x_norm': x_norm,
        'y_norm': y_norm,
        'xy_norm': xy_norm
    }
# check that the coefficient is 1 if the data is the same
cka_coeff = cka_coefficient(X_samples[:100], X_samples[:100])
np.testing.assert_almost_equal(cka_coeff['coefficient'], 1)
%%time 
cka_coeff = cka_coefficient(X_samples[:10_000], Y_samples[:10_000])
cka_coeff
CPU times: user 20.4 s, sys: 2.89 s, total: 23.3 s
Wall time: 17 s
{'coefficient': 0.9576703788185938,
 'x_norm': 2326.331280441253,
 'y_norm': 1175.2587321221124,
 'xy_norm': 2618310.2249249523}

8.3 - Variation of Information
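
The score below is the RBIG-estimated mutual information normalized by the square roots of the marginal entropies, i.e.

\text{coefficient}(X,Y) = \frac{I(X,Y)}{\sqrt{H(X)}\sqrt{H(Y)}}

which is what the helper function variation_of_info computes.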

from rbig.rbig import RBIGMI, RBIG

rbig_results = {}
def variation_of_info(H_X, H_Y, I_XY):
    """Mutual information normalized by the square roots of the marginal entropies."""
    return I_XY / np.sqrt(H_X) / np.sqrt(H_Y)
%%time 
n_layers = 10000
rotation_type = 'PCA'
random_state = 0
zero_tolerance = 60
pdf_extension = None
pdf_resolution = None
tolerance = None

# Initialize RBIG class
H_rbig_model = RBIG(n_layers=n_layers, 
                  rotation_type=rotation_type, 
                  random_state=random_state, 
                  zero_tolerance=zero_tolerance,
                  tolerance=tolerance)

# fit model to the data
rbig_results['H_x'] = H_rbig_model.fit(X_samples).entropy(correction=True);

rbig_results['H_y'] = H_rbig_model.fit(Y_samples).entropy(correction=True);

# Initialize RBIG class
I_rbig_model = RBIGMI(n_layers=n_layers, 
                  rotation_type=rotation_type, 
                  random_state=random_state, 
                  zero_tolerance=zero_tolerance,
                  tolerance=tolerance)

# fit model to the data
rbig_results['I_xy'] = I_rbig_model.fit(X_samples, Y_samples).mutual_information();

# calculate the variation of information coefficient
rbig_results['coefficient'] = variation_of_info(
    rbig_results['H_x'],
    rbig_results['H_y'],
    rbig_results['I_xy']
)
CPU times: user 4min 12s, sys: 36.4 s, total: 4min 49s
Wall time: 11.2 s
rbig_results
{'H_x': 3.2281169474002924,
 'H_y': 5.181635094706355,
 'I_xy': 5.412206453196236,
 'coefficient': 1.3233243751128443}