Methods¶
In this notebook, I will walk through how we can estimate different similarity measures based on the density cubes that we derive.
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))
import pathlib
# standard python packages
import xarray as xr
import pandas as pd
import numpy as np
#
# Experiment Functions
from src.data.esdc import get_dataset
from src.features import Metrics
from src.features.temporal import select_period, get_smoke_test_time, TimePeriod
from src.features.spatial import select_region, get_europe, get_spain
from src.models.train_models import get_similarity_scores
from src.experiments.utils import dict_product, run_parallel_step
from src.features.density import get_density_cubes
from src.features.preprocessing import standardizer_data, get_reference_cube, get_common_indices
from src.models.similarity import cka_coefficient, rv_coefficient, rbig_it_measures
# # esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
# from esdc.transform import DensityCubes
from typing import List, Dict
import xarray as xr
from tqdm import tqdm
import cartopy
import cartopy.crs as ccrs
# NUMPY SETTINGS
import numpy as onp
onp.set_printoptions(precision=3, suppress=True)
# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")
# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
level=logging.INFO,
stream=sys.stdout,
format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
%load_ext autoreload
%autoreload 2
1. Get DataCubes¶
!ls /media/disk/databases/ESDC/
# Datapath
DATA_PATH = pathlib.Path("/media/disk/databases/ESDC/")
# get filename
filename = DATA_PATH.joinpath("esdc-8d-0.25deg-1x720x1440-2.0.0.zarr")
from typing import List
import xarray as xr
def get_dataset(variable: str) -> xr.Dataset:
return xr.open_zarr(str(filename))[[variable]]
variable = 'soil_moisture'
datacube = get_dataset(variable)
datacube
2. Select Region¶
For this task, we are going to do something simple: work with only Europe and a segment of Eurasia. I have outlined a region described by its latitude and longitude coordinates. With these coordinates, we can subset a section of the cube and continue working with that region only.
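For reference, this kind of bounding-box subset boils down to an xarray .sel with coordinate slices. A minimal sketch, using illustrative bounds rather than the exact values returned by get_europe:
lon_min, lon_max = -10.0, 30.0   # illustrative longitudes, not the get_europe() values
lat_min, lat_max = 35.0, 60.0    # illustrative latitudes
europe_subset_sketch = datacube.sel(
    lon=slice(lon_min, lon_max),
    lat=slice(lat_max, lat_min),  # many ESDC grids store latitude in descending order
)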
from src.features.temporal import select_period
from src.features.spatial import select_region, get_europe
# get european bounding box
europe_bbox = get_europe()
time_period = ('July-2010', 'July-2010')
# subset region
europe_datacube = select_region(datacube, europe_bbox)
# subset time period
europe_datacube_201007 = select_period(europe_datacube, time_period)
europe_datacube_201007
europe_datacube_201007.soil_moisture.mean('time').plot(vmin=0, robust=True)
3. Get Density Cubes¶
Now, we are going to create some density cubes. Instead of just taking the raw samples, we are going to construct features: the neighbouring pixels of each point in a spatio-temporal window. For this demonstration, we will assume that neighbouring pixels in space and time carry information about the central pixel, so each sample becomes a small "minicube" of values.
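Concretely, assuming the minicubes simply stack every pixel in the window, a spatial window $S$ and a temporal window $T$ give each sample $d = S^2 \times T$ features; with the settings used below ($S = 1$, $T = 3$) each sample has $d = 3$ values, i.e. the pixel at three consecutive time steps.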
from typing import Iterator, Tuple
from src.features.preprocessing import DensityCubes
def get_density_cubes(data: xr.Dataset, spatial: int, temporal: int) -> Iterator[Tuple[str, pd.DataFrame]]:
    """Wrapper function to yield the density cubes for each variable in a dataset"""
    for ikey, idata in data.items():
        yield ikey, DensityCubes(
            spatial_window=spatial,
            time_window=temporal
        ).get_minicubes(idata)
# All samples
europe_df = europe_datacube_201007.to_dataframe().dropna()
# reorder index
levels = ['time', 'lon', 'lat']
europe_df = europe_df.reorder_levels(levels)
# europe_df = europe_df[indx]
europe_df.head()
spatial = 1
temporal = 3
ivar, europe_temp_df = next(get_density_cubes(
europe_datacube_201007,
spatial=spatial,
temporal=temporal
))
levels = ['time', 'lon', 'lat']
europe_temp_df = europe_temp_df.reorder_levels(levels)
europe_temp_df.head()
levels = ['time', 'lon', 'lat']
idx = europe_temp_df.index.intersection(europe_df.index)
idx.shape,
X_df = europe_df.loc[idx,:]
Y_df = europe_temp_df.loc[idx,:]
X_df.shape, Y_df.shape
4. Models Framework¶
4.1 Preprocessing¶
4.1.1 - Training and Testing¶
europe_temp_df.head()
# use the first density-cube column as the target and the remaining columns as inputs
y = europe_temp_df.iloc[:, 0].values[:, np.newaxis]
X = europe_temp_df.iloc[:, 1:]
d_dimensions = X.shape[1]
4.1.2 - Train-Test Split¶
from sklearn.model_selection import train_test_split
train_size = 2_000
random_state = 123
xtrain, xtest, ytrain, ytest = train_test_split(
X, y, train_size=train_size, random_state=random_state)
test_size = xtest.shape[0]
4.1.3 - Normalize¶
from sklearn.preprocessing import StandardScaler
# center the inputs (remove the mean only)
x_normalizer = StandardScaler(with_mean=True, with_std=False)
xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)
# remove mean outputs
y_normalizer = StandardScaler(with_std=False)
ytrain_norm = y_normalizer.fit_transform(ytrain)
ytest_norm = y_normalizer.transform(ytest)
4.2 - Training¶
# from src.models.gp import SparseGPR
import GPy
from scipy.cluster.vq import kmeans2
# Kernel Function (RBF)
n_dims = xtrain_norm.shape[1]
kernel = GPy.kern.RBF(input_dim=n_dims, ARD=False)
# Inducing Points
n_inducing = 100
z = kmeans2(xtrain_norm, n_inducing, minit="points")[0]
# Initialize GP Model (on the normalized training data)
gp_model = GPy.models.SparseGPRegression(xtrain_norm, ytrain_norm, kernel=kernel, Z=z)
# choose VFE inference method
gp_model.inference_method = GPy.inference.latent_function_inference.VarDTC()
# fix variance to be low in the beginning
gp_model.Gaussian_noise.variance = 0.01
# optimize GP Model
n_restarts = 0
verbose = 1
max_iters = 1_000
# optimize
gp_model.optimize(
optimizer='scg',
messages=verbose,
max_iters=max_iters,
);
4.3 - Testing¶
ypred = gp_model.predict(xtest_norm, )[0]
ypred.shape, ytest_norm.shape
stats = Metrics().get_all(ypred.squeeze(), ytest_norm.squeeze())
stats
def _predict(model, Xs, batch_size):
    """Predict in small batches to avoid memory issues."""
    ms = []
    n = max(int(np.ceil(len(Xs) / batch_size)), 1)
    with tqdm(np.array_split(Xs, n)) as bar:
        for xs in bar:
            m = model.predict(xs)[0]  # keep the predictive mean only
            ms.append(m)
    return np.vstack(ms)
batch_size = 5_000
ms = []
n = max(int(np.ceil(len(xtest_norm) / batch_size)), 1)  # predict in small batches
with tqdm(np.array_split(xtest_norm, n)) as bar:
    for xs in bar:
        m = gp_model.predict(xs)[0]  # predictive mean
        ms.append(m)
ypred_batched = np.vstack(ms)
5. Direct Measurements¶
5.1 - $\rho V$ Coefficient¶
Now, we will explore the simplest linear method: a multi-dimensional extension of the Pearson correlation coefficient called the $\rho V$-coefficient ($\rho$-Vector coefficient). Most people are familiar with the correlation coefficient between two univariate variables:
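$$\rho(x, y) = \frac{\operatorname{cov}(x, y)}{\sigma_x \, \sigma_y}$$
where $\operatorname{cov}(x, y)$ is the covariance and $\sigma_x$, $\sigma_y$ are the standard deviations.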
This is very well known in the literature, but it doesn't directly apply to multi-dimensional data: the numerator and denominator yield a scalar per pair of dimensions, and there is no obvious way to summarize all of that information into a single value. One extension is to build the matrix of pairwise components (i.e. the Gram matrix) for each set of variables and then take the Frobenius norm (Hilbert-Schmidt norm) of the cross term as well as of the individual terms. The equation looks like this:
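$$\rho V(\mathbf{X}, \mathbf{Y}) = \frac{\langle \mathbf{X}\mathbf{X}^\top, \mathbf{Y}\mathbf{Y}^\top \rangle_F}{\|\mathbf{X}\mathbf{X}^\top\|_F \, \|\mathbf{Y}\mathbf{Y}^\top\|_F}$$
where $\mathbf{X}\mathbf{X}^\top$ and $\mathbf{Y}\mathbf{Y}^\top$ are the (centered) linear Gram matrices, $\langle \cdot, \cdot \rangle_F$ is the Frobenius inner product, and $\|\cdot\|_F$ is the Frobenius norm. This is exactly what the function below computes with a linear kernel plus kernel centering.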
Note: this is very similar to HSIC and Centered Kernel Alignment (CKA), but this method predates both. CKA generalizes it by allowing distance measures and non-linear kernel functions; we will explore this in the next section.
To code this up, we will return all of the components of this equation, because we will need them later.
from typing import Dict
from sklearn.preprocessing import KernelCenterer
from sklearn.metrics.pairwise import linear_kernel
def rv_coefficient(X: np.ndarray, Y: np.ndarray) -> Dict:
    """Simple function to calculate the RV coefficient"""
    # calculate the (linear) kernel matrices
    X_gram = linear_kernel(X)
    Y_gram = linear_kernel(Y)
    # center the kernels
    X_gram = KernelCenterer().fit_transform(X_gram)
    Y_gram = KernelCenterer().fit_transform(Y_gram)
    # normalizing coefficients (denominator)
    x_norm = np.linalg.norm(X_gram)
    y_norm = np.linalg.norm(Y_gram)
    # Frobenius inner product of the cross terms (numerator)
    xy_norm = np.sum(X_gram * Y_gram)
    # rv coefficient
    rv_coeff = xy_norm / x_norm / y_norm
    return {
        'coefficient': rv_coeff,
        'x_norm': x_norm,
        'y_norm': y_norm,
        'xy_norm': xy_norm
    }
X_samples = europe_temp_df.iloc[:, 0].values[:, np.newaxis]
Y_samples = europe_temp_df.values
logging.info(f"Sizes - X_samples: {X_samples.shape}, Y_samples: {Y_samples.shape}")
d_dimensions = X.shape[1]
# check that the coefficient is 1 if the data is the same
rv_coeff = rv_coefficient(X_samples[:100], X_samples[:100])
np.testing.assert_almost_equal(rv_coeff['coefficient'], 1)
%%time
rv_coeff = rv_coefficient(X_samples[:], Y_samples[:])
rv_coeff
5.2 - Non-Linear Kernel¶
One addition we can make is to explore how the measure behaves when we swap the linear kernel for a non-linear (RBF) kernel, which gives us a kernel alignment (CKA) coefficient. To do so, we first need a heuristic for estimating the kernel length scale, $\sigma$.
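Concretely, the coefficient keeps the same form as the $\rho V$-coefficient above, but the linear Gram matrices are replaced by (centered) RBF kernel matrices with length scales $\sigma_x$ and $\sigma_y$ estimated from the data:
$$\text{CKA}(\mathbf{X}, \mathbf{Y}) = \frac{\langle \mathbf{K}_x, \mathbf{K}_y \rangle_F}{\|\mathbf{K}_x\|_F \, \|\mathbf{K}_y\|_F}, \qquad (\mathbf{K}_x)_{ij} = \exp\left(-\frac{\|x_i - x_j\|^2}{2\sigma_x^2}\right)$$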
from typing import Optional
from scipy.spatial.distance import pdist, squareform
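# NOTE: estimate_sigma below refers to silvermans_factor and scotts_factor, which are not
# defined in this notebook. A minimal sketch is given here, assuming they implement the
# standard Scott / Silverman KDE bandwidth rules (the original helpers may differ).
def scotts_factor(X: np.ndarray) -> float:
    """Scott's rule of thumb: n ** (-1 / (d + 4))."""
    n_samples, n_features = X.shape
    return np.power(n_samples, -1.0 / (n_features + 4))
def silvermans_factor(X: np.ndarray) -> float:
    """Silverman's rule of thumb: (n * (d + 2) / 4) ** (-1 / (d + 4))."""
    n_samples, n_features = X.shape
    return np.power(n_samples * (n_features + 2) / 4.0, -1.0 / (n_features + 4))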
def estimate_sigma(
X: np.ndarray,
method: str='median',
percent: Optional[int]=None,
heuristic: bool=False
) -> float:
# get the squared euclidean distances
if method == 'silverman':
return silvermans_factor(X)
elif method == 'scott':
return scotts_factor(X)
elif percent is not None:
kth_sample = int((percent/100) * X.shape[0])
dists = np.sort(squareform(pdist(X, 'sqeuclidean')))[:, kth_sample]
# print(dists.shape, dists.min(), dists.max())
else:
dists = np.sort(pdist(X, 'sqeuclidean'))
# print(dists.shape, dists.min(), dists.max())
if method == 'median':
sigma = np.median(dists)
elif method == 'mean':
sigma = np.mean(dists)
else:
raise ValueError(f"Unrecognized distance measure: {method}")
if heuristic:
sigma = np.sqrt(sigma / 2)
return sigma
from typing import Dict
from sklearn.preprocessing import KernelCenterer
from sklearn.gaussian_process.kernels import RBF
def cka_coefficient(X: np.ndarray, Y: np.ndarray) -> Dict:
    """Simple function to calculate the (kernel) CKA coefficient"""
    # estimate the kernel length scales (sigmas)
    sigma_X = estimate_sigma(X, method='median', percent=50)
    sigma_Y = estimate_sigma(Y, method='median', percent=50)
    # calculate the RBF kernel matrices
    X_gram = RBF(sigma_X)(X)
    Y_gram = RBF(sigma_Y)(Y)
    # center the kernels
    X_gram = KernelCenterer().fit_transform(X_gram)
    Y_gram = KernelCenterer().fit_transform(Y_gram)
    # normalizing coefficients (denominator)
    x_norm = np.linalg.norm(X_gram)
    y_norm = np.linalg.norm(Y_gram)
    # Frobenius inner product of the cross terms (numerator)
    xy_norm = np.sum(X_gram * Y_gram)
    # cka coefficient
    cka_coeff = xy_norm / x_norm / y_norm
    return {
        'coefficient': cka_coeff,
        'x_norm': x_norm,
        'y_norm': y_norm,
        'xy_norm': xy_norm
    }
# check that the coefficient is 1 if the data is the same
cka_coeff = cka_coefficient(X_samples[:100], X_samples[:100])
np.testing.assert_almost_equal(cka_coeff['coefficient'], 1)
%%time
cka_coeff = cka_coefficient(X_samples[:10_000], Y_samples[:10_000])
cka_coeff
5.3 - Variation of Information¶
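With RBIG we can estimate the entropies $H(X)$ and $H(Y)$ and the mutual information $I(X, Y)$ directly from samples, and summarize them with the normalized coefficient used below (note that this is a normalized mutual-information score rather than the variation of information in the strict sense, $H(X) + H(Y) - 2I(X, Y)$):
$$\nu(X, Y) = \frac{I(X, Y)}{\sqrt{H(X)\, H(Y)}}$$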
from rbig.rbig import RBIGMI, RBIG
rbig_results = {}
def variation_of_info(H_X, H_Y, I_XY):
return I_XY / np.sqrt(H_X) / np.sqrt(H_Y)
%%time
n_layers = 10000
rotation_type = 'PCA'
random_state = 0
zero_tolerance = 60
pdf_extension = None
pdf_resolution = None
tolerance = None
# Initialize RBIG class
H_rbig_model = RBIG(n_layers=n_layers,
rotation_type=rotation_type,
random_state=random_state,
zero_tolerance=zero_tolerance,
tolerance=tolerance)
# fit model to the data
rbig_results['H_x'] = H_rbig_model.fit(X_samples).entropy(correction=True);
rbig_results['H_y'] = H_rbig_model.fit(Y_samples).entropy(correction=True);
# Initialize RBIG class
I_rbig_model = RBIGMI(n_layers=n_layers,
rotation_type=rotation_type,
random_state=random_state,
zero_tolerance=zero_tolerance,
tolerance=tolerance)
# fit model to the data
rbig_results['I_xy'] = I_rbig_model.fit(X_samples, Y_samples).mutual_information();
# calculate the variation of information coefficient
rbig_results['coefficient'] = variation_of_info(
rbig_results['H_x'],
rbig_results['H_y'],
rbig_results['I_xy']
)
rbig_results