
Spatial-Temporal Experiment

import sys, os
from pyprojroot import here

# put the project root on sys.path so the src.* imports below resolve
sys.path.append(str(here()))

# standard python packages
import xarray as xr
import pandas as pd
import numpy as np

# project-specific tools
from src.models.spatemp.train_models import Metrics

# esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
from esdc.transform import DensityCubes  # used in section 3

from tqdm import tqdm

import matplotlib.pyplot as plt
import cartopy
import cartopy.crs as ccrs
plt.style.use(['fivethirtyeight', 'seaborn-poster'])
%matplotlib inline

%load_ext autoreload
%autoreload 2
!ls /media/disk/databases/ESDC/
Cube_2019highColombiaCube_184x120x120.zarr
Cube_2019highColombiaCube_1x3360x2760.zarr
esdc-8d-0.083deg-184x270x270-2.0.0.zarr
esdc-8d-0.083deg-1x2160x4320-2.0.0.zarr
esdc-8d-0.25deg-184x90x90-2.0.0.zarr
esdc-8d-0.25deg-1x720x1440-2.0.0.zarr

1. Get DataCubes

filename = '/media/disk/databases/ESDC/esdc-8d-0.25deg-1x720x1440-2.0.0.zarr'

# lazily open the cube and keep only the two variables of interest
datacube = xr.open_zarr(filename)
lst_cube = datacube[['soil_moisture', 'land_surface_temperature']]
lst_cube
<xarray.Dataset>
Dimensions:                   (lat: 720, lon: 1440, time: 1702)
Coordinates:
  * lat                       (lat) float32 89.875 89.625 ... -89.625 -89.875
  * time                      (time) datetime64[ns] 1980-01-05 ... 2016-12-30
  * lon                       (lon) float32 -179.875 -179.625 ... 179.875
Data variables:
    soil_moisture             (time, lat, lon) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>
    land_surface_temperature  (time, lat, lon) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>
Attributes:
    Metadata_conventions :     Unidata Dataset Discovery v1.0
    acknowledgment:            The ESDL team acknowledges all data providers! 
    chunking:                  1x720x1440
    comment:                   none. 
    contributor_name:          Max Planck Institute for Biogeochemistry
    contributor_role:          ESDL Science Lead 
    creator_email:             info@earthsystemdatalab.net
    creator_name:              Brockmann Consult GmbH 
    creator_url:               www.earthsystemdatalab.net
    date_created:              17.12.2018
    date_issued:               19.12.2018
    date_modified:             17.12.2018
    geospatial_lat_max:        89.75
    geospatial_lat_min:        -89.75
    geospatial_lon_max:        179.75
    geospatial_lon_min:        -179.75
    geospatial_resolution:     1/4deg
    history:                   - processing with esdl cube v0.1  (https://git...
    id:                        v2.0.0
    institution:               Brockmann Consult GmbH 
    keywords:                  Earth Science, Geophysical Variables
    license:                   Please refer to individual variables
    naming_authority:          Earth System Data Lab team
    processing_level:          Level 4
    project:                   ESA Earth System Data Lab 
    publisher_email:           info@earthsystemdatalab.net
    publisher_name:            Brockmann Consult GmbH & Max Planck Institute ...
    publisher_url:             www.brockmann-consult.de
    standard_name_vocabulary:  CF-1.7
    summary:                   This data set contains a data cube of Earth Sy...
    time_coverage_duration:    P37Y
    time_coverage_end:         30.12.2016
    time_coverage_resolution:  P8D
    time_coverage_start:       05.01.1980
    title:                     Earth System Data Cube

2. Select Region

# lat is stored descending (89.875 -> -89.875), so the slice runs high -> low
europe = lst_cube.sel(lat=slice(71.5, 35.5), lon=slice(-18.0, 60.0))
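
The cartopy imports above go unused in this run; as a quick sanity check one could map a single time step of the subset. A minimal sketch, assuming the plotting stack imported at the top:

fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection=ccrs.PlateCarree())
europe.land_surface_temperature.isel(time=-1).plot(
    ax=ax, transform=ccrs.PlateCarree())
ax.coastlines()
plt.show()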

3. Get Density Cubes

spatial = 7     # 7x7 pixel neighbourhood
temporal = 1    # a single time step

# initialize minicuber
minicuber = DensityCubes(
    spatial_window=spatial,
    time_window=temporal,
)

europe_df = minicuber.get_minicubes(europe.land_surface_temperature)
europe_df.shape
(5982624, 49)
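
For intuition, here is a minimal sketch of what the density-cube extraction does; minicubes_sketch below is a hypothetical stand-in for DensityCubes.get_minicubes (it ignores the NaN filtering and the (time, lat, lon) index of the real class):

from numpy.lib.stride_tricks import sliding_window_view

def minicubes_sketch(arr, spatial=7, temporal=1):
    # slide a (temporal, spatial, spatial) window over a dense (time, lat, lon) array
    windows = sliding_window_view(arr, (temporal, spatial, spatial))
    # flatten: one row per window position, spatial * spatial * temporal columns
    return windows.reshape(-1, spatial * spatial * temporal)

demo = np.random.rand(10, 20, 20).astype(np.float32)
minicubes_sketch(demo).shape  # (1960, 49): 10 * 14 * 14 windows, 7 * 7 * 1 values each

Each row holds spatial² × temporal values, one of which becomes the regression target, which is where the "minus 1" feature counts in the next cell come from.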
# feature counts for comparable spatial x spatial x temporal windows
# (minus 1: one pixel per cube is held out as the target)
print(7 * 7 * 1 - 1)
print(5 * 5 * 2 - 1)
print(4 * 4 * 3 - 1)
print(3 * 3 * 5 - 1)
print(2 * 2 * 11 - 1)
print(1 * 1 * 46 - 1)
48
49
47
44
43
45

4. ML Model Framework

4.1 Preprocessing

4.1.1 - Features and Targets

europe_df.head()
                            var_x0      var_x1      ...  var_x47     var_x48
time       lat     lon
2002-05-21 70.625  54.375   268.145142  267.553741  ...  271.262665  269.876068
                   54.625   267.553741  267.075653  ...  269.876068  267.898865
                   54.875   267.075653  266.539734  ...  267.898865  267.227875
                   55.125   266.539734  265.585785  ...  267.227875  267.252319
                   55.375   265.585785  266.500458  ...  267.252319  267.736053

5 rows × 49 columns

# target: the first column; features: the remaining 48
y = europe_df.iloc[:, 0].to_numpy()[:, np.newaxis]
X = europe_df.iloc[:, 1:]

d_dimensions = X.shape[1]

4.1.2 - Train-Test Split

from sklearn.model_selection import train_test_split


# keep the training set tiny (1,000 of ~6M rows); the rest is held out for testing
train_size = 1_000
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=train_size, random_state=random_state)

test_size = xtest.shape[0]

4.1.3 - Normalize

from sklearn.preprocessing import StandardScaler

# center the inputs (mean only; with_std=False leaves the variance untouched)
x_normalizer = StandardScaler(with_mean=True, with_std=False)

xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)

# center the outputs the same way (mean removal only)
y_normalizer = StandardScaler(with_std=False)

ytrain_norm = y_normalizer.fit_transform(ytrain)
ytest_norm = y_normalizer.transform(ytest)

4.2 - Training

import GPy
from gpy.sparse import SparseGPR  # project wrapper around GPy's sparse GP

# gp params
n_dims = xtrain_norm.shape[1]
kernel = GPy.kern.RBF(input_dim=n_dims, ARD=False)  # one shared lengthscale
inference = 'vfe'       # variational free energy (Titsias) bound
n_inducing = 300
verbose = 1
max_iters = 5_000
n_restarts = 0

# initialize GP Model
sgp_model = SparseGPR(
    kernel=kernel, 
    inference=inference, 
    n_inducing=n_inducing, 
    verbose=verbose,
    max_iters=max_iters,
    n_restarts=n_restarts
)

# train GP model
sgp_model.fit(xtrain_norm, ytrain_norm)
SparseGPR(alpha=0.5, inference='vfe',
          kernel=<GPy.kern.src.rbf.RBF object at 0x7f17d6d41780>,
          max_iters=5000, n_inducing=300, n_restarts=0, optimizer='scg',
          verbose=1)
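
For reference, a minimal sketch of roughly what the SparseGPR wrapper presumably does with raw GPy (assuming GPy's default VarDTC inference, which implements the VFE bound, and the 'scg' optimizer shown in the repr above):

m = GPy.models.SparseGPRegression(
    xtrain_norm, ytrain_norm,
    kernel=GPy.kern.RBF(input_dim=n_dims, ARD=False),
    num_inducing=300,
)
m.optimize('scg', max_iters=5_000, messages=True)  # scaled conjugate gradients
mu, var = m.predict(xtest_norm)  # predictive mean and variance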
sgp_model.display_model()

Model: sparse_gp
Objective: 4313.986904027843
Number of Parameters: 14403
Number of Optimization Parameters: 14403
Updates: True

sparse_gp.               value                   constraints  priors
inducing inputs          (300, 48)
rbf.variance             2.1394880780812098e-16  +ve
rbf.lengthscale          0.8861813022707942      +ve
Gaussian_noise.variance  326.94648917027007      +ve

Note that the RBF variance has collapsed to essentially zero while the Gaussian noise variance is large: the optimized model attributes nearly all of the signal to noise, which foreshadows the near-zero R² in the testing section below.

4.3 - Testing


ypred = sgp_model.predict(xtest_norm, return_std=False)
ypred.shape, ytest_norm.shape
((5981624, 1), (5981624, 1))
stats = Metrics().get_all(ypred.squeeze(), ytest_norm.squeeze())
stats
         mae         mse       rmse        r2
0  15.522687  338.304949  18.393068 -0.000268
stats['r2'].values
array([-0.00026803])
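Metrics comes from the project's own src.models.spatemp.train_models; assuming its get_all reports plain MAE/MSE/RMSE/R², the same scores can be computed directly with scikit-learn:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(ytest_norm.squeeze(), ypred.squeeze())
mse = mean_squared_error(ytest_norm.squeeze(), ypred.squeeze())
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm.squeeze(), ypred.squeeze())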
def _predict(model, Xs, batch_size):
    """Predict in batches to keep the memory footprint bounded."""
    ms = []
    # ceil so that every batch has at most batch_size rows
    n = max(int(np.ceil(len(Xs) / batch_size)), 1)
    with tqdm(np.array_split(Xs, n)) as bar:
        for xs in bar:
            m = model.predict(xs)
            ms.append(m)

    # stack along axis 0; batch lengths can differ by one row,
    # so concatenating along axis 1 would fail
    return np.vstack(ms)
batch_size = 5_000
ms = []
n = max(len(xtest_norm) // batch_size, 1)  # predict in small batches
with tqdm(np.array_split(xtest_norm, n)) as bar:
    for xs in bar:
        m = sgp_model.predict(xs)
        ms.append(m)
100%|██████████| 598/598 [00:51<00:00, 11.56it/s]
np.vstack(ms).shape
(5981624, 1)
ypred = _predict(sgp_model, xtest_norm, 5_000)
100%|██████████| 1196/1196 [00:44<00:00, 27.05it/s]
ypred.shape
(5981624, 1)