
Spatial-Temporal Experiment

import sys, os
from pyprojroot import here

# put the project root on sys.path so the src.* imports below resolve
sys.path.append(str(here()))

# standard python packages
import xarray as xr
import pandas as pd
import numpy as np

# project-specific tools
from src.models.spatemp.train_models import Metrics

# esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
from esdc.transform import DensityCubes  # used in section 3

from tqdm import tqdm

import matplotlib.pyplot as plt
import cartopy
import cartopy.crs as ccrs
plt.style.use(['fivethirtyeight', 'seaborn-poster'])
%matplotlib inline

%load_ext autoreload
%autoreload 2
!ls /media/disk/databases/ESDC/
Cube_2019highColombiaCube_184x120x120.zarr
Cube_2019highColombiaCube_1x3360x2760.zarr
esdc-8d-0.083deg-184x270x270-2.0.0.zarr
esdc-8d-0.083deg-1x2160x4320-2.0.0.zarr
esdc-8d-0.25deg-184x90x90-2.0.0.zarr
esdc-8d-0.25deg-1x720x1440-2.0.0.zarr

1. Get DataCubes

filename = '/media/disk/databases/ESDC/esdc-8d-0.25deg-1x720x1440-2.0.0.zarr'

# lazily open the cube and keep only the two variables of interest
datacube = xr.open_zarr(filename)
lst_cube = datacube[['soil_moisture', 'land_surface_temperature']]
lst_cube
<xarray.Dataset>
Dimensions:                   (lat: 720, lon: 1440, time: 1702)
Coordinates:
  * lat                       (lat) float32 89.875 89.625 ... -89.625 -89.875
  * time                      (time) datetime64[ns] 1980-01-05 ... 2016-12-30
  * lon                       (lon) float32 -179.875 -179.625 ... 179.875
Data variables:
    soil_moisture             (time, lat, lon) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>
    land_surface_temperature  (time, lat, lon) float32 dask.array<chunksize=(1, 720, 1440), meta=np.ndarray>
Attributes:
    Metadata_conventions :     Unidata Dataset Discovery v1.0
    acknowledgment:            The ESDL team acknowledges all data providers! 
    chunking:                  1x720x1440
    comment:                   none. 
    contributor_name:          Max Planck Institute for Biogeochemistry
    contributor_role:          ESDL Science Lead 
    creator_email:             info@earthsystemdatalab.net
    creator_name:              Brockmann Consult GmbH 
    creator_url:               www.earthsystemdatalab.net
    date_created:              17.12.2018
    date_issued:               19.12.2018
    date_modified:             17.12.2018
    geospatial_lat_max:        89.75
    geospatial_lat_min:        -89.75
    geospatial_lon_max:        179.75
    geospatial_lon_min:        -179.75
    geospatial_resolution:     1/4deg
    history:                   - processing with esdl cube v0.1  (https://git...
    id:                        v2.0.0
    institution:               Brockmann Consult GmbH 
    keywords:                  Earth Science, Geophysical Variables
    license:                   Please refer to individual variables
    naming_authority:          Earth System Data Lab team
    processing_level:          Level 4
    project:                   ESA Earth System Data Lab 
    publisher_email:           info@earthsystemdatalab.net
    publisher_name:            Brockmann Consult GmbH & Max Planck Institute ...
    publisher_url:             www.brockmann-consult.de
    standard_name_vocabulary:  CF-1.7
    summary:                   This data set contains a data cube of Earth Sy...
    time_coverage_duration:    P37Y
    time_coverage_end:         30.12.2016
    time_coverage_resolution:  P8D
    time_coverage_start:       05.01.1980
    title:                     Earth System Data Cube

2. Select Region

# lat is stored descending (89.875 -> -89.875), so the slice runs high -> low
europe = lst_cube.sel(lat=slice(71.5, 35.5), lon=slice(-18.0, 60.0))
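
The cartopy imports above go unused in this run; as a quick sanity check one could map a single time step of the subset. A minimal sketch, assuming the plotting stack imported at the top:

fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection=ccrs.PlateCarree())
europe.land_surface_temperature.isel(time=-1).plot(
    ax=ax, transform=ccrs.PlateCarree())
ax.coastlines()
plt.show()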

3. Get Density Cubes

spatial = 7     # 7x7 pixel neighbourhood
temporal = 1    # a single time step

# initialize minicuber
minicuber = DensityCubes(
    spatial_window=spatial,
    time_window=temporal,
)

europe_df = minicuber.get_minicubes(europe.land_surface_temperature)
europe_df.shape
(5982624, 49)
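
For intuition, here is a minimal sketch of what the density-cube extraction does; minicubes_sketch below is a hypothetical stand-in for DensityCubes.get_minicubes (it ignores the NaN filtering and the (time, lat, lon) index of the real class):

from numpy.lib.stride_tricks import sliding_window_view

def minicubes_sketch(arr, spatial=7, temporal=1):
    # slide a (temporal, spatial, spatial) window over a dense (time, lat, lon) array
    windows = sliding_window_view(arr, (temporal, spatial, spatial))
    # flatten: one row per window position, spatial * spatial * temporal columns
    return windows.reshape(-1, spatial * spatial * temporal)

demo = np.random.rand(10, 20, 20).astype(np.float32)
minicubes_sketch(demo).shape  # (1960, 49): 10 * 14 * 14 windows, 7 * 7 * 1 values each

Each row holds spatial² × temporal values, one of which becomes the regression target, which is where the "minus 1" feature counts in the next cell come from.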
# feature counts for comparable spatial x spatial x temporal windows
# (minus 1: one pixel per cube is held out as the target)
print(7 * 7 * 1 - 1)
print(5 * 5 * 2 - 1)
print(4 * 4 * 3 - 1)
print(3 * 3 * 5 - 1)
print(2 * 2 * 11 - 1)
print(1 * 1 * 46 - 1)
48
49
47
44
43
45

4. ML Model Framework

4.1 Preprocessing

4.1.1 - Features and Targets

europe_df.head()
                            var_x0      var_x1      ...  var_x47     var_x48
time       lat     lon
2002-05-21 70.625  54.375   268.145142  267.553741  ...  271.262665  269.876068
                   54.625   267.553741  267.075653  ...  269.876068  267.898865
                   54.875   267.075653  266.539734  ...  267.898865  267.227875
                   55.125   266.539734  265.585785  ...  267.227875  267.252319
                   55.375   265.585785  266.500458  ...  267.252319  267.736053

5 rows × 49 columns

# target: the first column; features: the remaining 48
y = europe_df.iloc[:, 0].to_numpy()[:, np.newaxis]
X = europe_df.iloc[:, 1:]

d_dimensions = X.shape[1]

4.1.2 - Train-Test Split

from sklearn.model_selection import train_test_split


# keep the training set tiny (1,000 of ~6M rows); the rest is held out for testing
train_size = 1_000
random_state = 123

xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=train_size, random_state=random_state)

test_size = xtest.shape[0]

4.1.3 - Normalize

from sklearn.preprocessing import StandardScaler

# center the inputs (mean only; with_std=False leaves the variance untouched)
x_normalizer = StandardScaler(with_mean=True, with_std=False)

xtrain_norm = x_normalizer.fit_transform(xtrain)
xtest_norm = x_normalizer.transform(xtest)

# center the outputs the same way (mean removal only)
y_normalizer = StandardScaler(with_std=False)

ytrain_norm = y_normalizer.fit_transform(ytrain)
ytest_norm = y_normalizer.transform(ytest)

4.2 - Training

import GPy
from gpy.sparse import SparseGPR  # project wrapper around GPy's sparse GP

# gp params
n_dims = xtrain_norm.shape[1]
kernel = GPy.kern.RBF(input_dim=n_dims, ARD=False)  # one shared lengthscale
inference = 'vfe'       # variational free energy (Titsias) bound
n_inducing = 300
verbose = 1
max_iters = 5_000
n_restarts = 0

# initialize GP Model
sgp_model = SparseGPR(
    kernel=kernel, 
    inference=inference, 
    n_inducing=n_inducing, 
    verbose=verbose,
    max_iters=max_iters,
    n_restarts=n_restarts
)

# train GP model
sgp_model.fit(xtrain_norm, ytrain_norm)
SparseGPR(alpha=0.5, inference='vfe',
          kernel=<GPy.kern.src.rbf.RBF object at 0x7f17d6d41780>,
          max_iters=5000, n_inducing=300, n_restarts=0, optimizer='scg',
          verbose=1)
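
For reference, a minimal sketch of roughly what the SparseGPR wrapper presumably does with raw GPy (assuming GPy's default VarDTC inference, which implements the VFE bound, and the 'scg' optimizer shown in the repr above):

m = GPy.models.SparseGPRegression(
    xtrain_norm, ytrain_norm,
    kernel=GPy.kern.RBF(input_dim=n_dims, ARD=False),
    num_inducing=300,
)
m.optimize('scg', max_iters=5_000, messages=True)  # scaled conjugate gradients
mu, var = m.predict(xtest_norm)  # predictive mean and variance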
sgp_model.display_model()

Model: sparse_gp
Objective: 4313.986904027843
Number of Parameters: 14403
Number of Optimization Parameters: 14403
Updates: True

sparse_gp.               value                   constraints  priors
inducing inputs          (300, 48)
rbf.variance             2.1394880780812098e-16  +ve
rbf.lengthscale          0.8861813022707942      +ve
Gaussian_noise.variance  326.94648917027007      +ve

Note that the RBF variance has collapsed to essentially zero while the Gaussian noise variance is large: the optimized model attributes nearly all of the signal to noise, which foreshadows the near-zero R² in the testing section below.

4.3 - Testing


ypred = sgp_model.predict(xtest_norm, return_std=False)
ypred.shape, ytest_norm.shape
((5981624, 1), (5981624, 1))
stats = Metrics().get_all(ypred.squeeze(), ytest_norm.squeeze())
stats
         mae         mse       rmse        r2
0  15.522687  338.304949  18.393068 -0.000268
stats['r2'].values
array([-0.00026803])
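Metrics comes from the project's own src.models.spatemp.train_models; assuming its get_all reports plain MAE/MSE/RMSE/R², the same scores can be computed directly with scikit-learn:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(ytest_norm.squeeze(), ypred.squeeze())
mse = mean_squared_error(ytest_norm.squeeze(), ypred.squeeze())
rmse = np.sqrt(mse)
r2 = r2_score(ytest_norm.squeeze(), ypred.squeeze())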
def _predict(model, Xs, batch_size):
    """Predict in batches to keep the memory footprint bounded."""
    ms = []
    # ceil so that every batch has at most batch_size rows
    n = max(int(np.ceil(len(Xs) / batch_size)), 1)
    with tqdm(np.array_split(Xs, n)) as bar:
        for xs in bar:
            m = model.predict(xs)
            ms.append(m)

    # stack along axis 0; batch lengths can differ by one row,
    # so concatenating along axis 1 would fail
    return np.vstack(ms)
batch_size = 5_000
ms = []
n = max(len(xtest_norm) // batch_size, 1)  # predict in small batches
with tqdm(np.array_split(xtest_norm, n)) as bar:
    for xs in bar:
        m = sgp_model.predict(xs)
        ms.append(m)
100%|██████████| 598/598 [00:51<00:00, 11.56it/s]
np.vstack(ms).shape
(5981624, 1)
ypred = _predict(sgp_model, xtest_norm, 5_000)
100%|██████████| 1196/1196 [00:44<00:00, 27.05it/s]
ypred.shape
(5981624, 1)