Experiment Example
==================

Drought experiment notebook: RBIG information measures (entropy, total
correlation, mutual information) computed over a California drought datacube.

import sys, os
cwd = os.getcwd()
sys.path.insert(0, f'{cwd}/../../')
sys.path.insert(0, '/home/emmanuel/code/py_esdc')


import xarray as xr
import pandas as pd
import numpy as np

# drought tools
from src.data.drought.loader import DataLoader
from src.features.drought.build_features import (
    get_cali_geometry,
    mask_datacube,
    smooth_vod_signal,
    remove_climatology,
    get_cali_emdata,
    get_drought_years,
    get_density_cubes,
    get_common_elements_many,
    normalize
)
from src.visualization.drought.analysis import plot_mean_time

# esdc tools
from esdc.subset import select_pixel
from esdc.shape import ShapeFileExtract, rasterize
from esdc.transform import DensityCubes

# RBIG
from src.models.train_models import run_rbig_models
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tqdm import tqdm

import matplotlib.pyplot as plt
import cartopy
import cartopy.crs as ccrs
plt.style.use(['fivethirtyeight', 'seaborn-poster'])
%matplotlib inline

%load_ext autoreload
%autoreload 2
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
# Load Data
# ---------
# Build the California drought anomaly cube: load the CONUS datacube at
# 14-day sampling, mask it to the California geometry, fill temporal gaps,
# and remove the seasonal climatology.
region = 'conus'    # source region of the datacube
sampling = '14D'    # temporal sampling (14-day composites)

drought_cube = DataLoader().load_data(region, sampling)

# Subset california
cali_geoms = get_cali_geometry()

drought_cube = mask_datacube(drought_cube, cali_geoms)

# interpolate
# interpolation arguments
interp_dim = 'time'   # fill gaps along the time axis
method = 'linear'

# do interpolation (linear fill of NaNs along `time`)
drought_cube = drought_cube.interpolate_na(
    dim=interp_dim, 
    method=method
)

# remove climatology (keep the anomalies; the climatology itself is discarded)
drought_cube, _ = remove_climatology(drought_cube)


# drought years: "year" -> whether California was in drought that year.
# NOTE(review): keys are STRINGS -- when the year comes from an xarray
# groupby('time.year') (which yields integers), look up with str(year).
drought_years = {
    "2010": False,
    "2011": False,
    "2012": True,
    "2013": False,
    "2014": True,
    "2015": True,
}

# MI elements: variable pairs compared in Experiment II
common_vars = [
    ('VOD', 'NDVI'),
    ('VOD', 'LST'),
    ('VOD', 'SM'),
    ('NDVI', 'LST'),
    ('NDVI', 'SM'),
    ('LST', 'SM')
]

# variables analysed individually in Experiment I
variables = [
    'VOD', 'NDVI', 'SM', 'LST'
]

Experiment I - Individual Measurements

In this part, we look at the standard per-variable information measures:

  • Entropy, H
  • Total Correlation, TC
# Experiment I: entropy (H) and total correlation (TC) per variable,
# per year, for a range of temporal window sizes.
time_steps = range(1, 12)   # temporal window sizes for the density cubes
spatial = 1                 # spatial window size (1 pixel)

# Collect one result row per (year, time step, variable). Rows are gathered
# in a list and turned into a DataFrame once at the end: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and is O(n^2) in a loop.
result_rows = []


with tqdm(drought_cube.groupby('time.year')) as years_bar:
    # group datacube by years
    for iyear, icube in years_bar:

        # Loop through time steps
        for itime_step in time_steps:

            # extract density cubes for each variable
            vod_df, lst_df, ndvi_df, sm_df = get_density_cubes(icube, spatial, itime_step)

            # keep only the samples common to all four variables
            dfs = get_common_elements_many([vod_df, lst_df, ndvi_df, sm_df])
            vod_df, lst_df, ndvi_df, sm_df = dfs[0], dfs[1], dfs[2], dfs[3]

            variables = {
                'VOD': vod_df,
                'NDVI': ndvi_df,
                'SM': sm_df,
                'LST': lst_df
            }

            # do calculations for H, TC
            for iname, idata in variables.items():

                # normalize data (zero mean, unit variance per column)
                X_norm = StandardScaler().fit_transform(idata)

                # entropy, total correlation (RBIG)
                tc, h, t_ = run_rbig_models(X_norm, measure="t", random_state=123)

                # record H and TC for this (year, time step, variable)
                result_rows.append({
                    'year': iyear,
                    # FIX: groupby('time.year') yields integer years, but
                    # drought_years is keyed by strings -- convert, as
                    # Experiment II already does.
                    'drought': drought_years[str(iyear)],
                    'samples': X_norm.shape[0],
                    'dimensions': X_norm.shape[1],
                    'temporal': itime_step,
                    'variable': iname,
                    'tc': tc,
                    'h': h,
                    'time': t_,
                })

                years_bar.set_postfix(dict(
                    Dims=f"{itime_step}",
                    Variable=f"{iname}",
                ))
            # NOTE: debug short-circuit -- only the first time step and the
            # first year are processed; remove both breaks for the full sweep.
            break
        break

results_df_single = pd.DataFrame(result_rows)
  0%|          | 0/6 [00:03<?, ?it/s, Dims=1, Variable=LST] 

Experiment II - Comparing Measurements

In this experiment, we look at pairwise combinations of variables. The following dependence measures are calculated and compared:

  • Pearson Correlation
  • Spearman Correlation
  • Mutual Information
  • HSIC (listed for comparison; not computed in this run)
# Experiment II: pairwise dependence measures (Pearson, Spearman, RBIG
# mutual information) per variable pair, per year, per temporal window.
time_steps = range(1, 12)   # temporal window sizes for the density cubes
spatial = 1                 # spatial window size (1 pixel)

# Collect one result row per (year, time step, variable pair). Rows are
# gathered in a list and turned into a DataFrame at the end: DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and is O(n^2) in a loop.
result_rows = []


with tqdm(drought_cube.groupby('time.year')) as years_bar:
    # group datacube by years
    for iyear, icube in years_bar:

        # Loop through time steps
        for itime_step in time_steps:

            # extract density cubes for each variable
            vod_df, lst_df, ndvi_df, sm_df = get_density_cubes(icube, spatial, itime_step)

            # keep only the samples common to all four variables
            dfs = get_common_elements_many([vod_df, lst_df, ndvi_df, sm_df])
            vod_df, lst_df, ndvi_df, sm_df = dfs[0], dfs[1], dfs[2], dfs[3]

            variables = {
                'VOD': vod_df,
                'NDVI': ndvi_df,
                'SM': sm_df,
                'LST': lst_df
            }

            # pairwise measures for each variable combination
            for (ivar1, ivar2) in common_vars:

                # Pearson correlation coefficient (linear dependence)
                pears = stats.pearsonr(
                    variables[ivar1].values.ravel(), 
                    variables[ivar2].values.ravel()
                )[0]

                # Spearman rank correlation (monotonic dependence)
                spears = stats.spearmanr(
                    variables[ivar1].values.ravel(), 
                    variables[ivar2].values.ravel()
                )[0]

                # normalize data (zero mean, unit variance per column)
                X_norm = StandardScaler().fit_transform(variables[ivar1])
                Y_norm = StandardScaler().fit_transform(variables[ivar2])

                # mutual information (RBIG)
                mi, t_ = run_rbig_models(X_norm, Y_norm, measure="mi", random_state=123)

                # record results for this (year, time step, variable pair)
                result_rows.append({
                    'year': iyear,
                    'drought': drought_years[str(iyear)],
                    'samples': X_norm.shape[0],
                    'dimensions': X_norm.shape[1],
                    'temporal': itime_step,
                    'variable1': ivar1,
                    'variable2': ivar2,
                    'pearson': pears,
                    # FIX: Spearman was computed and shown in the progress bar
                    # but never stored -- record it alongside Pearson and MI.
                    'spearman': spears,
                    'mi': mi,
                    'time': t_,
                })

                years_bar.set_postfix(dict(
                    Year=f"{iyear}", 
                    Dims=f"{itime_step}",
                    Variables=f"{ivar1}-{ivar2}",
                    MI=f"{mi:.3f}",
                    Pear=f"{pears:.3f}",
                    Spear=f"{spears:.3f}",
                ))
            # NOTE: debug short-circuit -- only the first time step and the
            # first year are processed; remove both breaks for the full sweep.
            break
        break

results_df_single = pd.DataFrame(result_rows)
  0%|          | 0/6 [00:18<?, ?it/s, Year=2010, Dims=1, Variables=LST-SM, MI=0.183, Pear=-0.232, Spear=-0.214]  
results_df_single.head()
dimensions mi samples temporal time variable1 variable2
0 1.0 0.014735 25779.0 1.0 2.484674 VOD NDVI
1 1.0 0.024350 25779.0 1.0 2.564911 VOD LST
2 1.0 0.157174 25779.0 1.0 2.743430 VOD SM
3 1.0 0.019120 25779.0 1.0 2.564871 NDVI LST
4 1.0 0.059311 25779.0 1.0 2.583604 NDVI SM
stats.spearmanr(variables[ivar1].values.ravel(), variables[ivar2].values.ravel())
SpearmanrResult(correlation=0.08307110054436087, pvalue=1.0326962564352802e-40)