Spatial-Temporal Experiment¶
In this notebook, I walk through how we can estimate probabilities and derived information measures on the density cubes that we construct, comparing two RBIG implementations along the way.
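Throughout, a "density cube" is the feature matrix produced by `get_density_cubes`: each sample stacks the values of a pixel's spatio-temporal neighbourhood into feature columns. A minimal, illustrative sketch for the spatial=1 case (names here are mine, not the project's API):
import numpy as np

def toy_density_cube(series, temporal):
    # stack `temporal` consecutive time steps per sample (spatial=1 case)
    T = len(series)
    return np.stack([series[i:i + temporal] for i in range(T - temporal + 1)])

x = np.arange(10.0)
print(toy_density_cube(x, temporal=6).shape)  # (5, 6): 5 samples, 6 time-lag features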
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))
import pathlib
# standard python packages
import xarray as xr
import pandas as pd
import numpy as np
#
# Experiment Functions
from src.data.esdc import get_dataset
from src.features import Metrics
from sklearn.preprocessing import StandardScaler
# from src.models.density import get_rbig_model
from src.features.temporal import select_period, get_smoke_test_time, TimePeriod
from src.features.spatial import select_region, get_europe, get_spain
from src.models.train_models import get_similarity_scores
from src.experiments.utils import dict_product, run_parallel_step
from src.features.density import get_density_cubes
from src.features.preprocessing import standardizer_data, get_reference_cube, get_common_indices
from src.features.utils import subset_indices
# # esdc tools
# from src.esdc.subset import select_pixel
# from src.esdc.shape import ShapeFileExtract, rasterize
# from esdc.transform import DensityCubes
from typing import List, Dict
import xarray as xr
from tqdm import tqdm
import time
import cartopy
import cartopy.crs as ccrs
# NUMPY SETTINGS
import numpy as onp
onp.set_printoptions(precision=3, suppress=True)
# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")
# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
    level=logging.INFO,
    stream=sys.stdout,
    format='%(asctime)s: %(levelname)s: %(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
%load_ext autoreload
%autoreload 2
Experiment Steps¶
Global Variables¶
Parameters¶
parameters = {}
parameters['variable'] = [
    'precipitation',
    'gross_primary_productivity',
    'land_surface_temperature',
    'root_moisture',
    'soil_moisture',
]
parameters['region'] = [get_europe()]  # [None]
parameters['period'] = [TimePeriod(name="2010", start="Jan-2010", end="Dec-2010")]
parameters['spatial'] = [1]   # [1, 7, 3, 1]
parameters['temporal'] = [6]  # [1, 1, 6, 46]
params = list(dict_product(parameters))
print(len(params))
smoke_test = True
params = params[0]
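For reference, `dict_product` (imported from src.experiments.utils) is assumed to expand the parameter grid into one dict per combination; a sketch of the assumed behaviour:
import itertools

def dict_product_sketch(d):
    # Cartesian product over the value lists -> one parameter dict per combo
    for combo in itertools.product(*d.values()):
        yield dict(zip(d.keys(), combo))

# e.g. list(dict_product_sketch({'a': [1, 2], 'b': ['x']}))
# -> [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'x'}]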
from src.features.density import get_information_cubes
def get_information(data, index, prob='logprob'):
    # create a dataframe with both probability and log-probability columns
    if prob == 'logprob':
        df = pd.DataFrame(
            data=data,
            index=index,
            columns=['log_probability']
        )
        df['probability'] = np.exp(df['log_probability'])
    elif prob == 'prob':
        df = pd.DataFrame(
            data=data,
            index=index,
            columns=['probability']
        )
        df['log_probability'] = np.log(df['probability'])
    else:
        raise ValueError(f"Unrecognized measure: {prob}")
    df = df.reset_index().set_index(['time', 'lat', 'lon'])
    # (optional z-score outlier filtering was tried here and disabled)
    # create xarray cubes
    probs_cubes = xr.Dataset.from_dataframe(df)
    # resample monthly
    probs_cubes = probs_cubes.resample(time='1MS').mean()
    # shannon information (self-information, nats) of the monthly-mean probability
    probs_cubes['shannon_info'] = -np.log(probs_cubes['probability'])
    return probs_cubes
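A quick toy check of `get_information` on synthetic probabilities over a tiny time/lat/lon index:
toy_index = pd.MultiIndex.from_product(
    [pd.date_range("2010-01-01", periods=4, freq="D"), [40.0], [0.0]],
    names=["time", "lat", "lon"],
)
toy_cubes = get_information(np.array([0.1, 0.2, 0.3, 0.4]), toy_index, prob="prob")
print(toy_cubes.shannon_info.values)  # -log(p) of the monthly-mean probability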
Experimental Step¶
def step(params: Dict, smoke_test: bool = False):
    # ======================
    # Experiment - Data
    # ======================
    # get datacube
    logging.info(f"Loading '{params['variable']}' variable")
    datacube = get_dataset([params['variable']])
    datacube = datacube.resample(time='1MS').mean()
    # subset datacube (spatially)
    if params['region'] is not None:
        logging.info(f"Selecting region '{params['region'].name}'")
        datacube = select_region(
            xr_data=datacube, bbox=params['region']
        )
    # subset datacube (temporally)
    logging.info(f"Selecting temporal period: '{params['period'].name}'")
    datacube = select_period(xr_data=datacube, period=params['period'])
    # get density cubes
    logging.info(f"Getting density cubes: S: {params['spatial']}, T: {params['temporal']}")
    density_cube_df = get_density_cubes(
        data=datacube[params['variable']],
        spatial=params['spatial'],
        temporal=params['temporal']
    )
    logging.info(f"Total data: {density_cube_df.shape}")
    # standardize the data
    logging.info("Standardizing data...")
    x_transformer = StandardScaler().fit(density_cube_df.values)
    density_cube_df_norm = pd.DataFrame(
        data=x_transformer.transform(density_cube_df.values),
        columns=density_cube_df.columns.values,
        index=density_cube_df.index
    )
    return density_cube_df_norm, x_transformer, density_cube_df
density_cube_df_norm, X_transformer, density_cube_df = step(params, smoke_test=True)
# sanity check: the density-cube index carries the spatial coordinates
{'lon', 'lat'}.issubset(density_cube_df.index.names)
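A quick sanity check that the standardization inside `step` behaved as expected:
# each density-cube feature should now be ~0 mean, ~1 std
print(density_cube_df_norm.mean().round(3))
print(density_cube_df_norm.std().round(3))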
subsamples = 10_000
idx = subset_indices(density_cube_df_norm.values, subsamples)
subset = density_cube_df_norm.iloc[idx, :]
logging.info(f"subset: {subset.shape}")
from rbig.model import RBIG
from rbig import RBIG as RBIGOLD
%autoreload 2
# ==================================
# Model - Gaussianization (Old way)
# ==================================
logging.info(f"Gaussianizing data...")
t0 = time.time()
n_layers = 10_000
rotation_type = 'PCA'
random_state = 0
zero_tolerance = 60
pdf_extension = 20
pdf_resolution = None
tolerance = None
# Initialize RBIG class
old_rbig_model = RBIGOLD(
    n_layers=n_layers,
    rotation_type=rotation_type,
    random_state=random_state,
    zero_tolerance=zero_tolerance,
    tolerance=tolerance,
    pdf_extension=pdf_extension,
    verbose=0,
)
# fit model to the data
old_rbig_model.fit(subset)
t1 = time.time() - t0
logging.info(f"Time Taken: {t1:.2f} secs")
# # forward transformation
# logging.info(f"Forward Transform...")
# t0 = time.time()
# X1_trans = old_rbig_model.transform(subset.values)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
# # inverse transformation
# logging.info(f"Inverse Transform...")
# t0 = time.time()
# X1_approx = old_rbig_model.inverse_transform(X1_trans)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
# # jacobian
# logging.info(f"Calculating the Jacobian...")
# t0 = time.time()
# X1_ldj, X1j_trans = old_rbig_model.jacobian(subset.values, return_X_transform=True)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
old_rbig_model.n_layers, old_rbig_model.total_correlation()
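As a rough cross-check on the total-correlation estimate: under a Gaussian approximation of the data, total correlation has the closed form −½ log det R, where R is the correlation matrix (in nats; the RBIG estimate may be in other units, so compare up to a constant factor):
# Gaussian-approximation cross-check of total correlation (nats)
corr = np.corrcoef(subset.values, rowvar=False)
tc_gauss = -0.5 * np.linalg.slogdet(corr)[1]
print(f"Gaussian-approx TC: {tc_gauss:.3f} nats")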
from scipy import stats
from rbig.information.kld import neg_entropy_normal

X_g = old_rbig_model.transform(subset.values.copy())
X_ng = neg_entropy_normal(X_g)                   # marginal non-Gaussianity per feature
X_ll = -stats.norm.logpdf(X_g).sum(-1).mean()    # mean NLL per sample
X_mll = -stats.norm.logpdf(X_g).mean(axis=0)     # mean NLL per feature
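If the Gaussianization were exact, each marginal of X_g would be standard normal, whose expected per-dimension NLL is ½(log 2π + 1) ≈ 1.419 nats; features far from this value are poorly Gaussianized:
# reference value: expected NLL per dimension of a standard normal, in nats
expected_nll = 0.5 * (np.log(2 * np.pi) + 1.0)
print(f"expected per-dim NLL: {expected_nll:.3f}")
print(f"observed per-dim NLL: {X_mll}")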
df = pd.DataFrame(data=X_g, index=subset.index, columns=subset.columns)
g = sns.PairGrid(df, diag_sharey=False)
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot, colors="C0")
g.map_diag(sns.kdeplot, lw=2)
sns.pairplot(df, diag_kws={'bins': 50})
# Probability
logging.info(f"Predicitng the probability...")
t0 = time.time()
X_prob = old_rbig_model.predict_proba(subset.values)
t1 = time.time() - t0
logging.info(f"Time Taken: {t1:.2f} secs")
probs_cubes = get_information(X_prob, subset.index, 'prob')
plot_all(probs_cubes)
# ==================================
# Model - Gaussianization (New way)
# ==================================
logger.setLevel(logging.INFO)
n_layers = 10_000
rotation_type = 'PCA'
random_state = 0
zero_tolerance = 60
pdf_extension = 20
pdf_resolution = None
tolerance = None
method = 'custom'
n_quantiles = 50
# Initialize RBIG class
rbig_model = RBIG(
    n_layers=n_layers,
    rotation_type=rotation_type,
    random_state=random_state,
    zero_tolerance=zero_tolerance,
    tolerance=tolerance,
    pdf_extension=pdf_extension,
    verbose=1,
    method=method,
    n_quantiles=n_quantiles
)
# fit model to the data
logging.info(f"Gaussianizing data...")
t0 = time.time()
rbig_model.fit(subset.values.copy())
t1 = time.time() - t0
logging.info(f"Time Taken: {t1:.2f} secs")
# # forward transformation
# logging.info(f"Forward Transform...")
# t0 = time.time()
# X2_trans = rbig_model.transform(subset.values)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
# # inverse transformation
# logging.info(f"Inverse Transform...")
# t0 = time.time()
# X2_approx = rbig_model.inverse_transform(X1_trans)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
# # jacobian
# logging.info(f"Calculating the Jacobian...")
# t0 = time.time()
# X2_ldj, X2j_trans = rbig_model.jacobian(subset.values)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
# # jacobian
# logging.info(f"Calculating the Log Determinant Jacobian...")
# t0 = time.time()
# X2_j, X2j_trans = rbig_model.log_det_jacobian(subset.values)
# t1 = time.time() - t0
# logging.info(f"Time Taken: {t1:.2f} secs")
rbig_model.n_layers, rbig_model.total_correlation()
from rbig.information.kld import neg_entropy_normal
Xn_g = rbig_model.transform(subset.values.copy())
Xn_ll = -stats.norm.logpdf(Xn_g).sum(-1).mean()
Xn_ng = neg_entropy_normal(Xn_g)
Xn_mll = -stats.norm.logpdf(Xn_g).mean(axis=0)
Xn_mll
fig, ax = plt.subplots()
width = 0.2
labels = np.arange(len(X_ng))  # one bar group per feature
ax.bar(labels, X_ng, width, label=f'RBIG1.0, {X_ng.mean():.2e}')
ax.bar(labels + width, Xn_ng, width, label=f'RBIG1.1, {Xn_ng.mean():.2e}')
ax.legend()
ax.set(
    title="Non-Gaussianity (Marginal)",
    xlabel='Features',
)
plt.show()
fig, ax = plt.subplots()
width = 0.2
ax.bar(labels, X_mll, width, label=f'RBIG1.0, {X_ll:.2f}')
ax.bar(labels + width, Xn_mll, width, label=f'RBIG1.1, {Xn_ll:.2f}')
ax.legend()
ax.set(
    title="Negative Log-Likelihood (Marginal)",
    xlabel='Features',
)
plt.show()
from scipy import stats
Xn_g = rbig_model.transform(subset.values.copy())
print(stats.norm.logpdf(Xn_g).sum(-1).mean())
print(stats.norm.logpdf(Xn_g).mean(axis=0))
sns.pairplot(pd.DataFrame(data=Xn_g, index=subset.index, columns=subset.columns), diag_kws={'bins': 50})
df = pd.DataFrame(data=Xn_g, index=subset.index, columns=subset.columns)
g = sns.PairGrid(df, diag_sharey=False)
g.map_upper(sns.scatterplot)
g.map_lower(sns.kdeplot, colors="C0")
g.map_diag(sns.kdeplot, lw=2)
# Probability
logger.setLevel(logging.DEBUG)
logging.info(f"Predicitng the probability...")
t0 = time.time()
Xn_prob = rbig_model.predict_proba(subset.values.copy())
t1 = time.time() - t0
logging.info(f"Time Taken: {t1:.2f} secs")
probsn_cubes = get_information(Xn_prob, subset.index, 'prob')
Xn_g = rbig_model.transform(subset.values.copy())
sns.pairplot(pd.DataFrame(data=Xn_g, index=subset.index, columns=subset.columns), )
logger.setLevel(logging.INFO)
plot_all(probsn_cubes)
# np.percentile(Xn_prob, [10, 50, 95])
jacs = np.vstack(rbig_model.jacs_)
jacs_full = np.vstack(rbig_model.jacs_sum_)
logger.setLevel(logging.INFO)
plt.figure()
plt.hist(jacs.ravel(), bins=100);
plt.figure()
plt.hist(jacs_full.ravel(), bins=100);
plt.figure()
plt.plot(jacs)
plt.legend(['Min', '5%', '10%', '50%', '90%', '95%', 'Max'])
plt.xlabel('Layer')
plt.figure()
plt.plot(jacs_full)
plt.legend(['Min', '5%', '10%', '50%', '90%', '95%', 'Max'])
plt.xlabel('Layer')
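For context, these Jacobians are what tie the Gaussianized variable back to a density via the change of variables log p_x(x) = log p_z(f(x)) + log|det J_f(x)|. A self-contained toy illustration with an affine f (this does not touch RBIG internals):
# toy change-of-variables check with f(x) = (x - 3) / 2, so log|det J| = -log 2
rng = np.random.RandomState(0)
x = rng.normal(loc=3.0, scale=2.0, size=10_000)
z = (x - 3.0) / 2.0
log_px = stats.norm.logpdf(z) - np.log(2.0)
print(np.allclose(log_px, stats.norm.logpdf(x, loc=3.0, scale=2.0)))  # True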
X_prob_df = pd.DataFrame(
    data=X_prob,
    index=subset.index,
    columns=['probability']
)
# NOTE: X1_trans/X2_trans and X1_approx/X2_approx come from the
# (commented-out) forward/inverse transform cells above
from sklearn.metrics import mean_absolute_error

plt.figure()
for it1, it2 in zip(X1_trans.T, X2_trans.T):
    plt.scatter(it1, it2, label=f"{mean_absolute_error(it1, it2):.2e}")
plt.legend()
plt.xlabel("Old")
plt.ylabel("New")
plt.figure()
for it1, it2 in zip(X1_approx.T, X2_approx.T):
    plt.scatter(it1, it2, label=f"{mean_absolute_error(it1, it2):.2e}")
plt.legend()
plt.xlabel("Old")
plt.ylabel("New")
rbig_model.total_correlation(), old_rbig_model.total_correlation()
# ==================================
# Model - Gaussianization (New way, pdf_extension=50)
# ==================================
logging.info(f"Gaussianizing data...")
t0 = time.time()
n_layers = 10_000
rotation_type = 'PCA'
random_state = 0
zero_tolerance = 60
pdf_extension = 50
pdf_resolution = None
tolerance = None
# Initialize RBIG class
rbig_model = RBIG(
    n_layers=n_layers,
    rotation_type=rotation_type,
    random_state=random_state,
    pdf_extension=pdf_extension,
    zero_tolerance=zero_tolerance,
    tolerance=tolerance
)
# fit model to the data
rbig_model.fit(subset)
t1 = time.time() - t0
logging.info(f"Time Taken: {t1:.2f} secs")
rbig_model.n_layers
rbig_model.total_correlation()
# get the probability estimates for the full density cube
logging.info("Getting probability estimates...")
t0 = time.time()
X_prob = rbig_model.predict_proba(
    density_cube_df_norm.values, n_trials=1, chunksize=100_000, domain='input'
)
t1 = time.time() - t0
logging.info(f"Time Taken: {t1:.2f} secs")
X_prob = pd.DataFrame(
    data=X_prob,
    index=density_cube_df_norm.index,
    columns=['probability']
)
probs_cubes = get_information(X_prob['probability'].values, X_prob.index, 'prob')
fig, ax = plt.subplots()
probs_cubes.probability.mean(dim='time').plot(
ax=ax,
vmin=0, vmax=None, robust=True, cmap='Reds',
cbar_kwargs={'label': "Probability"}
)
fig, ax = plt.subplots()
probs_cubes.shannon_info.mean(dim='time').plot(
ax=ax,
vmin=0, robust=True, cmap='Reds',
cbar_kwargs={'label': "Shannon Information"}
)
plt.hist(density_cube_df['var_x0'].squeeze(), bins=50, density=True);
plt.hist(density_cube_df['prob'].squeeze(), bins=50, range=(0, 1.0));