Density Estimation¶
Methods:
- Histogram
- Mixture Model
- Kernel Density Estimation
- Nearest Neighbour Kernel Density Estimation
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))
# NUMPY SETTINGS
import numpy as np
np.set_printoptions(precision=3, suppress=True)
# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")
# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
level=logging.INFO,
stream=sys.stdout,
    format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
%load_ext autoreload
%autoreload 2
Data¶
def make_bimodal_data(n_samples: int, f: float=0.3, seed: int=1) -> np.ndarray:
    """Sample a 1D bimodal dataset: a fraction f of points near 0, the rest shifted to 5."""
    rand = np.random.RandomState(seed)
    x = rand.randn(n_samples)
    # shift the last (1 - f) fraction of the points to create a second mode around 5
    x[int(f * n_samples):] += 5
    return x
Histogram Estimation¶
Histograms are a form of local density estimation: nearby points are used to estimate the density. The space is divided into bins, and the density at the center of each bin is approximated by the fraction of training points that fall into that bin,

$$ p(x) \approx \frac{k}{NV} $$

where:
- k is the number of points in a region
- V is the volume (width) of the region
- N is the total number of datapoints

So if x is close to many points, the estimated density at x is high. The recipe is simply: count the datapoints in each bin and divide by N \cdot V (checked numerically after the plot below).
n_samples = 100
x = make_bimodal_data(n_samples)
fig, ax = plt.subplots()
hist = ax.hist(x, bins=10, density=True)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
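The density=True histogram above is exactly the k / (NV) estimate from the formula; a quick numeric check using the same 10 bins:
# density in each bin computed by hand as k / (N * V)
counts, edges = np.histogram(x, bins=10)
widths = np.diff(edges)                          # V: the width of each bin
density_manual = counts / (x.shape[0] * widths)  # k / (N * V)
# matches numpy's density=True normalization
density_np, _ = np.histogram(x, bins=10, density=True)
print(np.allclose(density_manual, density_np))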
Bin Estimation¶
- scott
- knuth
- freedman
- bayesian blocks
    - Studies in Astronomical Time Series Analysis. VI. Bayesian Block Representations - Scargle et al. (2012)
- sqrt
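Most of these rules can be queried directly through numpy for the number of bins they produce ('knuth' and Bayesian blocks come from astropy instead); a quick look at how much they disagree on this sample:
# number of bins produced by the rules that numpy supports directly
# ('fd' is the Freedman-Diaconis rule)
for rule in ['sqrt', 'scott', 'fd', 'auto']:
    edges = np.histogram_bin_edges(x, bins=rule)
    print(f"{rule:>6s}: {len(edges) - 1} bins")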
Examples:
from astropy import stats as astats
methods = [
'scott',
'auto',
'sqrt'
]
# evaluation grid used throughout for plotting the density estimates
x_eval = np.linspace(-4, 10, 1_000)
fig, ax = plt.subplots()
for imethod in methods:
# calculate bin edges
# bins = np.histogram_bin_edges()
ax.hist(x, bins=imethod, alpha=0.3, density=True, label=f"{imethod}")
bins_bayes = astats.bayesian_blocks(x, fitness='events', p0=0.01)
hist = ax.hist(x, bins=bins_bayes, density=True, alpha=0.3, label="Bayes Blocks")
ax.legend()
ax.set(
xlabel=r'$x$',
ylabel=r'$p(x)$'
)
plt.show()
from scipy import stats
for imethod in methods:
# calculate bin edges
# bins = np.histogram_bin_edges()
hist, _ = np.histogram(x, bins=imethod, density=True)
    print(stats.entropy(hist), -(hist[hist > 0] * np.log(hist[hist > 0])).sum())
bins_bayes = astats.bayesian_blocks(x, fitness='events', p0=0.01)
hist, _ = np.histogram(x, bins=bins_bayes, density=True)
print(stats.entropy(hist), -(hist[hist > 0] * np.log(hist[hist > 0])).sum())
- Sensitive to the # of bins
- The shape of the distribution depends on the starting position of the bins
    - for multivariate data, it can also depend on the orientation of the bins
- Discontinuities are not due to the underlying density; they are an artifact of the bin locations
    - the discontinuities make it harder to grasp the structure of the data
- Curse of dimensionality: the # of bins grows exponentially with the # of dimensions (see the quick illustration below)
    - a large number of samples is needed in higher dimensions
- Only suitable for rapid visualization of results in one or two dimensions
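A quick illustration of the curse of dimensionality: with 10 bins per dimension the number of bins explodes, while the sample size stays fixed.
# with 10 bins per dimension, a d-dimensional histogram needs 10**d bins,
# so the 100 samples above are spread ever more thinly
for d in range(1, 6):
    print(f"d={d}: {10**d:>7,d} bins for {n_samples} samples")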
Widths¶
The generic local estimate has the form

$$ p(x) \approx \frac{k}{NV} $$

where:
- V is the volume surrounding x
- k is the number of samples inside the volume
- N is the number of examples

Fixing V and estimating k from the data leads to kernel density estimation; fixing k and estimating V leads to the nearest-neighbour approach. Both routes are sketched on a single query point below.
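A minimal sketch of both routes at a single query point (the query point x0, window half-width h0 and neighbour count k0 are arbitrary choices for illustration):
# local estimate p(x0) ~ k / (N * V) at a single query point
x0 = 5.0
N = x.shape[0]

# fix the volume: a window of half-width h0 has V = 2 * h0; count the points inside
h0 = 0.5
k_inside = np.sum(np.abs(x - x0) <= h0)
print(f"fixed V: p(x0) ~ {k_inside / (N * 2 * h0):.3f}")

# fix k: the distance to the k0-th nearest neighbour sets the half-width
k0 = 10
r_k = np.sort(np.abs(x - x0))[k0 - 1]
print(f"fixed k: p(x0) ~ {k0 / (N * 2 * r_k):.3f}")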
Mixture Models¶
from sklearn.mixture import GaussianMixture

# fit a 5-component Gaussian mixture and evaluate its log-density on the grid
gmm_clf = GaussianMixture(n_components=5).fit(x.reshape(-1, 1))
x_dens = gmm_clf.score_samples(x_eval.reshape(-1, 1))
# score_samples returns log(p), so exponentiate before computing the entropy
print(stats.entropy(np.exp(x_dens)))
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, np.exp(x_dens), alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
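The number of components above is fixed at 5 by hand; one common way to choose it is to minimize the Bayesian information criterion. A minimal sketch:
# fit mixtures with 1-10 components and keep the one with the lowest BIC
X = x.reshape(-1, 1)
n_range = range(1, 11)
bics = [GaussianMixture(n_components=n, random_state=0).fit(X).bic(X) for n in n_range]
print(f"best n_components by BIC: {n_range[int(np.argmin(bics))]}")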
Kernel Density Estimation¶
In kernel density estimation, we choose a fixed value of the volume V and determine k from the data. It can be shown that the estimator converges to the true probability density as N \rightarrow \infty if V shrinks with N and k grows with N appropriately.
$$ p_\text{KDE}(x) = \frac{1}{N \lambda^d}\sum_{n=1}^N K\left( \frac{x - x^n}{\lambda} \right) $$
def kerneldensity(y, x, bandwidth, kernel):
    """Kernel density estimate of the samples y evaluated at the points x."""
    # calculate the bandwidth from the samples
    h = bandwidth(y)
    # evaluate the kernel at the scaled distances between samples and query points
    k = kernel((x - y[:, None]) / h)
    logging.info(f"K: {k.shape}")
    # normalize by the bandwidth and sum the contributions from all samples
    k /= h
    k = np.sum(k, axis=0)
    logging.info(f"K: {k.shape}")
    return k / y.shape[0]
def gaussian(u):
    """Standard Gaussian kernel."""
    return 1 / np.sqrt(2 * np.pi) * np.exp(- u**2 / 2)

# example call (the silverman bandwidth rule is defined further below)
k = kerneldensity(x, x_eval, silverman, gaussian)
Epanechnikov Kernel¶
The Epanechnikov kernel is known to minimize the asymptotic mean integrated squared error among kernel functions.
def epanechnikov(u):
    """Epanechnikov kernel: (3/4)(1 - u^2) for |u| <= 1, zero otherwise."""
    # return np.where(np.abs(u) <= np.sqrt(5), 3/(4*np.sqrt(5)) * (1 - u**2 / 5.0), 0)
    return (3. / 4.) * np.maximum(1 - u**2, 0.0)
Estimating the Length Scale¶
def silverman(x):
    """Silverman's rule-of-thumb bandwidth for a 1D sample."""
    n_samples = x.shape[0]
    # calculate the interquartile range
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    # lengthscale: 0.9 * min(std, IQR / 1.34) * n^(-1/5)
    h = 0.9 * np.min([x.std(ddof=1), iqr / 1.34]) * n_samples ** -0.2
    return h
from scipy import stats
# # get evaluation grid
x_eval = np.linspace(-4, 10, 1_000)
# logging.info(f"x: {x_eval.shape}")
# # get pdf est for each
# density_grid = np.array([stats.norm(xi).pdf(x_eval) for xi in x])
# logging.info(f"K: {density_grid.shape}")
# density = np.sum(density_grid, axis=0)
# logging.info(f"K: {density.shape}")
density_ = kerneldensity(x, x, silverman, epanechnikov)
density = kerneldensity(x, x_eval, silverman, epanechnikov)
print(
stats.entropy(density_),
stats.entropy(density)
)
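As a rough sanity check, the same density can also be estimated with scipy's Gaussian KDE; the kernel differs from the Epanechnikov one above, so the curves will not match exactly. A minimal sketch:
# scipy's Gaussian KDE with Silverman's bandwidth rule
kde_scipy = stats.gaussian_kde(x, bw_method='silverman')
density_scipy = kde_scipy(x_eval)
print(stats.entropy(density_scipy))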
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, density, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
ax.set(
xlabel=r'$x$',
ylabel=r'$p(x)$'
)
plt.show()
Advantages:
- Fast at Training Time
Disadvantages:
- Slow at Test Time
- High memory footprint
- Sensitive to kernel bandwidth
K-Nearest Neighbours¶
This method fixes the number of nearby points k and lets the volume adapt to the data:

$$ p_\text{KNN}(x) = \frac{k}{N \, V_d \, R_k^d(x)} $$

where:
- k is the number of nearest neighbours (fixed)
- R_k(x) is the distance from x to its k-th nearest neighbour
- V_d \cdot R_k^d(x) is the volume of a d-dimensional ball with radius R_k(x)
- V_d = \frac{\pi^{d/2}}{\Gamma(d/2+1)} is the volume of the unit d-ball

We choose a fixed value of k and determine the corresponding volume V from the data. Again, it can be shown that the estimator converges to the true probability density as N \rightarrow \infty if V shrinks with N and k grows with N appropriately.
k_neighbours = 5
# pairwise absolute distances between all training points
diff = np.absolute(x - x[:, np.newaxis])
logging.info(f"diff: {diff.shape}")
# sort each column so that row k gives the distance to the k-th nearest neighbour
diffsort = np.sort(diff, axis=0)
h = diffsort[k_neighbours, :]
plt.hist(h)
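With these k-th neighbour distances in hand, the simple estimator p(x) = k / (N V_d R_k^d(x)) can be evaluated at the training points; in one dimension the "ball" of radius R_k is an interval of length 2 R_k. A minimal sketch:
# simple kNN density at each training point: k / (N * 2 * R_k(x))
density_knn_simple = k_neighbours / (x.shape[0] * 2 * h)

fig, ax = plt.subplots()
ax.scatter(x, density_knn_simple, s=10)
ax.set(xlabel=r'$x$', ylabel=r'$p(x)$')
plt.show()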
from sklearn.neighbors import NearestNeighbors
from scipy import special
def n_volume(r, n):
"""compute the n-volume of a sphere of radius r in n dimensions"""
return np.pi ** (0.5 * n) / special.gamma(0.5 * n + 1) * (r ** n)
k = 10
# initialize nearest neighbour estimator
knn_clf = NearestNeighbors(n_neighbors=k)
#fit estimator
knn_clf.fit(x.reshape(-1,1))
n_samples = x.shape[0]
# query to get neighbours
dist, ind = knn_clf.kneighbors(x_eval.reshape(-1,1))
# n_samples = x_eval.shape[0]
# calculate the density: a smoothed variant that uses the distances to all k
# neighbours rather than only the k-th one (the simple form is commented below)
k = float(k)
n_dim = 1
# density_knn = float(k) / n_volume(dist[:, -1], 1)
density_knn = (k * (k + 1) * 0.5 / n_volume(1, n_dim) / (dist ** n_dim).sum(1))
density_knn /= n_samples
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, density_knn, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
from sklearn.neighbors import KernelDensity
sk_clf = KernelDensity().fit(x.reshape(-1,1))
log_p = sk_clf.score_samples(x_eval.reshape(-1, 1)) # returns log(p) of each grid point
p = np.exp(log_p) # estimate p of each grid point
entropy = -np.sum(p * log_p) # evaluate the entropy
print(entropy)
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, p, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
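The fit above uses the default bandwidth of 1.0; since the estimate is sensitive to the bandwidth, a common option is to select it by cross-validated log-likelihood with GridSearchCV (a sketch; the bandwidth grid is an arbitrary choice):
from sklearn.model_selection import GridSearchCV

# cross-validate the bandwidth on the log-likelihood returned by KernelDensity.score
params = {"bandwidth": np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)
grid.fit(x.reshape(-1, 1))
print(f"best bandwidth: {grid.best_params_['bandwidth']:.3f}")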
Advantages:
- Yields smoother decision regions
- Provides probabilistic information
- The estimate is driven by the local ratio of examples, k / N
Disadvantages:
- Diverges and does not integrate to one, so it is not a proper density estimator
- Can be expensive at query time (a nearest-neighbour search for every evaluation point)