
Density Estimation

Methods:

  • Histogram
  • Mixture Model
  • Kernel Density Estimation
  • Nearest Neighbour Kernel Density Estimation
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))

# NUMPY SETTINGS
import numpy as np
np.set_printoptions(precision=3, suppress=True)

# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")

# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)

# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
    level=logging.INFO, 
    stream=sys.stdout,
    format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)

%load_ext autoreload
%autoreload 2

Data

def make_bimodal_data(n_samples: int, f: float=0.3, seed: int=1) -> np.ndarray:
    """Draw a bimodal sample: a fraction f of points around 0, the rest shifted to 5."""
    rand = np.random.RandomState(seed)
    x = rand.randn(n_samples)
    x[int(f * n_samples):] += 5
    return x

Histogram Estimation

This is a kind of local density estimation: nearby points are used to estimate the density at a given location. The space is divided into bins, and the density at the centre of each bin is approximated by the fraction of training points that fall into that bin, divided by the bin width.

p_h(x) = \frac{k}{N V} = \frac{n_i}{N \Delta_i}

where:

  • k is the number of points falling in a region of volume V around x
  • V is the volume of that region
  • N is the total number of datapoints
  • n_i is the number of points in bin i and \Delta_i is the width of bin i

So if x lies in a region containing many training points, the estimated density at x is high.

First, draw the bimodal sample and plot a normalized histogram of it:
n_samples = 100
x = make_bimodal_data(n_samples)

fig, ax = plt.subplots()
hist = ax.hist(x, bins=10, density=True)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
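
As a quick check on the formula above, here is a minimal sketch that rebuilds the normalized histogram from raw counts (it reuses the sample x from the cell above):

# density at the centre of bin i is n_i / (N * Delta_i)
counts, edges = np.histogram(x, bins=10)
widths = np.diff(edges)
p_hat = counts / (counts.sum() * widths)

# this should match numpy's own normalized histogram
print(np.allclose(p_hat, np.histogram(x, bins=10, density=True)[0]))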

Bin Estimation

Examples:

  • Astropy Bin Estimation - Demo
  • AstroML 1D Density Estimators - Demo
from astropy import stats as astats

methods = [
    'scott',
    'auto',
    'sqrt'
]
x_plot = np.linspace(-4, 10, 1_000)

fig, ax = plt.subplots()
for imethod in methods:
    # matplotlib delegates string bin rules to np.histogram_bin_edges
    ax.hist(x, bins=imethod, alpha=0.3, density=True, label=f"{imethod}")

bins_bayes = astats.bayesian_blocks(x, fitness='events', p0=0.01)
hist = ax.hist(x, bins=bins_bayes, density=True, alpha=0.3, label="Bayes Blocks")
ax.legend()
ax.set(
    xlabel=r'$x$',
    ylabel=r'$p(x)$'
)
plt.show()
from scipy import stats

hist, _ = np.histogram(x, bins=bins_bayes, density=True)

for imethod in methods:
    hist, _ = np.histogram(x, bins=imethod, density=True)

    print(stats.entropy(hist), - (hist * np.log(np.abs(hist))).sum())

bins_bayes = astats.bayesian_blocks(x, fitness='events', p0=0.01)
hist, _ = np.histogram(x, bins=bins_bayes, density=True)
print(stats.entropy(hist), - (hist * np.log(np.abs(hist))).sum())
1.3623759191968534 1.0555752005368166
1.7277094340769326 1.6006573458595894
1.9438450128427205 1.9934348495405312
0.8238249779913889 0.7227628905498504
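
To make the sensitivity to the binning rule concrete, a small sketch that prints how many bins each rule selects (np.histogram_bin_edges implements the same string rules used by ax.hist above; it reuses methods and bins_bayes from the cells above):

# number of bins chosen by each rule for this sample
for imethod in methods:
    edges = np.histogram_bin_edges(x, bins=imethod)
    print(imethod, len(edges) - 1)

print("bayesian blocks", len(bins_bayes) - 1)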

Cons:

  • Sensitive to the number of bins
  • The shape of the estimated distribution depends on the starting position of the bins
    • for multivariate data, it can also depend on the orientation of the bins
  • Discontinuities are not due to the underlying density; they are an artifact of the bin locations
    • the discontinuities make it harder to grasp the structure of the data
  • Curse of dimensionality: the number of bins grows exponentially with the number of dimensions
    • a very large number of samples is needed in higher dimensions
  • Only suitable for rapid visualization of results in one or two dimensions

Widths

p(x) \approx \frac{k}{NV}

where:

  • V is the volume of a small region surrounding x
  • k is the number of samples inside that volume
  • N is the total number of examples

Fixing V and determining k from the data leads to kernel density estimation; fixing k and determining V from the data leads to the k-nearest-neighbour estimator. A numerical sketch of this local estimate follows below.
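
A minimal sketch of this local estimate: count the samples inside a small window around a query point. The point x0 and half-width h below are ad-hoc choices for illustration.

# local estimate p(x0) ~ k / (N V): count samples within a window of half-width h
x0, h = 5.0, 0.5
k_local = np.sum(np.abs(x - x0) < h)          # points inside the window
p_local = k_local / (x.shape[0] * 2 * h)      # the "volume" of the window in 1D is 2h
print(p_local)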

Mixture Models
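
A mixture model takes a parametric route: the density is modelled as a weighted sum of simple components. For a one-dimensional Gaussian mixture (the model fit below with scikit-learn's GaussianMixture),

$$ p(x) = \sum_{m=1}^{M} \pi_m \, \mathcal{N}(x \mid \mu_m, \sigma_m^2), \qquad \sum_{m=1}^{M} \pi_m = 1 $$

where the weights \pi_m, means \mu_m and variances \sigma_m^2 are estimated by expectation-maximization.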

from sklearn.mixture import GaussianMixture

# fit a 5-component Gaussian mixture to the samples
gmm_clf = GaussianMixture(n_components=5).fit(x.reshape(-1, 1))

# score_samples returns the log-density on the evaluation grid
x_dens = gmm_clf.score_samples(x_eval.reshape(-1, 1))
print(stats.entropy(np.exp(x_dens)))
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, np.exp(x_dens), alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
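
The number of components is a free parameter. One simple way to choose it is to compare an information criterion across candidate counts; a sketch, reusing x from above (lower BIC is better):

# compare BIC across component counts
X = x.reshape(-1, 1)
for n in range(1, 7):
    gmm = GaussianMixture(n_components=n, random_state=0).fit(X)
    print(n, gmm.bic(X))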

Kernel Density Estimation

In kernel density estimation, we choose a fixed value of the volume V and determine k from the data. It can be shown that the estimator converges to the true probability density as N \rightarrow \infty if V shrinks with N and k grows with N appropriately.

k = \sum_{n=1}^N K\left( \frac{x- x^n}{\lambda} \right)

$$ p_\text{KDE}(x) = \frac{1}{N \lambda^d}\sum_{n=1}^N K\left( \frac{x - x^n}{\lambda} \right) $$

def kerneldensity(y, x, bandwidth, kernel):
    """Kernel density estimate of the samples y evaluated at the points x."""
    # bandwidth (length scale) estimated from the samples
    h = bandwidth(y)

    # kernel evaluated at all pairwise scaled distances, shape (n_samples, n_eval)
    k = kernel((x - y[:, None]) / h)
    print(k.shape)
    k /= h
    k = np.sum(k, axis=0)
    print(k.shape)
    return k / y.shape[0]
k = kerneldensity(x, x_eval, silverman, gaussian)
(100, 1000)
(1000,)

Kernels

Gaussian Kernel

$$ K(x) = \frac{1}{\sqrt{2\pi}} \exp\left(-\frac{1}{2}x^2 \right) $$

def gaussian(u):
    return 1 / np.sqrt(2 * np.pi) * np.exp(- u**2 / 2)
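
As a small sanity check, a valid kernel should integrate to one over the real line:

# numerically integrate the kernel; the result should be close to 1
u = np.linspace(-6, 6, 1_001)
print(np.trapz(gaussian(u), u))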

Epanechnikov Kernel

Among kernels, the Epanechnikov kernel minimizes the asymptotic mean integrated squared error of the resulting estimator.

def epanechnikov(u):
    # (3/4)(1 - u^2) on |u| <= 1, zero outside the support
    return 0.75 * np.maximum(1 - u**2, 0.0)

Estimating the Length Scale

def silverman(x):
    """Silverman's rule-of-thumb bandwidth for a 1D sample."""
    n_samples = x.shape[0]

    # interquartile range
    iqr = np.subtract(*np.percentile(x, [75, 25]))

    # h = 0.9 * min(std, IQR / 1.34) * n^(-1/5)
    h = 0.9 * np.min([x.std(ddof=1), iqr / 1.34]) * n_samples ** -0.2
    return h
from scipy import stats

# evaluation grid
x_eval = np.linspace(-4, 10, 1_000)

# density evaluated at the samples themselves and on the grid
density_ = kerneldensity(x, x, silverman, epanechnikov)
density = kerneldensity(x, x_eval, silverman, epanechnikov)

print(
    stats.entropy(density_), 
    stats.entropy(density)
)


fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, density, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
ax.set(
    xlabel=r'$x$',
    ylabel=r'$p(x)$'
)
plt.show()
(100, 100)
(100,)
(100, 1000)
(1000,)
4.5043625235270195 6.485400324068893
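
For comparison, a sketch that overlays scipy's Gaussian KDE on the custom estimate; scipy selects its own bandwidth (Scott's rule by default), so the two curves are expected to differ slightly:

# cross-check against scipy's gaussian_kde
kde = stats.gaussian_kde(x)
density_scipy = kde(x_eval)

fig, ax = plt.subplots()
ax.plot(x_eval, density, label="custom (Epanechnikov, Silverman bandwidth)")
ax.plot(x_eval, density_scipy, label="scipy gaussian_kde")
ax.legend()
ax.set(xlabel=r'$x$', ylabel=r'$p(x)$')
plt.show()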

Advantages:

  • Fast at training time (no fitting beyond storing the samples)

Disadvantages:

  • Slow at test time (every query sums over all training points)
    • high memory footprint
  • Sensitive to the kernel bandwidth


K-Nearest Neighbours

This method fixes the number of nearby points k and lets the volume grow until it contains them:

p_\text{kNN}(x) = \frac{k}{N}\frac{1}{V_d \cdot R_k^d(x)}

where:

  • R_k(x) - distance from x to its k-th nearest neighbour (k is fixed)
  • V_d \cdot R_k^d(x) - volume of a d-dimensional ball with radius R_k(x)
    • V_d = \frac{\pi^{d/2}}{\Gamma(d/2+1)}

We choose a fixed value of k and determine the corresponding volume V from the data. Again, it can be shown that the estimator converges to the true probability density as N \rightarrow \infty if V shrinks with N and k grows with N appropriately.

k_neighbours = 5

# pairwise absolute distances between all samples
diff = np.absolute(x - x[:, np.newaxis])
logging.info(f"diff: {diff.shape}")

# distance from each sample to its k-th nearest neighbour (row 0 is the self-distance)
diffsort = np.sort(diff, axis=0)
h = diffsort[k_neighbours, :]

plt.hist(h)
2020-05-24 22:02:59,603:INFO:diff: (30, 30)
(array([9., 8., 3., 0., 2., 5., 0., 0., 2., 1.]),
 array([0.135, 0.383, 0.632, 0.88 , 1.129, 1.378, 1.626, 1.875, 2.123,
        2.372, 2.621]),
 <a list of 10 Patch objects>)

$$ p(x) = \sum_{i=1}^k \frac{1}{r_i^n} $$

from sklearn.neighbors import NearestNeighbors
from scipy import special

def n_volume(r, n):
    """compute the n-volume of a sphere of radius r in n dimensions"""
    return np.pi ** (0.5 * n) / special.gamma(0.5 * n + 1) * (r ** n)
k = 10

# initialize nearest neighbour estimator
knn_clf = NearestNeighbors(n_neighbors=k)

#fit estimator
knn_clf.fit(x.reshape(-1,1))
n_samples = x.shape[0]

# query the k nearest training points for each evaluation point
dist, ind = knn_clf.kneighbors(x_eval.reshape(-1, 1))

# calculate the density from the distances to the k neighbours
k = float(k)
n_dim = 1
density_knn = (k * (k + 1) * 0.5 / n_volume(1, n_dim) / (dist ** n_dim).sum(1))
density_knn /= n_samples
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, density_knn, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
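
For reference, here is a minimal sketch of the basic estimator from the formula above, using only the distance to the k-th neighbour (it reuses dist, k, n_dim, n_samples and n_volume from the cells above):

# basic kNN density: p(x) = k / (N * V_d * R_k(x)^d)
R_k = dist[:, -1]                                   # distance to the k-th neighbour
density_basic = k / (n_samples * n_volume(R_k, n_dim))

fig, ax = plt.subplots()
ax.fill_between(x_eval, density_basic, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()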
from sklearn.neighbors import KernelDensity

# Gaussian kernel with the default bandwidth of 1.0
sk_clf = KernelDensity().fit(x.reshape(-1, 1))

# score_samples returns log p(x) at each evaluation point
x_dens = sk_clf.score_samples(x_eval.reshape(-1, 1))

p = np.exp(x_dens)               # estimated density on the grid
entropy = -np.sum(p * x_dens)    # entropy of the estimate over the grid
print(entropy)
160.2927308434686
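
The sum above ignores the spacing of the evaluation grid; weighting each term by the spacing gives a rough estimate of the differential entropy -\int p(x) \log p(x) dx (a small sketch using the grid defined earlier):

# weight by the grid spacing to approximate the integral
dx = x_eval[1] - x_eval[0]
print(-np.sum(p * x_dens) * dx)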
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, np.exp(x_dens), alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()

Advantages:

  • Yields smoother decision regions (in the classification setting)
  • Provides probabilistic information
    • the estimate reflects the local ratio of examples

Disadvantages:

  • Diverges in the tails and does not integrate to one, so it is not a proper density estimator
  • Can be expensive at test time (a neighbour search over all training points per query)