Density Estimation¶
Methods:
- Histogram
- Mixture Model
- Kernel Density Estimation
- Nearest Neighbour Kernel Density Estimation
import sys, os
from pyprojroot import here
root = here(project_files=[".here"])
sys.path.append(str(here()))
# NUMPY SETTINGS
import numpy as np
np.set_printoptions(precision=3, suppress=True)
# MATPLOTLIB Settings
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# SEABORN SETTINGS
import seaborn as sns
sns.set_context(context='talk',font_scale=0.7)
# sns.set(rc={'figure.figsize': (12, 9.)})
# sns.set_style("whitegrid")
# PANDAS SETTINGS
import pandas as pd
pd.set_option("display.max_rows", 120)
pd.set_option("display.max_columns", 120)
# LOGGING SETTINGS
import sys
import logging
logging.basicConfig(
level=logging.INFO,
stream=sys.stdout,
    format='%(asctime)s:%(levelname)s:%(message)s'
)
logger = logging.getLogger()
#logger.setLevel(logging.INFO)
%load_ext autoreload
%autoreload 2
Data¶
def make_bimodal_data(n_samples: int, f: float=0.3, seed: int=1) -> np.ndarray:
    """Sample a 1D bimodal dataset: a fraction f of points near 0, the rest shifted to 5."""
    rand = np.random.RandomState(seed)
    x = rand.randn(n_samples)
    # shift the last (1 - f) fraction of the points to create a second mode around 5
    x[int(f * n_samples):] += 5
    return x
Histogram Estimation¶
Histograms are a form of local density estimation: nearby points are used to estimate the density. The space is divided into bins, and the density at the center of each bin is approximated by the fraction of training points that fall into that bin,

$$ p(x) \approx \frac{k}{NV} $$

where:
- k is the number of points in a region
- V is the volume (width) of the region
- N is the total number of datapoints

So if x is close to many points, the estimated density at x is high. The recipe is simply: count the datapoints in each bin and divide by N \cdot V (checked numerically after the plot below).
n_samples = 100
x = make_bimodal_data(n_samples)
fig, ax = plt.subplots()
hist = ax.hist(x, bins=10, density=True)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
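The density=True histogram above is exactly the k / (NV) estimate from the formula; a quick numeric check using the same 10 bins:
# density in each bin computed by hand as k / (N * V)
counts, edges = np.histogram(x, bins=10)
widths = np.diff(edges)                          # V: the width of each bin
density_manual = counts / (x.shape[0] * widths)  # k / (N * V)
# matches numpy's density=True normalization
density_np, _ = np.histogram(x, bins=10, density=True)
print(np.allclose(density_manual, density_np))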
Bin Estimation¶
- scott
- knuth
- freedman
- bayesian blocks
    - Studies in Astronomical Time Series Analysis. VI. Bayesian Block Representations - Scargle et al. (2012)
- sqrt
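Most of these rules can be queried directly through numpy for the number of bins they produce ('knuth' and Bayesian blocks come from astropy instead); a quick look at how much they disagree on this sample:
# number of bins produced by the rules that numpy supports directly
# ('fd' is the Freedman-Diaconis rule)
for rule in ['sqrt', 'scott', 'fd', 'auto']:
    edges = np.histogram_bin_edges(x, bins=rule)
    print(f"{rule:>6s}: {len(edges) - 1} bins")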
Examples:
from astropy import stats as astats
methods = [
'scott',
'auto',
'sqrt'
]
# evaluation grid used throughout for plotting the density estimates
x_eval = np.linspace(-4, 10, 1_000)
fig, ax = plt.subplots()
for imethod in methods:
# calculate bin edges
# bins = np.histogram_bin_edges()
ax.hist(x, bins=imethod, alpha=0.3, density=True, label=f"{imethod}")
bins_bayes = astats.bayesian_blocks(x, fitness='events', p0=0.01)
hist = ax.hist(x, bins=bins_bayes, density=True, alpha=0.3, label="Bayes Blocks")
ax.legend()
ax.set(
xlabel=r'$x$',
ylabel=r'$p(x)$'
)
plt.show()
from scipy import stats
for imethod in methods:
# calculate bin edges
# bins = np.histogram_bin_edges()
hist, _ = np.histogram(x, bins=imethod, density=True)
    print(stats.entropy(hist), -(hist[hist > 0] * np.log(hist[hist > 0])).sum())
bins_bayes = astats.bayesian_blocks(x, fitness='events', p0=0.01)
hist, _ = np.histogram(x, bins=bins_bayes, density=True)
print(stats.entropy(hist), -(hist[hist > 0] * np.log(hist[hist > 0])).sum())
- Sensitive to the # of bins
- The shape of the distribution depends on the starting position of the bins
    - for multivariate data, it can also depend on the orientation of the bins
- Discontinuities are not due to the underlying density; they are an artifact of the bin locations
    - the discontinuities make it harder to grasp the structure of the data
- Curse of dimensionality: the # of bins grows exponentially with the # of dimensions (see the quick illustration below)
    - a large number of samples is needed in higher dimensions
- Only suitable for rapid visualization of results in one or two dimensions
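A quick illustration of the curse of dimensionality: with 10 bins per dimension the number of bins explodes, while the sample size stays fixed.
# with 10 bins per dimension, a d-dimensional histogram needs 10**d bins,
# so the 100 samples above are spread ever more thinly
for d in range(1, 6):
    print(f"d={d}: {10**d:>7,d} bins for {n_samples} samples")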
Widths¶
The generic local estimate has the form

$$ p(x) \approx \frac{k}{NV} $$

where:
- V is the volume surrounding x
- k is the number of samples inside the volume
- N is the number of examples

Fixing V and estimating k from the data leads to kernel density estimation; fixing k and estimating V leads to the nearest-neighbour approach. Both routes are sketched on a single query point below.
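A minimal sketch of both routes at a single query point (the query point x0, window half-width h0 and neighbour count k0 are arbitrary choices for illustration):
# local estimate p(x0) ~ k / (N * V) at a single query point
x0 = 5.0
N = x.shape[0]

# fix the volume: a window of half-width h0 has V = 2 * h0; count the points inside
h0 = 0.5
k_inside = np.sum(np.abs(x - x0) <= h0)
print(f"fixed V: p(x0) ~ {k_inside / (N * 2 * h0):.3f}")

# fix k: the distance to the k0-th nearest neighbour sets the half-width
k0 = 10
r_k = np.sort(np.abs(x - x0))[k0 - 1]
print(f"fixed k: p(x0) ~ {k0 / (N * 2 * r_k):.3f}")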
Mixture Models¶
from sklearn.mixture import GaussianMixture

# fit a 5-component Gaussian mixture and evaluate its log-density on the grid
gmm_clf = GaussianMixture(n_components=5).fit(x.reshape(-1, 1))
x_dens = gmm_clf.score_samples(x_eval.reshape(-1, 1))
# score_samples returns log(p), so exponentiate before computing the entropy
print(stats.entropy(np.exp(x_dens)))
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, np.exp(x_dens), alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
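The number of components above is fixed at 5 by hand; one common way to choose it is to minimize the Bayesian information criterion. A minimal sketch:
# fit mixtures with 1-10 components and keep the one with the lowest BIC
X = x.reshape(-1, 1)
n_range = range(1, 11)
bics = [GaussianMixture(n_components=n, random_state=0).fit(X).bic(X) for n in n_range]
print(f"best n_components by BIC: {n_range[int(np.argmin(bics))]}")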
Kernel Density Estimation¶
In kernel density estimation, we choose a fixed value of the volume V and determine k from the data. It can be shown that the estimator converges to the true probability density as N \rightarrow \infty if V shrinks with N and k grows with N appropriately.
$$ p_\text{KDE}(x) = \frac{1}{N \lambda^d}\sum_{n=1}^N K\left( \frac{x - x^n}{\lambda} \right) $$
def kerneldensity(y, x, bandwidth, kernel):
    """Kernel density estimate of the samples y evaluated at the points x."""
    # calculate the bandwidth from the samples
    h = bandwidth(y)
    # evaluate the kernel at the scaled distances between samples and query points
    k = kernel((x - y[:, None]) / h)
    logging.info(f"K: {k.shape}")
    # normalize by the bandwidth and sum the contributions from all samples
    k /= h
    k = np.sum(k, axis=0)
    logging.info(f"K: {k.shape}")
    return k / y.shape[0]
def gaussian(u):
    """Standard Gaussian kernel."""
    return 1 / np.sqrt(2 * np.pi) * np.exp(- u**2 / 2)

# example call (the silverman bandwidth rule is defined further below)
k = kerneldensity(x, x_eval, silverman, gaussian)
Epanechnikov Kernel¶
The Epanechnikov kernel is known to minimize the asymptotic mean integrated squared error among kernel functions.
def epanechnikov(u):
    """Epanechnikov kernel: (3/4)(1 - u^2) for |u| <= 1, zero otherwise."""
    # return np.where(np.abs(u) <= np.sqrt(5), 3/(4*np.sqrt(5)) * (1 - u**2 / 5.0), 0)
    return (3. / 4.) * np.maximum(1 - u**2, 0.0)
Estimating the Length Scale¶
def silverman(x):
    """Silverman's rule-of-thumb bandwidth for a 1D sample."""
    n_samples = x.shape[0]
    # calculate the interquartile range
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    # lengthscale: 0.9 * min(std, IQR / 1.34) * n^(-1/5)
    h = 0.9 * np.min([x.std(ddof=1), iqr / 1.34]) * n_samples ** -0.2
    return h
from scipy import stats
# # get evaluation grid
x_eval = np.linspace(-4, 10, 1_000)
# logging.info(f"x: {x_eval.shape}")
# # get pdf est for each
# density_grid = np.array([stats.norm(xi).pdf(x_eval) for xi in x])
# logging.info(f"K: {density_grid.shape}")
# density = np.sum(density_grid, axis=0)
# logging.info(f"K: {density.shape}")
density_ = kerneldensity(x, x, silverman, epanechnikov)
density = kerneldensity(x, x_eval, silverman, epanechnikov)
print(
stats.entropy(density_),
stats.entropy(density)
)
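As a rough sanity check, the same density can also be estimated with scipy's Gaussian KDE; the kernel differs from the Epanechnikov one above, so the curves will not match exactly. A minimal sketch:
# scipy's Gaussian KDE with Silverman's bandwidth rule
kde_scipy = stats.gaussian_kde(x, bw_method='silverman')
density_scipy = kde_scipy(x_eval)
print(stats.entropy(density_scipy))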
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, density, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
ax.set(
xlabel=r'$x$',
ylabel=r'$p(x)$'
)
plt.show()
Advantages:
- Fast at Training Time
Disadvantages:
- Slow at Test Time
- High memory footprint
- Sensitive to kernel bandwidth
K-Nearest Neighbours¶
This method fixes the number of nearby points k and lets the volume adapt to the data:

$$ p_\text{KNN}(x) = \frac{k}{N \, V_d \, R_k^d(x)} $$

where:
- k is the number of nearest neighbours (fixed)
- R_k(x) is the distance from x to its k-th nearest neighbour
- V_d \cdot R_k^d(x) is the volume of a d-dimensional ball with radius R_k(x)
- V_d = \frac{\pi^{d/2}}{\Gamma(d/2+1)} is the volume of the unit d-ball

We choose a fixed value of k and determine the corresponding volume V from the data. Again, it can be shown that the estimator converges to the true probability density as N \rightarrow \infty if V shrinks with N and k grows with N appropriately.
k_neighbours = 5
# pairwise absolute distances between all training points
diff = np.absolute(x - x[:, np.newaxis])
logging.info(f"diff: {diff.shape}")
# sort each column so that row k gives the distance to the k-th nearest neighbour
diffsort = np.sort(diff, axis=0)
h = diffsort[k_neighbours, :]
plt.hist(h)
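With these k-th neighbour distances in hand, the simple estimator p(x) = k / (N V_d R_k^d(x)) can be evaluated at the training points; in one dimension the "ball" of radius R_k is an interval of length 2 R_k. A minimal sketch:
# simple kNN density at each training point: k / (N * 2 * R_k(x))
density_knn_simple = k_neighbours / (x.shape[0] * 2 * h)

fig, ax = plt.subplots()
ax.scatter(x, density_knn_simple, s=10)
ax.set(xlabel=r'$x$', ylabel=r'$p(x)$')
plt.show()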
from sklearn.neighbors import NearestNeighbors
from scipy import special
def n_volume(r, n):
"""compute the n-volume of a sphere of radius r in n dimensions"""
return np.pi ** (0.5 * n) / special.gamma(0.5 * n + 1) * (r ** n)
k = 10
# initialize nearest neighbour estimator
knn_clf = NearestNeighbors(n_neighbors=k)
#fit estimator
knn_clf.fit(x.reshape(-1,1))
n_samples = x.shape[0]
# query to get neighbours
dist, ind = knn_clf.kneighbors(x_eval.reshape(-1,1))
# n_samples = x_eval.shape[0]
# calculate the density: a smoothed variant that uses the distances to all k
# neighbours rather than only the k-th one (the simple form is commented below)
k = float(k)
n_dim = 1
# density_knn = float(k) / n_volume(dist[:, -1], 1)
density_knn = (k * (k + 1) * 0.5 / n_volume(1, n_dim) / (dist ** n_dim).sum(1))
density_knn /= n_samples
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, density_knn, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
from sklearn.neighbors import KernelDensity
sk_clf = KernelDensity().fit(x.reshape(-1,1))
log_p = sk_clf.score_samples(x_eval.reshape(-1, 1)) # returns log(p) of each grid point
p = np.exp(log_p) # estimate p of each grid point
entropy = -np.sum(p * log_p) # evaluate the entropy
print(entropy)
fig, ax = plt.subplots()
# hist = ax.hist(x, bins=10, density=True)
ax.fill_between(x_eval, p, alpha=0.5)
pts = ax.plot(x, np.full_like(x, -0.01), '|k', markeredgewidth=1)
plt.show()
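The fit above uses the default bandwidth of 1.0; since the estimate is sensitive to the bandwidth, a common option is to select it by cross-validated log-likelihood with GridSearchCV (a sketch; the bandwidth grid is an arbitrary choice):
from sklearn.model_selection import GridSearchCV

# cross-validate the bandwidth on the log-likelihood returned by KernelDensity.score
params = {"bandwidth": np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)
grid.fit(x.reshape(-1, 1))
print(f"best bandwidth: {grid.best_params_['bandwidth']:.3f}")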
Advantages:
- Yields smoother decision regions
- Provides probabilistic information
- The estimate is driven by the local ratio of examples, k / N
Disadvantages:
- Diverges and does not integrate to one, so it is not a proper density estimator
- Can be expensive at query time (a nearest-neighbour search for every evaluation point)