https://github.com/python/cpython/commit/92397d5ead38dde4154e70d00f24973bcf2a925a
commit: 92397d5ead38dde4154e70d00f24973bcf2a925a
branch: main
author: Raymond Hettinger <[email protected]>
committer: rhettinger <[email protected]>
date: 2024-03-27T09:04:32-05:00
summary:
Add statistics recipe for sampling from an estimated probability density
distribution (#117221)
files:
M Doc/library/statistics.rst
diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index fc7e0c1ccad286..197c123f8356d8 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -1148,6 +1148,64 @@ The final prediction goes to the largest posterior. This
is known as the
'female'
+Sampling from kernel density estimation
+***************************************
+
+The :func:`kde()` function creates a continuous probability density
+function from discrete samples. Some applications need a way to make
+random selections from that distribution.
+
+The technique is to pick a sample from a bandwidth-scaled kernel
+function and recenter the result around a randomly chosen point from
+the input data. This can be done with any kernel that has a known or
+accurately approximated inverse cumulative distribution function.
+
+.. testcode::
+
+ from random import choice, random, seed
+ from math import sqrt, log, pi, tan, asin
+ from statistics import NormalDist
+
+ kernel_invcdfs = {  # kernel name -> inverse CDF (probability p -> quantile of the unscaled kernel)
+ 'normal': NormalDist().inv_cdf,  # quantile function of the standard normal distribution
+ 'logistic': lambda p: log(p / (1 - p)),  # logit: inverts the CDF 1 / (1 + exp(-x))
+ 'sigmoid': lambda p: log(tan(p * pi/2)),  # inverts the CDF 2/pi * atan(exp(x))
+ 'rectangular': lambda p: 2*p - 1,  # inverts the CDF (x + 1) / 2 on [-1, 1]
+ 'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p),  # piecewise: left half for p < 0.5, right half otherwise
+ 'cosine': lambda p: 2*asin(2*p - 1)/pi,  # inverts the CDF (1 + sin(pi*x/2)) / 2 on [-1, 1]
+ }
+
+ def kde_random(data, h, kernel='normal'):  # data: sequence of sample points; h: bandwidth multiplier; kernel: key into kernel_invcdfs
+ 'Return a function that samples from kde() smoothed data.'
+ kernel_invcdf = kernel_invcdfs[kernel]  # looked up once, at construction; an unknown kernel name raises KeyError here
+ def rand():  # NOTE(review): random() can return exactly 0.0, where the normal/logistic/sigmoid inverse CDFs are undefined and raise
+ return h * kernel_invcdf(random()) + choice(data)  # kernel draw scaled by h, recentered on a random data point; random() is consumed before choice()
+ return rand  # the closure keeps references to data, h and the chosen inverse CDF
+
+For example:
+
+.. doctest::
+
+ >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+ >>> rand = kde_random(discrete_samples, h=1.5)
+ >>> seed(8675309)  # fixed seed so the selections shown below are reproducible
+ >>> selections = [rand() for i in range(10)]
+ >>> [round(x, 1) for x in selections]
+ [4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7]
+
+.. testcode::
+ :hide:
+
+ from statistics import kde
+ from math import isclose
+
+ # Verify that cdf / invcdf will round trip
+ xarr = [i/100 for i in range(-100, 101)]  # 201 evenly spaced points covering [-1.0, 1.0]
+ for kernel, invcdf in kernel_invcdfs.items():
+ cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True)  # one data point at 0 with h=1: the CDF of the bare kernel
+ for x in xarr:
+ assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9)  # abs_tol is needed because the expected value x can be 0.0
+
..
# This modeline must appear within the last ten lines of the file.
kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding
utf-8;
_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]