https://github.com/python/cpython/commit/92397d5ead38dde4154e70d00f24973bcf2a925a
commit: 92397d5ead38dde4154e70d00f24973bcf2a925a
branch: main
author: Raymond Hettinger <[email protected]>
committer: rhettinger <[email protected]>
date: 2024-03-27T09:04:32-05:00
summary:

Add statistics recipe for sampling from an estimated probability density 
distribution (#117221)

files:
M Doc/library/statistics.rst

diff --git a/Doc/library/statistics.rst b/Doc/library/statistics.rst
index fc7e0c1ccad286..197c123f8356d8 100644
--- a/Doc/library/statistics.rst
+++ b/Doc/library/statistics.rst
@@ -1148,6 +1148,64 @@ The final prediction goes to the largest posterior. This 
is known as the
   'female'
 
 
+Sampling from kernel density estimation
+***************************************
+
+The :func:`kde()` function creates a continuous probability density
+function from discrete samples.  Some applications need a way to make
+random selections from that distribution.
+
+The technique is to pick a sample from a bandwidth scaled kernel
+function and recenter the result around a randomly chosen point from
+the input data.  This can be done with any kernel that has a known or
+accurately approximated inverse cumulative distribution function.
+
+.. testcode::
+
+   from random import choice, random, seed
+   from math import sqrt, log, pi, tan, asin
+   from statistics import NormalDist
+
+   kernel_invcdfs = {
+       'normal': NormalDist().inv_cdf,
+       'logistic': lambda p: log(p / (1 - p)),
+       'sigmoid': lambda p: log(tan(p * pi/2)),
+       'rectangular': lambda p: 2*p - 1,
+       'triangular': lambda p: sqrt(2*p) - 1 if p < 0.5 else 1 - sqrt(2 - 2*p),
+       'cosine': lambda p: 2*asin(2*p - 1)/pi,
+   }
+
+   def kde_random(data, h, kernel='normal'):
+       'Return a function that samples from kde() smoothed data.'
+       kernel_invcdf = kernel_invcdfs[kernel]
+       def rand():
+           return h * kernel_invcdf(random()) + choice(data)
+       return rand
+
+For example:
+
+.. doctest::
+
+   >>> discrete_samples = [-2.1, -1.3, -0.4, 1.9, 5.1, 6.2]
+   >>> rand = kde_random(discrete_samples, h=1.5)
+   >>> seed(8675309)
+   >>> selections = [rand() for i in range(10)]
+   >>> [round(x, 1) for x in selections]
+   [4.7, 7.4, 1.2, 7.8, 6.9, -1.3, 5.8, 0.2, -1.4, 5.7]
+
+.. testcode::
+    :hide:
+
+    from statistics import kde
+    from math import isclose
+
+    # Verify that cdf / invcdf will round trip
+    xarr = [i/100 for i in range(-100, 101)]
+    for kernel, invcdf in kernel_invcdfs.items():
+        cdf = kde([0.0], h=1.0, kernel=kernel, cumulative=True)
+        for x in xarr:
+            assert isclose(invcdf(cdf(x)), x, abs_tol=1E-9)
+
 ..
    # This modelines must appear within the last ten lines of the file.
    kate: indent-width 3; remove-trailing-space on; replace-tabs on; encoding 
utf-8;

_______________________________________________
Python-checkins mailing list -- [email protected]
To unsubscribe send an email to [email protected]
https://mail.python.org/mailman3/lists/python-checkins.python.org/
Member address: [email protected]

Reply via email to