From fa06351d435808241133f96be373429d8fca8ea2 Mon Sep 17 00:00:00 2001 From: tommyod Date: Mon, 23 Nov 2020 22:18:05 +0100 Subject: [PATCH 1/5] added weighting of silverman and scott --- KDEpy/bw_selection.py | 99 +++++++++++++++++++++++++++++-------------- KDEpy/utils.py | 69 +++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 32 deletions(-) diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py index 736721d..edfa2fa 100644 --- a/KDEpy/bw_selection.py +++ b/KDEpy/bw_selection.py @@ -7,7 +7,7 @@ import scipy import warnings from KDEpy.binning import linear_binning -from KDEpy.utils import autogrid +from KDEpy.utils import autogrid, weighted_std, weighted_percentile from scipy import fftpack from scipy.optimize import brentq @@ -212,21 +212,35 @@ def scotts_rule(data, weights=None): Examples -------- >>> data = np.arange(9).reshape(-1, 1) - >>> ans = scotts_rule(data) - >>> assert np.allclose(ans, 1.76474568962182) + >>> scotts_rule(data) + 1.6638... """ if not len(data.shape) == 2: raise ValueError("Data must be of shape (obs, dims).") - if weights is not None: - warnings.warn("Scott's rule currently ignores all weights") - obs, dims = data.shape if not dims == 1: raise ValueError("Scotts rule is only available for 1D data.") - sigma = np.std(data, ddof=1) + + if obs == 1: + return 1 + if obs < 1: + raise ValueError("Data must be of length > 0.") + + data = data.ravel() + # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634 - IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634 + IQR_norm = 1.3489795003921634 + + # Compute sigma and IQR + if weights is not None: + sigma = weighted_std(data, weights=weights) + low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) + IQR = (high - low) / IQR_norm + else: + sigma = np.std(data) + low, high = np.percentile(data, q=[25, 75]) + IQR = (high - low) / IQR_norm sigma = min(sigma, IQR) return sigma * np.power(obs, -1.0 / (dims + 4)) @@ -244,26 +258,35 @@ def silvermans_rule(data, weights=None): Examples -------- >>> data = np.arange(9).reshape(-1, 1) - >>> ans = silvermans_rule(data) - >>> assert np.allclose(ans, 1.8692607078355594) + >>> silvermans_rule(data) + 1.762355896... """ if not len(data.shape) == 2: raise ValueError("Data must be of shape (obs, dims).") + obs, dims = data.shape if not dims == 1: raise ValueError("Silverman's rule is only available for 1D data.") - if weights is not None: - warnings.warn("Silverman's rule currently ignores all weights") - if obs == 1: return 1 if obs < 1: raise ValueError("Data must be of length > 0.") - sigma = np.std(data, ddof=1) - # scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634 - IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634 + data = data.ravel() + + # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634 + IQR_norm = 1.3489795003921634 + + # Compute sigma and IQR + if weights is not None: + sigma = weighted_std(data, weights=weights) + low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) + IQR = (high - low) / IQR_norm + else: + sigma = np.std(data) + low, high = np.percentile(data, q=[25, 75]) + IQR = (high - low) / IQR_norm sigma = min(sigma, IQR) @@ -271,22 +294,29 @@ def silvermans_rule(data, weights=None): # it's nice to return a value instead of getting an error. A warning will be raised. if sigma > 0: return sigma * (obs * 3 / 4.0) ** (-1 / 5) + + # ===================== Little spread of values ======================= + + # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815 + IQR_norm = 4.6526957480816815 + + # Compute sigma and IQR + if weights is not None: + low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) + IQR = (high - low) / IQR_norm else: - # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815 - IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815 - if IQR > 0: - bw = IQR * (obs * 3 / 4.0) ** (-1 / 5) - warnings.warn( - "Silverman's rule failed. Too many idential values. \ -Setting bw = {}".format( - bw - ) - ) - return bw - - # Here, all values are basically constant - warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0") - return 1.0 + low, high = np.percentile(data, q=[25, 75]) + IQR = (high - low) / IQR_norm + + if IQR > 0: + bw = IQR * (obs * 3 / 4.0) ** (-1 / 5) + msg = "Silverman's rule failed. Too many idential values. Setting bw = {}".format(bw) + warnings.warn(msg) + return bw + + # Here, all values are basically constant + warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0") + return 1.0 _bw_methods = { @@ -294,3 +324,10 @@ def silvermans_rule(data, weights=None): "scott": scotts_rule, "ISJ": improved_sheather_jones, } + + +if __name__ == "__main__": + import pytest + + # --durations=10 <- May be used to show potentially slow tests + pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"]) diff --git a/KDEpy/utils.py b/KDEpy/utils.py index 5c07811..8dc805d 100644 --- a/KDEpy/utils.py +++ b/KDEpy/utils.py @@ -131,8 +131,75 @@ def autogrid(data, boundary_abs=3, num_points=None, boundary_rel=0.05): return cartesian(list_of_grids) +def weighted_std(values, weights, ddof=0): + """Return the weighted standard deviation. + + Examples + -------- + >>> weighted_std([1, 2, 2], weights=[1, 1, 1]) + 0.4714045207910317 + >>> weighted_std([1, 2], weights=[1, 2]) + 0.4714045207910317 + >>> weighted_std([1, 2, 2], weights=[1, 1, 1], ddof=1) + 0.5773502691896257 + >>> weighted_std([1, 2], weights=[1, 2], ddof=1) + 0.5773502691896257 + + """ + values = np.asarray(values) + weights = np.asarray(weights) + assert np.all(weights > 0), "All weights must be >= 0" + assert isinstance(ddof, numbers.Integral), "ddof must be int" + + # If the degrees of freedom is greater than zero, we need to scale results + if ddof > 0: + weights_summed = np.sum(weights) + factor = weights_summed / (weights_summed - ddof) + else: + factor = 1 + + average = np.average(values, weights=weights) + # Fast and numerically precise: + variance = np.average((values - average) ** 2, weights=weights) + return np.sqrt(factor * variance) + + +def weighted_percentile(values, perc, weights=None): + """Compue the weighted percentile. + + Based on: https://stackoverflow.com/a/61343915 + + Examples + -------- + >>> weighted_percentile([1, 2, 2], 0) + 1.0 + >>> weighted_percentile([1, 2, 2], 1.) + 2.0 + >>> # These computations differ, but it does not matter + >>> weighted_percentile([1, 2], 0.5, [1, 2]) + 1.66666... + >>> weighted_percentile([1, 2, 2], 0.5) + 2.0 + + """ + values = np.asarray(values) + + if weights is None: + weights = np.ones_like(values) + else: + weights = np.asarray(weights) + assert np.all(weights > 0), "All weights must be >= 0" + + ix = np.argsort(values) + values = values[ix] # sort data + weights = weights[ix] # sort weights + weights_cumsum = np.cumsum(weights) + cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1] + return np.interp(perc, cdf, values) + + if __name__ == "__main__": import pytest # --durations=10 <- May be used to show potentially slow tests - pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"]) + pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"]) From ced2dd09be6f1b12baee6308bbd5f3caee4f491a Mon Sep 17 00:00:00 2001 From: tommyod Date: Thu, 26 Nov 2020 17:07:53 +0100 Subject: [PATCH 2/5] incremented version to 2.0.0 --- KDEpy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/KDEpy/__init__.py b/KDEpy/__init__.py index 5cca1d5..fefb1c2 100644 --- a/KDEpy/__init__.py +++ b/KDEpy/__init__.py @@ -6,7 +6,7 @@ from KDEpy.TreeKDE import TreeKDE from KDEpy.FFTKDE import FFTKDE -__version__ = "1.0.10" +__version__ = "2.0.0" __author__ = "tommyod" TreeKDE = TreeKDE From 42c9bfbe65076518e90fe071c305cd7aa7a0d375 Mon Sep 17 00:00:00 2001 From: tommyod Date: Thu, 26 Nov 2020 19:32:06 +0100 Subject: [PATCH 3/5] added back ddof=1 as default --- KDEpy/bw_selection.py | 14 +++++++------- KDEpy/utils.py | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py index edfa2fa..836fd53 100644 --- a/KDEpy/bw_selection.py +++ b/KDEpy/bw_selection.py @@ -213,7 +213,7 @@ def scotts_rule(data, weights=None): -------- >>> data = np.arange(9).reshape(-1, 1) >>> scotts_rule(data) - 1.6638... + 1.764745689... """ if not len(data.shape) == 2: raise ValueError("Data must be of shape (obs, dims).") @@ -234,11 +234,11 @@ def scotts_rule(data, weights=None): # Compute sigma and IQR if weights is not None: - sigma = weighted_std(data, weights=weights) + sigma = weighted_std(data, weights=weights, ddof=1) low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) IQR = (high - low) / IQR_norm else: - sigma = np.std(data) + sigma = np.std(data, ddof=1) low, high = np.percentile(data, q=[25, 75]) IQR = (high - low) / IQR_norm @@ -259,7 +259,7 @@ def silvermans_rule(data, weights=None): -------- >>> data = np.arange(9).reshape(-1, 1) >>> silvermans_rule(data) - 1.762355896... + 1.8692607078... """ if not len(data.shape) == 2: raise ValueError("Data must be of shape (obs, dims).") @@ -280,11 +280,11 @@ def silvermans_rule(data, weights=None): # Compute sigma and IQR if weights is not None: - sigma = weighted_std(data, weights=weights) + sigma = weighted_std(data, weights=weights, ddof=1) low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) IQR = (high - low) / IQR_norm else: - sigma = np.std(data) + sigma = np.std(data, ddof=1) low, high = np.percentile(data, q=[25, 75]) IQR = (high - low) / IQR_norm @@ -330,4 +330,4 @@ def silvermans_rule(data, weights=None): import pytest # --durations=10 <- May be used to show potentially slow tests - pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"]) + pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"]) diff --git a/KDEpy/utils.py b/KDEpy/utils.py index 8dc805d..abcfc40 100644 --- a/KDEpy/utils.py +++ b/KDEpy/utils.py @@ -153,8 +153,9 @@ def weighted_std(values, weights, ddof=0): # If the degrees of freedom is greater than zero, we need to scale results if ddof > 0: + smallest_weight = np.min(weights) weights_summed = np.sum(weights) - factor = weights_summed / (weights_summed - ddof) + factor = weights_summed / (weights_summed - ddof * smallest_weight) else: factor = 1 From fcbdff58b119654198844f5db2e1debad13091b6 Mon Sep 17 00:00:00 2001 From: tommyod Date: Thu, 26 Nov 2020 19:47:28 +0100 Subject: [PATCH 4/5] cleaned up a little bit --- KDEpy/utils.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/KDEpy/utils.py b/KDEpy/utils.py index abcfc40..5b7a78d 100644 --- a/KDEpy/utils.py +++ b/KDEpy/utils.py @@ -148,8 +148,9 @@ def weighted_std(values, weights, ddof=0): """ values = np.asarray(values) weights = np.asarray(weights) - assert np.all(weights > 0), "All weights must be >= 0" - assert isinstance(ddof, numbers.Integral), "ddof must be int" + assert np.all(weights > 0), "All weights must be > 0" + assert isinstance(ddof, numbers.Integral), "ddof must be an integer" + assert ddof >= 0 # If the degrees of freedom is greater than zero, we need to scale results if ddof > 0: @@ -176,11 +177,14 @@ def weighted_percentile(values, perc, weights=None): 1.0 >>> weighted_percentile([1, 2, 2], 1.) 2.0 - >>> # These computations differ, but it does not matter - >>> weighted_percentile([1, 2], 0.5, [1, 2]) - 1.66666... + >>> # These computations differ, but the difference between the + >>> # results goes to zero if we increase the weight of `2` even more. >>> weighted_percentile([1, 2, 2], 0.5) 2.0 + >>> weighted_percentile([1, 2], 0.5, [1, 2]) + 1.66666... + >>> weighted_percentile([1, 2], 0.5, [1, 20]) + 1.9523809... """ values = np.asarray(values) @@ -189,14 +193,17 @@ def weighted_percentile(values, perc, weights=None): weights = np.ones_like(values) else: weights = np.asarray(weights) - assert np.all(weights > 0), "All weights must be >= 0" + assert np.all(weights > 0), "All weights must be > 0" + + # Sort data and weights + sorted_indices = np.argsort(values) + values = values[sorted_indices] + weights = weights[sorted_indices] - ix = np.argsort(values) - values = values[ix] # sort data - weights = weights[ix] # sort weights + # Compute interpolation weights_cumsum = np.cumsum(weights) cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1] - return np.interp(perc, cdf, values) + return np.interp(x=perc, xp=cdf, fp=values) if __name__ == "__main__": From 8b5a178153f636f65e672b4c09edaef14e9367fa Mon Sep 17 00:00:00 2001 From: tommyod Date: Thu, 26 Nov 2020 20:05:00 +0100 Subject: [PATCH 5/5] fixed typo in docs --- docs/source/examples.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index dc681b8..8cc6d48 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -93,7 +93,7 @@ probability density as the kernel function in the KDE. Below an example is shown marker='|', label="Resampled from KDE") x, y = FFTKDE(kernel="gaussian", bw="silverman").fit(data).evaluate() plt.plot(x, y, label="FFTKDE with Silverman's rule") - plt.title('Weighted and unweighted KDE') + plt.title('Resampling from distribution') plt.tight_layout(); plt.legend(loc='upper left');