diff --git a/KDEpy/__init__.py b/KDEpy/__init__.py index 5cca1d5..fefb1c2 100644 --- a/KDEpy/__init__.py +++ b/KDEpy/__init__.py @@ -6,7 +6,7 @@ from KDEpy.TreeKDE import TreeKDE from KDEpy.FFTKDE import FFTKDE -__version__ = "1.0.10" +__version__ = "2.0.0" __author__ = "tommyod" TreeKDE = TreeKDE diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py index 736721d..836fd53 100644 --- a/KDEpy/bw_selection.py +++ b/KDEpy/bw_selection.py @@ -7,7 +7,7 @@ import scipy import warnings from KDEpy.binning import linear_binning -from KDEpy.utils import autogrid +from KDEpy.utils import autogrid, weighted_std, weighted_percentile from scipy import fftpack from scipy.optimize import brentq @@ -212,21 +212,35 @@ def scotts_rule(data, weights=None): Examples -------- >>> data = np.arange(9).reshape(-1, 1) - >>> ans = scotts_rule(data) - >>> assert np.allclose(ans, 1.76474568962182) + >>> scotts_rule(data) + 1.764745689... """ if not len(data.shape) == 2: raise ValueError("Data must be of shape (obs, dims).") - if weights is not None: - warnings.warn("Scott's rule currently ignores all weights") - obs, dims = data.shape if not dims == 1: raise ValueError("Scotts rule is only available for 1D data.") - sigma = np.std(data, ddof=1) + + if obs == 1: + return 1 + if obs < 1: + raise ValueError("Data must be of length > 0.") + + data = data.ravel() + # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634 - IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634 + IQR_norm = 1.3489795003921634 + + # Compute sigma and IQR + if weights is not None: + sigma = weighted_std(data, weights=weights, ddof=1) + low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) + IQR = (high - low) / IQR_norm + else: + sigma = np.std(data, ddof=1) + low, high = np.percentile(data, q=[25, 75]) + IQR = (high - low) / IQR_norm sigma = min(sigma, IQR) return sigma * np.power(obs, -1.0 / (dims + 4)) @@ -244,26 +258,35 @@ def silvermans_rule(data, weights=None): Examples -------- >>> data = np.arange(9).reshape(-1, 1) - >>> ans = silvermans_rule(data) - >>> assert np.allclose(ans, 1.8692607078355594) + >>> silvermans_rule(data) + 1.8692607078... """ if not len(data.shape) == 2: raise ValueError("Data must be of shape (obs, dims).") + obs, dims = data.shape if not dims == 1: raise ValueError("Silverman's rule is only available for 1D data.") - if weights is not None: - warnings.warn("Silverman's rule currently ignores all weights") - if obs == 1: return 1 if obs < 1: raise ValueError("Data must be of length > 0.") - sigma = np.std(data, ddof=1) - # scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634 - IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634 + data = data.ravel() + + # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634 + IQR_norm = 1.3489795003921634 + + # Compute sigma and IQR + if weights is not None: + sigma = weighted_std(data, weights=weights, ddof=1) + low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) + IQR = (high - low) / IQR_norm + else: + sigma = np.std(data, ddof=1) + low, high = np.percentile(data, q=[25, 75]) + IQR = (high - low) / IQR_norm sigma = min(sigma, IQR) @@ -271,22 +294,29 @@ def silvermans_rule(data, weights=None): # it's nice to return a value instead of getting an error. A warning will be raised. if sigma > 0: return sigma * (obs * 3 / 4.0) ** (-1 / 5) + + # ===================== Little spread of values ======================= + + # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815 + IQR_norm = 4.6526957480816815 + + # Compute sigma and IQR + if weights is not None: + low, high = weighted_percentile(data, [0.25, 0.75], weights=weights) + IQR = (high - low) / IQR_norm else: - # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815 - IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815 - if IQR > 0: - bw = IQR * (obs * 3 / 4.0) ** (-1 / 5) - warnings.warn( - "Silverman's rule failed. Too many idential values. \ -Setting bw = {}".format( - bw - ) - ) - return bw - - # Here, all values are basically constant - warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0") - return 1.0 + low, high = np.percentile(data, q=[25, 75]) + IQR = (high - low) / IQR_norm + + if IQR > 0: + bw = IQR * (obs * 3 / 4.0) ** (-1 / 5) + msg = "Silverman's rule failed. Too many idential values. Setting bw = {}".format(bw) + warnings.warn(msg) + return bw + + # Here, all values are basically constant + warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0") + return 1.0 _bw_methods = { @@ -294,3 +324,10 @@ def silvermans_rule(data, weights=None): "scott": scotts_rule, "ISJ": improved_sheather_jones, } + + +if __name__ == "__main__": + import pytest + + # --durations=10 <- May be used to show potentially slow tests + pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"]) diff --git a/KDEpy/utils.py b/KDEpy/utils.py index 5c07811..5b7a78d 100644 --- a/KDEpy/utils.py +++ b/KDEpy/utils.py @@ -131,8 +131,83 @@ def autogrid(data, boundary_abs=3, num_points=None, boundary_rel=0.05): return cartesian(list_of_grids) +def weighted_std(values, weights, ddof=0): + """Return the weighted standard deviation. + + Examples + -------- + >>> weighted_std([1, 2, 2], weights=[1, 1, 1]) + 0.4714045207910317 + >>> weighted_std([1, 2], weights=[1, 2]) + 0.4714045207910317 + >>> weighted_std([1, 2, 2], weights=[1, 1, 1], ddof=1) + 0.5773502691896257 + >>> weighted_std([1, 2], weights=[1, 2], ddof=1) + 0.5773502691896257 + + """ + values = np.asarray(values) + weights = np.asarray(weights) + assert np.all(weights > 0), "All weights must be > 0" + assert isinstance(ddof, numbers.Integral), "ddof must be an integer" + assert ddof >= 0 + + # If the degrees of freedom is greater than zero, we need to scale results + if ddof > 0: + smallest_weight = np.min(weights) + weights_summed = np.sum(weights) + factor = weights_summed / (weights_summed - ddof * smallest_weight) + else: + factor = 1 + + average = np.average(values, weights=weights) + # Fast and numerically precise: + variance = np.average((values - average) ** 2, weights=weights) + return np.sqrt(factor * variance) + + +def weighted_percentile(values, perc, weights=None): + """Compue the weighted percentile. + + Based on: https://stackoverflow.com/a/61343915 + + Examples + -------- + >>> weighted_percentile([1, 2, 2], 0) + 1.0 + >>> weighted_percentile([1, 2, 2], 1.) + 2.0 + >>> # These computations differ, but the difference between the + >>> # results goes to zero if we increase the weight of `2` even more. + >>> weighted_percentile([1, 2, 2], 0.5) + 2.0 + >>> weighted_percentile([1, 2], 0.5, [1, 2]) + 1.66666... + >>> weighted_percentile([1, 2], 0.5, [1, 20]) + 1.9523809... + + """ + values = np.asarray(values) + + if weights is None: + weights = np.ones_like(values) + else: + weights = np.asarray(weights) + assert np.all(weights > 0), "All weights must be > 0" + + # Sort data and weights + sorted_indices = np.argsort(values) + values = values[sorted_indices] + weights = weights[sorted_indices] + + # Compute interpolation + weights_cumsum = np.cumsum(weights) + cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1] + return np.interp(x=perc, xp=cdf, fp=values) + + if __name__ == "__main__": import pytest # --durations=10 <- May be used to show potentially slow tests - pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"]) + pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"]) diff --git a/docs/source/examples.rst b/docs/source/examples.rst index dc681b8..8cc6d48 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -93,7 +93,7 @@ probability density as the kernel function in the KDE. Below an example is shown marker='|', label="Resampled from KDE") x, y = FFTKDE(kernel="gaussian", bw="silverman").fit(data).evaluate() plt.plot(x, y, label="FFTKDE with Silverman's rule") - plt.title('Weighted and unweighted KDE') + plt.title('Resampling from distribution') plt.tight_layout(); plt.legend(loc='upper left');