tommyod · tommyod · Nov 23, 2020 · Nov 26, 2020 · Nov 26, 2020 · Nov 26, 2020
diff --git a/KDEpy/__init__.py b/KDEpy/__init__.py
@@ -6,7 +6,7 @@
 from KDEpy.TreeKDE import TreeKDE
 from KDEpy.FFTKDE import FFTKDE
 
-__version__ = "1.0.10"
+__version__ = "2.0.0"
 __author__ = "tommyod"
 
 TreeKDE = TreeKDE

diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py
@@ -7,7 +7,7 @@
 import scipy
 import warnings
 from KDEpy.binning import linear_binning
-from KDEpy.utils import autogrid
+from KDEpy.utils import autogrid, weighted_std, weighted_percentile
 from scipy import fftpack
 from scipy.optimize import brentq
 
@@ -212,21 +212,35 @@ def scotts_rule(data, weights=None):
     Examples
     --------
     >>> data = np.arange(9).reshape(-1, 1)
-    >>> ans = scotts_rule(data)
-    >>> assert np.allclose(ans, 1.76474568962182)
+    >>> scotts_rule(data)
+    1.764745689...
     """
     if not len(data.shape) == 2:
         raise ValueError("Data must be of shape (obs, dims).")
 
-    if weights is not None:
-        warnings.warn("Scott's rule currently ignores all weights")
-
     obs, dims = data.shape
     if not dims == 1:
         raise ValueError("Scotts rule is only available for 1D data.")
-    sigma = np.std(data, ddof=1)
+
+    if obs == 1:
+        return 1
+    if obs < 1:
+        raise ValueError("Data must be of length > 0.")
+
+    data = data.ravel()
+
     # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634
-    IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
+    IQR_norm = 1.3489795003921634
+
+    # Compute sigma and IQR
+    if weights is not None:
+        sigma = weighted_std(data, weights=weights, ddof=1)
+        low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
+        IQR = (high - low) / IQR_norm
+    else:
+        sigma = np.std(data, ddof=1)
+        low, high = np.percentile(data, q=[25, 75])
+        IQR = (high - low) / IQR_norm
 
     sigma = min(sigma, IQR)
     return sigma * np.power(obs, -1.0 / (dims + 4))
@@ -244,53 +258,76 @@ def silvermans_rule(data, weights=None):
     Examples
     --------
     >>> data = np.arange(9).reshape(-1, 1)
-    >>> ans = silvermans_rule(data)
-    >>> assert np.allclose(ans, 1.8692607078355594)
+    >>> silvermans_rule(data)
+    1.8692607078...
     """
     if not len(data.shape) == 2:
         raise ValueError("Data must be of shape (obs, dims).")
+
     obs, dims = data.shape
     if not dims == 1:
         raise ValueError("Silverman's rule is only available for 1D data.")
 
-    if weights is not None:
-        warnings.warn("Silverman's rule currently ignores all weights")
-
     if obs == 1:
         return 1
     if obs < 1:
         raise ValueError("Data must be of length > 0.")
 
-    sigma = np.std(data, ddof=1)
-    # scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634
-    IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
+    data = data.ravel()
+
+    # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634
+    IQR_norm = 1.3489795003921634
+
+    # Compute sigma and IQR
+    if weights is not None:
+        sigma = weighted_std(data, weights=weights, ddof=1)
+        low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
+        IQR = (high - low) / IQR_norm
+    else:
+        sigma = np.std(data, ddof=1)
+        low, high = np.percentile(data, q=[25, 75])
+        IQR = (high - low) / IQR_norm
 
     sigma = min(sigma, IQR)
 
     # The logic below is not related to silverman's rule, but if the data is constant
     # it's nice to return a value instead of getting an error. A warning will be raised.
     if sigma > 0:
         return sigma * (obs * 3 / 4.0) ** (-1 / 5)
+
+    # ===================== Little spread of values =======================
+
+    # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
+    IQR_norm = 4.6526957480816815
+
+    # Compute sigma and IQR
+    if weights is not None:
+        low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
+        IQR = (high - low) / IQR_norm
     else:
-        # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
-        IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815
-        if IQR > 0:
-            bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
-            warnings.warn(
-                "Silverman's rule failed. Too many idential values. \
-Setting bw = {}".format(
-                    bw
-                )
-            )
-            return bw
-
-        # Here, all values are basically constant
-        warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
-        return 1.0
+        low, high = np.percentile(data, q=[25, 75])
+        IQR = (high - low) / IQR_norm
+
+    if IQR > 0:
+        bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
+        msg = "Silverman's rule failed. Too many idential values. Setting bw = {}".format(bw)
+        warnings.warn(msg)
+        return bw
+
+    # Here, all values are basically constant
+    warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
+    return 1.0
 
 
 _bw_methods = {
     "silverman": silvermans_rule,
     "scott": scotts_rule,
     "ISJ": improved_sheather_jones,
 }
+
+
+if __name__ == "__main__":
+    import pytest
+
+    # --durations=10  <- May be used to show potentially slow tests
+    pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"])
diff --git a/KDEpy/utils.py b/KDEpy/utils.py
@@ -131,8 +131,83 @@ def autogrid(data, boundary_abs=3, num_points=None, boundary_rel=0.05):
     return cartesian(list_of_grids)
 
 
+def weighted_std(values, weights, ddof=0):
+    """Return the weighted standard deviation.
+
+    Examples
+    --------
+    >>> weighted_std([1, 2, 2], weights=[1, 1, 1])
+    0.4714045207910317
+    >>> weighted_std([1, 2], weights=[1, 2])
+    0.4714045207910317
+    >>> weighted_std([1, 2, 2], weights=[1, 1, 1], ddof=1)
+    0.5773502691896257
+    >>> weighted_std([1, 2], weights=[1, 2], ddof=1)
+    0.5773502691896257
+
+    """
+    values = np.asarray(values)
+    weights = np.asarray(weights)
+    assert np.all(weights > 0), "All weights must be > 0"
+    assert isinstance(ddof, numbers.Integral), "ddof must be an integer"
+    assert ddof >= 0
+
+    # If the degrees of freedom is greater than zero, we need to scale results
+    if ddof > 0:
+        smallest_weight = np.min(weights)
+        weights_summed = np.sum(weights)
+        factor = weights_summed / (weights_summed - ddof * smallest_weight)
+    else:
+        factor = 1
+
+    average = np.average(values, weights=weights)
+    # Fast and numerically precise:
+    variance = np.average((values - average) ** 2, weights=weights)
+    return np.sqrt(factor * variance)
+
+
+def weighted_percentile(values, perc, weights=None):
+    """Compue the weighted percentile.
+
+    Based on: https://stackoverflow.com/a/61343915
+
+    Examples
+    --------
+    >>> weighted_percentile([1, 2, 2], 0)
+    1.0
+    >>> weighted_percentile([1, 2, 2], 1.)
+    2.0
+    >>> # These computations differ, but the difference between the
+    >>> # results goes to zero if we increase the weight of `2` even more.
+    >>> weighted_percentile([1, 2, 2], 0.5)
+    2.0
+    >>> weighted_percentile([1, 2], 0.5, [1, 2])
+    1.66666...
+    >>> weighted_percentile([1, 2], 0.5, [1, 20])
+    1.9523809...
+
+    """
+    values = np.asarray(values)
+
+    if weights is None:
+        weights = np.ones_like(values)
+    else:
+        weights = np.asarray(weights)
+        assert np.all(weights > 0), "All weights must be > 0"
+
+    # Sort data and weights
+    sorted_indices = np.argsort(values)
+    values = values[sorted_indices]
+    weights = weights[sorted_indices]
+
+    # Compute interpolation
+    weights_cumsum = np.cumsum(weights)
+    cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1]
+    return np.interp(x=perc, xp=cdf, fp=values)
+
+
 if __name__ == "__main__":
     import pytest
 
     # --durations=10  <- May be used to show potentially slow tests
-    pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"])
+    pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"])
diff --git a/docs/source/examples.rst b/docs/source/examples.rst
@@ -93,7 +93,7 @@ probability density as the kernel function in the KDE. Below an example is shown
                 marker='|', label="Resampled from KDE")
     x, y = FFTKDE(kernel="gaussian", bw="silverman").fit(data).evaluate()
     plt.plot(x, y, label="FFTKDE with Silverman's rule")
-    plt.title('Weighted and unweighted KDE')
+    plt.title('Resampling from distribution')
     plt.tight_layout(); plt.legend(loc='upper left');