From fa06351d435808241133f96be373429d8fca8ea2 Mon Sep 17 00:00:00 2001
From: tommyod <tommy.odland@gmail.com>
Date: Mon, 23 Nov 2020 22:18:05 +0100
Subject: [PATCH 1/5] added weighting of silverman and scott

---
 KDEpy/bw_selection.py | 99 +++++++++++++++++++++++++++++--------------
 KDEpy/utils.py        | 69 +++++++++++++++++++++++++++++-
 2 files changed, 136 insertions(+), 32 deletions(-)

diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py
index 736721d..edfa2fa 100644
--- a/KDEpy/bw_selection.py
+++ b/KDEpy/bw_selection.py
@@ -7,7 +7,7 @@
 import scipy
 import warnings
 from KDEpy.binning import linear_binning
-from KDEpy.utils import autogrid
+from KDEpy.utils import autogrid, weighted_std, weighted_percentile
 from scipy import fftpack
 from scipy.optimize import brentq
 
@@ -212,21 +212,35 @@ def scotts_rule(data, weights=None):
     Examples
     --------
     >>> data = np.arange(9).reshape(-1, 1)
-    >>> ans = scotts_rule(data)
-    >>> assert np.allclose(ans, 1.76474568962182)
+    >>> scotts_rule(data)
+    1.6638...
     """
     if not len(data.shape) == 2:
         raise ValueError("Data must be of shape (obs, dims).")
 
-    if weights is not None:
-        warnings.warn("Scott's rule currently ignores all weights")
-
     obs, dims = data.shape
     if not dims == 1:
         raise ValueError("Scotts rule is only available for 1D data.")
-    sigma = np.std(data, ddof=1)
+
+    if obs == 1:
+        return 1
+    if obs < 1:
+        raise ValueError("Data must be of length > 0.")
+
+    data = data.ravel()
+
     # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634
-    IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
+    IQR_norm = 1.3489795003921634
+
+    # Compute sigma and IQR
+    if weights is not None:
+        sigma = weighted_std(data, weights=weights)
+        low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
+        IQR = (high - low) / IQR_norm
+    else:
+        sigma = np.std(data)
+        low, high = np.percentile(data, q=[25, 75])
+        IQR = (high - low) / IQR_norm
 
     sigma = min(sigma, IQR)
     return sigma * np.power(obs, -1.0 / (dims + 4))
@@ -244,26 +258,35 @@ def silvermans_rule(data, weights=None):
     Examples
     --------
     >>> data = np.arange(9).reshape(-1, 1)
-    >>> ans = silvermans_rule(data)
-    >>> assert np.allclose(ans, 1.8692607078355594)
+    >>> silvermans_rule(data)
+    1.762355896...
     """
     if not len(data.shape) == 2:
         raise ValueError("Data must be of shape (obs, dims).")
+
     obs, dims = data.shape
     if not dims == 1:
         raise ValueError("Silverman's rule is only available for 1D data.")
 
-    if weights is not None:
-        warnings.warn("Silverman's rule currently ignores all weights")
-
     if obs == 1:
         return 1
     if obs < 1:
         raise ValueError("Data must be of length > 0.")
 
-    sigma = np.std(data, ddof=1)
-    # scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634
-    IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
+    data = data.ravel()
+
+    # scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634
+    IQR_norm = 1.3489795003921634
+
+    # Compute sigma and IQR
+    if weights is not None:
+        sigma = weighted_std(data, weights=weights)
+        low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
+        IQR = (high - low) / IQR_norm
+    else:
+        sigma = np.std(data)
+        low, high = np.percentile(data, q=[25, 75])
+        IQR = (high - low) / IQR_norm
 
     sigma = min(sigma, IQR)
 
@@ -271,22 +294,29 @@ def silvermans_rule(data, weights=None):
     # it's nice to return a value instead of getting an error. A warning will be raised.
     if sigma > 0:
         return sigma * (obs * 3 / 4.0) ** (-1 / 5)
+
+    # ===================== Little spread of values =======================
+
+    # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
+    IQR_norm = 4.6526957480816815
+
+    # Compute sigma and IQR
+    if weights is not None:
+        low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
+        IQR = (high - low) / IQR_norm
     else:
-        # stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
-        IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815
-        if IQR > 0:
-            bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
-            warnings.warn(
-                "Silverman's rule failed. Too many idential values. \
-Setting bw = {}".format(
-                    bw
-                )
-            )
-            return bw
-
-        # Here, all values are basically constant
-        warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
-        return 1.0
+        low, high = np.percentile(data, q=[25, 75])
+        IQR = (high - low) / IQR_norm
+
+    if IQR > 0:
+        bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
+        msg = "Silverman's rule failed. Too many idential values. Setting bw = {}".format(bw)
+        warnings.warn(msg)
+        return bw
+
+    # Here, all values are basically constant
+    warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
+    return 1.0
 
 
 _bw_methods = {
@@ -294,3 +324,10 @@ def silvermans_rule(data, weights=None):
     "scott": scotts_rule,
     "ISJ": improved_sheather_jones,
 }
+
+
+if __name__ == "__main__":
+    import pytest
+
+    # --durations=10  <- May be used to show potentially slow tests
+    pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"])
diff --git a/KDEpy/utils.py b/KDEpy/utils.py
index 5c07811..8dc805d 100644
--- a/KDEpy/utils.py
+++ b/KDEpy/utils.py
@@ -131,8 +131,75 @@ def autogrid(data, boundary_abs=3, num_points=None, boundary_rel=0.05):
     return cartesian(list_of_grids)
 
 
+def weighted_std(values, weights, ddof=0):
+    """Return the weighted standard deviation.
+
+    Examples
+    --------
+    >>> weighted_std([1, 2, 2], weights=[1, 1, 1])
+    0.4714045207910317
+    >>> weighted_std([1, 2], weights=[1, 2])
+    0.4714045207910317
+    >>> weighted_std([1, 2, 2], weights=[1, 1, 1], ddof=1)
+    0.5773502691896257
+    >>> weighted_std([1, 2], weights=[1, 2], ddof=1)
+    0.5773502691896257
+
+    """
+    values = np.asarray(values)
+    weights = np.asarray(weights)
+    assert np.all(weights > 0), "All weights must be >= 0"
+    assert isinstance(ddof, numbers.Integral), "ddof must be int"
+
+    # If the degrees of freedom is greater than zero, we need to scale results
+    if ddof > 0:
+        weights_summed = np.sum(weights)
+        factor = weights_summed / (weights_summed - ddof)
+    else:
+        factor = 1
+
+    average = np.average(values, weights=weights)
+    # Fast and numerically precise:
+    variance = np.average((values - average) ** 2, weights=weights)
+    return np.sqrt(factor * variance)
+
+
+def weighted_percentile(values, perc, weights=None):
+    """Compue the weighted percentile.
+
+    Based on: https://stackoverflow.com/a/61343915
+
+    Examples
+    --------
+    >>> weighted_percentile([1, 2, 2], 0)
+    1.0
+    >>> weighted_percentile([1, 2, 2], 1.)
+    2.0
+    >>> # These computations differ, but it does not matter
+    >>> weighted_percentile([1, 2], 0.5, [1, 2])
+    1.66666...
+    >>> weighted_percentile([1, 2, 2], 0.5)
+    2.0
+
+    """
+    values = np.asarray(values)
+
+    if weights is None:
+        weights = np.ones_like(values)
+    else:
+        weights = np.asarray(weights)
+        assert np.all(weights > 0), "All weights must be >= 0"
+
+    ix = np.argsort(values)
+    values = values[ix]  # sort data
+    weights = weights[ix]  # sort weights
+    weights_cumsum = np.cumsum(weights)
+    cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1]
+    return np.interp(perc, cdf, values)
+
+
 if __name__ == "__main__":
     import pytest
 
     # --durations=10  <- May be used to show potentially slow tests
-    pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"])
+    pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"])

From ced2dd09be6f1b12baee6308bbd5f3caee4f491a Mon Sep 17 00:00:00 2001
From: tommyod <tommy.odland@gmail.com>
Date: Thu, 26 Nov 2020 17:07:53 +0100
Subject: [PATCH 2/5] incremented version to 2.0.0

---
 KDEpy/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/KDEpy/__init__.py b/KDEpy/__init__.py
index 5cca1d5..fefb1c2 100644
--- a/KDEpy/__init__.py
+++ b/KDEpy/__init__.py
@@ -6,7 +6,7 @@
 from KDEpy.TreeKDE import TreeKDE
 from KDEpy.FFTKDE import FFTKDE
 
-__version__ = "1.0.10"
+__version__ = "2.0.0"
 __author__ = "tommyod"
 
 TreeKDE = TreeKDE

From 42c9bfbe65076518e90fe071c305cd7aa7a0d375 Mon Sep 17 00:00:00 2001
From: tommyod <tommy.odland@gmail.com>
Date: Thu, 26 Nov 2020 19:32:06 +0100
Subject: [PATCH 3/5] added back ddof=1 as default

---
 KDEpy/bw_selection.py | 14 +++++++-------
 KDEpy/utils.py        |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py
index edfa2fa..836fd53 100644
--- a/KDEpy/bw_selection.py
+++ b/KDEpy/bw_selection.py
@@ -213,7 +213,7 @@ def scotts_rule(data, weights=None):
     --------
     >>> data = np.arange(9).reshape(-1, 1)
     >>> scotts_rule(data)
-    1.6638...
+    1.764745689...
     """
     if not len(data.shape) == 2:
         raise ValueError("Data must be of shape (obs, dims).")
@@ -234,11 +234,11 @@ def scotts_rule(data, weights=None):
 
     # Compute sigma and IQR
     if weights is not None:
-        sigma = weighted_std(data, weights=weights)
+        sigma = weighted_std(data, weights=weights, ddof=1)
         low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
         IQR = (high - low) / IQR_norm
     else:
-        sigma = np.std(data)
+        sigma = np.std(data, ddof=1)
         low, high = np.percentile(data, q=[25, 75])
         IQR = (high - low) / IQR_norm
 
@@ -259,7 +259,7 @@ def silvermans_rule(data, weights=None):
     --------
     >>> data = np.arange(9).reshape(-1, 1)
     >>> silvermans_rule(data)
-    1.762355896...
+    1.8692607078...
     """
     if not len(data.shape) == 2:
         raise ValueError("Data must be of shape (obs, dims).")
@@ -280,11 +280,11 @@ def silvermans_rule(data, weights=None):
 
     # Compute sigma and IQR
     if weights is not None:
-        sigma = weighted_std(data, weights=weights)
+        sigma = weighted_std(data, weights=weights, ddof=1)
         low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
         IQR = (high - low) / IQR_norm
     else:
-        sigma = np.std(data)
+        sigma = np.std(data, ddof=1)
         low, high = np.percentile(data, q=[25, 75])
         IQR = (high - low) / IQR_norm
 
@@ -330,4 +330,4 @@ def silvermans_rule(data, weights=None):
     import pytest
 
     # --durations=10  <- May be used to show potentially slow tests
-    pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"])
+    pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"])
diff --git a/KDEpy/utils.py b/KDEpy/utils.py
index 8dc805d..abcfc40 100644
--- a/KDEpy/utils.py
+++ b/KDEpy/utils.py
@@ -153,8 +153,9 @@ def weighted_std(values, weights, ddof=0):
 
     # If the degrees of freedom is greater than zero, we need to scale results
     if ddof > 0:
+        smallest_weight = np.min(weights)
         weights_summed = np.sum(weights)
-        factor = weights_summed / (weights_summed - ddof)
+        factor = weights_summed / (weights_summed - ddof * smallest_weight)
     else:
         factor = 1
 

From fcbdff58b119654198844f5db2e1debad13091b6 Mon Sep 17 00:00:00 2001
From: tommyod <tommy.odland@gmail.com>
Date: Thu, 26 Nov 2020 19:47:28 +0100
Subject: [PATCH 4/5] cleaned up a little bit

---
 KDEpy/utils.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/KDEpy/utils.py b/KDEpy/utils.py
index abcfc40..5b7a78d 100644
--- a/KDEpy/utils.py
+++ b/KDEpy/utils.py
@@ -148,8 +148,9 @@ def weighted_std(values, weights, ddof=0):
     """
     values = np.asarray(values)
     weights = np.asarray(weights)
-    assert np.all(weights > 0), "All weights must be >= 0"
-    assert isinstance(ddof, numbers.Integral), "ddof must be int"
+    assert np.all(weights > 0), "All weights must be > 0"
+    assert isinstance(ddof, numbers.Integral), "ddof must be an integer"
+    assert ddof >= 0
 
     # If the degrees of freedom is greater than zero, we need to scale results
     if ddof > 0:
@@ -176,11 +177,14 @@ def weighted_percentile(values, perc, weights=None):
     1.0
     >>> weighted_percentile([1, 2, 2], 1.)
     2.0
-    >>> # These computations differ, but it does not matter
-    >>> weighted_percentile([1, 2], 0.5, [1, 2])
-    1.66666...
+    >>> # These computations differ, but the difference between the
+    >>> # results goes to zero if we increase the weight of `2` even more.
     >>> weighted_percentile([1, 2, 2], 0.5)
     2.0
+    >>> weighted_percentile([1, 2], 0.5, [1, 2])
+    1.66666...
+    >>> weighted_percentile([1, 2], 0.5, [1, 20])
+    1.9523809...
 
     """
     values = np.asarray(values)
@@ -189,14 +193,17 @@ def weighted_percentile(values, perc, weights=None):
         weights = np.ones_like(values)
     else:
         weights = np.asarray(weights)
-        assert np.all(weights > 0), "All weights must be >= 0"
+        assert np.all(weights > 0), "All weights must be > 0"
+
+    # Sort data and weights
+    sorted_indices = np.argsort(values)
+    values = values[sorted_indices]
+    weights = weights[sorted_indices]
 
-    ix = np.argsort(values)
-    values = values[ix]  # sort data
-    weights = weights[ix]  # sort weights
+    # Compute interpolation
     weights_cumsum = np.cumsum(weights)
     cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1]
-    return np.interp(perc, cdf, values)
+    return np.interp(x=perc, xp=cdf, fp=values)
 
 
 if __name__ == "__main__":

From 8b5a178153f636f65e672b4c09edaef14e9367fa Mon Sep 17 00:00:00 2001
From: tommyod <tommy.odland@gmail.com>
Date: Thu, 26 Nov 2020 20:05:00 +0100
Subject: [PATCH 5/5] fixed typo in docs

---
 docs/source/examples.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index dc681b8..8cc6d48 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -93,7 +93,7 @@ probability density as the kernel function in the KDE. Below an example is shown
                 marker='|', label="Resampled from KDE")
     x, y = FFTKDE(kernel="gaussian", bw="silverman").fit(data).evaluate()
     plt.plot(x, y, label="FFTKDE with Silverman's rule")
-    plt.title('Weighted and unweighted KDE')
+    plt.title('Resampling from distribution')
     plt.tight_layout(); plt.legend(loc='upper left');