Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion KDEpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from KDEpy.TreeKDE import TreeKDE
from KDEpy.FFTKDE import FFTKDE

__version__ = "1.0.10"
__version__ = "2.0.0"
__author__ = "tommyod"

TreeKDE = TreeKDE
Expand Down
99 changes: 68 additions & 31 deletions KDEpy/bw_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import scipy
import warnings
from KDEpy.binning import linear_binning
from KDEpy.utils import autogrid
from KDEpy.utils import autogrid, weighted_std, weighted_percentile
from scipy import fftpack
from scipy.optimize import brentq

Expand Down Expand Up @@ -212,21 +212,35 @@ def scotts_rule(data, weights=None):
Examples
--------
>>> data = np.arange(9).reshape(-1, 1)
>>> ans = scotts_rule(data)
>>> assert np.allclose(ans, 1.76474568962182)
>>> scotts_rule(data)
1.764745689...
"""
if not len(data.shape) == 2:
raise ValueError("Data must be of shape (obs, dims).")

if weights is not None:
warnings.warn("Scott's rule currently ignores all weights")

obs, dims = data.shape
if not dims == 1:
raise ValueError("Scotts rule is only available for 1D data.")
sigma = np.std(data, ddof=1)

if obs == 1:
return 1
if obs < 1:
raise ValueError("Data must be of length > 0.")

data = data.ravel()

# scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634
IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
IQR_norm = 1.3489795003921634

# Compute sigma and IQR
if weights is not None:
sigma = weighted_std(data, weights=weights, ddof=1)
low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
IQR = (high - low) / IQR_norm
else:
sigma = np.std(data, ddof=1)
low, high = np.percentile(data, q=[25, 75])
IQR = (high - low) / IQR_norm

sigma = min(sigma, IQR)
return sigma * np.power(obs, -1.0 / (dims + 4))
Expand All @@ -244,53 +258,76 @@ def silvermans_rule(data, weights=None):
Examples
--------
>>> data = np.arange(9).reshape(-1, 1)
>>> ans = silvermans_rule(data)
>>> assert np.allclose(ans, 1.8692607078355594)
>>> silvermans_rule(data)
1.8692607078...
"""
if not len(data.shape) == 2:
raise ValueError("Data must be of shape (obs, dims).")

obs, dims = data.shape
if not dims == 1:
raise ValueError("Silverman's rule is only available for 1D data.")

if weights is not None:
warnings.warn("Silverman's rule currently ignores all weights")

if obs == 1:
return 1
if obs < 1:
raise ValueError("Data must be of length > 0.")

sigma = np.std(data, ddof=1)
# scipy.stats.norm.ppf(.75) - scipy.stats.norm.ppf(.25) -> 1.3489795003921634
IQR = (np.percentile(data, q=75) - np.percentile(data, q=25)) / 1.3489795003921634
data = data.ravel()

# scipy.norm.ppf(.75) - scipy.norm.ppf(.25) -> 1.3489795003921634
IQR_norm = 1.3489795003921634

# Compute sigma and IQR
if weights is not None:
sigma = weighted_std(data, weights=weights, ddof=1)
low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
IQR = (high - low) / IQR_norm
else:
sigma = np.std(data, ddof=1)
low, high = np.percentile(data, q=[25, 75])
IQR = (high - low) / IQR_norm

sigma = min(sigma, IQR)

# The logic below is not related to silverman's rule, but if the data is constant
# it's nice to return a value instead of getting an error. A warning will be raised.
if sigma > 0:
return sigma * (obs * 3 / 4.0) ** (-1 / 5)

# ===================== Little spread of values =======================

# stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
IQR_norm = 4.6526957480816815

# Compute sigma and IQR
if weights is not None:
low, high = weighted_percentile(data, [0.25, 0.75], weights=weights)
IQR = (high - low) / IQR_norm
else:
# stats.norm.ppf(.99) - stats.norm.ppf(.01) = 4.6526957480816815
IQR = (np.percentile(data, q=99) - np.percentile(data, q=1)) / 4.6526957480816815
if IQR > 0:
bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
warnings.warn(
"Silverman's rule failed. Too many idential values. \
Setting bw = {}".format(
bw
)
)
return bw

# Here, all values are basically constant
warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
return 1.0
low, high = np.percentile(data, q=[25, 75])
IQR = (high - low) / IQR_norm

if IQR > 0:
bw = IQR * (obs * 3 / 4.0) ** (-1 / 5)
msg = "Silverman's rule failed. Too many idential values. Setting bw = {}".format(bw)
warnings.warn(msg)
return bw

# Here, all values are basically constant
warnings.warn("Silverman's rule failed. Too many idential values. Setting bw = 1.0")
return 1.0


_bw_methods = {
"silverman": silvermans_rule,
"scott": scotts_rule,
"ISJ": improved_sheather_jones,
}


if __name__ == "__main__":
import pytest

# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"])
77 changes: 76 additions & 1 deletion KDEpy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,83 @@ def autogrid(data, boundary_abs=3, num_points=None, boundary_rel=0.05):
return cartesian(list_of_grids)


def weighted_std(values, weights, ddof=0):
"""Return the weighted standard deviation.

Examples
--------
>>> weighted_std([1, 2, 2], weights=[1, 1, 1])
0.4714045207910317
>>> weighted_std([1, 2], weights=[1, 2])
0.4714045207910317
>>> weighted_std([1, 2, 2], weights=[1, 1, 1], ddof=1)
0.5773502691896257
>>> weighted_std([1, 2], weights=[1, 2], ddof=1)
0.5773502691896257

"""
values = np.asarray(values)
weights = np.asarray(weights)
assert np.all(weights > 0), "All weights must be > 0"
assert isinstance(ddof, numbers.Integral), "ddof must be an integer"
assert ddof >= 0

# If the degrees of freedom is greater than zero, we need to scale results
if ddof > 0:
smallest_weight = np.min(weights)
weights_summed = np.sum(weights)
factor = weights_summed / (weights_summed - ddof * smallest_weight)
else:
factor = 1

average = np.average(values, weights=weights)
# Fast and numerically precise:
variance = np.average((values - average) ** 2, weights=weights)
return np.sqrt(factor * variance)


def weighted_percentile(values, perc, weights=None):
"""Compue the weighted percentile.

Based on: https://stackoverflow.com/a/61343915

Examples
--------
>>> weighted_percentile([1, 2, 2], 0)
1.0
>>> weighted_percentile([1, 2, 2], 1.)
2.0
>>> # These computations differ, but the difference between the
>>> # results goes to zero if we increase the weight of `2` even more.
>>> weighted_percentile([1, 2, 2], 0.5)
2.0
>>> weighted_percentile([1, 2], 0.5, [1, 2])
1.66666...
>>> weighted_percentile([1, 2], 0.5, [1, 20])
1.9523809...

"""
values = np.asarray(values)

if weights is None:
weights = np.ones_like(values)
else:
weights = np.asarray(weights)
assert np.all(weights > 0), "All weights must be > 0"

# Sort data and weights
sorted_indices = np.argsort(values)
values = values[sorted_indices]
weights = weights[sorted_indices]

# Compute interpolation
weights_cumsum = np.cumsum(weights)
cdf = (weights_cumsum - 0.5 * weights) / weights_cumsum[-1]
return np.interp(x=perc, xp=cdf, fp=values)


if __name__ == "__main__":
import pytest

# --durations=10 <- May be used to show potentially slow tests
pytest.main(args=[".", "--doctest-modules", "-v", "--capture=sys"])
pytest.main(args=[__file__, "--doctest-modules", "-v", "--capture=sys"])
2 changes: 1 addition & 1 deletion docs/source/examples.rst
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ probability density as the kernel function in the KDE. Below an example is shown
marker='|', label="Resampled from KDE")
x, y = FFTKDE(kernel="gaussian", bw="silverman").fit(data).evaluate()
plt.plot(x, y, label="FFTKDE with Silverman's rule")
plt.title('Weighted and unweighted KDE')
plt.title('Resampling from distribution')
plt.tight_layout(); plt.legend(loc='upper left');


Expand Down