diff --git a/KDEpy/BaseKDE.py b/KDEpy/BaseKDE.py index cfe3359..61dfd6d 100644 --- a/KDEpy/BaseKDE.py +++ b/KDEpy/BaseKDE.py @@ -8,7 +8,7 @@ from abc import ABC, abstractmethod from collections.abc import Sequence from KDEpy.kernel_funcs import _kernel_functions -from KDEpy.bw_selection import _bw_methods +from KDEpy.bw_selection import _bw_methods, cross_val from KDEpy.utils import autogrid @@ -31,7 +31,7 @@ class BaseKDE(ABC): _bw_methods = _bw_methods @abstractmethod - def __init__(self, kernel: str, bw: float): + def __init__(self, kernel: str, bw: float, norm: float): """Initialize the kernel density estimator. The return type must be duplicated in the docstring to comply @@ -59,6 +59,9 @@ def __init__(self, kernel: str, bw: float): else: raise ValueError(msg) + # CV method must be added here since it depends on self + _bw_methods["CV"] = self.cross_val + # The `bw` paramter may either be a positive number, a string, or # array-like such that each point in the data has a uniue bw if isinstance(bw, numbers.Number) and bw > 0: @@ -74,12 +77,15 @@ def __init__(self, kernel: str, bw: float): else: raise ValueError("Bandwidth must be > 0, array-like or a string.") + self.norm = norm + # Test quickly that the method has done what is was supposed to do assert callable(self.kernel) assert isinstance(self.bw_method, (np.ndarray, Sequence, numbers.Number)) or callable(self.bw_method) + assert isinstance(self.norm, numbers.Number) and self.norm > 0 @abstractmethod - def fit(self, data, weights=None): + def fit(self, data, weights=None, **kwargs): """ Fit the kernel density estimator to the data. This method converts the data to shape (obs, dims) and the weights @@ -93,6 +99,8 @@ def fit(self, data, weights=None): weights : array-like, Sequence or None May be array-like of shape (obs,), shape (obs, dims), a Python Sequence, e.g. a list or tuple, or None. + **kwargs: + List of arguments to be passed to bandwidth optimization method. """ # -------------- Set up the data depending on input ------------------- @@ -126,7 +134,7 @@ def fit(self, data, weights=None): if isinstance(self.bw_method, (np.ndarray, Sequence)): self.bw = self.bw_method elif callable(self.bw_method): - self.bw = self.bw_method(self.data, self.weights) + self.bw = self.bw_method(self.data, self.weights, **kwargs) else: self.bw = self.bw_method @@ -175,6 +183,75 @@ def evaluate(self, grid_points=None, bw_to_scalar=True): assert bw > 0 assert len(self.grid_points.shape) == 2 + def score(self, test_data, test_weights=None): + """ + Computes the score of test data on the KDE model. The score is + calculated as the mean log-probability of the test samples + on the model. The method takes into account test weights, and + works with variable bandwidths. + + Parameters + ---------- + test_data : array-like or Sequence + May be array-like of shape (obs,), shape (obs, dims) or a + Python Sequence, e.g. a list or tuple. + test_weights : array-like, Sequence or None + May be array-like of shape (obs,), shape (obs, dims), a + Python Sequence, e.g. a list or tuple, or None. + """ + + # -------------- Set up the data depending on input ------------------- + # In the end, the data should be an ndarray of shape (obs, dims) + test_data = self._process_sequence(test_data) + + obs, dims = test_data.shape + + if not obs > 0: + raise ValueError("Test data must contain at least one data point.") + assert dims > 0 + + # -------------- Set up the weights depending on input ---------------- + if test_weights is not None: + test_weights = self._process_sequence(test_weights).ravel() + if not obs == len(test_weights): + raise ValueError("Number of test data obs must match test weights") + + return np.mean(test_weights * np.log(self.evaluate(test_data))) + + return np.mean(np.log(self.evaluate(test_data))) + + def cross_val(self, data, weights=None, cv=10, seed=None, grid=None): + """ + Computes the cross validated score over a grid of bandwidths, and returns + the one that maximizes it. It is a robust method against multimodal + distributions, and can be performed on variable bandwidths (e.g.: by + setting "seed" parameter as the output of k nearest neighbors algorithm). + + Habbema, J. D. F., Hermans, J., and Van den Broek, K. (1974) A stepwise + discrimination analysis program using density estimation. + + Leave-one-out MLCV method in R: https://rdrr.io/cran/kedd/man/h.mlcv.html + + Parameters + ---------- + data: array-like + The data points. Data must have shape (obs, dims). + weights: array-like, + One weight per data point. Numbers of observations must match + the data points. + cv: int + The number of cross validation folds. If cv equals obs, it is the + leave-one-out cross validation. + seed : float or array-like + The seed bandwidth. By default is a simplified version of the silverman + rule. + grid : array-like + The grid of factors. The bandwidth grid is constructed as: + bw_grid[i] = bw * grid[i] + By default is np.logspace(-1,1,20) + """ + return cross_val(self, data, weights=weights, cv=cv, seed=seed, grid=grid) + @staticmethod def _process_sequence(sequence_array_like): """ diff --git a/KDEpy/FFTKDE.py b/KDEpy/FFTKDE.py index cef8509..946d8b8 100644 --- a/KDEpy/FFTKDE.py +++ b/KDEpy/FFTKDE.py @@ -68,11 +68,9 @@ class FFTKDE(BaseKDE): """ def __init__(self, kernel="gaussian", bw=1, norm=2): - self.norm = norm - super().__init__(kernel, bw) - assert isinstance(self.norm, numbers.Number) and self.norm > 0 + super().__init__(kernel, bw, norm) - def fit(self, data, weights=None): + def fit(self, data, weights=None, **kwargs): """ Fit the KDE to the data. This validates the data and stores it. Computations are performed upon evaluation on a specific grid. @@ -83,6 +81,8 @@ def fit(self, data, weights=None): The data points. weights: array-like One weight per data point. Must have same shape as the data. + **kwargs: + List of arguments to be passed to bandwidth optimization method. Returns ------- @@ -99,7 +99,7 @@ def fit(self, data, weights=None): """ # Sets self.data - super().fit(data, weights) + super().fit(data, weights, **kwargs) return self def evaluate(self, grid_points=None): diff --git a/KDEpy/NaiveKDE.py b/KDEpy/NaiveKDE.py index 719c3ba..61a148a 100644 --- a/KDEpy/NaiveKDE.py +++ b/KDEpy/NaiveKDE.py @@ -23,7 +23,7 @@ class NaiveKDE(BaseKDE): bw : float, str or array-like Bandwidth or bandwidth selection method. If a float is passed, it is the standard deviation of the kernel. If a string it passed, it - is the bandwidth selection method, see cls._bw_methods.keys() for + is the bandwidth selection method, see cls()._bw_methods.keys() for choices. If an array-like it passed, it is the bandwidth of each point. norm : float @@ -50,10 +50,9 @@ class NaiveKDE(BaseKDE): """ def __init__(self, kernel="gaussian", bw=1, norm=2): - super().__init__(kernel, bw) - self.norm = norm + super().__init__(kernel, bw, norm) - def fit(self, data, weights=None): + def fit(self, data, weights=None, **kwargs): """ Fit the KDE to the data. This validates the data and stores it. Computations are performed when the KDE is evaluated on a grid. @@ -65,6 +64,8 @@ def fit(self, data, weights=None): weights: array-like One weight per data point. Must have shape (obs,). If None is passed, uniform weights are used. + **kwargs: + List of arguments to be passed to bandwidth optimization method. Returns ------- @@ -80,7 +81,7 @@ def fit(self, data, weights=None): >>> x, y = kde() """ # Sets self.data - super().fit(data, weights) + super().fit(data, weights, **kwargs) return self def evaluate(self, grid_points=None): diff --git a/KDEpy/TreeKDE.py b/KDEpy/TreeKDE.py index 1948021..819a016 100644 --- a/KDEpy/TreeKDE.py +++ b/KDEpy/TreeKDE.py @@ -27,7 +27,7 @@ class TreeKDE(BaseKDE): bw : float, str or array-like Bandwidth or bandwidth selection method. If a float is passed, it is the standard deviation of the kernel. If a string it passed, it - is the bandwidth selection method, see cls._bw_methods.keys() for + is the bandwidth selection method, see cls()._bw_methods.keys() for choices. If an array-like it passed, it is the bandwidth of each point. norm : float @@ -60,10 +60,9 @@ class TreeKDE(BaseKDE): """ def __init__(self, kernel="gaussian", bw=1, norm=2.0): - super().__init__(kernel, bw) - self.norm = norm + super().__init__(kernel, bw, norm) - def fit(self, data, weights=None): + def fit(self, data, weights=None, **kwargs): """ Fit the KDE to the data. This validates the data and stores it. Computations are performed upon evaluation on a grid. @@ -75,6 +74,8 @@ def fit(self, data, weights=None): weights: array-like One weight per data point. Numbers of observations must match the data points. + **kwargs: + List of arguments to be passed to bandwidth optimization method. Returns ------- @@ -90,7 +91,7 @@ def fit(self, data, weights=None): >>> x, y = kde() """ # Sets self.data - super().fit(data, weights) + super().fit(data, weights, **kwargs) return self def evaluate(self, grid_points=None, eps=10e-4): diff --git a/KDEpy/binning.py b/KDEpy/binning.py index 7efd437..aa2b6d6 100644 --- a/KDEpy/binning.py +++ b/KDEpy/binning.py @@ -268,10 +268,7 @@ def linbin_Ndim_python(data, grid_points, weights=None): # Compute integer part and fractional part for every x_i # Compute relation to previous grid point, and next grid point int_frac = ( - ( - (int(coordinate), 1 - (coordinate % 1)), - (int(coordinate) + 1, (coordinate % 1)), - ) + ((int(coordinate), 1 - (coordinate % 1)), (int(coordinate) + 1, (coordinate % 1))) for coordinate in observation ) diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py index cfe5fa2..22050ae 100644 --- a/KDEpy/bw_selection.py +++ b/KDEpy/bw_selection.py @@ -3,13 +3,18 @@ """ Functions for bandwidth selection. """ +import numbers import numpy as np import scipy import warnings +from collections.abc import Sequence from KDEpy.binning import linear_binning from KDEpy.utils import autogrid from scipy import fftpack from scipy.optimize import brentq +from scipy.spatial import KDTree +from joblib import Parallel, delayed + # Choose the largest available float on the system try: @@ -292,8 +297,276 @@ def silvermans_rule(data, weights=None): return 1.0 +def _knn_batch(batch_data, k=10): + """ + Computes k nearest neighbors algorithm in one batch. + Returns the array of distances to the kth neighbor + + Parameters + ---------- + batch_data: array-like + The data points. Data must have shape (obs, dims). + k: int, default=10 + Number of neighbors per batch (without counting self). Must be > 0 and + < len(batch_data) + """ + assert len(batch_data.shape) == 2 + assert k > 0 and k < len(batch_data) + + kdtree = KDTree(batch_data) + # Use k+1 to take into account dist=0 between each point and self + dists, idxs = kdtree.query(batch_data, k=k + 1) + return dists[:, -1] + + +def k_nearest_neighbors(data, weights=None, k=10, batch_size=10000, n_jobs=-1): + """ + Computes variable bandwidth based on k nearest neighbors algorithm on data. + For each data point, sets its bandwidth as the euclidean distance to the + kth nearest neighbor within its batch. The computation is performed on + batches to allow scalability to large datasets. The scipy KDTree class is + used for the nearest neighbors queries. + + https://en.wikipedia.org/wiki/Variable_kernel_density_estimation + https://en.wikipedia.org/wiki/K-nearest_neighbour_algorithm + https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html + + Parameters + ---------- + data: array-like + The data points. Data must have shape (obs, dims). + weights: array-like, Sequence or None + Ignored, only for compatibility. + k: int, default=10 + Number of neighbors per batch (without counting self). + batch_size: int, default=10000 + Aproximate size of each batch. Will be slightly modified to cover + dataset as uniformly as possible. + n_jobs : int, default=-1 + Number of jobs to run in parallel. -1 means using all processors. + """ + if not len(data.shape) == 2: + raise ValueError("Data must be of shape (obs, dims).") + obs, dims = data.shape + + if weights is not None: + warnings.warn("K nearest neighbors ignores all weights") + + if obs < 2: + raise ValueError("Data must be of length >= 2.") + + if k < 1 or k >= batch_size: + raise ValueError("k must be between 1 and batch_size-1.") + k = int(k) + + if batch_size < 2: + raise ValueError("Batch size must be >= 2.") + + batches = int(max(1, np.round(obs / batch_size))) + batch_size = int(obs / batches) + print("Using k = {} neighbors per batch (batch size = {})".format(k, batch_size)) + if batches > 1: + print("Equivalent to aprox. {} total neighbors".format(k * batches)) + + batches_dist = Parallel(n_jobs=n_jobs, verbose=10)( + delayed(_knn_batch)(batch_data, k) for batch_data in np.array_split(data, batches) + ) + bw_knn = np.concatenate(batches_dist) + + return bw_knn + + +def _cv_score(model, bw, data, weights=None, cv=10): + """ + Computes cross validated score of KDE model, which gives an indicator of + the quality of the estimation. Test and train data are taken following a + K-folds cross validation scheme. + + Parameters + ---------- + model: NaiveKDE() or TreeKDE() + The model to be used to evaluate scores. Technically it could be any + object that implements methods __init__, fit and evaluate (on arbitrary + grid) with the same syntax than KDEpy. + bw : float or array-like + Bandwidth. If a float is passed, it is the standard deviation of the + kernel. If an array-like it passed, it is the bandwidth of each point. + data: array-like + The data points. Data must have shape (obs, dims). + weights: array-like, Sequence or None + One weight per data point. Numbers of observations must match + the data points. If None is passed, uniform weights are used. + cv: int, default=10 + The number of cross validation folds. If cv equals obs, it is the + leave-one-out cross validation. + """ + if not len(data.shape) == 2: + raise ValueError("Data must be of shape (obs, dims).") + obs, dims = data.shape + + if obs < 2: + raise ValueError("Data must be of length >= 2.") + + if weights is not None: + if not weights.shape == (obs,): + raise ValueError("Number of weights must match data points") + + if cv > len(data): + cv = len(data) + if cv < 2: + raise ValueError("Number of folds must be >= 2.") + + if isinstance(bw, numbers.Number) and bw > 0: + variable_bw = False + elif isinstance(bw, (np.ndarray, Sequence)): + if not bw.shape == (obs,): + raise ValueError("If bandwidth is variable, it must match data points") + folds_bw = np.array_split(bw, cv) + variable_bw = True + else: + raise ValueError("Bandwidth must be > 0, or array-like.") + + # Folds for cross validation + folds_data = np.array_split(data, cv) + if weights is not None: + folds_weights = np.array_split(weights, cv) + else: + folds_weights = cv * [None] + folds_score = [] + + # Compute cross validation for each fold + for fold, [test_data, test_weights] in enumerate(zip(folds_data, folds_weights)): + if variable_bw: + train_bw = np.concatenate(folds_bw[:fold] + folds_bw[fold + 1 :], axis=0) + else: + train_bw = bw + kde = model.__class__(kernel=model.kernel, bw=train_bw, norm=model.norm) + train_data = np.concatenate(folds_data[:fold] + folds_data[fold + 1 :], axis=0) + if weights is not None: + train_weights = np.concatenate(folds_weights[:fold] + folds_weights[fold + 1 :], axis=0) + else: + train_weights = None + kde.fit(train_data, weights=train_weights) + folds_score.append(kde.score(test_data, test_weights)) + + return np.mean(folds_score) + + +def grid_search_cv(model, bw_grid, data, weights=None, cv=10, n_jobs=-1): + """ + Computes the cross validated score over a grid of bandwidths, and returns + the list of cv scores. + + Parameters + ---------- + model: NaiveKDE() or TreeKDE() + The model to be used to evaluate scores. Technically it could be any + object that implements methods __init__, fit and evaluate (on arbitrary + grid) with the same syntax than KDEpy. + bw_grid : array-like + The grid of bandwidths. Each element can be a float or an array of + shape (obs,). + data: array-like + The data points. Data must have shape (obs, dims). + weights: array-like, Sequence or None + One weight per data point. Numbers of observations must match + the data points. If None is passed, uniform weights are used. + cv: int, default=10 + The number of cross validation folds. If cv equals obs, it is the + leave-one-out cross validation. + n_jobs : int, default=-1 + Number of jobs to run in parallel. -1 means using all processors. + """ + if not len(data.shape) == 2: + raise ValueError("Data must be of shape (obs, dims).") + obs, dims = data.shape + + if obs < 2: + raise ValueError("Data must be of length >= 2.") + + assert isinstance(bw_grid, (Sequence, np.ndarray)) + + cv_scores = Parallel(n_jobs=n_jobs, verbose=10)( + delayed(_cv_score)(model, bw, data, weights=weights, cv=cv) for bw in bw_grid + ) + + return cv_scores + + +def cross_val(model, data, weights=None, cv=10, seed=None, grid=None, n_jobs=-1): + """ + Computes the cross validated score over a grid of bandwidths, and returns + the one that maximizes it. It is a robust method against multimodal + distributions, and can be performed on variable bandwidths (e.g.: by + setting "seed" parameter as the output of k nearest neighbors algorithm). + + Habbema, J. D. F., Hermans, J., and Van den Broek, K. (1974) A stepwise + discrimination analysis program using density estimation. + + Leave-one-out MLCV method in R: https://rdrr.io/cran/kedd/man/h.mlcv.html + + Parameters + ---------- + model: NaiveKDE() or TreeKDE() + The model to be used to evaluate scores. Technically it could be any + object that implements methods __init__, fit and evaluate (on arbitrary + grid) with the same syntax than KDEpy. + bw : float or array-like + data: array-like + The data points. Data must have shape (obs, dims). + weights: array-like, Sequence or None + One weight per data point. Numbers of observations must match + the data points. If None is passed, uniform weights are used. + cv: int, default=10 + The number of cross validation folds. If cv equals obs, it is the + leave-one-out cross validation. + seed : float or array-like, default=None + The seed bandwidth. If None is passed, a simplified version of the + silverman rule is used. + grid : array-like, default=None + The grid of factors. The bandwidth grid is constructed as: + bw_grid[i] = seed * grid[i] + If None is passed, the default grid is used: np.logspace(-1,1,20) + n_jobs : int, default=-1 + Number of jobs to run in parallel. -1 means using all processors. + """ + if not len(data.shape) == 2: + raise ValueError("Data must be of shape (obs, dims).") + obs, dims = data.shape + + if obs < 2: + raise ValueError("Data must be of length >= 2.") + + if seed is None: + # This should be replaced by a call to silverman_rule when it is + # implemented for multidimensional data + sigma = np.std(data, axis=0).mean() + seed = sigma * (obs * (2.0 + dims / 4)) ** (-1 / (4 + dims)) + + if grid is None: + grid = np.logspace(-1, 1, 20) + + # Build bandwidths grid so that bw_grid[i] = seed * grid[i] + bw_grid = np.reshape(grid, (-1, *np.ones(np.ndim(seed), dtype=int))) * seed + + cv_scores = grid_search_cv(model, bw_grid, data, weights=weights, cv=cv, n_jobs=n_jobs) + idx_best = np.argmax(cv_scores) + + # Warn if maximum was in beginning or end of grid + if idx_best in (0, len(bw_grid) - 1): + # Could calculate new grid automatically and call recursively + msg = "Could not find maximum in the selected range of bandwidths.\n" + msg += "Move grid and try again." + warnings.warn(msg) + + bw_cv = bw_grid[idx_best] + return bw_cv + + _bw_methods = { "silverman": silvermans_rule, "scott": scotts_rule, "ISJ": improved_sheather_jones, + "KNN": k_nearest_neighbors, + # CV can not be added here since it requires a model } diff --git a/KDEpy/tests/test_NaiveKDE.py b/KDEpy/tests/test_NaiveKDE.py index e94e567..0985d5f 100644 --- a/KDEpy/tests/test_NaiveKDE.py +++ b/KDEpy/tests/test_NaiveKDE.py @@ -67,41 +67,12 @@ def test_additivity_with_weights(data, split_index): @pytest.mark.parametrize( "kernel, bw, n, expected_result", [ - ( - "box", - 0.1, - 5, - np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01]), - ), - ( - "box", - 0.2, - 5, - np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01]), - ), + ("box", 0.1, 5, np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01])), + ("box", 0.2, 5, np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01])), ("box", 0.6, 3, np.array([0.1603751, 0.4811252, 0.4811252])), ("tri", 0.6, 3, np.array([0.1298519, 0.5098009, 0.3865535])), - ( - "epa", - 0.1, - 6, - np.array( - [ - 0.000000e00, - 7.285839e-17, - 2.251871e-01, - 1.119926e00, - 0.000000e00, - 1.118034e00, - ] - ), - ), - ( - "biweight", - 2, - 5, - np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706]), - ), + ("epa", 0.1, 6, np.array([0.000000e00, 7.285839e-17, 2.251871e-01, 1.119926e00, 0.000000e00, 1.118034e00])), + ("biweight", 2, 5, np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706])), ], ) def test_against_R_density(kernel, bw, n, expected_result): @@ -124,19 +95,7 @@ def test_against_R_density(kernel, bw, n, expected_result): "bw, n, expected_result", [ (1, 3, np.array([0.17127129, 0.34595518, 0.30233275])), - ( - 0.1, - 5, - np.array( - [ - 2.56493684e-22, - 4.97598466e-06, - 2.13637668e00, - 4.56012216e-04, - 1.32980760e00, - ] - ), - ), + (0.1, 5, np.array([2.56493684e-22, 4.97598466e-06, 2.13637668e00, 4.56012216e-04, 1.32980760e00])), (0.01, 3, np.array([0.0, 13.29807601, 13.29807601])), ], ) diff --git a/KDEpy/tests/test_TreeKDE.py b/KDEpy/tests/test_TreeKDE.py index 0a31e29..24e9db4 100644 --- a/KDEpy/tests/test_TreeKDE.py +++ b/KDEpy/tests/test_TreeKDE.py @@ -66,41 +66,12 @@ def test_additivity_with_weights(data, split_index): @pytest.mark.parametrize( "kernel, bw, n, expected_result", [ - ( - "box", - 0.1, - 5, - np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01]), - ), - ( - "box", - 0.2, - 5, - np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01]), - ), + ("box", 0.1, 5, np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01])), + ("box", 0.2, 5, np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01])), ("box", 0.6, 3, np.array([0.1603751, 0.4811252, 0.4811252])), ("tri", 0.6, 3, np.array([0.1298519, 0.5098009, 0.3865535])), - ( - "epa", - 0.1, - 6, - np.array( - [ - 0.000000e00, - 7.285839e-17, - 2.251871e-01, - 1.119926e00, - 0.000000e00, - 1.118034e00, - ] - ), - ), - ( - "biweight", - 2, - 5, - np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706]), - ), + ("epa", 0.1, 6, np.array([0.000000e00, 7.285839e-17, 2.251871e-01, 1.119926e00, 0.000000e00, 1.118034e00])), + ("biweight", 2, 5, np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706])), ], ) def test_against_R_density(kernel, bw, n, expected_result): @@ -121,19 +92,7 @@ def test_against_R_density(kernel, bw, n, expected_result): "bw, n, expected_result", [ (1, 3, np.array([0.17127129, 0.34595518, 0.30233275])), - ( - 0.1, - 5, - np.array( - [ - 2.56493684e-22, - 4.97598466e-06, - 2.13637668e00, - 4.56012216e-04, - 1.32980760e00, - ] - ), - ), + (0.1, 5, np.array([2.56493684e-22, 4.97598466e-06, 2.13637668e00, 4.56012216e-04, 1.32980760e00])), (0.01, 3, np.array([0.0, 13.29807601, 13.29807601])), ], ) diff --git a/KDEpy/tests/test_binning.py b/KDEpy/tests/test_binning.py index 7249493..f8da30b 100644 --- a/KDEpy/tests/test_binning.py +++ b/KDEpy/tests/test_binning.py @@ -69,12 +69,7 @@ class TestBinningFunctions: @pytest.mark.parametrize( "data, func", itertools.product( - [ - [1, 2, 3, 4, 5, 6], - [0.04, 0.54, 0.33, 0.85, 0.16], - [-4.12, 0.98, -4.3, -1.85], - [0, 0, 1], - ], + [[1, 2, 3, 4, 5, 6], [0.04, 0.54, 0.33, 0.85, 0.16], [-4.12, 0.98, -4.3, -1.85], [0, 0, 1]], [linear_binning, linbin_Ndim_python], ), ) @@ -110,8 +105,7 @@ def test_binning_simple_examples(self, data, weights, ans): assert np.allclose(y, ans) @pytest.mark.parametrize( - "dims, use_weights, eq_grid", - itertools.product([1, 2, 3, 4], [True, False], [True, False]), + "dims, use_weights, eq_grid", itertools.product([1, 2, 3, 4], [True, False], [True, False]) ) def test_cython_binning(self, dims, use_weights, eq_grid): """ diff --git a/KDEpy/tests/test_bw_selection.py b/KDEpy/tests/test_bw_selection.py index 6cd3512..7abb107 100644 --- a/KDEpy/tests/test_bw_selection.py +++ b/KDEpy/tests/test_bw_selection.py @@ -7,7 +7,8 @@ import pytest import numpy as np -from KDEpy.bw_selection import _bw_methods, improved_sheather_jones +from KDEpy.bw_selection import _bw_methods, improved_sheather_jones, k_nearest_neighbors, cross_val +from KDEpy.TreeKDE import TreeKDE @pytest.fixture(scope="module") @@ -29,8 +30,7 @@ def test_isj_bw_weights_single_zero_weighted_point(data): weights[-1] = 0 np.testing.assert_array_almost_equal( - improved_sheather_jones(data), - improved_sheather_jones(data_with_outlier, weights), + improved_sheather_jones(data), improved_sheather_jones(data_with_outlier, weights) ) @@ -40,11 +40,32 @@ def test_isj_bw_weights_same_as_resampling(data, execution_number): sample_weights = np.random.randint(low=1, high=100, size=len(data)) data_resampled = np.repeat(data, repeats=sample_weights).reshape((-1, 1)) np.testing.assert_array_almost_equal( - improved_sheather_jones(data_resampled), - improved_sheather_jones(data, sample_weights), + improved_sheather_jones(data_resampled), improved_sheather_jones(data, sample_weights) ) +@pytest.mark.parametrize("dims", [1, 2, 3]) +def test_knn_with_2_points(dims): + data = np.zeros((2, dims)) + data[0, 0] = 0 + data[1, 0] = 1 + # Distances must be [1,1] + np.testing.assert_array_almost_equal(k_nearest_neighbors(data, k=1), np.array([1, 1])) + + +@pytest.mark.parametrize("dims", [1, 2, 3]) +def test_cv_with_2_points(dims): + data = np.zeros((2, dims)) + data[0, 0] = 0 + data[1, 0] = 1 + # The optimal bw can be found analytically by solving: + # d log(kernel(1,bw)) / d bw = 0 + # For kernel="gaussian" and norm=2.0 it gives: + bw_optimal = 1 / np.sqrt(dims) + grid = np.logspace(-0.01, 0.01, 5) # Make grid of factors around 1 + np.allclose(cross_val(TreeKDE(), data, seed=bw_optimal, grid=grid), bw_optimal) + + if __name__ == "__main__": # --durations=10 <- May be used to show potentially slow tests pytest.main(args=[__file__, "--doctest-modules", "-v", "--durations=15"]) diff --git a/KDEpy/tests/test_kernel_funcs.py b/KDEpy/tests/test_kernel_funcs.py index aebfca8..40747e8 100644 --- a/KDEpy/tests/test_kernel_funcs.py +++ b/KDEpy/tests/test_kernel_funcs.py @@ -15,16 +15,7 @@ class TestKernelHelperFunctions: @pytest.mark.parametrize( - "dim, expected", - [ - (0, 1.25331), - (1, 1), - (2, 1.25331), - (3, 2), - (4, 3.75994), - (5, 8), - (6, 18.7997), - ], + "dim, expected", [(0, 1.25331), (1, 1), (2, 1.25331), (3, 2), (4, 3.75994), (5, 8), (6, 18.7997)] ) def test_gauss_integral(self, dim, expected): """ @@ -58,8 +49,7 @@ def test_integral_unity(self, fname, function): assert np.isclose(integral, 1) @pytest.mark.parametrize( - "p, kernel_name", - itertools.product([0.5, 1, 1.5, 2, 13, np.inf], ["triweight", "gaussian"]), + "p, kernel_name", itertools.product([0.5, 1, 1.5, 2, 13, np.inf], ["triweight", "gaussian"]) ) def test_integral_unity_2D_many_p_norms(self, p, kernel_name): """ diff --git a/KDEpy/tests/test_sorted_grid.py b/KDEpy/tests/test_sorted_grid.py index 6baffd7..d9bc5c6 100644 --- a/KDEpy/tests/test_sorted_grid.py +++ b/KDEpy/tests/test_sorted_grid.py @@ -71,10 +71,7 @@ def test_on_good_grids(self): grid = np.array([[0], [0], [0], [1], [1], [1], [2], [2], [2]], dtype=float) assert grid_is_sorted(grid) - grid = np.array( - [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [2, 0], [2, 1], [2, 2]], - dtype=float, - ) + grid = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [2, 0], [2, 1], [2, 2]], dtype=float) assert grid_is_sorted(grid) grid = np.array([[1, 1], [2, 2], [3, 3]], dtype=float) @@ -303,10 +300,7 @@ def test_on_bad_grids(self): grid = np.array([[0], [0], [2], [1], [1], [1], [2], [2], [2]], dtype=float) assert not grid_is_sorted(grid) - grid = np.array( - [[0, 0], [0, 1], [0, 2], [1, 0], [1, 2], [1, 1], [2, 0], [2, 1], [2, 2]], - dtype=float, - ) + grid = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 2], [1, 1], [2, 0], [2, 1], [2, 2]], dtype=float) assert not grid_is_sorted(grid) grid = np.array([[1, 1], [3, 3], [2, 2]], dtype=float) diff --git a/setup.py b/setup.py index e77dde7..db8f3d8 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def read(fname): "Programming Language :: Python :: 3.8", ], packages=["KDEpy"], - install_requires=["numpy>=1.14.2", "scipy>=1.0.1", "matplotlib>=2.2.0"], + install_requires=["numpy>=1.14.2", "scipy>=1.0.1", "matplotlib>=2.2.0", "joblib>=1.0.1"], cmdclass=cmdclass, include_dirs=include_dirs, ext_modules=ext_modules,