diff --git a/KDEpy/BaseKDE.py b/KDEpy/BaseKDE.py
index cfe3359..61dfd6d 100644
--- a/KDEpy/BaseKDE.py
+++ b/KDEpy/BaseKDE.py
@@ -8,7 +8,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Sequence
 from KDEpy.kernel_funcs import _kernel_functions
-from KDEpy.bw_selection import _bw_methods
+from KDEpy.bw_selection import _bw_methods, cross_val
 from KDEpy.utils import autogrid
 
 
@@ -31,7 +31,7 @@ class BaseKDE(ABC):
     _bw_methods = _bw_methods
 
     @abstractmethod
-    def __init__(self, kernel: str, bw: float):
+    def __init__(self, kernel: str, bw: float, norm: float):
         """Initialize the kernel density estimator.
 
         The return type must be duplicated in the docstring to comply
@@ -59,6 +59,9 @@ def __init__(self, kernel: str, bw: float):
         else:
             raise ValueError(msg)
 
+        # CV method must be added here since it depends on self
+        _bw_methods["CV"] = self.cross_val
+
         # The `bw` paramter may either be a positive number, a string, or
         # array-like such that each point in the data has a uniue bw
         if isinstance(bw, numbers.Number) and bw > 0:
@@ -74,12 +77,15 @@ def __init__(self, kernel: str, bw: float):
         else:
             raise ValueError("Bandwidth must be > 0, array-like or a string.")
 
+        self.norm = norm
+
         # Test quickly that the method has done what is was supposed to do
         assert callable(self.kernel)
         assert isinstance(self.bw_method, (np.ndarray, Sequence, numbers.Number)) or callable(self.bw_method)
+        assert isinstance(self.norm, numbers.Number) and self.norm > 0
 
     @abstractmethod
-    def fit(self, data, weights=None):
+    def fit(self, data, weights=None, **kwargs):
         """
         Fit the kernel density estimator to the data.
         This method converts the data to shape (obs, dims) and the weights
@@ -93,6 +99,8 @@ def fit(self, data, weights=None):
         weights : array-like, Sequence or None
             May be array-like of shape (obs,), shape (obs, dims), a
             Python Sequence, e.g. a list or tuple, or None.
+        **kwargs:
+            List of arguments to be passed to bandwidth optimization method.
         """
 
         # -------------- Set up the data depending on input -------------------
@@ -126,7 +134,7 @@ def fit(self, data, weights=None):
         if isinstance(self.bw_method, (np.ndarray, Sequence)):
             self.bw = self.bw_method
         elif callable(self.bw_method):
-            self.bw = self.bw_method(self.data, self.weights)
+            self.bw = self.bw_method(self.data, self.weights, **kwargs)
         else:
             self.bw = self.bw_method
 
@@ -175,6 +183,75 @@ def evaluate(self, grid_points=None, bw_to_scalar=True):
             assert bw > 0
         assert len(self.grid_points.shape) == 2
 
+    def score(self, test_data, test_weights=None):
+        """
+        Computes the score of test data on the KDE model. The score is
+        calculated as the mean log-probability of the test samples
+        on the model. The method takes into account test weights, and
+        works with variable bandwidths.
+
+        Parameters
+        ----------
+        test_data : array-like or Sequence
+            May be array-like of shape (obs,), shape (obs, dims) or a
+            Python Sequence, e.g. a list or tuple.
+        test_weights : array-like, Sequence or None
+            May be array-like of shape (obs,), shape (obs, dims), a
+            Python Sequence, e.g. a list or tuple, or None.
+        """
+
+        # -------------- Set up the data depending on input -------------------
+        # In the end, the data should be an ndarray of shape (obs, dims)
+        test_data = self._process_sequence(test_data)
+
+        obs, dims = test_data.shape
+
+        if not obs > 0:
+            raise ValueError("Test data must contain at least one data point.")
+        assert dims > 0
+
+        # -------------- Set up the weights depending on input ----------------
+        if test_weights is not None:
+            test_weights = self._process_sequence(test_weights).ravel()
+            if not obs == len(test_weights):
+                raise ValueError("Number of test data obs must match test weights")
+
+            return np.mean(test_weights * np.log(self.evaluate(test_data)))
+
+        return np.mean(np.log(self.evaluate(test_data)))
+
+    def cross_val(self, data, weights=None, cv=10, seed=None, grid=None):
+        """
+        Computes the cross validated score over a grid of bandwidths, and returns
+        the one that maximizes it. It is a robust method against multimodal
+        distributions, and can be performed on variable bandwidths (e.g.: by
+        setting "seed" parameter as the output of k nearest neighbors algorithm).
+
+        Habbema, J. D. F., Hermans, J., and Van den Broek, K. (1974) A stepwise
+        discrimination analysis program using density estimation.
+
+        Leave-one-out MLCV method in R: https://rdrr.io/cran/kedd/man/h.mlcv.html
+
+        Parameters
+        ----------
+        data: array-like
+            The data points. Data must have shape (obs, dims).
+        weights: array-like,
+            One weight per data point. Numbers of observations must match
+            the data points.
+        cv: int
+            The number of cross validation folds. If cv equals obs, it is the
+            leave-one-out cross validation.
+        seed : float or array-like
+            The seed bandwidth. By default is a simplified version of the silverman
+            rule.
+        grid : array-like
+            The grid of factors. The bandwidth grid is constructed as:
+            bw_grid[i] = bw * grid[i]
+            By default is np.logspace(-1,1,20)
+        """
+        return cross_val(self, data, weights=weights, cv=cv, seed=seed, grid=grid)
+
     @staticmethod
     def _process_sequence(sequence_array_like):
         """
diff --git a/KDEpy/FFTKDE.py b/KDEpy/FFTKDE.py
index cef8509..946d8b8 100644
--- a/KDEpy/FFTKDE.py
+++ b/KDEpy/FFTKDE.py
@@ -68,11 +68,9 @@ class FFTKDE(BaseKDE):
     """
 
     def __init__(self, kernel="gaussian", bw=1, norm=2):
-        self.norm = norm
-        super().__init__(kernel, bw)
-        assert isinstance(self.norm, numbers.Number) and self.norm > 0
+        super().__init__(kernel, bw, norm)
 
-    def fit(self, data, weights=None):
+    def fit(self, data, weights=None, **kwargs):
         """
         Fit the KDE to the data. This validates the data and stores it.
         Computations are performed upon evaluation on a specific grid.
@@ -83,6 +81,8 @@ def fit(self, data, weights=None):
             The data points.
         weights: array-like
             One weight per data point. Must have same shape as the data.
+        **kwargs:
+            List of arguments to be passed to bandwidth optimization method.
 
         Returns
         -------
@@ -99,7 +99,7 @@ def fit(self, data, weights=None):
         """
 
         # Sets self.data
-        super().fit(data, weights)
+        super().fit(data, weights, **kwargs)
         return self
 
     def evaluate(self, grid_points=None):
diff --git a/KDEpy/NaiveKDE.py b/KDEpy/NaiveKDE.py
index 719c3ba..61a148a 100644
--- a/KDEpy/NaiveKDE.py
+++ b/KDEpy/NaiveKDE.py
@@ -23,7 +23,7 @@ class NaiveKDE(BaseKDE):
     bw : float, str or array-like
         Bandwidth or bandwidth selection method. If a float is passed, it
         is the standard deviation of the kernel. If a string it passed, it
-        is the bandwidth selection method, see cls._bw_methods.keys() for
+        is the bandwidth selection method, see cls()._bw_methods.keys() for
         choices. If an array-like it passed, it is the bandwidth of each
         point.
     norm : float
@@ -50,10 +50,9 @@ class NaiveKDE(BaseKDE):
     """
 
     def __init__(self, kernel="gaussian", bw=1, norm=2):
-        super().__init__(kernel, bw)
-        self.norm = norm
+        super().__init__(kernel, bw, norm)
 
-    def fit(self, data, weights=None):
+    def fit(self, data, weights=None, **kwargs):
         """
         Fit the KDE to the data. This validates the data and stores it.
         Computations are performed when the KDE is evaluated on a grid.
@@ -65,6 +64,8 @@ def fit(self, data, weights=None):
         weights: array-like
             One weight per data point. Must have shape (obs,). If None is
             passed, uniform weights are used.
+        **kwargs:
+            List of arguments to be passed to bandwidth optimization method.
 
         Returns
         -------
@@ -80,7 +81,7 @@ def fit(self, data, weights=None):
         >>> x, y = kde()
         """
         # Sets self.data
-        super().fit(data, weights)
+        super().fit(data, weights, **kwargs)
         return self
 
     def evaluate(self, grid_points=None):
diff --git a/KDEpy/TreeKDE.py b/KDEpy/TreeKDE.py
index 1948021..819a016 100644
--- a/KDEpy/TreeKDE.py
+++ b/KDEpy/TreeKDE.py
@@ -27,7 +27,7 @@ class TreeKDE(BaseKDE):
     bw : float, str or array-like
         Bandwidth or bandwidth selection method. If a float is passed, it
         is the standard deviation of the kernel. If a string it passed, it
-        is the bandwidth selection method, see cls._bw_methods.keys() for
+        is the bandwidth selection method, see cls()._bw_methods.keys() for
         choices. If an array-like it passed, it is the bandwidth of each
         point.
     norm : float
@@ -60,10 +60,9 @@ class TreeKDE(BaseKDE):
     """
 
     def __init__(self, kernel="gaussian", bw=1, norm=2.0):
-        super().__init__(kernel, bw)
-        self.norm = norm
+        super().__init__(kernel, bw, norm)
 
-    def fit(self, data, weights=None):
+    def fit(self, data, weights=None, **kwargs):
         """
         Fit the KDE to the data. This validates the data and stores it.
         Computations are performed upon evaluation on a grid.
@@ -75,6 +74,8 @@ def fit(self, data, weights=None):
         weights: array-like
             One weight per data point. Numbers of observations must match
             the data points.
+        **kwargs:
+            List of arguments to be passed to bandwidth optimization method.
 
         Returns
         -------
@@ -90,7 +91,7 @@ def fit(self, data, weights=None):
         >>> x, y = kde()
         """
         # Sets self.data
-        super().fit(data, weights)
+        super().fit(data, weights, **kwargs)
         return self
 
     def evaluate(self, grid_points=None, eps=10e-4):
diff --git a/KDEpy/binning.py b/KDEpy/binning.py
index 7efd437..aa2b6d6 100644
--- a/KDEpy/binning.py
+++ b/KDEpy/binning.py
@@ -268,10 +268,7 @@ def linbin_Ndim_python(data, grid_points, weights=None):
         # Compute integer part and fractional part for every x_i
         # Compute relation to previous grid point, and next grid point
         int_frac = (
-            (
-                (int(coordinate), 1 - (coordinate % 1)),
-                (int(coordinate) + 1, (coordinate % 1)),
-            )
+            ((int(coordinate), 1 - (coordinate % 1)), (int(coordinate) + 1, (coordinate % 1)))
             for coordinate in observation
         )
 
diff --git a/KDEpy/bw_selection.py b/KDEpy/bw_selection.py
index cfe5fa2..22050ae 100644
--- a/KDEpy/bw_selection.py
+++ b/KDEpy/bw_selection.py
@@ -3,13 +3,18 @@
 """
 Functions for bandwidth selection.
 """
+import numbers
 import numpy as np
 import scipy
 import warnings
+from collections.abc import Sequence
 from KDEpy.binning import linear_binning
 from KDEpy.utils import autogrid
 from scipy import fftpack
 from scipy.optimize import brentq
+from scipy.spatial import KDTree
+from joblib import Parallel, delayed
+
 
 # Choose the largest available float on the system
 try:
@@ -292,8 +297,276 @@ def silvermans_rule(data, weights=None):
         return 1.0
 
 
+def _knn_batch(batch_data, k=10):
+    """
+    Computes k nearest neighbors algorithm in one batch.
+    Returns the array of distances to the kth neighbor
+
+    Parameters
+    ----------
+    batch_data: array-like
+        The data points. Data must have shape (obs, dims).
+    k: int, default=10
+        Number of neighbors per batch (without counting self). Must be > 0 and
+        < len(batch_data)
+    """
+    assert len(batch_data.shape) == 2
+    assert k > 0 and k < len(batch_data)
+
+    kdtree = KDTree(batch_data)
+    # Use k+1 to take into account dist=0 between each point and self
+    dists, idxs = kdtree.query(batch_data, k=k + 1)
+    return dists[:, -1]
+
+
+def k_nearest_neighbors(data, weights=None, k=10, batch_size=10000, n_jobs=-1):
+    """
+    Computes variable bandwidth based on k nearest neighbors algorithm on data.
+    For each data point, sets its bandwidth as the euclidean distance to the
+    kth nearest neighbor within its batch. The computation is performed on
+    batches to allow scalability to large datasets. The scipy KDTree class is
+    used for the nearest neighbors queries.
+
+    https://en.wikipedia.org/wiki/Variable_kernel_density_estimation
+    https://en.wikipedia.org/wiki/K-nearest_neighbour_algorithm
+    https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.html
+
+    Parameters
+    ----------
+    data: array-like
+        The data points. Data must have shape (obs, dims).
+    weights: array-like, Sequence or None
+        Ignored, only for compatibility.
+    k: int, default=10
+        Number of neighbors per batch (without counting self).
+    batch_size: int, default=10000
+        Aproximate size of each batch. Will be slightly modified to cover
+        dataset as uniformly as possible.
+    n_jobs : int, default=-1
+        Number of jobs to run in parallel. -1 means using all processors.
+    """
+    if not len(data.shape) == 2:
+        raise ValueError("Data must be of shape (obs, dims).")
+    obs, dims = data.shape
+
+    if weights is not None:
+        warnings.warn("K nearest neighbors ignores all weights")
+
+    if obs < 2:
+        raise ValueError("Data must be of length >= 2.")
+
+    if k < 1 or k >= batch_size:
+        raise ValueError("k must be between 1 and batch_size-1.")
+    k = int(k)
+
+    if batch_size < 2:
+        raise ValueError("Batch size must be >= 2.")
+
+    batches = int(max(1, np.round(obs / batch_size)))
+    batch_size = int(obs / batches)
+    print("Using k = {} neighbors per batch (batch size = {})".format(k, batch_size))
+    if batches > 1:
+        print("Equivalent to aprox. {} total neighbors".format(k * batches))
+
+    batches_dist = Parallel(n_jobs=n_jobs, verbose=10)(
+        delayed(_knn_batch)(batch_data, k) for batch_data in np.array_split(data, batches)
+    )
+    bw_knn = np.concatenate(batches_dist)
+
+    return bw_knn
+
+
+def _cv_score(model, bw, data, weights=None, cv=10):
+    """
+    Computes cross validated score of KDE model, which gives an indicator of
+    the quality of the estimation. Test and train data are taken following a
+    K-folds cross validation scheme.
+
+    Parameters
+    ----------
+    model: NaiveKDE() or TreeKDE()
+        The model to be used to evaluate scores. Technically it could be any
+        object that implements methods __init__, fit and evaluate (on arbitrary
+        grid) with the same syntax than KDEpy.
+    bw : float or array-like
+        Bandwidth. If a float is passed, it is the standard deviation of the
+        kernel. If an array-like it passed, it is the bandwidth of each point.
+    data: array-like
+        The data points. Data must have shape (obs, dims).
+    weights: array-like, Sequence or None
+        One weight per data point. Numbers of observations must match
+        the data points.  If None is passed, uniform weights are used.
+    cv: int, default=10
+        The number of cross validation folds. If cv equals obs, it is the
+        leave-one-out cross validation.
+    """
+    if not len(data.shape) == 2:
+        raise ValueError("Data must be of shape (obs, dims).")
+    obs, dims = data.shape
+
+    if obs < 2:
+        raise ValueError("Data must be of length >= 2.")
+
+    if weights is not None:
+        if not weights.shape == (obs,):
+            raise ValueError("Number of weights must match data points")
+
+    if cv > len(data):
+        cv = len(data)
+    if cv < 2:
+        raise ValueError("Number of folds must be >= 2.")
+
+    if isinstance(bw, numbers.Number) and bw > 0:
+        variable_bw = False
+    elif isinstance(bw, (np.ndarray, Sequence)):
+        if not bw.shape == (obs,):
+            raise ValueError("If bandwidth is variable, it must match data points")
+        folds_bw = np.array_split(bw, cv)
+        variable_bw = True
+    else:
+        raise ValueError("Bandwidth must be > 0, or array-like.")
+
+    # Folds for cross validation
+    folds_data = np.array_split(data, cv)
+    if weights is not None:
+        folds_weights = np.array_split(weights, cv)
+    else:
+        folds_weights = cv * [None]
+    folds_score = []
+
+    # Compute cross validation for each fold
+    for fold, [test_data, test_weights] in enumerate(zip(folds_data, folds_weights)):
+        if variable_bw:
+            train_bw = np.concatenate(folds_bw[:fold] + folds_bw[fold + 1 :], axis=0)
+        else:
+            train_bw = bw
+        kde = model.__class__(kernel=model.kernel, bw=train_bw, norm=model.norm)
+        train_data = np.concatenate(folds_data[:fold] + folds_data[fold + 1 :], axis=0)
+        if weights is not None:
+            train_weights = np.concatenate(folds_weights[:fold] + folds_weights[fold + 1 :], axis=0)
+        else:
+            train_weights = None
+        kde.fit(train_data, weights=train_weights)
+        folds_score.append(kde.score(test_data, test_weights))
+
+    return np.mean(folds_score)
+
+
+def grid_search_cv(model, bw_grid, data, weights=None, cv=10, n_jobs=-1):
+    """
+    Computes the cross validated score over a grid of bandwidths, and returns
+    the list of cv scores.
+
+    Parameters
+    ----------
+    model: NaiveKDE() or TreeKDE()
+        The model to be used to evaluate scores. Technically it could be any
+        object that implements methods __init__, fit and evaluate (on arbitrary
+        grid) with the same syntax than KDEpy.
+    bw_grid : array-like
+        The grid of bandwidths. Each element can be a float or an array of
+        shape (obs,).
+    data: array-like
+        The data points. Data must have shape (obs, dims).
+    weights: array-like, Sequence or None
+        One weight per data point. Numbers of observations must match
+        the data points.  If None is passed, uniform weights are used.
+    cv: int, default=10
+        The number of cross validation folds. If cv equals obs, it is the
+        leave-one-out cross validation.
+    n_jobs : int, default=-1
+        Number of jobs to run in parallel. -1 means using all processors.
+    """
+    if not len(data.shape) == 2:
+        raise ValueError("Data must be of shape (obs, dims).")
+    obs, dims = data.shape
+
+    if obs < 2:
+        raise ValueError("Data must be of length >= 2.")
+
+    assert isinstance(bw_grid, (Sequence, np.ndarray))
+
+    cv_scores = Parallel(n_jobs=n_jobs, verbose=10)(
+        delayed(_cv_score)(model, bw, data, weights=weights, cv=cv) for bw in bw_grid
+    )
+
+    return cv_scores
+
+
+def cross_val(model, data, weights=None, cv=10, seed=None, grid=None, n_jobs=-1):
+    """
+    Computes the cross validated score over a grid of bandwidths, and returns
+    the one that maximizes it. It is a robust method against multimodal
+    distributions, and can be performed on variable bandwidths (e.g.: by
+    setting "seed" parameter as the output of k nearest neighbors algorithm).
+
+    Habbema, J. D. F., Hermans, J., and Van den Broek, K. (1974) A stepwise
+    discrimination analysis program using density estimation.
+
+    Leave-one-out MLCV method in R: https://rdrr.io/cran/kedd/man/h.mlcv.html
+
+    Parameters
+    ----------
+    model: NaiveKDE() or TreeKDE()
+        The model to be used to evaluate scores. Technically it could be any
+        object that implements methods __init__, fit and evaluate (on arbitrary
+        grid) with the same syntax than KDEpy.
+    bw : float or array-like
+    data: array-like
+        The data points. Data must have shape (obs, dims).
+    weights: array-like, Sequence or None
+        One weight per data point. Numbers of observations must match
+        the data points.  If None is passed, uniform weights are used.
+    cv: int, default=10
+        The number of cross validation folds. If cv equals obs, it is the
+        leave-one-out cross validation.
+    seed : float or array-like, default=None
+        The seed bandwidth. If None is passed, a simplified version of the
+        silverman rule is used.
+    grid : array-like, default=None
+        The grid of factors. The bandwidth grid is constructed as:
+        bw_grid[i] = seed * grid[i]
+        If None is passed, the default grid is used: np.logspace(-1,1,20)
+    n_jobs : int, default=-1
+        Number of jobs to run in parallel. -1 means using all processors.
+    """
+    if not len(data.shape) == 2:
+        raise ValueError("Data must be of shape (obs, dims).")
+    obs, dims = data.shape
+
+    if obs < 2:
+        raise ValueError("Data must be of length >= 2.")
+
+    if seed is None:
+        # This should be replaced by a call to silverman_rule when it is
+        # implemented for multidimensional data
+        sigma = np.std(data, axis=0).mean()
+        seed = sigma * (obs * (2.0 + dims / 4)) ** (-1 / (4 + dims))
+
+    if grid is None:
+        grid = np.logspace(-1, 1, 20)
+
+    # Build bandwidths grid so that bw_grid[i] = seed * grid[i]
+    bw_grid = np.reshape(grid, (-1, *np.ones(np.ndim(seed), dtype=int))) * seed
+
+    cv_scores = grid_search_cv(model, bw_grid, data, weights=weights, cv=cv, n_jobs=n_jobs)
+    idx_best = np.argmax(cv_scores)
+
+    # Warn if maximum was in beginning or end of grid
+    if idx_best in (0, len(bw_grid) - 1):
+        # Could calculate new grid automatically and call recursively
+        msg = "Could not find maximum in the selected range of bandwidths.\n"
+        msg += "Move grid and try again."
+        warnings.warn(msg)
+
+    bw_cv = bw_grid[idx_best]
+    return bw_cv
+
+
 _bw_methods = {
     "silverman": silvermans_rule,
     "scott": scotts_rule,
     "ISJ": improved_sheather_jones,
+    "KNN": k_nearest_neighbors,
+    # CV can not be added here since it requires a model
 }
diff --git a/KDEpy/tests/test_NaiveKDE.py b/KDEpy/tests/test_NaiveKDE.py
index e94e567..0985d5f 100644
--- a/KDEpy/tests/test_NaiveKDE.py
+++ b/KDEpy/tests/test_NaiveKDE.py
@@ -67,41 +67,12 @@ def test_additivity_with_weights(data, split_index):
 @pytest.mark.parametrize(
     "kernel, bw, n, expected_result",
     [
-        (
-            "box",
-            0.1,
-            5,
-            np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01]),
-        ),
-        (
-            "box",
-            0.2,
-            5,
-            np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01]),
-        ),
+        ("box", 0.1, 5, np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01])),
+        ("box", 0.2, 5, np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01])),
         ("box", 0.6, 3, np.array([0.1603751, 0.4811252, 0.4811252])),
         ("tri", 0.6, 3, np.array([0.1298519, 0.5098009, 0.3865535])),
-        (
-            "epa",
-            0.1,
-            6,
-            np.array(
-                [
-                    0.000000e00,
-                    7.285839e-17,
-                    2.251871e-01,
-                    1.119926e00,
-                    0.000000e00,
-                    1.118034e00,
-                ]
-            ),
-        ),
-        (
-            "biweight",
-            2,
-            5,
-            np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706]),
-        ),
+        ("epa", 0.1, 6, np.array([0.000000e00, 7.285839e-17, 2.251871e-01, 1.119926e00, 0.000000e00, 1.118034e00])),
+        ("biweight", 2, 5, np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706])),
     ],
 )
 def test_against_R_density(kernel, bw, n, expected_result):
@@ -124,19 +95,7 @@ def test_against_R_density(kernel, bw, n, expected_result):
     "bw, n, expected_result",
     [
         (1, 3, np.array([0.17127129, 0.34595518, 0.30233275])),
-        (
-            0.1,
-            5,
-            np.array(
-                [
-                    2.56493684e-22,
-                    4.97598466e-06,
-                    2.13637668e00,
-                    4.56012216e-04,
-                    1.32980760e00,
-                ]
-            ),
-        ),
+        (0.1, 5, np.array([2.56493684e-22, 4.97598466e-06, 2.13637668e00, 4.56012216e-04, 1.32980760e00])),
         (0.01, 3, np.array([0.0, 13.29807601, 13.29807601])),
     ],
 )
diff --git a/KDEpy/tests/test_TreeKDE.py b/KDEpy/tests/test_TreeKDE.py
index 0a31e29..24e9db4 100644
--- a/KDEpy/tests/test_TreeKDE.py
+++ b/KDEpy/tests/test_TreeKDE.py
@@ -66,41 +66,12 @@ def test_additivity_with_weights(data, split_index):
 @pytest.mark.parametrize(
     "kernel, bw, n, expected_result",
     [
-        (
-            "box",
-            0.1,
-            5,
-            np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01]),
-        ),
-        (
-            "box",
-            0.2,
-            5,
-            np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01]),
-        ),
+        ("box", 0.1, 5, np.array([2.101278e-19, 3.469447e-18, 1.924501e00, 0.000000e00, 9.622504e-01])),
+        ("box", 0.2, 5, np.array([3.854941e-18, 2.929755e-17, 9.622504e-01, 0.000000e00, 4.811252e-01])),
         ("box", 0.6, 3, np.array([0.1603751, 0.4811252, 0.4811252])),
         ("tri", 0.6, 3, np.array([0.1298519, 0.5098009, 0.3865535])),
-        (
-            "epa",
-            0.1,
-            6,
-            np.array(
-                [
-                    0.000000e00,
-                    7.285839e-17,
-                    2.251871e-01,
-                    1.119926e00,
-                    0.000000e00,
-                    1.118034e00,
-                ]
-            ),
-        ),
-        (
-            "biweight",
-            2,
-            5,
-            np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706]),
-        ),
+        ("epa", 0.1, 6, np.array([0.000000e00, 7.285839e-17, 2.251871e-01, 1.119926e00, 0.000000e00, 1.118034e00])),
+        ("biweight", 2, 5, np.array([0.1524078, 0.1655184, 0.1729870, 0.1743973, 0.1696706])),
     ],
 )
 def test_against_R_density(kernel, bw, n, expected_result):
@@ -121,19 +92,7 @@ def test_against_R_density(kernel, bw, n, expected_result):
     "bw, n, expected_result",
     [
         (1, 3, np.array([0.17127129, 0.34595518, 0.30233275])),
-        (
-            0.1,
-            5,
-            np.array(
-                [
-                    2.56493684e-22,
-                    4.97598466e-06,
-                    2.13637668e00,
-                    4.56012216e-04,
-                    1.32980760e00,
-                ]
-            ),
-        ),
+        (0.1, 5, np.array([2.56493684e-22, 4.97598466e-06, 2.13637668e00, 4.56012216e-04, 1.32980760e00])),
         (0.01, 3, np.array([0.0, 13.29807601, 13.29807601])),
     ],
 )
diff --git a/KDEpy/tests/test_binning.py b/KDEpy/tests/test_binning.py
index 7249493..f8da30b 100644
--- a/KDEpy/tests/test_binning.py
+++ b/KDEpy/tests/test_binning.py
@@ -69,12 +69,7 @@ class TestBinningFunctions:
     @pytest.mark.parametrize(
         "data, func",
         itertools.product(
-            [
-                [1, 2, 3, 4, 5, 6],
-                [0.04, 0.54, 0.33, 0.85, 0.16],
-                [-4.12, 0.98, -4.3, -1.85],
-                [0, 0, 1],
-            ],
+            [[1, 2, 3, 4, 5, 6], [0.04, 0.54, 0.33, 0.85, 0.16], [-4.12, 0.98, -4.3, -1.85], [0, 0, 1]],
             [linear_binning, linbin_Ndim_python],
         ),
     )
@@ -110,8 +105,7 @@ def test_binning_simple_examples(self, data, weights, ans):
             assert np.allclose(y, ans)
 
     @pytest.mark.parametrize(
-        "dims, use_weights, eq_grid",
-        itertools.product([1, 2, 3, 4], [True, False], [True, False]),
+        "dims, use_weights, eq_grid", itertools.product([1, 2, 3, 4], [True, False], [True, False])
     )
     def test_cython_binning(self, dims, use_weights, eq_grid):
         """
diff --git a/KDEpy/tests/test_bw_selection.py b/KDEpy/tests/test_bw_selection.py
index 6cd3512..7abb107 100644
--- a/KDEpy/tests/test_bw_selection.py
+++ b/KDEpy/tests/test_bw_selection.py
@@ -7,7 +7,8 @@
 import pytest
 import numpy as np
 
-from KDEpy.bw_selection import _bw_methods, improved_sheather_jones
+from KDEpy.bw_selection import _bw_methods, improved_sheather_jones, k_nearest_neighbors, cross_val
+from KDEpy.TreeKDE import TreeKDE
 
 
 @pytest.fixture(scope="module")
@@ -29,8 +30,7 @@ def test_isj_bw_weights_single_zero_weighted_point(data):
     weights[-1] = 0
 
     np.testing.assert_array_almost_equal(
-        improved_sheather_jones(data),
-        improved_sheather_jones(data_with_outlier, weights),
+        improved_sheather_jones(data), improved_sheather_jones(data_with_outlier, weights)
     )
 
 
@@ -40,11 +40,32 @@ def test_isj_bw_weights_same_as_resampling(data, execution_number):
     sample_weights = np.random.randint(low=1, high=100, size=len(data))
     data_resampled = np.repeat(data, repeats=sample_weights).reshape((-1, 1))
     np.testing.assert_array_almost_equal(
-        improved_sheather_jones(data_resampled),
-        improved_sheather_jones(data, sample_weights),
+        improved_sheather_jones(data_resampled), improved_sheather_jones(data, sample_weights)
     )
 
 
+@pytest.mark.parametrize("dims", [1, 2, 3])
+def test_knn_with_2_points(dims):
+    data = np.zeros((2, dims))
+    data[0, 0] = 0
+    data[1, 0] = 1
+    # Distances must be [1,1]
+    np.testing.assert_array_almost_equal(k_nearest_neighbors(data, k=1), np.array([1, 1]))
+
+
+@pytest.mark.parametrize("dims", [1, 2, 3])
+def test_cv_with_2_points(dims):
+    data = np.zeros((2, dims))
+    data[0, 0] = 0
+    data[1, 0] = 1
+    # The optimal bw can be found analytically by solving:
+    # d log(kernel(1,bw)) / d bw = 0
+    # For kernel="gaussian" and norm=2.0 it gives:
+    bw_optimal = 1 / np.sqrt(dims)
+    grid = np.logspace(-0.01, 0.01, 5)  # Make grid of factors around 1
+    np.allclose(cross_val(TreeKDE(), data, seed=bw_optimal, grid=grid), bw_optimal)
+
+
 if __name__ == "__main__":
     # --durations=10  <- May be used to show potentially slow tests
     pytest.main(args=[__file__, "--doctest-modules", "-v", "--durations=15"])
diff --git a/KDEpy/tests/test_kernel_funcs.py b/KDEpy/tests/test_kernel_funcs.py
index aebfca8..40747e8 100644
--- a/KDEpy/tests/test_kernel_funcs.py
+++ b/KDEpy/tests/test_kernel_funcs.py
@@ -15,16 +15,7 @@
 
 class TestKernelHelperFunctions:
     @pytest.mark.parametrize(
-        "dim, expected",
-        [
-            (0, 1.25331),
-            (1, 1),
-            (2, 1.25331),
-            (3, 2),
-            (4, 3.75994),
-            (5, 8),
-            (6, 18.7997),
-        ],
+        "dim, expected", [(0, 1.25331), (1, 1), (2, 1.25331), (3, 2), (4, 3.75994), (5, 8), (6, 18.7997)]
     )
     def test_gauss_integral(self, dim, expected):
         """
@@ -58,8 +49,7 @@ def test_integral_unity(self, fname, function):
         assert np.isclose(integral, 1)
 
     @pytest.mark.parametrize(
-        "p, kernel_name",
-        itertools.product([0.5, 1, 1.5, 2, 13, np.inf], ["triweight", "gaussian"]),
+        "p, kernel_name", itertools.product([0.5, 1, 1.5, 2, 13, np.inf], ["triweight", "gaussian"])
     )
     def test_integral_unity_2D_many_p_norms(self, p, kernel_name):
         """
diff --git a/KDEpy/tests/test_sorted_grid.py b/KDEpy/tests/test_sorted_grid.py
index 6baffd7..d9bc5c6 100644
--- a/KDEpy/tests/test_sorted_grid.py
+++ b/KDEpy/tests/test_sorted_grid.py
@@ -71,10 +71,7 @@ def test_on_good_grids(self):
         grid = np.array([[0], [0], [0], [1], [1], [1], [2], [2], [2]], dtype=float)
         assert grid_is_sorted(grid)
 
-        grid = np.array(
-            [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [2, 0], [2, 1], [2, 2]],
-            dtype=float,
-        )
+        grid = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [2, 0], [2, 1], [2, 2]], dtype=float)
         assert grid_is_sorted(grid)
 
         grid = np.array([[1, 1], [2, 2], [3, 3]], dtype=float)
@@ -303,10 +300,7 @@ def test_on_bad_grids(self):
         grid = np.array([[0], [0], [2], [1], [1], [1], [2], [2], [2]], dtype=float)
         assert not grid_is_sorted(grid)
 
-        grid = np.array(
-            [[0, 0], [0, 1], [0, 2], [1, 0], [1, 2], [1, 1], [2, 0], [2, 1], [2, 2]],
-            dtype=float,
-        )
+        grid = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 2], [1, 1], [2, 0], [2, 1], [2, 2]], dtype=float)
         assert not grid_is_sorted(grid)
 
         grid = np.array([[1, 1], [3, 3], [2, 2]], dtype=float)
diff --git a/setup.py b/setup.py
index e77dde7..db8f3d8 100644
--- a/setup.py
+++ b/setup.py
@@ -63,7 +63,7 @@ def read(fname):
         "Programming Language :: Python :: 3.8",
     ],
     packages=["KDEpy"],
-    install_requires=["numpy>=1.14.2", "scipy>=1.0.1", "matplotlib>=2.2.0"],
+    install_requires=["numpy>=1.14.2", "scipy>=1.0.1", "matplotlib>=2.2.0", "joblib>=1.0.1"],
     cmdclass=cmdclass,
     include_dirs=include_dirs,
     ext_modules=ext_modules,