diff --git a/README.md b/README.md index 67135ed..1b818f7 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,7 @@ Remove clustpy via pip to avoid ambiguities during development, e.g., when chang | ENRC | [Deep Embedded Non-Redundant Clustering](https://ojs.aaai.org/index.php/AAAI/article/view/5961) | AAAI 2020 | [Link](https://gitlab.cs.univie.ac.at/lukas/enrcpublic) (Python + PyTorch) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.enrc.ENRC) | | IDEC | [Improved Deep Embedded Clustering with Local Structure Preservation](https://www.ijcai.org/proceedings/2017/243) | IJCAI 2017 | [Link](https://github.com/XifengGuo/IDEC) (Python + Keras) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.dec.IDEC) | | N2D | [N2d:(not too) deep clustering via clustering the local manifold of an autoencoded embedding](https://ieeexplore.ieee.org/document/9413131) | ICPR 2021 | [Link](https://github.com/XifengGuo/IDEC) (Python + Keras) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.ddc_n2d.N2D) | +| SHADE | [SHADE: Deep Density-based Clustering](https://ieeexplore.ieee.org/abstract/document/10884477) | ICDM 2024 | [Link](https://github.com/pasiweber/SHADE) (Python + PyTorch) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.shade.SHADE) | | VaDE | [Variational Deep Embedding: An Unsupervised and Generative Approach to Clustering](https://www.ijcai.org/proceedings/2017/0273) | IJCAI 2017 | [Link](https://github.com/slim1017/VaDE) (Python + Keras) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.vade.VaDE) | #### Neural Networks diff --git a/clustpy/deep/__init__.py b/clustpy/deep/__init__.py index a5b72d0..10561d8 100644 --- a/clustpy/deep/__init__.py +++ b/clustpy/deep/__init__.py @@ -9,6 +9,7 @@ from .aec import AEC from .deepect import DeepECT from .den import DEN +from .shade import SHADE from ._data_utils import get_dataloader, get_default_augmented_dataloaders from ._train_utils import get_trained_network, get_neural_network from ._utils import encode_batchwise, decode_batchwise, encode_decode_batchwise, predict_batchwise, detect_device, \ @@ -28,6 +29,7 @@ 'DipEncoder', 'DeepECT', 'DEN', + 'SHADE', 'get_dataloader', 'get_default_augmented_dataloaders', 'get_neural_network' diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py index 68ba772..613ed03 100644 --- a/clustpy/deep/_abstract_deep_clustering_algo.py +++ b/clustpy/deep/_abstract_deep_clustering_algo.py @@ -101,11 +101,6 @@ def transform(self, X: np.ndarray) -> np.ndarray: """ check_is_fitted(self, ["labels_", "neural_network_trained_", "n_features_in_"]) X, _, _ = check_parameters(X, allow_size_1=True, allow_nd=self.neural_network_trained_.allow_nd_input, estimator_obj=self) - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but {self.__class__.__name__} " - f"is expecting {self.n_features_in_} features as input." - ) X_embed = self.neural_network_trained_.transform(X) return X_embed diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index 4618a5c..110e547 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -167,7 +167,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n neural_network = get_neural_network(input_dim, embedding_size, neural_network, neural_network_class, neural_network_params, neural_network_weights, device, random_state) # Start training - if not neural_network.fitted: + if not neural_network.fitted and n_epochs > 0: print("Neural network is not fitted yet, will be pretrained.") # Pretrain neural network optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params diff --git a/clustpy/deep/shade.py b/clustpy/deep/shade.py new file mode 100644 index 0000000..1b595ea --- /dev/null +++ b/clustpy/deep/shade.py @@ -0,0 +1,375 @@ +""" +@authors: +Pascal Weber +""" + +from __future__ import annotations +import numpy as np +import torch +from clustpy.utils import DCTree +from clustpy.hierarchical import DCTree_Clusterer +from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo +from clustpy.deep._data_utils import get_train_and_test_dataloader +from clustpy.deep._train_utils import get_trained_network +from clustpy.deep._utils import detect_device, squared_euclidean_distance, encode_batchwise, run_initial_clustering, mean_squared_error +from clustpy.utils.checks import check_parameters +import tqdm +from typing import Callable, Optional, Tuple +from sklearn.utils.validation import check_is_fitted +from sklearn.base import ClusterMixin + + +class SHADE(_AbstractDeepClusteringAlgo): + """ + The Structure-preserving High-dimensional Analysis with Density-based Exploration (SHADE) algorithm. + A neural network (autoencoder AE) will be trained with the reconstruction loss and the d_dc loss function. + Afterward, KMeans or HDBSCAN identifies the initial clusters. + + Parameters + ---------- + clustering_class : ClusterMixin + clustering class to obtain the cluster labels after getting the embedding (default: DCTree_Clusterer) + clustering_params : dict + parameters for the clustering class. If None, it will be set to {"min_points": min_points} (default: None) + min_points : int + the minimum number of points (default: 5) + use_complete_dc_tree : bool + Defines whether the complete DC Tree should be used instead of a batch-wise version (default: True) + use_matrix_dc_distance: bool + Defines whether the matrix DC distance should be stored - can cause memory issues (default: True) + use_less_memory: bool + Use less memory when constructing the DCTree. + This will, however, increase the runtime (default: False) + batch_size : int + Size of the data batches. (default: 500) + pretrain_optimizer_params : dict + parameters of the optimizer for the pretraining of the neural network, includes the learning rate. + If None, it will be set to {"lr": 1e-3}. (default: None) + clustering_optimizer_params : dict + parameters of the optimizer for the actual clustering procedure, includes the learning rate. If None, it will be set to {"lr": 1e-4} (default: None) + pretrain_epochs : int + number of epochs for the pretraining of the neural network. (default: 0) + clustering_epochs : int + number of epochs for the actual clustering procedure (default: 100) + optimizer_class : torch.optim.Optimizer + the optimizer class (default: torch.optim.Adam) + ssl_loss_fn : Callable | torch.nn.modules.loss._Loss + self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders (default: mean_squared_error) + neural_network : torch.nn.Module | tuple + the input neural network. If None, a new FeedforwardAutoencoder will be created. + Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) + neural_network_weights : str + Path to a file containing the state_dict of the neural_network (default: None) + embedding_size : int + size of the embedding within the neural network (default: 10) + density_loss_weight : float + weight of the density loss compared to the reconstruction loss (default: 1.0) + ssl_loss_weight : float + weight of the self-supervised learning (ssl) loss (default: 1.0) + custom_dataloaders : tuple + tuple consisting of a trainloader (random order) at the first and a test loader (non-random order) at the second position. + Can also be a tuple of strings, where the first entry is the path to a saved trainloader and the second entry the path to a saved testloader. + In this case the dataloaders will be loaded by torch.load(PATH). + If None, the default dataloaders will be used (default: None) + device : torch.device + The device on which to perform the computations. + If device is None then it will be automatically chosen: if a gpu is available the gpu with the highest amount of free memory will be chosen (default: None) + random_state : np.random.RandomState | int + use a fixed random state to get a repeatable solution. Can also be of type int (default: None) + + Attributes + ---------- + n_clusters_ : int + The final number of clusters + labels_ : np.ndarray + The final labels + cluster_centers_ : np.ndarray + The final cluster centers defined as the mean of assigned samples within the AE embedding + dc_tree_ : DCTree + The dc tree + neural_network_trained_ : torch.nn.Module + The final neural network + n_features_in_ : int + the number of features used for the fitting + + Examples + -------- + >>> from clustpy.data import create_subspace_data + >>> data, labels = create_subspace_data(1500, subspace_features=(3, 50), random_state=1) + >>> shade = SHADE() + >>> shade.fit(data) + + References + ---------- + SHADE: Deep Density-based Clustering + Anna Beer; Pascal Weber; Lukas Miklautz; Collin Leiber; Walid Durani; Christian Böhm + IEEE International Conference on Data Mining (ICDM), Abu Dhabi, United Arab Emirates, 2024, pp. 675-680, doi: 10.1109/ICDM59182.2024. + """ + + def __init__( + self, + clustering_class : Optional[ClusterMixin] = DCTree_Clusterer, + clustering_params : dict = None, + min_points : int = 5, + use_complete_dc_tree: bool = True, + use_matrix_dc_distance: bool = True, + use_less_memory: bool = False, + batch_size: int = 500, + pretrain_optimizer_params: dict = None, + clustering_optimizer_params : dict = None, + pretrain_epochs : int = 0, + clustering_epochs : int = 100, + optimizer_class: torch.optim.Optimizer = torch.optim.Adam, + ssl_loss_fn : Callable | torch.nn.modules.loss._Loss = mean_squared_error, + neural_network : torch.nn.Module | tuple = None, + neural_network_weights : str = None, + embedding_size : int = 10, + density_loss_weight : float = 1.0, + ssl_loss_weight : float = 1.0, + custom_dataloaders : tuple = None, + device : torch.device = None, + random_state : np.random.RandomState | int = None, + ): + super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state) + self.clustering_class = clustering_class + self.clustering_params = clustering_params + self.min_points = min_points + self.use_complete_dc_tree = use_complete_dc_tree + self.use_matrix_dc_distance = use_matrix_dc_distance + self.use_less_memory = use_less_memory + self.pretrain_optimizer_params = pretrain_optimizer_params + self.clustering_optimizer_params = clustering_optimizer_params + self.pretrain_epochs = pretrain_epochs + self.clustering_epochs = clustering_epochs + self.optimizer_class = optimizer_class + self.ssl_loss_fn = ssl_loss_fn + self.density_loss_weight = density_loss_weight + self.ssl_loss_weight = ssl_loss_weight + self.custom_dataloaders = custom_dataloaders + + def fit(self, X: np.ndarray, y: np.ndarray=None) -> SHADE: + """ + Cluster the input dataset with the SHADE algorithm. + The resulting cluster labels will be stored in the `labels_` attribute. + + Parameters + ---------- + X : np.ndarray + The given data set. + y : np.ndarray + The labels. (can be ignored) + + Returns + ------- + self : SHADE + This instance of the SHADE algorithm. + """ + X, _, random_state, pretrain_optimizer_params, _, _ = self._check_parameters(X, y=y) + clustering_optimizer_params = {"lr": 1e-3} if self.clustering_optimizer_params is None else self.clustering_optimizer_params + clustering_params = {"min_points": self.min_points, "use_less_memory": self.use_less_memory} if self.clustering_params is None else self.clustering_params + device = detect_device(self.device) + trainloader, testloader, batch_size = get_train_and_test_dataloader(X, self.batch_size, self.custom_dataloaders) + assert batch_size >= self.min_points, f"Batch_size ({batch_size}) cannot be smaller than min_points ({self.min_points})" + # Create dc_tree + if self.use_complete_dc_tree: + self.dc_tree_ = DCTree(X, min_points=self.min_points, use_less_memory=self.use_less_memory) + else: + self.dc_tree_ = None + # Create and pretrain Autoencoder + neural_network_params = {"layers": [X.shape[1], 512, 256, 128, self.embedding_size]} + neural_network = get_trained_network(trainloader, n_epochs=self.pretrain_epochs, + optimizer_params=pretrain_optimizer_params, optimizer_class=self.optimizer_class, + device=device, ssl_loss_fn=self.ssl_loss_fn, embedding_size=self.embedding_size, + neural_network=self.neural_network, + neural_network_weights=self.neural_network_weights, neural_network_params=neural_network_params, + random_state=random_state) + # Setup SHADE Module + shade_module = _SHADE_Module( + n_epochs=self.clustering_epochs, + neural_network=neural_network, + min_points=self.min_points, + dc_tree=self.dc_tree_, + use_matrix_dc_distance=self.use_matrix_dc_distance, + device=device, + ssl_loss_fn=self.ssl_loss_fn, + density_loss_weight=self.density_loss_weight, + ssl_loss_weight=self.ssl_loss_weight + ) + optimizer = self.optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) + shade_module.fit(X, trainloader, optimizer) + # Get labels + embedded_data = encode_batchwise(testloader, neural_network) + n_clusters, labels, cluster_centers, _ = run_initial_clustering( + X=embedded_data, + n_clusters=None, + clustering_class=self.clustering_class, + clustering_params=clustering_params, + random_state=random_state, + ) + self.n_clusters_ = n_clusters + self.labels_ = labels + self.cluster_centers_ = cluster_centers + self.neural_network_trained_ = neural_network + self.set_n_featrues_in(X) + return self + + def predict(self, X: np.ndarray) -> np.ndarray: + """ + Predicts the labels of the input data. + Note that this is just a very imprecise estimation as we are not using the DC Tree to predict the labels. + The prediction is calculated by checking the distance to the clostest mean of samples in a cluster within the embedding of the AE. + + Parameters + ---------- + X : np.ndarray + input data + + Returns + ------- + predicted_labels : np.ndarray + The predicted labels + """ + check_is_fitted(self, ["labels_", "neural_network_trained_", "n_features_in_"]) + X, _, _ = check_parameters(X, allow_size_1=True, allow_nd=self.neural_network_trained_.allow_nd_input, estimator_obj=self) + print("WARNING: predict does not use the embedding of the manifold and is, therefore, just a very rough estimate") + predicted_labels = super().predict(X) + return predicted_labels + + +class _SHADE_Module(torch.nn.Module): + """ + The _SHADE_Module. Contains most of the algorithm specific procedures like the loss function. + + Parameters + ---------- + n_epochs : int + number of epochs for the clustering procedure + neural_network : torch.nn.Module + the neural network + min_points : int + the minimum number of points + dc_tree : Optional[DCTree] + the DCTree + use_matrix_dc_distance: bool + Defines whether the matrix DC distance should be stored - can cause memory issues + device : torch.device + device to be trained on + ssl_loss_fn : Callable | torch.nn.modules.loss._Loss + self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders + density_loss_weight : float + weight of the clustering loss + ssl_loss_weight : float + weight of the self-supervised learning (ssl) loss + """ + + def __init__( + self, + n_epochs : int, + neural_network: torch.nn.Module, + min_points: int, + dc_tree: Optional[DCTree], + use_matrix_dc_distance: bool, + device: torch.device, + ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, + density_loss_weight: float, + ssl_loss_weight: float + ): + super().__init__() + self.n_epochs = n_epochs + self.neural_network = neural_network + self.min_points = min_points + self.dc_tree = dc_tree + self.use_matrix_dc_distance = use_matrix_dc_distance + self.device = device + self.ssl_loss_fn = ssl_loss_fn + self.density_loss_weight = density_loss_weight + self.ssl_loss_weight = ssl_loss_weight + + def fit( + self, + X: np.ndarray, + trainloader: torch.utils.data.DataLoader, + optimizer: torch.optim.Optimizer + ) -> _SHADE_Module: + """ + Trains the _SHADE_Module in place. + + Parameters + ---------- + X : np.ndarray + The data + trainloader : torch.utils.data.DataLoader + dataloader to be used for training + optimizer : torch.optim.Optimizer + the optimizer for training + + Returns + ------- + self : _SHADE_Module + This instance of the _SHADE_Module. + """ + if self.dc_tree is not None and self.use_matrix_dc_distance: + matrix_dc_distance = self.dc_tree.dc_distances() + matrix_dc_distance_torch = torch.tensor(matrix_dc_distance, device=self.device) + else: + matrix_dc_distance_torch = None + self.train() + tbar = tqdm.trange(self.n_epochs, desc="SHADE training") + for _ in tbar: + # Update Network + for batch in trainloader: + if len(batch[0]) <= self.min_points: + continue + loss = self._loss(X, batch, matrix_dc_distance_torch) + # Backward pass - update weights + optimizer.zero_grad() + loss.backward() + optimizer.step() + postfix_str = {"Loss": loss} + tbar.set_postfix(postfix_str) + self.neural_network.eval() + self.eval() + return self + + def _loss( + self, + X: np.ndarray, + batch: list, + matrix_dc_distance_torch: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the autoencoder reconstruction + d_dc loss. + + Parameters + ---------- + X : np.ndarray + The data + batch : list + The minibatch. + matrix_dc_distance_torch : torch.Tensor + A matrix containing pairwise dc distances + + Returns + ------- + loss : torch.Tensor + The final SHADE loss. + """ + # Reconstrucion + ssl_loss, embedded, _ = self.neural_network.loss(batch, self.ssl_loss_fn, self.device) + # Density loss + if self.dc_tree is None: + # Batch-wise DCTree + dc_distances = DCTree(X[batch[0]], min_points=self.min_points).dc_distances() + batch_dc_dists = torch.tensor(dc_distances, device=self.device) + else: + # DCTree of all data points X + if self.use_matrix_dc_distance: + idxs = batch[0].to(self.device) + batch_dc_dists = matrix_dc_distance_torch[idxs[:, None], idxs[None, :]] + else: + dc_distances = self.dc_tree.dc_distances(batch[0], batch[0]) + batch_dc_dists = torch.tensor(dc_distances, device=self.device) + batch_eucl_dists = squared_euclidean_distance(embedded, embedded) + loss_dens = (batch_eucl_dists - batch_dc_dists).pow(2).mean() + loss = self.ssl_loss_weight * ssl_loss + self.density_loss_weight * loss_dens + return loss diff --git a/clustpy/deep/tests/test_shade.py b/clustpy/deep/tests/test_shade.py new file mode 100644 index 0000000..487ff34 --- /dev/null +++ b/clustpy/deep/tests/test_shade.py @@ -0,0 +1,25 @@ +from clustpy.deep import SHADE +import numpy as np +from clustpy.utils.checks import check_clustpy_estimator +from clustpy.deep.tests._helpers_for_tests import _test_dc_algorithm_simple, _test_dc_algorithm_with_augmentation + + +def test_shade_estimator(): + # Ignore check_methods_subset_invariance due to numerical issues + check_clustpy_estimator(SHADE(min_points=2, pretrain_epochs=0, clustering_epochs=3), + ("check_complex_data", "check_methods_subset_invariance")) + + +def test_shade(): + shade = SHADE() + _test_dc_algorithm_simple(shade, check_predict=False) + + +def test_shade_wo_matrix_distance(): + shade = SHADE(use_matrix_dc_distance=False) + _test_dc_algorithm_simple(shade, check_predict=False) + + +def test_shade_wo_complete_dc_tree(): + shade = SHADE(use_complete_dc_tree=False) + _test_dc_algorithm_simple(shade, check_predict=False) diff --git a/clustpy/hierarchical/__init__.py b/clustpy/hierarchical/__init__.py index e13ac0a..51ee526 100644 --- a/clustpy/hierarchical/__init__.py +++ b/clustpy/hierarchical/__init__.py @@ -1,3 +1,5 @@ from .diana import Diana +from .dctree_clusterer import DCTree_Clusterer -__all__ = ["Diana"] +__all__ = ["Diana", + "DCTree_Clusterer"] diff --git a/clustpy/hierarchical/dctree_clusterer.py b/clustpy/hierarchical/dctree_clusterer.py new file mode 100644 index 0000000..46f5ad2 --- /dev/null +++ b/clustpy/hierarchical/dctree_clusterer.py @@ -0,0 +1,153 @@ +""" +@authors: +Pascal Weber +""" + +# - Author: Pascal Weber +# - Source: https://github.com/pasiweber/SHADE + +from __future__ import annotations +import numpy as np +import sys +from clustpy.utils.dctree import DCTree, _DCNode +from typing import Optional +from sklearn.base import ClusterMixin, BaseEstimator +from clustpy.utils.checks import check_parameters + + +sys.setrecursionlimit(1000000000) + + +class DCTree_Clusterer(ClusterMixin, BaseEstimator): + """ + The DCTree clustering algorithm. + Identifies stable nodes within the DCTree and labels the data accordingly. + + Parameters + ---------- + min_points : int + the minimum number of points (default: 5) + use_less_memory: bool + Use less memory when constructing the DCTree. + This will, however, increase the runtime (default: False) + + Attributes + ---------- + n_clusters_ : int + The final number of clusters + labels_ : np.ndarray + The final labels + dc_tree_ : BinaryClusterTree + The resulting cluster tree + n_features_in_ : int + the number of features used for the fitting + + References + ---------- + SHADE: Deep Density-based Clustering + Anna Beer; Pascal Weber; Lukas Miklautz; Collin Leiber; Walid Durani; Christian Böhm + IEEE International Conference on Data Mining (ICDM), Abu Dhabi, United Arab Emirates, 2024, pp. 675-680, doi: 10.1109/ICDM59182.2024. + """ + + def __init__(self, min_points: int = 5, use_less_memory: bool = False): + self.min_points = min_points + self.use_less_memory = use_less_memory + + def fit(self, X: np.ndarray, y: np.ndarray=None) -> 'DCTree_Clusterer': + """ + Initiate the actual clustering process on the input data set. + The resulting cluster labels will be stored in the labels_ attribute. + + Parameters + ---------- + X : np.ndarray + the given data set + y : np.ndarray + the labels (can be ignored) + + Returns + ------- + self : DCTree_Clusterer + this instance of the DCTree_Clusterer algorithm + """ + X, _, _ = check_parameters(X=X, y=y) + self.dc_tree_ = DCTree(X, min_points=self.min_points, use_less_memory=self.use_less_memory) + condensed_root = self._condense(self.dc_tree_.root) + stable_nodes = self._get_stable_nodes(condensed_root) + labels = np.full(self.dc_tree_.n, -1 if stable_nodes else 0, dtype=np.int32) + self.n_clusters_ = len(stable_nodes) if stable_nodes else 1 + for idx, node in enumerate(stable_nodes): + labels[node.leaves] = idx + self.labels_ = labels + self.n_features_in_ = X.shape[1] + return self + + def _condense(self, node: Optional[_DCNode]) -> Optional[_DCNode]: + """ + Condense the tree to nodes, where both children contain at least min_points leaves. + Uses a recursive strategy that checks each node separately. + + Parameters + ---------- + node : _DCNode + the node that is checked for its children + + Returns + ------- + condensed_node : _DCNode + either a condensed node or None + """ + if node is None or len(node.leaves) < self.min_points: + return None + # Process children + L = self._condense(node.left) + R = self._condense(node.right) + # If both children have min_points children, keep this branch. + size_L = len(node.left.leaves) if node.left is not None else 0 + size_R = len(node.right.leaves) if node.right is not None else 0 + if size_L >= self.min_points and size_R >= self.min_points: + if (L is not None and R is not None) or (L is None and R is None): + return _DCNode(node.id, node.dist, node.leaves, L, R) + elif L is not None: + return L + else: + return R + elif L is not None: + return _DCNode(node.id, L.dist, node.leaves, L.left, L.right) + elif R is not None: + return _DCNode(node.id, R.dist, node.leaves, R.left, R.right) + return None + + def _get_stable_nodes(self, node: Optional[_DCNode], parent_dist: float = None) -> list: + """ + Identify stable nodes in the tree. + + Parameters + ---------- + node : _DCNode + the node that is checked. + parent_dist : float + Distance in the parent node. Can be None in the case of the root (default: None) + + Returns + ------- + stable_nodes : list + list of stable nodes + """ + if node is None: # In case root is None + return [] + node.stability_ = (1.0/node.dist - 1.0/parent_dist) * len(node.leaves) if parent_dist is not None else 0 + # Calculate stability for children + sum_child_stabilities = 0 + child_results = [] + for child in [node.left, node.right]: + if child is not None: + child_results += self._get_stable_nodes(child, node.dist) + sum_child_stabilities += child.stability_ + # Flag stable nodes + if node.stability_ >= sum_child_stabilities: + stable_nodes = [node] + else: + node.stability_ = sum_child_stabilities + stable_nodes = child_results + return stable_nodes diff --git a/clustpy/hierarchical/tests/test_dctree_clusterer.py b/clustpy/hierarchical/tests/test_dctree_clusterer.py new file mode 100644 index 0000000..e3f798a --- /dev/null +++ b/clustpy/hierarchical/tests/test_dctree_clusterer.py @@ -0,0 +1,17 @@ +from clustpy.hierarchical import DCTree_Clusterer +from clustpy.utils.checks import check_clustpy_estimator +from sklearn.datasets import make_blobs +import numpy as np + + +def test_dctree_clusterer_estimator(): + check_clustpy_estimator(DCTree_Clusterer(min_points=2), ("check_complex_data")) + + +def test_simple_dctree_clusterer(): + X, labels = make_blobs(200, 4, centers=3, random_state=1) + dctree_clusterer = DCTree_Clusterer() + assert not hasattr(dctree_clusterer, "labels_") + dctree_clusterer.fit(X) + assert dctree_clusterer.labels_.dtype == np.int32 + assert dctree_clusterer.labels_.shape == labels.shape diff --git a/clustpy/utils/__init__.py b/clustpy/utils/__init__.py index a485729..8970132 100644 --- a/clustpy/utils/__init__.py +++ b/clustpy/utils/__init__.py @@ -3,6 +3,7 @@ from .diptest import dip_test, dip_pval, dip_boot_samples, dip_gradient, dip_pval_gradient, plot_dip from .plots import plot_with_transformation, plot_image, plot_scatter_matrix, plot_histogram, plot_1d_data, \ plot_2d_data, plot_3d_data +from .dctree import DCTree, reachability_distances, minimum_spanning_tree_prims __all__ = ['evaluate_dataset', 'evaluate_multiple_datasets', @@ -22,4 +23,7 @@ 'dip_gradient', 'dip_pval_gradient', 'plot_dip', - 'evaluation_df_to_latex_table'] + 'evaluation_df_to_latex_table', + 'DCTree', + 'reachability_distances', + 'minimum_spanning_tree_prims'] diff --git a/clustpy/utils/dctree.py b/clustpy/utils/dctree.py new file mode 100644 index 0000000..2e87ca1 --- /dev/null +++ b/clustpy/utils/dctree.py @@ -0,0 +1,603 @@ +""" +@authors: +Pascal Weber +""" + +# Implementation of the dc-distance with a DCTree by +# - Author: Pascal Weber +# - Source: https://github.com/pasiweber/SHADE + +# Paper: Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral Clustering +# Authors: Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent +# Link: https://doi.org/10.1145/3580305.3599283 + + +from __future__ import annotations +import numpy as np +from typing import List, Optional, Sequence, Tuple, Union +from scipy.spatial.distance import pdist, cdist, squareform + + +class DCTree: + """ + The DCTree computes the dc_distances and stores them in a tree structure. + By using the euler tour of the tree and a sparse table for range minimum queries, + the lca-elements in the tree (i.e. the dc_distances) can be computed in O(1) time. + The function `dc_dist(i, j)` returns the dc_distance between `points[i]` and `points[j]` + in O(1) time. + The function `dc_distance(idx_X, idx_Y=None, access_method="tree")` returns a dc_distance matrix + with the dc_distance from each pair of `points[idx_X]` and `points[idx_Y]`. + See the section Functions for more details. + + Parameters + ---------- + X : np.ndarray + points, of which the dc_distances should be computed of. + min_points : int, optional + min_points parameter used for the computation of the dc_distances (default: 5). + use_less_memory: bool + Use less memory when constructing the DCTree. + This will, however, increase the runtime (default: False). + + Functions + --------- + DCTree[x]: + Returns the _DCNode of given index. + dc_dist : (i, j) -> distance + returns the dc_distance between points[i] and points[j] in O(1) time. + dc_distances : (idx_X, idx_Y) -> np.ndarray + Computes the dc_distance matrix between each pair of points[idx_X] and points[idx_Y]. + Returns dc_dists as ndarray of shape (n_samples_X, n_samples_Y) + + Examples + -------- + >>> points = np.array([[1,6],[2,6],[6,2],[14,17],[123,3246],[52,8323],[265,73]]) + >>> dc_tree = dcdist.DCTree(points, 5) + >>> print(dc_tree.dc_dist(2,5)) + >>> print(dc_tree.dc_distances(range(len(points)))) + >>> print(dc_tree.dc_distances([0,1], [2,3])) + + References + ---------- + Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent. + "Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral + Clustering." KDD '23. 2023. + """ + + def __init__( + self, + X: np.ndarray, + min_points: int = 5, + use_less_memory: bool = False + ): + self.n = X.shape[0] + self.min_points = min_points + if not use_less_memory: + # Calculate pair-wise reachability distance + X = reachability_distances(X, min_points) + mst_edges = minimum_spanning_tree_prims(X, use_less_memory=use_less_memory, min_points=min_points) + self.root = self._build_tree(mst_edges) + self._init_fast_index() + + def _build_tree(self, mst_edges: np.ndarray) -> _DCNode: + """ + Build the final DCTree using the MST edges. + + Parameters + ---------- + mst_edges : np.ndarray + The edges of the MST, including the dc-distances + + Returns + ------- + new_root_node : _DCNode + The root node of the tree + """ + mst_edges.sort(order="dist") + # Create leaf nodes + leaf_nodes = [_DCNode(idx, 0.0, [idx]) for idx in range(self.n)] + idx = self.n + for i, j, dist in mst_edges: + root_i = self._get_root(leaf_nodes[i]) + root_j = self._get_root(leaf_nodes[j]) + new_leaves = root_i.leaves + root_j.leaves + new_root_node = _DCNode( + id=idx, + dist=dist, + left=root_i, + right=root_j, + leaves=new_leaves, + ) + root_i.parent = new_root_node + root_i.root = new_root_node + root_j.parent = new_root_node + root_j.root = new_root_node + idx += 1 + return new_root_node + + def _get_root(self, node: _DCNode) -> '_DCNode': + """ + Get the root of a node and update all root_ entries that have been visited. + + Parameters + ---------- + node : _DCNode + The input node + + Returns + ------- + root_node : _DCNode + The root node + """ + root_node = node + while root_node.root is not None: + root_node = root_node.root + # Override old roots + if root_node is not node: + current_node = node + while current_node.root != root_node: + next_node = current_node.root + current_node.root = root_node + current_node = next_node + return root_node + + def _init_fast_index(self) -> None: + """ + Build a fast index structure for the DCTree. + """ + n_nodes = 2 * self.n - 1 + self.euler = [] + self.level = [] + self.f_occur = [None] * n_nodes + # Euler tour to get the euler, level, and f_occur lists in O(n) time. + DOWN, UP = 0, 1 + stack = [(self.root, 0, DOWN)] # (node, level, DOWN / UP) + while len(stack) > 0: + (node, level, status) = stack.pop() + if status == DOWN: + if self.f_occur[node.id] is None: + self.f_occur[node.id] = len(self.euler) + self.euler.append(node) + self.level.append(level) + if node.right is not None: + stack.append((node, level, UP)) + stack.append((node.right, level + 1, DOWN)) + if node.left is not None: + stack.append((node, level, UP)) + stack.append((node.left, level + 1, DOWN)) + elif status == UP: + self.euler.append(node) + self.level.append(level) + # SparseTable structure which finds the index of the minimum value within the range [l,r] in self.level + levels = np.array(self.level, dtype=int) + n_elements = len(levels) + log_n = n_elements.bit_length() + self.sparse_table = np.zeros((log_n, n_elements), dtype=int) + self.sparse_table[0] = np.arange(n_elements) + for j in range(1, log_n): + stride = 1 << (j - 1) + limit = n_elements - (1 << j) + 1 + if limit <= 0: + break + left_indices = self.sparse_table[j - 1, :limit] + right_indices = self.sparse_table[j - 1, stride : stride + limit] + choose_left = levels[left_indices] <= levels[right_indices] + self.sparse_table[j, :limit] = np.where(choose_left, left_indices, right_indices) + + def __getitem__( + self, + point_idx: Union[int, Sequence[int], np.ndarray] + ) -> Union[_DCNode, List[_DCNode]]: + """ + Returns the _DCNode of given index if `point_idx` is an integer or a list of _DCNodes if `point_idx' is a Sequence. + + Parameters + ---------- + point_idx : Union[int, Sequence[int], np.ndarray] + The input parameter as described above + + Returns + ------- + result : Union[_DCNode, List[_DCNode]] + The querried nodes in the tree + """ + if isinstance(point_idx, int): + return self.euler[self.f_occur[point_idx]] + elif isinstance(point_idx, (Sequence, np.ndarray)): + return [self.euler[self.f_occur[i]] for i in point_idx] + else: + raise IndexError(f"`{point_idx}` needs to be an integer, Sequence or np.ndarray!") + + def dc_dist(self, i: int, j: int) -> float: + """ + Returns the dc_distance between points[i] and points[j] in O(1) time. + + Parameters + ---------- + i : int + index of the first point + j : int + index of the second point + + Returns + ------- + dc_distance : float + The dc_distance of the two points + """ + if i == j: + return 0.0 + l = self.f_occur[i] + r = self.f_occur[j] + if l > r: + l, r = r, l + k = (r - l + 1).bit_length() - 1 + idx_l = self.sparse_table[k, l] + idx_r = self.sparse_table[k, r - (1 << k) + 1] + ancestor = self.euler[idx_l if self.level[idx_l] <= self.level[idx_r] else idx_r] + dc_distance = ancestor.dist + return dc_distance + + def dc_distances( + self, + idx_X: Union[Sequence[int], np.ndarray, None] = None, + idx_Y: Union[Sequence[int], np.ndarray, None] = None, + access_method: str = "tree", + ) -> np.ndarray: + """ + Computes the dc_distance matrix between each pair of points[idx_X] and points[idx_Y]. + If idx_X=None, idx_X=range(n) is used. + If idx_Y=None, idx_Y=idx_X is used. + + Parameters + ---------- + idx_X : Union[Sequence[int], np.ndarray, None] + the first set of indices + idx_Y : Union[Sequence[int], np.ndarray, None] + the second set of indices + access_method : str + "tree": traverses the tree in O(n) time (n = len(points)), no matter the size of X / Y. + "dc_dist": uses the dc_dist function and needs O(k*l) time (k = len(X), l = len(Y))). + (default: "tree") + + Returns + ------- + dc_dists : np.array + ndarray of shape (n_samples_X, n_samples_Y) containing the distances + """ + if idx_X is None: + idx_X = range(self.n) + if idx_Y is None: + idx_Y = idx_X + dc_dists = np.zeros((len(idx_X), len(idx_Y))) + if access_method == "dc_dist": + # Get distances from the index structure + for i in range(len(idx_X)): + start_index = i + 1 if idx_X is idx_Y else 0 + for j in range(start_index, len(idx_Y)): + dc_dists[i, j] = self.dc_dist(idx_X[i], idx_Y[j]) + elif access_method == "tree": + # Get distances from the tree + idx_rev_X = np.full(self.n, -1, dtype=int) + idx_rev_X[idx_X] = range(len(idx_X)) + if idx_X is idx_Y: + idx_rev_Y = idx_rev_X + else: + idx_rev_Y = np.full(self.n, -1, dtype=int) + idx_rev_Y[idx_Y] = range(len(idx_Y)) + node_list = [self.root] + while len(node_list) > 0: + node = node_list.pop() + if node.left is not None: + node_list.append(node.left) + if node.right is not None: + node_list.append(node.right) + if node.left is not None and node.right is not None: + i_leaves = node.left.leaves + j_leaves = node.right.leaves + (i_, j_) = (idx_rev_X[i_leaves], idx_rev_Y[j_leaves]) + (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) + if len(i_) > 0 and len(j_) > 0: + dc_dists[(i_[:, np.newaxis], j_[np.newaxis, :])] = node.dist + # In case of asymmetric call + if idx_X is not idx_Y: + (i_, j_) = (idx_rev_Y[i_leaves], idx_rev_X[j_leaves]) + (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) + if len(i_) > 0 and len(j_) > 0: + dc_dists[(j_[:, np.newaxis], i_[np.newaxis, :])] = node.dist + else: + raise ValueError(f"'{access_method}' is no valid `access_method`") + if idx_X is idx_Y: + # Mirror values + dc_dists = dc_dists + dc_dists.T + return dc_dists + + + def _traverse_until_k_clusters(self, n_clusters: int) -> List[_DCNode]: + """ + Traverse the tree to identify n_clusters nodes that minimize the maximum within-cluster distance. + + Parameters + ---------- + n_clusters : int + the number of desired clusters + + Returns + ------- + result_nodes : List[_DCNode] + List of nodes, where each node represents the root of a cluster + """ + assert n_clusters <= self.n, "n_clusters can not be larger than the number of input points" + node_list = [self.root] + result_nodes = [] + id_threshold = 2 * self.n - n_clusters - 1 + while len(node_list) > 0: + node = node_list.pop() + if node.id <= id_threshold: + result_nodes.append(node) + else: + if node.left is not None: + node_list.append(node.left) + if node.right is not None: + node_list.append(node.right) + assert len(result_nodes) == n_clusters, "Length of the result_nodes list is not equal to k" + return result_nodes + + def get_k_center(self, n_clusters: int) -> np.ndarray: + """ + Extract the k center approach. + It identifies clusters such that the maximum distance within a cluster is minimized. + + Parameters + ---------- + n_clusters : int + the number of desired clusters + + Returns + ------- + labels : np.ndarray + The cluster labels + """ + nodes = self._traverse_until_k_clusters(n_clusters) + labels = np.zeros(self.n, dtype=np.int32) + for i, node in enumerate(nodes): + labels[np.array(node.leaves)] = i + return labels + + def get_eps_for_k(self, n_clusters: int, eps: float = 1e-10): + """ + Receive the largest possible eps value such that one receives exectly n_clusters. + + Parameters + ---------- + n_clusters : int + the number of desired clusters + eps : float + a small constaint used to differentiate from the next potential cluster (default: 1e-10) + + Returns + ------- + min_eps : float + The resulting epsilon value + """ + assert eps > 0, "Eps has to be a positive number" + nodes = self._traverse_until_k_clusters(n_clusters) + min_eps = np.inf + for node in nodes: + if node.parent is not None: + min_eps = min(min_eps, node.parent.dist) + return min_eps - eps + + def __repr__(self) -> str: + """ + Return a string describing the DCTree. + + Returns + ------- + repr_string : str + The string representation of the tree + """ + if self.root is None: + return "" + pointer_right = " " + pointer_left = " " if self.root.right else " " + repr_string = f"{self.root}" + repr_string += f"{self.__repr__help(self.root.left, pointer_left, '', self.root.right is not None)}" + repr_string += f"{self.__repr__help(self.root.right, pointer_right, '', False)}" + return repr_string + + def __repr__help(self, node: Optional[_DCNode], pointer: str, padding: str, has_right_sibling: bool) -> str: + """ + Helper for the __repr__ function. + + Parameters + ---------- + node : Optional[_DCNode] + The current node + pointer : str + The pointer string + padding : str + The padding string + has_right_sibling : bool + True if node has a right sibbling + + Returns + ------- + repr_string : str + The string for this node + """ + if node is None: + return "" + padding_for_both = padding + (" " if has_right_sibling else " ") + pointer_right = " " + pointer_left = " " if node.right else " " + repr_string = f"\n {padding.replace('|', ' ')}// #region" + repr_string += f"\n{padding}{pointer}{node}" + repr_string += f"{self.__repr__help(node.left, pointer_left, padding_for_both, node.right is not None)}" + repr_string += f"{self.__repr__help(node.right, pointer_right, padding_for_both, False)}" + repr_string += f"\n {padding.replace('|', ' ')}// #endregion" + return repr_string + + +class _DCNode: + """ + A node in a DCTree. + Each node contains a set of leaves and can have a left and a right child node. + + Parameters + ---------- + id : int + the id of the node + dist : float + the dc distance + leaves : List[int] + list of the ids of its child nodes + left : Optional[_DCNode] + the left child node (default: None) + right : Optional[_DCNode] + the right child node (default: None) + parent : Optional[_DCNode] + the parent node. If None, it will be set to itself (default: None) + root : Optional[_DCNode] + the root node (default: None) + """ + + def __init__( + self, + id: int, + dist: float, + leaves: List[int], + left: Optional[_DCNode] = None, + right: Optional[_DCNode] = None, + parent: Optional[_DCNode] = None, + root: Optional[_DCNode] = None + ): + self.id = id + self.dist = dist + self.leaves = leaves + self.left = left + self.right = right + if not parent: + self.parent = self + else: + self.parent = parent + self.root = root + + def __repr__(self) -> str: + """ + Return a string containing the id and dist of this node. + + Returns + ------- + to_str : str + The string + """ + to_str = f"DCNode #{self.id} ({self.dist})" + return to_str + + def __lt__(self, other_node: _DCNode) -> bool: + """ + Less than method of the node. Compares the dist with respect to the other input node. + + Parameters + ---------- + other_node : _DCNode + the other node + + Returns + ------- + is_less : bool + True if dist is smaller than dist of the other node + """ + less_than = self.dist < other_node.dist + return less_than + + +def reachability_distances(X: np.ndarray, min_points: int = 5) -> np.ndarray: + """ + Calculates the reachability distances between points using the min_points threshold in O(n^2) time. + + Parameters + ---------- + X : np.ndarray + the points + min_points : int, optional + min_points parameter (default: 5) + + Returns + ------- + reach_distances : np.ndarray + Array containing the pair-wise reachability distances + + Raises + ------ + Raises a ValueError if min_points is larger than the number of points. + """ + if min_points > X.shape[0]: + raise ValueError(f"Min points ({min_points}) can't exceed the size of the dataset ({X.shape[0]})") + eucl_distances = squareform(pdist(X, metric="euclidean")) + if min_points > 1: + core_distances = np.partition(eucl_distances, min_points - 1, axis=1)[:, min_points - 1] + reach_distances = np.maximum(eucl_distances, np.maximum.outer(core_distances, core_distances)) + np.fill_diagonal(reach_distances, 0) + else: + reach_distances = eucl_distances + return reach_distances + + +def minimum_spanning_tree_prims(matrix: np.ndarray, use_less_memory: bool = False, min_points: int = None) -> np.ndarray: + """ + Create a Minimum-spanning-tree of a given matrix using Prim's algorithm. + The tree will be build in O(n^2) time. + + Parameters + ---------- + matrix : np.ndarray + The input matrix + use_less_memory : bool + If true, the MST will not directly be build for the input matrix but the matrix will be used to construct a distance matrix first. + Saves quadratic RAM usage, but also needs double the time for computing (default: False) + min_points : int + Min_points for calculating the reachability distance. Only relevant if use_less_memory is True. + If min_points is None, the euclidean distance will be used (default: None) + + Returns + ------- + mst_edges : np.ndarray + The edges of the Minimum-spanning-tree, represented as a (n-1, 3) matrix with entries corresponding to (node_i, node_j, dist_ij) + """ + assert (matrix.shape[0] == matrix.shape[1]) or use_less_memory, "Input matrix must be quadratic or use_less_memory must be True." + n = matrix.shape[0] + nodes_min_dist = np.full(n, np.inf) + parent = np.zeros(n, dtype=int) + not_in_mst = np.ones(n, dtype=bool) + mst_edges = np.empty((n - 1), dtype=([("i", int), ("j", int), ("dist", float)])) + # If min_points is not None, use reachability distance => calculate core distances of all points + if use_less_memory and min_points is not None: + core_distances = np.zeros(n) + for i in range(n): + eucl_distances = cdist([matrix[i]], matrix, metric="euclidean").ravel() + core_distances[i] = np.partition(eucl_distances, min_points - 1)[min_points - 1] + # Start building the MST + u = 0 + nodes_min_dist[u] = 0 + not_in_mst[u] = False + for i in range(n - 1): + if use_less_memory: + eucl_distances = cdist([matrix[u]], matrix, metric="euclidean").ravel() + if min_points is None: + dist_u = eucl_distances + else: + dist_u = np.maximum(eucl_distances, np.maximum(core_distances[u], core_distances)) + else: + # If use_less_memory=False, 'matrix' is expected to be the precomputed distance matrix + dist_u = matrix[u] + update_mask = not_in_mst & (dist_u < nodes_min_dist) + # Update distances and parents + nodes_min_dist[update_mask] = dist_u[update_mask] + parent[update_mask] = u + # Select next closest unvisited node + masked_dists = np.where(not_in_mst, nodes_min_dist, np.inf) + u = np.argmin(masked_dists) + mst_edges[i] = (parent[u], u, nodes_min_dist[u]) + not_in_mst[u] = False + return mst_edges diff --git a/clustpy/utils/tests/test_dctree.py b/clustpy/utils/tests/test_dctree.py new file mode 100644 index 0000000..9d79944 --- /dev/null +++ b/clustpy/utils/tests/test_dctree.py @@ -0,0 +1,162 @@ +from clustpy.utils import minimum_spanning_tree_prims, reachability_distances, DCTree +import numpy as np + +def test_dc_tree_min(): + X = np.array([ + [0.0, 0.0], + [-3.0, 0.0], + [2.0, 0.0], + [3.0, 0.0], + [-7.0, 0.0], + [-8.0, 0.0] + ]) + # Min_points = 3 + dctree = DCTree(X, min_points = 3, use_less_memory = False) + root = dctree.root + assert root.dist == 5.0 + assert root.left.leaves == [0, 2, 3, 1, 4] + assert root.right.leaves == [5] + assert root.right.dist == 0.0 + assert root.right.right is None + assert root.right.left is None + assert root.left.dist == 4.0 + assert root.left.left.leaves == [0, 2, 3, 1] + assert root.left.right.leaves == [4] + assert root.left.right.dist == 0.0 + assert root.left.right.left is None + assert root.left.right.right is None + assert root.left.left.dist == 4.0 + assert root.left.left.left.leaves == [0, 2, 3] + assert root.left.left.right.leaves == [1] + assert root.left.left.left.dist == 3.0 + assert root.left.left.left.left.leaves == [0, 2] + assert root.left.left.left.right.leaves == [3] + assert root.left.left.left.left.dist == 3.0 + assert root.left.left.left.left.left.leaves == [0] + assert root.left.left.left.left.right.leaves == [2] + all_distances = dctree.dc_distances() + expected_distances = np.array([[0., 4., 3., 3., 4., 5.], + [4., 0., 4., 4., 4., 5.], + [3., 4., 0., 3., 4., 5.], + [3., 4., 3., 0., 4., 5.], + [4., 4., 4., 4., 0., 5.], + [5., 5., 5., 5., 5., 0.] + ]) + assert np.array_equal(all_distances, expected_distances) + all_distances_reverse = dctree.dc_distances(np.arange(6), np.arange(5)[::-1], access_method="dc_dist") + expected_distances = expected_distances[:, np.arange(5)[::-1]] + assert np.array_equal(all_distances_reverse, expected_distances) + labels = dctree.get_k_center(3) + assert np.array_equal(labels, np.array([2, 2, 2, 2, 1, 0])) + eps = dctree.get_eps_for_k(3) + assert np.abs(eps - 4.0) < 1e-5 + # Min_points = 2 + dctree = DCTree(X, min_points = 2, use_less_memory = True) + root = dctree.root + assert root.dist == 4.0 + assert root.left.leaves == [0, 2, 3, 1] + assert root.right.leaves == [4, 5] + assert root.right.dist == 1.0 + assert root.right.left.leaves == [4] + assert root.right.right.leaves == [5] + assert root.right.left.dist == 0.0 + assert root.right.left.left is None + assert root.right.left.right is None + assert root.right.right.dist == 0.0 + assert root.right.right.left is None + assert root.right.right.right is None + assert root.left.dist == 3.0 + assert root.left.left.leaves == [0, 2, 3] + assert root.left.right.leaves == [1] + assert root.left.right.dist == 0.0 + assert root.left.right.left is None + assert root.left.right.right is None + assert root.left.left.dist == 2.0 + assert root.left.left.left.leaves == [0] + assert root.left.left.right.leaves == [2, 3] + assert root.left.left.right.dist == 1.0 + assert root.left.left.right.left.leaves == [2] + assert root.left.left.right.right.leaves == [3] + all_distances = dctree.dc_distances() + expected_distances = np.array([[0., 3., 2., 2., 4., 4.], + [3., 0., 3., 3., 4., 4.], + [2., 3., 0., 1., 4., 4.], + [2., 3., 1., 0., 4., 4.], + [4., 4., 4., 4., 0., 1.], + [4., 4., 4., 4., 1., 0.] + ]) + assert np.array_equal(all_distances, expected_distances) + all_distances_reverse = dctree.dc_distances(np.arange(6), np.arange(5)[::-1], access_method="dc_dist") + expected_distances = expected_distances[:, np.arange(5)[::-1]] + assert np.array_equal(all_distances_reverse, expected_distances) + labels = dctree.get_k_center(3) + assert np.array_equal(labels, np.array([2, 1, 2, 2, 0, 0])) + eps = dctree.get_eps_for_k(3) + assert np.abs(eps - 3.0) < 1e-5 + # Test different accesses to DCTree + node = dctree[4] + assert node.id == 4 + all_nodes = dctree[np.arange(11)] + for i, node in enumerate(all_nodes): + assert node.id == i + # Test repr + assert type(dctree.__repr__()) is str + assert type(node.__repr__()) is str + + +def test_reachability_distances(): + X = np.array([ + [0.0, 0.0], + [-3.0, 0.0], + [2.0, 0.0], + [3.0, 0.0], + [-7.0, 0.0], + [-8.0, 0.] + ]) + distances = reachability_distances(X, min_points = 2) + expected_distances = [[0., 3., 2., 3., 7., 8.], + [3., 0., 5., 6., 4., 5.], + [2., 5., 0., 1., 9., 10.], + [3., 6., 1., 0., 10., 11. ], + [7., 4., 9., 10, 0., 1.], + [8., 5., 10., 11., 1., 0.] + ] + assert np.array_equal(distances, expected_distances) + distances = reachability_distances(X, min_points = 3) + expected_distances = [[0., 4., 3., 3., 7., 8.], + [4., 0., 5., 6., 4., 5.], + [3., 5., 0., 3., 9., 10.], + [3., 6., 3., 0., 10., 11.], + [7., 4., 9., 10., 0., 5.], + [8., 5., 10., 11., 5., 0.] + ] + assert np.array_equal(distances, expected_distances) + + +def test_minimum_spanning_tree_prims(): + distance_matrix = np.array([ + [0.0, 3.0, 2.0, 3.0, 7.0, 8.0], + [3.0, 0.0, 5.0, 6.0, 4.0, 5.0], + [2.0, 5.0, 0.0, 1.0, 9.0, 10.0], + [3.0, 6.0, 1.0, 0.0, 10.0, 11.0], + [7.0, 4.0, 9.0, 10.0, 0.0, 1.0], + [8.0, 5.0, 10.0, 11.0, 1.0, 0.0] + ]) + mst = minimum_spanning_tree_prims(distance_matrix, use_less_memory=False, min_points=None) + expected_mst = np.array([(0, 2, 2.0), (2, 3, 1.0), (0, 1, 3.0), (1, 4, 4.0), (4, 5, 1.0)], dtype=([("i", int), ("j", int), ("dist", float)])) + # test with use_less_memory=True + X = np.array([ + [0.0, 0.0], + [-3.0, 0.0], + [2.0, 0.0], + [3.0, 0.0], + [-7.0, 0.0], + [-8.0, 0.] + ]) + mst = minimum_spanning_tree_prims(X, use_less_memory=True, min_points=None) + assert np.array_equal(mst, expected_mst) + # Test reachability distance + core_distances = [3, 4, 2, 3, 7] + mst = minimum_spanning_tree_prims(X, use_less_memory=True, min_points=3) + expected_mst = np.array([(0, 2, 3.0), (0, 3, 3.0), (0, 1, 4.0), (1, 4, 4.0), (1, 5, 5.0)], dtype=([("i", int), ("j", int), ("dist", float)])) + assert np.array_equal(mst, expected_mst)