From e9c659b85813baae0efbdfc06d973808c5f37c56 Mon Sep 17 00:00:00 2001 From: Pascal Weber Date: Sun, 11 Jan 2026 13:53:59 +0100 Subject: [PATCH 1/5] Added SHADE --- clustpy/deep/__init__.py | 2 + clustpy/deep/dcdist/__init__.py | 27 + clustpy/deep/dcdist/dctree.py | 857 ++++++++++++++++++++++++ clustpy/deep/dcdist/dctree_clusterer.py | 228 +++++++ clustpy/deep/shade.py | 508 ++++++++++++++ 5 files changed, 1622 insertions(+) create mode 100644 clustpy/deep/dcdist/__init__.py create mode 100644 clustpy/deep/dcdist/dctree.py create mode 100644 clustpy/deep/dcdist/dctree_clusterer.py create mode 100644 clustpy/deep/shade.py diff --git a/clustpy/deep/__init__.py b/clustpy/deep/__init__.py index a5b72d0..10561d8 100644 --- a/clustpy/deep/__init__.py +++ b/clustpy/deep/__init__.py @@ -9,6 +9,7 @@ from .aec import AEC from .deepect import DeepECT from .den import DEN +from .shade import SHADE from ._data_utils import get_dataloader, get_default_augmented_dataloaders from ._train_utils import get_trained_network, get_neural_network from ._utils import encode_batchwise, decode_batchwise, encode_decode_batchwise, predict_batchwise, detect_device, \ @@ -28,6 +29,7 @@ 'DipEncoder', 'DeepECT', 'DEN', + 'SHADE', 'get_dataloader', 'get_default_augmented_dataloaders', 'get_neural_network' diff --git a/clustpy/deep/dcdist/__init__.py b/clustpy/deep/dcdist/__init__.py new file mode 100644 index 0000000..113a855 --- /dev/null +++ b/clustpy/deep/dcdist/__init__.py @@ -0,0 +1,27 @@ +from .dctree import ( + DCTree, + calculate_reachability_distance, + serialize as serialize_DCTree, + serialize_compressed as serialize_DCTree_compressed, + save as save_DCTree, + deserialize as deserialize_DCTree, + deserialize_compressed as deserialize_DCTree_compressed, + load as load_DCTree, +) + + +from .dctree_clusterer import DCTree_Clusterer + +__all__ = [ + ### DCTree ### + "DCTree", + "calculate_reachability_distance", + "serialize_DCTree", + "serialize_DCTree_compressed", + "save_DCTree", + "deserialize_DCTree", + "deserialize_DCTree_compressed", + "load_DCTree", + ### DCTree_Cluster ### + "DCTree_Clusterer", +] diff --git a/clustpy/deep/dcdist/dctree.py b/clustpy/deep/dcdist/dctree.py new file mode 100644 index 0000000..e6299ab --- /dev/null +++ b/clustpy/deep/dcdist/dctree.py @@ -0,0 +1,857 @@ +""" +@authors: +Pascal Weber +""" + +# Implementation of the dc-distance with a DCTree by +# - Author: Pascal Weber +# - Source: https://github.com/pasiweber/SHADE + +# Paper: Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral Clustering +# Authors: Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent +# Link: https://doi.org/10.1145/3580305.3599283 + + +from __future__ import annotations + +import gzip +import numpy as np +import sys + +from collections import deque +from multiprocessing import cpu_count +from multiprocessing.pool import ThreadPool +from sklearn.metrics import pairwise_distances +from typing import List, Literal, Optional, Sequence, Tuple, Union + + +class DCTree: + """ + The DCTree computes the dc_distances and stores them in a tree structure. + + By using the euler tour of the tree and a sparse table for range minimum queries, + the lca-elements in the tree (i.e. the dc_distances) can be computed in O(1) time. + + The function `dc_dist(i, j)` returns the dc_distance between `points[i]` and `points[j]` + in O(1) time. + The function `dc_distance(X, Y=None, access_method="tree")` returns a dc_distance matrix + with the dc_distance from each pair of `points[X]` and `points[Y]`. + See the section Functions for more details. + + The DCTree provides `serialize` and `serialize_compressed` functions to serialize the + DCTree. With `deserialize` or `deserialize_compressed` the DCTree can be deserialized again. + + The DCTree provides `save` and `load` functions to save / load the DCTree to / from disk. + + + Parameters + ---------- + points : np.ndarray + points, of which the dc_distances should be computed of. + + min_points : int, optional + min_points parameter used for the computation of the dc_distances, by default 5. + + access_method: str + Default access_method for dc_distances, by default "tree". + + no_fastindex: bool = False + No fast index structure is constructed if this parameter is set to `True`. + Disables the functions `dc_dist` and `dc_distances` with `access_method = "dc_dist"`. + Only useful if you only use the `dc_distance` function with `access_method = "tree"`, + e.g. for calculating all pair dc_distances with `dc_dists = dc_tree.dc_distances()`. + + use_less_memory: bool = False + Don't precalculate the reachability distance. Saves quadratic RAM usage, but also needs + double the time for computing. + + + Functions + --------- + DCTree[x] or DCTree[i,j] or DCTree[X,Y]: + Returns the _DCNode of given index if `arg` is an integer or a Sequence. + + If DCTree[i,j] is used, the dc_dist of `points[i]` and `points[j]` is returned + (i and j are integer). + + If DCTree[X,Y] is used, the dc_dist matrix between each pair of `points[X]` and + `points[Y]` is returned (X and Y are np.ndarray or Sequences). + + dc_dist : (self, i: int, j: int) -> distance + returns the dc_distance between points[i] and points[j] in O(1) time. + + dc_distances : (self, X = None, Y = None, access_method = `self.accesss_method`) -> np.ndarray + Computes the dc_distance matrix between each pair of points[X] and points[Y]. + If X=None, X=range(n) is used. + If Y=None, Y=X is used. + + `access_method` (by default `self.accesss_method`): + - "tree": Traverses the tree in O(n) time (n = len(points)), + no matter the size of X / Y. + + - "dc_dist": Uses the dc_dist function and needs O(k*l) time + (k = len(X), l = len(Y))). + + "dc_dist" is often faster if X and Y are smaller than ~10% of n = len(points). + + Returns dc_dists: ndarray of shape (n_samples_X, n_samples_Y) + + + serialize : (dc_tree: DCTree) -> str + Serializes the DCTree `dc_tree` to a string. + + serialize_compressed : (dc_tree: DCTree) -> bytes + Serializes the DCTree `dc_tree` to a compressed byte array. + + save : (dc_tree: DCTree, file_path: str) -> save to disk + Saves the DCTree `dc_tree` to disk at `file_path`. + + + deserialize : (data: str, access_method = "tree", no_fastindex = False, n_jobs = None) -> DCTree + Deserializes a string `str` to a DCTree. + + deserialize_compressed : (data: bytes, access_method = "tree", no_fastindex = False, n_jobs = None) -> DCTree + Deserializes a compressed byte array `bytes` to a DCTree. + + load : (file_path: str, access_method = "tree", no_fastindex = False, n_jobs = None) -> DCTree + Loads a DCTree from disk at `file_path`. + + + Examples + -------- + >>> import dcdist + >>> points = np.array([[1,6],[2,6],[6,2],[14,17],[123,3246],[52,8323],[265,73]]) + >>> dc_tree = dcdist.DCTree(points, 5) + >>> print(dc_tree.dc_dist(2,5)) + >>> print(dc_tree.dc_distances(range(len(points)))) + >>> print(dc_tree.dc_distances([0,1], [2,3])) + + >>> s = dcdist.serialize(dc_tree) + >>> dc_tree_new = dcdist.deserialize(s) + + >>> b = dcdist.serialize_compressed(dc_tree) + >>> dc_tree_new = dcdist.deserialize_compressed(b) + + >>> dcdist.save(dc_tree, "./data.dctree") + >>> dc_tree_new = dcdist.load("./data.dctree") + + + References + ---------- + Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent. + "Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral + Clustering." KDD '23. 2023. + """ + + n: int # len(points) + min_points: int + root: _DCNode + + euler: List[_DCNode] + level: List[int] + f_occur: List[int] + level_table: _SparseTable + + access_method: str + no_fastindex: bool + n_jobs: int + no_gil: bool + + def __init__( + self, + points: np.ndarray, + min_points: int = 5, + min_points_mr: Optional[int] = None, + access_method: str = "tree", + no_fastindex: bool = False, + use_less_memory: bool = False, + n_jobs: Optional[int] = None, + precomputed=False, + ): + self.n = points.shape[0] + self.min_points = min_points + self.access_method = access_method + self.no_fastindex = no_fastindex + + if min_points_mr is None: + min_points_mr = min_points + + if n_jobs is None or n_jobs == 0: + self.n_jobs = cpu_count() + elif n_jobs < 0: + self.n_jobs = max(cpu_count() + 1 + n_jobs, 1) + else: + self.n_jobs = n_jobs + self.no_gil = sys.flags.nogil if hasattr(sys.flags, "nogil") else False + + if use_less_memory and not precomputed: + mst_edges = self._get_mst_edges(points, use_less_memory=True) + else: + if not precomputed: + reach_dists = calculate_reachability_distance(points, min_points_mr) + else: + reach_dists = points + mst_edges = self._get_mst_edges(reach_dists) + if not precomputed: + del reach_dists + mst_edges.sort(order="dist") + self.root = self._build_tree(mst_edges) + del mst_edges + + if not self.no_fastindex: + self._init_fast_index() + + def _init_fast_index(self): + n_nodes = 2 * self.n - 1 + self.euler = [self.root] * (2 * n_nodes - 1) + self.level = [0] * (2 * n_nodes - 1) + self.f_occur = [-1] * n_nodes + self._euler_tour(self.root) + self.level_table = _SparseTable(self.level) + + def __getitem__( + self, + arg: Union[ + Union[int, slice, Sequence[int], np.ndarray], + Union[ + Tuple[int, int], + Tuple[int, np.ndarray, Sequence[int]], + Tuple[np.ndarray, Sequence[int], int], + Tuple[np.ndarray, Sequence[int], np.ndarray, Sequence[int]], + ], + ], + ) -> Union[Union[_DCNode, List[_DCNode]], Union[float, np.ndarray]]: + """ + Returns the _DCNode of given index if `arg` is an integer or a Sequence. + + If DCTree[i,j] is used, the dc_dist of `points[i]` and `points[j]` is returned + (i and j are integer). + + If DCTree[X,Y] is used, the dc_dist matrix between each pair of `points[X]` and + `points[Y]` is returned (X and Y are np.ndarray or Sequences). + """ + + index_error_msg = f"`{arg}` needs to be an integer, Sequence, np.ndarray, tuple of integer, or tuple of np.ndarray / Sequence!" + + if isinstance(arg, tuple): + if len(arg) != 2: + raise IndexError(index_error_msg) + (i, j) = arg + if isinstance(i, int) and isinstance(j, int): + return self.dc_dist(i, j) + if isinstance(i, int) and isinstance(j, (np.ndarray, Sequence)): + if isinstance(j, np.ndarray): + j = j.flatten() + return self.dc_distances([i], j) + if isinstance(i, (np.ndarray, Sequence)) and isinstance(j, int): + if isinstance(i, np.ndarray): + i = i.flatten() + return self.dc_distances(i, [j]) + elif isinstance(i, (np.ndarray, Sequence)) and isinstance(j, (np.ndarray, Sequence)): + if isinstance(i, np.ndarray): + i = i.flatten() + if isinstance(j, np.ndarray): + j = j.flatten() + return self.dc_distances(i, j) + else: + raise IndexError(index_error_msg) + + elif isinstance(arg, int): + return self.euler[self.f_occur[arg]] + elif isinstance(arg, (Sequence, np.ndarray)): + return [self.euler[self.f_occur[i]] for i in arg] + elif isinstance(arg, slice): + return [self.euler[i] for i in self.f_occur[arg]] + + else: + raise IndexError(index_error_msg) + + def __repr__(self): + if self.root is None: + return "" + + # pointer_right = "└──" + # pointer_left = "├──" if self.root.right else "└──" + pointer_right = " " + pointer_left = " " if self.root.right else " " + return ( + f"{self.root}" + f"{self.__repr__help(self.root.left, pointer_left, '', self.root.right is not None)}" + f"{self.__repr__help(self.root.right, pointer_right, '', False)}" + ) + + def __repr__help(self, node: Optional[_DCNode], pointer: str, padding: str, has_right_sibling: bool): + if node is None: + return "" + + # padding_for_both = padding + ("| " if has_right_sibling else " ") + # pointer_right = "└──" + # pointer_left = "├──" if node.right else "└──" + padding_for_both = padding + (" " if has_right_sibling else " ") + pointer_right = " " + pointer_left = " " if node.right else " " + return ( + f"\n {padding.replace('|', ' ')}// #region" + f"\n{padding}{pointer}{node}" + f"{self.__repr__help(node.left, pointer_left, padding_for_both, node.right is not None)}" + f"{self.__repr__help(node.right, pointer_right, padding_for_both, False)}" + f"\n {padding.replace('|', ' ')}// #endregion" + ) + + def dc_dist(self, i: int, j: int) -> float: + """Returns the dc_distance from points[i] to points[j] in O(1) time.""" + if i == j: + return 0 + return self._lca(i, j).dist + + def _lca(self, i: int, j: int) -> _DCNode: + first_i = self.f_occur[i] + first_j = self.f_occur[j] + if first_i > first_j: + first_i, first_j = first_j, first_i + return self.euler[self.level_table.query(first_i, first_j)] + + def dc_distances( + self, + X: Union[Sequence[int], np.ndarray, None] = None, + Y: Union[Sequence[int], np.ndarray, None] = None, + access_method: Optional[str] = None, + ) -> np.ndarray: + """ + Computes the dc_distance matrix between each pair of points[X] and points[Y]. + If X=None, X=range(n) is used. + If Y=None, Y=X is used. + + `access_method` (by default `self.access_method`): + - "tree": traverses the tree in O(n) time (n = len(points)), + no matter the size of X / Y. + - "dc_dist": uses the dc_dist function and needs O(k*l) time + (k = len(X), l = len(Y))). + + "dc_dist" is often faster if X and Y are smaller than ~10% of n = len(points). + + Returns dc_dists: ndarray of shape (n_samples_X, n_samples_Y) + """ + + if X is None: + X = range(self.n) + + if Y is None: + Y = X + + if access_method is None: + access_method = self.access_method + + if access_method == "dc_dist" and self.no_fastindex: + print("No fastindex computed. Fallback to access_method: `tree`.") + access_method = "tree" + + if access_method == "dc_dist" and not self.no_fastindex: + dc_dists = np.zeros((len(X), len(Y))) + + for i in range(len(X)): + for j in range(i + 1 if X is Y else 0, len(Y)): + if X[i] != Y[j]: + dc_dists[i, j] = self.dc_dist(X[i], Y[j]) + if X is Y: + dc_dists = dc_dists + dc_dists.T + return dc_dists + + elif access_method == "tree": + dc_dists = np.zeros((len(X), len(Y))) + + idx_rev_X = np.full(self.n, -1) + idx_rev_X[X] = range(len(X)) + + if X is Y: + idx_rev_Y = idx_rev_X + else: + idx_rev_Y = np.full(self.n, -1) + idx_rev_Y[Y] = range(len(Y)) + + inodes = self.get_internal_nodes() + inodes_start = 0 + inodes_end = len(inodes) + + n_jobs = self.n_jobs if self.no_gil else 4 + pool = ThreadPool(n_jobs) + n_inodes = self.n - 1 + chunk_size = int(np.ceil(n_inodes / n_jobs)) + start_points = range(inodes_start, inodes_end, chunk_size) + + def func(start): + for i in range(start, min(start + chunk_size, inodes_end)): + inode = inodes[i] + # inode always has left and right node + i_leaves = inode.left.leaves + j_leaves = inode.right.leaves + + (i_, j_) = (idx_rev_X[i_leaves], idx_rev_Y[j_leaves]) + (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) + dc_dists[(i_[:, np.newaxis], j_[np.newaxis, :])] = inode.dist + + if X is Y: + dc_dists[(j_[:, np.newaxis], i_[np.newaxis, :])] = inode.dist + else: + (i_, j_) = (idx_rev_Y[i_leaves], idx_rev_X[j_leaves]) + (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) + dc_dists[(j_[:, np.newaxis], i_[np.newaxis, :])] = inode.dist + + pool.map(func, start_points) + return dc_dists + + raise ValueError(f"'{access_method}' is no valid `access_method`") + + def _get_mst_edges(self, dist_matrix: np.ndarray, use_less_memory: bool = False) -> np.ndarray: + """Prim's algorithm to build up the minimum spanning tree in O(n^2) time.""" + # dist_matrix are the points, if use_less_memory=True + n = self.n + nodes_min_dist = np.full(n, np.inf) + parent = np.full(n, 0) + not_in_mst = np.full(n, True) + u = 0 # Start node + nodes_min_dist[u] = 0 + not_in_mst[u] = False + mst_edges = np.empty((n - 1), dtype=([("i", int), ("j", int), ("dist", float)])) + + if use_less_memory: + reach_dists = np.empty((dist_matrix.shape[0])) + for i in range(dist_matrix.shape[0]): + eucl_dists = np.linalg.norm(dist_matrix - dist_matrix[i], axis=1) + reach_dists[i] = np.max(np.partition(eucl_dists, self.min_points)[: self.min_points]) + + for i in range(n - 1): + if use_less_memory: + dist_u = np.linalg.norm(dist_matrix - dist_matrix[u], axis=1) # Euclidean distances + dist_u = np.maximum(dist_u, reach_dists[u]) # reachability distance of i + dist_u = np.maximum(dist_u, reach_dists) # reachability distance of all points + + v = np.where(not_in_mst & (dist_u < nodes_min_dist))[0] + nodes_min_dist[v] = dist_u[v] + else: + v = np.where(not_in_mst & (dist_matrix[u] < nodes_min_dist))[0] + nodes_min_dist[v] = dist_matrix[u, v] + parent[v] = u + + arg = np.argmin(nodes_min_dist[not_in_mst]) + u = np.arange(n)[not_in_mst][arg] + mst_edges[i] = (parent[u], u, nodes_min_dist[u]) + not_in_mst[u] = False + + return mst_edges + + def _build_tree(self, mst_edges: np.ndarray) -> _DCNode: + """Kruskal's algorithm to build up the DCTree with the precomputed + and sorted mst_edges in O(n) time.""" + union_find = _UnionFind(self.n) + node = _DCNode(id=0, dist=0.0, leaves=[0]) + idx = self.n + k = len(mst_edges) + for i, j, dist in mst_edges: + i_root = union_find.find(i) + j_root = union_find.find(j) + node = _DCNode( + id=idx, + dist=dist, + k=-1 if (len(i_root.leaves) + len(j_root.leaves)) < self.min_points else k, + left=i_root, + right=j_root, + leaves=i_root.leaves + j_root.leaves, + ) + i_root.parent = node + j_root.parent = node + union_find.union(i, j, node) + idx += 1 + if not node.k == -1: + k -= 1 + return node + + def _euler_tour(self, root: _DCNode): + """Euler tour to get the euler, level, and f_occur lists in O(n) time.""" + DOWN, UP = 0, 1 + stack: deque[Tuple[_DCNode, int, Literal[0, 1]]] = deque([(root, 0, DOWN)]) # (node, level, DOWN / UP) + + pos = 0 + while len(stack): + (node, level, status) = stack.pop() + + if status == DOWN: + if self.f_occur[node.id] == -1: + self.f_occur[node.id] = pos + + self.euler[pos] = node + self.level[pos] = level + pos += 1 + + if node.right: + stack.append((node, level, UP)) + stack.append((node.right, level + 1, DOWN)) + if node.left: + stack.append((node, level, UP)) + stack.append((node.left, level + 1, DOWN)) + + elif status == UP: + self.euler[pos] = node + self.level[pos] = level + pos += 1 + + def get_internal_nodes(self): + nodes = [] + stack: deque[_DCNode] = deque([self.root]) + + while len(stack): + node = stack.pop() + + if node and node.left and node.right: + nodes.append(node) + + if node.left: + stack.append(node.left) + + if node.right: + stack.append(node.right) + return nodes + + def traverse_until_k(self, k): + from queue import PriorityQueue + + result_nodes = set([self.root]) + if k == 1: + return result_nodes + + stack = PriorityQueue() + stack.put((-self.root.dist, self.root, self.root)) + while not stack.empty(): + _, node, parent_node = stack.get() + + if node is None: + return + + if node.k >= 0: + if node.left.k != -1: + if node.left.k != -1 and node.right.k != -1: + result_nodes.discard(parent_node) + result_nodes.discard(node) + result_nodes.add(node.left) + if node.parent.left.k == -1 or node.parent.right.k == -1: + stack.put((-node.left.dist, node.left, parent_node)) + else: + stack.put((-node.left.dist, node.left, node)) + if node.right.k != -1: + if node.left.k != -1 and node.right.k != -1: + result_nodes.discard(parent_node) + result_nodes.discard(node) + result_nodes.add(node.right) + if node.parent.left.k == -1 or node.parent.right.k == -1: + stack.put((-node.right.dist, node.right, parent_node)) + else: + stack.put((-node.right.dist, node.right, node)) + + if len(result_nodes) >= k: + break + + return result_nodes + + def get_k_center(self, k): + nodes = self.traverse_until_k(k) + labels = np.full(self.n, -1) + for i, node in enumerate(nodes): + labels[np.array(node.leaves)] = i + return labels + + def get_eps_for_k(self, k, eps=-3e-12): + nodes = self.traverse_until_k(k) + min_eps = np.inf + for node in nodes: + min_eps = min(min_eps, node.parent.dist) + return min_eps + eps + + +def _serialize(root: _DCNode): + tree_list = [] + stack: deque[Optional[_DCNode]] = deque([root]) + while len(stack): + node = stack.pop() + if node is None: + tree_list.append("/") + elif node.left is None and node.right is None: + tree_list.append(f"{node.id}|{node.dist}") + else: + tree_list.append(f"'{node.id}|{node.dist}") + stack.append(node.right) + stack.append(node.left) + return ",".join(tree_list) + + +def serialize(dc_tree: DCTree) -> str: + """Serializes the DCTree `dc_tree` to a string.""" + + sep = "<\1dc_tree>" + # Tree structure, min_points, n_jobs, no_gil + return _serialize(dc_tree.root) + sep + str(dc_tree.min_points) + + +def serialize_compressed(dc_tree: DCTree) -> bytes: + """Serializes the DCTree `dc_tree` to a gzip-compressed byte array.""" + data = serialize(dc_tree) + byte_data = bytes(data, "utf-8") + return gzip.compress(byte_data) + + +def save(dc_tree: DCTree, file_path: str) -> None: + """Saves the DCTree `dc_tree` to disk at `file_path`.""" + byte_data = serialize_compressed(dc_tree) + with open(file_path, "wb") as file: + file.write(byte_data) + + +def _deserialize(data: List[str]) -> _DCNode: + id, dist = data[0].split("|") + root = _DCNode(id=int(id[1:]), dist=float(dist), leaves=[]) + + DOWN, UP = 0, 1 + stack: deque[Literal[0, 1]] = deque([UP, DOWN, DOWN]) + nodes: deque[_DCNode] = deque([root]) + res: deque[Optional[_DCNode]] = deque([]) + + pos = 0 + while len(stack): + status = stack.pop() + + if status == DOWN: + pos += 1 + if data[pos] == "/": + res.append(None) + continue + id, dist = data[pos].split("|") + if id[0] != "'": + res.append(_DCNode(id=int(id), dist=0, leaves=[int(id)])) + continue + inode = _DCNode(id=int(id[1:]), dist=float(dist), leaves=[]) + nodes.append(inode) + stack.extend([UP, DOWN, DOWN]) + + elif status == UP: + node = nodes.pop() + node.right = res.pop() + node.left = res.pop() + res.append(node) + node.leaves = node.left.leaves + node.right.leaves + return root + + +def deserialize( + data: str, + access_method: Optional[str] = None, + no_fastindex: bool = False, + n_jobs: Optional[int] = None, +) -> DCTree: + """Deserializes a string `str` to a DCTree.""" + dc_tree = object.__new__(DCTree) + + sep = "<\1dc_tree>" + # Tree structure, min_points, n_jobs, no_gil + (tree_data, min_points) = data.split(sep) + dc_tree.min_points = int(min_points) + dc_tree.access_method = access_method if access_method is not None else "tree" + dc_tree.n_jobs = n_jobs if n_jobs else cpu_count() + dc_tree.no_gil = sys.flags.nogil if hasattr(sys.flags, "nogil") else False + + tree_data_list = tree_data.split(",") + dc_tree.root = _deserialize(tree_data_list) + dc_tree.n = (len(tree_data_list) + 1) // 2 + + dc_tree.no_fastindex = no_fastindex + if not dc_tree.no_fastindex: + dc_tree._init_fast_index() + return dc_tree + + +def deserialize_compressed( + compressed_data: bytes, + access_method: Optional[str] = None, + no_fastindex: bool = False, + n_jobs: Optional[int] = None, +) -> DCTree: + """Deserializes a compressed byte array `bytes` to a DCTree.""" + byte_data = gzip.decompress(compressed_data) + data = str(byte_data, "utf-8") + return deserialize(data, access_method, no_fastindex, n_jobs=n_jobs) + + +def load( + file_path: str, + access_method: Optional[str] = None, + no_fastindex: bool = False, + n_jobs: Optional[int] = None, +) -> DCTree: + """Loads a DCTree from disk at `file_path`.""" + file = open(file_path, "rb") + byte_data = file.read() + file.close() + return deserialize_compressed(byte_data, access_method, no_fastindex, n_jobs=n_jobs) + + +def calculate_reachability_distance( + points: np.ndarray, min_points: int = 5, n_jobs: Optional[int] = None +) -> np.ndarray: + """ + Calculates the reachability distance of points using the min_points threshold in O(n^2) time. + + Raises a ValueError if min_points is larger than the number of points. + """ + + if min_points > points.shape[0]: + raise ValueError(f"Min points ({min_points}) can't exceed the size of the dataset ({points.shape[0]})") + + eucl_dists = pairwise_distances(points, metric="euclidean", n_jobs=n_jobs) + + if min_points > 1: + # Get reachability for each point with respect to min_points parameter + reach_dists = np.empty((points.shape[0])) + for i in range(points.shape[0]): + reach_dists[i] = np.max(np.partition(eucl_dists[i], min_points)[:min_points]) + np.maximum(eucl_dists, reach_dists[np.newaxis, :], eucl_dists) + np.maximum(eucl_dists, reach_dists[:, np.newaxis], eucl_dists) + np.fill_diagonal(eucl_dists, 0) + return eucl_dists + + +class _DCNode: + id: int + dist: float + leaves: List[int] + left: Optional[_DCNode] + right: Optional[_DCNode] + parent: _DCNode + k: int + + def __init__( + self, + id: int, + dist: float, + leaves: List[int], + left: Optional[_DCNode] = None, + right: Optional[_DCNode] = None, + parent=None, + k=-1, + ): + self.id = id + self.dist = dist + self.leaves = leaves + self.left = left + self.right = right + if not parent: + self.parent = self + else: + self.parent = parent + self.k = k + + def __repr__(self): + return f"DCNode #{self.id} ({self.dist}) - {self.k}" + + def __lt__(self, other): + return self.dist < other.dist + + +class _UnionFind: + """ + UnionFind structure which provides the functions `find` and `union` with amortized + time complexity of O(α(n)), where α(n) is the inverse Ackermann function. + """ + + root: List[_DCNode] + parent: List[int] + rank: List[int] + + def __init__(self, n: int): + self.root = [_DCNode(id=i, dist=0, leaves=[i]) for i in range(n)] + self.parent = list(range(n)) + self.rank = [1] * n + + def find(self, x: int) -> _DCNode: + """Finds the representative node of x.""" + return self.root[self._find(x)] + + def _find(self, x: int) -> int: + """Finds the representative of x.""" + parent = self.parent[x] + if parent != x: + parent = self._find(parent) + self.parent[x] = parent + return parent + + def union(self, x: int, y: int, node: _DCNode): + """Union the set which contains x with the set which contains y.""" + xset = self._find(x) + yset = self._find(y) + if xset == yset: + return + + # Put smaller ranked item under bigger ranked item if ranks are different + if self.rank[xset] < self.rank[yset]: + self.parent[xset] = yset + self.root[yset] = node + elif self.rank[xset] > self.rank[yset]: + self.parent[yset] = xset + self.root[xset] = node + + # If ranks are same, then move y under x (doesn't matter which one goes where) + # and increment rank of x's tree + else: + self.parent[yset] = xset + self.root[xset] = node + self.rank[xset] = self.rank[xset] + 1 + + +class _SparseTable: + """ + SparseTable structure which provides the function `query` which finds the index of the + minimum value within the range [l,r] in the array `arr`. + Needs O(n * logn) time and storage to precompute and O(1) for the query. + """ + + # arr: List[int] + arr: np.ndarray + # sparse_table: List[List[int]] + sparse_table: np.ndarray + pow_2: List[int] + log_2: List[int] + + def __init__(self, arr: List[int]): + self.arr = np.asarray(arr, dtype=np.int64) + n = len(arr) + log_n = n.bit_length() - 1 + # self.sparse_table = [[-1] * log_n for _ in range(n - 1)] + self.sparse_table = np.full((n, log_n), -1, dtype=np.int64) + + self.pow_2 = [1 << i for i in range(log_n + 1)] + self.log_2 = [i.bit_length() - 1 for i in range(n)] + + # for i in range(0, n - 1): + # self.sparse_table[i][0] = i if self.arr[i] < self.arr[i + 1] else i + 1 + idx = np.arange(n - 1, dtype=np.int64) + self.sparse_table[: n - 1, 0] = np.where(self.arr[idx] < self.arr[idx + 1], idx, idx + 1) + + for j in range(1, log_n): + step = self.pow_2[j] # 2**j + limit = n - self.pow_2[j + 1] + 1 # number of start positions + left_idx = self.sparse_table[:limit, j - 1] # L for i = 0..limit‑1 + right_idx = self.sparse_table[step : step + limit, j - 1] # R for i = 0..limit‑1 + choose_left = self.arr[left_idx] <= self.arr[right_idx] + + self.sparse_table[:limit, j] = np.where(choose_left, left_idx, right_idx) + + # for i in range(n - self.pow_2[j + 1] + 1): + # L = self.sparse_table[i][j - 1] + # R = self.sparse_table[i + step][j - 1] + # self.sparse_table[i][j] = L if arr[L] <= arr[R] else R + + def query(self, l: int, r: int) -> int: + # assert L > R, f"L value ({L}) needs to be larger than R value ({R})" + if l == r: + return l + k = self.log_2[r - l + 1] + l_k = self.sparse_table[l][k - 1] + r_k = self.sparse_table[r - self.pow_2[k] + 1][k - 1] + return l_k if self.arr[l_k] <= self.arr[r_k] else r_k diff --git a/clustpy/deep/dcdist/dctree_clusterer.py b/clustpy/deep/dcdist/dctree_clusterer.py new file mode 100644 index 0000000..e89b7c8 --- /dev/null +++ b/clustpy/deep/dcdist/dctree_clusterer.py @@ -0,0 +1,228 @@ +""" +@authors: +Pascal Weber +""" + +# - Author: Pascal Weber +# - Source: https://github.com/pasiweber/SHADE + +from __future__ import annotations + +import numpy as np + +from collections import deque +from .dctree import DCTree, _DCNode as _DCDist_DCNode +from typing import List, Literal, Optional, Sequence, Tuple, Union + +from sklearn.base import ClusterMixin, BaseEstimator + + +class DCTree_Clusterer(ClusterMixin, BaseEstimator): + """ + Parameters + ---------- + points : np.ndarray + points, of which the dc_distances should be computed of. + + min_points : int, optional + min_points parameter used for the computation of the dc_distances, by default 5. + """ + + min_points: int + increase_inter_cluster_distance: bool + + dc_tree: DCTree + labels_: np.ndarray + + def __init__( + self, + min_points: int = 5, + increase_inter_cluster_distance: bool = True, + ): + self.min_points = min_points + self.increase_inter_cluster_distance = increase_inter_cluster_distance + + def fit(self, X, y=None): + self.dc_tree = DCTree(X, min_points=self.min_points) + self.labels_ = self.clustering(self.dc_tree) + self.n_clusters = len(np.where(np.unique(self.labels_) >= 0)[0]) + + if self.increase_inter_cluster_distance: + self._increase_inter_cluster_distance(self.dc_tree.root) + + return self + + def _condense_tree(self, node: Optional[_DCDist_DCNode]) -> Optional[_DCNode]: + if node is None: + return None + + if node.left is None: + right = self._condense_tree(node.right) + if right is not None: + right.id = node.id + right.leaves = node.leaves + return right + + if node.right is None: + left = self._condense_tree(node.left) + if left is not None: + left.id = node.id + left.leaves = node.leaves + return left + + left_size = len(node.left.leaves) + right_size = len(node.right.leaves) + + if right_size >= self.min_points and left_size >= self.min_points: + node_left = self._condense_tree(node.left) + node_right = self._condense_tree(node.right) + if node_left is None and node_right is None: + return _DCNode(node.id, node.dist, node.leaves, node_left, node_right) + if node_left is None: + return node_right + if node_right is None: + return node_left + return _DCNode(node.id, node.dist, node.leaves, node_left, node_right) + + if right_size < self.min_points and left_size < self.min_points: + return None + + if right_size < self.min_points: + left = self._condense_tree(node.left) + if left is not None: + left.id = node.id + left.leaves = node.leaves + return left + + if left_size < self.min_points: + right = self._condense_tree(node.right) + if right is not None: + right.id = node.id + right.leaves = node.leaves + return right + + def _calculate_stability(self, node: _DCNode): + if node.left is None and node.right is None: + return + + if node.left is not None: + self._calculate_stability(node.left) + node.left.stability = (1.0 / node.left.dist - 1.0 / node.dist) * len(node.left.leaves) + + if node.right is not None: + self._calculate_stability(node.right) + node.right.stability = (1.0 / node.right.dist - 1.0 / node.dist) * len( + node.right.leaves + ) + + def _stable_node_flagging(self, node: Optional[_DCNode]): + if node is None: + return + + node.is_stable = False + + if node.left is None and node.right is None: + node.is_stable = True + return + + if node.left is None: + self._stable_node_flagging(node.right) + if node.stability < node.right.stability: + node.stability = node.right.stability + node.is_stable = False + return + else: + node.is_stable = True + return + + if node.right is None: + self._stable_node_flagging(node.left) + if node.stability < node.left.stability: + node.stability = node.left.stability + node.is_stable = False + return + else: + node.is_stable = True + return + + self._stable_node_flagging(node.right) + self._stable_node_flagging(node.left) + + if node.stability < node.left.stability + node.right.stability: + node.stability = node.left.stability + node.right.stability + node.is_stable = False + return + else: + # Node is a true cluster + node.is_stable = True + return + + def _get_stable_clusters(self, node: Optional[_DCNode]) -> list[_DCNode]: + if node is None: + return [] + + if node.is_stable: + self.dc_tree[node.id].is_stable = True + return [node] + + else: + return self._get_stable_clusters(node.left) + self._get_stable_clusters(node.right) + + def _increase_inter_cluster_distance(self, root: _DCDist_DCNode): + queue = deque() + queue.append(root) + while len(queue): + node = queue.popleft() + + if node.left is None and node.right is None: + continue + + if hasattr(node, "is_stable") and node.is_stable: + continue + + node.dist += max( + node.left.dist * len(node.left.leaves), node.right.dist * len(node.right.leaves) + ) + # node.dist += max( + # (node.dist - node.left.dist) * len(node.left.leaves), (node.dist - node.right.dist) * len(node.right.leaves) + # ) + queue.append(node.left) + queue.append(node.right) + + def stable_nodes(self, dc_tree: DCTree) -> list[_DCNode]: + new_root = self._condense_tree(dc_tree.root) + if new_root is None: + return [] + + self._calculate_stability(new_root) + new_root.stability = 0 + + self._stable_node_flagging(new_root) + new_root.is_stable = False + + stable_nodes = self._get_stable_clusters(new_root) + + stable_nodes_ = [] + for node in stable_nodes: + if len(node.leaves) >= self.min_points: + stable_nodes_.append(node) + + return stable_nodes_ + + def clustering(self, dc_tree: DCTree): + stable_nodes = self.stable_nodes(dc_tree) + + labels = np.full(dc_tree.n, -1) + + idx = 0 + for i in range(len(stable_nodes)): + labels[stable_nodes[i].leaves] = idx + idx += 1 + return labels + + +class _DCNode(_DCDist_DCNode): + is_stable: Optional[Union[Literal[True], Literal[False]]] + stability: Optional[float] + left: _DCNode + right: _DCNode diff --git a/clustpy/deep/shade.py b/clustpy/deep/shade.py new file mode 100644 index 0000000..28a389b --- /dev/null +++ b/clustpy/deep/shade.py @@ -0,0 +1,508 @@ +""" +@authors: +Pascal Weber +""" + +from __future__ import annotations + +import numpy as np +import torch +import sys + +from .dcdist import DCTree, DCTree_Clusterer +from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo +from clustpy.deep.neural_networks._abstract_autoencoder import _AbstractAutoencoder +from clustpy.deep.neural_networks import FeedforwardAutoencoder +from clustpy.deep._data_utils import get_dataloader +from clustpy.deep._train_utils import get_trained_network +from clustpy.deep._utils import ( + detect_device, + set_torch_seed, + squared_euclidean_distance, + encode_batchwise, + run_initial_clustering, +) +from sklearn.utils import check_random_state +from tqdm import tqdm +from typing import Callable, Optional, Tuple + +import matplotlib.pyplot as plt + + +sys.setrecursionlimit(1000000000) + + +class SHADE(_AbstractDeepClusteringAlgo): + """ + A neural network (autoencoder AE) will be trained with the reconstruction loss and the d_dc loss function. + Afterward, KMeans or HDBSCAN identifies the initial clusters. + + Parameters + ---------- + batch_size : int + Size of the data batches. (default: 500) + embedding_size : int + Size of the embedding within the neural_network. (default: 10) + neural_network : torch.nn.Module + The input neural_network. If None a new Autoencoder model will be created. (default: None) + optimizer_params : dict + Parameters of the optimizer for the clustering procedure. + Can also include the learning rate. (default: {"lr": 1e-3}) + optimizer_class : torch.optim.Optimizer + The optimizer class. (default: torch.optim.Adam) + loss_fn : torch.nn.modules.loss._Loss + Loss function for the reconstruction. (default: torch.nn.MSELoss()) + custom_dataloaders : tuple + Tuple consisting of a trainloader (random order) at the first and + a test loader (non-random order) at the second position. + If None, the default dataloaders will be used. (default: None) + random_state : np.random.RandomState + Use a fixed random state to get a repeatable solution. + Can also be of type int. (default: None) + device : torch.device + If device is None then it will set to cuda if it is available. (default: None) + + Attributes + ---------- + labels_ : np.ndarray + The final labels + neural_network : torch.nn.Module + The final neural_network + + Examples + -------- + >>> from clustpy.data import create_subspace_data + >>> data, labels = create_subspace_data(1500, subspace_features=(3, 50), random_state=1) + >>> shade = SHADE() + >>> shade.fit(data) + + References + ---------- + SHADE: Deep Density-based Clustering + Anna Beer; Pascal Weber; Lukas Miklautz; Collin Leiber; Walid Durani; Christian Böhm + IEEE International Conference on Data Mining (ICDM), Abu Dhabi, United Arab Emirates, 2024, pp. 675-680, doi: 10.1109/ICDM59182.2024. + """ + + batch_size: int + neural_network: Optional[torch.nn.Module] + min_points: int + use_complete_dc_tree: bool + use_matrix_dc_distance: bool + increase_inter_cluster_distance: bool + pretrain_epochs: int + pretrain_optimizer_params: dict + clustering_epochs: int + clustering_optimizer_params: dict + embedding_size: int + optimizer_params: dict + optimizer_class: torch.optim.Optimizer + loss_fn: torch.nn.modules.loss._Loss + custom_dataloaders = Optional[Tuple[Callable, Callable]] + random_state: np.random.RandomState + device: torch.device + cluster_algorithm: ClusterMixin + cluster_algorithm_params: dict + degree_of_reconstruction: float + degree_of_density_preservation: float + + n_clusters: Optional[int] + labels_: np.ndarray + + def __init__( + self, + batch_size: int = 500, + neural_network: Optional[torch.nn.Module] = None, + min_points: int = 5, + use_complete_dc_tree: bool = True, + use_matrix_dc_distance: bool = True, + increase_inter_cluster_distance: bool = False, + pretrain_epochs: int = 0, + pretrain_optimizer_params: dict = {"lr": 1e-3}, + clustering_epochs: int = 100, + clustering_optimizer_params: dict = {"lr": 1e-3}, + embedding_size: int = 10, + optimizer_class: torch.optim.Optimizer = torch.optim.Adam, + loss_fn: torch.nn.modules.loss._Loss = torch.nn.MSELoss(), + custom_dataloaders: Optional[Tuple[Callable, Callable]] = None, + random_state: Optional[np.random.RandomState] = None, + device: Optional[torch.device] = None, + n_clusters: Optional[int] = None, + cluster_algorithm: Optional[ClusterMixin] = DCTree_Clusterer, + cluster_algorithm_params: dict = {}, + degree_of_reconstruction: float = 1.0, + degree_of_density_preservation: float = 1.0, + ): + self.batch_size = batch_size + self.min_points = min_points + self.use_complete_dc_tree = use_complete_dc_tree + self.dc_tree = None + self.use_matrix_dc_distance = use_matrix_dc_distance + self.increase_inter_cluster_distance = increase_inter_cluster_distance + self.pretrain_epochs = pretrain_epochs + self.pretrain_optimizer_params = pretrain_optimizer_params + self.clustering_epochs = clustering_epochs + self.clustering_optimizer_params = clustering_optimizer_params + self.embedding_size = embedding_size + self.neural_network = neural_network + self.optimizer_class = optimizer_class + self.loss_fn = loss_fn + self.custom_dataloaders = custom_dataloaders + self.random_state = check_random_state(random_state) + set_torch_seed(self.random_state) + self.device = detect_device(device) + self.n_clusters = n_clusters + self.cluster_algorithm = cluster_algorithm + self.cluster_algorithm_params = cluster_algorithm_params + self.degree_of_reconstruction = degree_of_reconstruction + self.degree_of_density_preservation = degree_of_density_preservation + + def fit(self, X, y=None) -> SHADE: + """ + Cluster the input dataset with the SHADE algorithm. + The resulting cluster labels will be stored in the `labels_` attribute. + + Parameters + ---------- + X : np.ndarray + The given data set. + y : np.ndarray + The labels. (can be ignored) + dc_distances : Optional[Union[np.ndarray, DCTree, SampleDCTree]] + dc_distances of X. + + Returns + ------- + self : SHADE + This instance of the SHADE algorithm. + """ + + # Create Dataloader + if self.custom_dataloaders is None: + trainloader = get_dataloader( + X, + self.batch_size, + drop_last=False, + shuffle=True, + ) + testloader = get_dataloader( + X, + self.batch_size, + drop_last=False, + shuffle=False, + ) + else: + trainloader, testloader = self.custom_dataloaders + if trainloader.batch_size != self.batch_size: + self.batch_size = trainloader.batch_size + + # Create dc_tree + if self.dc_tree is None and self.use_complete_dc_tree: + print("Build DCTree of the complete dataset") + self.dc_tree = DCTree(X, min_points=self.min_points) + + # Create and pretrain Autoencoder + if self.neural_network is None: + architecture = [X.shape[1], 512, 256, 128, self.embedding_size] + self.neural_network = FeedforwardAutoencoder(architecture) + self.neural_network = self.neural_network.to(self.device) + + if not self.neural_network.fitted: + self.neural_network = get_trained_network( + trainloader=trainloader, + data=X, + n_epochs=self.pretrain_epochs, + batch_size=self.batch_size, + optimizer_params=self.pretrain_optimizer_params, + optimizer_class=self.optimizer_class, + device=self.device, + ssl_loss_fn=self.loss_fn, + embedding_size=self.embedding_size, + neural_network=self.neural_network, + random_state=self.random_state, + ) + self.neural_network.fitted = False + + # Setup SHADE Module + self.shade_module = _SHADE_Module( + autoencoder=self.neural_network, + n_epochs=self.clustering_epochs, + min_points=self.min_points, + dc_tree=self.dc_tree, + use_matrix_dc_distance=self.use_matrix_dc_distance, + increase_inter_cluster_distance=self.increase_inter_cluster_distance, + degree_of_reconstruction=self.degree_of_reconstruction, + degree_of_density_preservation=self.degree_of_density_preservation, + optimizer_class=self.optimizer_class, + optimizer_params=self.clustering_optimizer_params, + device=self.device, + ) + if not self.neural_network.fitted: + print("Start training with clustering loss.") + self.shade_module.fit( + X, + trainloader=trainloader, + loss_fn=self.loss_fn, + testloader=testloader, + ) + self.neural_network.fitted = True + + embedding = encode_batchwise(testloader, self.neural_network) + + ( + self.n_clusters, + self.labels_, + self.cluster_centers_, + _, + ) = run_initial_clustering( + X=embedding, + n_clusters=self.n_clusters, + clustering_class=self.cluster_algorithm, + clustering_params=self.cluster_algorithm_params, + random_state=self.random_state, + ) + return self + + def predict(self, X: np.ndarray) -> np.ndarray: + """ + Predicts the labels of the input data. + + Parameters + ---------- + X : np.ndarray + The input data. + n_clusters : int + Number of clusters. Can be None if a corresponding initial_clustering_class is given, + e.g. DBSCAN / HDBSCAN. + cluster_algorithm : Clusterer, optional + Clustering algorithm which should be used on the learned embedding. (default: KMeans) + cluster_params : dict + Clustering parameters for `cluster_algorithm`. + + Returns + ------- + predicted_labels : np.ndarray + The predicted labels. + """ + + dataloader = get_dataloader( + X, + self.batch_size, + drop_last=False, + shuffle=False, + ) + + embedding = encode_batchwise(dataloader, self.neural_network) + + ( + self.n_clusters, + self.labels_, + self.cluster_centers_, + _, + ) = run_initial_clustering( + X=embedding, + n_clusters=self.n_clusters, + clustering_class=self.cluster_algorithm, + clustering_params=self.cluster_algorithm_params, + random_state=self.random_state, + ) + return self.labels_ + + def encode(self, X: np.ndarray) -> np.ndarray: + """ + Embedds the input data with the learned SHADE Autoencoder. + + Parameters + ---------- + X : np.ndarray + The input data. + + Returns + ------- + np.ndarray + Embedded input data. + """ + + dataloader = get_dataloader( + X, + self.batch_size, + drop_last=False, + shuffle=False, + ) + + embedding = encode_batchwise(dataloader, self.neural_network) + return embedding + + +class _SHADE_Module(_AbstractAutoencoder): + """ + The SHADE Autoencoder. + """ + + autoencoder: torch.nn.Module + dc_tree: Optional[DCTree] + use_matrix_dc_distance: bool + n_epochs: int + min_points: int + increase_inter_cluster_distance: bool + degree_of_reconstruction: float + degree_of_density_preservation: float + optimizer: torch.optim.Optimizer + device: torch.device + + def __init__( + self, + autoencoder: torch.nn.Module, + n_epochs=100, + min_points: int = 5, + dc_tree: Optional[DCTree] = None, + use_matrix_dc_distance: bool = True, + increase_inter_cluster_distance: bool = False, + degree_of_reconstruction: float = 1.0, + degree_of_density_preservation: float = 1.0, + optimizer_class: torch.optim.Optimizer = torch.optim.Adam, + optimizer_params: dict = {}, + use_tqdm = False, + device: Optional[torch.device] = None, + ): + super().__init__() + + self.autoencoder = autoencoder + self.n_epochs = n_epochs + self.min_points = min_points + self.dc_tree = dc_tree + self.use_matrix_dc_distance = use_matrix_dc_distance + self.increase_inter_cluster_distance = increase_inter_cluster_distance + self.degree_of_reconstruction = degree_of_reconstruction + self.degree_of_density_preservation = degree_of_density_preservation + self.optimizer = optimizer_class(list(autoencoder.parameters()), **optimizer_params) + self.use_tqdm = use_tqdm + self.device = detect_device(device) + + def encode(self, x: torch.Tensor) -> torch.Tensor: + return self.autoencoder.encode(x) + + def decode(self, embedded: torch.Tensor) -> torch.Tensor: + return self.autoencoder.decode(embedded) + + def fit( + self, + X, + trainloader: torch.utils.data.DataLoader, + loss_fn: torch.nn.modules.loss._Loss, + testloader, + ) -> _SHADE_Module: + """ + Trains the autoencoder in place. + + Parameters + ---------- + trainloader : torch.utils.data.DataLoader + Dataloader to be used for training. + dc_distances : Union[np.ndarray, DCTree, SampleDCTree] + dc_distances of the data of the dataloader. + n_epochs : int + Number of epochs for the clustering procedure. + optimizer : torch.optim.Optimizer + The optimizer for training. + loss_fn : torch.nn.modules.loss._Loss + Loss function for the reconstruction. + device : torch.device + Device to be trained on. + + Returns + ------- + self : _SHADE_Module + This instance of the _SHADE_Module. + """ + + if self.dc_tree is not None and self.use_matrix_dc_distance: + print("Compute dc_distance matrix.") + self.matrix_dc_distance = self.dc_tree.dc_distances() + + self.train() + for epoch_i in tqdm(range(self.n_epochs), file=sys.stdout, desc="Epoch", disable=not self.use_tqdm): + loss_rec_sum = [] + loss_dens_sum = [] + # Update Network + for batch in trainloader: + if len(batch[0]) <= self.min_points: + continue + + loss_rec, loss_dens = self._loss(X, batch, loss_fn) + loss = ( + self.degree_of_reconstruction * loss_rec + + self.degree_of_density_preservation * loss_dens + ) + loss_rec_sum.append(loss_rec.cpu().detach().numpy()) + loss_dens_sum.append(loss_dens.cpu().detach().numpy()) + # Backward pass - update weights + self.optimizer.zero_grad() + loss.backward() + self.optimizer.step() + + self.autoencoder.eval() + self.eval() + return self + + def _loss( + self, + X, + batch: list, + loss_fn: torch.nn.modules.loss._Loss, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the autoencoder reconstruction + d_dc loss. + + Parameters + ---------- + batch : list + The minibatch. + loss_fn : torch.nn.modules.loss._Loss + Loss function for the reconstruction. + device : torch.device + Device to be trained on. + + Returns + -------loss + loss : torch.Tensor + The final SHADE loss. + """ + + # Reconstrucion + batch_data = batch[1].to(self.device) + emb_data = self.encode(batch_data) + reconstructed = self.decode(emb_data) + loss_rec = loss_fn(reconstructed, batch_data) + + # Density loss + if self.dc_tree is None: + # Batch-wise DCTree + if self.increase_inter_cluster_distance: + dc_clusterer = DCTree_Clusterer( + min_points=self.min_points, + increase_inter_cluster_distance=self.increase_inter_cluster_distance, + ) + dc_clusterer.fit_predict(X[batch[0]]) + batch_dc_dists = torch.tensor( + dc_clusterer.dc_tree.dc_distances(), device=self.device + ) + else: + batch_dc_dists = torch.tensor( + DCTree(X[batch[0]], min_points=self.min_points).dc_distances(), + device=self.device, + ) + else: + # DCTree of all data points X + if self.use_matrix_dc_distance: + batch_dc_dists = torch.tensor( + self.matrix_dc_distance[np.ix_(batch[0], batch[0])], device=self.device + ) + else: + batch_dc_dists = torch.tensor( + self.dc_tree.dc_distances(batch[0], batch[0]), device=self.device + ) + + batch_eucl_dists = squared_euclidean_distance(emb_data, emb_data) + loss_dens = (batch_eucl_dists - batch_dc_dists).pow(2).mean() + + return loss_rec, loss_dens From 1448438ec3d391421c33a96605f59640e570d6b2 Mon Sep 17 00:00:00 2001 From: Collin Leiber Date: Thu, 21 May 2026 13:25:59 +0300 Subject: [PATCH 2/5] Shorten code and better align it to ClustPy standards. Add tests --- README.md | 1 + .../deep/_abstract_deep_clustering_algo.py | 5 - clustpy/deep/_train_utils.py | 2 +- clustpy/deep/dcdist/__init__.py | 27 - clustpy/deep/dcdist/dctree.py | 857 ------------------ clustpy/deep/dcdist/dctree_clusterer.py | 228 ----- clustpy/deep/shade.py | 554 +++++------ clustpy/deep/tests/test_shade.py | 25 + clustpy/hierarchical/__init__.py | 4 +- clustpy/hierarchical/dctree_clusterer.py | 153 ++++ .../tests/test_dctree_clusterer.py | 17 + clustpy/utils/__init__.py | 6 +- clustpy/utils/dctree.py | 607 +++++++++++++ clustpy/utils/tests/test_dctree.py | 162 ++++ 14 files changed, 1184 insertions(+), 1464 deletions(-) delete mode 100644 clustpy/deep/dcdist/__init__.py delete mode 100644 clustpy/deep/dcdist/dctree.py delete mode 100644 clustpy/deep/dcdist/dctree_clusterer.py create mode 100644 clustpy/deep/tests/test_shade.py create mode 100644 clustpy/hierarchical/dctree_clusterer.py create mode 100644 clustpy/hierarchical/tests/test_dctree_clusterer.py create mode 100644 clustpy/utils/dctree.py create mode 100644 clustpy/utils/tests/test_dctree.py diff --git a/README.md b/README.md index 67135ed..1b818f7 100644 --- a/README.md +++ b/README.md @@ -132,6 +132,7 @@ Remove clustpy via pip to avoid ambiguities during development, e.g., when chang | ENRC | [Deep Embedded Non-Redundant Clustering](https://ojs.aaai.org/index.php/AAAI/article/view/5961) | AAAI 2020 | [Link](https://gitlab.cs.univie.ac.at/lukas/enrcpublic) (Python + PyTorch) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.enrc.ENRC) | | IDEC | [Improved Deep Embedded Clustering with Local Structure Preservation](https://www.ijcai.org/proceedings/2017/243) | IJCAI 2017 | [Link](https://github.com/XifengGuo/IDEC) (Python + Keras) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.dec.IDEC) | | N2D | [N2d:(not too) deep clustering via clustering the local manifold of an autoencoded embedding](https://ieeexplore.ieee.org/document/9413131) | ICPR 2021 | [Link](https://github.com/XifengGuo/IDEC) (Python + Keras) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.ddc_n2d.N2D) | +| SHADE | [SHADE: Deep Density-based Clustering](https://ieeexplore.ieee.org/abstract/document/10884477) | ICDM 2024 | [Link](https://github.com/pasiweber/SHADE) (Python + PyTorch) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.shade.SHADE) | | VaDE | [Variational Deep Embedding: An Unsupervised and Generative Approach to Clustering](https://www.ijcai.org/proceedings/2017/0273) | IJCAI 2017 | [Link](https://github.com/slim1017/VaDE) (Python + Keras) | [Link](https://clustpy.readthedocs.io/en/latest/clustpy.deep.html#clustpy.deep.vade.VaDE) | #### Neural Networks diff --git a/clustpy/deep/_abstract_deep_clustering_algo.py b/clustpy/deep/_abstract_deep_clustering_algo.py index c776ff6..e7a5a8d 100644 --- a/clustpy/deep/_abstract_deep_clustering_algo.py +++ b/clustpy/deep/_abstract_deep_clustering_algo.py @@ -100,11 +100,6 @@ def transform(self, X: np.ndarray) -> np.ndarray: """ check_is_fitted(self, ["labels_", "neural_network_trained_", "n_features_in_"]) X, _, _ = check_parameters(X, allow_size_1=True, allow_nd=self.neural_network_trained_.allow_nd_input, estimator_obj=self) - if X.shape[1] != self.n_features_in_: - raise ValueError( - f"X has {X.shape[1]} features, but {self.__class__.__name__} " - f"is expecting {self.n_features_in_} features as input." - ) X_embed = self.neural_network_trained_.transform(X) return X_embed diff --git a/clustpy/deep/_train_utils.py b/clustpy/deep/_train_utils.py index e776d96..79bd84c 100644 --- a/clustpy/deep/_train_utils.py +++ b/clustpy/deep/_train_utils.py @@ -166,7 +166,7 @@ def get_trained_network(trainloader: torch.utils.data.DataLoader = None, data: n neural_network = get_neural_network(input_dim, embedding_size, neural_network, neural_network_class, neural_network_params, neural_network_weights, device, random_state) # Start training - if not neural_network.fitted: + if not neural_network.fitted and n_epochs > 0: print("Neural network is not fitted yet, will be pretrained.") # Pretrain neural network optimizer_params = {"lr": 1e-3} if optimizer_params is None else optimizer_params diff --git a/clustpy/deep/dcdist/__init__.py b/clustpy/deep/dcdist/__init__.py deleted file mode 100644 index 113a855..0000000 --- a/clustpy/deep/dcdist/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -from .dctree import ( - DCTree, - calculate_reachability_distance, - serialize as serialize_DCTree, - serialize_compressed as serialize_DCTree_compressed, - save as save_DCTree, - deserialize as deserialize_DCTree, - deserialize_compressed as deserialize_DCTree_compressed, - load as load_DCTree, -) - - -from .dctree_clusterer import DCTree_Clusterer - -__all__ = [ - ### DCTree ### - "DCTree", - "calculate_reachability_distance", - "serialize_DCTree", - "serialize_DCTree_compressed", - "save_DCTree", - "deserialize_DCTree", - "deserialize_DCTree_compressed", - "load_DCTree", - ### DCTree_Cluster ### - "DCTree_Clusterer", -] diff --git a/clustpy/deep/dcdist/dctree.py b/clustpy/deep/dcdist/dctree.py deleted file mode 100644 index e6299ab..0000000 --- a/clustpy/deep/dcdist/dctree.py +++ /dev/null @@ -1,857 +0,0 @@ -""" -@authors: -Pascal Weber -""" - -# Implementation of the dc-distance with a DCTree by -# - Author: Pascal Weber -# - Source: https://github.com/pasiweber/SHADE - -# Paper: Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral Clustering -# Authors: Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent -# Link: https://doi.org/10.1145/3580305.3599283 - - -from __future__ import annotations - -import gzip -import numpy as np -import sys - -from collections import deque -from multiprocessing import cpu_count -from multiprocessing.pool import ThreadPool -from sklearn.metrics import pairwise_distances -from typing import List, Literal, Optional, Sequence, Tuple, Union - - -class DCTree: - """ - The DCTree computes the dc_distances and stores them in a tree structure. - - By using the euler tour of the tree and a sparse table for range minimum queries, - the lca-elements in the tree (i.e. the dc_distances) can be computed in O(1) time. - - The function `dc_dist(i, j)` returns the dc_distance between `points[i]` and `points[j]` - in O(1) time. - The function `dc_distance(X, Y=None, access_method="tree")` returns a dc_distance matrix - with the dc_distance from each pair of `points[X]` and `points[Y]`. - See the section Functions for more details. - - The DCTree provides `serialize` and `serialize_compressed` functions to serialize the - DCTree. With `deserialize` or `deserialize_compressed` the DCTree can be deserialized again. - - The DCTree provides `save` and `load` functions to save / load the DCTree to / from disk. - - - Parameters - ---------- - points : np.ndarray - points, of which the dc_distances should be computed of. - - min_points : int, optional - min_points parameter used for the computation of the dc_distances, by default 5. - - access_method: str - Default access_method for dc_distances, by default "tree". - - no_fastindex: bool = False - No fast index structure is constructed if this parameter is set to `True`. - Disables the functions `dc_dist` and `dc_distances` with `access_method = "dc_dist"`. - Only useful if you only use the `dc_distance` function with `access_method = "tree"`, - e.g. for calculating all pair dc_distances with `dc_dists = dc_tree.dc_distances()`. - - use_less_memory: bool = False - Don't precalculate the reachability distance. Saves quadratic RAM usage, but also needs - double the time for computing. - - - Functions - --------- - DCTree[x] or DCTree[i,j] or DCTree[X,Y]: - Returns the _DCNode of given index if `arg` is an integer or a Sequence. - - If DCTree[i,j] is used, the dc_dist of `points[i]` and `points[j]` is returned - (i and j are integer). - - If DCTree[X,Y] is used, the dc_dist matrix between each pair of `points[X]` and - `points[Y]` is returned (X and Y are np.ndarray or Sequences). - - dc_dist : (self, i: int, j: int) -> distance - returns the dc_distance between points[i] and points[j] in O(1) time. - - dc_distances : (self, X = None, Y = None, access_method = `self.accesss_method`) -> np.ndarray - Computes the dc_distance matrix between each pair of points[X] and points[Y]. - If X=None, X=range(n) is used. - If Y=None, Y=X is used. - - `access_method` (by default `self.accesss_method`): - - "tree": Traverses the tree in O(n) time (n = len(points)), - no matter the size of X / Y. - - - "dc_dist": Uses the dc_dist function and needs O(k*l) time - (k = len(X), l = len(Y))). - - "dc_dist" is often faster if X and Y are smaller than ~10% of n = len(points). - - Returns dc_dists: ndarray of shape (n_samples_X, n_samples_Y) - - - serialize : (dc_tree: DCTree) -> str - Serializes the DCTree `dc_tree` to a string. - - serialize_compressed : (dc_tree: DCTree) -> bytes - Serializes the DCTree `dc_tree` to a compressed byte array. - - save : (dc_tree: DCTree, file_path: str) -> save to disk - Saves the DCTree `dc_tree` to disk at `file_path`. - - - deserialize : (data: str, access_method = "tree", no_fastindex = False, n_jobs = None) -> DCTree - Deserializes a string `str` to a DCTree. - - deserialize_compressed : (data: bytes, access_method = "tree", no_fastindex = False, n_jobs = None) -> DCTree - Deserializes a compressed byte array `bytes` to a DCTree. - - load : (file_path: str, access_method = "tree", no_fastindex = False, n_jobs = None) -> DCTree - Loads a DCTree from disk at `file_path`. - - - Examples - -------- - >>> import dcdist - >>> points = np.array([[1,6],[2,6],[6,2],[14,17],[123,3246],[52,8323],[265,73]]) - >>> dc_tree = dcdist.DCTree(points, 5) - >>> print(dc_tree.dc_dist(2,5)) - >>> print(dc_tree.dc_distances(range(len(points)))) - >>> print(dc_tree.dc_distances([0,1], [2,3])) - - >>> s = dcdist.serialize(dc_tree) - >>> dc_tree_new = dcdist.deserialize(s) - - >>> b = dcdist.serialize_compressed(dc_tree) - >>> dc_tree_new = dcdist.deserialize_compressed(b) - - >>> dcdist.save(dc_tree, "./data.dctree") - >>> dc_tree_new = dcdist.load("./data.dctree") - - - References - ---------- - Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent. - "Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral - Clustering." KDD '23. 2023. - """ - - n: int # len(points) - min_points: int - root: _DCNode - - euler: List[_DCNode] - level: List[int] - f_occur: List[int] - level_table: _SparseTable - - access_method: str - no_fastindex: bool - n_jobs: int - no_gil: bool - - def __init__( - self, - points: np.ndarray, - min_points: int = 5, - min_points_mr: Optional[int] = None, - access_method: str = "tree", - no_fastindex: bool = False, - use_less_memory: bool = False, - n_jobs: Optional[int] = None, - precomputed=False, - ): - self.n = points.shape[0] - self.min_points = min_points - self.access_method = access_method - self.no_fastindex = no_fastindex - - if min_points_mr is None: - min_points_mr = min_points - - if n_jobs is None or n_jobs == 0: - self.n_jobs = cpu_count() - elif n_jobs < 0: - self.n_jobs = max(cpu_count() + 1 + n_jobs, 1) - else: - self.n_jobs = n_jobs - self.no_gil = sys.flags.nogil if hasattr(sys.flags, "nogil") else False - - if use_less_memory and not precomputed: - mst_edges = self._get_mst_edges(points, use_less_memory=True) - else: - if not precomputed: - reach_dists = calculate_reachability_distance(points, min_points_mr) - else: - reach_dists = points - mst_edges = self._get_mst_edges(reach_dists) - if not precomputed: - del reach_dists - mst_edges.sort(order="dist") - self.root = self._build_tree(mst_edges) - del mst_edges - - if not self.no_fastindex: - self._init_fast_index() - - def _init_fast_index(self): - n_nodes = 2 * self.n - 1 - self.euler = [self.root] * (2 * n_nodes - 1) - self.level = [0] * (2 * n_nodes - 1) - self.f_occur = [-1] * n_nodes - self._euler_tour(self.root) - self.level_table = _SparseTable(self.level) - - def __getitem__( - self, - arg: Union[ - Union[int, slice, Sequence[int], np.ndarray], - Union[ - Tuple[int, int], - Tuple[int, np.ndarray, Sequence[int]], - Tuple[np.ndarray, Sequence[int], int], - Tuple[np.ndarray, Sequence[int], np.ndarray, Sequence[int]], - ], - ], - ) -> Union[Union[_DCNode, List[_DCNode]], Union[float, np.ndarray]]: - """ - Returns the _DCNode of given index if `arg` is an integer or a Sequence. - - If DCTree[i,j] is used, the dc_dist of `points[i]` and `points[j]` is returned - (i and j are integer). - - If DCTree[X,Y] is used, the dc_dist matrix between each pair of `points[X]` and - `points[Y]` is returned (X and Y are np.ndarray or Sequences). - """ - - index_error_msg = f"`{arg}` needs to be an integer, Sequence, np.ndarray, tuple of integer, or tuple of np.ndarray / Sequence!" - - if isinstance(arg, tuple): - if len(arg) != 2: - raise IndexError(index_error_msg) - (i, j) = arg - if isinstance(i, int) and isinstance(j, int): - return self.dc_dist(i, j) - if isinstance(i, int) and isinstance(j, (np.ndarray, Sequence)): - if isinstance(j, np.ndarray): - j = j.flatten() - return self.dc_distances([i], j) - if isinstance(i, (np.ndarray, Sequence)) and isinstance(j, int): - if isinstance(i, np.ndarray): - i = i.flatten() - return self.dc_distances(i, [j]) - elif isinstance(i, (np.ndarray, Sequence)) and isinstance(j, (np.ndarray, Sequence)): - if isinstance(i, np.ndarray): - i = i.flatten() - if isinstance(j, np.ndarray): - j = j.flatten() - return self.dc_distances(i, j) - else: - raise IndexError(index_error_msg) - - elif isinstance(arg, int): - return self.euler[self.f_occur[arg]] - elif isinstance(arg, (Sequence, np.ndarray)): - return [self.euler[self.f_occur[i]] for i in arg] - elif isinstance(arg, slice): - return [self.euler[i] for i in self.f_occur[arg]] - - else: - raise IndexError(index_error_msg) - - def __repr__(self): - if self.root is None: - return "" - - # pointer_right = "└──" - # pointer_left = "├──" if self.root.right else "└──" - pointer_right = " " - pointer_left = " " if self.root.right else " " - return ( - f"{self.root}" - f"{self.__repr__help(self.root.left, pointer_left, '', self.root.right is not None)}" - f"{self.__repr__help(self.root.right, pointer_right, '', False)}" - ) - - def __repr__help(self, node: Optional[_DCNode], pointer: str, padding: str, has_right_sibling: bool): - if node is None: - return "" - - # padding_for_both = padding + ("| " if has_right_sibling else " ") - # pointer_right = "└──" - # pointer_left = "├──" if node.right else "└──" - padding_for_both = padding + (" " if has_right_sibling else " ") - pointer_right = " " - pointer_left = " " if node.right else " " - return ( - f"\n {padding.replace('|', ' ')}// #region" - f"\n{padding}{pointer}{node}" - f"{self.__repr__help(node.left, pointer_left, padding_for_both, node.right is not None)}" - f"{self.__repr__help(node.right, pointer_right, padding_for_both, False)}" - f"\n {padding.replace('|', ' ')}// #endregion" - ) - - def dc_dist(self, i: int, j: int) -> float: - """Returns the dc_distance from points[i] to points[j] in O(1) time.""" - if i == j: - return 0 - return self._lca(i, j).dist - - def _lca(self, i: int, j: int) -> _DCNode: - first_i = self.f_occur[i] - first_j = self.f_occur[j] - if first_i > first_j: - first_i, first_j = first_j, first_i - return self.euler[self.level_table.query(first_i, first_j)] - - def dc_distances( - self, - X: Union[Sequence[int], np.ndarray, None] = None, - Y: Union[Sequence[int], np.ndarray, None] = None, - access_method: Optional[str] = None, - ) -> np.ndarray: - """ - Computes the dc_distance matrix between each pair of points[X] and points[Y]. - If X=None, X=range(n) is used. - If Y=None, Y=X is used. - - `access_method` (by default `self.access_method`): - - "tree": traverses the tree in O(n) time (n = len(points)), - no matter the size of X / Y. - - "dc_dist": uses the dc_dist function and needs O(k*l) time - (k = len(X), l = len(Y))). - - "dc_dist" is often faster if X and Y are smaller than ~10% of n = len(points). - - Returns dc_dists: ndarray of shape (n_samples_X, n_samples_Y) - """ - - if X is None: - X = range(self.n) - - if Y is None: - Y = X - - if access_method is None: - access_method = self.access_method - - if access_method == "dc_dist" and self.no_fastindex: - print("No fastindex computed. Fallback to access_method: `tree`.") - access_method = "tree" - - if access_method == "dc_dist" and not self.no_fastindex: - dc_dists = np.zeros((len(X), len(Y))) - - for i in range(len(X)): - for j in range(i + 1 if X is Y else 0, len(Y)): - if X[i] != Y[j]: - dc_dists[i, j] = self.dc_dist(X[i], Y[j]) - if X is Y: - dc_dists = dc_dists + dc_dists.T - return dc_dists - - elif access_method == "tree": - dc_dists = np.zeros((len(X), len(Y))) - - idx_rev_X = np.full(self.n, -1) - idx_rev_X[X] = range(len(X)) - - if X is Y: - idx_rev_Y = idx_rev_X - else: - idx_rev_Y = np.full(self.n, -1) - idx_rev_Y[Y] = range(len(Y)) - - inodes = self.get_internal_nodes() - inodes_start = 0 - inodes_end = len(inodes) - - n_jobs = self.n_jobs if self.no_gil else 4 - pool = ThreadPool(n_jobs) - n_inodes = self.n - 1 - chunk_size = int(np.ceil(n_inodes / n_jobs)) - start_points = range(inodes_start, inodes_end, chunk_size) - - def func(start): - for i in range(start, min(start + chunk_size, inodes_end)): - inode = inodes[i] - # inode always has left and right node - i_leaves = inode.left.leaves - j_leaves = inode.right.leaves - - (i_, j_) = (idx_rev_X[i_leaves], idx_rev_Y[j_leaves]) - (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) - dc_dists[(i_[:, np.newaxis], j_[np.newaxis, :])] = inode.dist - - if X is Y: - dc_dists[(j_[:, np.newaxis], i_[np.newaxis, :])] = inode.dist - else: - (i_, j_) = (idx_rev_Y[i_leaves], idx_rev_X[j_leaves]) - (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) - dc_dists[(j_[:, np.newaxis], i_[np.newaxis, :])] = inode.dist - - pool.map(func, start_points) - return dc_dists - - raise ValueError(f"'{access_method}' is no valid `access_method`") - - def _get_mst_edges(self, dist_matrix: np.ndarray, use_less_memory: bool = False) -> np.ndarray: - """Prim's algorithm to build up the minimum spanning tree in O(n^2) time.""" - # dist_matrix are the points, if use_less_memory=True - n = self.n - nodes_min_dist = np.full(n, np.inf) - parent = np.full(n, 0) - not_in_mst = np.full(n, True) - u = 0 # Start node - nodes_min_dist[u] = 0 - not_in_mst[u] = False - mst_edges = np.empty((n - 1), dtype=([("i", int), ("j", int), ("dist", float)])) - - if use_less_memory: - reach_dists = np.empty((dist_matrix.shape[0])) - for i in range(dist_matrix.shape[0]): - eucl_dists = np.linalg.norm(dist_matrix - dist_matrix[i], axis=1) - reach_dists[i] = np.max(np.partition(eucl_dists, self.min_points)[: self.min_points]) - - for i in range(n - 1): - if use_less_memory: - dist_u = np.linalg.norm(dist_matrix - dist_matrix[u], axis=1) # Euclidean distances - dist_u = np.maximum(dist_u, reach_dists[u]) # reachability distance of i - dist_u = np.maximum(dist_u, reach_dists) # reachability distance of all points - - v = np.where(not_in_mst & (dist_u < nodes_min_dist))[0] - nodes_min_dist[v] = dist_u[v] - else: - v = np.where(not_in_mst & (dist_matrix[u] < nodes_min_dist))[0] - nodes_min_dist[v] = dist_matrix[u, v] - parent[v] = u - - arg = np.argmin(nodes_min_dist[not_in_mst]) - u = np.arange(n)[not_in_mst][arg] - mst_edges[i] = (parent[u], u, nodes_min_dist[u]) - not_in_mst[u] = False - - return mst_edges - - def _build_tree(self, mst_edges: np.ndarray) -> _DCNode: - """Kruskal's algorithm to build up the DCTree with the precomputed - and sorted mst_edges in O(n) time.""" - union_find = _UnionFind(self.n) - node = _DCNode(id=0, dist=0.0, leaves=[0]) - idx = self.n - k = len(mst_edges) - for i, j, dist in mst_edges: - i_root = union_find.find(i) - j_root = union_find.find(j) - node = _DCNode( - id=idx, - dist=dist, - k=-1 if (len(i_root.leaves) + len(j_root.leaves)) < self.min_points else k, - left=i_root, - right=j_root, - leaves=i_root.leaves + j_root.leaves, - ) - i_root.parent = node - j_root.parent = node - union_find.union(i, j, node) - idx += 1 - if not node.k == -1: - k -= 1 - return node - - def _euler_tour(self, root: _DCNode): - """Euler tour to get the euler, level, and f_occur lists in O(n) time.""" - DOWN, UP = 0, 1 - stack: deque[Tuple[_DCNode, int, Literal[0, 1]]] = deque([(root, 0, DOWN)]) # (node, level, DOWN / UP) - - pos = 0 - while len(stack): - (node, level, status) = stack.pop() - - if status == DOWN: - if self.f_occur[node.id] == -1: - self.f_occur[node.id] = pos - - self.euler[pos] = node - self.level[pos] = level - pos += 1 - - if node.right: - stack.append((node, level, UP)) - stack.append((node.right, level + 1, DOWN)) - if node.left: - stack.append((node, level, UP)) - stack.append((node.left, level + 1, DOWN)) - - elif status == UP: - self.euler[pos] = node - self.level[pos] = level - pos += 1 - - def get_internal_nodes(self): - nodes = [] - stack: deque[_DCNode] = deque([self.root]) - - while len(stack): - node = stack.pop() - - if node and node.left and node.right: - nodes.append(node) - - if node.left: - stack.append(node.left) - - if node.right: - stack.append(node.right) - return nodes - - def traverse_until_k(self, k): - from queue import PriorityQueue - - result_nodes = set([self.root]) - if k == 1: - return result_nodes - - stack = PriorityQueue() - stack.put((-self.root.dist, self.root, self.root)) - while not stack.empty(): - _, node, parent_node = stack.get() - - if node is None: - return - - if node.k >= 0: - if node.left.k != -1: - if node.left.k != -1 and node.right.k != -1: - result_nodes.discard(parent_node) - result_nodes.discard(node) - result_nodes.add(node.left) - if node.parent.left.k == -1 or node.parent.right.k == -1: - stack.put((-node.left.dist, node.left, parent_node)) - else: - stack.put((-node.left.dist, node.left, node)) - if node.right.k != -1: - if node.left.k != -1 and node.right.k != -1: - result_nodes.discard(parent_node) - result_nodes.discard(node) - result_nodes.add(node.right) - if node.parent.left.k == -1 or node.parent.right.k == -1: - stack.put((-node.right.dist, node.right, parent_node)) - else: - stack.put((-node.right.dist, node.right, node)) - - if len(result_nodes) >= k: - break - - return result_nodes - - def get_k_center(self, k): - nodes = self.traverse_until_k(k) - labels = np.full(self.n, -1) - for i, node in enumerate(nodes): - labels[np.array(node.leaves)] = i - return labels - - def get_eps_for_k(self, k, eps=-3e-12): - nodes = self.traverse_until_k(k) - min_eps = np.inf - for node in nodes: - min_eps = min(min_eps, node.parent.dist) - return min_eps + eps - - -def _serialize(root: _DCNode): - tree_list = [] - stack: deque[Optional[_DCNode]] = deque([root]) - while len(stack): - node = stack.pop() - if node is None: - tree_list.append("/") - elif node.left is None and node.right is None: - tree_list.append(f"{node.id}|{node.dist}") - else: - tree_list.append(f"'{node.id}|{node.dist}") - stack.append(node.right) - stack.append(node.left) - return ",".join(tree_list) - - -def serialize(dc_tree: DCTree) -> str: - """Serializes the DCTree `dc_tree` to a string.""" - - sep = "<\1dc_tree>" - # Tree structure, min_points, n_jobs, no_gil - return _serialize(dc_tree.root) + sep + str(dc_tree.min_points) - - -def serialize_compressed(dc_tree: DCTree) -> bytes: - """Serializes the DCTree `dc_tree` to a gzip-compressed byte array.""" - data = serialize(dc_tree) - byte_data = bytes(data, "utf-8") - return gzip.compress(byte_data) - - -def save(dc_tree: DCTree, file_path: str) -> None: - """Saves the DCTree `dc_tree` to disk at `file_path`.""" - byte_data = serialize_compressed(dc_tree) - with open(file_path, "wb") as file: - file.write(byte_data) - - -def _deserialize(data: List[str]) -> _DCNode: - id, dist = data[0].split("|") - root = _DCNode(id=int(id[1:]), dist=float(dist), leaves=[]) - - DOWN, UP = 0, 1 - stack: deque[Literal[0, 1]] = deque([UP, DOWN, DOWN]) - nodes: deque[_DCNode] = deque([root]) - res: deque[Optional[_DCNode]] = deque([]) - - pos = 0 - while len(stack): - status = stack.pop() - - if status == DOWN: - pos += 1 - if data[pos] == "/": - res.append(None) - continue - id, dist = data[pos].split("|") - if id[0] != "'": - res.append(_DCNode(id=int(id), dist=0, leaves=[int(id)])) - continue - inode = _DCNode(id=int(id[1:]), dist=float(dist), leaves=[]) - nodes.append(inode) - stack.extend([UP, DOWN, DOWN]) - - elif status == UP: - node = nodes.pop() - node.right = res.pop() - node.left = res.pop() - res.append(node) - node.leaves = node.left.leaves + node.right.leaves - return root - - -def deserialize( - data: str, - access_method: Optional[str] = None, - no_fastindex: bool = False, - n_jobs: Optional[int] = None, -) -> DCTree: - """Deserializes a string `str` to a DCTree.""" - dc_tree = object.__new__(DCTree) - - sep = "<\1dc_tree>" - # Tree structure, min_points, n_jobs, no_gil - (tree_data, min_points) = data.split(sep) - dc_tree.min_points = int(min_points) - dc_tree.access_method = access_method if access_method is not None else "tree" - dc_tree.n_jobs = n_jobs if n_jobs else cpu_count() - dc_tree.no_gil = sys.flags.nogil if hasattr(sys.flags, "nogil") else False - - tree_data_list = tree_data.split(",") - dc_tree.root = _deserialize(tree_data_list) - dc_tree.n = (len(tree_data_list) + 1) // 2 - - dc_tree.no_fastindex = no_fastindex - if not dc_tree.no_fastindex: - dc_tree._init_fast_index() - return dc_tree - - -def deserialize_compressed( - compressed_data: bytes, - access_method: Optional[str] = None, - no_fastindex: bool = False, - n_jobs: Optional[int] = None, -) -> DCTree: - """Deserializes a compressed byte array `bytes` to a DCTree.""" - byte_data = gzip.decompress(compressed_data) - data = str(byte_data, "utf-8") - return deserialize(data, access_method, no_fastindex, n_jobs=n_jobs) - - -def load( - file_path: str, - access_method: Optional[str] = None, - no_fastindex: bool = False, - n_jobs: Optional[int] = None, -) -> DCTree: - """Loads a DCTree from disk at `file_path`.""" - file = open(file_path, "rb") - byte_data = file.read() - file.close() - return deserialize_compressed(byte_data, access_method, no_fastindex, n_jobs=n_jobs) - - -def calculate_reachability_distance( - points: np.ndarray, min_points: int = 5, n_jobs: Optional[int] = None -) -> np.ndarray: - """ - Calculates the reachability distance of points using the min_points threshold in O(n^2) time. - - Raises a ValueError if min_points is larger than the number of points. - """ - - if min_points > points.shape[0]: - raise ValueError(f"Min points ({min_points}) can't exceed the size of the dataset ({points.shape[0]})") - - eucl_dists = pairwise_distances(points, metric="euclidean", n_jobs=n_jobs) - - if min_points > 1: - # Get reachability for each point with respect to min_points parameter - reach_dists = np.empty((points.shape[0])) - for i in range(points.shape[0]): - reach_dists[i] = np.max(np.partition(eucl_dists[i], min_points)[:min_points]) - np.maximum(eucl_dists, reach_dists[np.newaxis, :], eucl_dists) - np.maximum(eucl_dists, reach_dists[:, np.newaxis], eucl_dists) - np.fill_diagonal(eucl_dists, 0) - return eucl_dists - - -class _DCNode: - id: int - dist: float - leaves: List[int] - left: Optional[_DCNode] - right: Optional[_DCNode] - parent: _DCNode - k: int - - def __init__( - self, - id: int, - dist: float, - leaves: List[int], - left: Optional[_DCNode] = None, - right: Optional[_DCNode] = None, - parent=None, - k=-1, - ): - self.id = id - self.dist = dist - self.leaves = leaves - self.left = left - self.right = right - if not parent: - self.parent = self - else: - self.parent = parent - self.k = k - - def __repr__(self): - return f"DCNode #{self.id} ({self.dist}) - {self.k}" - - def __lt__(self, other): - return self.dist < other.dist - - -class _UnionFind: - """ - UnionFind structure which provides the functions `find` and `union` with amortized - time complexity of O(α(n)), where α(n) is the inverse Ackermann function. - """ - - root: List[_DCNode] - parent: List[int] - rank: List[int] - - def __init__(self, n: int): - self.root = [_DCNode(id=i, dist=0, leaves=[i]) for i in range(n)] - self.parent = list(range(n)) - self.rank = [1] * n - - def find(self, x: int) -> _DCNode: - """Finds the representative node of x.""" - return self.root[self._find(x)] - - def _find(self, x: int) -> int: - """Finds the representative of x.""" - parent = self.parent[x] - if parent != x: - parent = self._find(parent) - self.parent[x] = parent - return parent - - def union(self, x: int, y: int, node: _DCNode): - """Union the set which contains x with the set which contains y.""" - xset = self._find(x) - yset = self._find(y) - if xset == yset: - return - - # Put smaller ranked item under bigger ranked item if ranks are different - if self.rank[xset] < self.rank[yset]: - self.parent[xset] = yset - self.root[yset] = node - elif self.rank[xset] > self.rank[yset]: - self.parent[yset] = xset - self.root[xset] = node - - # If ranks are same, then move y under x (doesn't matter which one goes where) - # and increment rank of x's tree - else: - self.parent[yset] = xset - self.root[xset] = node - self.rank[xset] = self.rank[xset] + 1 - - -class _SparseTable: - """ - SparseTable structure which provides the function `query` which finds the index of the - minimum value within the range [l,r] in the array `arr`. - Needs O(n * logn) time and storage to precompute and O(1) for the query. - """ - - # arr: List[int] - arr: np.ndarray - # sparse_table: List[List[int]] - sparse_table: np.ndarray - pow_2: List[int] - log_2: List[int] - - def __init__(self, arr: List[int]): - self.arr = np.asarray(arr, dtype=np.int64) - n = len(arr) - log_n = n.bit_length() - 1 - # self.sparse_table = [[-1] * log_n for _ in range(n - 1)] - self.sparse_table = np.full((n, log_n), -1, dtype=np.int64) - - self.pow_2 = [1 << i for i in range(log_n + 1)] - self.log_2 = [i.bit_length() - 1 for i in range(n)] - - # for i in range(0, n - 1): - # self.sparse_table[i][0] = i if self.arr[i] < self.arr[i + 1] else i + 1 - idx = np.arange(n - 1, dtype=np.int64) - self.sparse_table[: n - 1, 0] = np.where(self.arr[idx] < self.arr[idx + 1], idx, idx + 1) - - for j in range(1, log_n): - step = self.pow_2[j] # 2**j - limit = n - self.pow_2[j + 1] + 1 # number of start positions - left_idx = self.sparse_table[:limit, j - 1] # L for i = 0..limit‑1 - right_idx = self.sparse_table[step : step + limit, j - 1] # R for i = 0..limit‑1 - choose_left = self.arr[left_idx] <= self.arr[right_idx] - - self.sparse_table[:limit, j] = np.where(choose_left, left_idx, right_idx) - - # for i in range(n - self.pow_2[j + 1] + 1): - # L = self.sparse_table[i][j - 1] - # R = self.sparse_table[i + step][j - 1] - # self.sparse_table[i][j] = L if arr[L] <= arr[R] else R - - def query(self, l: int, r: int) -> int: - # assert L > R, f"L value ({L}) needs to be larger than R value ({R})" - if l == r: - return l - k = self.log_2[r - l + 1] - l_k = self.sparse_table[l][k - 1] - r_k = self.sparse_table[r - self.pow_2[k] + 1][k - 1] - return l_k if self.arr[l_k] <= self.arr[r_k] else r_k diff --git a/clustpy/deep/dcdist/dctree_clusterer.py b/clustpy/deep/dcdist/dctree_clusterer.py deleted file mode 100644 index e89b7c8..0000000 --- a/clustpy/deep/dcdist/dctree_clusterer.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -@authors: -Pascal Weber -""" - -# - Author: Pascal Weber -# - Source: https://github.com/pasiweber/SHADE - -from __future__ import annotations - -import numpy as np - -from collections import deque -from .dctree import DCTree, _DCNode as _DCDist_DCNode -from typing import List, Literal, Optional, Sequence, Tuple, Union - -from sklearn.base import ClusterMixin, BaseEstimator - - -class DCTree_Clusterer(ClusterMixin, BaseEstimator): - """ - Parameters - ---------- - points : np.ndarray - points, of which the dc_distances should be computed of. - - min_points : int, optional - min_points parameter used for the computation of the dc_distances, by default 5. - """ - - min_points: int - increase_inter_cluster_distance: bool - - dc_tree: DCTree - labels_: np.ndarray - - def __init__( - self, - min_points: int = 5, - increase_inter_cluster_distance: bool = True, - ): - self.min_points = min_points - self.increase_inter_cluster_distance = increase_inter_cluster_distance - - def fit(self, X, y=None): - self.dc_tree = DCTree(X, min_points=self.min_points) - self.labels_ = self.clustering(self.dc_tree) - self.n_clusters = len(np.where(np.unique(self.labels_) >= 0)[0]) - - if self.increase_inter_cluster_distance: - self._increase_inter_cluster_distance(self.dc_tree.root) - - return self - - def _condense_tree(self, node: Optional[_DCDist_DCNode]) -> Optional[_DCNode]: - if node is None: - return None - - if node.left is None: - right = self._condense_tree(node.right) - if right is not None: - right.id = node.id - right.leaves = node.leaves - return right - - if node.right is None: - left = self._condense_tree(node.left) - if left is not None: - left.id = node.id - left.leaves = node.leaves - return left - - left_size = len(node.left.leaves) - right_size = len(node.right.leaves) - - if right_size >= self.min_points and left_size >= self.min_points: - node_left = self._condense_tree(node.left) - node_right = self._condense_tree(node.right) - if node_left is None and node_right is None: - return _DCNode(node.id, node.dist, node.leaves, node_left, node_right) - if node_left is None: - return node_right - if node_right is None: - return node_left - return _DCNode(node.id, node.dist, node.leaves, node_left, node_right) - - if right_size < self.min_points and left_size < self.min_points: - return None - - if right_size < self.min_points: - left = self._condense_tree(node.left) - if left is not None: - left.id = node.id - left.leaves = node.leaves - return left - - if left_size < self.min_points: - right = self._condense_tree(node.right) - if right is not None: - right.id = node.id - right.leaves = node.leaves - return right - - def _calculate_stability(self, node: _DCNode): - if node.left is None and node.right is None: - return - - if node.left is not None: - self._calculate_stability(node.left) - node.left.stability = (1.0 / node.left.dist - 1.0 / node.dist) * len(node.left.leaves) - - if node.right is not None: - self._calculate_stability(node.right) - node.right.stability = (1.0 / node.right.dist - 1.0 / node.dist) * len( - node.right.leaves - ) - - def _stable_node_flagging(self, node: Optional[_DCNode]): - if node is None: - return - - node.is_stable = False - - if node.left is None and node.right is None: - node.is_stable = True - return - - if node.left is None: - self._stable_node_flagging(node.right) - if node.stability < node.right.stability: - node.stability = node.right.stability - node.is_stable = False - return - else: - node.is_stable = True - return - - if node.right is None: - self._stable_node_flagging(node.left) - if node.stability < node.left.stability: - node.stability = node.left.stability - node.is_stable = False - return - else: - node.is_stable = True - return - - self._stable_node_flagging(node.right) - self._stable_node_flagging(node.left) - - if node.stability < node.left.stability + node.right.stability: - node.stability = node.left.stability + node.right.stability - node.is_stable = False - return - else: - # Node is a true cluster - node.is_stable = True - return - - def _get_stable_clusters(self, node: Optional[_DCNode]) -> list[_DCNode]: - if node is None: - return [] - - if node.is_stable: - self.dc_tree[node.id].is_stable = True - return [node] - - else: - return self._get_stable_clusters(node.left) + self._get_stable_clusters(node.right) - - def _increase_inter_cluster_distance(self, root: _DCDist_DCNode): - queue = deque() - queue.append(root) - while len(queue): - node = queue.popleft() - - if node.left is None and node.right is None: - continue - - if hasattr(node, "is_stable") and node.is_stable: - continue - - node.dist += max( - node.left.dist * len(node.left.leaves), node.right.dist * len(node.right.leaves) - ) - # node.dist += max( - # (node.dist - node.left.dist) * len(node.left.leaves), (node.dist - node.right.dist) * len(node.right.leaves) - # ) - queue.append(node.left) - queue.append(node.right) - - def stable_nodes(self, dc_tree: DCTree) -> list[_DCNode]: - new_root = self._condense_tree(dc_tree.root) - if new_root is None: - return [] - - self._calculate_stability(new_root) - new_root.stability = 0 - - self._stable_node_flagging(new_root) - new_root.is_stable = False - - stable_nodes = self._get_stable_clusters(new_root) - - stable_nodes_ = [] - for node in stable_nodes: - if len(node.leaves) >= self.min_points: - stable_nodes_.append(node) - - return stable_nodes_ - - def clustering(self, dc_tree: DCTree): - stable_nodes = self.stable_nodes(dc_tree) - - labels = np.full(dc_tree.n, -1) - - idx = 0 - for i in range(len(stable_nodes)): - labels[stable_nodes[i].leaves] = idx - idx += 1 - return labels - - -class _DCNode(_DCDist_DCNode): - is_stable: Optional[Union[Literal[True], Literal[False]]] - stability: Optional[float] - left: _DCNode - right: _DCNode diff --git a/clustpy/deep/shade.py b/clustpy/deep/shade.py index 28a389b..5bf50bb 100644 --- a/clustpy/deep/shade.py +++ b/clustpy/deep/shade.py @@ -4,70 +4,92 @@ """ from __future__ import annotations - import numpy as np import torch -import sys - -from .dcdist import DCTree, DCTree_Clusterer +from clustpy.utils import DCTree +from clustpy.hierarchical import DCTree_Clusterer from clustpy.deep._abstract_deep_clustering_algo import _AbstractDeepClusteringAlgo -from clustpy.deep.neural_networks._abstract_autoencoder import _AbstractAutoencoder -from clustpy.deep.neural_networks import FeedforwardAutoencoder -from clustpy.deep._data_utils import get_dataloader +from clustpy.deep._data_utils import get_train_and_test_dataloader from clustpy.deep._train_utils import get_trained_network -from clustpy.deep._utils import ( - detect_device, - set_torch_seed, - squared_euclidean_distance, - encode_batchwise, - run_initial_clustering, -) -from sklearn.utils import check_random_state -from tqdm import tqdm +from clustpy.deep._utils import detect_device, squared_euclidean_distance, encode_batchwise, run_initial_clustering, mean_squared_error +from clustpy.utils.checks import check_parameters +import tqdm from typing import Callable, Optional, Tuple - -import matplotlib.pyplot as plt - - -sys.setrecursionlimit(1000000000) +from sklearn.utils.validation import check_is_fitted class SHADE(_AbstractDeepClusteringAlgo): """ + The Structure-preserving High-dimensional Analysis with Density-based Exploration (SHADE) algorithm. A neural network (autoencoder AE) will be trained with the reconstruction loss and the d_dc loss function. Afterward, KMeans or HDBSCAN identifies the initial clusters. Parameters ---------- + clustering_class : ClusterMixin + clustering class to obtain the cluster labels after getting the embedding (default: DCTree_Clusterer) + clustering_params : dict + parameters for the clustering class. If None, it will be set to {"min_points": min_points} (default: None) + min_points : int + the minimum number of points (default: 5) + use_complete_dc_tree : bool + Defines whether the complete DC Tree should be used instead of a batch-wise version (default: True) + use_matrix_dc_distance: bool + Defines whether the matrix DC distance should be stored - can cause memory issues (default: True) + use_less_memory: bool + Use less memory when constructing the DCTree. + This will, however, increase the runtime (default: False) batch_size : int Size of the data batches. (default: 500) - embedding_size : int - Size of the embedding within the neural_network. (default: 10) - neural_network : torch.nn.Module - The input neural_network. If None a new Autoencoder model will be created. (default: None) - optimizer_params : dict - Parameters of the optimizer for the clustering procedure. - Can also include the learning rate. (default: {"lr": 1e-3}) + pretrain_optimizer_params : dict + parameters of the optimizer for the pretraining of the neural network, includes the learning rate. + If None, it will be set to {"lr": 1e-3}. (default: None) + clustering_optimizer_params : dict + parameters of the optimizer for the actual clustering procedure, includes the learning rate. If None, it will be set to {"lr": 1e-4} (default: None) + pretrain_epochs : int + number of epochs for the pretraining of the neural network. (default: 0) + clustering_epochs : int + number of epochs for the actual clustering procedure (default: 100) optimizer_class : torch.optim.Optimizer - The optimizer class. (default: torch.optim.Adam) - loss_fn : torch.nn.modules.loss._Loss - Loss function for the reconstruction. (default: torch.nn.MSELoss()) + the optimizer class (default: torch.optim.Adam) + ssl_loss_fn : Callable | torch.nn.modules.loss._Loss + self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders (default: mean_squared_error) + neural_network : torch.nn.Module | tuple + the input neural network. If None, a new FeedforwardAutoencoder will be created. + Can also be a tuple consisting of the neural network class (torch.nn.Module) and the initialization parameters (dict) (default: None) + neural_network_weights : str + Path to a file containing the state_dict of the neural_network (default: None) + embedding_size : int + size of the embedding within the neural network (default: 10) + density_loss_weight : float + weight of the density loss compared to the reconstruction loss (default: 1.0) + ssl_loss_weight : float + weight of the self-supervised learning (ssl) loss (default: 1.0) custom_dataloaders : tuple - Tuple consisting of a trainloader (random order) at the first and - a test loader (non-random order) at the second position. - If None, the default dataloaders will be used. (default: None) - random_state : np.random.RandomState - Use a fixed random state to get a repeatable solution. - Can also be of type int. (default: None) + tuple consisting of a trainloader (random order) at the first and a test loader (non-random order) at the second position. + Can also be a tuple of strings, where the first entry is the path to a saved trainloader and the second entry the path to a saved testloader. + In this case the dataloaders will be loaded by torch.load(PATH). + If None, the default dataloaders will be used (default: None) device : torch.device - If device is None then it will set to cuda if it is available. (default: None) + The device on which to perform the computations. + If device is None then it will be automatically chosen: if a gpu is available the gpu with the highest amount of free memory will be chosen (default: None) + random_state : np.random.RandomState | int + use a fixed random state to get a repeatable solution. Can also be of type int (default: None) Attributes ---------- + n_clusters_ : int + The final number of clusters labels_ : np.ndarray The final labels - neural_network : torch.nn.Module - The final neural_network + cluster_centers_ : np.ndarray + The final cluster centers defined as the mean of assigned samples within the AE embedding + dc_tree_ : DCTree + The dc tree + neural_network_trained_ : torch.nn.Module + The final neural network + n_features_in_ : int + the number of features used for the fitting Examples -------- @@ -83,80 +105,48 @@ class SHADE(_AbstractDeepClusteringAlgo): IEEE International Conference on Data Mining (ICDM), Abu Dhabi, United Arab Emirates, 2024, pp. 675-680, doi: 10.1109/ICDM59182.2024. """ - batch_size: int - neural_network: Optional[torch.nn.Module] - min_points: int - use_complete_dc_tree: bool - use_matrix_dc_distance: bool - increase_inter_cluster_distance: bool - pretrain_epochs: int - pretrain_optimizer_params: dict - clustering_epochs: int - clustering_optimizer_params: dict - embedding_size: int - optimizer_params: dict - optimizer_class: torch.optim.Optimizer - loss_fn: torch.nn.modules.loss._Loss - custom_dataloaders = Optional[Tuple[Callable, Callable]] - random_state: np.random.RandomState - device: torch.device - cluster_algorithm: ClusterMixin - cluster_algorithm_params: dict - degree_of_reconstruction: float - degree_of_density_preservation: float - - n_clusters: Optional[int] - labels_: np.ndarray - def __init__( self, - batch_size: int = 500, - neural_network: Optional[torch.nn.Module] = None, - min_points: int = 5, + clustering_class : Optional[ClusterMixin] = DCTree_Clusterer, + clustering_params : dict = None, + min_points : int = 5, use_complete_dc_tree: bool = True, use_matrix_dc_distance: bool = True, - increase_inter_cluster_distance: bool = False, - pretrain_epochs: int = 0, - pretrain_optimizer_params: dict = {"lr": 1e-3}, - clustering_epochs: int = 100, - clustering_optimizer_params: dict = {"lr": 1e-3}, - embedding_size: int = 10, + use_less_memory: bool = False, + batch_size: int = 500, + pretrain_optimizer_params: dict = None, + clustering_optimizer_params : dict = None, + pretrain_epochs : int = 0, + clustering_epochs : int = 100, optimizer_class: torch.optim.Optimizer = torch.optim.Adam, - loss_fn: torch.nn.modules.loss._Loss = torch.nn.MSELoss(), - custom_dataloaders: Optional[Tuple[Callable, Callable]] = None, - random_state: Optional[np.random.RandomState] = None, - device: Optional[torch.device] = None, - n_clusters: Optional[int] = None, - cluster_algorithm: Optional[ClusterMixin] = DCTree_Clusterer, - cluster_algorithm_params: dict = {}, - degree_of_reconstruction: float = 1.0, - degree_of_density_preservation: float = 1.0, + ssl_loss_fn : Callable | torch.nn.modules.loss._Loss = mean_squared_error, + neural_network : torch.nn.Module | tuple = None, + neural_network_weights : str = None, + embedding_size : int = 10, + density_loss_weight : float = 1.0, + ssl_loss_weight : float = 1.0, + custom_dataloaders : tuple = None, + device : torch.device = None, + random_state : np.random.RandomState | int = None, ): - self.batch_size = batch_size + super().__init__(batch_size, neural_network, neural_network_weights, embedding_size, device, random_state) + self.clustering_class = clustering_class + self.clustering_params = clustering_params self.min_points = min_points self.use_complete_dc_tree = use_complete_dc_tree - self.dc_tree = None self.use_matrix_dc_distance = use_matrix_dc_distance - self.increase_inter_cluster_distance = increase_inter_cluster_distance - self.pretrain_epochs = pretrain_epochs + self.use_less_memory = use_less_memory self.pretrain_optimizer_params = pretrain_optimizer_params - self.clustering_epochs = clustering_epochs self.clustering_optimizer_params = clustering_optimizer_params - self.embedding_size = embedding_size - self.neural_network = neural_network + self.pretrain_epochs = pretrain_epochs + self.clustering_epochs = clustering_epochs self.optimizer_class = optimizer_class - self.loss_fn = loss_fn + self.ssl_loss_fn = ssl_loss_fn + self.density_loss_weight = density_loss_weight + self.ssl_loss_weight = ssl_loss_weight self.custom_dataloaders = custom_dataloaders - self.random_state = check_random_state(random_state) - set_torch_seed(self.random_state) - self.device = detect_device(device) - self.n_clusters = n_clusters - self.cluster_algorithm = cluster_algorithm - self.cluster_algorithm_params = cluster_algorithm_params - self.degree_of_reconstruction = degree_of_reconstruction - self.degree_of_density_preservation = degree_of_density_preservation - def fit(self, X, y=None) -> SHADE: + def fit(self, X: np.ndarray, y: np.ndarray=None) -> SHADE: """ Cluster the input dataset with the SHADE algorithm. The resulting cluster labels will be stored in the `labels_` attribute. @@ -167,342 +157,218 @@ def fit(self, X, y=None) -> SHADE: The given data set. y : np.ndarray The labels. (can be ignored) - dc_distances : Optional[Union[np.ndarray, DCTree, SampleDCTree]] - dc_distances of X. Returns ------- self : SHADE This instance of the SHADE algorithm. """ - - # Create Dataloader - if self.custom_dataloaders is None: - trainloader = get_dataloader( - X, - self.batch_size, - drop_last=False, - shuffle=True, - ) - testloader = get_dataloader( - X, - self.batch_size, - drop_last=False, - shuffle=False, - ) - else: - trainloader, testloader = self.custom_dataloaders - if trainloader.batch_size != self.batch_size: - self.batch_size = trainloader.batch_size - + X, _, random_state, pretrain_optimizer_params, _, _ = self._check_parameters(X, y=y) + clustering_optimizer_params = {"lr": 1e-3} if self.clustering_optimizer_params is None else self.clustering_optimizer_params + clustering_params = {"min_points": self.min_points, "use_less_memory": self.use_less_memory} if self.clustering_params is None else self.clustering_params + device = detect_device(self.device) + trainloader, testloader, batch_size = get_train_and_test_dataloader(X, self.batch_size, self.custom_dataloaders) + assert batch_size >= self.min_points, f"Batch_size ({batch_size}) cannot be smaller than min_points ({self.min_points})" # Create dc_tree - if self.dc_tree is None and self.use_complete_dc_tree: - print("Build DCTree of the complete dataset") - self.dc_tree = DCTree(X, min_points=self.min_points) - + if self.use_complete_dc_tree: + self.dc_tree_ = DCTree(X, min_points=self.min_points, use_less_memory=self.use_less_memory) + else: + self.dc_tree_ = None # Create and pretrain Autoencoder - if self.neural_network is None: - architecture = [X.shape[1], 512, 256, 128, self.embedding_size] - self.neural_network = FeedforwardAutoencoder(architecture) - self.neural_network = self.neural_network.to(self.device) - - if not self.neural_network.fitted: - self.neural_network = get_trained_network( - trainloader=trainloader, - data=X, - n_epochs=self.pretrain_epochs, - batch_size=self.batch_size, - optimizer_params=self.pretrain_optimizer_params, - optimizer_class=self.optimizer_class, - device=self.device, - ssl_loss_fn=self.loss_fn, - embedding_size=self.embedding_size, - neural_network=self.neural_network, - random_state=self.random_state, - ) - self.neural_network.fitted = False - + neural_network_params = {"layers": [X.shape[1], 512, 256, 128, self.embedding_size]} + neural_network = get_trained_network(trainloader, n_epochs=self.pretrain_epochs, + optimizer_params=pretrain_optimizer_params, optimizer_class=self.optimizer_class, + device=device, ssl_loss_fn=self.ssl_loss_fn, embedding_size=self.embedding_size, + neural_network=self.neural_network, + neural_network_weights=self.neural_network_weights, neural_network_params=neural_network_params, + random_state=random_state) # Setup SHADE Module - self.shade_module = _SHADE_Module( - autoencoder=self.neural_network, + shade_module = _SHADE_Module( n_epochs=self.clustering_epochs, + neural_network=neural_network, min_points=self.min_points, - dc_tree=self.dc_tree, + dc_tree=self.dc_tree_, use_matrix_dc_distance=self.use_matrix_dc_distance, - increase_inter_cluster_distance=self.increase_inter_cluster_distance, - degree_of_reconstruction=self.degree_of_reconstruction, - degree_of_density_preservation=self.degree_of_density_preservation, - optimizer_class=self.optimizer_class, - optimizer_params=self.clustering_optimizer_params, - device=self.device, + device=device, + ssl_loss_fn=self.ssl_loss_fn, + density_loss_weight=self.density_loss_weight, + ssl_loss_weight=self.ssl_loss_weight ) - if not self.neural_network.fitted: - print("Start training with clustering loss.") - self.shade_module.fit( - X, - trainloader=trainloader, - loss_fn=self.loss_fn, - testloader=testloader, - ) - self.neural_network.fitted = True - - embedding = encode_batchwise(testloader, self.neural_network) - - ( - self.n_clusters, - self.labels_, - self.cluster_centers_, - _, - ) = run_initial_clustering( - X=embedding, - n_clusters=self.n_clusters, - clustering_class=self.cluster_algorithm, - clustering_params=self.cluster_algorithm_params, - random_state=self.random_state, + optimizer = self.optimizer_class(list(neural_network.parameters()), **clustering_optimizer_params) + shade_module.fit(X, trainloader, optimizer) + # Get labels + embedded_data = encode_batchwise(testloader, neural_network) + n_clusters, labels, cluster_centers, _ = run_initial_clustering( + X=embedded_data, + n_clusters=None, + clustering_class=self.clustering_class, + clustering_params=clustering_params, + random_state=random_state, ) + self.n_clusters_ = n_clusters + self.labels_ = labels + self.cluster_centers_ = cluster_centers + self.neural_network_trained_ = neural_network + self.set_n_featrues_in(X) return self def predict(self, X: np.ndarray) -> np.ndarray: """ Predicts the labels of the input data. + Note that this is just a very imprecise estimation as we are not using the DC Tree to predict the labels. + The prediction is calculated by checking the distance to the clostest mean of samples in a cluster within the embedding of the AE. Parameters ---------- X : np.ndarray - The input data. - n_clusters : int - Number of clusters. Can be None if a corresponding initial_clustering_class is given, - e.g. DBSCAN / HDBSCAN. - cluster_algorithm : Clusterer, optional - Clustering algorithm which should be used on the learned embedding. (default: KMeans) - cluster_params : dict - Clustering parameters for `cluster_algorithm`. + input data Returns ------- predicted_labels : np.ndarray - The predicted labels. - """ - - dataloader = get_dataloader( - X, - self.batch_size, - drop_last=False, - shuffle=False, - ) - - embedding = encode_batchwise(dataloader, self.neural_network) - - ( - self.n_clusters, - self.labels_, - self.cluster_centers_, - _, - ) = run_initial_clustering( - X=embedding, - n_clusters=self.n_clusters, - clustering_class=self.cluster_algorithm, - clustering_params=self.cluster_algorithm_params, - random_state=self.random_state, - ) - return self.labels_ - - def encode(self, X: np.ndarray) -> np.ndarray: + The predicted labels """ - Embedds the input data with the learned SHADE Autoencoder. + check_is_fitted(self, ["labels_", "neural_network_trained_", "n_features_in_"]) + X, _, _ = check_parameters(X, allow_size_1=True, allow_nd=self.neural_network_trained_.allow_nd_input, estimator_obj=self) + print("WARNING: predict does not use the embedding of the manifold and is, therefore, just a very rough estimate") + predicted_labels = super().predict(X) + return predicted_labels - Parameters - ---------- - X : np.ndarray - The input data. - - Returns - ------- - np.ndarray - Embedded input data. - """ - - dataloader = get_dataloader( - X, - self.batch_size, - drop_last=False, - shuffle=False, - ) - embedding = encode_batchwise(dataloader, self.neural_network) - return embedding - - -class _SHADE_Module(_AbstractAutoencoder): - """ - The SHADE Autoencoder. +class _SHADE_Module(torch.nn.Module): """ + The _SHADE_Module. Contains most of the algorithm specific procedures like the loss function. - autoencoder: torch.nn.Module - dc_tree: Optional[DCTree] + Parameters + ---------- + n_epochs : int + number of epochs for the clustering procedure + neural_network : torch.nn.Module + the neural network + min_points : int + the minimum number of points + dc_tree : Optional[DCTree] + the DCTree use_matrix_dc_distance: bool - n_epochs: int - min_points: int - increase_inter_cluster_distance: bool - degree_of_reconstruction: float - degree_of_density_preservation: float - optimizer: torch.optim.Optimizer - device: torch.device + Defines whether the matrix DC distance should be stored - can cause memory issues + device : torch.device + device to be trained on + ssl_loss_fn : Callable | torch.nn.modules.loss._Loss + self-supervised learning (ssl) loss function for training the network, e.g. reconstruction loss for autoencoders + density_loss_weight : float + weight of the clustering loss + ssl_loss_weight : float + weight of the self-supervised learning (ssl) loss + """ def __init__( self, - autoencoder: torch.nn.Module, - n_epochs=100, - min_points: int = 5, - dc_tree: Optional[DCTree] = None, - use_matrix_dc_distance: bool = True, - increase_inter_cluster_distance: bool = False, - degree_of_reconstruction: float = 1.0, - degree_of_density_preservation: float = 1.0, - optimizer_class: torch.optim.Optimizer = torch.optim.Adam, - optimizer_params: dict = {}, - use_tqdm = False, - device: Optional[torch.device] = None, + n_epochs : int, + neural_network: torch.nn.Module, + min_points: int, + dc_tree: Optional[DCTree], + use_matrix_dc_distance: bool, + device: torch.device, + ssl_loss_fn: Callable | torch.nn.modules.loss._Loss, + density_loss_weight: float, + ssl_loss_weight: float ): super().__init__() - - self.autoencoder = autoencoder self.n_epochs = n_epochs + self.neural_network = neural_network self.min_points = min_points self.dc_tree = dc_tree self.use_matrix_dc_distance = use_matrix_dc_distance - self.increase_inter_cluster_distance = increase_inter_cluster_distance - self.degree_of_reconstruction = degree_of_reconstruction - self.degree_of_density_preservation = degree_of_density_preservation - self.optimizer = optimizer_class(list(autoencoder.parameters()), **optimizer_params) - self.use_tqdm = use_tqdm - self.device = detect_device(device) - - def encode(self, x: torch.Tensor) -> torch.Tensor: - return self.autoencoder.encode(x) - - def decode(self, embedded: torch.Tensor) -> torch.Tensor: - return self.autoencoder.decode(embedded) + self.device = device + self.ssl_loss_fn = ssl_loss_fn + self.density_loss_weight = density_loss_weight + self.ssl_loss_weight = ssl_loss_weight def fit( self, - X, + X: np.ndarray, trainloader: torch.utils.data.DataLoader, - loss_fn: torch.nn.modules.loss._Loss, - testloader, + optimizer: torch.optim.Optimizer ) -> _SHADE_Module: """ - Trains the autoencoder in place. + Trains the _SHADE_Module in place. Parameters ---------- + X : np.ndarray + The data trainloader : torch.utils.data.DataLoader - Dataloader to be used for training. - dc_distances : Union[np.ndarray, DCTree, SampleDCTree] - dc_distances of the data of the dataloader. - n_epochs : int - Number of epochs for the clustering procedure. + dataloader to be used for training optimizer : torch.optim.Optimizer - The optimizer for training. - loss_fn : torch.nn.modules.loss._Loss - Loss function for the reconstruction. - device : torch.device - Device to be trained on. + the optimizer for training Returns ------- self : _SHADE_Module This instance of the _SHADE_Module. """ - if self.dc_tree is not None and self.use_matrix_dc_distance: - print("Compute dc_distance matrix.") - self.matrix_dc_distance = self.dc_tree.dc_distances() - + matrix_dc_distance = self.dc_tree.dc_distances() + matrix_dc_distance_torch = torch.tensor(matrix_dc_distance, device=self.device) + else: + matrix_dc_distance_torch = None self.train() - for epoch_i in tqdm(range(self.n_epochs), file=sys.stdout, desc="Epoch", disable=not self.use_tqdm): - loss_rec_sum = [] - loss_dens_sum = [] + tbar = tqdm.trange(self.n_epochs, desc="SHADE training") + for _ in tbar: # Update Network for batch in trainloader: if len(batch[0]) <= self.min_points: continue - - loss_rec, loss_dens = self._loss(X, batch, loss_fn) - loss = ( - self.degree_of_reconstruction * loss_rec - + self.degree_of_density_preservation * loss_dens - ) - loss_rec_sum.append(loss_rec.cpu().detach().numpy()) - loss_dens_sum.append(loss_dens.cpu().detach().numpy()) + loss = self._loss(X, batch, matrix_dc_distance_torch) # Backward pass - update weights - self.optimizer.zero_grad() + optimizer.zero_grad() loss.backward() - self.optimizer.step() - - self.autoencoder.eval() + optimizer.step() + postfix_str = {"Loss": loss} + tbar.set_postfix(postfix_str) + self.neural_network.eval() self.eval() return self def _loss( self, - X, + X: np.ndarray, batch: list, - loss_fn: torch.nn.modules.loss._Loss, + matrix_dc_distance_torch: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: """ Calculate the autoencoder reconstruction + d_dc loss. Parameters ---------- + X : np.ndarray + The data batch : list The minibatch. - loss_fn : torch.nn.modules.loss._Loss - Loss function for the reconstruction. - device : torch.device - Device to be trained on. + matrix_dc_distance_torch : torch.Tensor + A matrix containing pairwise dc distances Returns - -------loss + ------- loss : torch.Tensor The final SHADE loss. """ - # Reconstrucion - batch_data = batch[1].to(self.device) - emb_data = self.encode(batch_data) - reconstructed = self.decode(emb_data) - loss_rec = loss_fn(reconstructed, batch_data) - + ssl_loss, embedded, _ = self.neural_network.loss(batch, self.ssl_loss_fn, self.device) # Density loss if self.dc_tree is None: # Batch-wise DCTree - if self.increase_inter_cluster_distance: - dc_clusterer = DCTree_Clusterer( - min_points=self.min_points, - increase_inter_cluster_distance=self.increase_inter_cluster_distance, - ) - dc_clusterer.fit_predict(X[batch[0]]) - batch_dc_dists = torch.tensor( - dc_clusterer.dc_tree.dc_distances(), device=self.device - ) - else: - batch_dc_dists = torch.tensor( - DCTree(X[batch[0]], min_points=self.min_points).dc_distances(), - device=self.device, - ) + dc_distances = DCTree(X[batch[0]], min_points=self.min_points).dc_distances() + batch_dc_dists = torch.tensor(dc_distances, device=self.device) else: # DCTree of all data points X if self.use_matrix_dc_distance: - batch_dc_dists = torch.tensor( - self.matrix_dc_distance[np.ix_(batch[0], batch[0])], device=self.device - ) + idxs = batch[0].to(self.device) + batch_dc_dists = matrix_dc_distance_torch[idxs[:, None], idxs[None, :]] else: - batch_dc_dists = torch.tensor( - self.dc_tree.dc_distances(batch[0], batch[0]), device=self.device - ) - - batch_eucl_dists = squared_euclidean_distance(emb_data, emb_data) + dc_distances = self.dc_tree.dc_distances(batch[0], batch[0]) + batch_dc_dists = torch.tensor(dc_distances, device=self.device) + batch_eucl_dists = squared_euclidean_distance(embedded, embedded) loss_dens = (batch_eucl_dists - batch_dc_dists).pow(2).mean() - - return loss_rec, loss_dens + loss = self.ssl_loss_weight * ssl_loss + self.density_loss_weight * loss_dens + return loss diff --git a/clustpy/deep/tests/test_shade.py b/clustpy/deep/tests/test_shade.py new file mode 100644 index 0000000..487ff34 --- /dev/null +++ b/clustpy/deep/tests/test_shade.py @@ -0,0 +1,25 @@ +from clustpy.deep import SHADE +import numpy as np +from clustpy.utils.checks import check_clustpy_estimator +from clustpy.deep.tests._helpers_for_tests import _test_dc_algorithm_simple, _test_dc_algorithm_with_augmentation + + +def test_shade_estimator(): + # Ignore check_methods_subset_invariance due to numerical issues + check_clustpy_estimator(SHADE(min_points=2, pretrain_epochs=0, clustering_epochs=3), + ("check_complex_data", "check_methods_subset_invariance")) + + +def test_shade(): + shade = SHADE() + _test_dc_algorithm_simple(shade, check_predict=False) + + +def test_shade_wo_matrix_distance(): + shade = SHADE(use_matrix_dc_distance=False) + _test_dc_algorithm_simple(shade, check_predict=False) + + +def test_shade_wo_complete_dc_tree(): + shade = SHADE(use_complete_dc_tree=False) + _test_dc_algorithm_simple(shade, check_predict=False) diff --git a/clustpy/hierarchical/__init__.py b/clustpy/hierarchical/__init__.py index e13ac0a..51ee526 100644 --- a/clustpy/hierarchical/__init__.py +++ b/clustpy/hierarchical/__init__.py @@ -1,3 +1,5 @@ from .diana import Diana +from .dctree_clusterer import DCTree_Clusterer -__all__ = ["Diana"] +__all__ = ["Diana", + "DCTree_Clusterer"] diff --git a/clustpy/hierarchical/dctree_clusterer.py b/clustpy/hierarchical/dctree_clusterer.py new file mode 100644 index 0000000..46f5ad2 --- /dev/null +++ b/clustpy/hierarchical/dctree_clusterer.py @@ -0,0 +1,153 @@ +""" +@authors: +Pascal Weber +""" + +# - Author: Pascal Weber +# - Source: https://github.com/pasiweber/SHADE + +from __future__ import annotations +import numpy as np +import sys +from clustpy.utils.dctree import DCTree, _DCNode +from typing import Optional +from sklearn.base import ClusterMixin, BaseEstimator +from clustpy.utils.checks import check_parameters + + +sys.setrecursionlimit(1000000000) + + +class DCTree_Clusterer(ClusterMixin, BaseEstimator): + """ + The DCTree clustering algorithm. + Identifies stable nodes within the DCTree and labels the data accordingly. + + Parameters + ---------- + min_points : int + the minimum number of points (default: 5) + use_less_memory: bool + Use less memory when constructing the DCTree. + This will, however, increase the runtime (default: False) + + Attributes + ---------- + n_clusters_ : int + The final number of clusters + labels_ : np.ndarray + The final labels + dc_tree_ : BinaryClusterTree + The resulting cluster tree + n_features_in_ : int + the number of features used for the fitting + + References + ---------- + SHADE: Deep Density-based Clustering + Anna Beer; Pascal Weber; Lukas Miklautz; Collin Leiber; Walid Durani; Christian Böhm + IEEE International Conference on Data Mining (ICDM), Abu Dhabi, United Arab Emirates, 2024, pp. 675-680, doi: 10.1109/ICDM59182.2024. + """ + + def __init__(self, min_points: int = 5, use_less_memory: bool = False): + self.min_points = min_points + self.use_less_memory = use_less_memory + + def fit(self, X: np.ndarray, y: np.ndarray=None) -> 'DCTree_Clusterer': + """ + Initiate the actual clustering process on the input data set. + The resulting cluster labels will be stored in the labels_ attribute. + + Parameters + ---------- + X : np.ndarray + the given data set + y : np.ndarray + the labels (can be ignored) + + Returns + ------- + self : DCTree_Clusterer + this instance of the DCTree_Clusterer algorithm + """ + X, _, _ = check_parameters(X=X, y=y) + self.dc_tree_ = DCTree(X, min_points=self.min_points, use_less_memory=self.use_less_memory) + condensed_root = self._condense(self.dc_tree_.root) + stable_nodes = self._get_stable_nodes(condensed_root) + labels = np.full(self.dc_tree_.n, -1 if stable_nodes else 0, dtype=np.int32) + self.n_clusters_ = len(stable_nodes) if stable_nodes else 1 + for idx, node in enumerate(stable_nodes): + labels[node.leaves] = idx + self.labels_ = labels + self.n_features_in_ = X.shape[1] + return self + + def _condense(self, node: Optional[_DCNode]) -> Optional[_DCNode]: + """ + Condense the tree to nodes, where both children contain at least min_points leaves. + Uses a recursive strategy that checks each node separately. + + Parameters + ---------- + node : _DCNode + the node that is checked for its children + + Returns + ------- + condensed_node : _DCNode + either a condensed node or None + """ + if node is None or len(node.leaves) < self.min_points: + return None + # Process children + L = self._condense(node.left) + R = self._condense(node.right) + # If both children have min_points children, keep this branch. + size_L = len(node.left.leaves) if node.left is not None else 0 + size_R = len(node.right.leaves) if node.right is not None else 0 + if size_L >= self.min_points and size_R >= self.min_points: + if (L is not None and R is not None) or (L is None and R is None): + return _DCNode(node.id, node.dist, node.leaves, L, R) + elif L is not None: + return L + else: + return R + elif L is not None: + return _DCNode(node.id, L.dist, node.leaves, L.left, L.right) + elif R is not None: + return _DCNode(node.id, R.dist, node.leaves, R.left, R.right) + return None + + def _get_stable_nodes(self, node: Optional[_DCNode], parent_dist: float = None) -> list: + """ + Identify stable nodes in the tree. + + Parameters + ---------- + node : _DCNode + the node that is checked. + parent_dist : float + Distance in the parent node. Can be None in the case of the root (default: None) + + Returns + ------- + stable_nodes : list + list of stable nodes + """ + if node is None: # In case root is None + return [] + node.stability_ = (1.0/node.dist - 1.0/parent_dist) * len(node.leaves) if parent_dist is not None else 0 + # Calculate stability for children + sum_child_stabilities = 0 + child_results = [] + for child in [node.left, node.right]: + if child is not None: + child_results += self._get_stable_nodes(child, node.dist) + sum_child_stabilities += child.stability_ + # Flag stable nodes + if node.stability_ >= sum_child_stabilities: + stable_nodes = [node] + else: + node.stability_ = sum_child_stabilities + stable_nodes = child_results + return stable_nodes diff --git a/clustpy/hierarchical/tests/test_dctree_clusterer.py b/clustpy/hierarchical/tests/test_dctree_clusterer.py new file mode 100644 index 0000000..e3f798a --- /dev/null +++ b/clustpy/hierarchical/tests/test_dctree_clusterer.py @@ -0,0 +1,17 @@ +from clustpy.hierarchical import DCTree_Clusterer +from clustpy.utils.checks import check_clustpy_estimator +from sklearn.datasets import make_blobs +import numpy as np + + +def test_dctree_clusterer_estimator(): + check_clustpy_estimator(DCTree_Clusterer(min_points=2), ("check_complex_data")) + + +def test_simple_dctree_clusterer(): + X, labels = make_blobs(200, 4, centers=3, random_state=1) + dctree_clusterer = DCTree_Clusterer() + assert not hasattr(dctree_clusterer, "labels_") + dctree_clusterer.fit(X) + assert dctree_clusterer.labels_.dtype == np.int32 + assert dctree_clusterer.labels_.shape == labels.shape diff --git a/clustpy/utils/__init__.py b/clustpy/utils/__init__.py index a485729..8970132 100644 --- a/clustpy/utils/__init__.py +++ b/clustpy/utils/__init__.py @@ -3,6 +3,7 @@ from .diptest import dip_test, dip_pval, dip_boot_samples, dip_gradient, dip_pval_gradient, plot_dip from .plots import plot_with_transformation, plot_image, plot_scatter_matrix, plot_histogram, plot_1d_data, \ plot_2d_data, plot_3d_data +from .dctree import DCTree, reachability_distances, minimum_spanning_tree_prims __all__ = ['evaluate_dataset', 'evaluate_multiple_datasets', @@ -22,4 +23,7 @@ 'dip_gradient', 'dip_pval_gradient', 'plot_dip', - 'evaluation_df_to_latex_table'] + 'evaluation_df_to_latex_table', + 'DCTree', + 'reachability_distances', + 'minimum_spanning_tree_prims'] diff --git a/clustpy/utils/dctree.py b/clustpy/utils/dctree.py new file mode 100644 index 0000000..60f1751 --- /dev/null +++ b/clustpy/utils/dctree.py @@ -0,0 +1,607 @@ +""" +@authors: +Pascal Weber +""" + +# Implementation of the dc-distance with a DCTree by +# - Author: Pascal Weber +# - Source: https://github.com/pasiweber/SHADE + +# Paper: Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral Clustering +# Authors: Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent +# Link: https://doi.org/10.1145/3580305.3599283 + + +from __future__ import annotations +import numpy as np +from typing import List, Optional, Sequence, Tuple, Union +from scipy.spatial.distance import pdist, cdist, squareform + + +class DCTree: + """ + The DCTree computes the dc_distances and stores them in a tree structure. + By using the euler tour of the tree and a sparse table for range minimum queries, + the lca-elements in the tree (i.e. the dc_distances) can be computed in O(1) time. + The function `dc_dist(i, j)` returns the dc_distance between `points[i]` and `points[j]` + in O(1) time. + The function `dc_distance(idx_X, idx_Y=None, access_method="tree")` returns a dc_distance matrix + with the dc_distance from each pair of `points[idx_X]` and `points[idx_Y]`. + See the section Functions for more details. + + Parameters + ---------- + X : np.ndarray + points, of which the dc_distances should be computed of. + min_points : int, optional + min_points parameter used for the computation of the dc_distances (default: 5). + use_less_memory: bool + Use less memory when constructing the DCTree. + This will, however, increase the runtime (default: False). + + Functions + --------- + DCTree[x]: + Returns the _DCNode of given index. + dc_dist : (i, j) -> distance + returns the dc_distance between points[i] and points[j] in O(1) time. + dc_distances : (idx_X, idx_Y) -> np.ndarray + Computes the dc_distance matrix between each pair of points[idx_X] and points[idx_Y]. + Returns dc_dists as ndarray of shape (n_samples_X, n_samples_Y) + + Examples + -------- + >>> points = np.array([[1,6],[2,6],[6,2],[14,17],[123,3246],[52,8323],[265,73]]) + >>> dc_tree = dcdist.DCTree(points, 5) + >>> print(dc_tree.dc_dist(2,5)) + >>> print(dc_tree.dc_distances(range(len(points)))) + >>> print(dc_tree.dc_distances([0,1], [2,3])) + + References + ---------- + Anna Beer, Andrew Draganov, Ellen Hohma, Philipp Jahn, Christian M.M. Frey, and Ira Assent. + "Connecting the Dots -- Density-Connectivity Distance unifies DBSCAN, k-Center and Spectral + Clustering." KDD '23. 2023. + """ + + def __init__( + self, + X: np.ndarray, + min_points: int = 5, + use_less_memory: bool = False + ): + self.n = X.shape[0] + self.min_points = min_points + if not use_less_memory: + # Calculate pair-wise reachability distance + X = reachability_distances(X, min_points) + mst_edges = minimum_spanning_tree_prims(X, use_less_memory=use_less_memory, min_points=min_points) + self.root = self._build_tree(mst_edges) + self._init_fast_index() + + def _build_tree(self, mst_edges: np.ndarray) -> _DCNode: + """ + Build the final DCTree using the MST edges. + + Parameters + ---------- + mst_edges : np.ndarray + The edges of the MST, including the dc-distances + + Returns + ------- + new_root_node : _DCNode + The root node of the tree + """ + mst_edges.sort(order="dist") + # Create leaf nodes + leaf_nodes = [_DCNode(idx, 0.0, [idx]) for idx in range(self.n)] + idx = self.n + for i, j, dist in mst_edges: + root_i = self._get_root(leaf_nodes[i]) + root_j = self._get_root(leaf_nodes[j]) + new_leaves = root_i.leaves + root_j.leaves + new_root_node = _DCNode( + id=idx, + dist=dist, + left=root_i, + right=root_j, + leaves=new_leaves, + ) + root_i.parent = new_root_node + root_i.root = new_root_node + root_j.parent = new_root_node + root_j.root = new_root_node + idx += 1 + return new_root_node + + def _get_root(self, node: _DCNode) -> '_DCNode': + """ + Get the root of a node and update all root_ entries that have been visited. + + Parameters + ---------- + node : _DCNode + The input node + + Returns + ------- + root_node : _DCNode + The root node + """ + root_node = node + while root_node.root is not None: + root_node = root_node.root + # Override old roots + if root_node is not node: + current_node = node + while current_node.root != root_node: + next_node = current_node.root + current_node.root = root_node + current_node = next_node + return root_node + + def _init_fast_index(self) -> None: + """ + Build a fast index structure for the DCTree. + """ + n_nodes = 2 * self.n - 1 + self.euler = [] + self.level = [] + self.f_occur = [None] * n_nodes + # Euler tour to get the euler, level, and f_occur lists in O(n) time. + DOWN, UP = 0, 1 + stack = [(self.root, 0, DOWN)] # (node, level, DOWN / UP) + while len(stack) > 0: + (node, level, status) = stack.pop() + if status == DOWN: + if self.f_occur[node.id] is None: + self.f_occur[node.id] = len(self.euler) + self.euler.append(node) + self.level.append(level) + if node.right is not None: + stack.append((node, level, UP)) + stack.append((node.right, level + 1, DOWN)) + if node.left is not None: + stack.append((node, level, UP)) + stack.append((node.left, level + 1, DOWN)) + elif status == UP: + self.euler.append(node) + self.level.append(level) + # SparseTable structure which finds the index of the minimum value within the range [l,r] in self.level + levels = np.array(self.level, dtype=int) + n_elements = len(levels) + log_n = n_elements.bit_length() + self.sparse_table = np.zeros((log_n, n_elements), dtype=int) + self.sparse_table[0] = np.arange(n_elements) + for j in range(1, log_n): + stride = 1 << (j - 1) + limit = n_elements - (1 << j) + 1 + if limit <= 0: + break + left_indices = self.sparse_table[j - 1, :limit] + right_indices = self.sparse_table[j - 1, stride : stride + limit] + choose_left = levels[left_indices] <= levels[right_indices] + self.sparse_table[j, :limit] = np.where(choose_left, left_indices, right_indices) + + def __getitem__( + self, + point_idx: Union[int, Sequence[int], np.ndarray] + ) -> Union[_DCNode, List[_DCNode]]: + """ + Returns the _DCNode of given index if `point_idx` is an integer or a list of _DCNodes if `point_idx' is a Sequence. + + Parameters + ---------- + point_idx : Union[int, Sequence[int], np.ndarray] + The input parameter as described above + + Returns + ------- + result : Union[_DCNode, List[_DCNode]] + The querried nodes in the tree + """ + if isinstance(point_idx, int): + return self.euler[self.f_occur[point_idx]] + elif isinstance(point_idx, (Sequence, np.ndarray)): + return [self.euler[self.f_occur[i]] for i in point_idx] + else: + raise IndexError(f"`{point_idx}` needs to be an integer, Sequence or np.ndarray!") + + def dc_dist(self, i: int, j: int) -> float: + """ + Returns the dc_distance between points[i] and points[j] in O(1) time. + + Parameters + ---------- + i : int + index of the first point + j : int + index of the second point + + Returns + ------- + dc_distance : float + The dc_distance of the two points + """ + if i == j: + return 0.0 + l = self.f_occur[i] + r = self.f_occur[j] + if l > r: + l, r = r, l + k = (r - l + 1).bit_length() - 1 + idx_l = self.sparse_table[k, l] + idx_r = self.sparse_table[k, r - (1 << k) + 1] + ancestor = self.euler[idx_l if self.level[idx_l] <= self.level[idx_r] else idx_r] + dc_distance = ancestor.dist + return dc_distance + + def dc_distances( + self, + idx_X: Union[Sequence[int], np.ndarray, None] = None, + idx_Y: Union[Sequence[int], np.ndarray, None] = None, + access_method: str = "tree", + ) -> np.ndarray: + """ + Computes the dc_distance matrix between each pair of points[idx_X] and points[idx_Y]. + If idx_X=None, idx_X=range(n) is used. + If idx_Y=None, idx_Y=idx_X is used. + + Parameters + ---------- + idx_X : Union[Sequence[int], np.ndarray, None] + the first set of indices + idx_Y : Union[Sequence[int], np.ndarray, None] + the second set of indices + access_method : str + "tree": traverses the tree in O(n) time (n = len(points)), no matter the size of X / Y. + "dc_dist": uses the dc_dist function and needs O(k*l) time (k = len(X), l = len(Y))). + (default: "tree") + + Returns + ------- + dc_dists : np.array + ndarray of shape (n_samples_X, n_samples_Y) containing the distances + """ + if idx_X is None: + idx_X = range(self.n) + if idx_Y is None: + idx_Y = idx_X + dc_dists = np.zeros((len(idx_X), len(idx_Y))) + if access_method == "dc_dist": + # Get distances from the index structure + for i in range(len(idx_X)): + start_index = i + 1 if idx_X is idx_Y else 0 + for j in range(start_index, len(idx_Y)): + dc_dists[i, j] = self.dc_dist(idx_X[i], idx_Y[j]) + elif access_method == "tree": + # Get distances from the tree + idx_rev_X = np.full(self.n, -1, dtype=int) + idx_rev_X[idx_X] = range(len(idx_X)) + if idx_X is idx_Y: + idx_rev_Y = idx_rev_X + else: + idx_rev_Y = np.full(self.n, -1, dtype=int) + idx_rev_Y[idx_Y] = range(len(idx_Y)) + node_list = [self.root] + while len(node_list) > 0: + node = node_list.pop() + if node.left is not None: + node_list.append(node.left) + if node.right is not None: + node_list.append(node.right) + if node.left is not None and node.right is not None: + i_leaves = node.left.leaves + j_leaves = node.right.leaves + (i_, j_) = (idx_rev_X[i_leaves], idx_rev_Y[j_leaves]) + (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) + if len(i_) > 0 and len(j_) > 0: + dc_dists[(i_[:, np.newaxis], j_[np.newaxis, :])] = node.dist + # In case of asymmetric call + if idx_X is not idx_Y: + (i_, j_) = (idx_rev_Y[i_leaves], idx_rev_X[j_leaves]) + (i_, j_) = (i_[i_ != -1], j_[j_ != -1]) + if len(i_) > 0 and len(j_) > 0: + dc_dists[(j_[:, np.newaxis], i_[np.newaxis, :])] = node.dist + else: + raise ValueError(f"'{access_method}' is no valid `access_method`") + if idx_X is idx_Y: + # Mirror values + dc_dists = dc_dists + dc_dists.T + return dc_dists + + + def _traverse_until_k_clusters(self, n_clusters: int) -> List[_DCNode]: + """ + Traverse the tree to identify n_clusters nodes that minimize the maximum within-cluster distance. + + Parameters + ---------- + n_clusters : int + the number of desired clusters + + Returns + ------- + result_nodes : List[_DCNode] + List of nodes, where each node represents the root of a cluster + """ + assert n_clusters <= self.n, "n_clusters can not be larger than the number of input points" + node_list = [self.root] + result_nodes = [] + id_threshold = 2 * self.n - n_clusters - 1 + while len(node_list) > 0: + node = node_list.pop() + if node.id <= id_threshold: + result_nodes.append(node) + else: + if node.left is not None: + node_list.append(node.left) + if node.right is not None: + node_list.append(node.right) + assert len(result_nodes) == n_clusters, "Length of the result_nodes list is not equal to k" + return result_nodes + + def get_k_center(self, n_clusters: int) -> np.ndarray: + """ + Extract the k center approach. + It identifies clusters such that the maximum distance within a cluster is minimized. + + Parameters + ---------- + n_clusters : int + the number of desired clusters + + Returns + ------- + labels : np.ndarray + The cluster labels + """ + nodes = self._traverse_until_k_clusters(n_clusters) + labels = np.zeros(self.n, dtype=np.int32) + for i, node in enumerate(nodes): + labels[np.array(node.leaves)] = i + return labels + + def get_eps_for_k(self, n_clusters: int, eps: float = 1e-10): + """ + Receive the largest possible eps value such that one receives exectly n_clusters. + + Parameters + ---------- + n_clusters : int + the number of desired clusters + eps : float + a small constaint used to differentiate from the next potential cluster (default: 1e-10) + + Returns + ------- + min_eps : float + The resulting epsilon value + """ + assert eps > 0, "Eps has to be a positive number" + nodes = self._traverse_until_k_clusters(n_clusters) + min_eps = np.inf + for node in nodes: + if node.parent is not None: + min_eps = min(min_eps, node.parent.dist) + return min_eps - eps + + def __repr__(self) -> str: + """ + Return a string describing the DCTree. + + Returns + ------- + repr_string : str + The string representation of the tree + """ + if self.root is None: + return "" + pointer_right = " " + pointer_left = " " if self.root.right else " " + repr_string = ( + f"{self.root}" + f"{self.__repr__help(self.root.left, pointer_left, '', self.root.right is not None)}" + f"{self.__repr__help(self.root.right, pointer_right, '', False)}" + ) + return repr_string + + def __repr__help(self, node: Optional[_DCNode], pointer: str, padding: str, has_right_sibling: bool) -> str: + """ + Helper for the __repr__ function. + + Parameters + ---------- + node : Optional[_DCNode] + The current node + pointer : str + The pointer string + padding : str + The padding string + has_right_sibling : bool + True if node has a right sibbling + + Returns + ------- + repr_string : str + The string for this node + """ + if node is None: + return "" + padding_for_both = padding + (" " if has_right_sibling else " ") + pointer_right = " " + pointer_left = " " if node.right else " " + repr_string = ( + f"\n {padding.replace('|', ' ')}// #region" + f"\n{padding}{pointer}{node}" + f"{self.__repr__help(node.left, pointer_left, padding_for_both, node.right is not None)}" + f"{self.__repr__help(node.right, pointer_right, padding_for_both, False)}" + f"\n {padding.replace('|', ' ')}// #endregion" + ) + return repr_string + + +class _DCNode: + """ + A node in a DCTree. + Each node contains a set of leaves and can have a left and a right child node. + + Parameters + ---------- + id : int + the id of the node + dist : float + the dc distance + leaves : List[int] + list of the ids of its child nodes + left : Optional[_DCNode] + the left child node (default: None) + right : Optional[_DCNode] + the right child node (default: None) + parent : Optional[_DCNode] + the parent node. If None, it will be set to itself (default: None) + root : Optional[_DCNode] + the root node (default: None) + """ + + def __init__( + self, + id: int, + dist: float, + leaves: List[int], + left: Optional[_DCNode] = None, + right: Optional[_DCNode] = None, + parent: Optional[_DCNode] = None, + root: Optional[_DCNode] = None + ): + self.id = id + self.dist = dist + self.leaves = leaves + self.left = left + self.right = right + if not parent: + self.parent = self + else: + self.parent = parent + self.root = root + + def __repr__(self) -> str: + """ + Return a string containing the id and dist of this node. + + Returns + ------- + to_str : str + The string + """ + to_str = f"DCNode #{self.id} ({self.dist})" + return to_str + + def __lt__(self, other_node: _DCNode) -> bool: + """ + Less than method of the node. Compares the dist with respect to the other input node. + + Parameters + ---------- + other_node : _DCNode + the other node + + Returns + ------- + is_less : bool + True if dist is smaller than dist of the other node + """ + less_than = self.dist < other_node.dist + return less_than + + +def reachability_distances(X: np.ndarray, min_points: int = 5) -> np.ndarray: + """ + Calculates the reachability distances between points using the min_points threshold in O(n^2) time. + + Parameters + ---------- + X : np.ndarray + the points + min_points : int, optional + min_points parameter (default: 5) + + Returns + ------- + reach_distances : np.ndarray + Array containing the pair-wise reachability distances + + Raises + ------ + Raises a ValueError if min_points is larger than the number of points. + """ + if min_points > X.shape[0]: + raise ValueError(f"Min points ({min_points}) can't exceed the size of the dataset ({X.shape[0]})") + eucl_distances = squareform(pdist(X, metric="euclidean")) + if min_points > 1: + core_distances = np.partition(eucl_distances, min_points - 1, axis=1)[:, min_points - 1] + reach_distances = np.maximum(eucl_distances, np.maximum.outer(core_distances, core_distances)) + np.fill_diagonal(reach_distances, 0) + else: + reach_distances = eucl_distances + return reach_distances + + +def minimum_spanning_tree_prims(matrix: np.ndarray, use_less_memory: bool = False, min_points: int = None) -> np.ndarray: + """ + Create a Minimum-spanning-tree of a given matrix using Prim's algorithm. + The tree will be build in O(n^2) time. + + Parameters + ---------- + matrix : np.ndarray + The input matrix + use_less_memory : bool + If true, the MST will not directly be build for the input matrix but the matrix will be used to construct a distance matrix first. + Saves quadratic RAM usage, but also needs double the time for computing (default: False) + min_points : int + Min_points for calculating the reachability distance. Only relevant if use_less_memory is True. + If min_points is None, the euclidean distance will be used (default: None) + + Returns + ------- + mst_edges : np.ndarray + The edges of the Minimum-spanning-tree, represented as a (n-1, 3) matrix with entries corresponding to (node_i, node_j, dist_ij) + """ + assert (matrix.shape[0] == matrix.shape[1]) or use_less_memory, "Input matrix must be quadratic or use_less_memory must be True." + n = matrix.shape[0] + nodes_min_dist = np.full(n, np.inf) + parent = np.zeros(n, dtype=int) + not_in_mst = np.ones(n, dtype=bool) + mst_edges = np.empty((n - 1), dtype=([("i", int), ("j", int), ("dist", float)])) + # If min_points is not None, use reachability distance => calculate core distances of all points + if use_less_memory and min_points is not None: + core_distances = np.zeros(n) + for i in range(n): + eucl_distances = cdist([matrix[i]], matrix, metric="euclidean").ravel() + core_distances[i] = np.partition(eucl_distances, min_points - 1)[min_points - 1] + # Start building the MST + u = 0 + nodes_min_dist[u] = 0 + not_in_mst[u] = False + for i in range(n - 1): + if use_less_memory: + eucl_distances = cdist([matrix[u]], matrix, metric="euclidean").ravel() + if min_points is None: + dist_u = eucl_distances + else: + dist_u = np.maximum(eucl_distances, np.maximum(core_distances[u], core_distances)) + else: + # If use_less_memory=False, 'matrix' is expected to be the precomputed distance matrix + dist_u = matrix[u] + update_mask = not_in_mst & (dist_u < nodes_min_dist) + # Update distances and parents + nodes_min_dist[update_mask] = dist_u[update_mask] + parent[update_mask] = u + # Select next closest unvisited node + masked_dists = np.where(not_in_mst, nodes_min_dist, np.inf) + u = np.argmin(masked_dists) + mst_edges[i] = (parent[u], u, nodes_min_dist[u]) + not_in_mst[u] = False + return mst_edges diff --git a/clustpy/utils/tests/test_dctree.py b/clustpy/utils/tests/test_dctree.py new file mode 100644 index 0000000..1dea910 --- /dev/null +++ b/clustpy/utils/tests/test_dctree.py @@ -0,0 +1,162 @@ +from clustpy.utils import minimum_spanning_tree_prims, reachability_distances, DCTree +import numpy as np + +def test_dc_tree_min(): + X = np.array([ + [0.0, 0.0], + [-3.0, 0.0], + [2.0, 0.0], + [3.0, 0.0], + [-7.0, 0.0], + [-8.0, 0.0] + ]) + # Min_points = 3 + dctree = DCTree(X, min_points = 3, use_less_memory = False) + root = dctree.root + assert root.dist == 5.0 + assert root.left.leaves == [0, 2, 3, 1, 4] + assert root.right.leaves == [5] + assert root.right.dist == 0.0 + assert root.right.right is None + assert root.right.left is None + assert root.left.dist == 4.0 + assert root.left.left.leaves == [0, 2, 3, 1] + assert root.left.right.leaves == [4] + assert root.left.right.dist == 0.0 + assert root.left.right.left is None + assert root.left.right.right is None + assert root.left.left.dist == 4.0 + assert root.left.left.left.leaves == [0, 2, 3] + assert root.left.left.right.leaves == [1] + assert root.left.left.left.dist == 3.0 + assert root.left.left.left.left.leaves == [0, 2] + assert root.left.left.left.right.leaves == [3] + assert root.left.left.left.left.dist == 3.0 + assert root.left.left.left.left.left.leaves == [0] + assert root.left.left.left.left.right.leaves == [2] + all_distances = dctree.dc_distances() + expected_distances = np.array([[0., 4., 3., 3., 4., 5.], + [4., 0., 4., 4., 4., 5.], + [3., 4., 0., 3., 4., 5.], + [3., 4., 3., 0., 4., 5.], + [4., 4., 4., 4., 0., 5.], + [5., 5., 5., 5., 5., 0.] + ]) + assert np.array_equal(all_distances, expected_distances) + all_distances_reverse = dctree.dc_distances(np.arange(6), np.arange(5)[::-1], access_method="dc_dist") + expected_distances = expected_distances[:, np.arange(5)[::-1]] + assert np.array_equal(all_distances_reverse, expected_distances) + labels = dctree.get_k_center(3) + assert np.array_equal(labels, np.array([2, 2, 2, 2, 1, 0])) + eps = dctree.get_eps_for_k(3) + assert np.abs(eps - 4.0) < 1e-5 + # Min_points = 2 + dctree = DCTree(X, min_points = 2, use_less_memory = True) + root = dctree.root + assert root.dist == 4.0 + assert root.left.leaves == [0, 2, 3, 1] + assert root.right.leaves == [4, 5] + assert root.right.dist == 1.0 + assert root.right.left.leaves == [4] + assert root.right.right.leaves == [5] + assert root.right.left.dist == 0.0 + assert root.right.left.left is None + assert root.right.left.right is None + assert root.right.right.dist == 0.0 + assert root.right.right.left is None + assert root.right.right.right is None + assert root.left.dist == 3.0 + assert root.left.left.leaves == [0, 2, 3] + assert root.left.right.leaves == [1] + assert root.left.right.dist == 0.0 + assert root.left.right.left is None + assert root.left.right.right is None + assert root.left.left.dist == 2.0 + assert root.left.left.left.leaves == [0] + assert root.left.left.right.leaves == [2, 3] + assert root.left.left.right.dist == 1.0 + assert root.left.left.right.left.leaves == [2] + assert root.left.left.right.right.leaves == [3] + all_distances = dctree.dc_distances() + expected_distances = np.array([[0., 3., 2., 2., 4., 4.], + [3., 0., 3., 3., 4., 4.], + [2., 3., 0., 1., 4., 4.], + [2., 3., 1., 0., 4., 4.], + [4., 4., 4., 4., 0., 1.], + [4., 4., 4., 4., 1., 0.] + ]) + assert np.array_equal(all_distances, expected_distances) + all_distances_reverse = dctree.dc_distances(np.arange(6), np.arange(5)[::-1], access_method="dc_dist") + expected_distances = expected_distances[:, np.arange(5)[::-1]] + assert np.array_equal(all_distances_reverse, expected_distances) + labels = dctree.get_k_center(3) + assert np.array_equal(labels, np.array([2, 1, 2, 2, 0, 0])) + eps = dctree.get_eps_for_k(3) + assert np.abs(eps - 3.0) < 1e-5 + # Test different accesses to DCTree + node = dctree[4] + assert node.id == 4 + all_nodes = dctree[np.arange(11)] + for i, node in enumerate(all_nodes): + assert node.id == i + # Test repr + assert type(dctree.__repr__) is str + assert type(node.__repr__) is str + + +def test_reachability_distances(): + X = np.array([ + [0.0, 0.0], + [-3.0, 0.0], + [2.0, 0.0], + [3.0, 0.0], + [-7.0, 0.0], + [-8.0, 0.] + ]) + distances = reachability_distances(X, min_points = 2) + expected_distances = [[0., 3., 2., 3., 7., 8.], + [3., 0., 5., 6., 4., 5.], + [2., 5., 0., 1., 9., 10.], + [3., 6., 1., 0., 10., 11. ], + [7., 4., 9., 10, 0., 1.], + [8., 5., 10., 11., 1., 0.] + ] + assert np.array_equal(distances, expected_distances) + distances = reachability_distances(X, min_points = 3) + expected_distances = [[0., 4., 3., 3., 7., 8.], + [4., 0., 5., 6., 4., 5.], + [3., 5., 0., 3., 9., 10.], + [3., 6., 3., 0., 10., 11.], + [7., 4., 9., 10., 0., 5.], + [8., 5., 10., 11., 5., 0.] + ] + assert np.array_equal(distances, expected_distances) + + +def test_minimum_spanning_tree_prims(): + distance_matrix = np.array([ + [0.0, 3.0, 2.0, 3.0, 7.0, 8.0], + [3.0, 0.0, 5.0, 6.0, 4.0, 5.0], + [2.0, 5.0, 0.0, 1.0, 9.0, 10.0], + [3.0, 6.0, 1.0, 0.0, 10.0, 11.0], + [7.0, 4.0, 9.0, 10.0, 0.0, 1.0], + [8.0, 5.0, 10.0, 11.0, 1.0, 0.0] + ]) + mst = minimum_spanning_tree_prims(distance_matrix, use_less_memory=False, min_points=None) + expected_mst = np.array([(0, 2, 2.0), (2, 3, 1.0), (0, 1, 3.0), (1, 4, 4.0), (4, 5, 1.0)], dtype=([("i", int), ("j", int), ("dist", float)])) + # test with use_less_memory=True + X = np.array([ + [0.0, 0.0], + [-3.0, 0.0], + [2.0, 0.0], + [3.0, 0.0], + [-7.0, 0.0], + [-8.0, 0.] + ]) + mst = minimum_spanning_tree_prims(X, use_less_memory=True, min_points=None) + assert np.array_equal(mst, expected_mst) + # Test reachability distance + core_distances = [3, 4, 2, 3, 7] + mst = minimum_spanning_tree_prims(X, use_less_memory=True, min_points=3) + expected_mst = np.array([(0, 2, 3.0), (0, 3, 3.0), (0, 1, 4.0), (1, 4, 4.0), (1, 5, 5.0)], dtype=([("i", int), ("j", int), ("dist", float)])) + assert np.array_equal(mst, expected_mst) From 0edf985e4caa7c0a40e30172d81e74fb12940fdd Mon Sep 17 00:00:00 2001 From: Collin Leiber <33159895+collinleiber@users.noreply.github.com> Date: Thu, 11 Jun 2026 10:01:35 +0300 Subject: [PATCH 3/5] Add missing import to shade.py --- clustpy/deep/shade.py | 1 + 1 file changed, 1 insertion(+) diff --git a/clustpy/deep/shade.py b/clustpy/deep/shade.py index 5bf50bb..1b595ea 100644 --- a/clustpy/deep/shade.py +++ b/clustpy/deep/shade.py @@ -16,6 +16,7 @@ import tqdm from typing import Callable, Optional, Tuple from sklearn.utils.validation import check_is_fitted +from sklearn.base import ClusterMixin class SHADE(_AbstractDeepClusteringAlgo): From 943af52cbc45a1a2d56080d1be22fe465bad7deb Mon Sep 17 00:00:00 2001 From: Collin Leiber <33159895+collinleiber@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:47:41 +0300 Subject: [PATCH 4/5] Update dctree.py --- clustpy/utils/dctree.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/clustpy/utils/dctree.py b/clustpy/utils/dctree.py index 60f1751..2e87ca1 100644 --- a/clustpy/utils/dctree.py +++ b/clustpy/utils/dctree.py @@ -400,11 +400,9 @@ def __repr__(self) -> str: return "" pointer_right = " " pointer_left = " " if self.root.right else " " - repr_string = ( - f"{self.root}" - f"{self.__repr__help(self.root.left, pointer_left, '', self.root.right is not None)}" - f"{self.__repr__help(self.root.right, pointer_right, '', False)}" - ) + repr_string = f"{self.root}" + repr_string += f"{self.__repr__help(self.root.left, pointer_left, '', self.root.right is not None)}" + repr_string += f"{self.__repr__help(self.root.right, pointer_right, '', False)}" return repr_string def __repr__help(self, node: Optional[_DCNode], pointer: str, padding: str, has_right_sibling: bool) -> str: @@ -432,13 +430,11 @@ def __repr__help(self, node: Optional[_DCNode], pointer: str, padding: str, has_ padding_for_both = padding + (" " if has_right_sibling else " ") pointer_right = " " pointer_left = " " if node.right else " " - repr_string = ( - f"\n {padding.replace('|', ' ')}// #region" - f"\n{padding}{pointer}{node}" - f"{self.__repr__help(node.left, pointer_left, padding_for_both, node.right is not None)}" - f"{self.__repr__help(node.right, pointer_right, padding_for_both, False)}" - f"\n {padding.replace('|', ' ')}// #endregion" - ) + repr_string = f"\n {padding.replace('|', ' ')}// #region" + repr_string += f"\n{padding}{pointer}{node}" + repr_string += f"{self.__repr__help(node.left, pointer_left, padding_for_both, node.right is not None)}" + repr_string += f"{self.__repr__help(node.right, pointer_right, padding_for_both, False)}" + repr_string += f"\n {padding.replace('|', ' ')}// #endregion" return repr_string From c32c1f7a4dc1a85df0ccd395f3e7e925fc3c8c97 Mon Sep 17 00:00:00 2001 From: Collin Leiber <33159895+collinleiber@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:43:10 +0200 Subject: [PATCH 5/5] Update test_dctree.py --- clustpy/utils/tests/test_dctree.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clustpy/utils/tests/test_dctree.py b/clustpy/utils/tests/test_dctree.py index 1dea910..9d79944 100644 --- a/clustpy/utils/tests/test_dctree.py +++ b/clustpy/utils/tests/test_dctree.py @@ -100,8 +100,8 @@ def test_dc_tree_min(): for i, node in enumerate(all_nodes): assert node.id == i # Test repr - assert type(dctree.__repr__) is str - assert type(node.__repr__) is str + assert type(dctree.__repr__()) is str + assert type(node.__repr__()) is str def test_reachability_distances():