From d17d3f921e3ff97d0f435d11fb35e4a1212b4195 Mon Sep 17 00:00:00 2001 From: anripa Date: Thu, 11 Jun 2026 10:40:36 +0300 Subject: [PATCH] Fix projected dip-means MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit changes the logic for selecting the cluster for splitting. The code was changed from picking the lowest score (p-value) to picking the maximum dip statistic. This matches the original paper algorithm description (see the reference below). An additional benefit is that this avoids ties: the p-value can be exactly zero for two very large dip values, in which case the selection by p-value would be arbitrary. Chamalis, T., & Likas, A. (2018). The Projected Dip-means Clustering Algorithm. Proceedings of the 10th Hellenic Conference on Artificial Intelligence, SETN ’18, 1–7. https://doi.org/10.1145/3200947.3201008 --- clustpy/partition/projected_dipmeans.py | 27 +++++++++++++++---------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/clustpy/partition/projected_dipmeans.py b/clustpy/partition/projected_dipmeans.py index 9e60e25..e7949d8 100644 --- a/clustpy/partition/projected_dipmeans.py +++ b/clustpy/partition/projected_dipmeans.py @@ -4,13 +4,14 @@ """ import numpy as np -from sklearn.decomposition import PCA -from clustpy.utils import dip_test, dip_pval from sklearn.base import BaseEstimator, ClusterMixin -from clustpy.partition.xmeans import _initial_kmeans_clusters, _execute_two_means -from clustpy.utils.checks import check_parameters -from sklearn.utils.validation import check_is_fitted +from sklearn.decomposition import PCA from sklearn.metrics.pairwise import pairwise_distances_argmin_min +from sklearn.utils.validation import check_is_fitted + +from clustpy.partition.xmeans import _execute_two_means, _initial_kmeans_clusters +from clustpy.utils import dip_pval, dip_test +from clustpy.utils.checks import check_parameters def _proj_dipmeans(X: np.ndarray, significance: float, n_random_projections: int, pval_strategy: str, 
n_boots: int, @@ -64,14 +65,18 @@ def _proj_dipmeans(X: np.ndarray, significance: float, n_random_projections: int cluster_dips = np.array([dip_test(projected_data[:, p], just_dip=True, is_data_sorted=False) for p in range(projected_data.shape[1])]) # Calculate p-values of maximum dip - pval = dip_pval(np.max(cluster_dips), ids_in_cluster.shape[0], pval_strategy=pval_strategy, n_boots=n_boots, + max_dip = np.max(cluster_dips) + pval = dip_pval(max_dip, ids_in_cluster.shape[0], pval_strategy=pval_strategy, n_boots=n_boots, random_state=random_state) - # Calculate cluster score - cluster_scores[c] = pval - # Get cluster with minimum pval - cluster_id_to_split = np.argmin(cluster_scores) + + if pval < significance: + # Save cluster multimodality score + cluster_scores[c] = max_dip + + # Get cluster with maximum multimodality score + cluster_id_to_split = np.argmax(cluster_scores) # Check if any cluster has to be split - if cluster_scores[cluster_id_to_split] < significance: + if cluster_scores[cluster_id_to_split] > 0: # Split cluster using bisecting kmeans labels, centers, _ = _execute_two_means(X, ids_in_each_cluster, cluster_id_to_split, centers, n_split_trials, random_state)