From b1479491a3b72d3eb3eac5fd612f3e23880b83b3 Mon Sep 17 00:00:00 2001 From: Collin Leiber Date: Thu, 11 Jun 2026 16:16:44 +0300 Subject: [PATCH 1/3] use p-value from scipy for GMeans. Add corrected BIC calculation to XMeans --- clustpy/partition/gmeans.py | 46 +------- clustpy/partition/tests/test_gmeans.py | 13 --- clustpy/partition/xmeans.py | 155 ++++++++++++++----------- pyproject.toml | 2 +- 4 files changed, 96 insertions(+), 120 deletions(-) diff --git a/clustpy/partition/gmeans.py b/clustpy/partition/gmeans.py index 5c34091..dcbf147 100644 --- a/clustpy/partition/gmeans.py +++ b/clustpy/partition/gmeans.py @@ -54,10 +54,13 @@ def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clus labels_split, centers_split, _ = _execute_two_means(X[ids_in_cluster], [np.arange(ids_in_cluster.shape[0])], 0, np.array([centers[c]]), n_split_trials, random_state) # Project data form cluster onto resulting connection axis - projected_data = np.dot(X[ids_in_cluster], centers_split[0] - centers_split[1]) + projection_vector = centers_split[0] - centers_split[1] + projection_vector /= np.linalg.norm(projection_vector) + projected_data = np.dot(X[ids_in_cluster], projection_vector) + projected_data = (projected_data - projected_data.mean()) / projected_data.std() # Use Anderson Darling to test if data is Gaussian - ad_result = anderson(projected_data, "norm") - p_value = _anderson_darling_statistic_to_prob(ad_result.statistic, len(ids_in_cluster)) + ad_result = anderson(projected_data, "norm", method="interpolated") + p_value = ad_result.pvalue if p_value < significance: # If data is not Gaussian, keep the newly created cluster centers centers[c] = centers_split[0] @@ -79,43 +82,6 @@ def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clus return n_clusters, labels, centers -def _anderson_darling_statistic_to_prob(statistic: float, n_points: int) -> float: - """ - Transform the statistic returned by the Anderson Darling test into a 
p_value. - First the adjusted statistic will be calculated. - Afterwards, the actual p-value can be obtained. - - Parameters - ---------- - statistic : float - The original statistic from the Anderson Darling test. - n_points : int - The number of samples - - Returns - ------- - p_value : float - The p-value - - References - ---------- - D'Agostino, Ralph B., and Michael A. Stephens. "Goodness-of-fit techniques." - Statistics: Textbooks and Monographs (1986). - """ - adjusted_stat = statistic * (1 + (.75 / n_points) + 2.25 / (n_points ** 2)) - if adjusted_stat < 0.2: - # is log q => therefore add 1 - ... - p_value = 1 - np.exp(-13.436 + 101.14 * adjusted_stat - 223.73 * (adjusted_stat ** 2)) - elif adjusted_stat < 0.34: - # is log q => therefore add 1 - ... - p_value = 1 - np.exp(-8.318 + 42.796 * adjusted_stat - 59.938 * (adjusted_stat ** 2)) - elif adjusted_stat < 0.6: - p_value = np.exp(0.9177 - 4.279 * adjusted_stat - 1.38 * (adjusted_stat ** 2)) - else: - p_value = np.exp(1.2937 - 5.709 * adjusted_stat - 0.0186 * (adjusted_stat ** 2)) - return p_value - - class GMeans(ClusterMixin, BaseEstimator): """ Execute the GMeans clustering procedure. 
diff --git a/clustpy/partition/tests/test_gmeans.py b/clustpy/partition/tests/test_gmeans.py index d2fdd53..21dbc72 100644 --- a/clustpy/partition/tests/test_gmeans.py +++ b/clustpy/partition/tests/test_gmeans.py @@ -1,6 +1,5 @@ import numpy as np from clustpy.partition import GMeans -from clustpy.partition.gmeans import _anderson_darling_statistic_to_prob from sklearn.datasets import make_blobs from scipy.stats import anderson from clustpy.utils.checks import check_clustpy_estimator @@ -10,18 +9,6 @@ def test_gmeans_estimator(): check_clustpy_estimator(GMeans(), ("check_complex_data")) -def test_anderson_darling_statistic_to_prob(): - n_points = 20 - statistic = 0.435 - assert np.isclose(_anderson_darling_statistic_to_prob(statistic, n_points), 0.270, atol=0.001) - # Example from https://www.spcforexcel.com/knowledge/basic-statistics/anderson-darling-test-for-normality - data = np.array([3334, 3554, 3625, 3837, 3838]) - ad_result = anderson(data, "norm") - statistic = ad_result.statistic - assert np.isclose(statistic, 0.288, atol=0.001) - assert np.isclose(_anderson_darling_statistic_to_prob(statistic, data.shape[0]), 0.456, atol=0.001) - - """ Tests regarding the GMeans object """ diff --git a/clustpy/partition/xmeans.py b/clustpy/partition/xmeans.py index 8ba1f4d..35f59b0 100644 --- a/clustpy/partition/xmeans.py +++ b/clustpy/partition/xmeans.py @@ -127,7 +127,7 @@ def _execute_two_means(X: np.ndarray, ids_in_each_cluster: list, cluster_id_to_s def _xmeans(X: np.ndarray, n_clusters_init: int, max_n_clusters: int, check_global_score: bool, allow_merging: bool, - n_split_trials: int, random_state: np.random.RandomState) -> (int, np.ndarray, np.ndarray): + n_split_trials: int, split_criterion: str, random_state: np.random.RandomState) -> (int, np.ndarray, np.ndarray): """ Start the actual XMeans clustering procedure on the input data set. 
@@ -140,11 +140,13 @@ def _xmeans(X: np.ndarray, n_clusters_init: int, max_n_clusters: int, check_glob max_n_clusters : int Maximum number of clusters. Must be larger than n_clusters_init check_global_score : bool - Defines whether the global BIC score should be checked after the 'Improve-Params' step. Some implementations skip this step + Defines whether the global score should be checked after the 'Improve-Params' step. Some implementations skip this step allow_merging : bool Try to merge clusters after the regular XMeans algorithm terminated. See Ishioka et al. for more information n_split_trials : int Number tries to split a cluster. For each try 2-KMeans is executed with different cluster centers + split_criterion : str + The split criterion. Can be "original", "corrected", or "aic" random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -157,19 +159,17 @@ def _xmeans(X: np.ndarray, n_clusters_init: int, max_n_clusters: int, check_glob """ assert max_n_clusters >= n_clusters_init, "max_n_clusters can not be smaller than n_clusters_init" n_dims = X.shape[1] - n_clusters, labels, centers, global_variance = _initial_kmeans_clusters(X, n_clusters_init, random_state) + n_clusters, labels, centers, inertia = _initial_kmeans_clusters(X, n_clusters_init, random_state) # Get parameters of all clusters ids_in_each_cluster = [np.where(labels == c)[0] for c in range(n_clusters)] cluster_sizes = np.array([ids_in_cluster.shape[0] for ids_in_cluster in ids_in_each_cluster]) - cluster_variances = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) / ( - cluster_sizes[c] - 1) for c in range(n_clusters)]) # Only used if allow_merging is True + cluster_inertias = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) if cluster_sizes[c] > 1 else 0 + for c in range(n_clusters)]) if check_global_score: - # Get initial global variance - global_variance = global_variance / (X.shape[0] - n_clusters) - # Get initial global BIC 
score - best_global_bic_score = _bic_score(X.shape[0], cluster_sizes, n_dims, global_variance) + # Get initial global score + best_global_score = _clustering_score(X.shape[0], cluster_sizes, n_dims, inertia, split_criterion) # Save best result - best_result = (n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes, cluster_variances) + best_result = (n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes, cluster_inertias) while n_clusters < max_n_clusters: n_clusters_old = n_clusters # Split Clusters => Improve-Structure @@ -179,21 +179,19 @@ def _xmeans(X: np.ndarray, n_clusters_init: int, max_n_clusters: int, check_glob if ids_in_cluster.shape[0] <= 2: # Cluster can not be split because it is too small continue - # Get variance of original cluster - cluster_variance = cluster_variances[c] - # Get BIC score of original cluster - cluster_bic_score = _bic_score(original_cluster_size, original_cluster_size, n_dims, cluster_variance) + # Get inertia of original cluster + cluster_inertia = cluster_inertias[c] + # Get score of original cluster + cluster_score = _clustering_score(original_cluster_size, original_cluster_size, n_dims, cluster_inertia, split_criterion) # Split cluster into two - labels_split, centers_split, split_variance = _execute_two_means(X[ids_in_cluster], + labels_split, centers_split, split_inertia = _execute_two_means(X[ids_in_cluster], [np.arange(original_cluster_size)], 0, np.array([centers[c]]), n_split_trials, random_state) - # Get variance of splitted clusters - split_variance = split_variance / (ids_in_cluster.shape[0] - 2) cluster_sizes_split = np.array([np.sum(labels_split == c) for c in range(2)]) - # Get BIC score of splitted clusters - split_cluster_bic_score = _bic_score(original_cluster_size, cluster_sizes_split, n_dims, split_variance) - if cluster_bic_score < split_cluster_bic_score: + # Get score of splitted clusters + split_cluster_score = _clustering_score(original_cluster_size, cluster_sizes_split, n_dims, 
split_inertia, split_criterion) + if cluster_score < split_cluster_score: # Keep new clusters centers[c] = centers_split[0] centers = np.r_[centers, [centers_split[1]]] @@ -214,32 +212,29 @@ def _xmeans(X: np.ndarray, n_clusters_init: int, max_n_clusters: int, check_glob # Update parameters of all clusters ids_in_each_cluster = [np.where(labels == c)[0] for c in range(n_clusters)] cluster_sizes = np.array([ids_in_cluster.shape[0] for ids_in_cluster in ids_in_each_cluster]) - cluster_variances = [np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) / ( - cluster_sizes[c] - 1) if cluster_sizes[c] > 1 else 0 for c in - range(n_clusters)] # Only used if allow_merging is True + cluster_inertias = [np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) if cluster_sizes[c] > 1 else 0 for c in + range(n_clusters)] if check_global_score: - # Get new global variance - global_variance = kmeans.inertia_ / (X.shape[0] - n_clusters) - # Get new global BIC score - new_global_bic_score = _bic_score(X.shape[0], cluster_sizes, n_dims, global_variance) - if best_global_bic_score < new_global_bic_score: + # Get new global score + new_global_score = _clustering_score(X.shape[0], cluster_sizes, n_dims, kmeans.inertia_, split_criterion) + if best_global_score < new_global_score: # If score improved, save new best model - best_global_bic_score = new_global_bic_score + best_global_score = new_global_score best_result = ( n_clusters, labels.copy(), centers.copy(), ids_in_each_cluster.copy(), cluster_sizes.copy(), - cluster_variances.copy()) + cluster_inertias.copy()) if check_global_score: # Exchange latest result with best overall result - n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes, cluster_variances = best_result + n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes, cluster_inertias = best_result # OPTIONAL: try to merge clusters if allow_merging: n_clusters, labels, centers = _merge_clusters(X, n_clusters, labels, centers, ids_in_each_cluster, - 
cluster_sizes, cluster_variances) + cluster_sizes, cluster_inertias, split_criterion) return n_clusters, labels, centers def _merge_clusters(X: np.ndarray, n_clusters: int, labels: np.ndarray, centers: np.ndarray, ids_in_each_cluster: list, - cluster_sizes: np.ndarray, cluster_variances: np.ndarray) -> (int, np.ndarray, np.ndarray): + cluster_sizes: np.ndarray, cluster_inertias: np.ndarray, split_criterion: str) -> (int, np.ndarray, np.ndarray): """ Addition to XMeans by Ishioka et al.. Attempts to repair errors caused by an unfortunate splitting order by merging clusters. @@ -260,8 +255,10 @@ def _merge_clusters(X: np.ndarray, n_clusters: int, labels: np.ndarray, centers: List containing the ids of the samples of a cluster cluster_sizes : np.ndarray The sizes of the clusters - cluster_variances : np.ndarray - The variances of the clusters + cluster_inertias : np.ndarray + The inertias of the clusters + split_criterion : str + The split criterion. Can be "original", "corrected", or "aic" Returns ------- @@ -289,20 +286,18 @@ def _merge_clusters(X: np.ndarray, n_clusters: int, labels: np.ndarray, centers: if already_merged[c2]: continue combined_cluster_size = cluster_sizes[c1] + cluster_sizes[c2] - # Get BIC score of non-merged clusters - cluster_1_and_2_variance = (cluster_variances[c1] * (cluster_sizes[c1] - 1) + - cluster_variances[c2] * (cluster_sizes[c2] - 1)) / (combined_cluster_size - 2) - cluster_1_and_2_bic_score = _bic_score(combined_cluster_size, cluster_sizes[[c1, c2]], n_dims, - cluster_1_and_2_variance) - # Get BIC of merged cluster + # Get score of non-merged clusters + cluster_1_and_2_inertia = (cluster_inertias[c1] + cluster_inertias[c2]) + cluster_1_and_2_score = _clustering_score(combined_cluster_size, cluster_sizes[[c1, c2]], n_dims, + cluster_1_and_2_inertia, split_criterion) + # Get score of merged cluster new_center = (centers[c1] * cluster_sizes[c1] + centers[c2] * cluster_sizes[c2]) / combined_cluster_size - cluster_merged_variance = 
np.sum( - (X[np.r_[ids_in_each_cluster[c1], ids_in_each_cluster[c2]]] - new_center) ** 2) / ( - combined_cluster_size - 1) - cluster_merged_bic_score = _bic_score(combined_cluster_size, combined_cluster_size, - n_dims, cluster_merged_variance) - # Is merge improving the local BIC score? - if cluster_merged_bic_score > cluster_1_and_2_bic_score: + cluster_merged_inertia = np.sum( + (X[np.r_[ids_in_each_cluster[c1], ids_in_each_cluster[c2]]] - new_center) ** 2) + cluster_merged_score = _clustering_score(combined_cluster_size, combined_cluster_size, + n_dims, cluster_merged_inertia, split_criterion) + # Is merge improving the local score? + if cluster_merged_score > cluster_1_and_2_score: # Update labels and centers min_cluster_id = min(c1, c2) max_cluster_id = max(c1, c2) @@ -322,10 +317,11 @@ def _merge_clusters(X: np.ndarray, n_clusters: int, labels: np.ndarray, centers: return n_clusters, labels, centers -def _bic_score(n_points: int, cluster_sizes: np.ndarray, n_dims: int, variance: float) -> float: +def _clustering_score(n_points: int, cluster_sizes: np.ndarray, n_dims: int, inertia: float, split_criterion: str) -> float: """ - Calculate the BIC score of a clustering result. - For more information see: 'X-means: Extending k-means with efficient estimation of the number of clusters' + Calculate the score of a clustering result. In the original paper this corresponds to the BIC score of the result. + For more information see: 'X-means: Extending k-means with efficient estimation of the number of clusters' as well as + https://github.com/bobhancock/goxmeans/blob/master/doc/BIC_notes.pdf. Parameters ---------- @@ -335,28 +331,45 @@ def _bic_score(n_points: int, cluster_sizes: np.ndarray, n_dims: int, variance: Number of samples in each cluster. 
Can also by of type int in case of a single cluster n_dims : int Number of features in the data set - variance : float - The variance across all clusters + inertia : float + The inertia of the clustering result + split_criterion : str + The split criterion. Can be "bic-original", "bic-corrected", "aic-original", or "aic-corrected" Returns ------- - bic_total : float - The BIC score of the cluster + score_total : float + The score of the clustering result """ n_clusters = cluster_sizes.shape[0] if type(cluster_sizes) is np.ndarray else 1 - # BIC of the free parameters + # Cost of the free parameters n_free_params = n_clusters * (n_dims + 1) # Equal to: (n_clusters - 1) + n_clusters * n_dims + 1 - bic_free_params = n_free_params * bic_costs(n_points, False) - # BIC of the data using the loglikelihood - bic_loglikelihood = np.sum(cluster_sizes * (np.log(cluster_sizes) - np.log(n_points) - np.log( - 2.0 * np.pi) / 2 - n_dims * np.log(variance) / 2) - (cluster_sizes - n_clusters) / 2) - # Combine BIC score components - bic_total = bic_loglikelihood - bic_free_params - return bic_total + if split_criterion.startswith("bic"): + cost_free_params = n_free_params * bic_costs(n_points, False) + else: + cost_free_params = n_free_params + # Score of Loglikelihood + variance = inertia / (n_points - n_clusters) + if split_criterion.endswith("original"): + # BIC of the data using the loglikelihood as proposed in the original paper + score_loglikelihood = np.sum(cluster_sizes * np.log(cluster_sizes)) - n_points * (np.log(n_points) + np.log( + 2.0 * np.pi) / 2 + n_dims * np.log(variance) / 2) - (n_points - n_clusters * n_clusters) / 2 + else: + variance = variance / n_dims + score_loglikelihood = np.sum(cluster_sizes * np.log(cluster_sizes)) - n_points * (np.log(n_points) + n_dims * np.log( + 2 * np.pi * variance) / 2) - n_dims * (n_points - n_clusters) / 2 + # Combine score components + score_total = score_loglikelihood - cost_free_params + return score_total class XMeans(ClusterMixin, BaseEstimator): """ + Execute 
the XMeans clustering procedure. + Determines the number of clusters by executing the KMeans with an increasing number of clusters. + For each result, the clustering score based on the BIC or AIC is evaluated. + The process is repeated until no clusters are added anymore. + Optionally, a final merging mechanism can be used to check if the score can be further improved. Parameters ---------- @@ -365,12 +378,15 @@ class XMeans(ClusterMixin, BaseEstimator): max_n_clusters : int Maximum number of clusters. Must be larger than n_clusters_init (default: np.inf) check_global_score : bool - Defines whether the global BIC score should be checked after the 'Improve-Params' step. Some implementations skip this step (default: True) + Defines whether the global score should be checked after the 'Improve-Params' step. Some implementations skip this step (default: True) allow_merging : bool Try to merge clusters after the regular XMeans algorithm terminated. See Ishioka et al. for more information. Normally, if allow_merging is True, check_global_score should be False (default: False) n_split_trials : int Number tries to split a cluster. For each try 2-KMeans is executed with different cluster centers (default: 10) + split_criterion : str + The split criterion. Can be "bic-original" (BIC), "bic-corrected" (corrected BIC), + "aic-original" (AIC), or "aic-corrected" (corrected AIC) (default: bic-corrected) random_state : np.random.RandomState | int use a fixed random state to get a repeatable solution. Can also be of type int (default: None) @@ -408,16 +424,21 @@ class XMeans(ClusterMixin, BaseEstimator): Ishioka, Tsunenori. "An expansion of X-means for automatically determining the optimal number of clusters." Proceedings of International Conference on Computational Intelligence. Vol. 2. 2005. 
+ + and + + https://github.com/bobhancock/goxmeans/blob/master/doc/BIC_notes.pdf """ def __init__(self, n_clusters_init: int = 2, max_n_clusters: int = np.inf, check_global_score: bool = True, - allow_merging: bool = False, n_split_trials: int = 10, + allow_merging: bool = False, n_split_trials: int = 10, split_criterion: str = "bic-corrected", random_state: np.random.RandomState | int = None): self.n_clusters_init = n_clusters_init self.max_n_clusters = max_n_clusters self.check_global_score = check_global_score self.allow_merging = allow_merging self.n_split_trials = n_split_trials + self.split_criterion = split_criterion self.random_state = random_state def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'XMeans': @@ -438,8 +459,10 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'XMeans': this instance of the XMeans algorithm """ X, _, random_state = check_parameters(X=X, y=y, random_state=self.random_state) + split_criterion = self.split_criterion.lower() + assert split_criterion in ["bic-original", "bic-corrected", "aic-original", "aic-corrected"] n_clusters, labels, centers = _xmeans(X, self.n_clusters_init, self.max_n_clusters, self.check_global_score, - self.allow_merging, self.n_split_trials, random_state) + self.allow_merging, self.n_split_trials, split_criterion, random_state) self.n_clusters_ = n_clusters self.labels_ = labels self.cluster_centers_ = centers diff --git a/pyproject.toml b/pyproject.toml index c47f6ae..049503a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ license = {text = "BSD-3-Clause License"} requires-python = ">=3.10" dependencies = [ "numpy", - "scipy", + "scipy>=1.17.0", "scikit-learn>=1.6", "matplotlib", "torch", From fb00a7b85b5c295f8d14e7dd8aea1514aa3bb5fb Mon Sep 17 00:00:00 2001 From: Collin Leiber Date: Thu, 11 Jun 2026 16:52:14 +0300 Subject: [PATCH 2/3] change minimum Python version to 3.11. 
Fix tests --- .circleci/config.yml | 2 +- .github/workflows/publish.yml | 2 +- .github/workflows/test-main.yml | 14 +++++++------- clustpy/partition/gmeans.py | 2 +- clustpy/partition/tests/test_xmeans.py | 4 ++-- clustpy/partition/xmeans.py | 6 ++++-- pyproject.toml | 2 +- 7 files changed, 17 insertions(+), 15 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a84e983..95b8b30 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -21,7 +21,7 @@ jobs: # The executor is the environment in which the steps below will be executed - below will use a python 3.10.2 container # Change the version below to your required version of python docker: - - image: cimg/python:3.12.2 + - image: cimg/python:3.14.5 # Checkout the code as the first step. This is a dedicated CircleCI step. # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default. # Here we're making sure we use just use the system-wide pip. By default it uses the project root's requirements.txt. 
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 698a487..14d19d8 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -26,7 +26,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.14' - name: Install dependencies run: | diff --git a/.github/workflows/test-main.yml b/.github/workflows/test-main.yml index f9b8ce0..4d8a15c 100644 --- a/.github/workflows/test-main.yml +++ b/.github/workflows/test-main.yml @@ -20,10 +20,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.12 + - name: Set up Python 3.14 uses: actions/setup-python@v5 with: - python-version: '3.12' + python-version: '3.14' cache: 'pip' # Speeds up flake8 installation - name: Install dependencies @@ -42,9 +42,9 @@ jobs: needs: lint # This job only starts if 'lint' passes runs-on: ubuntu-latest strategy: - fail-fast: false # don't break 3.12 if 3.10 fails + fail-fast: false # don't break a version if another one fails matrix: - python-version: ["3.12", "3.10"] + python-version: ["3.14", "3.11"] steps: - uses: actions/checkout@v4 @@ -62,17 +62,17 @@ jobs: pip install -e .[full] - name: Test with pytest (with codecov) - if: ${{ matrix.python-version == '3.10' }} + if: ${{ matrix.python-version == '3.11' }} run: | pytest -m "not largedata" --cov --cov-report=xml - name: Test with pytest (without codecov) - if: ${{ matrix.python-version != '3.10' }} + if: ${{ matrix.python-version != '3.11' }} run: | pytest -m "not largedata" - name: Upload coverage reports to Codecov - if: ${{ matrix.python-version == '3.10' }} + if: ${{ matrix.python-version == '3.11' }} uses: codecov/codecov-action@v4 with: token: ${{ secrets.CODECOV_TOKEN }} diff --git a/clustpy/partition/gmeans.py b/clustpy/partition/gmeans.py index dcbf147..feb4ee7 100644 --- a/clustpy/partition/gmeans.py +++ b/clustpy/partition/gmeans.py @@ -59,7 +59,7 @@ def _gmeans(X: np.ndarray, significance: float, 
n_clusters_init: int, max_n_clus projected_data = np.dot(X[ids_in_cluster], projection_vector) projected_data = (projected_data - projected_data.mean()) / projected_data.std() # Use Anderson Darling to test if data is Gaussian - ad_result = anderson(projected_data, "norm", method="interpolated") + ad_result = anderson(projected_data, "norm", method="interpolate") p_value = ad_result.pvalue if p_value < significance: # If data is not Gaussian, keep the newly created cluster centers diff --git a/clustpy/partition/tests/test_xmeans.py b/clustpy/partition/tests/test_xmeans.py index eb3e90a..f6cc9b1 100644 --- a/clustpy/partition/tests/test_xmeans.py +++ b/clustpy/partition/tests/test_xmeans.py @@ -70,7 +70,7 @@ def test_merge_clusters(): cluster_variances = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) / ( cluster_sizes[c] - 1) for c in range(n_clusters)]) n_clusters, labels, centers = _merge_clusters(X, n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes, - cluster_variances) + cluster_variances, "bic-corrected") assert n_clusters == 1 assert np.array_equal(labels, np.array([0] * 18)) assert np.array_equal(centers, np.array([[2.5, 0]])) @@ -99,7 +99,7 @@ def test_simple_XMeans(): assert np.array_equal(xmeans.cluster_centers_, xmeans2.cluster_centers_) # Test with parameters xmeans = XMeans(n_clusters_init=3, max_n_clusters=5, check_global_score=False, allow_merging=True, n_split_trials=5, - random_state=1) + split_criterion="aic-original", random_state=1) xmeans.fit(X) assert xmeans.labels_.dtype == np.int32 assert xmeans.labels_.shape == labels.shape diff --git a/clustpy/partition/xmeans.py b/clustpy/partition/xmeans.py index 35f59b0..5b11944 100644 --- a/clustpy/partition/xmeans.py +++ b/clustpy/partition/xmeans.py @@ -146,7 +146,8 @@ def _xmeans(X: np.ndarray, n_clusters_init: int, max_n_clusters: int, check_glob n_split_trials : int Number tries to split a cluster. 
For each try 2-KMeans is executed with different cluster centers split_criterion : str - The split criterion. Can be "original", "corrected", or "aic" + The split criterion. Can be "bic-original" (BIC), "bic-corrected" (corrected BIC), + "aic-original" (AIC), or "aic-corrected" (corrected AIC) random_state : np.random.RandomState use a fixed random state to get a repeatable solution @@ -258,7 +259,8 @@ def _merge_clusters(X: np.ndarray, n_clusters: int, labels: np.ndarray, centers: cluster_inertias : np.ndarray The inertias of the clusters split_criterion : str - The split criterion. Can be "original", "corrected", or "aic" + The split criterion. Can be "bic-original" (BIC), "bic-corrected" (corrected BIC), + "aic-original" (AIC), or "aic-corrected" (corrected AIC) Returns ------- diff --git a/pyproject.toml b/pyproject.toml index 049503a..163b1ce 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,7 @@ description = "A Python library for advanced clustering algorithms" readme = "README.md" authors = [{name = "Collin Leiber", email = "leiber@dbs.ifi.lmu.de"}] license = {text = "BSD-3-Clause License"} -requires-python = ">=3.10" +requires-python = ">=3.11" dependencies = [ "numpy", "scipy>=1.17.0", From 84a18448c0497a52888896143a32c999f28fe0bd Mon Sep 17 00:00:00 2001 From: Collin Leiber <33159895+collinleiber@users.noreply.github.com> Date: Thu, 11 Jun 2026 16:37:57 +0200 Subject: [PATCH 3/3] Update test_xmeans.py --- clustpy/partition/tests/test_xmeans.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clustpy/partition/tests/test_xmeans.py b/clustpy/partition/tests/test_xmeans.py index f6cc9b1..8bb7872 100644 --- a/clustpy/partition/tests/test_xmeans.py +++ b/clustpy/partition/tests/test_xmeans.py @@ -67,10 +67,9 @@ def test_merge_clusters(): centers = np.array([[1., 0.], [4., 0.]]) ids_in_each_cluster = [np.array([i for i in range(9)]), np.array([i for i in range(9, 18)])] cluster_sizes = np.array([9, 9]) - cluster_variances = 
np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) / ( - cluster_sizes[c] - 1) for c in range(n_clusters)]) + cluster_inertias = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) for c in range(n_clusters)]) n_clusters, labels, centers = _merge_clusters(X, n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes, - cluster_variances, "bic-corrected") + cluster_inertias, "bic-corrected") assert n_clusters == 1 assert np.array_equal(labels, np.array([0] * 18)) assert np.array_equal(centers, np.array([[2.5, 0]]))