collinleiber · collinleiber · Jun 11, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -21,7 +21,7 @@ jobs:
     # The executor is the environment in which the steps below will be executed - the image below selects the Python container
     # Change the version below to your required version of python
     docker:
-      - image: cimg/python:3.12.2
+      - image: cimg/python:3.14.5
     # Checkout the code as the first step. This is a dedicated CircleCI step.
     # The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
     # Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt.

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -26,7 +26,7 @@ jobs:
     - name: Set up Python
       uses: actions/setup-python@v5
       with:
-        python-version: '3.12'
+        python-version: '3.14'
 
     - name: Install dependencies
       run: |

diff --git a/.github/workflows/test-main.yml b/.github/workflows/test-main.yml
@@ -20,10 +20,10 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Set up Python 3.12
+      - name: Set up Python 3.14
         uses: actions/setup-python@v5
         with:
-          python-version: '3.12'
+          python-version: '3.14'
           cache: 'pip' # Speeds up flake8 installation
 
       - name: Install dependencies
@@ -42,9 +42,9 @@ jobs:
     needs: lint  # This job only starts if 'lint' passes
     runs-on: ubuntu-latest
     strategy:
-      fail-fast: false # don't break 3.12 if 3.10 fails
+      fail-fast: false # don't break a version if another one fails
       matrix:
-        python-version: ["3.12", "3.10"]
+        python-version: ["3.14", "3.11"]
 
     steps:
       - uses: actions/checkout@v4
@@ -62,17 +62,17 @@ jobs:
           pip install -e .[full]
 
       - name: Test with pytest (with codecov)
-        if: ${{ matrix.python-version == '3.10' }}
+        if: ${{ matrix.python-version == '3.11' }}
         run: |
           pytest -m "not largedata" --cov --cov-report=xml
 
       - name: Test with pytest (without codecov)
-        if: ${{ matrix.python-version != '3.10' }}
+        if: ${{ matrix.python-version != '3.11' }}
         run: |
           pytest -m "not largedata"
 
       - name: Upload coverage reports to Codecov
-        if: ${{ matrix.python-version == '3.10' }}
+        if: ${{ matrix.python-version == '3.11' }}
         uses: codecov/codecov-action@v4
         with:
           token: ${{ secrets.CODECOV_TOKEN }}

diff --git a/clustpy/partition/gmeans.py b/clustpy/partition/gmeans.py
@@ -54,10 +54,13 @@ def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clus
             labels_split, centers_split, _ = _execute_two_means(X[ids_in_cluster], [np.arange(ids_in_cluster.shape[0])], 0,
                                                              np.array([centers[c]]), n_split_trials, random_state)
             # Project data from cluster onto resulting connection axis
-            projected_data = np.dot(X[ids_in_cluster], centers_split[0] - centers_split[1])
+            projection_vector = centers_split[0] - centers_split[1]
+            projection_vector /= np.linalg.norm(projection_vector)
+            projected_data = np.dot(X[ids_in_cluster], projection_vector)
+            projected_data = (projected_data - projected_data.mean()) / projected_data.std()
             # Use Anderson Darling to test if data is Gaussian
-            ad_result = anderson(projected_data, "norm")
-            p_value = _anderson_darling_statistic_to_prob(ad_result.statistic, len(ids_in_cluster))
+            ad_result = anderson(projected_data, "norm", method="interpolate")
+            p_value = ad_result.pvalue
             if p_value < significance:
                 # If data is not Gaussian, keep the newly created cluster centers
                 centers[c] = centers_split[0]
@@ -79,43 +82,6 @@ def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clus
     return n_clusters, labels, centers
 
 
-def _anderson_darling_statistic_to_prob(statistic: float, n_points: int) -> float:
-    """
-    Transform the statistic returned by the Anderson Darling test into a p_value.
-    First the adjusted statistic will be calculated.
-    Afterwards, the actual p-value can be obtained.
-
-    Parameters
-    ----------
-    statistic : float
-        The original statistic from the Anderson Darling test.
-    n_points : int
-        The number of samples
-
-    Returns
-    -------
-    p_value : float
-        The p-value
-
-    References
-    ----------
-    D'Agostino, Ralph B., and Michael A. Stephens. "Goodness-of-fit techniques."
-    Statistics: Textbooks and Monographs (1986).
-    """
-    adjusted_stat = statistic * (1 + (.75 / n_points) + 2.25 / (n_points ** 2))
-    if adjusted_stat < 0.2:
-        # is log q => therefore add 1 - ...
-        p_value = 1 - np.exp(-13.436 + 101.14 * adjusted_stat - 223.73 * (adjusted_stat ** 2))
-    elif adjusted_stat < 0.34:
-        # is log q => therefore add 1 - ...
-        p_value = 1 - np.exp(-8.318 + 42.796 * adjusted_stat - 59.938 * (adjusted_stat ** 2))
-    elif adjusted_stat < 0.6:
-        p_value = np.exp(0.9177 - 4.279 * adjusted_stat - 1.38 * (adjusted_stat ** 2))
-    else:
-        p_value = np.exp(1.2937 - 5.709 * adjusted_stat - 0.0186 * (adjusted_stat ** 2))
-    return p_value
-
-
 class GMeans(ClusterMixin, BaseEstimator):
     """
     Execute the GMeans clustering procedure.

diff --git a/clustpy/partition/tests/test_gmeans.py b/clustpy/partition/tests/test_gmeans.py
@@ -1,6 +1,5 @@
 import numpy as np
 from clustpy.partition import GMeans
-from clustpy.partition.gmeans import _anderson_darling_statistic_to_prob
 from sklearn.datasets import make_blobs
 from scipy.stats import anderson
 from clustpy.utils.checks import check_clustpy_estimator
@@ -10,18 +9,6 @@ def test_gmeans_estimator():
     check_clustpy_estimator(GMeans(), ("check_complex_data"))
 
 
-def test_anderson_darling_statistic_to_prob():
-    n_points = 20
-    statistic = 0.435
-    assert np.isclose(_anderson_darling_statistic_to_prob(statistic, n_points), 0.270, atol=0.001)
-    # Example from https://www.spcforexcel.com/knowledge/basic-statistics/anderson-darling-test-for-normality
-    data = np.array([3334, 3554, 3625, 3837, 3838])
-    ad_result = anderson(data, "norm")
-    statistic = ad_result.statistic
-    assert np.isclose(statistic, 0.288, atol=0.001)
-    assert np.isclose(_anderson_darling_statistic_to_prob(statistic, data.shape[0]), 0.456, atol=0.001)
-
-
 """
 Tests regarding the GMeans object
 """

diff --git a/clustpy/partition/tests/test_xmeans.py b/clustpy/partition/tests/test_xmeans.py
@@ -67,10 +67,9 @@ def test_merge_clusters():
     centers = np.array([[1., 0.], [4., 0.]])
     ids_in_each_cluster = [np.array([i for i in range(9)]), np.array([i for i in range(9, 18)])]
     cluster_sizes = np.array([9, 9])
-    cluster_variances = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) / (
-            cluster_sizes[c] - 1) for c in range(n_clusters)])
+    cluster_inertias = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) for c in range(n_clusters)])
     n_clusters, labels, centers = _merge_clusters(X, n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes,
-                                                  cluster_variances)
+                                                  cluster_inertias, "bic-corrected")
     assert n_clusters == 1
     assert np.array_equal(labels, np.array([0] * 18))
     assert np.array_equal(centers, np.array([[2.5, 0]]))
@@ -99,7 +98,7 @@ def test_simple_XMeans():
     assert np.array_equal(xmeans.cluster_centers_, xmeans2.cluster_centers_)
     # Test with parameters
     xmeans = XMeans(n_clusters_init=3, max_n_clusters=5, check_global_score=False, allow_merging=True, n_split_trials=5,
-                    random_state=1)
+                    split_criterion="aic-original", random_state=1)
     xmeans.fit(X)
     assert xmeans.labels_.dtype == np.int32
     assert xmeans.labels_.shape == labels.shape