Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
# The executor is the environment in which the steps below will be executed - here, the cimg/python container selected by the image tag below
# Change the version below to your required version of python
docker:
- image: cimg/python:3.12.2
- image: cimg/python:3.14.5
# Checkout the code as the first step. This is a dedicated CircleCI step.
# The python orb's install-packages step will install the dependencies from a Pipfile via Pipenv by default.
# Here we're making sure we just use the system-wide pip. By default it uses the project root's requirements.txt.
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.12'
python-version: '3.14'

- name: Install dependencies
run: |
Expand Down
14 changes: 7 additions & 7 deletions .github/workflows/test-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:
steps:
- uses: actions/checkout@v4

- name: Set up Python 3.12
- name: Set up Python 3.14
uses: actions/setup-python@v5
with:
python-version: '3.12'
python-version: '3.14'
cache: 'pip' # Speeds up flake8 installation

- name: Install dependencies
Expand All @@ -42,9 +42,9 @@ jobs:
needs: lint # This job only starts if 'lint' passes
runs-on: ubuntu-latest
strategy:
fail-fast: false # don't break 3.12 if 3.10 fails
fail-fast: false # don't break a version if another one fails
matrix:
python-version: ["3.12", "3.10"]
python-version: ["3.14", "3.11"]

steps:
- uses: actions/checkout@v4
Expand All @@ -62,17 +62,17 @@ jobs:
pip install -e .[full]

- name: Test with pytest (with codecov)
if: ${{ matrix.python-version == '3.10' }}
if: ${{ matrix.python-version == '3.11' }}
run: |
pytest -m "not largedata" --cov --cov-report=xml

- name: Test with pytest (without codecov)
if: ${{ matrix.python-version != '3.10' }}
if: ${{ matrix.python-version != '3.11' }}
run: |
pytest -m "not largedata"

- name: Upload coverage reports to Codecov
if: ${{ matrix.python-version == '3.10' }}
if: ${{ matrix.python-version == '3.11' }}
uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
Expand Down
46 changes: 6 additions & 40 deletions clustpy/partition/gmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,13 @@ def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clus
labels_split, centers_split, _ = _execute_two_means(X[ids_in_cluster], [np.arange(ids_in_cluster.shape[0])], 0,
np.array([centers[c]]), n_split_trials, random_state)
# Project data from cluster onto resulting connection axis
projected_data = np.dot(X[ids_in_cluster], centers_split[0] - centers_split[1])
projection_vector = centers_split[0] - centers_split[1]
projection_vector /= np.linalg.norm(projection_vector)
projected_data = np.dot(X[ids_in_cluster], projection_vector)
projected_data = (projected_data - projected_data.mean()) / projected_data.std()
# Use Anderson Darling to test if data is Gaussian
ad_result = anderson(projected_data, "norm")
p_value = _anderson_darling_statistic_to_prob(ad_result.statistic, len(ids_in_cluster))
ad_result = anderson(projected_data, "norm", method="interpolate")
p_value = ad_result.pvalue
if p_value < significance:
# If data is not Gaussian, keep the newly created cluster centers
centers[c] = centers_split[0]
Expand All @@ -79,43 +82,6 @@ def _gmeans(X: np.ndarray, significance: float, n_clusters_init: int, max_n_clus
return n_clusters, labels, centers


def _anderson_darling_statistic_to_prob(statistic: float, n_points: int) -> float:
    """
    Transform the statistic returned by the Anderson Darling test into a p_value.
    First the adjusted statistic will be calculated.
    Afterwards, the actual p-value can be obtained by a piecewise approximation whose coefficients depend on the
    range the adjusted statistic falls into.

    Parameters
    ----------
    statistic : float
        The original statistic from the Anderson Darling test.
    n_points : int
        The number of samples. Must be non-zero, as it is used as a divisor

    Returns
    -------
    p_value : float
        The p-value

    References
    ----------
    D'Agostino, Ralph B., and Michael A. Stephens. "Goodness-of-fit techniques."
    Statistics: Textbooks and Monographs (1986).
    """
    # Sample-size correction of the raw statistic
    adjusted_stat = statistic * (1 + (.75 / n_points) + 2.25 / (n_points ** 2))
    squared_stat = adjusted_stat ** 2
    if adjusted_stat < 0.2:
        # is log q => therefore add 1 - ...
        p_value = 1 - np.exp(-13.436 + 101.14 * adjusted_stat - 223.73 * squared_stat)
    elif adjusted_stat < 0.34:
        # is log q => therefore add 1 - ...
        p_value = 1 - np.exp(-8.318 + 42.796 * adjusted_stat - 59.938 * squared_stat)
    elif adjusted_stat < 0.6:
        p_value = np.exp(0.9177 - 4.279 * adjusted_stat - 1.38 * squared_stat)
    else:
        # The quadratic coefficient is positive in this range (+0.0186), unlike in the branch above
        p_value = np.exp(1.2937 - 5.709 * adjusted_stat + 0.0186 * squared_stat)
    return p_value


class GMeans(ClusterMixin, BaseEstimator):
"""
Execute the GMeans clustering procedure.
Expand Down
13 changes: 0 additions & 13 deletions clustpy/partition/tests/test_gmeans.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
from clustpy.partition import GMeans
from clustpy.partition.gmeans import _anderson_darling_statistic_to_prob
from sklearn.datasets import make_blobs
from scipy.stats import anderson
from clustpy.utils.checks import check_clustpy_estimator
Expand All @@ -10,18 +9,6 @@ def test_gmeans_estimator():
check_clustpy_estimator(GMeans(), ("check_complex_data"))


def test_anderson_darling_statistic_to_prob():
    """Check the statistic-to-p-value conversion against two expected values."""
    # Fixed statistic with a sample size of 20
    p_value = _anderson_darling_statistic_to_prob(0.435, 20)
    assert np.isclose(p_value, 0.270, atol=0.001)
    # Example from https://www.spcforexcel.com/knowledge/basic-statistics/anderson-darling-test-for-normality
    sample = np.array([3334, 3554, 3625, 3837, 3838])
    ad_statistic = anderson(sample, "norm").statistic
    assert np.isclose(ad_statistic, 0.288, atol=0.001)
    p_value = _anderson_darling_statistic_to_prob(ad_statistic, sample.shape[0])
    assert np.isclose(p_value, 0.456, atol=0.001)


"""
Tests regarding the GMeans object
"""
Expand Down
7 changes: 3 additions & 4 deletions clustpy/partition/tests/test_xmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,9 @@ def test_merge_clusters():
centers = np.array([[1., 0.], [4., 0.]])
ids_in_each_cluster = [np.array([i for i in range(9)]), np.array([i for i in range(9, 18)])]
cluster_sizes = np.array([9, 9])
cluster_variances = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) / (
cluster_sizes[c] - 1) for c in range(n_clusters)])
cluster_inertias = np.array([np.sum((X[ids_in_each_cluster[c]] - centers[c]) ** 2) for c in range(n_clusters)])
n_clusters, labels, centers = _merge_clusters(X, n_clusters, labels, centers, ids_in_each_cluster, cluster_sizes,
cluster_variances)
cluster_inertias, "bic-corrected")
assert n_clusters == 1
assert np.array_equal(labels, np.array([0] * 18))
assert np.array_equal(centers, np.array([[2.5, 0]]))
Expand Down Expand Up @@ -99,7 +98,7 @@ def test_simple_XMeans():
assert np.array_equal(xmeans.cluster_centers_, xmeans2.cluster_centers_)
# Test with parameters
xmeans = XMeans(n_clusters_init=3, max_n_clusters=5, check_global_score=False, allow_merging=True, n_split_trials=5,
random_state=1)
split_criterion="aic-original", random_state=1)
xmeans.fit(X)
assert xmeans.labels_.dtype == np.int32
assert xmeans.labels_.shape == labels.shape
Expand Down
Loading
Loading