From 0fd16fdffcc953e8b5cb6fc9b9684004a80e07ca Mon Sep 17 00:00:00 2001
From: anripa <anri.patron@gmail.com>
Date: Thu, 28 May 2026 09:51:31 +0300
Subject: [PATCH 1/2] Fix PG-means - select the largest log likelihood model
 when updating (instead of lowest) - use log likelihood for comparing models
 instead of the lower bound - improve new center proposal by picking the
 lowest log likelihood points - fix possible bias in random projections by
 using randn instead of rand - scale critical value if downsampling data

---
 clustpy/partition/pgmeans.py | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/clustpy/partition/pgmeans.py b/clustpy/partition/pgmeans.py
index 086c1ba..4092ba3 100644
--- a/clustpy/partition/pgmeans.py
+++ b/clustpy/partition/pgmeans.py
@@ -4,12 +4,13 @@
 """
 
 import numpy as np
-from sklearn.mixture import GaussianMixture as GMM
 from scipy.stats import ks_2samp
 from sklearn.base import BaseEstimator, ClusterMixin
-from clustpy.utils.checks import check_parameters
+from sklearn.mixture import GaussianMixture as GMM
 from sklearn.utils.validation import check_is_fitted
 
+from clustpy.utils.checks import check_parameters
+
 
 def _pgmeans(X, significance, n_projections, n_samples, n_new_centers, amount_random_centers, n_clusters_init,
              max_n_clusters, random_state) -> (int, np.ndarray, np.ndarray, GMM):
@@ -62,7 +63,8 @@ def _pgmeans(X, significance, n_projections, n_samples, n_new_centers, amount_ra
         gmm_matches = True
         for _ in range(n_projections):
             # Get random projection
-            projection_vector = random_state.rand(X.shape[1])
+            projection_vector = random_state.randn(X.shape[1])
+            projection_vector /= np.linalg.norm(projection_vector)
             # Project data
             projected_X = np.matmul(X, projection_vector)
             # Project model - Alternative: Sample directly from model and project samples (should be slower)
@@ -150,12 +152,11 @@ def _update_gmm_with_new_center(X: np.ndarray, n_clusters: int, current_gmm: GMM
         The updated GMM with an additional center added
     """
     best_gmm = None
-    best_log_likelihood = np.inf
+    best_log_likelihood = -np.inf
     if n_new_non_random_centers > 0:
-        # Non-random centers are chosen through lowest probability regarding current GMM
-        max_probability_densities = np.max(current_gmm.predict_proba(X), axis=1)
-        # Get minimum max probabilities
-        possible_non_random_samples = np.argsort(max_probability_densities)
+        pointwise_log_likelihood = current_gmm.score_samples(X)
+        # Get lowest fit point (lowest log likelihood)
+        possible_non_random_samples = np.argsort(pointwise_log_likelihood)
     for c in range(n_new_non_random_centers + n_new_random_centers):
         if c < n_new_non_random_centers:
             # Add non random centers
@@ -166,9 +167,10 @@ def _update_gmm_with_new_center(X: np.ndarray, n_clusters: int, current_gmm: GMM
         new_gmm = GMM(n_components=n_clusters, n_init=1, means_init=np.r_[current_gmm.means_, [new_center]],
                       random_state=random_state)
         new_gmm.fit(X)
+        new_avg_log_likelihood = new_gmm.score(X)
         # Check error of new GMM
-        if new_gmm.lower_bound_ < best_log_likelihood:
-            best_log_likelihood = new_gmm.lower_bound_
+        if new_avg_log_likelihood > best_log_likelihood:
+            best_log_likelihood = new_avg_log_likelihood
             best_gmm = new_gmm
     return best_gmm
 
@@ -296,16 +298,25 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'PGMeans':
             this instance of the PGMeans algorithm
         """
         X, _, random_state = check_parameters(X=X, y=y, random_state=self.random_state)
+        n, _ = X.shape
+
+        alpha = self.significance
+
         if self.n_projections is None:
             n_projections = int(-2.6198 * np.log(self.significance)) + 1
         else:
             n_projections = self.n_projections
         if self.n_samples is None:
-            n_samples = int(3 / self.significance) + 1
+            n_prime = 3 / alpha
+            n_samples = int(n_prime) + 1
+            if n_prime < n:
+                # scale the significance level if using less than n generated points for KS test
+                alpha = np.sqrt(n_prime / n) * alpha
         else:
             n_samples = self.n_samples
+
         n_samples = min(n_samples, X.shape[0])
-        n_clusters, labels, centers, gmm = _pgmeans(X, self.significance, n_projections, n_samples,
+        n_clusters, labels, centers, gmm = _pgmeans(X, alpha, n_projections, n_samples,
                                                self.n_new_centers, self.amount_random_centers, self.n_clusters_init,
                                                self.max_n_clusters, random_state)
         self.n_clusters_ = n_clusters

From 90e8d67479aacbbf9f1738bbe338f3a0e5639edd Mon Sep 17 00:00:00 2001
From: anripa <anri.patron@gmail.com>
Date: Tue, 2 Jun 2026 09:07:03 +0300
Subject: [PATCH 2/2] Critical value scaling if using less than n generated
 samples

---
 clustpy/partition/pgmeans.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/clustpy/partition/pgmeans.py b/clustpy/partition/pgmeans.py
index 4092ba3..f09fe39 100644
--- a/clustpy/partition/pgmeans.py
+++ b/clustpy/partition/pgmeans.py
@@ -309,12 +309,14 @@ def fit(self, X: np.ndarray, y: np.ndarray = None) -> 'PGMeans':
         if self.n_samples is None:
             n_prime = 3 / alpha
             n_samples = int(n_prime) + 1
-            if n_prime < n:
-                # scale the significance level if using less than n generated points for KS test
-                alpha = np.sqrt(n_prime / n) * alpha
         else:
             n_samples = self.n_samples
 
+        if n_samples < n:
+                # scale the significance level if using less than n generated points for KS test
+                alpha = np.sqrt(n_samples / n) * alpha
+        
+
         n_samples = min(n_samples, X.shape[0])
         n_clusters, labels, centers, gmm = _pgmeans(X, alpha, n_projections, n_samples,
                                                self.n_new_centers, self.amount_random_centers, self.n_clusters_init,