From 3b5bb09298026b6084ff15d2d745d38aaaa726da Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Fri, 17 Apr 2026 17:49:09 +0000 Subject: [PATCH 1/5] fix: update parent count and diameter handling in SplitSpecies class --- gambitdb/SplitSpecies.py | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/gambitdb/SplitSpecies.py b/gambitdb/SplitSpecies.py index 37c56a9..f48fd7a 100755 --- a/gambitdb/SplitSpecies.py +++ b/gambitdb/SplitSpecies.py @@ -85,9 +85,10 @@ def split_high_diameter_species(self): if subspecies is not None and not subspecies.empty: # 2+ viable clusters — subspeciate species = pd.concat([species, subspecies], ignore_index=False, sort=False) - species.loc[ - species["name"] == single_species[1]["name"], "diameter" - ] = 0.0 + parent_name = single_species[1]["name"] + parent_count = int((genome_metadata["species"] == parent_name).sum()) + species.loc[species["name"] == parent_name, "diameter"] = 0.0 + species.loc[species["name"] == parent_name, "ngenomes"] = parent_count elif subspecies is not None and subspecies.empty: # Singleton outliers removed, 1 cluster remains — keep species with recalculated diameter species.loc[ @@ -183,11 +184,11 @@ def split_single_high_diameter_species_into_subspecies( "All clusters are singletons for species %s, removing entirely", single_species[1]["name"] ) - self.save_small_clusters_accessions_removed(small_clusters, single_species) + self.save_small_clusters_accessions_removed(small_clusters, single_species, genome_metadata) return None, genome_metadata, single_species # Always record singleton accessions as removed - self.save_small_clusters_accessions_removed(small_clusters, single_species) + self.save_small_clusters_accessions_removed(small_clusters, single_species, genome_metadata) if num_clusters == 1: # Single viable cluster remains after removing singletons. @@ -317,22 +318,14 @@ def create_subspecies_from_clusters( return subspecies, genome_metadata, single_species - def save_small_clusters_accessions_removed(self, small_clusters, single_species): + def save_small_clusters_accessions_removed(self, small_clusters, single_species, genome_metadata): """ - Saves the accessions of small clusters to a file. - Args: - small_clusters (DataFrame): A DataFrame containing the small clusters. - single_species (DataFrame): A DataFrame containing the single species. - Returns: - None - Side Effects: - Updates the accessions_removed attribute. - Examples: - >>> save_small_clusters_accessions_removed(small_clusters, single_species) + Records singleton/small-cluster accessions as removed and drops them from genome_metadata + so downstream counts reflect the post-removal state. """ - # save the accessions of the small clusters to a file small_clusters_accessions = small_clusters["assembly_accession"].tolist() self.accessions_removed = self.accessions_removed + small_clusters_accessions + genome_metadata.drop(small_clusters_accessions, inplace=True) self.logger.debug( "Remove small clusters: " From 9c5b60d4591740d982a862a4b2a89549572f5346 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Fri, 17 Apr 2026 17:49:15 +0000 Subject: [PATCH 2/5] test: enhance SplitSpecies tests with additional assertions and add test data files --- gambitdb/tests/SplitSpecies_test.py | 13 +++++++++++++ gambitdb/tests/data/gambitdb/pw-dists.csv | 5 +++++ gambitdb/tests/data/gambitdb/signatures.h5 | Bin 0 -> 8736 bytes 3 files changed, 18 insertions(+) create mode 100644 gambitdb/tests/data/gambitdb/pw-dists.csv create mode 100644 gambitdb/tests/data/gambitdb/signatures.h5 diff --git a/gambitdb/tests/SplitSpecies_test.py b/gambitdb/tests/SplitSpecies_test.py index 7c5b49c..a7e5a31 100755 --- a/gambitdb/tests/SplitSpecies_test.py +++ b/gambitdb/tests/SplitSpecies_test.py @@ -91,6 +91,14 @@ def test_split_species(self): self.assertEqual(s.shape[0], 5) self.assertEqual(g.shape[0], 13) self.assertEqual(len(accessions_removed), 0) + # Parent species 'Yellow black' was subspeciated; its genomes are + # reassigned to the subspecies rows, so parent ngenomes must be 0. + parent_row = s[s['name'] == 'Yellow black'] + self.assertEqual(len(parent_row), 1) + self.assertEqual(int(parent_row['ngenomes'].iloc[0]), 0) + self.assertEqual(float(parent_row['diameter'].iloc[0]), 0.0) + # No genomes in genome_metadata should still be labeled with the parent name. + self.assertEqual((g['species'] == 'Yellow black').sum(), 0) def test_two_genome_high_diameter_species_removed(self): """ @@ -145,6 +153,11 @@ def test_singleton_outliers_removed_species_kept(self): self.assertEqual(int(red_black['ngenomes'].iloc[0]), 3) # No subspecies should have been created self.assertNotIn('subspecies', ' '.join(s['name'].tolist())) + # Singleton outliers GCA_4 and GCA_5 should have been physically dropped + # from genome_metadata (8 original rows - 2 removed = 6). + self.assertEqual(g.shape[0], 6) + self.assertNotIn('GCA_4', g.index) + self.assertNotIn('GCA_5', g.index) def test_all_singletons_species_removed(self): """ diff --git a/gambitdb/tests/data/gambitdb/pw-dists.csv b/gambitdb/tests/data/gambitdb/pw-dists.csv new file mode 100644 index 0000000..392a552 --- /dev/null +++ b/gambitdb/tests/data/gambitdb/pw-dists.csv @@ -0,0 +1,5 @@ +,s2_overlap_s1,s3_partial_overlap_s1,s4_no_overlap,sample1 +s2_overlap_s1,0.0000,0.0000,0.0000,0.0000 +s3_partial_overlap_s1,0.0000,0.0000,0.0000,0.0000 +s4_no_overlap,0.0000,0.0000,0.0000,0.0000 +sample1,0.0000,0.0000,0.0000,0.0000 diff --git a/gambitdb/tests/data/gambitdb/signatures.h5 b/gambitdb/tests/data/gambitdb/signatures.h5 new file mode 100644 index 0000000000000000000000000000000000000000..13e2dfdbc588ecdb4d4686af582baf03ffc3b485 GIT binary patch literal 8736 zcmeI2&2G~`5P)a%(^{=7j095Y$#CLGfdfZJZD)WRGfbxf3*Kkbzjy?{DV0v<35=)LL$yq zSI~FsK^+u*Q!2Aw_UhqhExX2VJ#F*ro4Htjr{mIkl&{N)A)<^dZdh3f`RV=PzOA|W zTPVDHT7(j=v4GqAZj&o`+TQ0rdJgsu-(jT@aMTS5#yL1T80;Av>vlnY{pM)VYaF(m z3FYfJo5oY~7>;#r+PHq#*{>)0IN6=&f6VQ#;#?P1s>d80m%GMCul+jz%Rl`T4$%YF zL<#{RAOwVf5D)@FKnMr{As_^VfDjM@O#--Yryo)5UK{u0WZT~W7X5awkNbG?^WMIh zQ*0q11cZPP5CTGAwFxXcf8uihmiA=Eo)I+9BkOag4jv)Y=S~qFy*zJ#xNX?0VJ_p* ZoaLN{ak#&SLhRDcqxqi=oN+aI{06PZgS-F$ literal 0 HcmV?d00001 From 0d7d470ac64eada2ffa7dcbc181ee80067e5ce63 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Mon, 20 Apr 2026 19:02:47 +0000 Subject: [PATCH 3/5] fix: remove processed accessions from genome metadata and species list in split_single_high_diameter_species_into_subspecies method --- gambitdb/SplitSpecies.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gambitdb/SplitSpecies.py b/gambitdb/SplitSpecies.py index f48fd7a..511069b 100755 --- a/gambitdb/SplitSpecies.py +++ b/gambitdb/SplitSpecies.py @@ -103,6 +103,8 @@ def split_high_diameter_species(self): genome_metadata["species_taxid"] == single_species[0] ] self.accessions_removed.extend(genome_accessions.index.tolist()) + genome_metadata.drop(genome_accessions.index, inplace=True) + species = species[species["name"] != single_species[1]["name"]] return species, genome_metadata, self.accessions_removed From 3c315138ccc4d11e908d231d74a07773b2239888 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Mon, 20 Apr 2026 19:02:50 +0000 Subject: [PATCH 4/5] fix: update assertions in split_high_diameter_species tests to verify genome removal --- gambitdb/tests/SplitSpecies_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/gambitdb/tests/SplitSpecies_test.py b/gambitdb/tests/SplitSpecies_test.py index a7e5a31..502dd78 100755 --- a/gambitdb/tests/SplitSpecies_test.py +++ b/gambitdb/tests/SplitSpecies_test.py @@ -115,12 +115,15 @@ def test_two_genome_high_diameter_species_removed(self): 1, 'average', False) s, g, accessions_removed = ss.split_high_diameter_species() # Yellow black (2 genomes, diameter 0.9) should be removed entirely - # Remaining: Yellow white + Yellow genus = 2 species rows, diameter set to 0.0 for Yellow black + # Remaining: Yellow white + Yellow genus = 2 species rows self.assertNotIn('Yellow black subspecies', ' '.join(s['name'].tolist())) - # GCA_1 and GCA_2 should be in accessions_removed + self.assertNotIn('Yellow black', s['name'].tolist()) + # Both genomes should be removed from genome_metadata and recorded in accessions_removed self.assertIn('GCA_1', accessions_removed) self.assertIn('GCA_2', accessions_removed) self.assertEqual(len(accessions_removed), 2) + self.assertNotIn('GCA_1', g.index) + self.assertNotIn('GCA_2', g.index) def test_singleton_outliers_removed_species_kept(self): """ @@ -206,6 +209,11 @@ def test_all_singletons_species_removed(self): self.assertIn('GCA_1', accessions_removed) self.assertIn('GCA_2', accessions_removed) self.assertIn('GCA_3', accessions_removed) + # Species row should be dropped entirely; genomes dropped from metadata + self.assertNotIn('All apart', s['name'].tolist()) + self.assertNotIn('GCA_1', g.index) + self.assertNotIn('GCA_2', g.index) + self.assertNotIn('GCA_3', g.index) finally: os.unlink(pw_path) From 5cdc06157d6c567c37fc549dd808400ea34ef516 Mon Sep 17 00:00:00 2001 From: Michal-Babins Date: Tue, 19 May 2026 15:28:11 +0000 Subject: [PATCH 5/5] Bump ver --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index d917d3e..845639e 100755 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.2 +0.1.4