From 5ef0b2ef09ceb475dd30fed4250edc9293424396 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 27 Apr 2026 23:33:00 -0400
Subject: [PATCH 1/2] add sv graph checker for yeast pangenome test

---
 test/evolverTest.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/evolverTest.py b/test/evolverTest.py
index 2faeec2a6..bc80866de 100644
--- a/test/evolverTest.py
+++ b/test/evolverTest.py
@@ -608,6 +608,7 @@ def _run_yeast_pangenome_step_by_step(self, binariesMode):
         mg_path = os.path.join(self.tempDir, 'yeast.sv.gfa.gz')
         mg_cmd = ['cactus-minigraph', self._job_store(binariesMode), seq_file_path, mg_path, '--reference', 'S288C'] + cactus_opts
         subprocess.check_call(mg_cmd)
+        self._validate_sv_gfa(mg_path)
 
         # run graphmap in base mode
         paf_path = os.path.join(self.tempDir, 'yeast.paf')
@@ -669,6 +670,21 @@ def _run_yeast_pangenome(self, binariesMode, mgSplit=False, collapse=False, augR
         subprocess.check_call(['mkdir', '-p', os.path.join(self.tempDir, 'chroms')])
         subprocess.check_call(['mv', os.path.join(join_path, 'chrom-subproblems', 'contig_sizes.tsv'), os.path.join(self.tempDir, 'chroms')])
 
+    def _validate_sv_gfa(self, gfa_path):
+        """ decompress the gzipped GFA and pipe it into `vg validate -`, asserting it passes.
+        catches things like missing edges in the merged minigraph SV GFA.
+        the file is decompressed in-process rather than via a `zcat |` shell
+        pipeline, so a corrupt archive raises here instead of being masked by
+        vg's exit status, and the path is never interpreted by a shell.
+        """
+        import gzip
+        self.assertTrue(os.path.exists(gfa_path), 'sv.gfa.gz not found at {}'.format(gfa_path))
+        with gzip.open(gfa_path, 'rb') as gfa_file:
+            gfa_bytes = gfa_file.read()
+        proc = subprocess.run(['vg', 'validate', '-'], input=gfa_bytes, capture_output=True)
+        self.assertEqual(proc.returncode, 0,
+                         'vg validate failed for {}\nstderr:\n{}'.format(gfa_path, proc.stderr.decode(errors='replace')))
+
     def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False, expect_haplo=False, expect_unchopped_gfa=False, expect_augRef=False):
         """ yeast pangenome chromosome by chromosome pipeline """
 
@@ -684,6 +700,12 @@ def _check_yeast_pangenome(self, binariesMode, other_ref=None, expect_odgi=False
         assert len(events) == 6
 
         join_path = os.path.join(self.tempDir, 'join')
+
+        # validate the minigraph SV GFA: catches dropped/dangling edges
+        # (e.g. the per-chrom merge_sv_gfa bug under --mgSplit)
+        sv_gfa_path = os.path.join(join_path, 'yeast.sv.gfa.gz')
+        if os.path.exists(sv_gfa_path):
+            self._validate_sv_gfa(sv_gfa_path)
         vcf_paths = [os.path.join(join_path, 'yeast.vcf.gz')]
         if other_ref:
             vcf_paths.append(os.path.join(join_path, 'yeast.{}.vcf.gz'.format(other_ref)))

From 1fe4b3dbd215984bef5e2d6e181d41531ca3b750 Mon Sep 17 00:00:00 2001
From: Glenn Hickey
Date: Mon, 27 Apr 2026 23:45:56 -0400
Subject: [PATCH 2/2] fix sv gfa merging bug

---
 src/cactus/refmap/cactus_graphmap_join.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/cactus/refmap/cactus_graphmap_join.py b/src/cactus/refmap/cactus_graphmap_join.py
index a7fde3248..1a4715c32 100644
--- a/src/cactus/refmap/cactus_graphmap_join.py
+++ b/src/cactus/refmap/cactus_graphmap_join.py
@@ -2210,11 +2210,18 @@ def merge_sv_gfa(job, options, sv_gfa_ids):
                     line = line.decode()
                     if line.startswith('S'):
                         toks = line.split('\t')
-                        seq_id = toks[1]
-                        seq_no = int(seq_id[1:]) + offset
+                        seq_no = int(toks[1][1:]) + offset
                         toks[1] = 's{}'.format(seq_no)
                         cur_max = max(cur_max, seq_no)
                         merged_gfa_file.write('\t'.join(toks).encode())
+                    elif line.startswith('L'):
+                        # L lines reference segment IDs in toks[1] (from) and toks[3] (to);
+                        # they need the same offset as S lines or edges will silently rewire
+                        # to segments from previously-merged chromosomes
+                        toks = line.split('\t')
+                        toks[1] = 's{}'.format(int(toks[1][1:]) + offset)
+                        toks[3] = 's{}'.format(int(toks[3][1:]) + offset)
+                        merged_gfa_file.write('\t'.join(toks).encode())
                     else:
                         merged_gfa_file.write(line.encode())
             offset = cur_max