From e43dd17d86312932c087798c60f22e3fa2cc007c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 2 Dec 2021 15:18:24 -0500
Subject: [PATCH 001/222] Only compute clust_lik; don't change underlying
 likelihood data

---
 hapaseg/allelic_DP.py | 60 +++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 33 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 9bf2d2e..790b4be 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -559,48 +559,42 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None
 
         max_clust_idx = segs_to_clusters.max() + 1
 
-        liks = np.full([segs_to_clusters.shape[0], 2], np.nan)
+        liks = np.full(segs_to_clusters.shape[0], np.nan)
 
         for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)):
-            # reset phases
-            # TODO: when we switch to faster phasing correction model that doesn't involve modifying self.S, this won't be necessary
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(ph_samp != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
-
             ## overall clustering likelihood
-            A = npg.aggregate(cl_samp, S_ph["min"], size = max_clust_idx)
-            B = npg.aggregate(cl_samp, S_ph["maj"], size = max_clust_idx)
 
-# for when self.S is not modified
-#            A = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) + \
-#              npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx)
-#
-#            B = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) + \
-#              npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx)
+            A1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx)
+            A2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx)
 
-            clust_lik = ss.betaln(A + 1, B + 1).sum()
-
-            ## segmentation likelihood
-
-            # get segment boundaries
-            bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1])
-            bdy = np.c_[bdy[:-1], bdy[1:]]
+            B1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx)
+            B2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx)
 
-            # sum log-likelihoods of each segment
-            seg_lik = 0
-            for st, en in bdy:
-                A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum()
+            count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)
+            count_prior /= count_prior.sum()
 
-# for when self.S is not modified
-#               A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
-#                   self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum()
-#               B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
-#                   self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum()
+            clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1) + np.log(count_prior)[count_prior > 0]).sum()
 
-                seg_lik += ss.betaln(A + 1, B + 1)
+#            ## segmentation likelihood
+#
+#            # get segment boundaries
+#            bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1])
+#            bdy = np.c_[bdy[:-1], bdy[1:]]
+#
+#            # sum log-likelihoods of each segment
+#            seg_lik = 0
+#            for st, en in bdy:
+#                A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum()
+#
+## for when self.S is not modified
+##               A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
+##                   self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum()
+##               B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
+##                   self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum()
+#
+#                seg_lik += ss.betaln(A + 1, B + 1)
 
-            liks[i, :] = np.r_[clust_lik, seg_lik]
+            liks[i] = clust_lik
 
         return liks
 

From 4f748ea898e7333abc928a154790a7c8e084bfdd Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 14:53:07 -0500
Subject: [PATCH 002/222] Split rephase into probability/realization

---
 hapaseg/allelic_DP.py | 43 +++++++++++++++++++++++--------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 790b4be..85b07ae 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -354,29 +354,32 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster
         self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage
 
+    def compute_rephase_prob(self, seg_idx):
+        A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1
+        A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1
+        B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1
+        B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1
+
+        # use normal approximation to beta if conditions are right
+        if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20:
+            m_x = A_a/(A_a + A_b)
+            s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1))
+            m_y = B_a/(B_a + B_b)
+            s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1))
+
+            return s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y))
+
+        # Monte Carlo simulate difference of betas
+        else:
+            x = s.beta.rvs(A_a, A_b, size = 1000)
+            y = s.beta.rvs(B_a, B_b, size = 1000)
+
+            return (x > y).mean()
 
     def rephase(self, seg_idx, force = False):
+        do_rephase = False
         if not force:
-            A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1
-            A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1
-            B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1
-            B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1
-
-            # use normal approximation to beta if conditions are right
-            if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20:
-                m_x = A_a/(A_a + A_b)
-                s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1))
-                m_y = B_a/(B_a + B_b)
-                s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1))
-
-                do_rephase = np.random.rand() < s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y))
-
-            # Monte Carlo simulate difference of betas
-            else:
-                x = s.beta.rvs(A_a, A_b, size = 1000)
-                y = s.beta.rvs(B_a, B_b, size = 1000)
-
-                do_rephase = np.random.rand() < (x > y).mean()
+            do_rephase = np.random.rand() < self.compute_rephase_prob(seg_idx)
 
         if force or do_rephase:
             self.S.iloc[seg_idx, [self.min_col, self.maj_col]] = self.S.iloc[seg_idx, [self.min_col, self.maj_col]].values[:, ::-1]

From 69efc109d69472c4034a50018766820317d942cb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 14:53:59 -0500
Subject: [PATCH 003/222] Compute likelihood in both phase orientations

---
 hapaseg/allelic_DP.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 85b07ae..204119f 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -608,6 +608,7 @@ def run(self, n_iter = 50):
         if len(self.clust_prior) > 1:
             for seg_idx in range(len(self.S)):
                 seg_idx = np.r_[seg_idx] 
+                #self.rephase(seg_idx)
 
                 # compute probability that segment belongs to each cluster prior element
                 S_a = self.S.iloc[seg_idx[0], self.min_col]
@@ -786,7 +787,7 @@ def run(self, n_iter = 50):
             #
             # perform phase correction on segment/cluster
             # flip min/maj with probability that alleles are oriented the "wrong" way
-            self.rephase(seg_idx)
+            rephase_prob = self.compute_rephase_prob(seg_idx)
 
             #
             # choose to join a cluster or make a new one
@@ -821,18 +822,22 @@ def run(self, n_iter = 50):
             C = ss.betaln(C_ab[:, 0] + 1, C_ab[:, 1] + 1)
             # A is likelihood cluster B is part of, minus B
             #A = ss.betaln(A_a + 1, A_b + 1)
-            # B+C is likelihood of target cluster post-join
-            BC = ss.betaln(C_ab[:, 0] + B_a + 1, C_ab[:, 1] + B_b + 1)
+            # B+C is likelihood of target cluster post-join, with both phase orientations
+            BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1)
+
+            MLs = BC - C[:, None] + np.log(np.r_[1 - rephase_prob, rephase_prob])
+            # TODO: get adj_BC working again
 
             #     L(join)           L(split)
             #MLs = A + BC + adj_BC - (AB + C + adj_AB)
             # TODO: remove extraneous calculations (e.g. adj_AB, AB, A);
             #       likelihood simplifies to this in the prior:
-            MLs = adj_BC + BC - C
+            #MLs = adj_BC + BC - C
 
             # if we are moving multiple contiguous segments assigned to the same
             # cluster, do not allow them to create a new cluster. this helps keep
             # cluster indices consistent
+            # TODO: if we don't care about keeping indices consistent, then we can probably remove this line
             if n_move > 1 and not move_clust:
                 MLs[self.clust_sums.index(-1)] = -np.inf
 

From fd60e522d1dbd549afb0f9afc7c0fe59b98ba5db Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 14:54:31 -0500
Subject: [PATCH 004/222] Compute prior in both phase orientations

---
 hapaseg/allelic_DP.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 204119f..8c5ddb8 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -872,16 +872,17 @@ def run(self, n_iter = 50):
                   np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null | {0})]]
                 ]
 
+                # prior marginal likelihoods for both phase orientations
                 prior_MLs = ss.betaln( # prior clusters + segment
-                  np.r_[self.clust_prior_mat[prior_idx, 0]] + B_a + 1,
-                  np.r_[self.clust_prior_mat[prior_idx, 1]] + B_b + 1
+                  np.c_[self.clust_prior_mat[prior_idx, 0]] + np.c_[B_a, B_b] + 1,
+                  np.c_[self.clust_prior_mat[prior_idx, 1]] + np.c_[B_b, B_a] + 1
                 ) \
-                - (ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]) # prior clusters, segment
+                - np.c_[ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]] # prior clusters, segment
 
                 clust_prior_p = np.maximum(np.exp(prior_MLs - prior_MLs.max())/np.exp(prior_MLs - prior_MLs.max()).sum(), 1e-300)
 
                 # expand MLs to account for multiple new clusters
-                MLs = np.r_[np.full(len(prior_diff), MLs[0]), MLs[1:]]
+                MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]]
                 
             # DP prior based on clusters sizes
             # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster)
@@ -890,7 +891,7 @@ def run(self, n_iter = 50):
             count_prior /= count_prior.sum()
 
             # choose to join a cluster or make a new one (choice_idx = 0)
-            num = MLs + np.log(count_prior) + np.log(clust_prior_p)
+            num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p)
             choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
             choice_idx = np.random.choice(
               np.r_[0:len(MLs)],

From 95cd50b22232acde087ef3b29b93a11b28c27d33 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 15:25:14 -0500
Subject: [PATCH 005/222] Pick new cluster accounting for phasing state

---
 hapaseg/allelic_DP.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 8c5ddb8..a751406 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -894,12 +894,16 @@ def run(self, n_iter = 50):
             num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p)
             choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
             choice_idx = np.random.choice(
-              np.r_[0:len(MLs)],
-              p = choice_p
+              np.r_[0:np.prod(choice_p.shape)],
+              p = choice_p.ravel()
             )
             # -1 = brand new, -2, -3, ... = -(prior clust index) - 2
             # 0 = garbage
-            choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx]
+            choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx//2]
+
+            # save rephasing status
+            if choice_idx & 1:
+                self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
 
             # create new cluster
             if choice < 0:

From 81e446914bcf1c9d8c411867e6d0e06e0e33f915 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 15:32:44 -0500
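Subject: [PATCH 006/222] Apply some commits from fastphase branch

Cache flattened NumPy copies of the min/maj, A_ref/B_ref and A_alt/B_alt
columns of S on the instance (mm_mat, ref_mat, alt_mat), and replace
direct reads of the min/maj columns in compute_adj_liks() and run() with
calls to self._Siat_ph() / self._Ssum_ph(). Also drop the commented-out
self.rephase() call in run().

Note: _Siat_ph() and _Ssum_ph() are added by the following patch, so this
commit is not runnable on its own.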

---
 hapaseg/allelic_DP.py | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index a751406..6144871 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -332,6 +332,10 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.clust_count_prior = clust_count_prior.copy()
         self.alpha = alpha
 
+        self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed
+        self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
+        self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F")
+
         #
         # define column indices
         self.clust_col = self.S.columns.get_loc("clust")
@@ -447,8 +451,8 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                   (self.clusts[st - j] == U_cl or self.clusts[st - j] == 0):
                     # again, skip over segments in the garbage
                     if self.clusts[st - j] != 0:
-                        UD_counts[o, 0] += self.S.iloc[st - j, self.min_col]
-                        UD_counts[o, 1] += self.S.iloc[st - j, self.maj_col]
+                        UD_counts[o, 0] += self._Siat_ph(st - j, min = True)
+                        UD_counts[o, 1] += self._Siat_ph(st - j, min = False)
 
                     j += 1
 
@@ -464,8 +468,8 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                 while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \
                   (self.clusts[en + j] == D_cl or self.clusts[en + j] == 0):
                     if self.clusts[en + j] != 0:
-                        UD_counts[o, 2] += self.S.iloc[en + j, self.min_col]
-                        UD_counts[o, 3] += self.S.iloc[en + j, self.maj_col]
+                        UD_counts[o, 2] += self._Siat_ph(en + j, min = True)
+                        UD_counts[o, 3] += self._Siat_ph(en + j, min = False)
 
                     j += 1
 
@@ -497,8 +501,8 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                 # min/maj counts of the segment(s) being moved
                 st = ordpairs[j, 0]
                 en = ordpairs[j, 1]
-                S_a = self.S.iloc[:, self.min_col].values[st:(en + 1)].sum()
-                S_b = self.S.iloc[:, self.maj_col].values[st:(en + 1)].sum()
+                S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # XXX: why en + 1?
+                S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # XXX: why en + 1?
 
                 # adjacency likelihood of this segment remaining where it is
 #                adj_AB += self.SJliks(
@@ -608,11 +612,10 @@ def run(self, n_iter = 50):
         if len(self.clust_prior) > 1:
             for seg_idx in range(len(self.S)):
                 seg_idx = np.r_[seg_idx] 
-                #self.rephase(seg_idx)
 
                 # compute probability that segment belongs to each cluster prior element
-                S_a = self.S.iloc[seg_idx[0], self.min_col]
-                S_b = self.S.iloc[seg_idx[0], self.maj_col]
+                S_a = self._Siat_ph(seg_idx[0], min = True)
+                S_b = self._Siat_ph(seg_idx[0], min = False)
                 P_a = self.clust_prior_mat[1:, 0]
                 P_b = self.clust_prior_mat[1:, 1]
 
@@ -751,7 +754,7 @@ def run(self, n_iter = 50):
                         del self.clust_sums[cur_clust]
                         del self.clust_members[cur_clust]
                     else:
-                        self.clust_sums[cur_clust] -= np.r_[self.S.iloc[seg_idx, self.min_col].sum(), self.S.iloc[seg_idx, self.maj_col].sum()]
+                        self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
                         self.clust_members[cur_clust] -= set(seg_idx)
 
                     unassigned_segs.update(seg_idx)
@@ -798,8 +801,8 @@ def run(self, n_iter = 50):
             # C is all possible clusters to move to
             A_a = self.clust_sums[cur_clust][0] if cur_clust in self.clust_sums else 0
             A_b = self.clust_sums[cur_clust][1] if cur_clust in self.clust_sums else 0
-            B_a = self.S.iloc[seg_idx, self.min_col].sum() # TODO: slow if seg_idx contains many SNPs
-            B_b = self.S.iloc[seg_idx, self.maj_col].sum()
+            B_a = self._Ssum_ph(seg_idx, min = True)
+            B_b = self._Ssum_ph(seg_idx, min = False)
             C_ab = np.r_[self.clust_sums.values()] # first terms: (-1) = make new cluster, (0) = garbage cluster
             #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust
 

From f7e360c0583877d0305251757972d909a9be1026 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 15:37:27 -0500
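Subject: [PATCH 007/222] Apply more commits from fastphase

Add _Siat_ph() and _Ssum_ph(), which return min/maj counts for a row or
a set of rows with the per-row "flipped" flag applied. Make
compute_rephase_prob() sum the flattened alt/ref arrays using the same
flag instead of indexing self.S, and remove rephase(), which swapped the
min/maj and ref/alt columns of self.S in place.

Note: run() still calls self.rephase(seg_idx, force = True) after this
patch; that call is replaced in the following patch.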

---
 hapaseg/allelic_DP.py | 39 ++++++++++++++++++++++++---------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 6144871..5860eb7 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -358,11 +358,31 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster
         self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage
 
+    def _Siat_ph(self, ridx, min = True):
+        # min, flip => maj
+        # ~min, ~flip => maj
+        # min, ~flip => min
+        # ~min, flip => min
+        col = self.min_col if self.S.iat[ridx, self.flip_col] ^ min else self.maj_col
+        return self.S.iat[ridx, col]
+
+    def _Ssum_ph(self, seg_idx, min = True):
+        #flip = self.flip_mat[seg_idx]
+        flip = self.S.iloc[seg_idx, self.flip_col]
+        flip_n = ~flip
+        if min:
+            return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum()
+        else:
+            return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum()
+
     def compute_rephase_prob(self, seg_idx):
-        A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1
-        A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1
-        B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1
-        B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1
+        flip = self.S.iloc[seg_idx, self.flip_col]
+        flip_n = ~flip
+
+        A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1
+        A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1
+        B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1
+        B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1
 
         # use normal approximation to beta if conditions are right
         if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20:
@@ -380,17 +400,6 @@ def compute_rephase_prob(self, seg_idx):
 
             return (x > y).mean()
 
-    def rephase(self, seg_idx, force = False):
-        do_rephase = False
-        if not force:
-            do_rephase = np.random.rand() < self.compute_rephase_prob(seg_idx)
-
-        if force or do_rephase:
-            self.S.iloc[seg_idx, [self.min_col, self.maj_col]] = self.S.iloc[seg_idx, [self.min_col, self.maj_col]].values[:, ::-1]
-            self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]] = self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]].values[:, ::-1]
-            self.S.iloc[seg_idx, [self.aref_col, self.bref_col]] = self.S.iloc[seg_idx, [self.aref_col, self.bref_col]].values[:, ::-1]
-            self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
-
     def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_b, D_a, D_b):
 #            if st == en:
 #                J_a = S.iat[st, min_col].sum()

From e9a9f00443131008c4ef4d0f8a4ce26b58112d68 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 15:38:32 -0500
Subject: [PATCH 008/222] Eliminate self.rephase

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 5860eb7..c6a3241 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -651,7 +651,7 @@ def run(self, n_iter = 50):
 
                 # rephase
                 if choice < 0:
-                    self.rephase(seg_idx, force = True)
+                    self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
                     choice = -choice
 
                 self.S.iloc[seg_idx, self.clust_col] = choice

From b75ae38d1375f176b81b65211ce6c0fa7b33caa7 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 15:40:56 -0500
Subject: [PATCH 009/222] Add clarifying comment

---
 hapaseg/allelic_DP.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index c6a3241..ddcdbf1 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -905,6 +905,7 @@ def run(self, n_iter = 50):
             # choose to join a cluster or make a new one (choice_idx = 0)
             num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p)
             choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
+            # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(
               np.r_[0:np.prod(choice_p.shape)],
               p = choice_p.ravel()

From 6d904da642540886ad375460fa546c0c253a1219 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 6 Dec 2021 16:05:27 -0500
Subject: [PATCH 010/222] Draft commit of getting rid of garbage cluster

---
 hapaseg/allelic_DP.py | 94 +++++++++++--------------------------------
 1 file changed, 24 insertions(+), 70 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index ddcdbf1..4632b22 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -111,7 +111,7 @@ def load_seg_samp(self, samp_idx):
 
         # initial cluster assignments
         S["clust"] = -1 # initially, all segments are unassigned
-        S.iloc[0, S.columns.get_loc("clust")] = 1 # first segment is assigned to cluster 1
+        S.iloc[0, S.columns.get_loc("clust")] = 0 # first segment is assigned to cluster 0
 
         # initial phasing orientation
         S["flipped"] = False
@@ -231,10 +231,6 @@ def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None):
                 del clust_prior[kk]
                 del clust_count_prior[kk]
 
-            # remove garbage cluster from priors
-            #del clust_prior[0]
-            #del clust_count_prior[0]
-
         return self.snps_to_clusters, self.snps_to_phases, self.DP_likelihoods
 
     def visualize_segs(self, snps_to_clusters = None, f = None, n_vis_samp = None):
@@ -356,7 +352,6 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.clust_prior_mat = np.r_[self.clust_prior.values()]
 
         self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster
-        self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage
 
     def _Siat_ph(self, ridx, min = True):
         # min, flip => maj
@@ -408,20 +403,14 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
 #                J_a = S.iloc[st:(en + 1), min_col].sum()
 #                J_b = S.iloc[st:(en + 1), maj_col].sum()
         SU_a = SU_b = SD_a = SD_b = 0
-        # if target segments are being moved to the garbage, it is equivalent to making them their own segment, and joining the upstream and downstream segments
-        if targ_clust == 0:
-            SU_a = J_a
-            SU_b = J_b
-            J_a = 0
-            J_b = 0
-
-        if targ_clust != - 1 and (targ_clust == upstream_clust or targ_clust == 0):
+
+        if targ_clust != - 1 and targ_clust == upstream_clust:
             J_a += U_a
             J_b += U_b
         else:
             SU_a += U_a
             SU_b += U_b
-        if targ_clust != - 1 and (targ_clust == downstream_clust or targ_clust == 0):
+        if targ_clust != - 1 and targ_clust == downstream_clust:
             J_a += D_a
             J_b += D_b
         else:
@@ -447,48 +436,32 @@ def compute_adj_liks(self, seg_idx, cur_clust):
         for o, (st, en) in enumerate(ordpairs):
             # maj/min counts of contiguous upstream segments belonging to the same cluster
             if st - 1 > 0:
-                # skip over adjacent segments that are in the garbage;
-                # we only care about adjacent segments actually assigned to clusters
                 j = 1
-                while st - j > 0 and self.clusts[st - j] == 0:
-                    j += 1
 
                 U_cl = self.clusts[st - j]
                 adj_clusters[o, 0] = U_cl
 
                 while st - j > 0 and self.clusts[st - j] != -1 and \
-                  (self.clusts[st - j] == U_cl or self.clusts[st - j] == 0):
-                    # again, skip over segments in the garbage
-                    if self.clusts[st - j] != 0:
-                        UD_counts[o, 0] += self._Siat_ph(st - j, min = True)
-                        UD_counts[o, 1] += self._Siat_ph(st - j, min = False)
+                  self.clusts[st - j] == U_cl:
+                    UD_counts[o, 0] += self._Siat_ph(st - j, min = True)
+                    UD_counts[o, 1] += self._Siat_ph(st - j, min = False)
 
                     j += 1
 
             # maj/min counts of contiguous downstream segments belonging to the same cluster
             if en + 1 < len(self.S):
                 j = 1
-                while en + j < len(self.S) - 1 and self.clusts[en + j] == 0:
-                    j += 1
 
                 D_cl = self.clusts[en + j]
                 adj_clusters[o, 1] = D_cl
 
                 while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \
-                  (self.clusts[en + j] == D_cl or self.clusts[en + j] == 0):
-                    if self.clusts[en + j] != 0:
-                        UD_counts[o, 2] += self._Siat_ph(en + j, min = True)
-                        UD_counts[o, 3] += self._Siat_ph(en + j, min = False)
+                  self.clusts[en + j] == D_cl:
+                    UD_counts[o, 2] += self._Siat_ph(en + j, min = True)
+                    UD_counts[o, 3] += self._Siat_ph(en + j, min = False)
 
                     j += 1
 
-        # if we are looking at the segments at the very start or very end, set
-        # upstream/downstream cluster indices to garbage
-        if ordpairs[0, 0] == 0:
-            adj_clusters[0, 0] = 0
-        if ordpairs[-1, 1] == len(self.S) - 1:
-            adj_clusters[-1, 1] = 0
-
         # if there are any segments being moved adjacent to already existing clusters, get local split/join likelihoods
         adj_idx = ~(adj_clusters == -1).all(1)
 
@@ -527,8 +500,8 @@ def compute_adj_liks(self, seg_idx, cur_clust):
 #                )
 
                 # adjacency likelihood of this segment joining each possible cluster:
-                # 1. those it is actually adjacent to (+ new cluster, garbage)
-                for cl in {-1, 0, cl_u, cl_d}:
+                # 1. those it is actually adjacent to (+ new cluster)
+                for cl in {-1, cl_u, cl_d}:
                     idx = self.clust_sums.index(cl)
                     adj_BC[idx] += self.SJliks(
                       targ_clust = cl, 
@@ -541,12 +514,9 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                       D_a = D_a,
                       D_b = D_b
                     )
-                    # we cannot send a segment to the garbage adjacent to any unassigned segment
-                    if cl == 0 and (cl_u == -1 or cl_d == -1):
-                        adj_BC[idx] = -np.inf
 
                 # 2. clusters it is not adjacent to (use default split value)
-                for cl in self.clust_sums.keys() - ({-1, 0} | set(adj_clusters[adj_idx].ravel())):
+                for cl in self.clust_sums.keys() - ({-1} | set(adj_clusters[adj_idx].ravel())):
                     idx = self.clust_sums.index(cl)
                     adj_BC[idx] += self.SJliks(
                       targ_clust = -1, 
@@ -559,9 +529,6 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                       D_a = D_a,
                       D_b = D_b
                     )
-        else:
-            # we cannot send a segment to the garbage adjacent to any unassigned segment
-            adj_BC[self.clust_sums.index(0)] = -np.inf
 
         return adj_AB, adj_BC
 
@@ -635,7 +602,7 @@ def run(self, n_iter = 50):
                 ]
 
                 # get count prior
-                ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1 and k != 0]]
+                ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1]]
 
                 # posterior numerator
                 num = P_l + np.log(ccp)
@@ -664,9 +631,9 @@ def run(self, n_iter = 50):
         # for the first round of clustering, this is { 1 : 1 }
         self.clust_sums = sc.SortedDict({
           **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() },
-          **{-1 : np.r_[0, 0], 0 : np.r_[0, 0]}
+          **{-1 : np.r_[0, 0]}
         })
-        # for the first round, this is { -1/0 : np.r_[0, 0], 1 : np.r_[S[0, "min"], S[0, "maj"]] }
+        # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] }
         self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 and k != 0 })
         # for the first round, this is { 1 : {0} }
         unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1])
@@ -694,7 +661,6 @@ def run(self, n_iter = 50):
             if not n_it % 1000:
                 print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
                 print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
-                print("n garbage: {}".format((self.S["clust"] == 0).sum()))
 
             # we are burned in (n_seg/n_clust) iterations after all segments have been touched
             if not n_it % 100:
@@ -728,20 +694,18 @@ def run(self, n_iter = 50):
 
                 # expand segment to include all adjacent segments in the same cluster,
                 # if it has already been assigned to a cluster
-                if cur_clust > 0 and np.random.rand() < 0.5:
+                if cur_clust >= 0 and np.random.rand() < 0.5:
                     si = seg_idx[0]
 
                     j = 1
                     while si - j > 0 and \
                       (self.clusts[si - j] == cur_clust or self.clusts[si - j] == 0):
-                        if self.clusts[si - j] != 0:
-                            seg_idx.add(si - j)
+                        seg_idx.add(si - j)
                         j += 1
                     j = 1
                     while si + j < len(self.S) and \
                       (self.clusts[si + j] == cur_clust or self.clusts[si + j] == 0):
-                        if self.clusts[si + j] != 0:
-                            seg_idx.add(si + j)
+                        seg_idx.add(si + j)
                         j += 1
 
                     # if we've expanded to include a large fraction (>10%) of segments 
@@ -756,7 +720,7 @@ def run(self, n_iter = 50):
                 n_move = len(seg_idx)
 
                 # if segment was already assigned to a cluster, unassign it
-                if cur_clust > 0:
+                if cur_clust >= 0:
                     self.clust_counts[cur_clust] -= n_move
                     if self.clust_counts[cur_clust] == 0:
                         del self.clust_counts[cur_clust]
@@ -812,7 +776,7 @@ def run(self, n_iter = 50):
             A_b = self.clust_sums[cur_clust][1] if cur_clust in self.clust_sums else 0
             B_a = self._Ssum_ph(seg_idx, min = True)
             B_b = self._Ssum_ph(seg_idx, min = False)
-            C_ab = np.r_[self.clust_sums.values()] # first terms: (-1) = make new cluster, (0) = garbage cluster
+            C_ab = np.r_[self.clust_sums.values()] # first terms: -1 = make new cluster
             #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust
 
             #
@@ -881,7 +845,7 @@ def run(self, n_iter = 50):
                 # [-1 (totally new cluster), <prior_diff>, <prior_com + prior_null>]
                 prior_idx = np.r_[
                   np.r_[[self.clust_prior.index(x) for x in prior_diff]],
-                  np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null | {0})]]
+                  np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null)]]
                 ]
 
                 # prior marginal likelihoods for both phase orientations
@@ -899,7 +863,7 @@ def run(self, n_iter = 50):
             # DP prior based on clusters sizes
             # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster)
             ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]]
-            count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_count_prior[0], self.clust_counts.values()]
+            count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_counts.values()]
             count_prior /= count_prior.sum()
 
             # choose to join a cluster or make a new one (choice_idx = 0)
@@ -911,8 +875,7 @@ def run(self, n_iter = 50):
               p = choice_p.ravel()
             )
             # -1 = brand new, -2, -3, ... = -(prior clust index) - 2
-            # 0 = garbage
-            choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx//2]
+            choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2]
 
             # save rephasing status
             if choice_idx & 1:
@@ -937,11 +900,6 @@ def run(self, n_iter = 50):
                 self.clust_sums[new_clust_idx] = np.r_[B_a, B_b]
                 self.clust_members[new_clust_idx] = set(seg_idx)
 
-            # send to garbage
-            elif choice == 0:
-                self.S.iloc[seg_idx, self.clust_col] = 0
-                self.clusts[seg_idx] = 0
-
             # join existing cluster
             else:
                 # if we are combining two clusters, take the index of the bigger one
@@ -1024,10 +982,6 @@ def visualize_segs(self):
             S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
 
             for i, r in enumerate(S_ph.itertuples()):
-                ## don't show garbage clusters
-                #if s2cu[s2c[i]] == 0:
-                #    continue
-
                 ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], r.min + 1, r.maj + 1)
                 ax.add_patch(mpl.patches.Rectangle((r.start_gp, ci_lo), r.end_gp - r.start_gp, ci_hi - ci_lo, facecolor = colors[s2c[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000))
 

From 575ae560edd06d55c4ae5f411430270cee582ace Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Dec 2021 14:30:39 -0500
Subject: [PATCH 011/222] Fix clust_sums update bug

---
 hapaseg/allelic_DP.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 4632b22..bdc904d 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -897,7 +897,7 @@ def run(self, n_iter = 50):
                 self.S.iloc[seg_idx, self.clust_col] = new_clust_idx
                 self.clusts[seg_idx] = new_clust_idx
 
-                self.clust_sums[new_clust_idx] = np.r_[B_a, B_b]
+                self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a]
                 self.clust_members[new_clust_idx] = set(seg_idx)
 
             # join existing cluster
@@ -915,7 +915,7 @@ def run(self, n_iter = 50):
                     choice = cl_idx
 
                 self.clust_counts[choice] += n_move 
-                self.clust_sums[choice] += np.r_[B_a, B_b]
+                self.clust_sums[choice] += np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a]
                 self.S.iloc[seg_idx, self.clust_col] = choice
                 self.clusts[seg_idx] = choice
 

From 6d1c991062922cbdc9674374c0459576f84aa37c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Dec 2021 14:36:06 -0500
Subject: [PATCH 012/222] Bump DP alpha

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index bdc904d..4382e67 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -322,7 +322,7 @@ def scerrorbar(idx, rev = False, alpha = 1, show_CI = True):
         scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color)
 
 class DPinstance:
-    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 0.1):
+    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1):
         self.S = S
         self.clust_prior = clust_prior.copy()
         self.clust_count_prior = clust_count_prior.copy()

From 4e9a00a2006f85bb4a362a002fc8237fbbce4d70 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Dec 2021 15:16:17 -0500
Subject: [PATCH 013/222] Fix another bug related to getting rid of garbage
 cluster

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 4382e67..99fef40 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -627,7 +627,7 @@ def run(self, n_iter = 50):
 
         #
         # initialize cluster tracking hash tables
-        self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore"))
+        self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore"))
         # for the first round of clustering, this is { 1 : 1 }
         self.clust_sums = sc.SortedDict({
           **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() },

From 627b05920067f4d03eb127864adbd7637fed35a8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Dec 2021 15:16:42 -0500
Subject: [PATCH 014/222] Avoid divide by zero warning

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 99fef40..c985996 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -801,7 +801,7 @@ def run(self, n_iter = 50):
             # B+C is likelihood of target cluster post-join, with both phase orientations
             BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1)
 
-            MLs = BC - C[:, None] + np.log(np.r_[1 - rephase_prob, rephase_prob])
+            MLs = BC - C[:, None] + np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
             # TODO: get adj_BC working again
 
             #     L(join)           L(split)

From 13498be4a271e55e67f416308e4dc22dc3d89dce Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Dec 2021 16:41:03 -0500
Subject: [PATCH 015/222] Fix bug computing overall likelihood

---
 hapaseg/allelic_DP.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index c985996..18f9eae 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -534,12 +534,15 @@ def compute_adj_liks(self, seg_idx, cur_clust):
 
     def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None):
         if segs_to_clusters is None:
-            _, segs_to_clusters = self.get_unique_clust_idxs()
+            su, segs_to_clusters = self.get_unique_clust_idxs()
         else:
-            _, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters)
+            su, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters)
         if phase_orientations is None:
             phase_orientations = np.r_[self.phase_orientations]
 
+        # account for unassigned clusters
+        min_clust_idx = 1 if (su == -1).any() else 0
+
         max_clust_idx = segs_to_clusters.max() + 1
 
         liks = np.full(segs_to_clusters.shape[0], np.nan)
@@ -556,7 +559,10 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None
             count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)
             count_prior /= count_prior.sum()
 
-            clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1) + np.log(count_prior)[count_prior > 0]).sum()
+            clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:].sum()
+            # account for unassigned clusters, if present
+            if min_clust_idx == 1:
+                clust_lik += ss.betaln(self.S.loc[cl_samp == 0, "maj"] + 1, self.S.loc[cl_samp == 0, "min"] + 1).sum()
 
 #            ## segmentation likelihood
 #

From 2cd6eb467746acfd5ac002be1ef7134e423fbcfc Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Dec 2021 18:52:16 -0500
Subject: [PATCH 016/222] adj_BC working again, accounting for phasing?

---
 hapaseg/allelic_DP.py | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 18f9eae..0e214fa 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -421,9 +421,10 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
 
     def compute_adj_liks(self, seg_idx, cur_clust):
         adj_AB = 0
-        adj_BC = np.zeros(len(self.clust_sums))
+        adj_BC = np.zeros([len(self.clust_sums), 2])
 
         # start/end coordinates of consecutive runs of segments being moved
+        # NOTE: ordpairs represents closed intervals!
         ordpairs = np.c_[
           [np.r_[list(x)][[0, -1]] for x in more_itertools.consecutive_groups(
             np.sort(seg_idx))
@@ -483,8 +484,8 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                 # min/maj counts of the segment(s) being moved
                 st = ordpairs[j, 0]
                 en = ordpairs[j, 1]
-                S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # XXX: why en + 1?
-                S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # XXX: why en + 1?
+                S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is closed
+                S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) 
 
                 # adjacency likelihood of this segment remaining where it is
 #                adj_AB += self.SJliks(
@@ -503,7 +504,7 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                 # 1. those it is actually adjacent to (+ new cluster)
                 for cl in {-1, cl_u, cl_d}:
                     idx = self.clust_sums.index(cl)
-                    adj_BC[idx] += self.SJliks(
+                    adj_BC[idx, 0] += self.SJliks(
                       targ_clust = cl, 
                       upstream_clust = cl_u, 
                       downstream_clust = cl_d, 
@@ -514,11 +515,22 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                       D_a = D_a,
                       D_b = D_b
                     )
+                    adj_BC[idx, 1] += self.SJliks(
+                      targ_clust = cl, 
+                      upstream_clust = cl_u, 
+                      downstream_clust = cl_d, 
+                      J_a = S_b, 
+                      J_b = S_a,
+                      U_a = U_a,
+                      U_b = U_b,
+                      D_a = D_a,
+                      D_b = D_b
+                    )
 
                 # 2. clusters it is not adjacent to (use default split value)
                 for cl in self.clust_sums.keys() - ({-1} | set(adj_clusters[adj_idx].ravel())):
                     idx = self.clust_sums.index(cl)
-                    adj_BC[idx] += self.SJliks(
+                    adj_BC[idx, 0] += self.SJliks(
                       targ_clust = -1, 
                       upstream_clust = -1, 
                       downstream_clust = -1, 
@@ -529,6 +541,17 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                       D_a = D_a,
                       D_b = D_b
                     )
+                    adj_BC[idx, 1] += self.SJliks(
+                      targ_clust = -1, 
+                      upstream_clust = -1, 
+                      downstream_clust = -1, 
+                      J_a = S_b, 
+                      J_b = S_a,
+                      U_a = U_a,
+                      U_b = U_b,
+                      D_a = D_a,
+                      D_b = D_b
+                    )
 
         return adj_AB, adj_BC
 
@@ -789,12 +812,10 @@ def run(self, n_iter = 50):
             # adjacent segment likelihoods
 
             adj_AB = 0
-            adj_BC = np.zeros(len(self.clust_sums))
+            adj_BC = np.zeros([len(self.clust_sums), 2])
 
-            if not move_clust or (all_assigned and move_clust and np.random.rand() < 0.01):
+            if not move_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
                 adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust)
-            else:
-                adj_BC[self.clust_sums.index(0)] = -np.inf
 
             # A+B,C -> A,B+C
 
@@ -808,7 +829,6 @@ def run(self, n_iter = 50):
             BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1)
 
             MLs = BC - C[:, None] + np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
-            # TODO: get adj_BC working again
 
             #     L(join)           L(split)
             #MLs = A + BC + adj_BC - (AB + C + adj_AB)
@@ -873,7 +893,7 @@ def run(self, n_iter = 50):
             count_prior /= count_prior.sum()
 
             # choose to join a cluster or make a new one (choice_idx = 0)
-            num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p)
+            num = MLs + adj_BC + np.log(count_prior[:, None]) + np.log(clust_prior_p)
             choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
             # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(

From f6b5dc54411abe5324c03becb09acb5d84531c19 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 8 Dec 2021 14:37:19 -0500
Subject: [PATCH 017/222] Incrementally update relative posterior estimate

---
 hapaseg/allelic_DP.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 0e214fa..7a290c5 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -680,9 +680,9 @@ def run(self, n_iter = 50):
         all_assigned = False
         seg_touch_idx = np.zeros(len(self.S), dtype = np.uint16)
 
-#        # containers for saving debugging information (overall likelihoods/cluster assignments pre-burnin)
-#        self.lik_tmp = []
-#        self.vc_tmp = []
+        # likelihood trace
+        self.lik_tmp = []
+        self.post = 0
 
         n_it = 0
         n_it_last = 0
@@ -903,6 +903,11 @@ def run(self, n_iter = 50):
             # -1 = brand new, -2, -3, ... = -(prior clust index) - 2
             choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2]
 
+            # compute posterior delta between previous and current state
+            post_delta = num.ravel()[choice_idx] - \
+              num[self.clust_sums.index(cur_clust if cur_clust in self.clust_sums else -1), 0]
+            self.post += post_delta
+
             # save rephasing status
             if choice_idx & 1:
                 self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]

From b8155ad750f526dfb2c762a4698bf7f84b43b2cc Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 9 Dec 2021 13:01:43 -0500
Subject: [PATCH 018/222] Remove another vestige of garbage clusters

---
 hapaseg/allelic_DP.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 7a290c5..98e2506 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -663,7 +663,7 @@ def run(self, n_iter = 50):
           **{-1 : np.r_[0, 0]}
         })
         # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] }
-        self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 and k != 0 })
+        self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 })
         # for the first round, this is { 1 : {0} }
         unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1])
 
@@ -727,13 +727,11 @@ def run(self, n_iter = 50):
                     si = seg_idx[0]
 
                     j = 1
-                    while si - j > 0 and \
-                      (self.clusts[si - j] == cur_clust or self.clusts[si - j] == 0):
+                    while si - j > 0 and self.clusts[si - j] == cur_clust:
                         seg_idx.add(si - j)
                         j += 1
                     j = 1
-                    while si + j < len(self.S) and \
-                      (self.clusts[si + j] == cur_clust or self.clusts[si + j] == 0):
+                    while si + j < len(self.S) and self.clusts[si + j] == cur_clust:
                         seg_idx.add(si + j)
                         j += 1
 

From d4418436f5a64b3b8f23391c67fa1d59e09983f2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 9 Dec 2021 15:44:27 -0500
Subject: [PATCH 019/222] Initialize clust_counts to respect new phasing
 indexing scheme

---
 hapaseg/allelic_DP.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 98e2506..dcbc543 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -658,13 +658,19 @@ def run(self, n_iter = 50):
         # initialize cluster tracking hash tables
         self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore"))
         # for the first round of clustering, this is { 1 : 1 }
+
+        x = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum()
+        if (x.droplevel(0).index == True).any():
+            x.loc[(slice(None), True), ["min", "maj"]] = x.loc[(slice(None), True), ["maj", "min"]].values
         self.clust_sums = sc.SortedDict({
-          **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() },
+          **{ k : np.r_[v["min"], v["maj"]] for k, v in x.groupby(level = "clust").sum().to_dict(orient = "index").items() },
           **{-1 : np.r_[0, 0]}
         })
         # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] }
+
         self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 })
         # for the first round, this is { 1 : {0} }
+
         unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1])
 
         # store this as numpy for speed

From 02ee2035f3fbd2ee40cb4080f774a61d770ba42e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 9 Dec 2021 15:48:08 -0500
Subject: [PATCH 020/222] Initial commit of cluster splitting

---
 hapaseg/allelic_DP.py | 108 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 107 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index dcbc543..af50e07 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -370,6 +370,16 @@ def _Ssum_ph(self, seg_idx, min = True):
         else:
             return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum()
 
+    def _Scumsum_ph(self, seg_idx, min = True):
+        flip = self.S.iloc[seg_idx, self.flip_col]
+        flip_n = ~flip
+        if min:
+            si = np.argsort(np.r_[seg_idx[flip_n], seg_idx[flip]])
+            return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]][si].cumsum()
+        else:
+            si = np.argsort(np.r_[seg_idx[flip], seg_idx[flip_n]])
+            return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]][si].cumsum()
+
     def compute_rephase_prob(self, seg_idx):
         flip = self.S.iloc[seg_idx, self.flip_col]
         flip_n = ~flip
@@ -555,6 +565,63 @@ def compute_adj_liks(self, seg_idx, cur_clust):
 
         return adj_AB, adj_BC
 
+    def compute_cluster_splitpoints(self, seg_idx):
+        spl = []
+
+        # left bias
+        end = len(seg_idx)
+        i = 0
+        while True:
+            seg_idx_sp = seg_idx[0:end]
+            if len(seg_idx_sp) < 2:
+                break
+
+            min_cs = self._Scumsum_ph(seg_idx_sp, min = True)
+            min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs
+            maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
+            maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
+
+            split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1)
+            # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
+            # NOTE: instead of argmax, probabilistically choose? will this make a difference?
+
+            end = split_lik.argmax()
+            spl.append(end)
+
+            if end <= 1 or end == len(split_lik) - 1:
+                break
+
+            i += 1
+
+        # right bias
+        start = 0
+        i = 0
+        while True:
+            seg_idx_sp = seg_idx[start:]
+            if len(seg_idx_sp) < 2:
+                break
+
+            min_cs = self._Scumsum_ph(seg_idx_sp, min = True)
+            min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs
+            maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
+            maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
+
+            split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1)
+            # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
+
+            start += split_lik.argmax() + 1
+            spl.append(start - 1)
+
+            if start > len(seg_idx) - 1 or split_lik.argmax() == 0:
+                break
+
+            i += 1
+
+        bdy = np.unique(np.r_[0, spl, len(seg_idx)])
+        bdy = np.c_[bdy[:-1], bdy[1:]]
+
+        return bdy
+
     def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None):
         if segs_to_clusters is None:
             su, segs_to_clusters = self.get_unique_clust_idxs()
@@ -744,10 +811,48 @@ def run(self, n_iter = 50):
                     # if we've expanded to include a large fraction (>10%) of segments 
                     # in this cluster, cluster indexing might become inconsistent.
                     # skip this iteration
-                    if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]:
+#                    if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]:
+#                        breakpoint()
+#                        n_it += 1
+#                        continue
+
+                # propose splitting out a contiguous interval of segments within the current cluster
+                split_clust = False
+                if all_assigned and np.random.rand() < 0.1:
+                    # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable?
+                    clust_segs = np.sort(np.r_[list(self.clust_members[cur_clust])])
+                    split_bdy = self.compute_cluster_splitpoints(clust_segs)
+
+                    A_tot, B_tot = self.clust_sums[cur_clust]
+
+                    lik0 = ss.betaln(A_tot + 1, B_tot + 1)
+
+                    liks = np.zeros(len(split_bdy) + 1)
+                    liks[-1] = lik0 # don't split at all
+
+                    # likelihood ratios for splitting each region into a new cluster
+                    for i, (st, en) in enumerate(split_bdy):
+                        A = self._Ssum_ph(clust_segs[st:en], min = True)
+                        B = self._Ssum_ph(clust_segs[st:en], min = False)
+
+                        liks[i] = ss.betaln(A_tot - A + 1, B_tot - B + 1) + ss.betaln(A + 1, B + 1)
+
+                    # pick a region to split
+                    split_idx = np.random.choice(
+                      len(split_bdy) + 1,
+                      p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum()
+                    )
+
+                    # don't split at all
+                    if split_idx == len(split_bdy):
                         n_it += 1
                         continue
 
+                    # seg_idx == segments to propose to split off
+                    seg_idx = clust_segs[slice(*split_bdy[split_idx])]
+
+                    split_clust = True
+
                 seg_idx = np.r_[list(seg_idx)]
 
                 n_move = len(seg_idx)
@@ -819,6 +924,7 @@ def run(self, n_iter = 50):
             adj_BC = np.zeros([len(self.clust_sums), 2])
 
             if not move_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
+            if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
                 adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust)
 
             # A+B,C -> A,B+C

From e12d4a9b3e55827388e118d242573efe5c010d43 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 9 Dec 2021 20:46:11 -0500
Subject: [PATCH 021/222] Initial commit of new burnin criterion

---
 hapaseg/allelic_DP.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index af50e07..ee80704 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -751,7 +751,8 @@ def run(self, n_iter = 50):
 
         burned_in = False
         all_assigned = False
-        seg_touch_idx = np.zeros(len(self.S), dtype = np.uint16)
+        all_touched = False
+        seg_touch_idx = np.zeros(len(self.S), dtype = bool)
 
         # likelihood trace
         self.lik_tmp = []
@@ -764,20 +765,23 @@ def run(self, n_iter = 50):
                 print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
                 print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
 
-            # we are burned in (n_seg/n_clust) iterations after all segments have been touched
+            # poll every 100 iterations for burnin status
             if not n_it % 100:
-                if not all_assigned and (((seg_touch_idx > 0) | (self.clusts == 0)).all() or \
-                  # if there is only one cluster, then consider every segment to have been touched
-                  # otherwise, waiting for every segment to actually be touched will take forever
-                  len(unassigned_segs) == 0 and len(self.clust_counts) == 1):
+                self.lik_tmp.append(self.post)
+                if not all_assigned and len(unassigned_segs) == 0:
                     all_assigned = True
-                    n_it_last = n_it
-                if not burned_in and all_assigned and \
-                  n_it - n_it_last > len(self.S)/len(self.clust_counts):
-                    burned_in = True
-            
-#                self.lik_tmp.append(self.compute_overall_lik())
-#                self.vc_tmp.append(self.S["clust"].value_counts())
+                if not burned_in and all_assigned:
+                    # 1. have >90% of segments been adjacency corrected?
+                    # print(seg_touch_idx.mean())
+                    if seg_touch_idx.mean() > 0.9:
+                        all_touched = True
+
+                    # 2. if >90% of segments have been adjacency corrected, check for burnin
+                    # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum
+                    # TODO: make this check more efficient?
+                    if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2:
+                        burned_in = True
+                        breakpoint()
 
             #
             # pick either a segment or a cluster at random (50:50 prob.)
@@ -923,9 +927,10 @@ def run(self, n_iter = 50):
             adj_AB = 0
             adj_BC = np.zeros([len(self.clust_sums), 2])
 
-            if not move_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
             if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
                 adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust)
+                if all_assigned:
+                    seg_touch_idx[seg_idx] = True
 
             # A+B,C -> A,B+C
 

From 275d9902f384f7f275dff705f2d7be1e877fcb23 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Dec 2021 18:53:22 -0500
Subject: [PATCH 022/222] Properly cycle through cluster colors

---
 hapaseg/allelic_DP.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index ee80704..9bab53c 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1108,7 +1108,11 @@ def get_colors(self):
         si = np.argsort(tot_terr)[::-1]
         terr_cs = np.cumsum(tot_terr[si])/tot_terr.sum()
 
-        return [mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())]
+        colors_to_use = np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())])
+        colors = np.zeros([len(s2cu), 4])
+        n_distinct = colors_to_use.shape[0] 
+        colors[si[:n_distinct], :] = colors_to_use
+        colors[si[n_distinct:], :] = colors_to_use[:(len(si) - n_distinct), :]
 
     def visualize_segs(self):
         plt.figure()

From a851663f9bd2d74295e8a3798b595fbb9fca98b8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 17 Dec 2021 10:28:49 -0500
Subject: [PATCH 023/222] Make adjacent segment penalty a prior, not a
 likelihood

---
 hapaseg/allelic_DP.py | 92 ++++++++++++++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 14 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 9bab53c..c7ee6ea 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -429,6 +429,73 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
 
         return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1)
 
+    def compute_adj_prob(self, seg_idx):
+        ## compute boundaries of adjacent segments
+
+        # maj/min counts of contiguous upstream segments belonging to the same cluster
+        st = seg_idx[0]
+        U_A = 0
+        U_B = 0
+        U_cl = -1
+        if st - 1 > 0:
+            U_cl = self.clusts[st - 1]
+            j = 1
+            while st - j > 0 and self.clusts[st - j] != -1 and \
+              self.clusts[st - j] == U_cl:
+                U_A += self._Siat_ph(st - j, min = True)
+                U_B += self._Siat_ph(st - j, min = False)
+
+                j += 1
+
+        # maj/min counts of contiguous downstream segments belonging to the same cluster
+        en = seg_idx[-1]
+        D_A = 0
+        D_B = 0
+        D_cl = -1
+        if en + 1 < len(self.S):
+            D_cl = self.clusts[en + 1]
+            j = 1
+            while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \
+              self.clusts[en + j] == D_cl:
+                D_A += self._Siat_ph(en + j, min = True)
+                D_B += self._Siat_ph(en + j, min = False)
+
+                j += 1 
+
+        # maj/min counts of segment(s) being moved
+        S_A = self._Ssum_ph(seg_idx, min = True)
+        S_B = self._Ssum_ph(seg_idx, min = False)
+
+        ## compute all four possible segmentations relative to neighbor, in
+        ## both phasing orientations
+        MLs = np.c_[
+          # UTD             T  U  D
+          # -^_ or -_- (U != T & T != D) (00)
+          np.r_[self.SJliks(1, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(1, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+          # -__ (U != T & T == D) (01)
+          np.r_[self.SJliks(0, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(0, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+          # --_ (U == T & T != D) (10)
+          np.r_[self.SJliks(1, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(1, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+          # --- (U == T & T == D) (11)
+          np.r_[self.SJliks(0, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(0, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+        ]
+
+        ## match probs to cluster choices (will match MLs matrix in main calculation)
+        probs = np.zeros([len(self.clust_sums), 2])
+        probs_idx = np.zeros([len(self.clust_sums), 2]).astype(np.uint8)
+        for k in self.clust_sums.keys():
+            MLs_idx = np.r_[k == U_cl, k == D_cl]@np.r_[2, 1]
+            probs[self.clust_sums.index(k), :] = MLs[:, MLs_idx]
+            probs_idx[self.clust_sums.index(k), :] = np.r_[0, 4] + MLs_idx
+
+        ## convert to conditional likelihoods, by scaling each likelihood by number of 
+        ## cluster candidates with that segmentation configuration
+        return probs - np.log(np.bincount(probs_idx.ravel())[probs_idx])
+
     def compute_adj_liks(self, seg_idx, cur_clust):
         adj_AB = 0
         adj_BC = np.zeros([len(self.clust_sums), 2])
@@ -921,17 +988,6 @@ def run(self, n_iter = 50):
             C_ab = np.r_[self.clust_sums.values()] # first terms: -1 = make new cluster
             #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust
 
-            #
-            # adjacent segment likelihoods
-
-            adj_AB = 0
-            adj_BC = np.zeros([len(self.clust_sums), 2])
-
-            if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
-                adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust)
-                if all_assigned:
-                    seg_touch_idx[seg_idx] = True
-
             # A+B,C -> A,B+C
 
             # A+B is likelihood of current cluster B is part of
@@ -961,7 +1017,7 @@ def run(self, n_iter = 50):
             #
             # priors
 
-            # prior on previous cluster fractions
+            ## prior on previous cluster fractions
 
             prior_diff = []
             prior_com = []
@@ -1001,14 +1057,22 @@ def run(self, n_iter = 50):
                 # expand MLs to account for multiple new clusters
                 MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]]
                 
-            # DP prior based on clusters sizes
+            ## DP prior based on clusters sizes
             # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster)
             ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]]
             count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_counts.values()]
             count_prior /= count_prior.sum()
 
+            # adjacent segment prior
+
+            log_adj_prior = 0
+            if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
+                log_adj_prior = self.compute_adj_prob(seg_idx)
+                if all_assigned:
+                    seg_touch_idx[seg_idx] = True
+
             # choose to join a cluster or make a new one (choice_idx = 0)
-            num = MLs + adj_BC + np.log(count_prior[:, None]) + np.log(clust_prior_p)
+            num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + log_adj_prior
             choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
             # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(

From 9b78b348cc4b4cb0b9889a9b9ff7164163d6cd07 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 22 Dec 2021 14:16:58 -0500
Subject: [PATCH 024/222] Add temperature parameter

---
 hapaseg/allelic_DP.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index c7ee6ea..b44e71d 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -322,11 +322,12 @@ def scerrorbar(idx, rev = False, alpha = 1, show_CI = True):
         scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color)
 
 class DPinstance:
-    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1):
+    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1):
         self.S = S
         self.clust_prior = clust_prior.copy()
         self.clust_count_prior = clust_count_prior.copy()
         self.alpha = alpha
+        self.temperature = temperature
 
         self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed
         self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
@@ -1073,6 +1074,7 @@ def run(self, n_iter = 50):
 
             # choose to join a cluster or make a new one (choice_idx = 0)
             num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + log_adj_prior
+            num /= self.temperature
             choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
             # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(

From 3ddffcd65dfea1ba4525848af3e7af5c771c10bc Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 22 Dec 2021 14:17:54 -0500
Subject: [PATCH 025/222] Add simple overall likelihood calculation

---
 hapaseg/allelic_DP.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index b44e71d..63a7aa9 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -344,6 +344,12 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.bref_col = self.S.columns.get_loc("B_ref")
         self.flip_col = self.S.columns.get_loc("flipped")
 
+        #
+        # compute rephase probabilities for each segment
+        self.S["rephase_prob"] = np.nan
+        for i in range(0, len(self.S)):
+            self.S.at[i, "rephase_prob"] = self.compute_rephase_prob(np.r_[i])
+
         #
         # initialize priors
 
@@ -690,7 +696,33 @@ def compute_cluster_splitpoints(self, seg_idx):
 
         return bdy
 
-    def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None):
+    def compute_overall_lik_simple(self):
+        ## overall clustering likelihood
+        clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum()
+
+        ## overall phasing likelihood
+        phase_lik = 1 - self.S["rephase_prob"].copy()
+        phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]]
+        phase_lik = np.log(phase_lik).sum()
+
+        ## count prior
+        count_prior = np.r_[self.clust_counts.values()].astype(float)
+        count_prior /= count_prior.sum()
+
+        ## segmentation likelihood
+        bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1])
+        bdy = np.c_[bdy[:-1], bdy[1:]]
+
+        seg_lik = 0.0
+        for st, en in bdy:
+            seg_lik += ss.betaln(
+              self._Ssum_ph(np.r_[st:en], min = True) + 1,
+              self._Ssum_ph(np.r_[st:en], min = False) + 1
+            )
+
+        return clust_lik + phase_lik + np.log(count_prior).sum() + seg_lik
+
+    def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False):
         if segs_to_clusters is None:
             su, segs_to_clusters = self.get_unique_clust_idxs()
         else:

From 3468a33b00fb4d6ebec3ced87836598ea3bfd2dc Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 5 Jan 2022 15:39:18 -0500
Subject: [PATCH 026/222] Use Dir-Cat marg. lik. in overall posterior

---
 hapaseg/allelic_DP.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 63a7aa9..8e71208 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -705,9 +705,10 @@ def compute_overall_lik_simple(self):
         phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]]
         phase_lik = np.log(phase_lik).sum()
 
-        ## count prior
-        count_prior = np.r_[self.clust_counts.values()].astype(float)
-        count_prior /= count_prior.sum()
+        ## Dirichlet count prior (Dirichlet-categorical marginal likelihood)
+        dirvec = np.r_[self.clust_counts.values()].astype(float)
+        k = len(dirvec)
+        count_prior = ss.gammaln(dirvec + self.alpha/k).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - k*ss.gammaln(self.alpha/k)
 
         ## segmentation likelihood
         bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1])
@@ -720,7 +721,7 @@ def compute_overall_lik_simple(self):
               self._Ssum_ph(np.r_[st:en], min = False) + 1
             )
 
-        return clust_lik + phase_lik + np.log(count_prior).sum() + seg_lik
+        return clust_lik + phase_lik + count_prior + seg_lik
 
     def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False):
         if segs_to_clusters is None:

From c9bb0ad21fa65b59abfbf70054b1d3755f1badc4 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 11 Jan 2022 17:07:23 -0500
Subject: [PATCH 027/222] Don't rescale adjacency likelihoods

---
 hapaseg/allelic_DP.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 8e71208..b05f4b1 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -492,16 +492,12 @@ def compute_adj_prob(self, seg_idx):
         ]
 
         ## match probs to cluster choices (will match MLs matrix in main calculation)
-        probs = np.zeros([len(self.clust_sums), 2])
-        probs_idx = np.zeros([len(self.clust_sums), 2]).astype(np.uint8)
+        probs = np.full([len(self.clust_sums), 2], -np.inf)
         for k in self.clust_sums.keys():
             MLs_idx = np.r_[k == U_cl, k == D_cl]@np.r_[2, 1]
             probs[self.clust_sums.index(k), :] = MLs[:, MLs_idx]
-            probs_idx[self.clust_sums.index(k), :] = np.r_[0, 4] + MLs_idx
 
-        ## convert to conditional likelihoods, by scaling each likelihood by number of 
-        ## cluster candidates with that segmentation configuration
-        return probs - np.log(np.bincount(probs_idx.ravel())[probs_idx])
+        return probs
 
     def compute_adj_liks(self, seg_idx, cur_clust):
         adj_AB = 0

From acc88e04611050a2139248786a6e1491407a1005 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 11 Jan 2022 17:09:19 -0500
Subject: [PATCH 028/222] Use multi DP prior; use correct joint probability for
 p(clust,phase|X)

---
 hapaseg/allelic_DP.py | 53 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 40 insertions(+), 13 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index b05f4b1..31f2116 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1029,8 +1029,9 @@ def run(self, n_iter = 50):
             # B+C is likelihood of target cluster post-join, with both phase orientations
             BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1)
 
-            MLs = BC - C[:, None] + np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
+            MLs = BC - C[:, None]
 
+            # {{{
             #     L(join)           L(split)
             #MLs = A + BC + adj_BC - (AB + C + adj_AB)
             # TODO: remove extraneous calculations (e.g. adj_AB, AB, A);
@@ -1044,10 +1045,12 @@ def run(self, n_iter = 50):
             if n_move > 1 and not move_clust:
                 MLs[self.clust_sums.index(-1)] = -np.inf
 
+            # }}}
+
             #
             # priors
 
-            ## prior on previous cluster fractions
+            ## prior on previous cluster fractions {{{
 
             prior_diff = []
             prior_com = []
@@ -1086,25 +1089,49 @@ def run(self, n_iter = 50):
 
                 # expand MLs to account for multiple new clusters
                 MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]]
+
+            # }}}
                 
             ## DP prior based on clusters sizes
-            # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster)
-            ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]]
-            count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_counts.values()]
-            count_prior /= count_prior.sum()
+            n_c = np.c_[self.clust_counts.values()]
+            N = n_c.sum() + n_move
+            log_count_prior = np.full([len(self.clust_sums), 1], np.nan)
+            log_count_prior[1:] = ss.gammaln(n_move + n_c) + ss.gammaln(N + self.alpha - n_move) \
+              - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha))
+            # probability of opening a new cluster
+            # TODO: accommodate prior clusters here
+            log_count_prior[0] = ss.gammaln(n_move) + np.log(self.alpha) + ss.gammaln(N + self.alpha - n_move) - ss.gammaln(N + self.alpha)
 
-            # adjacent segment prior
+            #
+            # adjacent segment likelihood
+
+            #adj_AB = 0
+            #adj_BC = np.zeros([len(self.clust_sums), 2])
 
-            log_adj_prior = 0
+            log_adj_lik = 0
             if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
-                log_adj_prior = self.compute_adj_prob(seg_idx)
+                log_adj_lik = self.compute_adj_prob(seg_idx)
                 if all_assigned:
                     seg_touch_idx[seg_idx] = True
 
-            # choose to join a cluster or make a new one (choice_idx = 0)
-            num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + log_adj_prior
-            num /= self.temperature
-            choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
+            # p(X|clust,phase)p(X|seg,phase)p(clust)
+            num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})
+                  + log_adj_lik      # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B})
+                  + log_count_prior) # p(clust) (DP prior on clust counts)
+
+            num /= self.temperature # scale by temperature for replica-exchange
+
+            num -= num.max(0) # avoid underflow in sum-exp
+
+            # p(clust|X,phase)
+            log_clust_post = num - np.log(np.exp(num).sum(0))
+
+            # p(phase|X)
+            log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
+
+            # p(clust,phase|X) = p(clust|X,phase)p(phase|X)
+            choice_p = np.exp(log_clust_post + log_phase_prob)
+
             # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(
               np.r_[0:np.prod(choice_p.shape)],

From f4ce65eca22340e9763f100a2210e65b952c2da0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 8 Feb 2022 09:33:59 -0500
Subject: [PATCH 029/222] Add note

---
 hapaseg/allelic_DP.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 31f2116..63e3990 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -500,6 +500,13 @@ def compute_adj_prob(self, seg_idx):
         return probs
 
     def compute_adj_liks(self, seg_idx, cur_clust):
+        # idea to simplify this code:
+        # - strip out logic for working with noncontiguous seg_idx's
+        # - compute all four possibile segmentations:
+        #   ABC, AAB, ABB, AAA
+        # - associate those segmentations with each cluster choice, in order
+        #   to return `adj_BC` with same size as `MLs`
+
         adj_AB = 0
         adj_BC = np.zeros([len(self.clust_sums), 2])
 

From 5f08b8925f0bb1593ca2d57828711bc553a90753 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 8 Feb 2022 09:35:23 -0500
Subject: [PATCH 030/222] Temporarily return individual components of overall
 joint likelihood

---
 hapaseg/allelic_DP.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 63e3990..4f5780a 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -701,14 +701,17 @@ def compute_cluster_splitpoints(self, seg_idx):
 
     def compute_overall_lik_simple(self):
         ## overall clustering likelihood
+        # p({a_i, b_i} | {c_k}, {phase_i})
         clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum()
 
         ## overall phasing likelihood
+        # p({phase_i} | {a_i, b_i})
         phase_lik = 1 - self.S["rephase_prob"].copy()
         phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]]
         phase_lik = np.log(phase_lik).sum()
 
         ## Dirichlet count prior (Dirichlet-categorical marginal likelihood)
+        # p({c_k})
         dirvec = np.r_[self.clust_counts.values()].astype(float)
         k = len(dirvec)
         count_prior = ss.gammaln(dirvec + self.alpha/k).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - k*ss.gammaln(self.alpha/k)
@@ -724,7 +727,9 @@ def compute_overall_lik_simple(self):
               self._Ssum_ph(np.r_[st:en], min = False) + 1
             )
 
-        return clust_lik + phase_lik + count_prior + seg_lik
+        # p({c_k}, {s}, {phase_i} | {a_i, b_i})
+        #return clust_lik + phase_lik + count_prior + seg_lik
+        return np.r_[clust_lik, phase_lik, count_prior, seg_lik]
 
     def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False):
         if segs_to_clusters is None:

From 6a10f4745b0fcf266139b977081cb6630a2e07a3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 8 Feb 2022 09:37:57 -0500
Subject: [PATCH 031/222] More flexible stopping criteria

---
 hapaseg/allelic_DP.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 4f5780a..3e18f33 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -786,7 +786,7 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None
 
         return liks
 
-    def run(self, n_iter = 50):
+    def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
         #
         # assign segments to likeliest prior component {{{
 
@@ -869,11 +869,19 @@ def run(self, n_iter = 50):
 
         n_it = 0
         n_it_last = 0
-        while len(self.segs_to_clusters) < n_iter:
+        while True:
             if not n_it % 1000:
                 print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
                 print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
 
+            # stop after a raw number of iterations
+            if n_iter > 0 and n_it > n_iter:
+                return
+
+#            # stop after a number of samples have been taken
+#            if n_samps > 0 and len() > n_samps:
+#                break
+
             # poll every 100 iterations for burnin status
             if not n_it % 100:
                 self.lik_tmp.append(self.post)

From d464c7cdd92bc1a436b744308018bad2cc0b6480 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 8 Feb 2022 09:47:18 -0500
Subject: [PATCH 032/222] Don't track unassigned segs; unnecessary for warm
 start

---
 hapaseg/allelic_DP.py | 33 ++++++++-------------------------
 1 file changed, 8 insertions(+), 25 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 3e18f33..52e1f99 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -786,7 +786,7 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None
 
         return liks
 
-    def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
+    def run(self, n_iter = 0, n_samps = 0):
         #
         # assign segments to likeliest prior component {{{
 
@@ -847,8 +847,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
         self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 })
         # for the first round, this is { 1 : {0} }
 
-        unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1])
-
         # store this as numpy for speed
         self.clusts = self.S["clust"].values
 
@@ -859,7 +857,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
         self.phase_orientations = []
 
         burned_in = False
-        all_assigned = False
         all_touched = False
         seg_touch_idx = np.zeros(len(self.S), dtype = bool)
 
@@ -884,10 +881,10 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
 
             # poll every 100 iterations for burnin status
             if not n_it % 100:
-                self.lik_tmp.append(self.post)
-                if not all_assigned and len(unassigned_segs) == 0:
-                    all_assigned = True
-                if not burned_in and all_assigned:
+
+                # have most segments been adjacency corrected?
+                # if so, has the overall likelihood stabilized enough that we're burned in?
+                if not burned_in:
                     # 1. have >90% of segments been adjacency corrected?
                     # print(seg_touch_idx.mean())
                     if seg_touch_idx.mean() > 0.9:
@@ -906,13 +903,7 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
 
             # pick a segment at random
             if np.random.rand() < 0.5:
-            #if np.random.rand() < 1:
-                # bias picking unassigned segments if >90% of segments have been assigned
-                if len(unassigned_segs) > 0 and len(unassigned_segs)/len(self.S) < 0.1 and np.random.rand() < 0.5:
-                    seg_idx = sc.SortedSet({np.random.choice(unassigned_segs)})
-                else:
-                    seg_idx = sc.SortedSet({np.random.choice(len(self.S))})
-
+                seg_idx = sc.SortedSet({np.random.choice(len(self.S))})
                 cur_clust = int(self.clusts[seg_idx])
 
                 # expand segment to include all adjacent segments in the same cluster,
@@ -989,7 +980,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
                         self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
                         self.clust_members[cur_clust] -= set(seg_idx)
 
-                    unassigned_segs.update(seg_idx)
                     self.clusts[seg_idx] = -1
 
             # pick a cluster at random
@@ -1011,14 +1001,10 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
                 del self.clust_counts[cl_idx]
                 del self.clust_sums[cl_idx]
                 del self.clust_members[cl_idx]
-                unassigned_segs.update(seg_idx)
                 self.clusts[seg_idx] = -1
 
                 move_clust = True
 
-            if not all_assigned:
-                seg_touch_idx[seg_idx] += 1
-
             #
             # perform phase correction on segment/cluster
             # flip min/maj with probability that alleles are oriented the "wrong" way
@@ -1129,10 +1115,9 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
             #adj_BC = np.zeros([len(self.clust_sums), 2])
 
             log_adj_lik = 0
-            if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01):
+            if not move_clust and not split_clust: # or (move_clust and np.random.rand() < 0.01):
                 log_adj_lik = self.compute_adj_prob(seg_idx)
-                if all_assigned:
-                    seg_touch_idx[seg_idx] = True
+                seg_touch_idx[seg_idx] = True
 
             # p(X|clust,phase)p(X|seg,phase)p(clust)
             num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})
@@ -1209,8 +1194,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False):
 
                 self.clust_members[choice].update(set(seg_idx))
 
-            for si in seg_idx:
-                unassigned_segs.discard(si)
 
             # track global state of cluster assignments
             # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations

From b6b2348e1cc49731083393146d5af909f9c3a4a6 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 8 Feb 2022 09:55:15 -0500
Subject: [PATCH 033/222] Update comments accounting for warm start

---
 hapaseg/allelic_DP.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 52e1f99..98a3c78 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -833,7 +833,7 @@ def run(self, n_iter = 0, n_samps = 0):
         #
         # initialize cluster tracking hash tables
         self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore"))
-        # for the first round of clustering, this is { 1 : 1 }
+        # for the first round of clustering, this is { 0 : 1, 1 : 1, ..., N - 1 : 1 }
 
         x = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum()
         if (x.droplevel(0).index == True).any():
@@ -842,10 +842,10 @@ def run(self, n_iter = 0, n_samps = 0):
           **{ k : np.r_[v["min"], v["maj"]] for k, v in x.groupby(level = "clust").sum().to_dict(orient = "index").items() },
           **{-1 : np.r_[0, 0]}
         })
-        # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] }
+        # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]], 1 : S[1, "min"], S[1, "maj"], ..., N : S[N - 1, "min"], S[N - 1, "maj"] }
 
         self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 })
-        # for the first round, this is { 1 : {0} }
+        # for the first round, this is { 0 : {0}, 1 : {1}, ..., N - 1 : {N - 1} }
 
         # store this as numpy for speed
         self.clusts = self.S["clust"].values

From 78059b940ffb12451307d20d25a464db3db5f605 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:36:45 -0500
Subject: [PATCH 034/222] Print abbreviated cluster summary for warm start

---
 hapaseg/allelic_DP.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 98a3c78..bde7f8e 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -861,15 +861,20 @@ def run(self, n_iter = 0, n_samps = 0):
         seg_touch_idx = np.zeros(len(self.S), dtype = bool)
 
         # likelihood trace
-        self.lik_tmp = []
+        self.lik_tmp = [-np.inf]
         self.post = 0
 
         n_it = 0
         n_it_last = 0
         while True:
             if not n_it % 1000:
-                print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
-                print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
+                if len(self.clust_counts) > 20:
+                    print(pd.Series(self.clust_counts.values()).value_counts().sort_index())
+                else:
+                    print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1]))
+                print(self.lik_tmp[-1])
+                #print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
+                #print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
 
             # stop after a raw number of iterations
             if n_iter > 0 and n_it > n_iter:

From 99ceff430dff32dea929202a867482404bd14eb3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:40:43 -0500
Subject: [PATCH 035/222] Use correct count marginal likelihood

---
 hapaseg/allelic_DP.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index bde7f8e..7cfee62 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -714,9 +714,10 @@ def compute_overall_lik_simple(self):
         # p({c_k})
         dirvec = np.r_[self.clust_counts.values()].astype(float)
         k = len(dirvec)
-        count_prior = ss.gammaln(dirvec + self.alpha/k).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - k*ss.gammaln(self.alpha/k)
+        count_prior = k*np.log(self.alpha) + ss.gammaln(dirvec).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha)
 
         ## segmentation likelihood
+        # p({a_i, b_i} | {s}, {phase_i})
         bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1])
         bdy = np.c_[bdy[:-1], bdy[1:]]
 

From eb1abc54e069737b814185c41cfab21092ff5046 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:41:05 -0500
Subject: [PATCH 036/222] Misc scrap commits to old overall likelihood function

---
 hapaseg/allelic_DP.py | 51 ++++++++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 7cfee62..64d31ad 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -732,6 +732,7 @@ def compute_overall_lik_simple(self):
         #return clust_lik + phase_lik + count_prior + seg_lik
         return np.r_[clust_lik, phase_lik, count_prior, seg_lik]
 
+    # {{{
     def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False):
         if segs_to_clusters is None:
             su, segs_to_clusters = self.get_unique_clust_idxs()
@@ -745,10 +746,11 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None
 
         max_clust_idx = segs_to_clusters.max() + 1
 
-        liks = np.full(segs_to_clusters.shape[0], np.nan)
+        liks = np.full([segs_to_clusters.shape[0], 2], np.nan)
 
         for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)):
             ## overall clustering likelihood
+            clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum()
 
             A1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx)
             A2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx)
@@ -756,36 +758,45 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None
             B1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx)
             B2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx)
 
-            count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)
+            # print(A1[1:].sum(), B1[1:].sum(), A2[1:].sum(), B2[1:].sum())
+
+            count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)[min_clust_idx:]
             count_prior /= count_prior.sum()
 
-            clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:].sum()
+            #breakpoint()
+
+            clust_lik = ((ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:] + np.log(count_prior)).sum()
             # account for unassigned clusters, if present
             if min_clust_idx == 1:
                 clust_lik += ss.betaln(self.S.loc[cl_samp == 0, "maj"] + 1, self.S.loc[cl_samp == 0, "min"] + 1).sum()
 
-#            ## segmentation likelihood
-#
-#            # get segment boundaries
-#            bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1])
-#            bdy = np.c_[bdy[:-1], bdy[1:]]
-#
-#            # sum log-likelihoods of each segment
-#            seg_lik = 0
-#            for st, en in bdy:
-#                A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum()
+            if debug:
+                breakpoint()
+
+            ## segmentation likelihood
+
+            seg_lik = np.nan
+#            if min_clust_idx == 0:
+#                # get segment boundaries
+#                bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1])
+#                bdy = np.c_[bdy[:-1], bdy[1:]]
 #
-## for when self.S is not modified
-##               A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
-##                   self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum()
-##               B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
-##                   self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum()
+#                # sum log-likelihoods of each segment
+#                seg_lik = 0
+#                for st, en in bdy:
+#                   A1 = self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum()
+#                   A2 = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum()
+#                   B1 = self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum()
+#                   B2 = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum()
 #
-#                seg_lik += ss.betaln(A + 1, B + 1)
+#                   seg_lik += ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1)
+#            else:
+#                seg_lik = np.nan
 
-            liks[i] = clust_lik
+            liks[i, :] = np.r_[clust_lik, seg_lik]
 
         return liks
+# }}}
 
     def run(self, n_iter = 0, n_samps = 0):
         #

From 5cf3e2d67d764ffbcfefdb4065fba0a7c6c15527 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:42:45 -0500
Subject: [PATCH 037/222] Remove outdated TODO

---
 hapaseg/allelic_DP.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 64d31ad..1295b09 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -982,6 +982,8 @@ def run(self, n_iter = 0, n_samps = 0):
 
                     split_clust = True
 
+                # }}}
+
                 seg_idx = np.r_[list(seg_idx)]
 
                 n_move = len(seg_idx)
@@ -1122,7 +1124,6 @@ def run(self, n_iter = 0, n_samps = 0):
             log_count_prior[1:] = ss.gammaln(n_move + n_c) + ss.gammaln(N + self.alpha - n_move) \
               - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha))
             # probability of opening a new cluster
-            # TODO: accommodate prior clusters here
             log_count_prior[0] = ss.gammaln(n_move) + np.log(self.alpha) + ss.gammaln(N + self.alpha - n_move) - ss.gammaln(N + self.alpha)
 
             #

From d88d7d90488b5ffcd30fdfcb1e0fe6e1f12ec4a0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:46:39 -0500
Subject: [PATCH 038/222] Remove cruft

---
 hapaseg/allelic_DP.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 1295b09..92883d1 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -225,7 +225,6 @@ def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None):
                     if k != -1 and k not in next_clust_prior:
                         clust_count_prior[k] -= clust_count_prior[k]/cur_samp_iter
 
-
             # remove improbable clusters from prior
             for kk in [k for k, v in clust_count_prior.items() if v < 1]:
                 del clust_prior[kk]
@@ -1056,22 +1055,6 @@ def run(self, n_iter = 0, n_samps = 0):
 
             MLs = BC - C[:, None]
 
-            # {{{
-            #     L(join)           L(split)
-            #MLs = A + BC + adj_BC - (AB + C + adj_AB)
-            # TODO: remove extraneous calculations (e.g. adj_AB, AB, A);
-            #       likelihood simplifies to this in the prior:
-            #MLs = adj_BC + BC - C
-
-            # if we are moving multiple contiguous segments assigned to the same
-            # cluster, do not allow them to create a new cluster. this helps keep
-            # cluster indices consistent
-            # TODO: if we don't care about keeping indices consistent, then we can probably remove this line
-            if n_move > 1 and not move_clust:
-                MLs[self.clust_sums.index(-1)] = -np.inf
-
-            # }}}
-
             #
             # priors
 
@@ -1163,11 +1146,6 @@ def run(self, n_iter = 0, n_samps = 0):
             # -1 = brand new, -2, -3, ... = -(prior clust index) - 2
             choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2]
 
-            # compute posterior delta between previous and current state
-            post_delta = num.ravel()[choice_idx] - \
-              num[self.clust_sums.index(cur_clust if cur_clust in self.clust_sums else -1), 0]
-            self.post += post_delta
-
             # save rephasing status
             if choice_idx & 1:
                 self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]

From d0e6e7d12bde60209947355945a354e454bcfe6b Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:48:31 -0500
Subject: [PATCH 039/222] Unfinished burnin-related code

---
 hapaseg/allelic_DP.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 92883d1..3dddcc4 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -897,7 +897,6 @@ def run(self, n_iter = 0, n_samps = 0):
 
             # poll every 100 iterations for burnin status
             if not n_it % 100:
-
                 # have most segments been adjacency corrected?
                 # if so, has the overall likelihood stabilized enough that we're burned in?
                 if not burned_in:
@@ -909,9 +908,16 @@ def run(self, n_iter = 0, n_samps = 0):
                     # 2. if >90% of segments have been adjacency corrected, check for burnin
                     # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum
                     # TODO: make this check more efficient?
-                    if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2:
-                        burned_in = True
-                        breakpoint()
+#                    if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2:
+#                        pass
+#                        burned_in = True
+#                        n_it_last = n_it
+#                        seg_touch_idx[:] = False
+
+                if burned_in and seg_touch_idx.mean() > 0.3:
+                    self.segs_to_clusters.append(self.S["clust"].copy())
+                    self.phase_orientations.append(self.S["flipped"].copy())
+                    seg_touch_idx[:] = False
 
             #
             # pick either a segment or a cluster at random (50:50 prob.)
@@ -1190,7 +1196,6 @@ def run(self, n_iter = 0, n_samps = 0):
 
                 self.clust_members[choice].update(set(seg_idx))
 
-
             # track global state of cluster assignments
             # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations
             if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2):

From 1f43c5934836092ef5b7ca3009495802fe1b56a8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:37:28 -0500
Subject: [PATCH 040/222] Try clustering on the SNP level

---
 hapaseg/allelic_DP.py | 105 +++++++++++++-----------------------------
 1 file changed, 33 insertions(+), 72 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 3dddcc4..97ae8fe 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -44,84 +44,45 @@ def load_seg_samp(self, samp_idx):
         if samp_idx > self.n_samp:
             raise ValueError(f"Only {self.n_samp} MCMC samples were taken!")
 
-        all_segs = []
-        all_SNPs = []
-
-        maj_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MAJ_COUNT")
-        min_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MIN_COUNT")
-
-        alt_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("ALT_COUNT")
-        ref_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("REF_COUNT")
-
-        chunk_offset = 0
+        SNPs = []
+        clust_offset = 0
         for _, H in self.allelic_segs.dropna(subset = ["results"]).iterrows():
-            r = copy.deepcopy(H["results"])
-
-            # set phasing orientation back to original
-            for st, en in r.F.intervals():
-                # code excised from flip_hap
-                x = r.P.iloc[st:en, maj_idx].copy()
-                r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx]
-                r.P.iloc[st:en, min_idx] = x
-
-            # save SNPs for this chunk
-            if self.SNPs is None:
-                all_SNPs.append(pd.DataFrame({
-                  "maj" : r.P["MAJ_COUNT"],
-                  "min" : r.P["MIN_COUNT"],
-                  # TODO: gpos should be computed earlier, so that that we don't need to pass ref_fasta here
-                  "gpos" : seq.chrpos2gpos(r.P.loc[0, "chr"], r.P["pos"], ref = self.ref_fasta),
-                  "allele" : r.P["allele_A"]
-                }))
-
-            # draw breakpoint, phasing, and SNP inclusion sample from segmentation MCMC trace
-            bp_samp, pi_samp, inc_samp = (r.breakpoint_list[samp_idx], r.phase_interval_list[samp_idx] if r.phase_correct else None, r.include[samp_idx])
-            # flip everything according to sample
-            if r.phase_correct:
-                for st, en in pi_samp.intervals():
-                    x = r.P.iloc[st:en, maj_idx].copy()
-                    r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx]
-                    r.P.iloc[st:en, min_idx] = x
-
-            bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]]
-
-            # get major/minor sums for each segment
-            # also get {alt, ref} x {aidx, bidx}
-            for st, en in bpl:
-                all_segs.append([
-                  st + chunk_offset, en + chunk_offset,                        # SNP index for seg
-                  r.P.loc[st, "chr"], r.P.loc[st, "pos"], r.P.loc[en, "pos"],  # chromosomal position of seg
-                  r._Piloc(st, en, min_idx, inc_samp).sum(),                   # min/maj counts
-                  r._Piloc(st, en, maj_idx, inc_samp).sum(),
-
-                  r._Piloc(st, en, alt_idx, inc_samp & r.P["aidx"]).sum(),     # allele A alt/ref
-                  r._Piloc(st, en, ref_idx, inc_samp & r.P["aidx"]).sum(),
-                  r._Piloc(st, en, alt_idx, inc_samp & ~r.P["aidx"]).sum(),    # allele B alt/ref
-                  r._Piloc(st, en, ref_idx, inc_samp & ~r.P["aidx"]).sum()
-                ])
-
-            chunk_offset += len(r.P)
-
-        # convert samples into dataframe
-        S = pd.DataFrame(all_segs, columns = ["SNP_st", "SNP_en", "chr", "start", "end", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"])
+            S = copy.deepcopy(H["results"].P)
+            S["A_alt"] = 0
+            S.loc[S["aidx"], "A_alt"] = S.loc[S["aidx"], "ALT_COUNT"]
+            S["A_ref"] = 0
+            S.loc[S["aidx"], "A_ref"] = S.loc[S["aidx"], "REF_COUNT"]
+            S["B_alt"] = 0
+            S.loc[~S["aidx"], "B_alt"] = S.loc[~S["aidx"], "ALT_COUNT"]
+            S["B_ref"] = 0
+            S.loc[~S["aidx"], "B_ref"] = S.loc[~S["aidx"], "REF_COUNT"]
+
+            S = S.rename(columns = { "MIN_COUNT" : "min", "MAJ_COUNT" : "maj" })
+            S = S.loc[:, ["chr", "pos", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"]]
+
+            # set initial cluster assignments based on segmentation
+            S["clust"] = -1
+            # TODO: use ML segmentation
+            bpl = np.array(H["results"].breakpoint_list[samp_idx]); bpl = np.c_[bpl[0:-1], bpl[1:]]
+            for i, (st, en) in enumerate(bpl):
+                S.iloc[st:en, S.columns.get_loc("clust")] = i + clust_offset
+            clust_offset += i
+
+            # bug in segmentation omits final SNP?
+            S = S.iloc[:-1]
+            assert (S["clust"] != -1).all()
+
+            SNPs.append(S)
+
+        SNPs = pd.concat(SNPs, ignore_index = True)
 
         # convert chr-relative positions to absolute genomic coordinates
-        S["start_gp"] = seq.chrpos2gpos(S["chr"], S["start"], ref = self.ref_fasta)
-        S["end_gp"] = seq.chrpos2gpos(S["chr"], S["end"], ref = self.ref_fasta)
-
-        # initial cluster assignments
-        S["clust"] = -1 # initially, all segments are unassigned
-        S.iloc[0, S.columns.get_loc("clust")] = 0 # first segment is assigned to cluster 0
+        SNPs["pos_gp"] = seq.chrpos2gpos(SNPs["chr"], SNPs["pos"], ref = self.ref_fasta)
 
         # initial phasing orientation
-        S["flipped"] = False
-
-        if self.SNPs is None:
-            self.SNPs = pd.concat(all_SNPs, ignore_index = True)
-            CI = s.beta.ppf([0.05, 0.5, 0.95], self.SNPs["min"].values[:, None] + 1, self.SNPs["maj"].values[:, None] + 1)
-            self.SNPs[["f_CI_lo", "f", "f_CI_hi"]] = CI
+        SNPs["flipped"] = False
 
-        return S, self.SNPs
+        return SNPs, None
 
     # map trace of segment cluster assignments to the SNPs within
     @staticmethod

From 5d963fd122ef9fdd33ac5102eeaee7bc5d2179fc Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:38:33 -0500
Subject: [PATCH 041/222] Try moving segments by (almost) default

---
 hapaseg/allelic_DP.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 97ae8fe..0f4ced8 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -304,12 +304,6 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.bref_col = self.S.columns.get_loc("B_ref")
         self.flip_col = self.S.columns.get_loc("flipped")
 
-        #
-        # compute rephase probabilities for each segment
-        self.S["rephase_prob"] = np.nan
-        for i in range(0, len(self.S)):
-            self.S.at[i, "rephase_prob"] = self.compute_rephase_prob(np.r_[i])
-
         #
         # initialize priors
 
@@ -891,7 +885,7 @@ def run(self, n_iter = 0, n_samps = 0):
 
                 # expand segment to include all adjacent segments in the same cluster,
                 # if it has already been assigned to a cluster
-                if cur_clust >= 0 and np.random.rand() < 0.5:
+                if cur_clust >= 0 and np.random.rand() < 0.95:
                     si = seg_idx[0]
 
                     j = 1

From 6d40fc30906b0dbc05e5dd684c235d3526c6a4eb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:39:45 -0500
Subject: [PATCH 042/222] Can't split segs of length 1

---
 hapaseg/allelic_DP.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 0f4ced8..1d86ff1 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -905,11 +905,17 @@ def run(self, n_iter = 0, n_samps = 0):
 #                        n_it += 1
 #                        continue
 
-                # propose splitting out a contiguous interval of segments within the current cluster
+                # propose splitting out a contiguous interval of segments within the current cluster {{{
                 split_clust = False
-                if all_assigned and np.random.rand() < 0.1:
+                if np.random.rand() < 0.1:
                     # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable?
                     clust_segs = np.sort(np.r_[list(self.clust_members[cur_clust])])
+
+                    # can't split clusters of length 1
+                    if len(clust_segs) == 1:
+                        n_it += 1
+                        continue
+
                     split_bdy = self.compute_cluster_splitpoints(clust_segs)
 
                     A_tot, B_tot = self.clust_sums[cur_clust]

From 81686cbbdfe3d73683ddb18a6f60d47f5ec2cc5d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 9 Feb 2022 06:51:17 -0500
Subject: [PATCH 043/222] Diagnostic code for printing each transition

---
 hapaseg/allelic_DP.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 1d86ff1..fd6d9e4 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1117,6 +1117,11 @@ def run(self, n_iter = 0, n_samps = 0):
             if choice_idx & 1:
                 self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
 
+            if not move_clust:
+                print(f"{cur_clust}->{choice} ({len(seg_idx)}, s, [{seg_idx[0]}, {seg_idx[-1]}])")
+            else:
+                print(f"{cl_idx}->{choice} ({len(seg_idx)}, c, [{seg_idx[0]}, {seg_idx[-1]}])")
+
             # create new cluster
             if choice < 0:
                 # if we are moving an entire cluster, give it the same index it used to have

From 673a17259e619c1c69048c837559d342d75ad9a2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 08:41:50 -0500
Subject: [PATCH 044/222] Rescale DP counts by average segment length

---
 hapaseg/allelic_DP.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index fd6d9e4..25d68de 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -282,12 +282,13 @@ def scerrorbar(idx, rev = False, alpha = 1, show_CI = True):
         scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color)
 
 class DPinstance:
-    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1):
+    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1, dp_count_scale_factor = 1):
         self.S = S
         self.clust_prior = clust_prior.copy()
         self.clust_count_prior = clust_count_prior.copy()
         self.alpha = alpha
         self.temperature = temperature
+        self.dp_count_scale_factor = dp_count_scale_factor
 
         self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed
         self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
@@ -1068,13 +1069,14 @@ def run(self, n_iter = 0, n_samps = 0):
             # }}}
                 
             ## DP prior based on clusters sizes
-            n_c = np.c_[self.clust_counts.values()]
-            N = n_c.sum() + n_move
+            n_c = np.c_[self.clust_counts.values()]/self.dp_count_scale_factor
+            M = n_move/self.dp_count_scale_factor
+            N = n_c.sum() + M
             log_count_prior = np.full([len(self.clust_sums), 1], np.nan)
-            log_count_prior[1:] = ss.gammaln(n_move + n_c) + ss.gammaln(N + self.alpha - n_move) \
+            log_count_prior[1:] = ss.gammaln(M + n_c) + ss.gammaln(N + self.alpha - M) \
               - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha))
             # probability of opening a new cluster
-            log_count_prior[0] = ss.gammaln(n_move) + np.log(self.alpha) + ss.gammaln(N + self.alpha - n_move) - ss.gammaln(N + self.alpha)
+            log_count_prior[0] = ss.gammaln(M) + np.log(self.alpha) + ss.gammaln(N + self.alpha - M) - ss.gammaln(N + self.alpha)
 
             #
             # adjacent segment likelihood

From 0d51cce02fdbd0b99d14c42e465e1383f2e3ace9 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 10:08:07 -0500
Subject: [PATCH 045/222] Keep track of segmentation breakpoints

---
 hapaseg/allelic_DP.py | 64 +++++++++++++++++++++++++++----------------
 1 file changed, 40 insertions(+), 24 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 25d68de..bea48d3 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -395,34 +395,42 @@ def compute_adj_prob(self, seg_idx):
         ## compute boundaries of adjacent segments
 
         # maj/min counts of contiguous upstream segments belonging to the same cluster
-        st = seg_idx[0]
         U_A = 0
         U_B = 0
         U_cl = -1
-        if st - 1 > 0:
-            U_cl = self.clusts[st - 1]
-            j = 1
-            while st - j > 0 and self.clusts[st - j] != -1 and \
-              self.clusts[st - j] == U_cl:
-                U_A += self._Siat_ph(st - j, min = True)
-                U_B += self._Siat_ph(st - j, min = False)
-
-                j += 1
+        if seg_idx[0] - 1 > 0:
+            break_idx = self.breakpoints.index(seg_idx[0]) - 1
+            seg_st = self.breakpoints[break_idx]
+            seg_en = self.breakpoints[break_idx + 1]
+            U_cl = self.clusts[seg_st]
+            while break_idx > 0 and self.clusts[seg_st] != -1 and \
+              self.clusts[seg_st] == U_cl:
+                # TODO: segment sums will eventually be memoized
+                U_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True)
+                U_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False)
+
+                break_idx = self.breakpoints.index(seg_st) - 1
+                seg_st = self.breakpoints[break_idx]
+                seg_en = self.breakpoints[break_idx + 1]
 
         # maj/min counts of contiguous downstream segments belonging to the same cluster
-        en = seg_idx[-1]
         D_A = 0
         D_B = 0
         D_cl = -1
-        if en + 1 < len(self.S):
-            D_cl = self.clusts[en + 1]
-            j = 1
-            while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \
-              self.clusts[en + j] == D_cl:
-                D_A += self._Siat_ph(en + j, min = True)
-                D_B += self._Siat_ph(en + j, min = False)
-
-                j += 1 
+        if seg_idx[-1] + 1 < len(self.S):
+            break_idx = self.breakpoints.index(seg_idx[0]) + 1
+            seg_st = self.breakpoints[break_idx]
+            seg_en = self.breakpoints[break_idx + 1]
+            D_cl = self.clusts[seg_st]
+            while break_idx < len(self.breakpoints) - 1 and self.clusts[seg_st] != -1 and \
+              self.clusts[seg_st] == D_cl:
+                # TODO: segment sums will eventually be memoized
+                D_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True)
+                D_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False)
+
+                break_idx = self.breakpoints.index(seg_st) + 1
+                seg_st = self.breakpoints[break_idx]
+                seg_en = self.breakpoints[break_idx + 1]
 
         # maj/min counts of segment(s) being moved
         S_A = self._Ssum_ph(seg_idx, min = True)
@@ -526,7 +534,7 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                 # min/maj counts of the segment(s) being moved
                 st = ordpairs[j, 0]
                 en = ordpairs[j, 1]
-                S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is closed
+                S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is a closed interval
                 S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) 
 
                 # adjacency likelihood of this segment remaining where it is
@@ -819,6 +827,10 @@ def run(self, n_iter = 0, n_samps = 0):
 
         max_clust_idx = np.max(self.clust_members.keys() | self.clust_prior.keys() if self.clust_prior is not None else {})
 
+        # segmentation breakpoints
+        self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)}
+        # TODO: memoize min/maj counts for each segment
+
         # containers for saving the MCMC trace
         self.segs_to_clusters = []
         self.phase_orientations = []
@@ -880,9 +892,13 @@ def run(self, n_iter = 0, n_samps = 0):
             move_clust = False
 
             # pick a segment at random
-            if np.random.rand() < 0.5:
-                seg_idx = sc.SortedSet({np.random.choice(len(self.S))})
-                cur_clust = int(self.clusts[seg_idx])
+            if True or np.random.rand() < 0.5:
+                # get all SNPs within this segment
+                # TODO: why is seg_idx a sortedset? we don't use that functionality elsewhere
+                break_idx = np.random.choice(len(self.breakpoints) - 1)
+                seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx]:self.breakpoints[break_idx + 1]])
+
+                cur_clust = int(self.clusts[seg_idx[0]])
 
                 # expand segment to include all adjacent segments in the same cluster,
                 # if it has already been assigned to a cluster

From 719fc7a7074e60a49281c57532d5b7b18e1d2944 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 10:43:39 -0500
Subject: [PATCH 046/222] Memoize segment min/maj counts

---
 hapaseg/allelic_DP.py | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index bea48d3..5716976 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -405,9 +405,8 @@ def compute_adj_prob(self, seg_idx):
             U_cl = self.clusts[seg_st]
             while break_idx > 0 and self.clusts[seg_st] != -1 and \
               self.clusts[seg_st] == U_cl:
-                # TODO: segment sums will eventually be memoized
-                U_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True)
-                U_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False)
+                U_A += self.seg_sums[seg_st][0]
+                U_B += self.seg_sums[seg_st][1]
 
                 break_idx = self.breakpoints.index(seg_st) - 1
                 seg_st = self.breakpoints[break_idx]
@@ -424,17 +423,16 @@ def compute_adj_prob(self, seg_idx):
             D_cl = self.clusts[seg_st]
             while break_idx < len(self.breakpoints) - 1 and self.clusts[seg_st] != -1 and \
               self.clusts[seg_st] == D_cl:
-                # TODO: segment sums will eventually be memoized
-                D_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True)
-                D_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False)
+                D_A += self.seg_sums[seg_st][0]
+                D_B += self.seg_sums[seg_st][1]
 
                 break_idx = self.breakpoints.index(seg_st) + 1
                 seg_st = self.breakpoints[break_idx]
                 seg_en = self.breakpoints[break_idx + 1]
 
         # maj/min counts of segment(s) being moved
-        S_A = self._Ssum_ph(seg_idx, min = True)
-        S_B = self._Ssum_ph(seg_idx, min = False)
+        S_A = self.seg_sums[seg_idx[0]][0]
+        S_B = self.seg_sums[seg_idx[0]][1]
 
         ## compute all four possible segmentations relative to neighbor, in
         ## both phasing orientations
@@ -810,11 +808,11 @@ def run(self, n_iter = 0, n_samps = 0):
         self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore"))
         # for the first round of clustering, this is { 0 : 1, 1 : 1, ..., N - 1 : 1 }
 
-        x = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum()
-        if (x.droplevel(0).index == True).any():
-            x.loc[(slice(None), True), ["min", "maj"]] = x.loc[(slice(None), True), ["maj", "min"]].values
+        Sgc = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum()
+        if (Sgc.droplevel(0).index == True).any():
+            Sgc.loc[(slice(None), True), ["min", "maj"]] = Sgc.loc[(slice(None), True), ["maj", "min"]].values
         self.clust_sums = sc.SortedDict({
-          **{ k : np.r_[v["min"], v["maj"]] for k, v in x.groupby(level = "clust").sum().to_dict(orient = "index").items() },
+          **{ k : np.r_[v["min"], v["maj"]] for k, v in Sgc.groupby(level = "clust").sum().to_dict(orient = "index").items() },
           **{-1 : np.r_[0, 0]}
         })
         # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]], 1 : S[1, "min"], S[1, "maj"], ..., N : S[N - 1, "min"], S[N - 1, "maj"] }
@@ -829,7 +827,13 @@ def run(self, n_iter = 0, n_samps = 0):
 
         # segmentation breakpoints
         self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)}
-        # TODO: memoize min/maj counts for each segment
+        # min/maj counts in each segment
+        self.seg_sums = sc.SortedDict()
+        bpl = np.r_[self.breakpoints]
+        for st, en in np.c_[bpl[:-1], bpl[1:]]:
+            mn = self._Ssum_ph(np.r_[st:en], min = True)
+            mj = self._Ssum_ph(np.r_[st:en], min = False)
+            self.seg_sums[st] = np.r_[mn, mj]
 
         # containers for saving the MCMC trace
         self.segs_to_clusters = []

From bf337b6e88a45cd05edf73ce18c7ec8d85148ad3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 11:05:30 -0500
Subject: [PATCH 047/222] Fix indexing bug

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 5716976..97ba693 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -421,7 +421,7 @@ def compute_adj_prob(self, seg_idx):
             seg_st = self.breakpoints[break_idx]
             seg_en = self.breakpoints[break_idx + 1]
             D_cl = self.clusts[seg_st]
-            while break_idx < len(self.breakpoints) - 1 and self.clusts[seg_st] != -1 and \
+            while break_idx < len(self.breakpoints) - 2 and self.clusts[seg_st] != -1 and \
               self.clusts[seg_st] == D_cl:
                 D_A += self.seg_sums[seg_st][0]
                 D_B += self.seg_sums[seg_st][1]

From a63f5deb442290c6e0414452d879a24c7ba2f9bb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 11:48:55 -0500
Subject: [PATCH 048/222] Merge adjacent segments if they're assigned to the
 same cluster

---
 hapaseg/allelic_DP.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 97ba693..9563781 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -898,9 +898,8 @@ def run(self, n_iter = 0, n_samps = 0):
             # pick a segment at random
             if True or np.random.rand() < 0.5:
                 # get all SNPs within this segment
-                # TODO: why is seg_idx a sortedset? we don't use that functionality elsewhere
-                break_idx = np.random.choice(len(self.breakpoints) - 1)
-                seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx]:self.breakpoints[break_idx + 1]])
+                break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
+                seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]])
 
                 cur_clust = int(self.clusts[seg_idx[0]])
 
@@ -1184,6 +1183,21 @@ def run(self, n_iter = 0, n_samps = 0):
 
                 self.clust_members[choice].update(set(seg_idx))
 
+            # update breakpoints
+            snp_idx = [self.breakpoints[b] for b in break_idx]
+            update_idx = sc.SortedSet()
+            for snp in snp_idx:
+                if self.clusts[snp - 1] == self.clusts[snp]:
+                    self.breakpoints.remove(snp)
+                    self.seg_sums.pop(snp)
+                    update_idx.add(self.breakpoints.bisect_left(snp) - 1)
+            for bp_idx in update_idx:
+                st = self.breakpoints[bp_idx]
+                en = self.breakpoints[bp_idx + 1]
+                mn = self._Ssum_ph(np.r_[st:en], min = True)
+                mj = self._Ssum_ph(np.r_[st:en], min = False)
+                self.seg_sums[st] = np.r_[mn, mj]
+
             # track global state of cluster assignments
             # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations
             if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2):

From daa625f0dbddd13b330760d7c71041e202a01c18 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 13:12:00 -0500
Subject: [PATCH 049/222] Need to consider both ends of segments

---
 hapaseg/allelic_DP.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 9563781..62162fb 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1184,10 +1184,10 @@ def run(self, n_iter = 0, n_samps = 0):
                 self.clust_members[choice].update(set(seg_idx))
 
             # update breakpoints
-            snp_idx = [self.breakpoints[b] for b in break_idx]
+            snp_idx = [self.breakpoints[b] for b in break_idx | { x + 1 for x in break_idx }]
             update_idx = sc.SortedSet()
             for snp in snp_idx:
-                if self.clusts[snp - 1] == self.clusts[snp]:
+                if snp < len(self.S) and self.clusts[snp - 1] == self.clusts[snp]:
                     self.breakpoints.remove(snp)
                     self.seg_sums.pop(snp)
                     update_idx.add(self.breakpoints.bisect_left(snp) - 1)

From d7c9bd88c2658ff3fc5157b1d356be17d692098a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 14:19:51 -0500
Subject: [PATCH 050/222] Allow segments to be broken

Also remove code for expanding into adjacent segments; this is no longer
necessary since we keep track of breakpoints now
---
 hapaseg/allelic_DP.py | 42 ++++++++++++++++++------------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 62162fb..97c8915 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -899,31 +899,27 @@ def run(self, n_iter = 0, n_samps = 0):
             if True or np.random.rand() < 0.5:
                 # get all SNPs within this segment
                 break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
-                seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]])
+                seg_idx = np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]]
 
                 cur_clust = int(self.clusts[seg_idx[0]])
 
-                # expand segment to include all adjacent segments in the same cluster,
-                # if it has already been assigned to a cluster
-                if cur_clust >= 0 and np.random.rand() < 0.95:
-                    si = seg_idx[0]
-
-                    j = 1
-                    while si - j > 0 and self.clusts[si - j] == cur_clust:
-                        seg_idx.add(si - j)
-                        j += 1
-                    j = 1
-                    while si + j < len(self.S) and self.clusts[si + j] == cur_clust:
-                        seg_idx.add(si + j)
-                        j += 1
-
-                    # if we've expanded to include a large fraction (>10%) of segments 
-                    # in this cluster, cluster indexing might become inconsistent.
-                    # skip this iteration
-#                    if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]:
-#                        breakpoint()
-#                        n_it += 1
-#                        continue
+                # propose breaking this segment
+                if np.random.rand() < 0.1:
+                    # can't split segments of length 1
+                    if len(seg_idx) == 1:
+                        n_it += 1
+                        continue
+
+                    # TODO: memoize cumsums?
+                    min_cs = self._Scumsum_ph(seg_idx, min = True)
+                    min_csr = self.seg_sums[seg_idx[0]][0] - min_cs
+                    maj_cs = self._Scumsum_ph(seg_idx, min = False)
+                    maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs
+
+                    split_lik = ss.betaln(min_cs + 1, maj_cs + 1) + ss.betaln(min_csr + 1, maj_csr + 1)
+                    split_lik -= split_lik.max()
+                    split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
+                    seg_idx = seg_idx[:(split_point + 1)]
 
                 # propose splitting out a contiguous interval of segments within the current cluster {{{
                 split_clust = False
@@ -970,8 +966,6 @@ def run(self, n_iter = 0, n_samps = 0):
 
                 # }}}
 
-                seg_idx = np.r_[list(seg_idx)]
-
                 n_move = len(seg_idx)
 
                 # if segment was already assigned to a cluster, unassign it

From afa35efe86abdec94efcfce54e9218bf1781b77a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 15:20:25 -0500
Subject: [PATCH 051/222] Add breakpoints when splitting segment

---
 hapaseg/allelic_DP.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 97c8915..173a6cf 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -899,7 +899,9 @@ def run(self, n_iter = 0, n_samps = 0):
             if True or np.random.rand() < 0.5:
                 # get all SNPs within this segment
                 break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
-                seg_idx = np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]]
+                seg_st = self.breakpoints[break_idx[0]]
+                seg_en = self.breakpoints[break_idx[0] + 1]
+                seg_idx = np.r_[seg_st:seg_en]
 
                 cur_clust = int(self.clusts[seg_idx[0]])
 
@@ -921,6 +923,13 @@ def run(self, n_iter = 0, n_samps = 0):
                     split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
                     seg_idx = seg_idx[:(split_point + 1)]
 
+                    # add breakpoint (can be erased subsequently if segment rejoins original cluster)
+                    new_bp = seg_idx[-1] + 1
+                    if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment
+                        self.breakpoints.add(new_bp)
+                        self.seg_sums[new_bp] = np.r_[self._Ssum_ph(np.r_[new_bp:seg_en], min = True), self._Ssum_ph(np.r_[new_bp:seg_en], min = False)]
+                        self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp]
+
                 # propose splitting out a contiguous interval of segments within the current cluster {{{
                 split_clust = False
                 if np.random.rand() < 0.1:

From 10629c34f012e1e45b1243f08825db5b70b334b6 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 25 Feb 2022 16:26:45 -0500
Subject: [PATCH 052/222] Update seg_sums for phase flips

---
 hapaseg/allelic_DP.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 173a6cf..f92bc28 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1140,6 +1140,10 @@ def run(self, n_iter = 0, n_samps = 0):
             # save rephasing status
             if choice_idx & 1:
                 self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
+                for b in break_idx:
+                    st = self.breakpoints[b]
+                    en = self.breakpoints[b + 1]
+                    self.seg_sums[st] = self.seg_sums[st][::-1]
 
             if not move_clust:
                 print(f"{cur_clust}->{choice} ({len(seg_idx)}, s, [{seg_idx[0]}, {seg_idx[-1]}])")
@@ -1197,9 +1201,10 @@ def run(self, n_iter = 0, n_samps = 0):
             for bp_idx in update_idx:
                 st = self.breakpoints[bp_idx]
                 en = self.breakpoints[bp_idx + 1]
-                mn = self._Ssum_ph(np.r_[st:en], min = True)
-                mj = self._Ssum_ph(np.r_[st:en], min = False)
-                self.seg_sums[st] = np.r_[mn, mj]
+                self.seg_sums[st] = np.r_[
+                  self._Ssum_ph(np.r_[st:en], min = True),
+                  self._Ssum_ph(np.r_[st:en], min = False)
+                ]
 
             # track global state of cluster assignments
             # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations

From dd7ffdf5aece3f50f2797798637d7119adcd5bf6 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sun, 27 Feb 2022 10:48:59 -0500
Subject: [PATCH 053/222] Fix style

---
 hapaseg/allelic_DP.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index f92bc28..3e211f0 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -376,13 +376,13 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
 #                J_b = S.iloc[st:(en + 1), maj_col].sum()
         SU_a = SU_b = SD_a = SD_b = 0
 
-        if targ_clust != - 1 and targ_clust == upstream_clust:
+        if targ_clust != -1 and targ_clust == upstream_clust:
             J_a += U_a
             J_b += U_b
         else:
             SU_a += U_a
             SU_b += U_b
-        if targ_clust != - 1 and targ_clust == downstream_clust:
+        if targ_clust != -1 and targ_clust == downstream_clust:
             J_a += D_a
             J_b += D_b
         else:

From ff508a9107187f8752fa78878295724bc0e0265b Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sun, 27 Feb 2022 14:25:07 -0500
Subject: [PATCH 054/222] Add hash mapping cluster indices -> breakpoints

---
 hapaseg/allelic_DP.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 3e211f0..914e790 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -834,6 +834,11 @@ def run(self, n_iter = 0, n_samps = 0):
             mn = self._Ssum_ph(np.r_[st:en], min = True)
             mj = self._Ssum_ph(np.r_[st:en], min = False)
             self.seg_sums[st] = np.r_[mn, mj]
+        # breakpoints for each cluster
+        self.clust_members_bps = sc.SortedDict({
+          k : sc.SortedSet(v) for k, v in \
+            self.S.loc[self.breakpoints[:-1], ["clust"]].groupby("clust").groups.items()
+        })
 
         # containers for saving the MCMC trace
         self.segs_to_clusters = []
@@ -929,6 +934,7 @@ def run(self, n_iter = 0, n_samps = 0):
                         self.breakpoints.add(new_bp)
                         self.seg_sums[new_bp] = np.r_[self._Ssum_ph(np.r_[new_bp:seg_en], min = True), self._Ssum_ph(np.r_[new_bp:seg_en], min = False)]
                         self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp]
+                        self.clust_members_bps[cur_clust].add(new_bp)
 
                 # propose splitting out a contiguous interval of segments within the current cluster {{{
                 split_clust = False
@@ -984,9 +990,11 @@ def run(self, n_iter = 0, n_samps = 0):
                         del self.clust_counts[cur_clust]
                         del self.clust_sums[cur_clust]
                         del self.clust_members[cur_clust]
+                        del self.clust_members_bps[cur_clust]
                     else:
                         self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
                         self.clust_members[cur_clust] -= set(seg_idx)
+                        self.clust_members_bps[cur_clust].remove(self.breakpoints[break_idx[0]])
 
                     self.clusts[seg_idx] = -1
 
@@ -1206,6 +1214,11 @@ def run(self, n_iter = 0, n_samps = 0):
                   self._Ssum_ph(np.r_[st:en], min = False)
                 ]
 
+            if choice < 0:
+                self.clust_members_bps[new_clust_idx] = sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx])
+            else:
+                self.clust_members_bps[choice] |= sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx])
+
             # track global state of cluster assignments
             # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations
             if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2):

From c135bec43b35916118f53670d96083c314402c53 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 2 Mar 2022 14:38:41 -0500
Subject: [PATCH 055/222] Track breakpoints within each cluster

---
 hapaseg/allelic_DP.py | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 914e790..e5f5ed9 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1007,6 +1007,10 @@ def run(self, n_iter = 0, n_samps = 0):
 
                 cl_idx = np.random.choice(self.clust_counts.keys())
                 seg_idx = np.r_[list(self.clust_members[cl_idx])]
+
+                # get all breakpoints corresponding to this cluster
+                break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cl_idx]])
+
                 n_move = len(seg_idx)
                 cur_clust = -1 # only applicable for individual segments, so we set to -1 here
                                # (this is so that subsequent references to clust_sums[cur_clust]
@@ -1017,6 +1021,7 @@ def run(self, n_iter = 0, n_samps = 0):
                 del self.clust_counts[cl_idx]
                 del self.clust_sums[cl_idx]
                 del self.clust_members[cl_idx]
+                del self.clust_members_bps[cl_idx]
                 self.clusts[seg_idx] = -1
 
                 move_clust = True
@@ -1199,13 +1204,25 @@ def run(self, n_iter = 0, n_samps = 0):
                 self.clust_members[choice].update(set(seg_idx))
 
             # update breakpoints
-            snp_idx = [self.breakpoints[b] for b in break_idx | { x + 1 for x in break_idx }]
+
+            # B->A
+            #    .   .     .   break_idx + 1
+            # A B A B A C B A
+            #  +   +     +     break_idx
+            #  *         *     update_idx
+
+            break_idx_bi = break_idx | { x + 1 for x in break_idx }
+            snp_idx_bi = sc.SortedSet([self.breakpoints[b] for b in break_idx_bi])
+            snp_idx = sc.SortedSet([self.breakpoints[b] for b in break_idx])
             update_idx = sc.SortedSet()
-            for snp in snp_idx:
+            for snp in snp_idx_bi:
                 if snp < len(self.S) and self.clusts[snp - 1] == self.clusts[snp]:
+                    snp_idx.discard(snp) # discard rather than remvoe because this could be in snp_idx + 1
                     self.breakpoints.remove(snp)
                     self.seg_sums.pop(snp)
+                    self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster
                     update_idx.add(self.breakpoints.bisect_left(snp) - 1)
+                    snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1])
             for bp_idx in update_idx:
                 st = self.breakpoints[bp_idx]
                 en = self.breakpoints[bp_idx + 1]
@@ -1215,9 +1232,9 @@ def run(self, n_iter = 0, n_samps = 0):
                 ]
 
             if choice < 0:
-                self.clust_members_bps[new_clust_idx] = sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx])
+                self.clust_members_bps[new_clust_idx] = snp_idx
             else:
-                self.clust_members_bps[choice] |= sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx])
+                self.clust_members_bps[choice] |= snp_idx
 
             # track global state of cluster assignments
             # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations

From 4bccf1a4575ed73576d737987cf58eeb24b13998 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 3 Mar 2022 12:29:05 -0500
Subject: [PATCH 056/222] Improve efficiency of computing adjacency likelihood

---
 hapaseg/allelic_DP.py | 78 ++++++++++++++++---------------------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index e5f5ed9..51d83ed 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -391,48 +391,21 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
 
         return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1)
 
-    def compute_adj_prob(self, seg_idx):
-        ## compute boundaries of adjacent segments
-
-        # maj/min counts of contiguous upstream segments belonging to the same cluster
-        U_A = 0
-        U_B = 0
-        U_cl = -1
-        if seg_idx[0] - 1 > 0:
-            break_idx = self.breakpoints.index(seg_idx[0]) - 1
-            seg_st = self.breakpoints[break_idx]
-            seg_en = self.breakpoints[break_idx + 1]
-            U_cl = self.clusts[seg_st]
-            while break_idx > 0 and self.clusts[seg_st] != -1 and \
-              self.clusts[seg_st] == U_cl:
-                U_A += self.seg_sums[seg_st][0]
-                U_B += self.seg_sums[seg_st][1]
-
-                break_idx = self.breakpoints.index(seg_st) - 1
-                seg_st = self.breakpoints[break_idx]
-                seg_en = self.breakpoints[break_idx + 1]
-
-        # maj/min counts of contiguous downstream segments belonging to the same cluster
-        D_A = 0
-        D_B = 0
-        D_cl = -1
-        if seg_idx[-1] + 1 < len(self.S):
-            break_idx = self.breakpoints.index(seg_idx[0]) + 1
-            seg_st = self.breakpoints[break_idx]
-            seg_en = self.breakpoints[break_idx + 1]
-            D_cl = self.clusts[seg_st]
-            while break_idx < len(self.breakpoints) - 2 and self.clusts[seg_st] != -1 and \
-              self.clusts[seg_st] == D_cl:
-                D_A += self.seg_sums[seg_st][0]
-                D_B += self.seg_sums[seg_st][1]
-
-                break_idx = self.breakpoints.index(seg_st) + 1
-                seg_st = self.breakpoints[break_idx]
-                seg_en = self.breakpoints[break_idx + 1]
-
-        # maj/min counts of segment(s) being moved
-        S_A = self.seg_sums[seg_idx[0]][0]
-        S_B = self.seg_sums[seg_idx[0]][1]
+    def compute_adj_prob(self, break_idx):
+        if break_idx > 1:
+            U_A, U_B = self.seg_sums[self.breakpoints[break_idx - 1]]
+            U_cl = self.clusts[self.breakpoints[break_idx - 1]]
+        else:
+            U_A = U_B = 0
+            U_cl = -1
+        if break_idx + 2 < len(self.breakpoints):
+            D_A, D_B = self.seg_sums[self.breakpoints[break_idx + 1]]
+            D_cl = self.clusts[self.breakpoints[break_idx + 1]]
+        else:
+            D_A = D_B = 0
+            D_cl = -1
+
+        S_A, S_B = self.seg_sums[self.breakpoints[break_idx]]
 
         ## compute all four possible segmentations relative to neighbor, in
         ## both phasing orientations
@@ -453,10 +426,15 @@ def compute_adj_prob(self, seg_idx):
         ]
 
         ## match probs to cluster choices (will match MLs matrix in main calculation)
-        probs = np.full([len(self.clust_sums), 2], -np.inf)
-        for k in self.clust_sums.keys():
-            MLs_idx = np.r_[k == U_cl, k == D_cl]@np.r_[2, 1]
-            probs[self.clust_sums.index(k), :] = MLs[:, MLs_idx]
+        probs = np.full([len(self.clust_sums), 2], MLs[0, 0])
+        if U_cl == D_cl and U_cl != -1 and D_cl != -1:
+            probs[self.clust_sums.index(U_cl), :] = MLs[:, 3]
+            probs[self.clust_sums.index(D_cl), :] = MLs[:, 3]
+        else:
+            if U_cl != -1:
+                probs[self.clust_sums.index(U_cl), :] = MLs[:, 2]
+            if D_cl != -1:
+                probs[self.clust_sums.index(D_cl), :] = MLs[:, 1]
 
         return probs
 
@@ -1120,10 +1098,10 @@ def run(self, n_iter = 0, n_samps = 0):
             #adj_BC = np.zeros([len(self.clust_sums), 2])
 
             log_adj_lik = 0
-            if not move_clust and not split_clust: # or (move_clust and np.random.rand() < 0.01):
-                log_adj_lik = self.compute_adj_prob(seg_idx)
-                seg_touch_idx[seg_idx] = True
-
+            if not move_clust: # or (move_clust and np.random.rand() < 0.01):
+                log_adj_lik = self.compute_adj_prob(break_idx[0])
+                #seg_touch_idx[seg_idx] = True
+ 
             # p(X|clust,phase)p(X|seg,phase)p(clust)
             num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})
                   + log_adj_lik      # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B})

From 3ea0fc8764603deb6c9265c324eb1816bfc20fab Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 4 Mar 2022 11:19:04 -0500
Subject: [PATCH 057/222] Allow beta hyperparam to be specified

---
 hapaseg/allelic_DP.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 51d83ed..45ae343 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -294,6 +294,8 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
         self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F")
 
+        self.betahyp = 1
+
         #
         # define column indices
         self.clust_col = self.S.columns.get_loc("clust")
@@ -310,7 +312,7 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
 
         # store likelihoods for each cluster in the prior (from previous iterations)
         self.clust_prior[-1] = np.r_[0, 0]
-        self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_prior.items()})
+        self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_prior.items()})
         self.clust_prior_mat = np.r_[self.clust_prior.values()]
 
         self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster
@@ -389,7 +391,7 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
             SD_a += D_a
             SD_b += D_b
 
-        return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1)
+        return ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) + ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp)
 
     def compute_adj_prob(self, break_idx):
         if break_idx > 1:
@@ -597,7 +599,7 @@ def compute_cluster_splitpoints(self, seg_idx):
             maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
             maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
 
-            split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1)
+            split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp)
             # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
             # NOTE: instead of argmax, probabilistically choose? will this make a difference?
 
@@ -622,7 +624,7 @@ def compute_cluster_splitpoints(self, seg_idx):
             maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
             maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
 
-            split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1)
+            split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp)
             # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
 
             start += split_lik.argmax() + 1
@@ -641,7 +643,7 @@ def compute_cluster_splitpoints(self, seg_idx):
     def compute_overall_lik_simple(self):
         ## overall clustering likelihood
         # p({a_i, b_i} | {c_k}, {phase_i})
-        clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum()
+        clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) + self.betahyp for k, v in self.clust_sums.items() if k >= 0]].sum()
 
         ## overall phasing likelihood
         # p({phase_i} | {a_i, b_i})
@@ -901,7 +903,7 @@ def run(self, n_iter = 0, n_samps = 0):
                     maj_cs = self._Scumsum_ph(seg_idx, min = False)
                     maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs
 
-                    split_lik = ss.betaln(min_cs + 1, maj_cs + 1) + ss.betaln(min_csr + 1, maj_csr + 1)
+                    split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
                     split_lik -= split_lik.max()
                     split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
                     seg_idx = seg_idx[:(split_point + 1)]
@@ -929,7 +931,7 @@ def run(self, n_iter = 0, n_samps = 0):
 
                     A_tot, B_tot = self.clust_sums[cur_clust]
 
-                    lik0 = ss.betaln(A_tot + 1, B_tot + 1)
+                    lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp)
 
                     liks = np.zeros(len(split_bdy) + 1)
                     liks[-1] = lik0 # don't split at all
@@ -939,7 +941,7 @@ def run(self, n_iter = 0, n_samps = 0):
                         A = self._Ssum_ph(clust_segs[st:en], min = True)
                         B = self._Ssum_ph(clust_segs[st:en], min = False)
 
-                        liks[i] = ss.betaln(A_tot - A + 1, B_tot - B + 1) + ss.betaln(A + 1, B + 1)
+                        liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
                     # pick a region to split
                     split_idx = np.random.choice(
@@ -1028,11 +1030,11 @@ def run(self, n_iter = 0, n_samps = 0):
             # A+B is likelihood of current cluster B is part of
             #AB = ss.betaln(A_a + B_a + 1, A_b + B_b + 1)
             # C is likelihood of target cluster pre-join
-            C = ss.betaln(C_ab[:, 0] + 1, C_ab[:, 1] + 1)
+            C = ss.betaln(C_ab[:, 0] + 1 + self.betahyp, C_ab[:, 1] + 1 + self.betahyp)
             # A is likelihood cluster B is part of, minus B
             #A = ss.betaln(A_a + 1, A_b + 1)
             # B+C is likelihood of target cluster post-join, with both phase orientations
-            BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1)
+            BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1 + self.betahyp, C_ab[:, [1]] + np.c_[B_b, B_a] + 1 + self.betahyp)
 
             MLs = BC - C[:, None]
 

From 97d3d7a8921f06073177437f617bd0b79ede6909 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 7 Mar 2022 14:29:49 -0500
Subject: [PATCH 058/222] Add beta hyperparameter to rephasing

---
 hapaseg/allelic_DP.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 45ae343..8e042c0 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -348,10 +348,10 @@ def compute_rephase_prob(self, seg_idx):
         flip = self.S.iloc[seg_idx, self.flip_col]
         flip_n = ~flip
 
-        A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1
-        A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1
-        B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1
-        B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1
+        A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp
+        A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp
+        B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp
+        B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp
 
         # use normal approximation to beta if conditions are right
         if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20:

From dc5f9c79890a5ec6d4784c1a0aa771b6ca779a85 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 7 Mar 2022 14:31:47 -0500
Subject: [PATCH 059/222] Compute segmentation likelihood on the fly

---
 hapaseg/allelic_DP.py | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 8e042c0..11ca47d 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -659,15 +659,7 @@ def compute_overall_lik_simple(self):
 
         ## segmentation likelihood
         # p({a_i, b_i} | {s}, {phase_i})
-        bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1])
-        bdy = np.c_[bdy[:-1], bdy[1:]]
-
-        seg_lik = 0.0
-        for st, en in bdy:
-            seg_lik += ss.betaln(
-              self._Ssum_ph(np.r_[st:en], min = True) + 1,
-              self._Ssum_ph(np.r_[st:en], min = False) + 1
-            )
+        seg_lik = np.r_[self.seg_liks].sum()
 
         # p({c_k}, {s}, {phase_i} | {a_i, b_i})
         #return clust_lik + phase_lik + count_prior + seg_lik
@@ -805,8 +797,12 @@ def run(self, n_iter = 0, n_samps = 0):
 
         max_clust_idx = np.max(self.clust_members.keys() | self.clust_prior.keys() if self.clust_prior is not None else {})
 
+        #
+        # breakpoint tracking
+
         # segmentation breakpoints
         self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)}
+
         # min/maj counts in each segment
         self.seg_sums = sc.SortedDict()
         bpl = np.r_[self.breakpoints]
@@ -814,6 +810,12 @@ def run(self, n_iter = 0, n_samps = 0):
             mn = self._Ssum_ph(np.r_[st:en], min = True)
             mj = self._Ssum_ph(np.r_[st:en], min = False)
             self.seg_sums[st] = np.r_[mn, mj]
+
+        # likelihoods for each segment
+        self.seg_liks = sc.SortedDict()
+        for k, (a, b) in self.seg_sums.items():
+            self.seg_liks[k] = ss.betaln(a + 1 + self.betahyp, b + 1 + self.betahyp)
+
         # breakpoints for each cluster
         self.clust_members_bps = sc.SortedDict({
           k : sc.SortedSet(v) for k, v in \
@@ -912,8 +914,16 @@ def run(self, n_iter = 0, n_samps = 0):
                     new_bp = seg_idx[-1] + 1
                     if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment
                         self.breakpoints.add(new_bp)
-                        self.seg_sums[new_bp] = np.r_[self._Ssum_ph(np.r_[new_bp:seg_en], min = True), self._Ssum_ph(np.r_[new_bp:seg_en], min = False)]
+
+                        A = self._Ssum_ph(np.r_[new_bp:seg_en], min = True)
+                        B = self._Ssum_ph(np.r_[new_bp:seg_en], min = False)
+
+                        self.seg_sums[new_bp] = np.r_[A, B]
                         self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp]
+
+                        self.seg_liks[new_bp] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+                        self.seg_liks[seg_idx[0]] -= self.seg_liks[new_bp]
+
                         self.clust_members_bps[cur_clust].add(new_bp)
 
                 # propose splitting out a contiguous interval of segments within the current cluster {{{
@@ -1200,16 +1210,17 @@ def run(self, n_iter = 0, n_samps = 0):
                     snp_idx.discard(snp) # discard rather than remvoe because this could be in snp_idx + 1
                     self.breakpoints.remove(snp)
                     self.seg_sums.pop(snp)
+                    self.seg_liks.pop(snp)
                     self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster
                     update_idx.add(self.breakpoints.bisect_left(snp) - 1)
                     snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1])
             for bp_idx in update_idx:
                 st = self.breakpoints[bp_idx]
                 en = self.breakpoints[bp_idx + 1]
-                self.seg_sums[st] = np.r_[
-                  self._Ssum_ph(np.r_[st:en], min = True),
-                  self._Ssum_ph(np.r_[st:en], min = False)
-                ]
+                A = self._Ssum_ph(np.r_[st:en], min = True)
+                B = self._Ssum_ph(np.r_[st:en], min = False)
+                self.seg_sums[st] = np.r_[A, B]
+                self.seg_liks[st] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
             if choice < 0:
                 self.clust_members_bps[new_clust_idx] = snp_idx

From 0b510e7e50dcc025c36d633928ad85d011f0e15e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 17 Mar 2022 14:52:23 -0400
Subject: [PATCH 060/222] Explicitly remove empty segment from nonsplit
 likelihood

With beta hyperparameter = 0 this was fine, since
Beta(A + 1, B + 1) + Beta(1, 1) == Beta(A + 1, B + 1)
With beta hyp != 0, this is not fine, since
Beta(A + 1 + h, B + 1 + h) + Beta(1 + h, 1 + h) != Beta(A + 1 + h, B + 1 + h)
---
 hapaseg/allelic_DP.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 11ca47d..7a3222b 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -906,6 +906,7 @@ def run(self, n_iter = 0, n_samps = 0):
                     maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs
 
                     split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
+                    split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp)
                     split_lik -= split_lik.max()
                     split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
                     seg_idx = seg_idx[:(split_point + 1)]

From 366ab1fd9e3b62961a83efb346e45a876bccfbd6 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 17 Mar 2022 16:03:31 -0400
Subject: [PATCH 061/222] Return proper adjacency likelihood with betahyp > 0

---
 hapaseg/allelic_DP.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 7a3222b..4b13f72 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -391,7 +391,9 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
             SD_a += D_a
             SD_b += D_b
 
-        return ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) + ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp)
+        return (ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) if SU_a > 0 or SU_b > 0 else 0) + \
+          ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + \
+          (ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp) if SD_a > 0 or SD_b > 0 else 0)
 
     def compute_adj_prob(self, break_idx):
         if break_idx > 1:

From 5c01088aec136a86f72b470ec60b95e74ed10e58 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 5 Apr 2022 17:57:08 -0400
Subject: [PATCH 062/222] Fix up overall likelihood function

---
 hapaseg/allelic_DP.py | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 4b13f72..d058e1e 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -645,26 +645,28 @@ def compute_cluster_splitpoints(self, seg_idx):
     def compute_overall_lik_simple(self):
         ## overall clustering likelihood
         # p({a_i, b_i} | {c_k}, {phase_i})
-        clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) + self.betahyp for k, v in self.clust_sums.items() if k >= 0]].sum()
+        clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_sums.items() if k >= 0]].sum()
 
-        ## overall phasing likelihood
-        # p({phase_i} | {a_i, b_i})
-        phase_lik = 1 - self.S["rephase_prob"].copy()
-        phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]]
-        phase_lik = np.log(phase_lik).sum()
+#        ## overall phasing likelihood
+#        # p({phase_i} | {a_i, b_i})
+# TODO: memoize
+#        phase_lik = 1 - self.S["rephase_prob"].copy()
+#        phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]]
+#        phase_lik = np.log(phase_lik).sum()
+        phase_lik = 0
 
         ## Dirichlet count prior (Dirichlet-categorical marginal likelihood)
         # p({c_k})
-        dirvec = np.r_[self.clust_counts.values()].astype(float)
+        dirvec = np.r_[self.clust_counts.values()].astype(float)/self.dp_count_scale_factor
         k = len(dirvec)
         count_prior = k*np.log(self.alpha) + ss.gammaln(dirvec).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha)
 
         ## segmentation likelihood
         # p({a_i, b_i} | {s}, {phase_i})
-        seg_lik = np.r_[self.seg_liks].sum()
+        # TODO: memoize
+        seg_lik = np.r_[self.seg_liks.values()].sum()
 
         # p({c_k}, {s}, {phase_i} | {a_i, b_i})
-        #return clust_lik + phase_lik + count_prior + seg_lik
         return np.r_[clust_lik, phase_lik, count_prior, seg_lik]
 
     # {{{
@@ -833,7 +835,7 @@ def run(self, n_iter = 0, n_samps = 0):
         seg_touch_idx = np.zeros(len(self.S), dtype = bool)
 
         # likelihood trace
-        self.lik_tmp = [-np.inf]
+        self.lik_trace = []
         self.post = 0
 
         n_it = 0

From 6ed69eb0dfdc5030fc6561ff930cf43c10c786f0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 5 Apr 2022 17:58:18 -0400
Subject: [PATCH 063/222] Sequentially scan over segments if >90% have been
 touched

---
 hapaseg/allelic_DP.py | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index d058e1e..b015533 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -831,8 +831,6 @@ def run(self, n_iter = 0, n_samps = 0):
         self.phase_orientations = []
 
         burned_in = False
-        all_touched = False
-        seg_touch_idx = np.zeros(len(self.S), dtype = bool)
 
         # likelihood trace
         self.lik_trace = []
@@ -840,13 +838,17 @@ def run(self, n_iter = 0, n_samps = 0):
 
         n_it = 0
         n_it_last = 0
+
+        brk = 0
+        touch90 = False
+
         while True:
             if not n_it % 1000:
                 if len(self.clust_counts) > 20:
                     print(pd.Series(self.clust_counts.values()).value_counts().sort_index())
                 else:
                     print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1]))
-                print(self.lik_tmp[-1])
+                print(brk % (len(self.breakpoints) - 1))
                 #print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
                 #print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
 
@@ -860,14 +862,14 @@ def run(self, n_iter = 0, n_samps = 0):
 
             # poll every 100 iterations for burnin status
             if not n_it % 100:
+                # have >90% of segments been touched?
+                if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.9:
+                    touch90 = True
+
                 # have most segments been adjacency corrected?
                 # if so, has the overall likelihood stabilized enough that we're burned in?
                 if not burned_in:
-                    # 1. have >90% of segments been adjacency corrected?
-                    # print(seg_touch_idx.mean())
-                    if seg_touch_idx.mean() > 0.9:
-                        all_touched = True
-
+                    pass
                     # 2. if >90% of segments have been adjacency corrected, check for burnin
                     # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum
                     # TODO: make this check more efficient?
@@ -877,19 +879,33 @@ def run(self, n_iter = 0, n_samps = 0):
 #                        n_it_last = n_it
 #                        seg_touch_idx[:] = False
 
-                if burned_in and seg_touch_idx.mean() > 0.3:
+                # start computing likelihoods
+                if touch90:
+                    print(self.compute_overall_lik_simple())
+                    print(self.compute_overall_lik_simple().sum())
+                    self.lik_trace.append(self.compute_overall_lik_simple())
+
+                # save cluster assignments and phase orientations once burned in
+                if burned_in:
                     self.segs_to_clusters.append(self.S["clust"].copy())
                     self.phase_orientations.append(self.S["flipped"].copy())
-                    seg_touch_idx[:] = False
 
             #
             # pick either a segment or a cluster at random (50:50 prob.)
             move_clust = False
 
-            # pick a segment at random
-            if True or np.random.rand() < 0.5:
+            # move a segment
+            #if not touch90 or np.random.rand() < 0.9:
+            if True or np.random.rand() < 0.9:
+                # >90% of segments have been moved; we are iterating over segments sequentially
+                if touch90:
+                    break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)})
+                    brk += 1
+                # we are picking segments at random
+                else:
+                    break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
+
                 # get all SNPs within this segment
-                break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
                 seg_st = self.breakpoints[break_idx[0]]
                 seg_en = self.breakpoints[break_idx[0] + 1]
                 seg_idx = np.r_[seg_st:seg_en]

From 09fa7ca62bb9f074cd65f1c2ef3b0a3177645974 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 5 Apr 2022 19:05:24 -0400
Subject: [PATCH 064/222] Fix up cluster splitting

---
 hapaseg/allelic_DP.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index b015533..5fc6126 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -601,7 +601,7 @@ def compute_cluster_splitpoints(self, seg_idx):
             maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
             maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
 
-            split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp)
+            split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
             # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
             # NOTE: instead of argmax, probabilistically choose? will this make a difference?
 
@@ -637,7 +637,7 @@ def compute_cluster_splitpoints(self, seg_idx):
 
             i += 1
 
-        bdy = np.unique(np.r_[0, spl, len(seg_idx)])
+        bdy = seg_idx[np.unique(np.r_[0, spl, len(seg_idx) - 1])]
         bdy = np.c_[bdy[:-1], bdy[1:]]
 
         return bdy

From 90b474d3ded6e00c554520b41c5bf972d016c62f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 7 Apr 2022 13:50:17 -0400
Subject: [PATCH 065/222] Can't update likelihood by subtracting it off like
 that

---
 hapaseg/allelic_DP.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 5fc6126..cc9114e 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -943,7 +943,9 @@ def run(self, n_iter = 0, n_samps = 0):
                         self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp]
 
                         self.seg_liks[new_bp] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
-                        self.seg_liks[seg_idx[0]] -= self.seg_liks[new_bp]
+                        A = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = True)
+                        B = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = False)
+                        self.seg_liks[seg_idx[0]] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
                         self.clust_members_bps[cur_clust].add(new_bp)
 

From 5e7dce19411b4374fd262a749f914d93310ff331 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 7 Apr 2022 14:30:38 -0400
Subject: [PATCH 066/222] Fix bug introduced in 09fa7ca

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index cc9114e..8cd20a0 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -637,7 +637,7 @@ def compute_cluster_splitpoints(self, seg_idx):
 
             i += 1
 
-        bdy = seg_idx[np.unique(np.r_[0, spl, len(seg_idx) - 1])]
+        bdy = np.unique(np.r_[0, spl, len(seg_idx)])
         bdy = np.c_[bdy[:-1], bdy[1:]]
 
         return bdy

From c5dd58d354fb1521d28dbdc90b24c46d0ef86eb4 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 7 Apr 2022 14:32:12 -0400
Subject: [PATCH 067/222] Put breakpoint adding into its own function

Add breakpoints when splitting out contiguous range of SNPs within a cluster
---
 hapaseg/allelic_DP.py | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 8cd20a0..5d5dbc5 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -642,6 +642,24 @@ def compute_cluster_splitpoints(self, seg_idx):
 
         return bdy
 
+    def add_breakpoint(self, start, mid, end, clust_idx):
+        """
+        Add breakpoint at mid belonging to clust_idx, between start and end
+        """
+        self.breakpoints.add(mid)
+        self.clust_members_bps[clust_idx].add(mid)
+        
+        A = self._Ssum_ph(np.r_[mid:end], min = True)
+        B = self._Ssum_ph(np.r_[mid:end], min = False)
+
+        self.seg_sums[mid] = np.r_[A, B]
+        self.seg_sums[st] -= self.seg_sums[mid]
+
+        self.seg_liks[mid] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+        A = self._Ssum_ph(np.r_[start:mid], min = True)
+        B = self._Ssum_ph(np.r_[start:mid], min = False)
+        self.seg_liks[start] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+
     def compute_overall_lik_simple(self):
         ## overall clustering likelihood
         # p({a_i, b_i} | {c_k}, {phase_i})
@@ -934,20 +952,7 @@ def run(self, n_iter = 0, n_samps = 0):
                     # add breakpoint (can be erased subsequently if segment rejoins original cluster)
                     new_bp = seg_idx[-1] + 1
                     if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment
-                        self.breakpoints.add(new_bp)
-
-                        A = self._Ssum_ph(np.r_[new_bp:seg_en], min = True)
-                        B = self._Ssum_ph(np.r_[new_bp:seg_en], min = False)
-
-                        self.seg_sums[new_bp] = np.r_[A, B]
-                        self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp]
-
-                        self.seg_liks[new_bp] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
-                        A = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = True)
-                        B = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = False)
-                        self.seg_liks[seg_idx[0]] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
-
-                        self.clust_members_bps[cur_clust].add(new_bp)
+                        self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust)
 
                 # propose splitting out a contiguous interval of segments within the current cluster {{{
                 split_clust = False
@@ -992,6 +997,16 @@ def run(self, n_iter = 0, n_samps = 0):
 
                     split_clust = True
 
+                    # add breakpoints
+                    for si in [seg_idx[0], seg_idx[-1]]:
+                        if si not in self.breakpoints:
+                            seg_st_idx = self.breakpoints.bisect_left(si) - 1
+                            seg_st = self.breakpoints[seg_st_idx]
+                            seg_en_idx = self.breakpoints.bisect_left(si)
+                            seg_en = self.breakpoints[seg_en_idx]
+
+                            self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust)
+
                 # }}}
 
                 n_move = len(seg_idx)

From 56a654349a3b3e69afad7f05d9c6d136e3966b7a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sat, 9 Apr 2022 17:21:30 -0400
Subject: [PATCH 068/222] segs -> snps

---
 hapaseg/allelic_DP.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 5d5dbc5..76dcd1f 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -958,14 +958,14 @@ def run(self, n_iter = 0, n_samps = 0):
                 split_clust = False
                 if np.random.rand() < 0.1:
                     # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable?
-                    clust_segs = np.sort(np.r_[list(self.clust_members[cur_clust])])
+                    clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])])
 
                     # can't split clusters of length 1
-                    if len(clust_segs) == 1:
+                    if len(clust_snps) == 1:
                         n_it += 1
                         continue
 
-                    split_bdy = self.compute_cluster_splitpoints(clust_segs)
+                    split_bdy = self.compute_cluster_splitpoints(clust_snps)
 
                     A_tot, B_tot = self.clust_sums[cur_clust]
 
@@ -976,8 +976,8 @@ def run(self, n_iter = 0, n_samps = 0):
 
                     # likelihood ratios for splitting each region into a new cluster
                     for i, (st, en) in enumerate(split_bdy):
-                        A = self._Ssum_ph(clust_segs[st:en], min = True)
-                        B = self._Ssum_ph(clust_segs[st:en], min = False)
+                        A = self._Ssum_ph(clust_snps[st:en], min = True)
+                        B = self._Ssum_ph(clust_snps[st:en], min = False)
 
                         liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
@@ -992,8 +992,8 @@ def run(self, n_iter = 0, n_samps = 0):
                         n_it += 1
                         continue
 
-                    # seg_idx == segments to propose to split off
-                    seg_idx = clust_segs[slice(*split_bdy[split_idx])]
+                    # seg_idx == SNPs to propose to split off
+                    seg_idx = clust_snps[slice(*split_bdy[split_idx])]
 
                     split_clust = True
 

From 941f05194337d26267a876b1107adb659465797c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sat, 9 Apr 2022 17:22:00 -0400
Subject: [PATCH 069/222] fix typo

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 76dcd1f..1b2d906 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -653,7 +653,7 @@ def add_breakpoint(self, start, mid, end, clust_idx):
         B = self._Ssum_ph(np.r_[mid:end], min = False)
 
         self.seg_sums[mid] = np.r_[A, B]
-        self.seg_sums[st] -= self.seg_sums[mid]
+        self.seg_sums[start] -= self.seg_sums[mid]
 
         self.seg_liks[mid] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
         A = self._Ssum_ph(np.r_[start:mid], min = True)

From 6faa2b178366e8bf4e44c0ac5339091353499d64 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sat, 9 Apr 2022 17:22:50 -0400
Subject: [PATCH 070/222] Properly update breakpoints when splitting cluster

---
 hapaseg/allelic_DP.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 1b2d906..22e6998 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1007,6 +1007,11 @@ def run(self, n_iter = 0, n_samps = 0):
 
                             self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust)
 
+                    # get all breakpoints within this cluster/interval
+                    left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0])
+                    right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1])
+                    break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]])
+
                 # }}}
 
                 n_move = len(seg_idx)
@@ -1022,7 +1027,8 @@ def run(self, n_iter = 0, n_samps = 0):
                     else:
                         self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
                         self.clust_members[cur_clust] -= set(seg_idx)
-                        self.clust_members_bps[cur_clust].remove(self.breakpoints[break_idx[0]])
+                        for b in break_idx:
+                            self.clust_members_bps[cur_clust].remove(self.breakpoints[b])
 
                     self.clusts[seg_idx] = -1
 

From 5fe512bce552a51be361854c609212819a254439 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 12:05:36 -0400
Subject: [PATCH 071/222] Clarify which breakpoints get updated

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 22e6998..7f1217c 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1243,7 +1243,7 @@ def run(self, n_iter = 0, n_samps = 0):
             #    .   .     .   break_idx + 1
             # A B A B A C B A
             #  +   +     +     break_idx
-            #  *         *     update_idx
+            #*           *     update_idx
 
             break_idx_bi = break_idx | { x + 1 for x in break_idx }
             snp_idx_bi = sc.SortedSet([self.breakpoints[b] for b in break_idx_bi])

From b6e5d05d75f5249fe54258583fe7d867ea0b0e9d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 14:16:38 -0400
Subject: [PATCH 072/222] Memoize segment misphase probabilities

---
 hapaseg/allelic_DP.py | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 7f1217c..7379e6f 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -660,6 +660,9 @@ def add_breakpoint(self, start, mid, end, clust_idx):
         B = self._Ssum_ph(np.r_[start:mid], min = False)
         self.seg_liks[start] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
+        self.seg_phase_probs[start] = self.compute_rephase_prob(np.r_[start:mid])
+        self.seg_phase_probs[mid] = self.compute_rephase_prob(np.r_[mid:end])
+
     def compute_overall_lik_simple(self):
         ## overall clustering likelihood
         # p({a_i, b_i} | {c_k}, {phase_i})
@@ -844,6 +847,9 @@ def run(self, n_iter = 0, n_samps = 0):
             self.S.loc[self.breakpoints[:-1], ["clust"]].groupby("clust").groups.items()
         })
 
+        # misphase probabilities for each segment
+        self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints })
+
         # containers for saving the MCMC trace
         self.segs_to_clusters = []
         self.phase_orientations = []
@@ -1063,7 +1069,7 @@ def run(self, n_iter = 0, n_samps = 0):
             #
             # perform phase correction on segment/cluster
             # flip min/maj with probability that alleles are oriented the "wrong" way
-            rephase_prob = self.compute_rephase_prob(seg_idx)
+            rephase_prob = self.seg_phase_probs[seg_idx[0]] if not np.isnan(self.seg_phase_probs[seg_idx[0]]) else self.compute_rephase_prob(seg_idx)
 
             #
             # choose to join a cluster or make a new one
@@ -1237,6 +1243,13 @@ def run(self, n_iter = 0, n_samps = 0):
 
                 self.clust_members[choice].update(set(seg_idx))
 
+            # if segment was rephased, update saved phasing probabilities
+            if choice_idx & 1:
+                for bp_idx in break_idx:
+                    st = self.breakpoints[bp_idx]
+                    en = self.breakpoints[bp_idx + 1]
+                    self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en])
+
             # update breakpoints
 
             # B->A
@@ -1255,6 +1268,7 @@ def run(self, n_iter = 0, n_samps = 0):
                     self.breakpoints.remove(snp)
                     self.seg_sums.pop(snp)
                     self.seg_liks.pop(snp)
+                    self.seg_phase_probs.pop(snp)
                     self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster
                     update_idx.add(self.breakpoints.bisect_left(snp) - 1)
                     snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1])
@@ -1265,6 +1279,7 @@ def run(self, n_iter = 0, n_samps = 0):
                 B = self._Ssum_ph(np.r_[st:en], min = False)
                 self.seg_sums[st] = np.r_[A, B]
                 self.seg_liks[st] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+                self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en])
 
             if choice < 0:
                 self.clust_members_bps[new_clust_idx] = snp_idx

From 0bee5ff055434426fce1f33c13f5bad10c7b19fe Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 16:39:04 -0400
Subject: [PATCH 073/222] Fix a couple of phase-tracking bugs

---
 hapaseg/allelic_DP.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 7379e6f..a9438f8 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -848,7 +848,7 @@ def run(self, n_iter = 0, n_samps = 0):
         })
 
         # misphase probabilities for each segment
-        self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints })
+        self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints[:-1] })
 
         # containers for saving the MCMC trace
         self.segs_to_clusters = []
@@ -1069,7 +1069,9 @@ def run(self, n_iter = 0, n_samps = 0):
             #
             # perform phase correction on segment/cluster
             # flip min/maj with probability that alleles are oriented the "wrong" way
-            rephase_prob = self.seg_phase_probs[seg_idx[0]] if not np.isnan(self.seg_phase_probs[seg_idx[0]]) else self.compute_rephase_prob(seg_idx)
+            if np.isnan(self.seg_phase_probs[seg_idx[0]]):
+                self.seg_phase_probs[seg_idx[0]] = self.compute_rephase_prob(seg_idx)
+            rephase_prob = self.seg_phase_probs[seg_idx[0]]
 
             #
             # choose to join a cluster or make a new one

From 8899a5bb879ac7c733d1e42dca5c0386a7d3e312 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 16:40:33 -0400
Subject: [PATCH 074/222] Roughly(?) compute overall phasing likelihood

---
 hapaseg/allelic_DP.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index a9438f8..35055ef 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -668,13 +668,9 @@ def compute_overall_lik_simple(self):
         # p({a_i, b_i} | {c_k}, {phase_i})
         clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_sums.items() if k >= 0]].sum()
 
-#        ## overall phasing likelihood
-#        # p({phase_i} | {a_i, b_i})
-# TODO: memoize
-#        phase_lik = 1 - self.S["rephase_prob"].copy()
-#        phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]]
-#        phase_lik = np.log(phase_lik).sum()
-        phase_lik = 0
+        ## overall phasing likelihood
+        # p({phase_i} | {a_i, b_i})
+        phase_lik = np.log1p(-np.r_[self.seg_phase_probs.values()]).sum()
 
         ## Dirichlet count prior (Dirichlet-categorical marginal likelihood)
         # p({c_k})

From bf61793eda7dcc14b852a6840cc41eff36edb7b7 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 21:56:39 -0400
Subject: [PATCH 075/222] Remove phase correction from initial MCMC

---
 hapaseg/allelic_MCMC.py | 345 +---------------------------------------
 1 file changed, 3 insertions(+), 342 deletions(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index d38d67e..8f7b983 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -24,8 +24,6 @@ def __init__(self, P,
       quit_after_burnin = False,
       n_iter = 100000,
       ref_bias = 1.0,
-      misphase_prior = 0.001,
-      phase_correct = False
     ):
         #
         # dataframe stuff
@@ -59,22 +57,11 @@ def __init__(self, P,
 
         self.quit_after_burnin = quit_after_burnin
 
-        self.misphase_prior = misphase_prior
-
-        # whether to perform phasing correction iterations
-        self.phase_correct = phase_correct
-
-        # how many post-burnin samples to use to infer phase switches
-        self.n_phase_correct_samples = 40
-
         #
         # chain state
         self.iter = 1
         self.burned_in = False
 
-        # whether phase correction has been performed
-        self.phase_correction_ready = False
-
         #
         # breakpoint storage
 
@@ -89,17 +76,6 @@ def __init__(self, P,
         # list of all breakpoints at nth iteration
         self.breakpoint_list = []
 
-        #
-        # misphase interval storage
-
-        # candidate intervals that were misphased
-        self.B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int)
-
-        # current state of interval assignments (relative to B_ct)
-        self.F = MySortedList()
-
-        # state of interval assignments at nth iteration
-        self.phase_interval_list = []
 
         #
         # cumsum arrays for each segment
@@ -141,25 +117,14 @@ def _Piloc(self, st, en, col_idx, incl_idx = None):
 
     def run(self):
         while self.iter < self.n_iter:
-            # perform a split, combine, phase correct, or prune operation
-            op = np.random.choice(4)
+            # perform a split or combine
+            op = np.random.choice(2)
             if op == 0:
                 if self.combine(np.random.choice(self.breakpoints[:-1]), force = False) == -1:
                     continue
             elif op == 1:
                 if self.split(b_idx = np.random.choice(len(self.breakpoints))) == -1:
                     continue
-            elif op == 2:
-                if self.phase_correct and self.phase_correction_ready:
-                    self.rephase()
-                else:
-                    continue
-            elif op == 3:
-                continue
-                if np.random.rand() < 0.01:
-                    self.prune()
-                else:
-                    continue
 
             # if we're only running up to burnin, bail
             if self.quit_after_burnin and self.burned_in:
@@ -172,27 +137,11 @@ def run(self):
                 ) + colorama.Fore.RESET)
                 return self
 
-            # correct phases after some post-burnin iterations
-            if not self.phase_correction_ready and self.phase_correct and \
-              self.burned_in and len(self.breakpoint_list) >= 2*self.n_phase_correct_samples:
-                self.correct_phases()
-                self.phase_correction_ready = True
-
-                # breakpoint/prune lists are liable to change after phase correction, so clear them 
-                self.breakpoint_list = []
-                self.include = []
-
-            # save set of breakpoints, phase intervals, and prune states if burned in 
-            if self.burned_in and not self.iter % 100:
-                self.breakpoint_list.append(self.breakpoints.copy())
-                self.include.append(self.P["include"].copy())
-                if self.phase_correction_ready:
-                    self.phase_interval_list.append(self.F.copy())
 
             # print status
             if not self.iter % 100:
                 if self.burned_in:
-                    color = colorama.Fore.MAGENTA if not self.phase_correction_ready else colorama.Fore.RESET
+                    color = colorama.Fore.RESET
                 else:
                     color = colorama.Fore.YELLOW
                 print("{color}[{st},{en}]\t{n}/{tot}\tn_bp = {n_bp}\tlik = {lik}".format(
@@ -276,294 +225,6 @@ def combine(self, st = None, b_idx = None, force = True):
 
             return mid
 
-    def flip_hap(self, st, en):
-        """
-        Flips the SNPs from st to en
-        """
-
-        x = self.P.iloc[st:en, self.maj_idx].copy()
-        self.P.iloc[st:en, self.maj_idx] = self.P.iloc[st:en, self.min_idx]
-        self.P.iloc[st:en, self.min_idx] = x
-
-    def prob_misphase(self, bdy1, bdy2):
-        """
-        Compute probability of misphase
-        """
-        # TODO: change invocation to st, mid, en -- we don't need to correct
-        #       phasing of noncontiguous segments
-
-        # prior on misphasing probability
-        p_mis = self.misphase_prior if np.isnan(self.P.loc[bdy1[1] - 1, "misphase_prob"]) else self.P.loc[bdy1[1] - 1, "misphase_prob"]
-        if p_mis == 0:
-            return -np.inf, 0
-
-        # haps = x/y, segs = 1/2, beta params. = A/B
-
-        # seg 1
-        rng_idx = (self.P.index >= bdy1[0]) & (self.P.index < bdy1[1])
-
-        idx = rng_idx & self.P["aidx"] & self.P["include"]
-        x1_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        x1_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        idx = rng_idx & ~self.P["aidx"] & self.P["include"]
-        y1_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        y1_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        # seg 2
-        rng_idx = (self.P.index >= bdy2[0]) & (self.P.index < bdy2[1])
-
-        idx = rng_idx & self.P["aidx"] & self.P["include"]
-        x2_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        x2_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        idx = rng_idx & ~self.P["aidx"] & self.P["include"]
-        y2_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        y2_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        lik_mis   = ss.betaln(x1_A + y1_B + y2_A + x2_B + 1, y1_A + x1_B + x2_A + y2_B + 1)
-        lik_nomis = ss.betaln(x1_A + y1_B + x2_A + y2_B + 1, y1_A + x1_B + y2_A + x2_B + 1)
-
-        # logsumexp
-        m = np.maximum(lik_mis, lik_nomis)
-        denom = m + np.log(np.exp(lik_mis - m)*p_mis + np.exp(lik_nomis - m)*(1 - p_mis))
-
-        return lik_mis + np.log(p_mis) - denom, lik_nomis + np.log(1 - p_mis) - denom
-
-    def correct_phases(self):
-        """
-        Compute potentially misphased intervals, given some segmentation samples
-        """
-        if not self.burned_in or len(self.breakpoint_list) == 0:
-            raise RuntimeError("Breakpoint sample list must be populated (chain must be burned in)")
-
-        #A_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int)
-        #B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int)
-
-        for bp_idx in np.random.choice(len(self.breakpoint_list), self.n_phase_correct_samples, replace = False):
-            bpl = np.array(self.breakpoint_list[bp_idx]); bpl = np.c_[bpl[:-1], bpl[1:]]
-
-            p_mis = np.full(len(bpl) - 1, np.nan)
-            p_A = np.full(len(bpl) - 1, np.nan)
-            p_B = np.full(len(bpl) - 1, np.nan)
-
-            V = np.full([len(bpl) - 1, 2], np.nan)
-            B = np.zeros([len(bpl) - 1, 2], dtype = np.uint8)
-
-            for i, (st, mid, _, en) in enumerate(np.c_[bpl[:-1], bpl[1:]]):
-                p_mis, p_nomis = self.prob_misphase([st, mid], [mid, en])
-
-                # TODO: memoize partial sums
-
-                # prob. that left segment is on hap. A
-                p_A1 = s.beta.logsf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1)
-                # prob. that right segment is on hap. A
-                p_A2 = s.beta.logsf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1)
-
-                # prob. that left segment is on hap. B
-                p_B1 = s.beta.logcdf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1)
-                # prob. that right segment is on hap. B
-                p_B2 = s.beta.logcdf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1)
-
-                if i == 0:
-                    V[i, :] = [p_A1, p_B1]
-                    continue
-
-                p_AB = p_mis + p_A1 + p_B2
-                p_BA = p_mis + p_B1 + p_A2
-                p_AA = p_nomis + p_A1 + p_A2
-                p_BB = p_nomis + p_B1 + p_B2
-
-                V[i, 0] = np.max(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]])
-                V[i, 1] = np.max(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]])
-
-                B[i, 0] = np.argmax(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]])
-                B[i, 1] = np.argmax(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]])
-
-            # backtrace
-            BT = np.full(len(B), -1, dtype = np.uint8)
-            ix = np.argmax(V[-1])
-            BT[-1] = ix
-            for i, b in reversed(list(enumerate(B[:-1]))):
-                ix = b[ix]
-                BT[i] = ix
-
-            # join contiguous segments assigned to hap. B
-            d = np.diff(BT, append = 0, prepend = 0)
-            ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1]
-            b_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]]
-
-#            # join contiguous segments assigned to hap. A
-#            d = np.diff(1 - BT, append = 0, prepend = 0)
-#            ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1]
-#            a_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]]
-
-            # plot
-            #for x in np.flatnonzero(BT):
-            #    plt.plot(self.P.loc[bpl[x], "pos"], np.r_[j + 1, j + 1]*0.01)
-
-            # record
-            for x in b_segs_j:
-                self.B_ct[x[0], x[1]] += 1
-#            for x in a_segs_j:
-#                A_ct[x[0], x[1]] += 1
-
-#        # plot
-#        for k, v in B_ct.items():
-#            for _ in range(0, v):
-#                plt.plot(self.P.iloc[np.r_[k], self.P.columns.get_loc("pos")], 0.2*np.random.rand()*np.r_[1, 1])
-
-    # MCMC iteration that corrects a phase
-    def rephase(self): # TODO: add parameters to force an interval?
-        # TODO: prerequisite checks; has correct_phases() been run?
-        choice = list(self.B_ct.keys())
-        probs = np.r_[list(self.B_ct.values())]
-
-        #
-        # propose an interval to flip from B->A
-        st, en = choice[np.random.choice(np.r_[0:len(choice)], p = probs/probs.sum())]
-
-        #
-        # check if this overlaps any other regions that were already flipped B->A.
-
-        # any previously flipped regions contained within will be left alone
-
-        # return range of flipped region array that [st, en) overlaps
-        # TODO: rename this; f_o is a terrible name
-        def f_o(st = st, en = en):
-            st_idx = self.F.bisect_left(st + 1); st_idx -= st_idx % 2
-            en_idx = self.F.bisect_right(en - 1); en_idx += en_idx % 2
-            return slice(st_idx, en_idx)
-
-        overlaps = np.array(self.F[f_o()]).reshape(-1, 2)
-        o_S = sc.SortedSet({st, en})
-        for o in overlaps:
-            o_S.add(o[0])
-            o_S.add(o[1])
-
-        # somewhere we ought to assert that the length of self.F is even
-
-        # get list of regions to flip
-        flip_candidates = np.r_[o_S] # all possible regions to flip
-        flip_idx = np.zeros(len(flip_candidates) - 1, dtype = np.bool) # index of regions that haven't been flipped yet
-        A_flag = True # whether st:en consists entirely of regions that were flipped to A
-        for i, (st_seg, en_seg) in enumerate(np.c_[flip_candidates[:-1], flip_candidates[1:]]):
-            # this region was not already flipped B->A
-            if not self.F[f_o(st_seg, en_seg)]:
-                flip_idx[i] = True
-                A_flag = False
-
-        flips = np.c_[flip_candidates[:-1], flip_candidates[1:]][flip_idx, :]
-
-        #
-        # get full range of CNV breakpoints this region spans
-        st_reg = self.breakpoints.bisect_left(o_S[0])
-        en_reg = self.breakpoints.bisect_right(o_S[-1])
-        breakpoints0 = sc.SortedSet(self.breakpoints[(st_reg - 1):(en_reg + 1)])
-
-        #
-        # get initial marginal likelihood of this configuration
-        ML_orig = 0
-        for b in breakpoints0[:-1]:
-            ML_orig += self.seg_marg_liks[b]
-
-        #
-        # perform flips; update breakpoint list accordingly
-        for st_seg, en_seg in flips:
-            # if flip boundary corresponds to an extant breakpoint, remove it
-            # (we will propose joining these segments after flip)
-            if st_seg in breakpoints0:
-                breakpoints0 -= {st_seg}
-            # otherwise, add the flip boundary as a new breakpoint
-            # (we will propose introducing a new segment after flip)
-            else:
-                breakpoints0.add(st_seg)
-            if en_seg in breakpoints0:
-                breakpoints0 -= {en_seg}
-            else:
-                breakpoints0.add(en_seg)
-
-            self.flip_hap(st_seg, en_seg)
-
-        #
-        # if st:en is entirely assigned to A, try to flip it back to B (i.e. it was a false flip)
-        if A_flag:
-            if flip_candidates[0] in breakpoints0:
-                breakpoints0 -= {flip_candidates[0]}
-            else:
-                breakpoints0.add(flip_candidates[0])
-            if en_reg in breakpoints0:
-                breakpoints0 -= {flip_candidates[-1]}
-            else:
-                breakpoints0.add(flip_candidates[-1])
-
-            self.flip_hap(flip_candidates[0], flip_candidates[-1])
-
-        #
-        # get marginal likelihood post-flip and breakpoint adjustment
-        bps = np.r_[breakpoints0]
-        ML = 0
-        for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]:
-            ML += ss.betaln(
-              self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1,
-              self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1
-            )
-
-        #
-        # probabilistically accept new configuration
-        if np.log(np.random.rand()) < np.minimum(0, ML - ML_orig):
-            #
-            # update F array
-
-            # we could have either flipped a region from B->A ...
-            if not A_flag:
-                for st_seg, en_seg in flips:
-                    self.F.update([st_seg, en_seg])
-
-            # ... or reverted a flip
-            else:
-                for p in self.F[f_o(flip_candidates[0], flip_candidates[-1])]:
-                    self.F.remove(p)
-
-            #
-            # combine contiguous intervals in F array
-            # TODO
-
-            #
-            # update breakpoint list and seg. marg. liks
-            bps_to_del = list(self.breakpoints.islice(
-              self.breakpoints.bisect_left(breakpoints0[0]),
-              self.breakpoints.bisect_right(breakpoints0[-1])
-            ))
-            for x in bps_to_del:
-                self.breakpoints.remove(x)
-            self.breakpoints.update(breakpoints0)
-
-            #
-            # update seg. marg. liks
-            # TODO: recomputing each sum (even if in the future we use memoization)
-            #       is wasteful. intelligently pick which seg_marg_liks keys to update.
-            for x in bps_to_del[:-1]:
-                self.seg_marg_liks.__delitem__(x)
-            for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]:
-                self.seg_marg_liks[st_bp] = ss.betaln(
-                  self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1,
-                  self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1
-                )
-
-            self.marg_lik[self.iter] = self.marg_lik[self.iter - 1] - ML_orig + ML
-
-        #
-        # revert
-        else:
-            # flip each region back
-            for st_seg, en_seg in flips:
-                self.flip_hap(st_seg, en_seg)
-            if A_flag:
-                self.flip_hap(flip_candidates[0], flip_candidates[-1])
-
-            self.marg_lik[self.iter] = self.marg_lik[self.iter - 1]
-
     def compute_all_cumsums(self):
         bpl = np.array(self.breakpoints); bpl = np.c_[bpl[0:-1], bpl[1:]]
         for st, en in bpl:

From 287b0a8d628afe61982c3fe7fe06dd39f450e642 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 21:56:54 -0400
Subject: [PATCH 076/222] Save only MLE breakpoint

---
 hapaseg/allelic_MCMC.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 8f7b983..de7f765 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -76,6 +76,8 @@ def __init__(self, P,
         # list of all breakpoints at nth iteration
         self.breakpoint_list = []
 
+        # MLE breakpoint
+        self.breakpoints_MLE = None
 
         #
         # cumsum arrays for each segment
@@ -137,6 +139,10 @@ def run(self):
                 ) + colorama.Fore.RESET)
                 return self
 
+            # save MLE breakpoint if we've burned in
+            if self.burned_in:
+                if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]:
+                    self.breakpoints_MLE = self.breakpoints.copy()
 
             # print status
             if not self.iter % 100:

From a14b8d9791e6671d5c978d459cc6ba2554d329e7 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 21:57:04 -0400
Subject: [PATCH 077/222] Make burnin criterion more stringent

---
 hapaseg/allelic_MCMC.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index de7f765..28d1dd2 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -162,8 +162,8 @@ def run(self):
 
             # check if we've burned in
             # TODO: use a faster method of computing rolling average
-            if not self.burned_in and self.iter > 500:
-                if np.diff(self.marg_lik[(self.iter - 500):self.iter]).mean() < 0:
+            if not self.burned_in and self.iter > 1000:
+                if np.diff(self.marg_lik[(self.iter - 1000):self.iter]).mean() < 0:
                     self.burned_in = True
 
             self.iter += 1 

From 56afcd8edb938d06be4181073dd5cd1aecb8281f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 22:11:02 -0400
Subject: [PATCH 078/222] Remove more cruft

---
 hapaseg/allelic_MCMC.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 28d1dd2..eef040c 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -88,9 +88,6 @@ def __init__(self, P,
         self.cs_MAJ = sc.SortedDict()
         self.cs_MIN = sc.SortedDict()
 
-        # probability of picking a breakpoint
-        self.split_prob = sc.SortedDict()
-
         #
         # marginal likelihoods
 
@@ -231,11 +228,6 @@ def combine(self, st = None, b_idx = None, force = True):
 
             return mid
 
-    def compute_all_cumsums(self):
-        bpl = np.array(self.breakpoints); bpl = np.c_[bpl[0:-1], bpl[1:]]
-        for st, en in bpl:
-            self.cs_MAJ[st], self.cs_MIN[st], self.split_prob[st] = self.compute_cumsum(st, en)
-
     def compute_cumsum(self, st, en):
         # major
         cs_MAJ = np.zeros(en - st, dtype = np.int)

From 73430e35787ddc9bcc440aca9a5ea52c08b8d513 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 22:11:19 -0400
Subject: [PATCH 079/222] Add beta hyperparameter to A_MCMC

---
 hapaseg/allelic_MCMC.py | 60 +++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 29 deletions(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index eef040c..5505518 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -91,13 +91,15 @@ def __init__(self, P,
         #
         # marginal likelihoods
 
+        self.betahyp = 1
+
         # log marginal likelihoods for each segment
         # initialize with each SNP comprising its own segment.
         self.seg_marg_liks = sc.SortedDict(zip(
           range(0, len(self.P)),
           ss.betaln(
-            self.P.iloc[0:len(self.P), self.min_idx] + 1,
-            self.P.iloc[0:len(self.P), self.maj_idx] + 1
+            self.P.iloc[0:len(self.P), self.min_idx] + 1 + self.betahyp,
+            self.P.iloc[0:len(self.P), self.maj_idx] + 1 + self.betahyp
           )
         ))
 
@@ -197,8 +199,8 @@ def combine(self, st = None, b_idx = None, force = True):
         ML_split = self.seg_marg_liks[st] + self.seg_marg_liks[mid]
 
         ML_join = ss.betaln(
-          self._Piloc(st, en, self.min_idx).sum() + 1,
-          self._Piloc(st, en, self.maj_idx).sum() + 1
+          self._Piloc(st, en, self.min_idx).sum() + 1 + self.betahyp,
+          self._Piloc(st, en, self.maj_idx).sum() + 1 + self.betahyp
         )
 
         # proposal dist. ratio
@@ -241,7 +243,7 @@ def compute_cumsum(self, st, en):
             cs_MIN[i - st] = cs_MIN[i - st - 1] + (self.P.iat[i, self.min_idx] if self.P.iat[i, self.P.columns.get_loc("include")] else 0)
 
         # marginal likelihoods
-        ml = ss.betaln(cs_MAJ + 1, cs_MIN + 1) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1, cs_MIN[-1] - cs_MIN + 1)
+        ml = ss.betaln(cs_MAJ + 1 + self.betahyp, cs_MIN + 1 + self.betahyp) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1 + self.betahyp, cs_MIN[-1] - cs_MIN + 1 + self.betahyp)
 
         # prior
         # TODO: allow user to specify
@@ -290,12 +292,12 @@ def split(self, st = None, b_idx = None):
 
         # M-H acceptance
         seg_lik_1 = ss.betaln(
-          self._Piloc(st, mid, self.min_idx).sum() + 1,
-          self._Piloc(st, mid, self.maj_idx).sum() + 1
+          self._Piloc(st, mid, self.min_idx).sum() + 1 + self.betahyp,
+          self._Piloc(st, mid, self.maj_idx).sum() + 1 + self.betahyp
         )
         seg_lik_2 = ss.betaln(
-          self._Piloc(mid, en, self.min_idx).sum() + 1,
-          self._Piloc(mid, en, self.maj_idx).sum() + 1
+          self._Piloc(mid, en, self.min_idx).sum() + 1 + self.betahyp,
+          self._Piloc(mid, en, self.maj_idx).sum() + 1 + self.betahyp
         )
 
         ML_split = seg_lik_1 + seg_lik_2
@@ -350,21 +352,21 @@ def prune(self):
             # q_i = seg(A - A_i, B - B_i) + garbage(A_i, B_i) + (1 - include prior_i)
             #       - (seg(A, B) + (include prior_i))
             r_exc = ss.betaln(
-              A_inc_s - I["MIN_COUNT"] + 1,
-              B_inc_s - I["MAJ_COUNT"] + 1
-            ) + ss.betaln(I["MIN_COUNT"] + 1, I["MAJ_COUNT"] + 1) \
+              A_inc_s - I["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s - I["MAJ_COUNT"] + 1 + self.betahyp
+            ) + ss.betaln(I["MIN_COUNT"] + 1 + self.betahyp, I["MAJ_COUNT"] + 1 + self.betahyp) \
               + np.log(1 - I["include_prior"]) \
-              - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + np.log(I["include_prior"]))
+              - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + np.log(I["include_prior"]))
 
             # 2. probability to include SNPs (that were previously excluded)
             # q_i = seg(A + A_i, B + B_i) + (include prior_i)
             #       - (seg(A, B) + garbage(A_i, B_i) + (1 - include prior_i))
             r_inc = ss.betaln(
-              A_inc_s + E["MIN_COUNT"] + 1,
-              B_inc_s + E["MAJ_COUNT"] + 1
+              A_inc_s + E["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s + E["MAJ_COUNT"] + 1 + self.betahyp
             ) + np.log(E["include_prior"]) \
-              - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + \
-                ss.betaln(E["MIN_COUNT"] + 1, E["MAJ_COUNT"] + 1) + \
+              - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + \
+                ss.betaln(E["MIN_COUNT"] + 1 + self.betahyp, E["MAJ_COUNT"] + 1 + self.betahyp) + \
                 np.log(1 - E["include_prior"]))
 
             r_cat = pd.concat([r_inc, r_exc]).sort_index()
@@ -398,18 +400,18 @@ def prune(self):
 
             # regardless, code for computing q_star is the same
             r_exc_star = ss.betaln(
-              A_inc_s_star - I_star["MIN_COUNT"] + 1,
-              B_inc_s_star - I_star["MAJ_COUNT"] + 1
-            ) + ss.betaln(I_star["MIN_COUNT"] + 1, I_star["MAJ_COUNT"] + 1) \
+              A_inc_s_star - I_star["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s_star - I_star["MAJ_COUNT"] + 1 + self.betahyp
+            ) + ss.betaln(I_star["MIN_COUNT"] + 1 + self.betahyp, I_star["MAJ_COUNT"] + 1 + self.betahyp) \
               + np.log(1 - I_star["include_prior"]) \
-              - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + np.log(I_star["include_prior"]))
+              - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + np.log(I_star["include_prior"]))
 
             r_inc_star = ss.betaln(
-              A_inc_s_star + E_star["MIN_COUNT"] + 1,
-              B_inc_s_star + E_star["MAJ_COUNT"] + 1
+              A_inc_s_star + E_star["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s_star + E_star["MAJ_COUNT"] + 1 + self.betahyp
             ) + np.log(E_star["include_prior"]) \
-              - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + \
-                ss.betaln(E_star["MIN_COUNT"] + 1, E_star["MAJ_COUNT"] + 1) + \
+              - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + \
+                ss.betaln(E_star["MIN_COUNT"] + 1 + self.betahyp, E_star["MAJ_COUNT"] + 1 + self.betahyp) + \
                 np.log(1 - E_star["include_prior"]))
 
             r_cat_star = pd.concat([r_inc_star, r_exc_star]).sort_index()
@@ -430,8 +432,8 @@ def prune(self):
 
                 self.marg_lik[self.iter] -= self.seg_marg_liks[st]
                 self.seg_marg_liks[st] = ss.betaln(
-                  T.loc[T["include"], "MIN_COUNT"].sum() + 1,
-                  T.loc[T["include"], "MAJ_COUNT"].sum() + 1,
+                  T.loc[T["include"], "MIN_COUNT"].sum() + 1 + self.betahyp,
+                  T.loc[T["include"], "MAJ_COUNT"].sum() + 1 + self.betahyp,
                 )
                 self.marg_lik[self.iter] += self.seg_marg_liks[st]
 
@@ -439,8 +441,8 @@ def prune(self):
                 # effectively their own segments)
                 self.marg_lik[self.iter] += (1 if ~self.P.at[choice_idx, "include"] else -1)* \
                   ss.betaln(
-                    self.P.at[choice_idx, "MIN_COUNT"] + 1,
-                    self.P.at[choice_idx, "MAJ_COUNT"] + 1
+                    self.P.at[choice_idx, "MIN_COUNT"] + 1 + self.betahyp,
+                    self.P.at[choice_idx, "MAJ_COUNT"] + 1 + self.betahyp
                   )
 
                 # TODO: update segment partial sums (when we actually use these)

From a2a7c97961bd0b7ae26a8baf91a711f0ee0d085c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 22:48:54 -0400
Subject: [PATCH 080/222] Set segmentation betahyp based on coverage

---
 hapaseg/allelic_MCMC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 5505518..4da203b 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -91,7 +91,7 @@ def __init__(self, P,
         #
         # marginal likelihoods
 
-        self.betahyp = 1
+        self.betahyp = (self.P["REF_COUNT"] + self.P["ALT_COUNT"]).mean()/4
 
         # log marginal likelihoods for each segment
         # initialize with each SNP comprising its own segment.

From 14c736cfae53405c4753f669a39c5a230f203b6f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 13 Apr 2022 22:49:12 -0400
Subject: [PATCH 081/222] Simplify initial segmentation visualization

---
 hapaseg/allelic_MCMC.py | 59 ++++++++++-------------------------------
 1 file changed, 14 insertions(+), 45 deletions(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 4da203b..7f9bbff 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -473,20 +473,16 @@ def incr_bp_counter(self, st, en, mid = None):
             self.breakpoint_counter[(mid + 1):en] += np.r_[0, 1]
 
     def visualize(self, show_CIs = False):
-        Ph = self.P.copy()
-        CI = s.beta.ppf([0.05, 0.5, 0.95], Ph["MIN_COUNT"][:, None] + 1, Ph["MAJ_COUNT"][:, None] + 1)
-        Ph[["CI_lo_hap", "median_hap", "CI_hi_hap"]] = CI
-
-        plt.figure(); plt.clf()
+        plt.figure(figsize = [16, 4]); plt.clf()
         ax = plt.gca()
 
         # SNPs
-        ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4)
+        ax.scatter(self.P["pos"], self.P["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)], alpha = 0.5, s = 4, marker = '.')
         if show_CIs:
-            ax.errorbar(Ph["pos"], y = Ph["median_hap"], yerr = np.c_[Ph["median_hap"] - Ph["CI_lo_hap"], Ph["CI_hi_hap"] - Ph["median_hap"]].T, fmt = 'none', alpha = 0.5, color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)])
+            ax.errorbar(self.P["pos"], y = self.P["median_hap"], yerr = np.c_[self.P["median_hap"] - self.P["CI_lo_hap"], self.P["CI_hi_hap"] - self.P["median_hap"]].T, fmt = 'none', alpha = 0.1, ecolor = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)])
 
         # mask excluded SNPs
-        ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values)
+        # ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values)
 
         # breakpoints 
 #        bp_prob = self.breakpoint_counter[:, 0]/self.breakpoint_counter[:, 1]
@@ -501,47 +497,20 @@ def visualize(self, show_CIs = False):
 #        ax2.set_xlim(ax.get_xlim());
 #        ax2.set_xlabel("Breakpoint number in current MCMC iteration")
 
-        # beta CI's weighted by breakpoints
-        # flip current rephases back to baseline
-        for st, en in self.F.intervals():
-            # code excised from flip_hap
-            x = Ph.iloc[st:en, self.maj_idx].copy()
-            Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx]
-            Ph.iloc[st:en, self.min_idx] = x
-
-        pos_col = Ph.columns.get_loc("pos")
-        for bp_samp, pi_samp, inc_samp in itertools.zip_longest(self.breakpoint_list, self.phase_interval_list, self.include):
-            # flip everything according to sample
-            # if we did not perform phase correction, pi_samp will be none (hence
-            # the use of zip_longest above)
-            if pi_samp is not None:
-                for st, en in pi_samp.intervals():
-                    # TODO: can replace with flip_hap()?
-                    x = Ph.iloc[st:en, self.maj_idx].copy()
-                    Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx]
-                    Ph.iloc[st:en, self.min_idx] = x
-
-            # SNPs TODO: plot only those that flipped, in a diff. color?
-            #ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4)
-
-            bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]]
-            for st, en in bpl:
-                Phi = Ph.iloc[st:en]; Phi = Phi.loc[inc_samp]
-                ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], Phi.iloc[:, self.min_idx].sum() + 1, Phi.iloc[:, self.maj_idx].sum() + 1)
-                ax.add_patch(mpl.patches.Rectangle((Ph.iloc[st, pos_col], ci_lo), Ph.iloc[en, pos_col] - Ph.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'k', alpha = 1/len(self.breakpoint_list), zorder = 1000))
-
-            # flip everything back
-            if pi_samp is not None:
-                for st, en in pi_samp.intervals():
-                    # TODO: can replace with flip_hap()?
-                    x = Ph.iloc[st:en, self.maj_idx].copy()
-                    Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx]
-                    Ph.iloc[st:en, self.min_idx] = x
+        bpl = self.breakpoints if self.breakpoints_MLE is None else self.breakpoints_MLE
+        bpl = np.array(bpl); bpl = np.c_[bpl[0:-1], bpl[1:]]
+
+        pos_col = self.P.columns.get_loc("pos")
+        for st, en in bpl:
+            ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], self.P.iloc[st:en, self.maj_idx].sum() + 1, self.P.iloc[st:en, self.min_idx].sum() + 1)
+            ax.add_patch(mpl.patches.Rectangle((self.P.iloc[st, pos_col], ci_lo), self.P.iloc[en, pos_col] - self.P.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'lime', alpha = 0.4, zorder = 1000))
 
         # 50:50 line
         ax.axhline(0.5, color = 'k', linestyle = ":")
 
         ax.set_xticks(np.linspace(*plt.xlim(), 20));
-        ax.set_xticklabels(Ph["pos"].searchsorted(np.linspace(*plt.xlim(), 20)));
+        ax.set_xticklabels(self.P["pos"].searchsorted(np.linspace(*plt.xlim(), 20)));
         ax.set_xlabel("SNP index")
         ax.set_ylim([0, 1])
+
+        plt.tight_layout()

From 8ad2500c3eb822b53461b2c7fd80139d5230328e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 14:36:50 -0400
Subject: [PATCH 082/222] Treat p(clust,phase) jointly, not p(clust|phase)

---
 hapaseg/allelic_DP.py | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 35055ef..886beba 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1151,6 +1151,9 @@ def run(self, n_iter = 0, n_samps = 0):
             # probability of opening a new cluster
             log_count_prior[0] = ss.gammaln(M) + np.log(self.alpha) + ss.gammaln(N + self.alpha - M) - ss.gammaln(N + self.alpha)
 
+            # p(phase|X)
+            log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
+
             #
             # adjacent segment likelihood
 
@@ -1162,23 +1165,18 @@ def run(self, n_iter = 0, n_samps = 0):
                 log_adj_lik = self.compute_adj_prob(break_idx[0])
                 #seg_touch_idx[seg_idx] = True
  
-            # p(X|clust,phase)p(X|seg,phase)p(clust)
+            # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase)
             num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})
                   + log_adj_lik      # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B})
-                  + log_count_prior) # p(clust) (DP prior on clust counts)
+                  + log_count_prior  # p(clust) (DP prior on clust counts)
+                  + log_phase_prob)  # p(phase)
 
             num /= self.temperature # scale by temperature for replica-exchange
 
-            num -= num.max(0) # avoid underflow in sum-exp
-
-            # p(clust|X,phase)
-            log_clust_post = num - np.log(np.exp(num).sum(0))
-
-            # p(phase|X)
-            log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
+            num -= num.max() # avoid underflow in sum-exp
 
-            # p(clust,phase|X) = p(clust|X,phase)p(phase|X)
-            choice_p = np.exp(log_clust_post + log_phase_prob)
+            # p(clust,phase|X)
+            choice_p = np.exp(num - np.log(np.exp(num).sum()))
 
             # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(

From 296b5f0efaa07bceaf7e9e3d947c0d39283309c8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 15:04:44 -0400
Subject: [PATCH 083/222] Exclude chimeric reads by default

---
 wolF/workflow.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index da9b3bb..ebb9a33 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -189,7 +189,9 @@ def interval_gather(interval_files):
           refFastaIdx = localization_task["ref_fasta_idx"],
           refFastaDict = localization_task["ref_fasta_dict"],
 
-          intervals = split_intervals_task["interval_files"]
+          intervals = split_intervals_task["interval_files"],
+
+          exclude_chimeric = True
         ))
 
         hp_scatter = het_pulldown.get_het_coverage_from_callstats(

From 2a81227ae72a505bc60a4642463f8535c10472ba Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 15:05:24 -0400
Subject: [PATCH 084/222] Only need to return dataframe, since we are using MLE
 from initial segmentation

---
 wolF/workflow.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index ebb9a33..6ae5505 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -351,7 +351,7 @@ def get_chunks(scatter_chunks):
     )
 
     # concat arm level results
-    @prefect.task(nout = 2)
+    @prefect.task
     def concat_arm_level_results(arm_results):
         A = []
         for arm_file in arm_results:
@@ -366,12 +366,9 @@ def concat_arm_level_results(arm_results):
         _, tmpfile = tempfile.mkstemp(  )
         A.to_pickle(tmpfile)
 
-        # get number of MCMC samples
-        n_samps = int(np.minimum(np.inf, A.loc[~A["results"].isna(), "results"].apply(lambda x : len(x.breakpoint_list))).min())
-
-        return tmpfile, list(range(0, n_samps))
+        return tmpfile
 
-    arm_concat, n_samps_range = concat_arm_level_results(hapaseg_arm_AMCMC_task["arm_level_MCMC"])
+    arm_concat = concat_arm_level_results(hapaseg_arm_AMCMC_task["arm_level_MCMC"])
 
     ## run DP
 

From cf97fa76d5ddbfbe266e6426486fc4ea178ad7c0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 15:11:09 -0400
Subject: [PATCH 085/222] Remove cruft

---
 hapaseg/allelic_DP.py | 110 ------------------------------------------
 1 file changed, 110 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 886beba..fe13120 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -686,116 +686,7 @@ def compute_overall_lik_simple(self):
         # p({c_k}, {s}, {phase_i} | {a_i, b_i})
         return np.r_[clust_lik, phase_lik, count_prior, seg_lik]
 
-    # {{{
-    def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False):
-        if segs_to_clusters is None:
-            su, segs_to_clusters = self.get_unique_clust_idxs()
-        else:
-            su, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters)
-        if phase_orientations is None:
-            phase_orientations = np.r_[self.phase_orientations]
-
-        # account for unassigned clusters
-        min_clust_idx = 1 if (su == -1).any() else 0
-
-        max_clust_idx = segs_to_clusters.max() + 1
-
-        liks = np.full([segs_to_clusters.shape[0], 2], np.nan)
-
-        for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)):
-            ## overall clustering likelihood
-            clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum()
-
-            A1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx)
-            A2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx)
-
-            B1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx)
-            B2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx)
-
-            # print(A1[1:].sum(), B1[1:].sum(), A2[1:].sum(), B2[1:].sum())
-
-            count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)[min_clust_idx:]
-            count_prior /= count_prior.sum()
-
-            #breakpoint()
-
-            clust_lik = ((ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:] + np.log(count_prior)).sum()
-            # account for unassigned clusters, if present
-            if min_clust_idx == 1:
-                clust_lik += ss.betaln(self.S.loc[cl_samp == 0, "maj"] + 1, self.S.loc[cl_samp == 0, "min"] + 1).sum()
-
-            if debug:
-                breakpoint()
-
-            ## segmentation likelihood
-
-            seg_lik = np.nan
-#            if min_clust_idx == 0:
-#                # get segment boundaries
-#                bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1])
-#                bdy = np.c_[bdy[:-1], bdy[1:]]
-#
-#                # sum log-likelihoods of each segment
-#                seg_lik = 0
-#                for st, en in bdy:
-#                   A1 = self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum()
-#                   A2 = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum()
-#                   B1 = self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum()
-#                   B2 = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum()
-#
-#                   seg_lik += ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1)
-#            else:
-#                seg_lik = np.nan
-
-            liks[i, :] = np.r_[clust_lik, seg_lik]
-
-        return liks
-# }}}
-
     def run(self, n_iter = 0, n_samps = 0):
-        #
-        # assign segments to likeliest prior component {{{
-
-        if len(self.clust_prior) > 1:
-            for seg_idx in range(len(self.S)):
-                seg_idx = np.r_[seg_idx] 
-
-                # compute probability that segment belongs to each cluster prior element
-                S_a = self._Siat_ph(seg_idx[0], min = True)
-                S_b = self._Siat_ph(seg_idx[0], min = False)
-                P_a = self.clust_prior_mat[1:, 0]
-                P_b = self.clust_prior_mat[1:, 1]
-
-                # prior likelihood ratios for both phase orientations
-                P_l = np.c_[
-                  ss.betaln(S_a + P_a + 1, S_b + P_b + 1) - (ss.betaln(S_a + 1, S_b + 1) + ss.betaln(P_a + 1, P_b + 1)),
-                  ss.betaln(S_b + P_a + 1, S_a + P_b + 1) - (ss.betaln(S_b + 1, S_a + 1) + ss.betaln(P_a + 1, P_b + 1)),
-                ]
-
-                # get count prior
-                ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1]]
-
-                # posterior numerator
-                num = P_l + np.log(ccp)
-                num -= num.max()
-
-                # probabilistically choose a cluster
-                probs = np.exp(num)/np.exp(num).sum()
-                idx = np.tile(np.r_[self.clust_prior.keys()][1:], [2, 1]).T*[1, -1]
-                choice = np.random.choice(
-                  idx.ravel(),
-                  p = probs.ravel()
-                )
-
-                # rephase
-                if choice < 0:
-                    self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
-                    choice = -choice
-
-                self.S.iloc[seg_idx, self.clust_col] = choice
-
-        # }}}
-
         #
         # initialize cluster tracking hash tables
         self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore"))
@@ -1163,7 +1054,6 @@ def run(self, n_iter = 0, n_samps = 0):
             log_adj_lik = 0
             if not move_clust: # or (move_clust and np.random.rand() < 0.01):
                 log_adj_lik = self.compute_adj_prob(break_idx[0])
-                #seg_touch_idx[seg_idx] = True
  
             # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase)
             num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})

From eb1e7eb06755ec4139a1b28b7966100e901a6994 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 16:04:04 -0400
Subject: [PATCH 086/222] Burnin check

---
 hapaseg/allelic_DP.py | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index fe13120..f6023dd 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -670,7 +670,8 @@ def compute_overall_lik_simple(self):
 
         ## overall phasing likelihood
         # p({phase_i} | {a_i, b_i})
-        phase_lik = np.log1p(-np.r_[self.seg_phase_probs.values()]).sum()
+        phase_probs = np.r_[self.seg_phase_probs.values()]
+        phase_lik = np.log1p(phase_probs).sum() if not np.isnan(phase_probs).any() else np.nan
 
         ## Dirichlet count prior (Dirichlet-categorical marginal likelihood)
         # p({c_k})
@@ -752,6 +753,7 @@ def run(self, n_iter = 0, n_samps = 0):
 
         brk = 0
         touch90 = False
+        likelihood_ready = False
 
         while True:
             if not n_it % 1000:
@@ -771,30 +773,28 @@ def run(self, n_iter = 0, n_samps = 0):
 #            if n_samps > 0 and len() > n_samps:
 #                break
 
-            # poll every 100 iterations for burnin status
+            # poll every 100 iterations for various statuses
             if not n_it % 100:
                 # have >90% of segments been touched?
                 if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.9:
                     touch90 = True
 
-                # have most segments been adjacency corrected?
-                # if so, has the overall likelihood stabilized enough that we're burned in?
-                if not burned_in:
-                    pass
-                    # 2. if >90% of segments have been adjacency corrected, check for burnin
-                    # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum
-                    # TODO: make this check more efficient?
-#                    if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2:
-#                        pass
-#                        burned_in = True
-#                        n_it_last = n_it
-#                        seg_touch_idx[:] = False
-
                 # start computing likelihoods
                 if touch90:
-                    print(self.compute_overall_lik_simple())
-                    print(self.compute_overall_lik_simple().sum())
-                    self.lik_trace.append(self.compute_overall_lik_simple())
+                    lik = self.compute_overall_lik_simple()
+                    # phasing likelihood will be NaN until we've touched every singlesegment
+                    if not np.isnan(lik).any():
+                        self.lik_trace.append(lik)
+                        self.seg_track.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]})
+                        likelihood_ready = True
+
+                # check if likelihood has stabilized enough to consider us "burned in"
+                if likelihood_ready and not burned_in and len(self.lik_trace) > 100:
+                    lt = np.vstack(self.lik_trace).sum(1)
+                    if (np.convolve(np.diff(lt), np.ones(50)/50, mode = "same") < 0).sum() > 2:
+                        breakpoint()
+                        burned_in = True
+                        n_it_last = n_it
 
                 # save cluster assignments and phase orientations once burned in
                 if burned_in:

From 5309979c5e654c2417aeadf3ec9edc0580743d73 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 16:16:18 -0400
Subject: [PATCH 087/222] Add contingency to AMCMC if burnin criterion is never
 met due to early convergence

---
 hapaseg/allelic_MCMC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 7f9bbff..95a349a 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -139,7 +139,7 @@ def run(self):
                 return self
 
             # save MLE breakpoint if we've burned in
-            if self.burned_in:
+            if self.burned_in or self.iter >= self.n_iter - 100: # contingency in case we've converged on an optimum early and the chain hasn't moved at all
                 if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]:
                     self.breakpoints_MLE = self.breakpoints.copy()
 

From f37451ef04323fcafd228d22331a142975f13c2f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 16:59:27 -0400
Subject: [PATCH 088/222] Save samples after DP burnin

---
 hapaseg/allelic_DP.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index f6023dd..b106d88 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -769,9 +769,9 @@ def run(self, n_iter = 0, n_samps = 0):
             if n_iter > 0 and n_it > n_iter:
                 return
 
-#            # stop after a number of samples have been taken
-#            if n_samps > 0 and len() > n_samps:
-#                break
+            # stop after a number of samples have been taken
+            if n_samps > 0 and len(self.segs_to_clusters) > n_samps:
+                break
 
             # poll every 100 iterations for various statuses
             if not n_it % 100:
@@ -792,7 +792,6 @@ def run(self, n_iter = 0, n_samps = 0):
                 if likelihood_ready and not burned_in and len(self.lik_trace) > 100:
                     lt = np.vstack(self.lik_trace).sum(1)
                     if (np.convolve(np.diff(lt), np.ones(50)/50, mode = "same") < 0).sum() > 2:
-                        breakpoint()
                         burned_in = True
                         n_it_last = n_it
 
@@ -1172,9 +1171,8 @@ def run(self, n_iter = 0, n_samps = 0):
             else:
                 self.clust_members_bps[choice] |= snp_idx
 
-            # track global state of cluster assignments
-            # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations
-            if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2):
+            # save a sample from the MCMC when >95% of segments have been touched since the last iteration
+            if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95:
                 self.segs_to_clusters.append(self.S["clust"].copy())
                 self.phase_orientations.append(self.S["flipped"].copy())
                 n_it_last = n_it

From 9ae68fc588c8b489de96b16431bf6361b2b73d29 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 14 Apr 2022 17:24:08 -0400
Subject: [PATCH 089/222] segs_to_clusters -> snps_to_clusters

---
 hapaseg/allelic_DP.py | 27 +++++++++++----------------
 1 file changed, 11 insertions(+), 16 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index b106d88..b796ef4 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -739,7 +739,7 @@ def run(self, n_iter = 0, n_samps = 0):
         self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints[:-1] })
 
         # containers for saving the MCMC trace
-        self.segs_to_clusters = []
+        self.snps_to_clusters = []
         self.phase_orientations = []
 
         burned_in = False
@@ -770,7 +770,7 @@ def run(self, n_iter = 0, n_samps = 0):
                 return
 
             # stop after a number of samples have been taken
-            if n_samps > 0 and len(self.segs_to_clusters) > n_samps:
+            if n_samps > 0 and len(self.snps_to_clusters) > n_samps:
                 break
 
             # poll every 100 iterations for various statuses
@@ -795,11 +795,6 @@ def run(self, n_iter = 0, n_samps = 0):
                         burned_in = True
                         n_it_last = n_it
 
-                # save cluster assignments and phase orientations once burned in
-                if burned_in:
-                    self.segs_to_clusters.append(self.S["clust"].copy())
-                    self.phase_orientations.append(self.S["flipped"].copy())
-
             #
             # pick either a segment or a cluster at random (50:50 prob.)
             move_clust = False
@@ -1173,13 +1168,13 @@ def run(self, n_iter = 0, n_samps = 0):
 
             # save a sample from the MCMC when >95% of segments have been touched since the last iteration
             if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95:
-                self.segs_to_clusters.append(self.S["clust"].copy())
+                self.snps_to_clusters.append(self.S["clust"].copy())
                 self.phase_orientations.append(self.S["flipped"].copy())
                 n_it_last = n_it
 
             n_it += 1
 
-        return np.r_[self.segs_to_clusters], np.r_[self.phase_orientations]
+        return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations]
 
     #_colors = mpl.cm.get_cmap("tab10").colors
     _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int)
@@ -1191,11 +1186,11 @@ def run(self, n_iter = 0, n_samps = 0):
 #   np.c_[0, 23, 204],
 #   np.c_[75, 172, 227]]/255
 
-    def get_unique_clust_idxs(self, segs_to_clusters = None):
-        if segs_to_clusters is None:
-            segs_to_clusters = np.r_[self.segs_to_clusters]
-        s2cu, s2cu_j = np.unique(segs_to_clusters, return_inverse = True)
-        return s2cu, s2cu_j.reshape(segs_to_clusters.shape)
+    def get_unique_clust_idxs(self, snps_to_clusters = None):
+        if snps_to_clusters is None:
+            snps_to_clusters = np.r_[self.snps_to_clusters]
+        s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True)
+        return s2cu, s2cu_j.reshape(snps_to_clusters.shape)
 
     def get_colors(self):
         s2cu, s2cu_j = self.get_unique_clust_idxs()
@@ -1223,7 +1218,7 @@ def visualize_segs(self):
         colors = self.get_colors()
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
-        n_samp = len(self.segs_to_clusters)
+        n_samp = len(self.snps_to_clusters)
 
         for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
             # rephase segments according to phase orientation sample
@@ -1244,7 +1239,7 @@ def visualize_adjacent_segs(self, f = None, n_samp = None):
         colors = self.get_colors()
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
-        n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp
+        n_samp = len(self.snps_to_clusters) if n_samp is None else n_samp
 
         for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
             # rephase segments according to phase orientation sample

From b1c38f8ae2a68b3bd63b32dcc57c6ebc9d0eb020 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 15 Apr 2022 14:11:25 -0400
Subject: [PATCH 090/222] Don't count prior twice when opening new cluster

---
 hapaseg/allelic_DP.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index b796ef4..0225ed4 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -974,6 +974,7 @@ def run(self, n_iter = 0, n_samps = 0):
             #AB = ss.betaln(A_a + B_a + 1, A_b + B_b + 1)
             # C is likelihood of target cluster pre-join
             C = ss.betaln(C_ab[:, 0] + 1 + self.betahyp, C_ab[:, 1] + 1 + self.betahyp)
+            C[0] = 0 # don't count prior twice when opening a new cluster
             # A is likelihood cluster B is part of, minus B
             #A = ss.betaln(A_a + 1, A_b + 1)
             # B+C is likelihood of target cluster post-join, with both phase orientations

From 977173945c57b1d7279aa471af386ec0dc0e8944 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sun, 17 Apr 2022 10:47:17 -0400
Subject: [PATCH 091/222] Overhaul plotting segs from DP iterations

---
 hapaseg/allelic_DP.py | 86 ++++++++++++++++++-------------------------
 1 file changed, 36 insertions(+), 50 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 0225ed4..f176d95 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -746,6 +746,7 @@ def run(self, n_iter = 0, n_samps = 0):
 
         # likelihood trace
         self.lik_trace = []
+        self.segment_trace = []
         self.post = 0
 
         n_it = 0
@@ -785,7 +786,6 @@ def run(self, n_iter = 0, n_samps = 0):
                     # phasing likelihood will be NaN until we've touched every singlesegment
                     if not np.isnan(lik).any():
                         self.lik_trace.append(lik)
-                        self.seg_track.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]})
                         likelihood_ready = True
 
                 # check if likelihood has stabilized enough to consider us "burned in"
@@ -1171,6 +1171,7 @@ def run(self, n_iter = 0, n_samps = 0):
             if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95:
                 self.snps_to_clusters.append(self.S["clust"].copy())
                 self.phase_orientations.append(self.S["flipped"].copy())
+                self.segment_trace.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]})
                 n_it_last = n_it
 
             n_it += 1
@@ -1196,24 +1197,24 @@ def get_unique_clust_idxs(self, snps_to_clusters = None):
     def get_colors(self):
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
-        seg_terr = self.S["end_gp"] - self.S["start_gp"]
-        tot_terr = np.zeros(len(s2cu))
-        for r in s2cu_j:
-           tot_terr += npg.aggregate(r, seg_terr, size = len(tot_terr))
+        T = pd.DataFrame(np.c_[np.r_[self.breakpoints[:-2]], np.r_[self.breakpoints[1:-1]]], columns = ["snp_st", "snp_end"])
+        T["gp_st"] = self.S.loc[T["snp_st"], "pos_gp"].values
+        T["gp_end"] = self.S.loc[T["snp_end"], "pos_gp"].values
+        T["terr"] = T["gp_end"] - T["gp_st"]
+        T["clust"] = self.S.loc[T["snp_st"], "clust"].values
 
-        si = np.argsort(tot_terr)[::-1]
-        terr_cs = np.cumsum(tot_terr[si])/tot_terr.sum()
+        clust_terr = T.groupby("clust")["terr"].sum().sort_values(ascending = False)
 
-        colors_to_use = np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())])
-        colors = np.zeros([len(s2cu), 4])
-        n_distinct = colors_to_use.shape[0] 
-        colors[si[:n_distinct], :] = colors_to_use
-        colors[si[n_distinct:], :] = colors_to_use[:(len(si) - n_distinct), :]
+        # color any cluster larger than 10Mb (~0.003 of total genomic territory)
+        return np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (clust_terr/clust_terr.sum() >= 0.003).sum())])
 
-    def visualize_segs(self):
-        plt.figure()
+    def visualize_snps(self, f = None):
+        pass
+
+    def visualize_segs(self, f = None):
+        f = plt.figure(figsize = [16, 4]) if f is None else f
         ax = plt.gca()
-        ax.set_xlim([0, self.S["end_gp"].max()])
+        ax.set_xlim([0, self.S["pos_gp"].max()])
         ax.set_ylim([0, 1])
 
         colors = self.get_colors()
@@ -1221,46 +1222,31 @@ def visualize_segs(self):
 
         n_samp = len(self.snps_to_clusters)
 
-        for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
-            # rephase segments according to phase orientation sample
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(s2ph != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
-
-            for i, r in enumerate(S_ph.itertuples()):
-                ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], r.min + 1, r.maj + 1)
-                ax.add_patch(mpl.patches.Rectangle((r.start_gp, ci_lo), r.end_gp - r.start_gp, ci_hi - ci_lo, facecolor = colors[s2c[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000))
-
-    def visualize_adjacent_segs(self, f = None, n_samp = None):
-        plt.figure(num = f, figsize = [17.56, 5.67])
-        ax = plt.gca()
-        ax.set_xlim([0, self.S["end_gp"].max()])
-        ax.set_ylim([0, 1])
-
-        colors = self.get_colors()
-        s2cu, s2cu_j = self.get_unique_clust_idxs()
+        selff = copy.deepcopy(self)
 
-        n_samp = len(self.snps_to_clusters) if n_samp is None else n_samp
+        for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations):
+            # get uniqued clust indices for each segment start
+            seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())])
 
-        for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
             # rephase segments according to phase orientation sample
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(s2ph != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
+            selff.S["flipped"] = s2ph
 
-            bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1])
-            bdy = np.c_[bdy[:-1], bdy[1:]]
-
-#            s2c_nz = s2c.copy()
-#            zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0)
-#            for z in zidx:
-#                s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]]
-#            bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1])
-#            bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]]
+            seg_bdy = np.r_[list(seg2c.keys()), len(selff.S)]
+            seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
 
-            for st, en in bdy:
-                ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], S_ph.iloc[st:en, self.min_col].sum() + 1, S_ph.iloc[st:en, self.maj_col].sum() + 1)
-                ax.add_patch(mpl.patches.Rectangle((S_ph.iloc[st]["start_gp"], ci_lo), S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[s2c[st] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000))
+            for i, (st, en) in enumerate(seg_bdy):
+                ci_lo, med, ci_hi = s.beta.ppf(
+                  [0.05, 0.5, 0.95],
+                  selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp,
+                  selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp,
+                )
+                ax.add_patch(mpl.patches.Rectangle(
+                  (selff.S.iloc[st]["pos_gp"], ci_lo),
+                  selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"],
+                  np.maximum(0, ci_hi - ci_lo),
+                  facecolor = colors[seg_cu[i] % len(colors)],
+                  fill = True, alpha = 1/n_samp, zorder = 1000
+                ))
 
     def visualize_clusts(self, f = None, n_samp = None, thick = False, nocolor = False):
         plt.figure(num = f, figsize = [17.56, 5.67])

From a42df536dc50b06838ce0969dea9ac4ec9fd73b2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 09:52:41 -0400
Subject: [PATCH 092/222] Overhaul clust viz. code

---
 hapaseg/allelic_DP.py | 87 ++++++++++++-------------------------------
 1 file changed, 23 insertions(+), 64 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index f176d95..104aa19 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1211,7 +1211,7 @@ def get_colors(self):
     def visualize_snps(self, f = None):
         pass
 
-    def visualize_segs(self, f = None):
+    def visualize_segs(self, f = None, use_clust = False):
         f = plt.figure(figsize = [16, 4]) if f is None else f
         ax = plt.gca()
         ax.set_xlim([0, self.S["pos_gp"].max()])
@@ -1235,72 +1235,31 @@ def visualize_segs(self, f = None):
             seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
 
             for i, (st, en) in enumerate(seg_bdy):
-                ci_lo, med, ci_hi = s.beta.ppf(
-                  [0.05, 0.5, 0.95],
-                  selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp,
-                  selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp,
-                )
-                ax.add_patch(mpl.patches.Rectangle(
-                  (selff.S.iloc[st]["pos_gp"], ci_lo),
+                if use_clust:
+                    ci_lo, med, ci_hi = s.beta.ppf(
+                      [0.05, 0.5, 0.95],
+                      selff.clust_sums[seg2c[st]][0] + 1 + self.betahyp,
+                      selff.clust_sums[seg2c[st]][1] + 1 + self.betahyp,
+                    )
+                else:
+                    ci_lo, med, ci_hi = s.beta.ppf(
+                      [0.05, 0.5, 0.95],
+                      selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp,
+                      selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp,
+                    )
+                ax.add_patch(mpl.patches.Rectangle((
+                  selff.S.iloc[st]["pos_gp"], ci_lo),
                   selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"],
                   np.maximum(0, ci_hi - ci_lo),
                   facecolor = colors[seg_cu[i] % len(colors)],
                   fill = True, alpha = 1/n_samp, zorder = 1000
                 ))
-
-    def visualize_clusts(self, f = None, n_samp = None, thick = False, nocolor = False):
-        plt.figure(num = f, figsize = [17.56, 5.67])
-        ax = plt.gca()
-        ax.set_xlim([0, self.S["end_gp"].max()])
-        ax.set_ylim([0, 1])
-
-        colors = self.get_colors()
-        s2cu, s2cu_j = self.get_unique_clust_idxs()
-
-        n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp
-
-        for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
-            # rephase segments according to phase orientation sample
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(s2ph != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
-
-            # get overall cluster sums
-            clust_min = npg.aggregate(s2c, S_ph["min"])
-            clust_maj = npg.aggregate(s2c, S_ph["maj"])
-            CIs = s.beta.ppf([0.05, 0.5, 0.95], clust_min[:, None] + 1, clust_maj[:, None] + 1)
-
-            # get boundaries of contiguous segments
-            bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1])
-            bdy = np.c_[bdy[:-1], bdy[1:]]
-
-#            s2c_nz = s2c.copy()
-#            zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0)
-#            for z in zidx:
-#                s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]]
-#            bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1])
-#            bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]]
-
-            for st, en in bdy:
-                if thick:
-                    b = CIs[s2c[st], 1] - 0.01
-                    t = CIs[s2c[st], 1] + 0.01
-                else:
-                    color = colors[s2c[st] % len(colors)]
-                    b = CIs[s2c[st], 0]
-                    t = CIs[s2c[st], 2]
-
-                if nocolor:
-                    color = [0, 1, 0]
-                else:
-                    color = colors[s2c[st] % len(colors)]
-
-                ax.add_patch(mpl.patches.Rectangle(
-                  xy = (S_ph.iloc[st]["start_gp"], b),
-                  width = S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"],
-                  height = t - b,
-                  facecolor = color,
-                  fill = True,
-                  alpha = 1/n_samp,
-                  zorder = 1000)
+                plt.scatter(
+                  (selff.S.iloc[en - 1]["pos_gp"] + selff.S.iloc[st]["pos_gp"])/2,
+                  med,
+                  color = colors[seg_cu[i] % len(colors)],
+                  marker = '.', s = 1, alpha = 1/n_samp
                 )
+
+    def visualize_clusts(self, f = None):
+        self.visualize_segs(f = f, use_clust = True)

From 6c2e149d86228a4a21d528fb7361782c90ec4f14 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 10:15:38 -0400
Subject: [PATCH 093/222] Visualize SNPs

---
 hapaseg/allelic_DP.py | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 104aa19..0a80903 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1208,10 +1208,7 @@ def get_colors(self):
         # color any cluster larger than 10Mb (~0.003 of total genomic territory)
         return np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (clust_terr/clust_terr.sum() >= 0.003).sum())])
 
-    def visualize_snps(self, f = None):
-        pass
-
-    def visualize_segs(self, f = None, use_clust = False):
+    def visualize_segs(self, f = None, use_clust = False, show_snps = False):
         f = plt.figure(figsize = [16, 4]) if f is None else f
         ax = plt.gca()
         ax.set_xlim([0, self.S["pos_gp"].max()])
@@ -1224,6 +1221,36 @@ def visualize_segs(self, f = None, use_clust = False):
 
         selff = copy.deepcopy(self)
 
+        if show_snps:
+            # set SNP alpha based on number of SNPs
+            logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M)))
+            default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.S))
+
+            ph_prob = np.r_[self.phase_orientations].mean(0)
+
+            # only plot unambiguous SNPs once
+            uidx = np.flatnonzero((ph_prob == 0) | (ph_prob == 1))
+            selff.S["flipped"] = ph_prob == 1
+            ax.scatter(
+              selff.S.loc[uidx, "pos_gp"],
+              selff._Sloc_ph(uidx)/(selff._Sloc_ph(uidx) + selff._Sloc_ph(uidx, min = False)),
+              color = 'k', marker = '.', alpha = default_alpha, s = 1
+            )
+
+            # plot ambiguous SNPs with opacity weighted by phase probability
+            selff.S["flipped"] = True
+            nuidx = np.flatnonzero(~((ph_prob == 0) | (ph_prob == 1)))
+            ax.scatter(
+              selff.S.loc[nuidx, "pos_gp"],
+              selff._Sloc_ph(nuidx)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)),
+              color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1
+            )
+            ax.scatter(
+              selff.S.loc[nuidx, "pos_gp"],
+              selff._Sloc_ph(nuidx, min = False)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)),
+              color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1
+            )
+
         for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations):
             # get uniqued clust indices for each segment start
             seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())])
@@ -1261,5 +1288,5 @@ def visualize_segs(self, f = None, use_clust = False):
                   marker = '.', s = 1, alpha = 1/n_samp
                 )
 
-    def visualize_clusts(self, f = None):
-        self.visualize_segs(f = f, use_clust = True)
+    def visualize_clusts(self, **kwargs):
+        self.visualize_segs(use_clust = True, **kwargs)

From 6e5f80d57cdbd2a5076ce08386eda1b494defeba Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 11:26:42 -0400
Subject: [PATCH 094/222] Fix bugs with plotting SNPs

---
 hapaseg/allelic_DP.py | 37 ++++++++++++++++++++++---------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 0a80903..3968834 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1229,26 +1229,30 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
             ph_prob = np.r_[self.phase_orientations].mean(0)
 
             # only plot unambiguous SNPs once
-            uidx = np.flatnonzero((ph_prob == 0) | (ph_prob == 1))
-            selff.S["flipped"] = ph_prob == 1
+            uidx = ph_prob == 0
             ax.scatter(
-              selff.S.loc[uidx, "pos_gp"],
-              selff._Sloc_ph(uidx)/(selff._Sloc_ph(uidx) + selff._Sloc_ph(uidx, min = False)),
+              self.S.loc[uidx, "pos_gp"],
+              self.S.loc[uidx, "min"]/self.S.loc[uidx, ["min", "maj"]].sum(1),
+              color = 'k', marker = '.', alpha = default_alpha, s = 1
+            )
+            uidx = ph_prob == 1
+            ax.scatter(
+              self.S.loc[uidx, "pos_gp"],
+              self.S.loc[uidx, "maj"]/self.S.loc[uidx, ["min", "maj"]].sum(1),
               color = 'k', marker = '.', alpha = default_alpha, s = 1
             )
 
             # plot ambiguous SNPs with opacity weighted by phase probability
-            selff.S["flipped"] = True
-            nuidx = np.flatnonzero(~((ph_prob == 0) | (ph_prob == 1)))
+            nuidx = (ph_prob != 0) & (ph_prob != 1)
             ax.scatter(
               selff.S.loc[nuidx, "pos_gp"],
-              selff._Sloc_ph(nuidx)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)),
-              color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1
+              self.S.loc[nuidx, "min"]/self.S.loc[nuidx, ["min", "maj"]].sum(1),
+              color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1
             )
             ax.scatter(
               selff.S.loc[nuidx, "pos_gp"],
-              selff._Sloc_ph(nuidx, min = False)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)),
-              color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1
+              self.S.loc[nuidx, "maj"]/self.S.loc[nuidx, ["min", "maj"]].sum(1),
+              color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1
             )
 
         for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations):
@@ -1274,19 +1278,22 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
                       selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp,
                       selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp,
                     )
-                ax.add_patch(mpl.patches.Rectangle((
-                  selff.S.iloc[st]["pos_gp"], ci_lo),
+                ax.add_patch(mpl.patches.Rectangle(
+                  (selff.S.iloc[st]["pos_gp"], ci_lo),
                   selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"],
                   np.maximum(0, ci_hi - ci_lo),
                   facecolor = colors[seg_cu[i] % len(colors)],
-                  fill = True, alpha = 1/n_samp, zorder = 1000
+                  fill = True, alpha = 1 if show_snps else 1/n_samp, zorder = 1000
                 ))
-                plt.scatter(
+                ax.scatter(
                   (selff.S.iloc[en - 1]["pos_gp"] + selff.S.iloc[st]["pos_gp"])/2,
                   med,
                   color = colors[seg_cu[i] % len(colors)],
-                  marker = '.', s = 1, alpha = 1/n_samp
+                  marker = '.', s = 1, alpha = 1 if show_snps else 1/n_samp
                 )
 
+            if show_snps:
+                break
+
     def visualize_clusts(self, **kwargs):
         self.visualize_segs(use_clust = True, **kwargs)

From 69828c2333ca9ae61c1d8f7ec89451b0a4238134 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 11:40:04 -0400
Subject: [PATCH 095/222] Clean up DP wrapper code

---
 hapaseg/allelic_DP.py | 259 +++++-------------------------------------
 1 file changed, 28 insertions(+), 231 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 3968834..6d31fe9 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -17,36 +17,16 @@
 class A_DP:
     def __init__(self, allelic_segs_pickle, ref_fasta = None):
         # dataframe of allelic imbalance segmentation samples for each chromosome arm
-        self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(0)
-        self.allelic_segs = self.allelic_segs.loc[self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)) > 0]
-
-        # number of total segmentation samples
-        self.n_samp = self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)).min()
-        self.ref_fasta = ref_fasta
-
-        # DP run objects for each segmentation sample
-        self.DP_runs = None
-
-        # dataframe of SNPs
-        self.SNPs = None
-
-        # number of segmentation samples used for DP run
-        self.N_seg_samps = None
-        # number of DP samples per segmentation sample
-        self.N_clust_samps = None
-
-        # assignment of SNPs to DP clusters for each MCMC sample
-        self.snps_to_clusters = None
-        # phase correction of SNPs for each MCMC sample
-        self.snps_to_phases = None
-
-    def load_seg_samp(self, samp_idx):
-        if samp_idx > self.n_samp:
-            raise ValueError(f"Only {self.n_samp} MCMC samples were taken!")
-
-        SNPs = []
+        self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(axis = 0)
+        # if some chromsome arms couldn't find the MLE, just use current state of chain
+        none_idx = self.allelic_segs["results"].apply(lambda x : x.breakpoints_MLE is None)
+        for i in none_idx[none_idx].index:
+            self.allelic_segs.iloc[i]["results"].breakpoints_MLE = self.allelic_segs.iloc[i]["results"].breakpoints
+
+        # load SNPs
+        self.SNPs = []
         clust_offset = 0
-        for _, H in self.allelic_segs.dropna(subset = ["results"]).iterrows():
+        for _, H in self.allelic_segs.iterrows():
             S = copy.deepcopy(H["results"].P)
             S["A_alt"] = 0
             S.loc[S["aidx"], "A_alt"] = S.loc[S["aidx"], "ALT_COUNT"]
@@ -62,8 +42,7 @@ def load_seg_samp(self, samp_idx):
 
             # set initial cluster assignments based on segmentation
             S["clust"] = -1
-            # TODO: use ML segmentation
-            bpl = np.array(H["results"].breakpoint_list[samp_idx]); bpl = np.c_[bpl[0:-1], bpl[1:]]
+            bpl = np.array(H["results"].breakpoints_MLE); bpl = np.c_[bpl[0:-1], bpl[1:]]
             for i, (st, en) in enumerate(bpl):
                 S.iloc[st:en, S.columns.get_loc("clust")] = i + clust_offset
             clust_offset += i
@@ -72,217 +51,35 @@ def load_seg_samp(self, samp_idx):
             S = S.iloc[:-1]
             assert (S["clust"] != -1).all()
 
-            SNPs.append(S)
+            self.SNPs.append(S)
 
-        SNPs = pd.concat(SNPs, ignore_index = True)
+        self.SNPs = pd.concat(self.SNPs, ignore_index = True)
 
         # convert chr-relative positions to absolute genomic coordinates
-        SNPs["pos_gp"] = seq.chrpos2gpos(SNPs["chr"], SNPs["pos"], ref = self.ref_fasta)
+        self.ref_fasta = ref_fasta
+        self.SNPs["pos_gp"] = seq.chrpos2gpos(self.SNPs["chr"], self.SNPs["pos"], ref = self.ref_fasta)
 
         # initial phasing orientation
-        SNPs["flipped"] = False
-
-        return SNPs, None
-
-    # map trace of segment cluster assignments to the SNPs within
-    @staticmethod
-    def map_seg_clust_assignments_to_SNPs(segs_to_clusters, S):
-        st_col = S.columns.get_loc("SNP_st")
-        en_col = S.columns.get_loc("SNP_en")
-        snps_to_clusters = np.zeros((segs_to_clusters.shape[0], S.iloc[-1, en_col] + 1), dtype = int)
-        for i, seg_assign in enumerate(segs_to_clusters):
-            for j, seg in enumerate(seg_assign):
-                snps_to_clusters[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = seg
-
-        return snps_to_clusters
-
-    @staticmethod
-    def map_seg_phases_to_SNPs(phase, S):
-        st_col = S.columns.get_loc("SNP_st")
-        en_col = S.columns.get_loc("SNP_en")
-        snps_to_phase = np.zeros((phase.shape[0], S.iloc[-1, en_col] + 1), dtype = int)
-        for i, phase_orient in enumerate(phase):
-            for j, ph in enumerate(phase_orient):
-                snps_to_phase[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = ph
-
-        return snps_to_phase
-
-    def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None):
-        self.N_seg_samps = N_seg_samps if seg_sample_idx is None else 1
-        self.N_clust_samps = N_clust_samps
-
-        seg_sample_idx = np.random.choice(self.n_samp - 1, self.N_seg_samps, replace = False) if seg_sample_idx is None else [seg_sample_idx]
-        S, SNPs = self.load_seg_samp(seg_sample_idx[0])
-        N_SNPs = len(SNPs)
-        
-        self.snps_to_clusters = -1*np.ones((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = np.int16)
-        self.snps_to_phases = np.zeros((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = bool)
-        self.DP_likelihoods = np.zeros((self.N_clust_samps*self.N_seg_samps, 2))
-
-        self.DP_runs = [None]*self.N_seg_samps
-
-        clust_prior = sc.SortedDict()
-        clust_count_prior = sc.SortedDict()
-        n_iter_clust_exist = sc.SortedDict()
-        cur_samp_iter = 0
-
-        for n_it in range(self.N_seg_samps):
-            if n_it > 0:
-                S, SNPs = self.load_seg_samp(seg_sample_idx[n_it])
-
-            # run clustering
-            self.DP_runs[n_it] = DPinstance(S, clust_prior = clust_prior, clust_count_prior = clust_count_prior)
-            segs_to_clusters, segs_to_phases = self.DP_runs[n_it].run(n_iter = self.N_clust_samps)
-
-            # compute likelihoods for each clustering
-            self.DP_likelihoods[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.DP_runs[n_it].compute_overall_lik()
-
-            # assign clusters to individual SNPs, to use as segment assignment prior for next DP iteration
-            self.snps_to_clusters[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_clust_assignments_to_SNPs(segs_to_clusters, S)
-
-            # assign phase orientations to individual SNPs
-            self.snps_to_phases[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_phases_to_SNPs(segs_to_phases, S)
-
-            # compute prior on cluster locations/counts
-            max_clust_idx = segs_to_clusters.max()
-            for seg_assignments, seg_phases in zip(segs_to_clusters, segs_to_phases):
-                # reset phases
-                S2 = S.copy()
-                S2.loc[S2["flipped"], ["min", "maj"]] = S2.loc[S2["flipped"], ["min", "maj"]].values[:, ::-1]
-
-                # match phases to current sample
-                S2.loc[seg_phases, ["min", "maj"]] = S2.loc[seg_phases, ["min", "maj"]].values[:, ::-1]
-
-                # minor/major counts for each cluster in this iteration
-                S_a = npg.aggregate(seg_assignments, S2["min"], size = max_clust_idx + 1)
-                S_b = npg.aggregate(seg_assignments, S2["maj"], size = max_clust_idx + 1)
-                c = np.c_[S_a, S_b]
-
-                # total numer of SNPs for each cluster in this iteration
-                #N_c = npg.aggregate(seg_assignments, S2["SNP_en"] - S2["SNP_st"], size = max_clust_idx + 1)
-                N_c = npg.aggregate(seg_assignments, 1, size = max_clust_idx + 1)
-
-                # iteratively update priors
-                next_clust_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), c[c.sum(1) > 0]))
-                next_clust_count_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), N_c[N_c > 0]))
-
-                for cl in np.unique(seg_assignments):
-                    if cl in n_iter_clust_exist:
-                        n_iter_clust_exist[cl] += 1
-                    else:
-                        n_iter_clust_exist[cl] = 1
-                cur_samp_iter += 1
-
-                for k, v in next_clust_prior.items():
-                    nccp = next_clust_count_prior[k]
-                    if k in clust_prior:
-                        clust_prior[k] += (v - clust_prior[k])/n_iter_clust_exist[k]
-                        clust_count_prior[k] += (nccp - clust_count_prior[k])/cur_samp_iter
-                    else:
-                        clust_prior[k] = v
-                        clust_count_prior[k] = nccp/cur_samp_iter
-                # for clusters that don't exist in this iteration, average counts with zero
-                for k, v in clust_prior.items():
-                    if k != -1 and k not in next_clust_prior:
-                        clust_count_prior[k] -= clust_count_prior[k]/cur_samp_iter
-
-            # remove improbable clusters from prior
-            for kk in [k for k, v in clust_count_prior.items() if v < 1]:
-                del clust_prior[kk]
-                del clust_count_prior[kk]
-
-        return self.snps_to_clusters, self.snps_to_phases, self.DP_likelihoods
-
-    def visualize_segs(self, snps_to_clusters = None, f = None, n_vis_samp = None):
-        f = plt.figure(figsize = [17.56, 5.67]) if f is None else f
-
-        snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters
-
-        # plot all samples from DP
-        if n_vis_samp is None:
-            run_idx = np.r_[0:self.N_seg_samps]
-            N_seg_samps = self.N_seg_samps
+        self.SNPs["flipped"] = False
 
-        # only plot up to n_vis_samp _segmentation samples_ from DP
-        # (all DP samples for a given segmentation sample will be plotted)
-        else:
-            run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False)
-            N_seg_samps = n_vis_samp
-
-        for d in [self.DP_runs[x] for x in run_idx]:
-            d.visualize_adjacent_segs(f = f.number, n_samp = N_seg_samps*self.N_clust_samps)
-
-    def visualize_clusts(self, snps_to_clusters = None, f = None, thick = False, nocolor = False, n_vis_samp = None):
-        f = plt.figure(figsize = [17.56, 5.67]) if f is None else f
-
-        snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters
-
-        # plot all samples from DP
-        if n_vis_samp is None:
-            run_idx = np.r_[0:self.N_seg_samps]
-            N_seg_samps = self.N_seg_samps
-
-        # only plot up to n_vis_samp _segmentation samples_ from DP
-        # (all DP samples for a given segmentation sample will be plotted)
-        else:
-            run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False)
-            N_seg_samps = n_vis_samp
+        self.N_clust_samps = 100
 
-        for d in [self.DP_runs[x] for x in run_idx]:
-            d.visualize_clusts(f = f.number, n_samp = N_seg_samps*self.N_clust_samps, thick = thick, nocolor = nocolor)
-
-    def visualize_SNPs(self, snps_to_phases = None, color = True, f = None):
-        snps_to_phases = snps_to_phases if snps_to_phases is not None else self.snps_to_phases
-        ph_prob = snps_to_phases.mean(0)
-
-        if color:
-            rb = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]]
-        else:
-            rb = np.full([2, 3], 0)
-
-        logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M)))
-
-        def scerrorbar(idx, rev = False, alpha = 1, show_CI = True):
-            if rev:
-                f = 1 - self.SNPs.loc[idx, "f"]
-                eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_hi"]
-                eb_top = self.SNPs.loc[idx, "f_CI_lo"] - self.SNPs.loc[idx, "f"]
-            else:
-                f = self.SNPs.loc[idx, "f"]
-                eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_lo"]
-                eb_top = self.SNPs.loc[idx, "f_CI_hi"] - self.SNPs.loc[idx, "f"]
-
-            if show_CI:
-                plt.errorbar(
-                  x = self.SNPs.loc[idx, "gpos"],
-                  y = f,
-                  yerr = np.c_[
-                    eb_bot,
-                    eb_top
-                  ].T,
-                  fmt = 'none', ecolor = np.c_[rb[self.SNPs.loc[idx, "allele"]], (alpha if isinstance(alpha, np.ndarray) else alpha*np.ones(idx.sum()))**2]
-                )
-
-            plt.scatter(
-              self.SNPs.loc[idx, "gpos"],
-              f,
-              color = rb[self.SNPs.loc[idx, "allele"]],
-              marker = '.',
-              s = 1,
-              alpha = alpha if show_CI else alpha
-            )
+        # assignment of SNPs to DP clusters for each MCMC sample
+        self.snps_to_clusters = None
+        # phase correction of SNPs for each MCMC sample
+        self.snps_to_phases = None
 
-        default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.SNPs))
+    def run(self):
+        self.DP_run = DPinstance(
+          self.SNPs,
+          dp_count_scale_factor = self.SNPs["clust"].value_counts().mean()
+        )
+        self.snps_to_clusters, self.snps_to_phases = self.DP_run.run(n_samps = self.N_clust_samps)
 
-        f = plt.figure(figsize = [17.56, 5.67]) if f is None else f
-        scerrorbar(ph_prob == 0, alpha = default_alpha, show_CI = color)
-        scerrorbar(ph_prob == 1, rev = True, alpha = default_alpha, show_CI = color)
-        idx = (ph_prob > 0) & (ph_prob < 1)
-        scerrorbar(idx, alpha = (1 - ph_prob[idx])*default_alpha, show_CI = color)
-        scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color)
+        return self.snps_to_clusters, self.snps_to_phases
 
 class DPinstance:
-    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1, dp_count_scale_factor = 1):
+    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, temperature = 1, dp_count_scale_factor = 1):
         self.S = S
         self.clust_prior = clust_prior.copy()
         self.clust_count_prior = clust_count_prior.copy()

From 44a42abda652d47a239c84823ad491d6389134b8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 18:13:52 -0400
Subject: [PATCH 096/222] Return likelihood trace

---
 hapaseg/allelic_DP.py | 48 ++++++++++++++++++++++++++++++-------------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 6d31fe9..65731e1 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -68,15 +68,17 @@ def __init__(self, allelic_segs_pickle, ref_fasta = None):
         self.snps_to_clusters = None
         # phase correction of SNPs for each MCMC sample
         self.snps_to_phases = None
+        # likelihoods of each clustering
+        self.likelihoods = None
 
     def run(self):
         self.DP_run = DPinstance(
           self.SNPs,
           dp_count_scale_factor = self.SNPs["clust"].value_counts().mean()
         )
-        self.snps_to_clusters, self.snps_to_phases = self.DP_run.run(n_samps = self.N_clust_samps)
+        self.snps_to_clusters, self.snps_to_phases, self.likelihoods = self.DP_run.run(n_samps = self.N_clust_samps)
 
-        return self.snps_to_clusters, self.snps_to_phases
+        return self.snps_to_clusters, self.snps_to_phases, self.likelihoods
 
 class DPinstance:
     def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, temperature = 1, dp_count_scale_factor = 1):
@@ -538,20 +540,20 @@ def run(self, n_iter = 0, n_samps = 0):
         # containers for saving the MCMC trace
         self.snps_to_clusters = []
         self.phase_orientations = []
+        self.segment_trace = []
+        self.likelihood_trace = []
 
-        burned_in = False
-
-        # likelihood trace
+        # likelihood trace for checking burnin status
         self.lik_trace = []
-        self.segment_trace = []
-        self.post = 0
+        burned_in = False
+        self.burnin_iteration = -1
+        touch90 = False
+        likelihood_ready = False
 
         n_it = 0
         n_it_last = 0
 
         brk = 0
-        touch90 = False
-        likelihood_ready = False
 
         while True:
             if not n_it % 1000:
@@ -559,9 +561,8 @@ def run(self, n_iter = 0, n_samps = 0):
                     print(pd.Series(self.clust_counts.values()).value_counts().sort_index())
                 else:
                     print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1]))
-                print(brk % (len(self.breakpoints) - 1))
-                #print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
-                #print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
+                if likelihood_ready:
+                    print("[{}] Likelihood: {}".format("*" if burned_in else " ", self.lik_trace[-1].sum()))
 
             # stop after a raw number of iterations
             if n_iter > 0 and n_it > n_iter:
@@ -588,8 +589,10 @@ def run(self, n_iter = 0, n_samps = 0):
                 # check if likelihood has stabilized enough to consider us "burned in"
                 if likelihood_ready and not burned_in and len(self.lik_trace) > 100:
                     lt = np.vstack(self.lik_trace).sum(1)
-                    if (np.convolve(np.diff(lt), np.ones(50)/50, mode = "same") < 0).sum() > 2:
+                    if (np.convolve(np.diff(lt), np.ones(100)/100, mode = "same") < 0).sum() > 2:
+                        print("BURNED IN")
                         burned_in = True
+                        self.burnin_iteration = len(self.lik_trace)
                         n_it_last = n_it
 
             #
@@ -969,11 +972,12 @@ def run(self, n_iter = 0, n_samps = 0):
                 self.snps_to_clusters.append(self.S["clust"].copy())
                 self.phase_orientations.append(self.S["flipped"].copy())
                 self.segment_trace.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]})
+                self.likelihood_trace.append(self.compute_overall_lik_simple().sum())
                 n_it_last = n_it
 
             n_it += 1
 
-        return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations]
+        return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations], np.r_[self.likelihood_trace]
 
     #_colors = mpl.cm.get_cmap("tab10").colors
     _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int)
@@ -1094,3 +1098,19 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
 
     def visualize_clusts(self, **kwargs):
         self.visualize_segs(use_clust = True, **kwargs)
+
+    def plot_likelihood_trace(self):
+        """Scatter each post-burn-in likelihood component, and their total, relative to its maximum."""
+        # burnin_iteration is an index into the raw trace, so slice before dropping NaN rows;
+        # it is still -1 if burn-in was never reached, in which case the whole trace is shown
+        lt = np.vstack(self.lik_trace)[max(self.burnin_iteration, 0):, :]
+        lt = lt[~np.isnan(lt).any(1), :]
+
+        plt.figure()
+        x = np.r_[0:len(lt)]
+        for col in [0, 2, 3]: # column 1 is skipped; legend labels map to columns 0, 2, 3 and the total
+            plt.scatter(x, lt[:, col] - lt[:, col].max())
+        plt.scatter(x, lt.sum(1) - lt.sum(1).max(), marker = '+', color = 'k')
+        plt.legend(["Clust", "DP", "Seg", "Total"])
+        plt.xlabel(r"Post-burnin iteration ($\times 100$)")
+        plt.ylabel(r"$\Delta$ likelihood")

From 662b9b1b27134e234ccab9d935e9631adf7ce8e3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 19:25:11 -0400
Subject: [PATCH 097/222] Overhaul plot colorization

---
 hapaseg/allelic_DP.py | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 65731e1..a81c96d 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1,5 +1,6 @@
 import colorama
 import copy
+import distinctipy
 import itertools
 import matplotlib.pyplot as plt
 import matplotlib as mpl
@@ -1005,9 +1006,16 @@ def get_colors(self):
         T["clust"] = self.S.loc[T["snp_st"], "clust"].values
 
         clust_terr = T.groupby("clust")["terr"].sum().sort_values(ascending = False)
+        si = clust_terr.index.argsort()
 
         # color any cluster larger than 10Mb (~0.003 of total genomic territory)
-        return np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (clust_terr/clust_terr.sum() >= 0.003).sum())])
+        extra_colors = np.array(
+          distinctipy.distinctipy.get_colors(
+            (clust_terr/clust_terr.sum() >= 0.003).sum() - _colors.shape[0],
+            exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], _colors]])
+        )
+
+        return np.r_[_colors, extra_colors][si]
 
     def visualize_segs(self, f = None, use_clust = False, show_snps = False):
         f = plt.figure(figsize = [16, 4]) if f is None else f
@@ -1029,18 +1037,20 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
 
             ph_prob = np.r_[self.phase_orientations].mean(0)
 
+            cu = np.searchsorted(s2cu, self.S["clust"])
+
             # only plot unambiguous SNPs once
             uidx = ph_prob == 0
             ax.scatter(
               self.S.loc[uidx, "pos_gp"],
               self.S.loc[uidx, "min"]/self.S.loc[uidx, ["min", "maj"]].sum(1),
-              color = 'k', marker = '.', alpha = default_alpha, s = 1
+              color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1
             )
             uidx = ph_prob == 1
             ax.scatter(
               self.S.loc[uidx, "pos_gp"],
               self.S.loc[uidx, "maj"]/self.S.loc[uidx, ["min", "maj"]].sum(1),
-              color = 'k', marker = '.', alpha = default_alpha, s = 1
+              color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1
             )
 
             # plot ambiguous SNPs with opacity weighted by phase probability
@@ -1048,15 +1058,20 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
             ax.scatter(
               selff.S.loc[nuidx, "pos_gp"],
               self.S.loc[nuidx, "min"]/self.S.loc[nuidx, ["min", "maj"]].sum(1),
-              color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1
+              color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1
             )
             ax.scatter(
               selff.S.loc[nuidx, "pos_gp"],
               self.S.loc[nuidx, "maj"]/self.S.loc[nuidx, ["min", "maj"]].sum(1),
-              color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1
+              color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1
             )
 
         for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations):
+            # only show maximum likelihood if we're overlaying SNPs
+            if show_snps:
+                mlidx = np.r_[self.likelihood_trace].argmax()
+                seg2c, s2ph = self.segment_trace[mlidx], self.phase_orientations[mlidx]
+
             # get uniqued clust indices for each segment start
             seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())])
 
@@ -1084,6 +1099,7 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
                   selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"],
                   np.maximum(0, ci_hi - ci_lo),
                   facecolor = colors[seg_cu[i] % len(colors)],
+                  edgecolor = 'k' if show_snps else None, linewidth = 0.5 if show_snps else None,
                   fill = True, alpha = 1 if show_snps else 1/n_samp, zorder = 1000
                 ))
                 ax.scatter(

From c8490fd6a9406891c0a4257ab4937f81a4152522 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 20:58:19 -0400
Subject: [PATCH 098/222] No magenta

---
 hapaseg/allelic_DP.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index a81c96d..01ce4da 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -980,22 +980,14 @@ def run(self, n_iter = 0, n_samps = 0):
 
         return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations], np.r_[self.likelihood_trace]
 
-    #_colors = mpl.cm.get_cmap("tab10").colors
-    _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int)
-#   _colors = np.r_[np.c_[87, 182, 55],
-#   np.c_[253, 245, 81],
-#   np.c_[238, 109, 45],
-#   np.c_[204, 43, 30],
-#   np.c_[221, 50, 132],
-#   np.c_[0, 23, 204],
-#   np.c_[75, 172, 227]]/255
-
     def get_unique_clust_idxs(self, snps_to_clusters = None):
         if snps_to_clusters is None:
             snps_to_clusters = np.r_[self.snps_to_clusters]
         s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True)
         return s2cu, s2cu_j.reshape(snps_to_clusters.shape)
 
+    _colors = ((np.r_[np.c_[1:5], np.c_[6:7]] & np.r_[4, 2, 1]) > 0).astype(int)
+
     def get_colors(self):
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
@@ -1012,7 +1004,9 @@ def get_colors(self):
         extra_colors = np.array(
           distinctipy.distinctipy.get_colors(
             (clust_terr/clust_terr.sum() >= 0.003).sum() - _colors.shape[0],
-            exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], _colors]])
+            exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], _colors]],
+            rng = 1234
+          )
         )
 
         return np.r_[_colors, extra_colors][si]

From c3bf48dc0333b2b1a8ac3790b3b14004c646b7bb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:02:58 -0400
Subject: [PATCH 099/222] Use new colorscheme

---
 hapaseg/allelic_DP.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 01ce4da..603aa35 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -986,8 +986,6 @@ def get_unique_clust_idxs(self, snps_to_clusters = None):
         s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True)
         return s2cu, s2cu_j.reshape(snps_to_clusters.shape)
 
-    _colors = ((np.r_[np.c_[1:5], np.c_[6:7]] & np.r_[4, 2, 1]) > 0).astype(int)
-
     def get_colors(self):
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
@@ -1001,15 +999,32 @@ def get_colors(self):
         si = clust_terr.index.argsort()
 
         # color any cluster larger than 10Mb (~0.003 of total genomic territory)
+        base_colors = np.array([
+          [0.368417, 0.506779, 0.709798],
+          [0.880722, 0.611041, 0.142051],
+          [0.560181, 0.691569, 0.194885],
+          [0.922526, 0.385626, 0.209179],
+          [0.528488, 0.470624, 0.701351],
+          [0.772079, 0.431554, 0.102387],
+          [0.363898, 0.618501, 0.782349],
+          [1, 0.75, 0],
+          [0.647624, 0.37816, 0.614037],
+          [0.571589, 0.586483, 0.],
+          [0.915, 0.3325, 0.2125],
+          [0.400822, 0.522007, 0.85],
+          [0.972829, 0.621644, 0.073362],
+          [0.736783, 0.358, 0.503027],
+          [0.280264, 0.715, 0.429209]
+        ])
         extra_colors = np.array(
           distinctipy.distinctipy.get_colors(
-            (clust_terr/clust_terr.sum() >= 0.003).sum() - _colors.shape[0],
-            exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], _colors]],
+            (clust_terr/clust_terr.sum() >= 0.003).sum() - base_colors.shape[0],
+            exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], base_colors]],
             rng = 1234
           )
         )
 
-        return np.r_[_colors, extra_colors][si]
+        return np.r_[base_colors, extra_colors if extra_colors.size > 0 else np.empty([0, 3])][si]
 
     def visualize_segs(self, f = None, use_clust = False, show_snps = False):
         f = plt.figure(figsize = [16, 4]) if f is None else f

From dc6887d3abe36492590bf0fa4136dd829f839707 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:11:29 -0400
Subject: [PATCH 100/222] Update ADP wrapper

---
 hapaseg/__main__.py | 41 ++++++++++++-----------------------------
 1 file changed, 12 insertions(+), 29 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index d110bc4..7821f5d 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -101,8 +101,6 @@ def parse_args():
     ## DP
     dp = subparsers.add_parser("dp", help = "Run DP clustering on allelic imbalance segments")
     dp.add_argument("--seg_dataframe", required = True)
-    dp.add_argument("--n_dp_iter", default = 10)
-    dp.add_argument("--seg_samp_idx", default = 0)
     dp.add_argument("--ref_fasta", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load
     dp.add_argument("--cytoband_file", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load
 
@@ -310,14 +308,7 @@ def main():
         A = A_DP(args.seg_dataframe, ref_fasta = args.ref_fasta)
 
         # run DP
-        # TODO: when we have better type checking, drop the int coersion here
-        #N_seg_samps = A.n_samp - 1 if int(args.n_seg_samps) == 0 else int(args.n_seg_samps)
-        # TODO: if we decide to drop support for chained sampling altogether, remove N_seg_samps logic altogether
-        snps_to_clusters, snps_to_phases, likelihoods = A.run(
-          seg_sample_idx = int(args.seg_samp_idx),
-          #N_seg_samps = N_seg_samps,
-          N_clust_samps = int(args.n_dp_iter)
-        )
+        snps_to_clusters, snps_to_phases, likelihoods = A.run()
 
         # save DP results
         np.savez(output_dir + "/allelic_DP_SNP_clusts_and_phase_assignments.npz",
@@ -331,34 +322,26 @@ def main():
         #
         # plot DP results
 
-        # 1. phased SNP visualization
-        f = plt.figure(figsize = [17.56, 5.67])
-        hs_utils.plot_chrbdy(args.cytoband_file)
-        A.visualize_SNPs(snps_to_phases, color = True, f = f)
-        A.visualize_clusts(snps_to_clusters, f = f, thick = True, nocolor = True)
-        plt.ylabel("Haplotypic imbalance")
-        plt.title("SNP phasing/segmentation")
-        plt.savefig(output_dir + "/figures/SNPs.png", dpi = 300)
-        plt.close()
+        # 0. likelihood trace
+        A.DP_run.plot_likelihood_trace()
+        plt.savefig(output_dir + "/figures/likelihood_trace.png", dpi = 300)
 
-        # 2. pre-clustering segments
+        # 1. SNPs + segments
         f = plt.figure(figsize = [17.56, 5.67])
         hs_utils.plot_chrbdy(args.cytoband_file)
-        A.visualize_SNPs(snps_to_phases, color = False, f = f)
-        A.visualize_segs(snps_to_clusters, f = f)
+        A.DP_run.visualize_segs(f = f, show_snps = True)
         plt.ylabel("Haplotypic imbalance")
-        plt.title("Allelic segmentation, pre-DP clustering")
-        plt.savefig(output_dir + "/figures/allelic_imbalance_preDP.png", dpi = 300)
+        plt.title("SNPs + allelic segmentation (MAP)")
+        plt.savefig(output_dir + "/figures/SNPs.png", dpi = 300)
         plt.close()
 
-        # 3. post-clustering segments
+        # 2. segments alone
         f = plt.figure(figsize = [17.56, 5.67])
         hs_utils.plot_chrbdy(args.cytoband_file)
-        A.visualize_SNPs(snps_to_phases, color = False, f = f)
-        A.visualize_clusts(snps_to_clusters, f = f, thick = True)
+        A.DP_run.visualize_segs(f = f, show_snps = False)
         plt.ylabel("Haplotypic imbalance")
-        plt.title("Allelic segmentation, post-DP clustering")
-        plt.savefig(output_dir + "/figures/allelic_imbalance_postDP.png", dpi = 300)
+        plt.title("Allelic segmentation (posterior)")
+        plt.savefig(output_dir + "/figures/segs_only.png", dpi = 300)
         plt.close()
 
 if __name__ == "__main__":

From 4c758741bbd322bf5b2e6c1afd60e9df5f7d9261 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:12:46 -0400
Subject: [PATCH 101/222] Bump betahyp

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 603aa35..43cc742 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -94,7 +94,7 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
         self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F")
 
-        self.betahyp = 1
+        self.betahyp = 10
 
         #
         # define column indices

From a5182babdc6f43dc0f1fbf742e108a58d9f433be Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:13:15 -0400
Subject: [PATCH 102/222] Burn in for longer

---
 hapaseg/allelic_DP.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 43cc742..ca7b8f2 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -588,9 +588,9 @@ def run(self, n_iter = 0, n_samps = 0):
                         likelihood_ready = True
 
                 # check if likelihood has stabilized enough to consider us "burned in"
-                if likelihood_ready and not burned_in and len(self.lik_trace) > 100:
+                if likelihood_ready and not burned_in and len(self.lik_trace) > 500:
                     lt = np.vstack(self.lik_trace).sum(1)
-                    if (np.convolve(np.diff(lt), np.ones(100)/100, mode = "same") < 0).sum() > 2:
+                    if (np.convolve(np.diff(lt), np.ones(500)/500, mode = "same") < 0).sum() > 2:
                         print("BURNED IN")
                         burned_in = True
                         self.burnin_iteration = len(self.lik_trace)

From f4bd8e56da827edd6118ee65df12dbf2d74048ae Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:13:57 -0400
Subject: [PATCH 103/222] Keep cluster indices more consistent

---
 hapaseg/allelic_DP.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index ca7b8f2..e0bb0a1 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -889,13 +889,11 @@ def run(self, n_iter = 0, n_samps = 0):
             if choice < 0:
                 # if we are moving an entire cluster, give it the same index it used to have
                 # otherwise, cluster indices will be inconsistent
-                if move_clust:
-                    new_clust_idx = cl_idx
-                elif choice == -1: # totally new cluster
+                if cur_clust not in self.clust_counts:
+                    new_clust_idx = cur_clust
+                else: # totally new cluster
                     max_clust_idx += 1
                     new_clust_idx = max_clust_idx
-                else: # match index of cluster in prior
-                    new_clust_idx = -choice - 2
 
                 self.clust_counts[new_clust_idx] = n_move
                 self.S.iloc[seg_idx, self.clust_col] = new_clust_idx

From 4bef4e03293e2d83f7a64d4c2992fcbbb1952489 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:18:11 -0400
Subject: [PATCH 104/222] Get rid of code to move clusters

---
 hapaseg/allelic_DP.py | 254 +++++++++++++++++-------------------------
 1 file changed, 100 insertions(+), 154 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index e0bb0a1..6d89d57 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -597,156 +597,124 @@ def run(self, n_iter = 0, n_samps = 0):
                         n_it_last = n_it
 
             #
-            # pick either a segment or a cluster at random (50:50 prob.)
-            move_clust = False
+            # pick  a segment to move
 
-            # move a segment
-            #if not touch90 or np.random.rand() < 0.9:
-            if True or np.random.rand() < 0.9:
-                # >90% of segments have been moved; we are iterating over segments sequentially
-                if touch90:
-                    break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)})
-                    brk += 1
-                # we are picking segments at random
-                else:
-                    break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
-
-                # get all SNPs within this segment
-                seg_st = self.breakpoints[break_idx[0]]
-                seg_en = self.breakpoints[break_idx[0] + 1]
-                seg_idx = np.r_[seg_st:seg_en]
-
-                cur_clust = int(self.clusts[seg_idx[0]])
-
-                # propose breaking this segment
-                if np.random.rand() < 0.1:
-                    # can't split segments of length 1
-                    if len(seg_idx) == 1:
-                        n_it += 1
-                        continue
-
-                    # TODO: memoize cumsums?
-                    min_cs = self._Scumsum_ph(seg_idx, min = True)
-                    min_csr = self.seg_sums[seg_idx[0]][0] - min_cs
-                    maj_cs = self._Scumsum_ph(seg_idx, min = False)
-                    maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs
-
-                    split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
-                    split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp)
-                    split_lik -= split_lik.max()
-                    split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
-                    seg_idx = seg_idx[:(split_point + 1)]
-
-                    # add breakpoint (can be erased subsequently if segment rejoins original cluster)
-                    new_bp = seg_idx[-1] + 1
-                    if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment
-                        self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust)
-
-                # propose splitting out a contiguous interval of segments within the current cluster {{{
-                split_clust = False
-                if np.random.rand() < 0.1:
-                    # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable?
-                    clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])])
-
-                    # can't split clusters of length 1
-                    if len(clust_snps) == 1:
-                        n_it += 1
-                        continue
-
-                    split_bdy = self.compute_cluster_splitpoints(clust_snps)
-
-                    A_tot, B_tot = self.clust_sums[cur_clust]
-
-                    lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp)
-
-                    liks = np.zeros(len(split_bdy) + 1)
-                    liks[-1] = lik0 # don't split at all
-
-                    # likelihood ratios for splitting each region into a new cluster
-                    for i, (st, en) in enumerate(split_bdy):
-                        A = self._Ssum_ph(clust_snps[st:en], min = True)
-                        B = self._Ssum_ph(clust_snps[st:en], min = False)
-
-                        liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
-
-                    # pick a region to split
-                    split_idx = np.random.choice(
-                      len(split_bdy) + 1,
-                      p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum()
-                    )
+            # >90% of segments have been moved; we are iterating over segments sequentially
+            if touch90:
+                break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)})
+                brk += 1
+            # we are picking segments at random
+            else:
+                break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
+
+            # get all SNPs within this segment
+            seg_st = self.breakpoints[break_idx[0]]
+            seg_en = self.breakpoints[break_idx[0] + 1]
+            seg_idx = np.r_[seg_st:seg_en]
 
-                    # don't split at all
-                    if split_idx == len(split_bdy):
-                        n_it += 1
-                        continue
+            cur_clust = int(self.clusts[seg_idx[0]])
 
-                    # seg_idx == SNPs to propose to split off
-                    seg_idx = clust_snps[slice(*split_bdy[split_idx])]
+            # propose breaking this segment
+            if np.random.rand() < 0.1:
+                # can't split segments of length 1
+                if len(seg_idx) == 1:
+                    n_it += 1
+                    continue
 
-                    split_clust = True
+                # TODO: memoize cumsums?
+                min_cs = self._Scumsum_ph(seg_idx, min = True)
+                min_csr = self.seg_sums[seg_idx[0]][0] - min_cs
+                maj_cs = self._Scumsum_ph(seg_idx, min = False)
+                maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs
+
+                split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
+                split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp)
+                split_lik -= split_lik.max()
+                split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
+                seg_idx = seg_idx[:(split_point + 1)]
+
+                # add breakpoint (can be erased subsequently if segment rejoins original cluster)
+                new_bp = seg_idx[-1] + 1
+                if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment
+                    self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust)
+
+            # propose splitting out a contiguous interval of segments within the current cluster {{{
+            split_clust = False
+            if False and touch90 and np.random.rand() < 0.1:
+                # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable?
+                clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])])
+
+                # can't split clusters of length 1
+                if len(clust_snps) == 1:
+                    n_it += 1
+                    continue
 
-                    # add breakpoints
-                    for si in [seg_idx[0], seg_idx[-1]]:
-                        if si not in self.breakpoints:
-                            seg_st_idx = self.breakpoints.bisect_left(si) - 1
-                            seg_st = self.breakpoints[seg_st_idx]
-                            seg_en_idx = self.breakpoints.bisect_left(si)
-                            seg_en = self.breakpoints[seg_en_idx]
+                split_bdy = self.compute_cluster_splitpoints(clust_snps)
 
-                            self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust)
+                A_tot, B_tot = self.clust_sums[cur_clust]
 
-                    # get all breakpoints within this cluster/interval
-                    left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0])
-                    right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1])
-                    break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]])
+                lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp)
 
-                # }}}
+                liks = np.zeros(len(split_bdy) + 1)
+                liks[-1] = lik0 # don't split at all
 
-                n_move = len(seg_idx)
+                # likelihood ratios for splitting each region into a new cluster
+                for i, (st, en) in enumerate(split_bdy):
+                    A = self._Ssum_ph(clust_snps[st:en], min = True)
+                    B = self._Ssum_ph(clust_snps[st:en], min = False)
 
-                # if segment was already assigned to a cluster, unassign it
-                if cur_clust >= 0:
-                    self.clust_counts[cur_clust] -= n_move
-                    if self.clust_counts[cur_clust] == 0:
-                        del self.clust_counts[cur_clust]
-                        del self.clust_sums[cur_clust]
-                        del self.clust_members[cur_clust]
-                        del self.clust_members_bps[cur_clust]
-                    else:
-                        self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
-                        self.clust_members[cur_clust] -= set(seg_idx)
-                        for b in break_idx:
-                            self.clust_members_bps[cur_clust].remove(self.breakpoints[b])
+                    liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
-                    self.clusts[seg_idx] = -1
+                # pick a region to split
+                split_idx = np.random.choice(
+                  len(split_bdy) + 1,
+                  p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum()
+                )
 
-            # pick a cluster at random
-            else:
-                # it only makes sense to try joining two clusters if there are at least two of them!
-                if len(self.clust_counts) < 2:
+                # don't split at all
+                if split_idx == len(split_bdy):
                     n_it += 1
                     continue
 
-                cl_idx = np.random.choice(self.clust_counts.keys())
-                seg_idx = np.r_[list(self.clust_members[cl_idx])]
+                # seg_idx == SNPs to propose to split off
+                seg_idx = clust_snps[slice(*split_bdy[split_idx])]
 
-                # get all breakpoints corresponding to this cluster
-                break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cl_idx]])
+                split_clust = True
 
-                n_move = len(seg_idx)
-                cur_clust = -1 # only applicable for individual segments, so we set to -1 here
-                               # (this is so that subsequent references to clust_sums[cur_clust]
-                               # will return (0, 0))
+                # add breakpoints
+                for si in [seg_idx[0], seg_idx[-1]]:
+                    if si not in self.breakpoints:
+                        seg_st_idx = self.breakpoints.bisect_left(si) - 1
+                        seg_st = self.breakpoints[seg_st_idx]
+                        seg_en_idx = self.breakpoints.bisect_left(si)
+                        seg_en = self.breakpoints[seg_en_idx]
 
-                # unassign all segments within this cluster
-                # (it will either be joined with a new cluster, or remade again into its own cluster)
-                del self.clust_counts[cl_idx]
-                del self.clust_sums[cl_idx]
-                del self.clust_members[cl_idx]
-                del self.clust_members_bps[cl_idx]
-                self.clusts[seg_idx] = -1
+                        self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust)
 
-                move_clust = True
+                # get all breakpoints within this cluster/interval
+                left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0])
+                right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1])
+                break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]])
+
+            # }}}
+
+            n_move = len(seg_idx)
+
+            # if segment was already assigned to a cluster, unassign it
+            if cur_clust >= 0:
+                self.clust_counts[cur_clust] -= n_move
+                if self.clust_counts[cur_clust] == 0:
+                    del self.clust_counts[cur_clust]
+                    del self.clust_sums[cur_clust]
+                    del self.clust_members[cur_clust]
+                    del self.clust_members_bps[cur_clust]
+                else:
+                    self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
+                    self.clust_members[cur_clust] -= set(seg_idx)
+                    for b in break_idx:
+                        self.clust_members_bps[cur_clust].remove(self.breakpoints[b])
+
+                self.clusts[seg_idx] = -1
 
             #
             # perform phase correction on segment/cluster
@@ -844,12 +812,7 @@ def run(self, n_iter = 0, n_samps = 0):
             #
             # adjacent segment likelihood
 
-            #adj_AB = 0
-            #adj_BC = np.zeros([len(self.clust_sums), 2])
-
-            log_adj_lik = 0
-            if not move_clust: # or (move_clust and np.random.rand() < 0.01):
-                log_adj_lik = self.compute_adj_prob(break_idx[0])
+            log_adj_lik = self.compute_adj_prob(break_idx[0])
  
             # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase)
             num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})
@@ -880,11 +843,6 @@ def run(self, n_iter = 0, n_samps = 0):
                     en = self.breakpoints[b + 1]
                     self.seg_sums[st] = self.seg_sums[st][::-1]
 
-            if not move_clust:
-                print(f"{cur_clust}->{choice} ({len(seg_idx)}, s, [{seg_idx[0]}, {seg_idx[-1]}])")
-            else:
-                print(f"{cl_idx}->{choice} ({len(seg_idx)}, c, [{seg_idx[0]}, {seg_idx[-1]}])")
-
             # create new cluster
             if choice < 0:
                 # if we are moving an entire cluster, give it the same index it used to have
@@ -904,18 +862,6 @@ def run(self, n_iter = 0, n_samps = 0):
 
             # join existing cluster
             else:
-                # if we are combining two clusters, take the index of the bigger one
-                # this helps to keep cluster indices consistent
-                if move_clust and self.clust_counts[choice] < n_move:
-                    self.clust_counts[cl_idx] = self.clust_counts[choice]
-                    self.clust_sums[cl_idx] = self.clust_sums[choice]
-                    self.clust_members[cl_idx] = self.clust_members[choice]
-                    self.S.iloc[np.flatnonzero(self.S["clust"] == choice), self.clust_col] = cl_idx
-                    del self.clust_counts[choice]
-                    del self.clust_sums[choice]
-                    del self.clust_members[choice]
-                    choice = cl_idx
-
                 self.clust_counts[choice] += n_move 
                 self.clust_sums[choice] += np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a]
                 self.S.iloc[seg_idx, self.clust_col] = choice

From 37fde590b307fdc9d8c9f31a78552f86c26473e5 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:18:51 -0400
Subject: [PATCH 105/222] Commit some diagnostic code just in case

---
 hapaseg/allelic_DP.py | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 6d89d57..3f4d92c 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -599,6 +599,13 @@ def run(self, n_iter = 0, n_samps = 0):
             #
             # pick  a segment to move
 
+# diagnostic code to compute overall likelihood before move
+#            compute_lik = False
+#            lik_before = np.nan
+#            if touch90 and np.random.rand() < 0.1:
+#                compute_lik = True
+#                lik_before = self.compute_overall_lik_simple()
+
             # >90% of segments have been moved; we are iterating over segments sequentially
             if touch90:
                 break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)})
@@ -719,6 +726,12 @@ def run(self, n_iter = 0, n_samps = 0):
             #
             # perform phase correction on segment/cluster
             # flip min/maj with probability that alleles are oriented the "wrong" way
+#            if not np.isnan(self.seg_phase_probs[seg_idx[0]]):
+#                rfp = self.compute_rephase_prob(seg_idx)
+#                rfp_mem = self.seg_phase_probs[seg_idx[0]]
+#                if np.abs(rfp - rfp_mem) > 0.05:
+#                    print(rfp_mem, rfp)
+#                    breakpoint()
             if np.isnan(self.seg_phase_probs[seg_idx[0]]):
                 self.seg_phase_probs[seg_idx[0]] = self.compute_rephase_prob(seg_idx)
             rephase_prob = self.seg_phase_probs[seg_idx[0]]
@@ -898,6 +911,14 @@ def run(self, n_iter = 0, n_samps = 0):
                     self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster
                     update_idx.add(self.breakpoints.bisect_left(snp) - 1)
                     snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1])
+#            if len(update_idx):
+#                usnp = self.breakpoints[self.breakpoints.bisect_left(seg_idx[0]) - 1]
+#                print(f"{usnp}: {self.clusts[usnp]}")
+#                print(f"{snp_idx[0]}: {self.clusts[snp_idx[0]]} <")
+#                print(f"{snp_idx[1]}: {self.clusts[snp_idx[1]]} <")
+#                dsnp = self.breakpoints[self.breakpoints.bisect_right(seg_idx[0])]
+#                print(f"{dsnp}: {self.clusts[dsnp]}")
+#                print(f"Update: {self.breakpoints[update_idx[0]]}")
             for bp_idx in update_idx:
                 st = self.breakpoints[bp_idx]
                 en = self.breakpoints[bp_idx + 1]
@@ -912,6 +933,24 @@ def run(self, n_iter = 0, n_samps = 0):
             else:
                 self.clust_members_bps[choice] |= snp_idx
 
+# diagnostic code to check if breakpoint list is properly updated
+#            if touch90:
+#                x = sc.SortedSet()
+#                for y in self.clust_members_bps.values():
+#                    x |= y
+#                if len(x) != len(self.breakpoints) - 1:
+#                    breakpoint()
+
+# diagnostic code to compute overall likelihood delta for iteration
+#            if compute_lik:
+#                lik_after = self.compute_overall_lik_simple()
+#                lik_delta = lik_after.sum() - lik_before.sum()
+#                ML_choice = num.ravel()[choice_idx]
+#                if not np.isnan(lik_delta) and (lik_delta != 0 or ML_choice != 0):
+#                    print("lik: {}; MLs: {}".format(lik_delta, ML_choice))
+##                if lik_delta < 0 and ML_choice == 0:
+##                    breakpoint()
+
             # save a sample from the MCMC when >95% of segments have been touched since the last iteration
             if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95:
                 self.snps_to_clusters.append(self.S["clust"].copy())

From 2ab56291d921cf8bbaac82d8e6dd47860df3d173 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:19:45 -0400
Subject: [PATCH 106/222] Bump touch90->95%

---
 hapaseg/allelic_DP.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 3f4d92c..af7958c 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -575,8 +575,8 @@ def run(self, n_iter = 0, n_samps = 0):
 
             # poll every 100 iterations for various statuses
             if not n_it % 100:
-                # have >90% of segments been touched?
-                if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.9:
+                # have >95% of segments been touched?
+                if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.95:
                     touch90 = True
 
                 # start computing likelihoods

From c4662ec571c7c679fbd599809696f530caac54a5 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:19:53 -0400
Subject: [PATCH 107/222] Add TODO note

---
 hapaseg/allelic_DP.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index af7958c..4e0c9bc 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -145,6 +145,7 @@ def _Scumsum_ph(self, seg_idx, min = True):
             return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]][si].cumsum()
 
     def compute_rephase_prob(self, seg_idx):
+        # TODO: compute logcdf/logsf directly
         flip = self.S.iloc[seg_idx, self.flip_col]
         flip_n = ~flip
 

From 8235ff02ec7bf45ff7d1a4d6a1be831a1ad31857 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:33:44 -0400
Subject: [PATCH 108/222] Print MCMC trace progress

---
 hapaseg/allelic_DP.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 4e0c9bc..5d6d24c 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -565,6 +565,8 @@ def run(self, n_iter = 0, n_samps = 0):
                     print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1]))
                 if likelihood_ready:
                     print("[{}] Likelihood: {}".format("*" if burned_in else " ", self.lik_trace[-1].sum()))
+                if burned_in:
+                    print("{}/{} MCMC samples collected".format(len(self.snps_to_clusters), n_samps))
 
             # stop after a raw number of iterations
             if n_iter > 0 and n_it > n_iter:

From 05f408da8519835f28285df9c12be0cf8f99125c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:42:14 -0400
Subject: [PATCH 109/222] Bump AMCMC segmentation docker

---
 wolF/tasks.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 35a365a..02e4358 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -50,7 +50,7 @@ class Hapaseg_burnin(wolf.Task):
     output_patterns = {
       "burnin_MCMC" : "amcmc_results.pickle"
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:v458"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581"
 
 class Hapaseg_concat(wolf.Task):
     inputs = {
@@ -65,7 +65,7 @@ class Hapaseg_concat(wolf.Task):
       "arms" : "AMCMC-arm*.pickle",
       "ref_bias" : ("ref_bias.txt", wolf.read_file)
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:v458"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581"
 
 class Hapaseg_amcmc(wolf.Task):
     inputs = {
@@ -81,7 +81,7 @@ class Hapaseg_amcmc(wolf.Task):
     output_patterns = {
       "arm_level_MCMC" : "amcmc_results.pickle"
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:v458"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581"
 
 class Hapaseg_allelic_DP(wolf.Task):
     inputs = {

From c3b58d7f2aa2fd2ddc390899b060698fa32ba86f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:42:38 -0400
Subject: [PATCH 110/222] Update ADP workflow

---
 wolF/tasks.py    | 6 +++---
 wolF/workflow.py | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 02e4358..acb75b2 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -101,9 +101,9 @@ class Hapaseg_allelic_DP(wolf.Task):
     output_patterns = {
       "cluster_and_phase_assignments" : "allelic_DP_SNP_clusts_and_phase_assignments.npz",
       "all_SNPs" : "all_SNPs.pickle",
+      "likelihood_trace_plot" : "figures/likelihood_trace.png",
       "SNP_plot" : "figures/SNPs.png",
-      "seg_plot" : "figures/allelic_imbalance_preDP.png",
-      "clust_plot" : "figures/allelic_imbalance_postDP.png",
+      "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:v499"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v608"
     resources = { "mem" : "5G" }
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 6ae5505..1d12f16 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -376,8 +376,6 @@ def concat_arm_level_results(arm_results):
     hapaseg_allelic_DP_task = hapaseg.Hapaseg_allelic_DP(
      inputs = {
        "seg_dataframe" : arm_concat,
-       "n_dp_iter" : 10,   # TODO: allow to be specified?
-       "seg_samp_idx" : n_samps_range,
        "cytoband_file" : "/mnt/j/db/hg38/ref/cytoBand_primary.txt", # TODO: allow to be specified
        "ref_fasta" : localization_task["ref_fasta"],
        "ref_fasta_idx" : localization_task["ref_fasta_idx"],  # not used; just supplied for symlink

From 2c8bc4092967dd22d887f14d07c3889773d931b1 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:51:26 -0400
Subject: [PATCH 111/222] Install distinctipy

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index 9234955..85baf96 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,6 +6,7 @@ WORKDIR /build
 RUN pip install sortedcontainers
 RUN git clone https://github.com/getzlab/CApy.git && pip install ./CApy
 RUN pip install dask distributed
+RUN pip install distinctipy
 
 # install hapaseg
 COPY setup.py .

From 5fcee32136c6cdf737450512c02480745b00a673 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 18 Apr 2022 23:52:13 -0400
Subject: [PATCH 112/222] Bump docker

---
 wolF/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index acb75b2..5dfc61a 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -105,5 +105,5 @@ class Hapaseg_allelic_DP(wolf.Task):
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v608"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v611"
     resources = { "mem" : "5G" }

From 3d96d2720deb15e28cbade84b0cf94013bbe2dc4 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 14:54:05 -0400
Subject: [PATCH 113/222] Initial commit of het selection notebook

---
 40_het_selection.py | 79 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 40_het_selection.py

diff --git a/40_het_selection.py b/40_het_selection.py
new file mode 100644
index 0000000..06991e0
--- /dev/null
+++ b/40_het_selection.py
@@ -0,0 +1,79 @@
+import colorama
+import copy
+import itertools
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import ncls
+import numpy as np
+import numpy_groupies as npg
+import pandas as pd
+import scipy.stats as s
+import scipy.sparse as sp
+import scipy.special as ss
+import sortedcontainers as sc
+
+plt.figure(1); plt.clf()
+plt.figure(2); plt.clf()
+plt.figure(30); plt.clf()
+cut20_dens = {}
+cut20_lod = {}
+cut80_dens = {}
+cut80_lod = {}
+for depth in [15, 20, 30, 60, 80, 200]:
+    # simulate good hets
+    cov = s.poisson.rvs(depth, size = 10000)
+    A = s.binom.rvs(cov, 0.5)
+    B = cov - A
+
+    # simulate bad hets
+    bad_cov = s.poisson.rvs(depth, size = 10000)
+    bad_frac = np.ones_like(bad_cov).astype(float)
+    for i in range(len(bad_frac)):
+        bad_frac[i] = np.random.choice([0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9])
+    A_bad = s.binom.rvs(bad_cov, bad_frac)
+    B_bad = bad_cov - A_bad
+
+    # old criterion: beta density between 0.6 and 0.4
+    betafrac = np.diff(s.beta.cdf([0.4, 0.6], A[:, None] + 1, B[:, None] + 1))
+    betafrac_bad = np.diff(s.beta.cdf([0.4, 0.6], A_bad[:, None] + 1, B_bad[:, None] + 1))
+
+    # new criterion: log-odds ratio
+    betalod = s.beta.logsf(0.5, A + 1, B + 1) - s.beta.logcdf(0.5, A + 1, B + 1)
+    betalod_bad = s.beta.logsf(0.5, A_bad + 1, B_bad + 1) - s.beta.logcdf(0.5, A_bad + 1, B_bad + 1)
+
+    # ROC curves
+    dens_cdf = np.zeros([1000, 2])
+    for i, cut in enumerate(np.linspace(0, 1, 1000)):
+        dens_cdf[i, 0] = (betafrac >= cut).mean()
+        dens_cdf[i, 1] = (betafrac_bad >= cut).mean()
+
+    lod_cdf = np.zeros([1000, 2])
+    for i, cut in enumerate(np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)):
+        lod_cdf[i, 0] = (np.abs(betalod) <= cut).mean()
+        lod_cdf[i, 1] = (np.abs(betalod_bad) <= cut).mean()
+
+    plt.figure(30)
+    st = plt.step(dens_cdf[:, 1], dens_cdf[:, 0])
+    color = st[0].get_color()
+    plt.step(lod_cdf[:, 1], lod_cdf[:, 0], color = color, linestyle = ":")
+
+    cut20_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 1] <= 0.2)[0]]
+    cut80_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 0] <= 0.8)[0]]
+    cut20_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[np.flatnonzero(lod_cdf[:, 1] >= 0.2)[0]]
+    cut80_lod_idx = np.flatnonzero(lod_cdf[:, 0] >= 0.8)[0]
+    cut80_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[cut80_lod_idx]
+
+    plt.scatter(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], marker = 'x', color = color)
+    plt.text(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], "{0:.2f}".format(cut80_lod[depth]), color = color)
+
+    plt.figure(1)
+    sc = plt.scatter(cov, betafrac, alpha = 0.1, s = 10)
+    plt.scatter(depth, np.diff(s.beta.cdf([0.4, 0.6], depth/2 + 1, depth/2 + 1)), color = color, marker = "x")
+
+    cov_range = np.r_[cov.min():cov.max()]
+    cov_cum = np.nan*np.ones_like(cov_range)
+    for i, c in enumerate(cov_range):
+        cov_cum[i] = betafrac[cov >= c].mean()
+
+    plt.figure(2)
+    plt.scatter(cov_range, cov_cum)

From 1da6f43ed6a3def682a467668a5f2083a6ed02a0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 15:46:22 -0400
Subject: [PATCH 114/222] Speed up AMCMC early convergence contingency

---
 hapaseg/allelic_MCMC.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 95a349a..7928541 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -139,7 +139,7 @@ def run(self):
                 return self
 
             # save MLE breakpoint if we've burned in
-            if self.burned_in or self.iter >= self.n_iter - 100: # contingency in case we've converged on an optimum early and the chain hasn't moved at all
+            if self.burned_in:
                 if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]:
                     self.breakpoints_MLE = self.breakpoints.copy()
 
@@ -159,11 +159,23 @@ def run(self):
                   color = color
                 ))
 
-            # check if we've burned in
+            # check if we've burned in -- chain is oscillating around some
+            # optimum (and thus mean differences between marginal likelihoods might
+            # be slightly negative)
             # TODO: use a faster method of computing rolling average
             if not self.burned_in and self.iter > 1000:
                 if np.diff(self.marg_lik[(self.iter - 1000):self.iter]).mean() < 0:
                     self.burned_in = True
+                # contingency if we've unambiguously converged on an optimum and chain has not moved at all
+                # exit early to save time
+                if (np.diff(self.marg_lik[(self.iter - 1000):self.iter]) == 0).all():
+                    self.breakpoints_MLE = self.breakpoints.copy()
+                    print(colorama.Fore.GREEN + "Chain has unambiguously converged on an optimum; stopping early in {n} iterations. n_bp = {n_bp}, lik = {lik}".format(
+                      n = self.iter,
+                      n_bp = len(self.breakpoints),
+                      lik = self.marg_lik[self.iter]
+                    ) + colorama.Fore.RESET)
+                    return self
 
             self.iter += 1 
 

From 8b0e569adb7013adbc46efd061b945c193ae9eef Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 15:47:37 -0400
Subject: [PATCH 115/222] Use PoD genotyper for het pulldown

---
 wolF/workflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 1d12f16..4af593f 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -168,7 +168,7 @@ def interval_gather(interval_files):
           ref_fasta = localization_task["ref_fasta"],
           ref_fasta_idx = localization_task["ref_fasta_idx"],
           ref_fasta_dict = localization_task["ref_fasta_dict"],
-          dens_cutoff = 0.58 # TODO: set dynamically
+          use_pod_genotyper = True
         )
 
     # otherwise, run M1 and get it from the BAM
@@ -200,7 +200,7 @@ def interval_gather(interval_files):
           ref_fasta = localization_task["ref_fasta"],
           ref_fasta_idx = localization_task["ref_fasta_idx"],
           ref_fasta_dict = localization_task["ref_fasta_dict"],
-          dens_cutoff = 0.58 # TODO: set dynamically
+          use_pod_genotyper = True
         )
 
         # gather het pulldown

From aabd727d6cd11e8650c38840338c94c68bbc7f02 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 16:07:59 -0400
Subject: [PATCH 116/222] Visualize AMCMC

---
 hapaseg/__main__.py     | 8 ++++++++
 hapaseg/allelic_MCMC.py | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 7821f5d..1526d63 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -9,6 +9,7 @@
 import scipy.stats as s
 import scipy.special as ss
 import sortedcontainers as sc
+import traceback
 
 from capy import mut
 
@@ -219,6 +220,13 @@ def main():
         with open(output_dir + "/amcmc_results.pickle", "wb") as f:
             pickle.dump(H.run(), f)
 
+        try:
+            H.visualize()
+            plt.savefig(output_dir + "/figures/MLE_segmentation.png", dpi = 300)
+        except Exception:
+            print("Error plotting segments; see stack trace for details:")
+            print(traceback.format_exc())
+
     elif args.command == "concat":
         #
         # load scatter intervals
diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index 7928541..32e4313 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -525,4 +525,6 @@ def visualize(self, show_CIs = False):
         ax.set_xlabel("SNP index")
         ax.set_ylim([0, 1])
 
+        ax.set_title(f"{self.P.iloc[0]['chr']}:{self.P.iloc[0]['pos']}-{self.P.iloc[-1]['pos']}")
+
         plt.tight_layout()

From c094f6af200f25169758e4b85d7df0a759262436 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 16:15:32 -0400
Subject: [PATCH 117/222] Increase eagle threads

---
 wolF/workflow.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 4af593f..0218bf9 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -287,8 +287,10 @@ def order_indices(bcf_path, bcf_idx_path, localization_task):
         vcf_idx_in = F["bcf_idx_path"],
         vcf_ref = F["ref_bcf"],
         vcf_ref_idx = F["ref_bcf_idx"],
-        output_file_prefix = "foo"
-      )
+        output_file_prefix = "foo",
+        num_threads = 4,
+      ),
+      resources = { "cpus-per-task" : 4 }
     )
 
     # TODO: run whatshap

From c703ffb8fe158163b15daeb0c438f878c8694c2b Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 16:50:49 -0400
Subject: [PATCH 118/222] Save segmentation plot from AMCMC

---
 wolF/tasks.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 5dfc61a..3aa55f5 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -50,7 +50,7 @@ class Hapaseg_burnin(wolf.Task):
     output_patterns = {
       "burnin_MCMC" : "amcmc_results.pickle"
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
 
 class Hapaseg_concat(wolf.Task):
     inputs = {
@@ -65,7 +65,7 @@ class Hapaseg_concat(wolf.Task):
       "arms" : "AMCMC-arm*.pickle",
       "ref_bias" : ("ref_bias.txt", wolf.read_file)
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
 
 class Hapaseg_amcmc(wolf.Task):
     inputs = {
@@ -79,9 +79,10 @@ class Hapaseg_amcmc(wolf.Task):
             --n_iter ${n_iter}
     """
     output_patterns = {
-      "arm_level_MCMC" : "amcmc_results.pickle"
+      "arm_level_MCMC" : "amcmc_results.pickle",
+      "segmentation_plot" : "figures/MLE_segmentation.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
 
 class Hapaseg_allelic_DP(wolf.Task):
     inputs = {
@@ -105,5 +106,5 @@ class Hapaseg_allelic_DP(wolf.Task):
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v611"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
     resources = { "mem" : "5G" }

From f6e0c7b3302d9828072ca7c158e25cc666fba79d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 20:59:03 -0400
Subject: [PATCH 119/222] Update ADP task definition

---
 wolF/tasks.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 3aa55f5..7a0bbd4 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -87,15 +87,11 @@ class Hapaseg_amcmc(wolf.Task):
 class Hapaseg_allelic_DP(wolf.Task):
     inputs = {
       "seg_dataframe" : None,
-      "n_dp_iter" : 10,
-      "seg_samp_idx" : 0,
       "ref_fasta" : None,
       "cytoband_file" : None
     }
     script = """
     hapaseg dp --seg_dataframe ${seg_dataframe} \
-            --n_dp_iter ${n_dp_iter} \
-            --seg_samp_idx ${seg_samp_idx} \
             --ref_fasta ${ref_fasta} \
             --cytoband_file ${cytoband_file}
     """

From b619cd0e42295cdbeb905d8cc2b884a868463a80 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 21:51:56 -0400
Subject: [PATCH 120/222] Increase SNP opacity

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 5d6d24c..a7624a5 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1028,7 +1028,7 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
         if show_snps:
             # set SNP alpha based on number of SNPs
             logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M)))
-            default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.S))
+            default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S))
 
             ph_prob = np.r_[self.phase_orientations].mean(0)
 

From a66e2e7db65499ff791a86f07674228bbf5743b3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 22:33:49 -0400
Subject: [PATCH 121/222] Bump mandatory pandas version

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 670d4c6..b4501b5 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@
     #long_description = long_description,
     #long_description_content_type = 'text/markdown',
     install_requires = [
-        'pandas>=0.24.1',
+        'pandas>=1.4.1',
         'numpy>=1.18.0',
         'more-itertools>=8.10.0',
         'numpy_groupies>=0.9.14',

From e999013cbd893bdc831c140c59ac2c7fef37e5cb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 19 Apr 2022 22:44:31 -0400
Subject: [PATCH 122/222] Bump ADP docker

---
 wolF/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 7a0bbd4..8039cb5 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -102,5 +102,5 @@ class Hapaseg_allelic_DP(wolf.Task):
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v621"
     resources = { "mem" : "5G" }

From a142ce97f92fe3f09cc9cba2d4df83bc34f44c5e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 20 Apr 2022 08:23:20 -0400
Subject: [PATCH 123/222] Restrict ADP plot to chromosome, if specified

---
 hapaseg/allelic_DP.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index a7624a5..de8ddf3 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1012,10 +1012,13 @@ def get_colors(self):
 
         return np.r_[base_colors, extra_colors if extra_colors.size > 0 else np.empty([0, 3])][si]
 
-    def visualize_segs(self, f = None, use_clust = False, show_snps = False):
+    def visualize_segs(self, f = None, use_clust = False, show_snps = False, chrom = None):
         f = plt.figure(figsize = [16, 4]) if f is None else f
         ax = plt.gca()
-        ax.set_xlim([0, self.S["pos_gp"].max()])
+        if chrom is None:
+            ax.set_xlim([0, self.S["pos_gp"].max()])
+        else:
+            ax.set_xlim([*self.S.loc[self.S["chr"] == chrom, "pos_gp"].iloc[[0, -1]]])
         ax.set_ylim([0, 1])
 
         colors = self.get_colors()
@@ -1028,7 +1031,7 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False):
         if show_snps:
             # set SNP alpha based on number of SNPs
             logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M)))
-            default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S))
+            default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S) if chrom is None else (self.S["chr"] == chrom).sum())
 
             ph_prob = np.r_[self.phase_orientations].mean(0)
 

From ef38d99947f3474129ea83b0fbf1dbda45bdc995 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 20 Apr 2022 09:04:31 -0400
Subject: [PATCH 124/222] Bump ADP docker

---
 wolF/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 8039cb5..b5bba8a 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -102,5 +102,5 @@ class Hapaseg_allelic_DP(wolf.Task):
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v621"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v623"
     resources = { "mem" : "5G" }

From 4aca76ab350a07c40bed4636ea20bf17b1f4f715 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 20 Apr 2022 12:16:15 -0400
Subject: [PATCH 125/222] Add legend

---
 40_het_selection.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/40_het_selection.py b/40_het_selection.py
index 06991e0..852de78 100644
--- a/40_het_selection.py
+++ b/40_het_selection.py
@@ -19,6 +19,7 @@
 cut20_lod = {}
 cut80_dens = {}
 cut80_lod = {}
+leg = []
 for depth in [15, 20, 30, 60, 80, 200]:
     # simulate good hets
     cov = s.poisson.rvs(depth, size = 10000)
@@ -57,6 +58,8 @@
     color = st[0].get_color()
     plt.step(lod_cdf[:, 1], lod_cdf[:, 0], color = color, linestyle = ":")
 
+    leg.append(st)
+
     cut20_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 1] <= 0.2)[0]]
     cut80_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 0] <= 0.8)[0]]
     cut20_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[np.flatnonzero(lod_cdf[:, 1] >= 0.2)[0]]
@@ -77,3 +80,6 @@
 
     plt.figure(2)
     plt.scatter(cov_range, cov_cum)
+
+plt.figure(30)  # ROC step curves (the handles in `leg`) are drawn on figure 30, so attach the legend there
+plt.legend([x[0] for x in leg], ["15x", "20x", "30x", "60x", "80x", "200x"])

From 1c818a1f0710c0a94d3f9e739f3f813eacc02ff2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 25 Apr 2022 00:24:24 -0400
Subject: [PATCH 126/222] Add TODO for parsing cytoband file

---
 hapaseg/utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hapaseg/utils.py b/hapaseg/utils.py
index e9dc502..c216f4c 100644
--- a/hapaseg/utils.py
+++ b/hapaseg/utils.py
@@ -5,7 +5,8 @@
 _chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25)))
 
 def parse_cytoband(cytoband):
-    cband = pd.read_csv(cytoband, sep = "\t")
+    # TODO: do some cytoband files have a header? check if so!
+    cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"])
     cband["chr"] = cband["chr"].apply(lambda x : _chrmap[x])
 
     chrs = cband["chr"].unique()

From adfc8500b03331a3ef45498b0c9a6ba0d8ca4bcf Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 10:24:51 -0400
Subject: [PATCH 127/222] Remove temperature

---
 hapaseg/allelic_DP.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index de8ddf3..6c6cb86 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -82,12 +82,11 @@ def run(self):
         return self.snps_to_clusters, self.snps_to_phases, self.likelihoods
 
 class DPinstance:
-    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, temperature = 1, dp_count_scale_factor = 1):
+    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, dp_count_scale_factor = 1):
         self.S = S
         self.clust_prior = clust_prior.copy()
         self.clust_count_prior = clust_count_prior.copy()
         self.alpha = alpha
-        self.temperature = temperature
         self.dp_count_scale_factor = dp_count_scale_factor
 
         self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed
@@ -836,8 +835,6 @@ def run(self, n_iter = 0, n_samps = 0):
                   + log_count_prior  # p(clust) (DP prior on clust counts)
                   + log_phase_prob)  # p(phase)
 
-            num /= self.temperature # scale by temperature for replica-exchange
-
             num -= num.max() # avoid underflow in sum-exp
 
             # p(clust,phase|X)

From a75de21aa5ec0668c1e9cd228cd5c131b52bc918 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 10:45:08 -0400
Subject: [PATCH 128/222] persistant->persistent

---
 wolF/workflow.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 446b3c5..f9d02b1 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -112,10 +112,10 @@ def workflow(
 
   num_cov_seg_samples=5,
 
-  persistant_dry_run = False
+  persistent_dry_run = False
 ):
-    # alert for persistant dry run
-    if persistant_dry_run:
+    # alert for persistent dry run
+    if persistent_dry_run:
         #TODO push this message to canine
         print("WARNING: Skipping file localization in dry run!")
     
@@ -160,7 +160,7 @@ def workflow(
             "t_bai" : tumor_bai,
           },
         token=localization_token,
-        persistent_disk_dry_run = persistant_dry_run
+        persistent_disk_dry_run = persistent_dry_run
         )
         collect_tumor_coverage = True
     elif tumor_coverage_bed is not None:
@@ -176,7 +176,7 @@ def workflow(
             "n_bai" : normal_bai
           },
         token=localization_token,
-        persistent_disk_dry_run = persistant_dry_run
+        persistent_disk_dry_run = persistent_dry_run
         )
         collect_normal_coverage = True
     elif normal_coverage_bed is not None:
@@ -604,14 +604,14 @@ def _get_ADP_draw_num(preprocess_data_obj):
     )
 
     #cleanup by deleting bam disks. we make seperate tasks for the bams
-    if not persistant_dry_run and t_bam is not None and t_bai is not None:
+    if not persistent_dry_run and t_bam is not None and t_bai is not None:
         delete_tbams_task = DeleteDisk(
           inputs = {
             "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]],
             "upstream" : m1_task["mutect1_cs"] if callstats_file is None else tumor_cov_gather_task["coverage"] 
      )
      
-    if not persistant_dry_run and n_bam is not None and n_bai is not None:
+    if not persistent_dry_run and n_bam is not None and n_bai is not None:
         delete_nbams_task = DeleteDisk(
           inputs = {
             "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]],

From ff23bc95b76156c1a87dd7bb25abffbcd903fbfd Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 10:45:16 -0400
Subject: [PATCH 129/222] Add forgotten brackets

---
 wolF/workflow.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index f9d02b1..3842801 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -609,6 +609,7 @@ def _get_ADP_draw_num(preprocess_data_obj):
           inputs = {
             "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]],
             "upstream" : m1_task["mutect1_cs"] if callstats_file is None else tumor_cov_gather_task["coverage"] 
+          }
      )
      
     if not persistent_dry_run and n_bam is not None and n_bai is not None:
@@ -616,6 +617,7 @@ def _get_ADP_draw_num(preprocess_data_obj):
           inputs = {
             "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]],
             "upstream" : m1_task["mutect1_cs"]
+          }
     )
     #also delete the cached files disk
     delete_file_disk_task = DeleteDisk(

From 35b269037096e90e4ee53e5013c2faec32743378 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 10:51:33 -0400
Subject: [PATCH 130/222] t/n_bam -> tumor/normal_bam

---
 wolF/workflow.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 3842801..bb64a87 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -604,7 +604,7 @@ def _get_ADP_draw_num(preprocess_data_obj):
     )
 
     #cleanup by deleting bam disks. we make seperate tasks for the bams
-    if not persistent_dry_run and t_bam is not None and t_bai is not None:
+    if not persistent_dry_run and tumor_bam is not None and tumor_bai is not None:
         delete_tbams_task = DeleteDisk(
           inputs = {
             "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]],
@@ -612,7 +612,7 @@ def _get_ADP_draw_num(preprocess_data_obj):
           }
      )
      
-    if not persistent_dry_run and n_bam is not None and n_bai is not None:
+    if not persistent_dry_run and normal_bam is not None and normal_bai is not None:
         delete_nbams_task = DeleteDisk(
           inputs = {
             "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]],

From 301d1df325e75e3a5ecb1be80451f64356ef94c5 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 11:06:57 -0400
Subject: [PATCH 131/222] Run on Richter's

---
 21_genome.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/21_genome.py b/21_genome.py
index 2f6d76b..c364ac2 100644
--- a/21_genome.py
+++ b/21_genome.py
@@ -58,3 +58,26 @@
       normal_bai = "gs://fc-secure-e2772064-386d-4911-b242-d6ade82bf172/360c5959-3827-4b24-92e3-d57dbc5de2f6/gdc_api_file_download/15788922-9cf8-4c83-8040-47fa60b7d374/call-download_file/98e061cd-0586-4e56-85fb-c6cc6688dbff_wgs_gdc_realn.bai",
       target_list = 200
     )
+
+# Richter's test (hg19)
+import wolf
+from wolF import workflow
+
+import dalmatian
+wm = dalmatian.WorkspaceManager("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK")
+
+wic = wolf.fc.WorkspaceInputConnector("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK")
+Pj = wic.get_pairs_as_joint_samples()
+
+with wolf.Workflow(workflow = workflow.workflow, namespace = "HapASeg_Richters") as w:
+    for pair, p in Pj.loc[Pj["sample_type_T"] == "Richter"].iterrows():
+        w.run(
+          RUN_NAME = pair,
+          tumor_bam = p["output_bam_T"],
+          tumor_bai = p["output_bam_index_T"],
+          normal_bam = p["output_bam_N"],
+          normal_bai = p["output_bam_index_N"],
+          target_list = 2000,
+          ref_genome_build = "hg19"
+        )
+        break

From 35c46ac84a854cd4fe69de5aff4d8cb2e6a7e6ec Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 11:14:44 -0400
Subject: [PATCH 132/222] Allow cytoband file to be specified in ADP

---
 wolF/workflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index bb64a87..5abad32 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -473,7 +473,7 @@ def concat_arm_level_results(arm_results):
      inputs = {
        "seg_dataframe" : arm_concat,
        #"seg_dataframe" : hapaseg_arm_concat_task["arm_cat_results_pickle"],
-       "cytoband_file" : "/mnt/j/db/hg38/ref/cytoBand_primary.txt", # TODO: allow to be specified
+       "cytoband_file" : localization_task["cytoband_file"],
        "ref_fasta" : localization_task["ref_fasta"],
        "ref_fasta_idx" : localization_task["ref_fasta_idx"],  # not used; just supplied for symlink
        "ref_fasta_dict" : localization_task["ref_fasta_dict"] # not used; just supplied for symlink

From c8a1652f6852ca6b8b1b99b98d4e722118e783e8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 11:15:30 -0400
Subject: [PATCH 133/222] Minor formatting

---
 wolF/workflow.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 5abad32..4751a2d 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -479,7 +479,7 @@ def concat_arm_level_results(arm_results):
        "ref_fasta_dict" : localization_task["ref_fasta_dict"] # not used; just supplied for symlink
      }
     )
-    
+
     ##collect DP results
     collect_adp_task = hapaseg.Hapaseg_collect_adp(
         inputs={"dp_results":[hapaseg_allelic_DP_task["cluster_and_phase_assignments"]]
@@ -528,9 +528,9 @@ def _get_ADP_cluster_list(preprocess_data_obj):
         cluster_idxs = [i for i in np.arange(num_clusters)]
         print(cluster_idxs, cluster_list, range_list) 
         return len(cluster_idxs), cluster_idxs, cluster_list, range_list
-    
+
     num_clusters, cluster_idxs, cluster_list, range_list = _get_ADP_cluster_list(prep_cov_mcmc_task["preprocess_data"])
-    
+
     # coverage MCMC burnin
     cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin(
         inputs={
@@ -541,7 +541,7 @@ def _get_ADP_cluster_list(preprocess_data_obj):
             "range":range_list
         }
     )
-    
+
     # coverage MCMC scatter post-burnin
     cov_mcmc_scatter_task = hapaseg.Hapaseg_coverage_mcmc(
         inputs={
@@ -552,7 +552,7 @@ def _get_ADP_cluster_list(preprocess_data_obj):
             "burnin_files":[cov_mcmc_burnin_task["burnin_data"]] * num_clusters # this is to account for a wolf input len bug
         }
     )
-    
+
     # collect coverage MCMC
     cov_mcmc_gather_task = hapaseg.Hapaseg_collect_coverage_mcmc(
     inputs = {
@@ -561,6 +561,7 @@ def _get_ADP_cluster_list(preprocess_data_obj):
         "bin_width":bin_width
         }
     )
+
     # coverage DP
     cov_dp_task = hapaseg.Hapaseg_coverage_dp(
     inputs = {
@@ -572,16 +573,15 @@ def _get_ADP_cluster_list(preprocess_data_obj):
         "bin_width":bin_width
         }
     )
-    
+
     #get the adp draw number from the preprocess data object
     @prefect.task
     def _get_ADP_draw_num(preprocess_data_obj):
         return int(np.load(preprocess_data_obj)["adp_cluster"])
     
     adp_draw_num = _get_ADP_draw_num(prep_cov_mcmc_task["preprocess_data"])
-    
-    # generate acdp dataframe
 
+    # generate acdp dataframe 
     gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df(
     inputs = {
         "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same
@@ -592,7 +592,7 @@ def _get_ADP_draw_num(preprocess_data_obj):
         "bin_width":bin_width
         }
     )
-    
+
     # run acdp
     acdp_task = hapaseg.Hapaseg_run_acdp(
     inputs = {

From aa5676db31c69c4b7d804ec7d57b25bae4fc0352 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 13:20:36 -0400
Subject: [PATCH 134/222] Don't need separate script to collect ADP shards

---
 hapaseg/__main__.py | 21 ---------------------
 wolF/tasks.py       | 15 ---------------
 wolF/workflow.py    | 16 ++++++----------
 3 files changed, 6 insertions(+), 46 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 29ba9bb..207adac 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -476,29 +476,8 @@ def main():
         plt.title("Allelic segmentation (posterior)")
         plt.savefig(output_dir + "/figures/segs_only.png", dpi = 300)
         plt.close()
-    
-    #collect adp run data
-    elif args.command == "collect_adp":
-        with open(args.dp_results, 'r') as f:
-	        dp_results = f.readlines()
-        accum_clusts = []
-        accum_phases = []
-        accum_liks = []
-        
-        for dp_shard in dp_results:
-            obj = np.load(dp_shard.rstrip('\n'))
-            accum_clusts.append(obj['snps_to_clusters'])
-            accum_phases.append(obj['snps_to_phases'])
-            accum_liks.append(obj['likelihoods'])
-        all_clusts = np.vstack(accum_clusts)
-        all_phases = np.vstack(accum_phases)
-        all_liks = np.vstack(accum_liks)
-        # save
-        np.savez(os.path.join(output_dir, "full_dp_results"), snps_to_clusters=all_clusts, snps_to_phases=all_phases, likelihoods=all_liks)
-
 
     ## running coverage mcmc on all clusters
-
     elif args.command == "coverage_mcmc":
         cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv,
                                              args.allelic_clusters_object,
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 327481a..4f18da8 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -125,21 +125,6 @@ class Hapaseg_allelic_DP(wolf.Task):
     docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v623"
     resources = { "mem" : "5G" }
 
-class Hapaseg_collect_adp(wolf.Task):
-    inputs = {
-        "dp_results":None
-    }
-    
-    script = """
-    hapaseg collect_adp --dp_results ${dp_results}
-    """
-    output_patterns = {
-        "full_dp_results":"full_dp_results.npz"
-    }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
-    resources = { "mem" : "5G" }
-
-
 class Hapaseg_prepare_coverage_mcmc(wolf.Task):
     inputs = {
         "coverage_csv": None,
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 4751a2d..08b7978 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -480,20 +480,16 @@ def concat_arm_level_results(arm_results):
      }
     )
 
-    ##collect DP results
-    collect_adp_task = hapaseg.Hapaseg_collect_adp(
-        inputs={"dp_results":[hapaseg_allelic_DP_task["cluster_and_phase_assignments"]]
-               }
-    )
-    
-    ### coverage tasks ####
+    #
+    # coverage tasks
+    #
 
     # prepare coverage MCMC
     prep_cov_mcmc_task = hapaseg.Hapaseg_prepare_coverage_mcmc(
     inputs={
         "coverage_csv":tumor_cov_gather_task["coverage"], #each scatter result is the same
-        "allelic_clusters_object":collect_adp_task["full_dp_results"],
-        "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same
+        "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"],
+        "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'],
         "repl_pickle":ref_config["repl_file"],
         "gc_pickle":ref_config["gc_file"],
         "ref_file_path":localization_task["ref_fasta"]
@@ -585,7 +581,7 @@ def _get_ADP_draw_num(preprocess_data_obj):
     gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df(
     inputs = {
         "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same
-        "allelic_clusters_object":collect_adp_task["full_dp_results"],
+        "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"],
         "cdp_filepaths":[cov_dp_task["cov_dp_object"]],
         "allelic_draw_index":adp_draw_num,
         "ref_file_path":localization_task["ref_fasta"],

From d0affc1d547f52a764d8736c22c8471f68a7f55f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 13:21:22 -0400
Subject: [PATCH 135/222] Use likelihoods computed in ADP

---
 hapaseg/run_coverage_MCMC.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 5370708..d8fff39 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -37,7 +37,7 @@ def __init__(self,
         if allelic_sample is not None:
             self.allelic_sample = allelic_sample
         else:
-            self.allelic_sample = self.select_ADP_cluster()
+            self.allelic_sample = np.argmax(self.allelic_clusters["likelihoods"])
 
         self.model = None
 
@@ -68,25 +68,6 @@ def dp_prior(cluster_counts_arr, alpha):
         N = cluster_counts_arr.sum()
         m = len(cluster_counts_arr)
         return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha)
-    
-    # method for selecting ADP clustering based on likelihoods
-    def select_ADP_cluster(self):
-        ADP_draws = self.allelic_clusters["snps_to_clusters"]
-        tmp_snps = self.SNPs.copy()
-        lls = []
-        for ADP_draw in ADP_draws:
-            tmp_snps['cluster_assignment'] = ADP_draw
-            count_arr = tmp_snps.groupby(by='cluster_assignment').agg({"maj":sum, "min":sum}).values
-            count_arr += 1
-            beta_ll = ss.betaln(count_arr[:, 0], count_arr[:, 1]).sum()
-            cluster_counts = tmp_snps['cluster_assignment'].value_counts().values
-            dp_ll = self.dp_prior(cluster_counts, 0.5)
-            lls.append(beta_ll + dp_ll)
-        lls = np.array(lls)
-        lls_max = np.max(lls)
-        choice_p = np.exp(lls - lls_max) / np.exp(lls - lls_max).sum()
-        return np.random.choice(len(ADP_draws), p=choice_p)
-
 
     @staticmethod
     def load_coverage(coverage_csv):

From 54ec4b1b4c0010684c7b3c3d350fcb93c8111f36 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 14:02:00 -0400
Subject: [PATCH 136/222] Explicitly pass in ref_fasta to capy.seq

---
 hapaseg/__main__.py          |  1 +
 hapaseg/run_coverage_MCMC.py | 12 +++++++-----
 wolF/tasks.py                |  4 ++--
 wolF/workflow.py             |  2 +-
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 207adac..2474c9e 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -501,6 +501,7 @@ def main():
         cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv,
                                              args.allelic_clusters_object,
                                              args.SNPs_pickle,
+                                             args.ref_fasta,
                                              f_repl=args.repl_pickle,
                                              f_GC=args.gc_pickle,
                                              allelic_sample=args.allelic_sample)
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index d8fff39..18ffefc 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -17,6 +17,7 @@ def __init__(self,
                  f_allelic_clusters,
                  f_SNPs,
                  f_repl,
+                 ref_fasta,
                  f_GC=None,
                  num_draws=50,
                  cluster_num=None,
@@ -27,6 +28,7 @@ def __init__(self,
         self.cluster_num = cluster_num
         self.f_repl = f_repl
         self.f_GC = f_GC
+        self.ref_fasta = ref_fasta
 
         self.allelic_clusters = np.load(f_allelic_clusters)
         # coverage input is expected to be a df file with columns: ["chr", "start", "end", "covcorr", "covraw"]
@@ -69,27 +71,27 @@ def dp_prior(cluster_counts_arr, alpha):
         m = len(cluster_counts_arr)
         return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha)
 
-    @staticmethod
-    def load_coverage(coverage_csv):
+    def load_coverage(self, coverage_csv):
         Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False)
         Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions
         Cov["chr"] = mut.convert_chr(Cov["chr"])
         Cov = Cov.loc[Cov["chr"] != 0]
         Cov=Cov.reset_index(drop=True)
-        Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"])
-        Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"])
+        Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"], ref = self.ref_fasta)
+        Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"], ref = self.ref_fasta)
         
         return Cov
 
     def load_SNPs(self, f_snps):
         SNPs = pd.read_pickle(f_snps)
-        SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"])
+        SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"], ref = self.ref_fasta)
 
         SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False)
         return SNPs
 
     def generate_GC(self):
         #grab fasta object from seq to avoid rebuilding
+        seq.set_reference(self.ref_fasta)
         F = seq._fa.ref_fa_obj
         self.full_cov_df['C_GC'] = np.nan
         
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 4f18da8..d2f0f5a 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -133,11 +133,11 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task):
         "repl_pickle": None,
         "gc_pickle":"",
         "allelic_sample":"",
-        "ref_file_path": None
+        "ref_fasta": None
     }
     script = """
-    export CAPY_REF_FA=${ref_file_path}
     hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \
+    --ref_fasta ${ref_fasta} \
     --allelic_clusters_object ${allelic_clusters_object} \
     --SNPs_pickle ${SNPs_pickle} \
     --repl_pickle ${repl_pickle}"""
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 08b7978..b5e1e14 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -492,7 +492,7 @@ def concat_arm_level_results(arm_results):
         "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'],
         "repl_pickle":ref_config["repl_file"],
         "gc_pickle":ref_config["gc_file"],
-        "ref_file_path":localization_task["ref_fasta"]
+        "ref_fasta":localization_task["ref_fasta"]
         }
     )
     

From 1b84ae08b31d12ede9544766e432581a1a4da711 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 15:03:02 -0400
Subject: [PATCH 137/222] Don't need this anymore

---
 hapaseg/run_coverage_MCMC.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 18ffefc..f683000 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -84,8 +84,6 @@ def load_coverage(self, coverage_csv):
 
     def load_SNPs(self, f_snps):
         SNPs = pd.read_pickle(f_snps)
-        SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"], ref = self.ref_fasta)
-
         SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False)
         return SNPs
 

From 42e79f86e876cd5a36b1979b4931534f8f5b9a23 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 15:07:13 -0400
Subject: [PATCH 138/222] Add progress bar

---
 hapaseg/run_coverage_MCMC.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index f683000..95f53b7 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -4,6 +4,7 @@
 import re
 import os
 import scipy.special as ss
+import tqdm
 from capy import mut, seq
 import scipy.stats as stats
 from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB
@@ -94,7 +95,7 @@ def generate_GC(self):
         self.full_cov_df['C_GC'] = np.nan
         
         #this indexing assumes 0-indexed start and end cols
-        for (i, chrm, start, end) in self.full_cov_df[['chr', 'start','end']].itertuples():
+        for (i, chrm, start, end) in tqdm.tqdm(self.full_cov_df[['chr', 'start','end']].itertuples(), total = len(self.full_cov_df)):
             self.full_cov_df.iat[i, -1] = F[chrm-1][start:end+1].gc
         
 

From 904053f4ad1e480dabeaad78f06377c1e1386cbb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 15:26:26 -0400
Subject: [PATCH 139/222] Remove unused function

---
 hapaseg/run_coverage_MCMC.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 95f53b7..ddd5c7d 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -65,13 +65,6 @@ def prepare_single_cluster(self):
         # save these results to a numpy object
         return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample
 
-    # method for calculating DP prior likelihood of an ADP cluster    
-    @staticmethod    
-    def dp_prior(cluster_counts_arr, alpha):
-        N = cluster_counts_arr.sum()
-        m = len(cluster_counts_arr)
-        return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha)
-
     def load_coverage(self, coverage_csv):
         Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False)
         Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions

From fd55e134f1b015ea14ad26e2d912116647b80bb4 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 26 Apr 2022 15:40:53 -0400
Subject: [PATCH 140/222] Hack to fix contig names for hg19

---
 wolF/workflow.py | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index b5e1e14..21ea515 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -192,8 +192,13 @@ def workflow(
     # collect or load coverage
     # tumor
     if collect_tumor_coverage:
-        primary_contigs = ['chr{}'.format(i) for i in range(1,23)]
-        primary_contigs.extend(['chrX','chrY','chrM'])
+        # FIXME: hack to account for "chr" in hg38 but not in hg19
+        if ref_genome_build == "hg38":
+            primary_contigs = ['chr{}'.format(i) for i in range(1,23)]
+            primary_contigs.extend(['chrX','chrY','chrM'])
+        else:
+            primary_contigs = [str(x) for x in range(1, 23)] + ["X", "Y", "M"]
+
         # create scatter intervals
         split_intervals_task = split_intervals.split_intervals(
           bam = tumor_bam_localization_task["t_bam"],
@@ -204,18 +209,19 @@ def workflow(
 
         # shim task to transform split_intervals files into subset parameters for covcollect task
         @prefect.task
-        def interval_gather(interval_files):
+        def interval_gather(interval_files, primary_contigs):
             ints = []
             for f in interval_files:
                 ints.append(pd.read_csv(f, sep = "\t", header = None, names = ["chr", "start", "end"]))
             #filter non-primary contigs
-            primary_contigs = ['chr{}'.format(i) for i in range(1,23)]
-            primary_contigs.extend(['chrX','chrY','chrM'])
-            full_bed = pd.concat(ints).sort_values(["chr", "start", "end"])
+            full_bed = pd.concat(ints).sort_values(["chr", "start", "end"]).astype({ "chr" : str })
             filtered_bed = full_bed.loc[full_bed.chr.isin(primary_contigs)]
             return filtered_bed
 
-        subset_intervals = interval_gather(split_intervals_task["interval_files"])
+        subset_intervals = interval_gather(
+          split_intervals_task["interval_files"],
+          primary_contigs
+        )
 
         # dispatch coverage scatter
         tumor_cov_collect_task = cov_collect.Covcollect(
@@ -349,6 +355,13 @@ def order_indices(bcf_path, bcf_idx_path, localization_task):
 
         F = F.join(F2)
 
+        # prepend "chr" to F's index if it's missing
+        idx = ~F.index.str.contains("^chr")
+        if idx.any():
+            new_index = F.index.values
+            new_index[idx] = "chr" + F.index[idx]
+            F = F.set_index(new_index)
+
         # reference panel BCFs
         R = pd.DataFrame({ "path" : localization_task } ).reset_index()
         F = F.join(R.join(R.loc[R["index"].str.contains("^chr.*_bcf$"), "index"].str.extract(r"(?P<chr>chr[^_]+)"), how = "right").set_index("chr").drop(columns = ["index"]).rename(columns = { "path" : "ref_bcf" }), how = "inner")

From 694284908d1505be34af9b7ffa24ea2192d6f9df Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 13:00:38 -0400
Subject: [PATCH 141/222] Add cytoband header check

---
 hapaseg/utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/hapaseg/utils.py b/hapaseg/utils.py
index c216f4c..46712d3 100644
--- a/hapaseg/utils.py
+++ b/hapaseg/utils.py
@@ -5,8 +5,13 @@
 _chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25)))
 
 def parse_cytoband(cytoband):
-    # TODO: do some cytoband files have a header? check if so!
-    cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"])
+    # some cytoband files have a header, some don't; we need to check
+    has_header = False
+    with open(cytoband, "r") as f:
+        if f.readline().startswith("chr\t"):
+            has_header = True
+
+    cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"] if not has_header else None)
     cband["chr"] = cband["chr"].apply(lambda x : _chrmap[x])
 
     chrs = cband["chr"].unique()

From 104953d65a6ab54e77feafbd42dda3e134175870 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 13:10:55 -0400
Subject: [PATCH 142/222] Bump ADP Docker

---
 wolF/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index d2f0f5a..d2213fb 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -122,7 +122,7 @@ class Hapaseg_allelic_DP(wolf.Task):
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v623"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v789"
     resources = { "mem" : "5G" }
 
 class Hapaseg_prepare_coverage_mcmc(wolf.Task):

From 0dd0dd618b1ab7a14c566710472e3221925ade74 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 14:38:41 -0400
Subject: [PATCH 143/222] Speed up assigning targets to ADP clusters

---
 hapaseg/run_coverage_MCMC.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index ddd5c7d..1e25b28 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -148,15 +148,19 @@ def assign_clusters(self):
         clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample]
         clust_u, clust_uj = np.unique(clust_choice, return_inverse=True)
         clust_uj = clust_uj.reshape(clust_choice.shape)
+        cuj_max = clust_uj.max() + 1
+        self.SNPs["clust_choice"] = clust_uj
 
         # assign coverage intervals to clusters
         Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1])
 
         # first compute assignment probabilities based on the SNPs within each bin
-        for targ, snp_idx in self.SNPs.groupby("tidx").indices.items():
-            targ_clust_hist = np.bincount(clust_uj[snp_idx].ravel(), minlength=clust_uj.max()+1)
-
-            Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
+        for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["clust_choice"]):
+            if len(snp_idx) == 1:
+                Cov_clust_probs[int(targ), snp_idx] = 1.0
+            else: 
+                targ_clust_hist = np.bincount(snp_idx, minlength = cuj_max) 
+                Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
 
         # subset intervals containing SNPs
         overlap_idx = Cov_clust_probs.sum(1) > 0

From 4922536420eba1ab73e0370ddc1fe73cb8ad19be Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 14:43:49 -0400
Subject: [PATCH 144/222] Print log messages to stderr

---
 hapaseg/run_coverage_MCMC.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 1e25b28..b1f70dd 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -4,6 +4,7 @@
 import re
 import os
 import scipy.special as ss
+import sys
 import tqdm
 from capy import mut, seq
 import scipy.stats as stats
@@ -117,13 +118,13 @@ def load_covariates(self):
 
         # load GC content if we have it precomputed, otherwise generate it
         if wgs and self.f_GC is not None and os.path.exists(self.f_GC):
-            print("Using precomputed GC content")
+            print("Using precomputed GC content", file = sys.stderr)
             B = pd.read_pickle(self.f_GC)
             
             self.full_cov_df = self.full_cov_df.merge(B.rename(columns={"gc": "C_GC"}), left_on=["chr", "start", "end"],
                                                   right_on=["chr", "start", "end"], how="left")
         else:
-            print("Computing GC content")
+            print("Computing GC content", file = sys.stderr)
             self.generate_GC()
         
         self.full_cov_df["C_GC_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(
@@ -155,6 +156,7 @@ def assign_clusters(self):
         Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1])
 
         # first compute assignment probabilities based on the SNPs within each bin
+        print("Mapping SNPs to targets ...", file = sys.stderr)
         for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["clust_choice"]):
             if len(snp_idx) == 1:
                 Cov_clust_probs[int(targ), snp_idx] = 1.0

From e7673f0115eaf0dd7ebec9ed9aa1d853e31ba177 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 16:34:07 -0400
Subject: [PATCH 145/222] Don't use such small value for covars=0

---
 hapaseg/run_coverage_MCMC.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index b1f70dd..dd48a4b 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -94,6 +94,8 @@ def generate_GC(self):
         
 
     def load_covariates(self):
+        ## Target size
+
         #check if we are doing wgs, in which case we will have uniform 200 bp bins
         wgs = True if self.f_GC is not None or len(self.full_cov_df) > 100000 else False
         
@@ -105,6 +107,10 @@ def load_covariates(self):
             if (np.diff(self.full_cov_df["C_log_len"]) == 0).all():
                 #remove the len col since it will ruin beta fitting
                 self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1)
+
+        ## Replication timing
+        zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
+
         # load repl timing
         F = pd.read_pickle(self.f_repl)
         # map targets to RT intervals
@@ -113,8 +119,9 @@ def load_covariates(self):
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values
 
         # z-transform
-        self.full_cov_df["C_RT_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(
-            np.log(self.full_cov_df["C_RT"] + 1e-20))
+        self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"] + 0.01))
+
+        ## GC content
 
         # load GC content if we have it precomputed, otherwise generate it
         if wgs and self.f_GC is not None and os.path.exists(self.f_GC):
@@ -127,8 +134,7 @@ def load_covariates(self):
             print("Computing GC content", file = sys.stderr)
             self.generate_GC()
         
-        self.full_cov_df["C_GC_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(
-            np.log(self.full_cov_df["C_GC"] + 1e-20))
+        self.full_cov_df["C_GC_z"] = zt(np.log(self.full_cov_df["C_GC"] + 0.01))
         
         #set zero coverage bins to nan
         self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan)

From 8d3917c1190e68fd2b2daac297149129b5e1967f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 16:34:29 -0400
Subject: [PATCH 146/222] Don't use fragment std as a covar

---
 hapaseg/run_coverage_MCMC.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index dd48a4b..4bcd793 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -136,16 +136,16 @@ def load_covariates(self):
         
         self.full_cov_df["C_GC_z"] = zt(np.log(self.full_cov_df["C_GC"] + 0.01))
         
-        #set zero coverage bins to nan
+        ## Fragment length
+
+        # some bins have zero mean fragment length(!?); NaN these out
         self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan)
-        
-        # add fragment based covars
-        self.full_cov_df["C_frag_len"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["mean_frag_len"] + 1e-20))
-        self.full_cov_df["C_frag_std"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["std_frag_len"] + 1e-20))
+
+        self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
+        self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
 
         # drop non z-cetered cols
-        self.full_cov_df = self.full_cov_df.drop(['C_GC', 'C_RT'], axis=1)
-        
+        self.full_cov_df = self.full_cov_df.drop(columns = self.full_cov_df.columns[self.full_cov_df.columns.str.contains("C_.*[^z]$")], axis=1)
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned

From add24cb6ad70d6ba9c3ccf8e14a01c2ff708d349 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 17:04:21 -0400
Subject: [PATCH 147/222] Remove empty clusters; remove extreme outlier targets

---
 hapaseg/run_coverage_MCMC.py | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 4bcd793..d3905ca 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -151,7 +151,7 @@ def load_covariates(self):
     # clusters with snps from different clusters are probabliztically assigned
     # method returns coverage df with only bins that overlap snps
     def assign_clusters(self):
-        # generate unique clust assignments
+        ## generate unique clust assignments
         clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample]
         clust_u, clust_uj = np.unique(clust_choice, return_inverse=True)
         clust_uj = clust_uj.reshape(clust_choice.shape)
@@ -177,14 +177,15 @@ def assign_clusters(self):
         # zero out improbable assignments and re-normalilze
         Cov_clust_probs_overlap[Cov_clust_probs_overlap < 0.05] = 0
         Cov_clust_probs_overlap /= Cov_clust_probs_overlap.sum(1)[:, None]
-        # prune garbage clusters
+        # prune empty clusters
         prune_idx = Cov_clust_probs_overlap.sum(0) > 0
         Cov_clust_probs_overlap = Cov_clust_probs_overlap[:, prune_idx]
         num_pruned_clusters = Cov_clust_probs_overlap.shape[1]
-        # subsetting to only targets that overlap SNPs
+
+        ## subsetting to only targets that overlap SNPs
         Cov_overlap = self.full_cov_df.loc[overlap_idx, :]
 
-        # probabilistically assign each ambiguous coverage bin to a cluster
+        ## probabilistically assign each ambiguous coverage bin to a cluster
         # for now we will take maximum instead
         amb_mask = np.max(Cov_clust_probs_overlap, 1) != 1
         amb_assgn_probs = Cov_clust_probs_overlap[amb_mask, :]
@@ -197,7 +198,7 @@ def assign_clusters(self):
         # update with assigned values
         Cov_clust_probs_overlap[amb_mask, :] = new_onehot
 
-        #downsampling for wgs
+        ## downsampling for wgs
         if len(Cov_clust_probs_overlap) > 20000:
             downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2
             Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask]
@@ -210,14 +211,16 @@ def assign_clusters(self):
 
         Cov_overlap = Cov_overlap.loc[~bad_bins, :]
         Pi = filtered.copy()
+        Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1)
        
         r = np.c_[Cov_overlap["covcorr"]]
         
         covar_columns = sorted([c for c in Cov_overlap.columns if 'C_' in c])
-        # making covariate matrix
+
+        ## making covariate matrix
         C = np.c_[Cov_overlap[covar_columns]]
 
-        # dropping Nans
+        ## dropping Nans
         naidx = np.isnan(C).any(axis=1)
         # drop zero coverage bins as well (this is to account for a bug in coverage collector) TODO: remove need for this
         naidx = np.logical_or(naidx, (r==0).flatten())
@@ -227,14 +230,22 @@ def assign_clusters(self):
 
         Cov_overlap = Cov_overlap.iloc[~naidx]
         
-        #removing outliers
+        ## removing coverage outliers
         outlier_mask = find_outliers(r)
         r = r[~outlier_mask]
         C = C[~outlier_mask]
         Pi = Pi[~outlier_mask]
         Cov_overlap = Cov_overlap.iloc[~outlier_mask]
-        
-        Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1)
+
+        # some clusters may have been eliminated by this point; prune them from Pi
+        Pi = Pi[:, Pi.sum(0) > 0]
+ 
+        ## remove covariate outliers (+- 6 sigma)
+        covar_outlier_idx = (Cov_overlap.loc[:, covar_columns].abs() < 6).all(axis = 1)
+        Cov_overlap = Cov_overlap.loc[covar_outlier_idx]
+        Pi = Pi[covar_outlier_idx, :]
+        r = r[covar_outlier_idx]
+        C = C[covar_outlier_idx, :]
 
         return Pi, r, C, Cov_overlap
 

From 3b5fb405c7607ed15ae155444d8bc7544b26d6da Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Wed, 27 Apr 2022 22:46:51 -0400
Subject: [PATCH 148/222] Keep non-Z transformed columns

---
 hapaseg/run_coverage_MCMC.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index d3905ca..fc791a2 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -144,8 +144,6 @@ def load_covariates(self):
         self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
         self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
 
-        # drop non z-cetered cols
-        self.full_cov_df = self.full_cov_df.drop(columns = self.full_cov_df.columns[self.full_cov_df.columns.str.contains("C_.*[^z]$")], axis=1)
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned
@@ -215,7 +213,7 @@ def assign_clusters(self):
        
         r = np.c_[Cov_overlap["covcorr"]]
         
-        covar_columns = sorted([c for c in Cov_overlap.columns if 'C_' in c])
+        covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
 
         ## making covariate matrix
         C = np.c_[Cov_overlap[covar_columns]]

From d8b7e221bd16da205e5af3a6e3ff40a05b035ec1 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Apr 2022 07:19:07 -0400
Subject: [PATCH 149/222] Don't log transform covariates

---
 hapaseg/run_coverage_MCMC.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index fc791a2..b7ddf39 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -119,7 +119,7 @@ def load_covariates(self):
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values
 
         # z-transform
-        self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"] + 0.01))
+        self.full_cov_df["C_RT_z"] = zt(self.full_cov_df["C_RT"])
 
         ## GC content
 
@@ -134,7 +134,7 @@ def load_covariates(self):
             print("Computing GC content", file = sys.stderr)
             self.generate_GC()
         
-        self.full_cov_df["C_GC_z"] = zt(np.log(self.full_cov_df["C_GC"] + 0.01))
+        self.full_cov_df["C_GC_z"] = zt(self.full_cov_df["C_GC"])
         
         ## Fragment length
 

From 2a3b1fb6d1df6b6ad710ae0475610d591eb8e5be Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Apr 2022 14:33:52 -0400
Subject: [PATCH 150/222] Minor typo fix

---
 hapaseg/run_coverage_MCMC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index b7ddf39..05a40dc 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -157,7 +157,7 @@ def assign_clusters(self):
         self.SNPs["clust_choice"] = clust_uj
 
         # assign coverage intervals to clusters
-        Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1])
+        Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max])
 
         # first compute assignment probabilities based on the SNPs within each bin
         print("Mapping SNPs to targets ...", file = sys.stderr)

From 70d924c88fbb8be5e6a17af082a1e2381261a06b Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Apr 2022 14:37:05 -0400
Subject: [PATCH 151/222] Save ADP segmentation samples

---
 hapaseg/__main__.py | 6 ++++++
 wolF/tasks.py       | 1 +
 2 files changed, 7 insertions(+)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 2474c9e..aebf06c 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -444,12 +444,18 @@ def main():
         snps_to_clusters, snps_to_phases, likelihoods = A.run()
 
         # save DP results
+        # SNP assignment/phasing samples, likelihoods of each sample
         np.savez(output_dir + "/allelic_DP_SNP_clusts_and_phase_assignments.npz",
                  snps_to_clusters=snps_to_clusters,
                  snps_to_phases=snps_to_phases,
                  likelihoods=likelihoods
                  )
 
+        # segmentation breakpoints for each sample
+        with open(output_dir + "/segmentations.pickle", "wb") as f:
+            pickle.dump(A.DP_run.segment_trace, f)
+
+        # full SNP dataframe
         A.SNPs.to_pickle(output_dir + "/all_SNPs.pickle")
 
         #
diff --git a/wolF/tasks.py b/wolF/tasks.py
index d2213fb..42179fa 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -118,6 +118,7 @@ class Hapaseg_allelic_DP(wolf.Task):
     output_patterns = {
       "cluster_and_phase_assignments" : "allelic_DP_SNP_clusts_and_phase_assignments.npz",
       "all_SNPs" : "all_SNPs.pickle",
+      "segmentation_breakpoints" : "segmentations.pickle",
       "likelihood_trace_plot" : "figures/likelihood_trace.png",
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",

From 87cafef7fbeb967a329e8ee34ed4528bece30aa0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Apr 2022 14:43:51 -0400
Subject: [PATCH 152/222] Bump ADP memory

---
 wolF/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 42179fa..6076e0a 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -124,7 +124,7 @@ class Hapaseg_allelic_DP(wolf.Task):
       "seg_plot" : "figures/segs_only.png",
     }
     docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v789"
-    resources = { "mem" : "5G" }
+    resources = { "mem" : "8G" }
 
 class Hapaseg_prepare_coverage_mcmc(wolf.Task):
     inputs = {

From 68036f03a487f3d82a59606d91ab122596364890 Mon Sep 17 00:00:00 2001
From: Oliver Priebe <opriebe@broadinstitute.org>
Date: Thu, 28 Apr 2022 22:19:34 +0000
Subject: [PATCH 153/222] remove arbitrary wgs threshold for using bin lens

also rename columns so covariates don't get pulled in repeatedly
---
 hapaseg/run_coverage_MCMC.py | 37 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 19 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index b7ddf39..a6325f2 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -96,17 +96,13 @@ def generate_GC(self):
     def load_covariates(self):
         ## Target size
 
-        #check if we are doing wgs, in which case we will have uniform 200 bp bins
-        wgs = True if self.f_GC is not None or len(self.full_cov_df) > 100000 else False
-        
-        #we only need bin size if doing exomes
-        if not wgs:
-            self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1)
+        # we only need bin size if doing exomes but we can check by looking at the bin lengths
+        self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1)
             
-            #this is a safety in case we are doing wgs but have few bins
-            if (np.diff(self.full_cov_df["C_log_len"]) == 0).all():
-                #remove the len col since it will ruin beta fitting
-                self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1)
+        # in case we are doing wgs these will all be the same and we must remove
+        if (np.diff(self.full_cov_df["C_log_len"]) == 0).all():
+            #remove the len col since it will ruin beta fitting
+            self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1)
 
         ## Replication timing
         zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
@@ -143,7 +139,9 @@ def load_covariates(self):
 
         self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
         self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
-
+        
+        #rename non z-centered columns so that they arent pulled in as covariates
+        self.full_cov_df.rename({'C_frag_len':'frag_len', 'C_RT':'RT', 'C_GC':'GC'}, axis=1)
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned
@@ -187,14 +185,15 @@ def assign_clusters(self):
         # for now we will take maximum instead
         amb_mask = np.max(Cov_clust_probs_overlap, 1) != 1
         amb_assgn_probs = Cov_clust_probs_overlap[amb_mask, :]
-        #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters],
-        #                                       p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
-        new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
-        new_onehot = np.zeros((new_assgn.size, num_pruned_clusters))
-        new_onehot[np.arange(new_assgn.size), new_assgn] = 1
-
-        # update with assigned values
-        Cov_clust_probs_overlap[amb_mask, :] = new_onehot
+        if amb_mask.sum() > 0:
+            #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters],
+            #                                       p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
+            new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
+            new_onehot = np.zeros((new_assgn.size, num_pruned_clusters))
+            new_onehot[np.arange(new_assgn.size), new_assgn] = 1
+    
+            # update with assigned values
+            Cov_clust_probs_overlap[amb_mask, :] = new_onehot
 
         ## downsampling for wgs
         if len(Cov_clust_probs_overlap) > 20000:

From 3fc423d734a6b362c164908fb7d2cd805e09206d Mon Sep 17 00:00:00 2001
From: Oliver Priebe <opriebe@broadinstitute.org>
Date: Thu, 28 Apr 2022 22:47:48 +0000
Subject: [PATCH 154/222] committing to ^C_.*_z$ covar convention

---
 hapaseg/a_cov_DP.py          | 2 +-
 hapaseg/run_coverage_MCMC.py | 7 ++-----
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/hapaseg/a_cov_DP.py b/hapaseg/a_cov_DP.py
index 1db9b17..1c2e0a5 100644
--- a/hapaseg/a_cov_DP.py
+++ b/hapaseg/a_cov_DP.py
@@ -63,7 +63,7 @@ def generate_acdp_df(SNP_path, # path to SNP df
         print('concatenating dp run ', draw_num)
         a_cov_seg_df = dp_run.cov_df.copy()
 
-        covar_cols = sorted([c for c in a_cov_seg_df.columns if "C_" in c])
+        covar_cols = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
         # add minor and major allele counts for each bin to the cov_seg_df here to allow for beta draws on the fly for each segment
         a_cov_seg_df['min_count'] = 0
         a_cov_seg_df['maj_count'] = 0
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index a6325f2..78ac27f 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -120,7 +120,7 @@ def load_covariates(self):
         ## GC content
 
         # load GC content if we have it precomputed, otherwise generate it
-        if wgs and self.f_GC is not None and os.path.exists(self.f_GC):
+        if self.f_GC is not None and os.path.exists(self.f_GC):
             print("Using precomputed GC content", file = sys.stderr)
             B = pd.read_pickle(self.f_GC)
             
@@ -140,9 +140,6 @@ def load_covariates(self):
         self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
         self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
         
-        #rename non z-centered columns so that they arent pulled in as covariates
-        self.full_cov_df.rename({'C_frag_len':'frag_len', 'C_RT':'RT', 'C_GC':'GC'}, axis=1)
-
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned
     # method returns coverage df with only bins that overlap snps
@@ -348,7 +345,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None,
     # along with the bin exposure
     endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1)
     # generate covars
-    covar_columns = sorted([c for c in cov_df.columns if 'C_' in c])
+    covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
     C = np.c_[cov_df[covar_columns]]
     # do regression
     pois_regr = PoissonRegression(endog, C, np.ones(endog.shape))

From 4f27e28be30a51974df56d1c0e1712da7aefe346 Mon Sep 17 00:00:00 2001
From: Oliver Priebe <opriebe@broadinstitute.org>
Date: Fri, 29 Apr 2022 13:19:22 +0000
Subject: [PATCH 155/222] fix cov_df naming issue and update cov_DP covar
 gather

---
 hapaseg/a_cov_DP.py          | 2 +-
 hapaseg/coverage_DP.py       | 2 +-
 hapaseg/run_coverage_MCMC.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/hapaseg/a_cov_DP.py b/hapaseg/a_cov_DP.py
index 1c2e0a5..975908b 100644
--- a/hapaseg/a_cov_DP.py
+++ b/hapaseg/a_cov_DP.py
@@ -63,7 +63,7 @@ def generate_acdp_df(SNP_path, # path to SNP df
         print('concatenating dp run ', draw_num)
         a_cov_seg_df = dp_run.cov_df.copy()
 
-        covar_cols = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
+        covar_cols = sorted(a_cov_seg_df.columns[a_cov_seg_df.columns.str.contains("^C_.*_z$")])
         # add minor and major allele counts for each bin to the cov_seg_df here to allow for beta draws on the fly for each segment
         a_cov_seg_df['min_count'] = 0
         a_cov_seg_df['maj_count'] = 0
diff --git a/hapaseg/coverage_DP.py b/hapaseg/coverage_DP.py
index 07a85e8..17687f2 100644
--- a/hapaseg/coverage_DP.py
+++ b/hapaseg/coverage_DP.py
@@ -116,7 +116,7 @@ def __init__(self, cov_df, beta, bin_exposure, prior_run=None, count_prior_sum=N
         self.seg_id_col = self.cov_df.columns.get_loc('segment_ID')
         self.beta = beta
         self.bin_exposure=bin_exposure
-        self.covar_cols = sorted([c for c in self.cov_df.columns if "C_" in c])
+        self.covar_cols = sorted(self.cov_df.columns[self.cov_df.columns.str.contains("^C_.*_z$")])
         
         self.num_segments = self.cov_df.iloc[:, self.seg_id_col].max() + 1
         self.segment_r_list = [None] * self.num_segments
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 78ac27f..d58a06f 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -345,7 +345,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None,
     # along with the bin exposure
     endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1)
     # generate covars
-    covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
+    covar_columns = sorted(cov_df.columns[cov_df.columns.str.contains("^C_.*_z$")])
     C = np.c_[cov_df[covar_columns]]
     # do regression
     pois_regr = PoissonRegression(endog, C, np.ones(endog.shape))

From 53627cc8ebaa077ad41bfe274f72b554a2f48b4d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 29 Apr 2022 10:21:12 -0400
Subject: [PATCH 156/222] Draft code for binning fraglen

---
 hapaseg/run_coverage_MCMC.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 05a40dc..0c36f41 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -144,6 +144,19 @@ def load_covariates(self):
         self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
         self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
 
+        # generate on 10x and 50x scales
+        # TODO: use rolling window rather than disjoint bins
+        for scale in [10, 50]:
+            fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
+            wt = self.full_cov_df["num_reads"].values
+            fl = np.pad(fl, (0, scale - (len(fl) % scale))).reshape(-1, scale)
+            wt = np.pad(wt, (0, scale - (len(wt) % scale))).reshape(-1, scale)
+            wt = wt/wt.sum(1, keepdims = True)
+            self.full_cov_df[f"C_frag_len_{scale}x"] = np.tile(
+              np.einsum('ij,ij->i', wt, fl),
+              [scale, 1]
+            ).T.ravel()[:len(self.full_cov_df)]
+            self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"])
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned

From bfa98a99deb271edb6c729d038b450e14b87cd85 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 29 Apr 2022 10:21:28 -0400
Subject: [PATCH 157/222] Unused code for mapping intervals to segments

---
 hapaseg/run_coverage_MCMC.py | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 0c36f41..5259cfb 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -181,8 +181,33 @@ def assign_clusters(self):
                 targ_clust_hist = np.bincount(snp_idx, minlength = cuj_max) 
                 Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
 
-        # subset intervals containing SNPs
+#        # assign coverage intervals to allelic segments
+#        # TODO: segmentation boundary will be passed directly in, so we don't have to recompute it
+#        seg_bdy = np.flatnonzero(np.r_[1, np.diff(self.SNPs["clust_choice"]), 1] != 0)
+#        seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
+#        self.SNPs["seg_idx"] = 0
+#        for i, (st, en) in enumerate(seg_bdy):
+#            self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i
+#        seg_idx_max = self.SNPs["seg_idx"].max() + 1
+#
+#        Cov_clust_probs_seg = np.zeros([len(self.full_cov_df), seg_idx_max])
+#
+#        for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["seg_idx"]):
+#            if len(snp_idx) == 1:
+#                Cov_clust_probs_seg[int(targ), snp_idx] = 1.0
+#            else: 
+#                targ_clust_hist = np.bincount(snp_idx, minlength = seg_idx_max) 
+#                Cov_clust_probs_seg[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
+#
+#        # XXX: temporary
+#        Cov_clust_probs = Cov_clust_probs_seg
+
+        ## subset to targets containing SNPs
         overlap_idx = Cov_clust_probs.sum(1) > 0
+#        # add targets within a 2 targ radius
+#        overlap_idx = np.flatnonzero(Cov_clust_probs.sum(1) > 0)[:, None]
+#        overlap_idx = overlap_idx + np.c_[-2:3].T
+#        overlap_idx = np.sort(np.unique((overlap_idx + np.c_[-2:3].T).ravel()))
         Cov_clust_probs_overlap = Cov_clust_probs[overlap_idx, :]
 
         # zero out improbable assignments and re-normalilze

From 1374932713929bae834aff2d6712f61ac4c4286a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 29 Apr 2022 10:21:38 -0400
Subject: [PATCH 158/222] Temporarily quit downsampling

---
 hapaseg/run_coverage_MCMC.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 5259cfb..79cb756 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -235,10 +235,10 @@ def assign_clusters(self):
         Cov_clust_probs_overlap[amb_mask, :] = new_onehot
 
         ## downsampling for wgs
-        if len(Cov_clust_probs_overlap) > 20000:
-            downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2
-            Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask]
-            Cov_overlap = Cov_overlap.iloc[downsample_mask]
+#        if len(Cov_clust_probs_overlap) > 20000:
+#            downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2
+#            Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask]
+#            Cov_overlap = Cov_overlap.iloc[downsample_mask]
     
         # remove clusters with fewer than 4 assigned coverage bins (remove these coverage bins as well)
         bad_clusters = Cov_clust_probs_overlap.sum(0) < 4

From 472aa6ae584e648c6cff7c35f641df0554f9880c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 2 May 2022 13:23:44 -0400
Subject: [PATCH 159/222] Set ADP betahyp dynamically

---
 hapaseg/allelic_DP.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 6c6cb86..0ae9eb5 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -93,7 +93,7 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
         self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
         self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F")
 
-        self.betahyp = 10
+        self.betahyp = self.S.loc[:, ["min", "maj"]].sum(1).mean()/2
 
         #
         # define column indices

From 4ce8fac1d69d31f0fb4e69dec07a861063b88cb3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 2 May 2022 14:49:30 -0400
Subject: [PATCH 160/222] Map coverage intervals to allelic segments

---
 hapaseg/run_coverage_MCMC.py | 49 +++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index cbcd51e..810a49d 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -1,5 +1,6 @@
 import numpy as np
 import pandas as pd
+import pickle
 import glob
 import re
 import os
@@ -18,6 +19,7 @@ def __init__(self,
                  coverage_csv,
                  f_allelic_clusters,
                  f_SNPs,
+                 f_segs,
                  f_repl,
                  ref_fasta,
                  f_GC=None,
@@ -33,6 +35,8 @@ def __init__(self,
         self.ref_fasta = ref_fasta
 
         self.allelic_clusters = np.load(f_allelic_clusters)
+        with open(f_segs, "rb") as f:
+            self.segmentations = pickle.load(f)
         # coverage input is expected to be a df file with columns: ["chr", "start", "end", "covcorr", "covraw"]
         self.full_cov_df = self.load_coverage(coverage_csv)
         self.load_covariates()
@@ -165,38 +169,31 @@ def assign_clusters(self):
         cuj_max = clust_uj.max() + 1
         self.SNPs["clust_choice"] = clust_uj
 
-        # assign coverage intervals to clusters
+        ## assign coverage intervals to allelic clusters and segments
+        # assignment probabilities of each coverage interval -> allelic cluster
         Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max])
 
+        # get allelic segment boundaries
+        seg_bdy = np.r_[list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)]
+        seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
+        self.SNPs["seg_idx"] = 0
+        for i, (st, en) in enumerate(seg_bdy):
+            self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i
+
         # first compute assignment probabilities based on the SNPs within each bin
+        # segments just get assigned to the maximum probability
+        self.full_cov_df["seg_idx"] = -1
         print("Mapping SNPs to targets ...", file = sys.stderr)
-        for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["clust_choice"]):
-            if len(snp_idx) == 1:
-                Cov_clust_probs[int(targ), snp_idx] = 1.0
+        for targ, D in tqdm.tqdm(self.SNPs.groupby("tidx")[["clust_choice", "seg_idx"]]):
+            clust_idx = D["clust_choice"].values
+            seg_idx = D["seg_idx"].values
+            if len(clust_idx) == 1:
+                Cov_clust_probs[int(targ), clust_idx] = 1.0
+                self.full_cov_df.at[int(targ), "seg_idx"] = seg_idx[0]
             else: 
-                targ_clust_hist = np.bincount(snp_idx, minlength = cuj_max) 
+                targ_clust_hist = np.bincount(clust_idx, minlength = cuj_max) 
                 Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
-
-#        # assign coverage intervals to allelic segments
-#        # TODO: segmentation boundary will be passed directly in, so we don't have to recompute it
-#        seg_bdy = np.flatnonzero(np.r_[1, np.diff(self.SNPs["clust_choice"]), 1] != 0)
-#        seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
-#        self.SNPs["seg_idx"] = 0
-#        for i, (st, en) in enumerate(seg_bdy):
-#            self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i
-#        seg_idx_max = self.SNPs["seg_idx"].max() + 1
-#
-#        Cov_clust_probs_seg = np.zeros([len(self.full_cov_df), seg_idx_max])
-#
-#        for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["seg_idx"]):
-#            if len(snp_idx) == 1:
-#                Cov_clust_probs_seg[int(targ), snp_idx] = 1.0
-#            else: 
-#                targ_clust_hist = np.bincount(snp_idx, minlength = seg_idx_max) 
-#                Cov_clust_probs_seg[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
-#
-#        # XXX: temporary
-#        Cov_clust_probs = Cov_clust_probs_seg
+                self.full_cov_df.at[int(targ), "seg_idx"] = np.bincount(seg_idx).argmax()
 
         ## subset to targets containing SNPs
         overlap_idx = Cov_clust_probs.sum(1) > 0

From 57032aefce9b896a6b5d37acc4128cd5188d42a0 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 2 May 2022 17:11:08 -0400
Subject: [PATCH 161/222] Expand chrbdy plot to fill ylim

---
 hapaseg/utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/hapaseg/utils.py b/hapaseg/utils.py
index 46712d3..5b78213 100644
--- a/hapaseg/utils.py
+++ b/hapaseg/utils.py
@@ -46,11 +46,14 @@ def plot_chrbdy(cytoband_file):
     chrbdy = parse_cytoband(cytoband_file)
 
     # plot chromosome boundaries
+    yl_0 = plt.ylim()[0]
+    yl_1 = plt.ylim()[1]
     chr_ends = chrbdy.loc[1::2, "end"].cumsum()
     for end in chr_ends[:-1]:
         plt.axvline(end, color = 'k')
     for st, en in np.c_[chr_ends[:-1:2], chr_ends[1::2]]:
-        plt.fill_between([st, en], 0, 1, color = [0.9, 0.9, 0.9], zorder = 0)
+        plt.fill_between([st, en], yl_0, yl_1, color = [0.9, 0.9, 0.9], zorder = 0)
+    plt.ylim([yl_0, yl_1])
 
     # plot centromere locations
     for cent in (np.c_[chrbdy.loc[1::2, "start"], chrbdy.loc[::2, "end"]] + np.c_[np.r_[0, chr_ends[:-1]]]).ravel():

From 352b93ad3b9bbabe9e182f83013cc16bf97662c2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 2 May 2022 17:12:34 -0400
Subject: [PATCH 162/222] Save allelic segmentation boundaries WRT coverage
 dataframe

---
 hapaseg/__main__.py | 19 +++++++++++++++++++
 wolF/tasks.py       |  3 ++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index aebf06c..aa84d26 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -125,6 +125,7 @@ def parse_args():
     coverage_mcmc.add_argument("--allelic_clusters_object",
                                help="npy file containing allelic dp segs-to-clusters results")
     coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs")
+    coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     coverage_mcmc.add_argument("--covariate_dir",
                                help="path to covariate directory with covariates all in pickled files")
     coverage_mcmc.add_argument("--num_draws", type=int,
@@ -145,6 +146,7 @@ def parse_args():
     preprocess_coverage_mcmc.add_argument("--allelic_clusters_object",
                                           help="npy file containing allelic dp segs-to-clusters results", required=True)
     preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True)
+    preprocess_coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True)
     preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None)
     preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int,
@@ -504,17 +506,34 @@ def main():
 
     ## preprocess ADP data to run scattered coverage mcmc jobs on each ADP cluster
     elif args.command == "coverage_mcmc_preprocess":
+        ## perform initial Poisson regression
         cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv,
                                              args.allelic_clusters_object,
                                              args.SNPs_pickle,
+                                             args.segmentations,
                                              args.ref_fasta,
                                              f_repl=args.repl_pickle,
                                              f_GC=args.gc_pickle,
                                              allelic_sample=args.allelic_sample)
         Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster()
+
+        ## create chunks for both burnin and scatter
+        cov_df = cov_df.sort_values("start_g", ignore_index = True)
+
+        # indices of coverage bins 
+        seg_g = cov_df.groupby("seg_idx")
+        seg_g_idx = pd.Series(seg_g.indices).to_frame(name = "indices")
+        seg_g_idx["allelic_cluster"] = seg_g["allelic_cluster"].first()
+        seg_g_idx["n_cov_bins"] = seg_g.size()
+
+        ## save
+        # regression matrices
         np.savez(os.path.join(output_dir, 'preprocess_data'), Pi=Pi, r=r, C=C, all_mu=all_mu,
                  global_beta=global_beta, adp_cluster=adp_cluster)
+        # coverage dataframe mapped 
         cov_df.to_pickle(os.path.join(output_dir, 'cov_df.pickle'))
+        # allelic segment indices into coverage dataframe
+        seg_g_idx.to_pickle(os.path.join(output_dir, 'allelic_seg_groups.pickle'))
 
     ## run scattered coverage mcmc job using preprocessed data
     elif args.command == "coverage_mcmc_shard":
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 6076e0a..96afa7c 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -151,7 +151,8 @@ def prolog(self):
 
     output_patterns = {
         "preprocess_data": "preprocess_data.npz",
-        "cov_df_pickle": "cov_df.pickle"
+        "cov_df_pickle": "cov_df.pickle",
+        "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
     docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"

From 7aa624a93c9b20c7e6e4f0f8f04c9aa0dc8afe7b Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 2 May 2022 17:21:26 -0400
Subject: [PATCH 163/222] Add segmentation pickle to wolF task

---
 hapaseg/__main__.py | 6 +++---
 wolF/tasks.py       | 2 ++
 wolF/workflow.py    | 1 +
 3 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index aa84d26..ee57783 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -125,7 +125,7 @@ def parse_args():
     coverage_mcmc.add_argument("--allelic_clusters_object",
                                help="npy file containing allelic dp segs-to-clusters results")
     coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs")
-    coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
+    coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     coverage_mcmc.add_argument("--covariate_dir",
                                help="path to covariate directory with covariates all in pickled files")
     coverage_mcmc.add_argument("--num_draws", type=int,
@@ -146,7 +146,7 @@ def parse_args():
     preprocess_coverage_mcmc.add_argument("--allelic_clusters_object",
                                           help="npy file containing allelic dp segs-to-clusters results", required=True)
     preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True)
-    preprocess_coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
+    preprocess_coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True)
     preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None)
     preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int,
@@ -510,7 +510,7 @@ def main():
         cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv,
                                              args.allelic_clusters_object,
                                              args.SNPs_pickle,
-                                             args.segmentations,
+                                             args.segmentations_pickle,
                                              args.ref_fasta,
                                              f_repl=args.repl_pickle,
                                              f_GC=args.gc_pickle,
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 96afa7c..3b234f6 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -131,6 +131,7 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task):
         "coverage_csv": None,
         "allelic_clusters_object": None,
         "SNPs_pickle": None,
+        "segmentations_pickle": None,
         "repl_pickle": None,
         "gc_pickle":"",
         "allelic_sample":"",
@@ -141,6 +142,7 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task):
     --ref_fasta ${ref_fasta} \
     --allelic_clusters_object ${allelic_clusters_object} \
     --SNPs_pickle ${SNPs_pickle} \
+    --segmentations_pickle ${segmentations_pickle} \
     --repl_pickle ${repl_pickle}"""
     
     def prolog(self):
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 21ea515..65a0fc9 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -503,6 +503,7 @@ def concat_arm_level_results(arm_results):
         "coverage_csv":tumor_cov_gather_task["coverage"], #each scatter result is the same
         "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"],
         "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'],
+        "segmentations_pickle":hapaseg_allelic_DP_task['segmentation_breakpoints'],
         "repl_pickle":ref_config["repl_file"],
         "gc_pickle":ref_config["gc_file"],
         "ref_fasta":localization_task["ref_fasta"]

From 520364116a3768a7277106515d58de4f289205be Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 5 May 2022 11:21:35 -0400
Subject: [PATCH 164/222] Initial commit of scattering cov MCMC over allelic
 segments

---
 hapaseg/NB_coverage_MCMC.py |   6 +-
 hapaseg/__main__.py         | 115 ++++++++++++++++++++----------------
 wolF/tasks.py               |   8 ++-
 wolF/workflow.py            |  23 +++++++-
 4 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index 6275f39..b2473cc 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -1,6 +1,7 @@
 import numpy as np
 import scipy.special as ss
 import sortedcontainers as sc
+import sys
 from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB
 import warnings
 from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning
@@ -738,13 +739,12 @@ def prepare_results(self):
     """
 class NB_MCMC_SingleCluster:
 
-    def __init__(self, n_iter, r, C, mu, beta, cluster_num, bin_width=1):
+    def __init__(self, n_iter, r, C, mu, beta, bin_width=1):
         self.n_iter = n_iter
         self.r = r
         self.C = C
         self.beta = beta
         self.mu = mu
-        self.cluster_num = cluster_num
         self.bin_width = bin_width
         # for now assume that the Pi vector assigns each bin to exactly one cluster
         
@@ -788,7 +788,7 @@ def save_sample(self):
     def run(self,
             debug=False,
             stop_after_burnin=False):
-        print("starting MCMC coverage segmentation for cluster {}...".format(self.cluster_num), flush=True)
+        print("Starting MCMC coverage segmentation ...", flush=True, file=sys.stderr)
 
         past_it = 0
         n_it = 0
diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index ee57783..6c4d574 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -156,15 +156,15 @@ def parse_args():
     ## running coverage mcmc on single cluster for scatter task
     coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard",
                                                 help="run coverage mcmc on single ADP cluster")
-    coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data',
+    coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data: covariate matrix (C), global beta, ADP cluster mu\'s, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r)',
                                      required=True)
+    coverage_mcmc_shard.add_argument("--allelic_seg_indices", help='path to pickled pandas dataframe containing coverage bin indices for each alleic segment',
+                                     required=True)
+    coverage_mcmc_shard.add_argument("--allelic_seg_idx", help='which allelic segment to perform coverage segmentation on.',
+                                     required=True, type=int)
     coverage_mcmc_shard.add_argument("--num_draws", type=int,
                                help="number of draws to take from coverage segmentation MCMC", default=50)
-    coverage_mcmc_shard.add_argument("--cluster_num", type=int,
-                               help="cluster index for this worker to run on. If unspecified method will simulate "
-                                    "all clusters on the same machine", default=None)
     coverage_mcmc_shard.add_argument("--bin_width", type=int, default=1, help="size of uniform bins if using. Otherwise 1.")
-    coverage_mcmc_shard.add_argument("--range", type=str, help="range of coverage bins within the cluster to burnin. should be in start-end form. Note that this will cause num draws to be overridden to 1")
     coverage_mcmc_shard.add_argument("--burnin_files", type=str, help="txt file containing burnt in segment assignments")
 
     ## collect coverage MCMC shards
@@ -539,55 +539,70 @@ def main():
     elif args.command == "coverage_mcmc_shard":
         # load preprocessed data
         preprocess_data = np.load(args.preprocess_data)
-        # check to make sure that the cluster index is within the range
-        Pi = preprocess_data['Pi']
-        if args.cluster_num > Pi.shape[1] - 1:
-            raise ValueError("Received cluster number {}, which is out of range".format(args.cluster_num))
-        
+
         # extract preprocessed data from this cluster
-        mu = preprocess_data["all_mu"][args.cluster_num]
+        Pi = preprocess_data['Pi']
+        mu = preprocess_data["all_mu"]#[args.cluster_num]
         beta = preprocess_data["global_beta"]
         c_assignments = np.argmax(Pi, axis=1)
-        cluster_mask = (c_assignments == args.cluster_num)
-        r = preprocess_data['r'][cluster_mask]
-        C = preprocess_data['C'][cluster_mask]
-        
-        # if we get a range argument well be doing burnin on a subset of the coverage bins
-        if args.range is not None:
-            #parse range from string
-            range_lst = args.range.split('-')
-            st,en = int(range_lst[0]), int(range_lst[1]) 
-            if st > en or st < 0 or en > len(r):
-                raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r)))
-            
-            #trim data to our desired range
-            r = r[st:en]
-            C = C[st:en]
-            num_draws = 1
-            
-            # if we're just burning in a subset use different save strings
-            model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range)
-            data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range)
-            figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range)
-            
-        else:
-            #if not in burnin use the specified number of draws
-            num_draws = args.num_draws
-            
-            
-            model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num)
-            data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num)
-            figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num)
-        
-        # run on the specified cluster
-        cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width)
+        #cluster_mask = (c_assignments == args.cluster_num)
+        r = preprocess_data['r']#[cluster_mask]
+        C = preprocess_data['C']#[cluster_mask]
+
+        # load and (weakly) verify allelic segment indices
+        seg_g_idx = pd.read_pickle(args.allelic_seg_indices)
+        if len(np.hstack(seg_g_idx["indices"])) != C.shape[0]:
+            raise ValueError("Size mismatch between allelic segment assignments and coverage bin data!")
+
+        # subset to a single allelic segment
+        if args.allelic_seg_idx > len(seg_g_idx) - 1:
+            raise ValueError("Allelic segment index out of bounds!")
+
+        seg_indices = seg_g_idx.iloc[args.allelic_seg_idx]
+
+        mu = mu[seg_indices["allelic_cluster"]]
+        C = C[seg_indices["indices"], :]
+        r = r[seg_indices["indices"], :]
         
-        # if we're using burnin results load them now
-        if args.burnin_files is not None:
-            with open(args.burnin_files, 'r') as f:
-                file_list = f.read().splitlines()
-            assignments_arr = aggregate_burnin_files(file_list, args.cluster_num)
-            cov_mcmc.init_burnin(assignments_arr)
+        # run cov MCMC
+        cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.bin_width)
+
+#        # if we get a range argument well be doing burnin on a subset of the coverage bins
+#        if args.range is not None:
+#            #parse range from string
+#            range_lst = args.range.split('-')
+#            st,en = int(range_lst[0]), int(range_lst[1]) 
+#            if st > en or st < 0 or en > len(r):
+#                raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r)))
+#            
+#            #trim data to our desired range
+#            r = r[st:en]
+#            C = C[st:en]
+#            num_draws = 1
+#            
+#            # if we're just burning in a subset use different save strings
+#            model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range)
+#            data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range)
+#            figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range)
+#            
+#        else:
+#            #if not in burnin use the specified number of draws
+#            num_draws = args.num_draws
+#            
+#            
+#            model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num)
+#            data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num)
+#            figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num)
+#        
+#        # run on the specified cluster
+#        cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width)
+#        
+#        # if we're using burnin results load them now
+#        if args.burnin_files is not None:
+#            with open(args.burnin_files, 'r') as f:
+#                file_list = f.read().splitlines()
+#            assignments_arr = aggregate_burnin_files(file_list, args.cluster_num)
+#            cov_mcmc.init_burnin(assignments_arr)
 
         cov_mcmc.run()
 
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 3b234f6..860acfb 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -190,16 +190,18 @@ def prolog(self):
 
 class Hapaseg_coverage_mcmc(wolf.Task):
     inputs = {
-        "preprocess_data": None,
+        "preprocess_data": None,      # npz of covariate matrix (C), global beta, ADP cluster mu's, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r)
+        "allelic_seg_indices": None,  # dataframe containing indicies into C/r/all_mu for each allelic segment
+        "allelic_seg_scatter_idx": None,      # allelic segment to operate on (for scatter)
         "num_draws": 50,
-        "cluster_num": None,
         "bin_width":None,
         "burnin_files":""
     }
     script = """
     hapaseg coverage_mcmc_shard --preprocess_data ${preprocess_data} \
+    --allelic_seg_indices ${allelic_seg_idx} \
+    --allelic_seg_idx ${allelic_seg_scatter_idx} \
     --num_draws ${num_draws} \
-    --cluster_num ${cluster_num} \
     --bin_width ${bin_width}"""
      
     def prolog(self):
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 65a0fc9..784e3ed 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -509,7 +509,28 @@ def concat_arm_level_results(arm_results):
         "ref_fasta":localization_task["ref_fasta"]
         }
     )
-    
+
+    # shim task to get number of allelic segments
+    #   (coverage MCMC will be scattered over each allelic segment)
+    @prefect.task
+    def get_N_seg_groups(S):
+        return len(S)
+
+    N_cov_mcmc_shards = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"])
+
+    # TODO: modify burnin task to subset to these indices
+
+    # coverage MCMC burnin(?) <- do we still need to burnin separately?
+    cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin(
+        inputs={
+            "preprocess_data":prep_cov_mcmc_task["preprocess_data"],
+            "allelic_seg_indices":prep_cov_mcmc_task["allelic_seg_groups"],
+            "allelic_seg_scatter_idx":range(0, N_cov_mcmc_shards),
+            "num_draws":50,
+            "bin_width":bin_width,
+        }
+    )
+ 
     #get the cluster indices from the preprocess data and generate the burnin indices
     @prefect.task(nout=4)
     def _get_ADP_cluster_list(preprocess_data_obj):

From 11fa172c290d1f78756d3490f88be40391d556c2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 5 May 2022 13:17:40 -0400
Subject: [PATCH 165/222] Use specific version of interval splitter (for now)

---
 wolF/workflow.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 784e3ed..0509db0 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -38,7 +38,8 @@
 # for coverage collection
 split_intervals = wolf.ImportTask(
   task_path = "git@github.com:getzlab/split_intervals_TOOL.git",
-  task_name = "split_intervals"
+  task_name = "split_intervals",
+  commit = "dc102d8"
 )
 
 cov_collect = wolf.ImportTask(

From 217ee269d77dbf668a21a6999d08443e7c8b8c3d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 5 May 2022 14:05:40 -0400
Subject: [PATCH 166/222] Misc bugs in run_coverage_MCMC call from __main__

---
 hapaseg/__main__.py          | 1 +
 hapaseg/run_coverage_MCMC.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 6c4d574..77fea79 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -152,6 +152,7 @@ def parse_args():
     preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int,
                                           help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default",
                                           default=None)
+    preprocess_coverage_mcmc.add_argument("--ref_fasta", required = True)
 
     ## running coverage mcmc on single cluster for scatter task
     coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard",
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 810a49d..b093d69 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -20,8 +20,8 @@ def __init__(self,
                  f_allelic_clusters,
                  f_SNPs,
                  f_segs,
-                 f_repl,
                  ref_fasta,
+                 f_repl,
                  f_GC=None,
                  num_draws=50,
                  cluster_num=None,

From c0942d646b3064fd2e2f7f61a0ef60e26700bb39 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 5 May 2022 14:33:00 -0400
Subject: [PATCH 167/222] Bump some dockers

---
 wolF/tasks.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 860acfb..651d22b 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -123,7 +123,7 @@ class Hapaseg_allelic_DP(wolf.Task):
       "SNP_plot" : "figures/SNPs.png",
       "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v789"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v813"
     resources = { "mem" : "8G" }
 
 class Hapaseg_prepare_coverage_mcmc(wolf.Task):
@@ -157,7 +157,7 @@ def prolog(self):
         "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815"
     resources = { "mem" : "15G" }
 
 
@@ -214,7 +214,7 @@ def prolog(self):
         "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png'
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815"
     resources = {"mem" : "5G"}
 
 class Hapaseg_collect_coverage_mcmc(wolf.Task):

From 44344b2f59e98d731451246eec6d0bcbf854d278 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 5 May 2022 14:41:54 -0400
Subject: [PATCH 168/222] Added missing factor of 2

---
 hapaseg/NB_coverage_MCMC.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index b2473cc..53efda7 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -451,7 +451,7 @@ def _get_log_ML_approx_join(self, Hess):
 
     # computes ML component from hessian approximation for two split segments
     def _get_log_ML_split(self, H1, H2):
-        return np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2
+        return 2*np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2
 
     # computes the log ML of joining two segments
     def _log_ML_join(self, ind, ret_opt_params=False):

From 0e33a82363b9a76a6e7a74fb2fd4f5fb52af05db Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 10 May 2022 17:13:33 -0400
Subject: [PATCH 169/222] Use poscol instead of rename

---
 hapaseg/run_coverage_MCMC.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index b093d69..670da2f 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -108,13 +108,14 @@ def load_covariates(self):
             #remove the len col since it will ruin beta fitting
             self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1)
 
-        ## Replication timing
         zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
 
+        ## Replication timing
+
         # load repl timing
         F = pd.read_pickle(self.f_repl)
         # map targets to RT intervals
-        tidx = mut.map_mutations_to_targets(self.full_cov_df.rename(columns={"start": "pos"}), F, inplace=False)
+        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start")
         self.full_cov_df['C_RT'] = np.nan
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values
 

From b9bbf18f20376ea2e7a9c3297420e37836545354 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 10 May 2022 17:27:47 -0400
Subject: [PATCH 170/222] Use rolling convolution for smoothing fragment length

---
 hapaseg/run_coverage_MCMC.py | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 670da2f..091ff07 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -139,24 +139,22 @@ def load_covariates(self):
         
         ## Fragment length
 
-        # some bins have zero mean fragment length(!?); NaN these out
-        self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan)
+        # some bins have zero mean fragment length; these bins are bad and should be removed
+        self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True)
 
         self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
         self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
 
-        # generate on 10x and 50x scales
-        # TODO: use rolling window rather than disjoint bins
-        for scale in [10, 50]:
-            fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
-            wt = self.full_cov_df["num_reads"].values
-            fl = np.pad(fl, (0, scale - (len(fl) % scale))).reshape(-1, scale)
-            wt = np.pad(wt, (0, scale - (len(wt) % scale))).reshape(-1, scale)
-            wt = wt/wt.sum(1, keepdims = True)
-            self.full_cov_df[f"C_frag_len_{scale}x"] = np.tile(
-              np.einsum('ij,ij->i', wt, fl),
-              [scale, 1]
-            ).T.ravel()[:len(self.full_cov_df)]
+        # generate on 5x and 11x scales
+        swv = np.lib.stride_tricks.sliding_window_view
+        fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
+        wt = self.full_cov_df["num_reads"].values
+        for scale in [5, 11]:
+            fl_sw = swv(np.pad(fl, scale//2), scale)
+            wt_sw = swv(np.pad(wt, scale//2), scale)
+            conv = np.einsum('ij,ij->i', wt_sw, fl_sw)
+
+            self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1)
             self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"])
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters

From 3b44cef7fbad72aa38883e21c2f0bf044514120e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 10 May 2022 17:56:51 -0400
Subject: [PATCH 171/222] Generate DNAse/FAIRE covariates

---
 71_coverage_covariates.py | 80 +++++++++++++++++++++++++++++++++++++--
 covars/getmax.c           | 15 ++++++++
 2 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 covars/getmax.c

diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py
index 6c2bb1b..7b4525c 100644
--- a/71_coverage_covariates.py
+++ b/71_coverage_covariates.py
@@ -1,11 +1,13 @@
 import liftover
+import numpy as np
 import pandas as pd
 import pyfaidx
+import pyBigWig
 import tqdm
-from capy import mut
+from capy import mut, seq
 
 #
-# replication timing
+# replication timing {{{
 
 F = pd.read_csv("/mnt/j/proj/cnv/20201018_hapseg2/covars/GSE137764_H1_GaussiansGSE137764_mooth_scaled_autosome.mat", sep = "\t", header = None).T.rename(columns = { 0 : "chr", 1 : "start", 2 : "end" })
 F.iloc[:, 3:] = F.loc[:, 3:].astype(float)
@@ -13,7 +15,7 @@
 F["chr"] = mut.convert_chr(F["chr"])
 F.to_pickle("covars/GSE137764_H1.hg38.pickle")
 
-# liftover to hg19
+# liftover to hg19 {{{
 F["chr_start_lift"] = 0
 F["chr_end_lift"] = 0
 F["start_lift"] = 0
@@ -63,8 +65,14 @@
 (F["start_strand_lift"].notin(["+", "?"])) | \
 (F["start_lift"] > F["end_lift"])
 
+# }}}
+
+# }}}
+
 #
-# GC content
+# GC content {{{
+
+# note: this is obsolete; GC content is now computed on the fly
 
 B = pd.read_csv("/mnt/j/proj/cnv/20210326_coverage_collector/targets.bed", sep = "\t", header = None, names = ["chr", "start", "end"])
 B["chr"] = mut.convert_chr(B["chr"])
@@ -78,3 +86,67 @@
 
 B.to_pickle("covars/GC.pickle")
 
+# }}}
+
+#
+# DNAse HS/FAIRE {{{
+
+## DNAse {{{
+
+bw = pyBigWig.open("covars/wgEncodeUwDnaseGm12878RawRep1.bigWig")
+
+# WGS (2kb chunks)
+clen = seq.get_chrlens()
+C = []
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]):
+    bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
+    tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "DNAse" : 0 })
+    for j, (st, en) in enumerate(tqdm.tqdm(bins)):
+        tmp.loc[j, "DNAse"] = np.nanmean(np.r_[bw.values(chrname, st, en)])
+    C.append(tmp)
+
+# preliminary results not so great; stick with FAIRE for now
+
+# TODO: liftover to hg38
+
+# WES
+
+# }}}
+
+## FAIRE {{{
+
+## convert bigWig to FWB
+
+# for some reason pyBigWig can't process this file
+# bw = pyBigWig.open("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigwig")
+
+# use bigWig2FWB instead
+# git clone git@github.com:getzlab/bigWig2FWB.git
+
+# figure out range of file
+# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/bwtest
+# ./getmax
+# -> max = 5478
+# set scale factor to 11
+# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal
+
+## WGS
+from capy import fwb
+
+F = fwb.FWB("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb");
+
+clen = seq.get_chrlens()
+C = []
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]):
+    bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
+    tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "FAIRE" : 0 })
+    for j, (st, en) in enumerate(tqdm.tqdm(bins)):
+        tmp.loc[j, "FAIRE"] = F.get(chrname, np.r_[st:en] + 1).mean()
+    C.append(tmp)
+
+FAIRE = pd.concat(C, ignore_index = True)
+FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle")
+
+# }}}
+
+# }}}
diff --git a/covars/getmax.c b/covars/getmax.c
new file mode 100644
index 0000000..5bcc865
--- /dev/null
+++ b/covars/getmax.c
@@ -0,0 +1,15 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <byteswap.h>
+
+int main() {
+   FILE* x = fopen("wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb", "r");
+   uint16_t max = 0;
+   uint16_t buf;
+   while(fread(&buf, 2, 1, x)) {
+      buf = __bswap_16(buf);
+      if(buf > max) { max = buf; printf("%d\n", max); }
+   }
+   return 0;
+}

From 97030468faca07bd9e7f35fbd140a5b8365c6112 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 10 May 2022 18:20:07 -0400
Subject: [PATCH 172/222] Add FAIRE covariate to cov MCMC

---
 hapaseg/__main__.py          |  1 +
 hapaseg/run_coverage_MCMC.py | 15 ++++++++++++++-
 wolF/tasks.py                |  4 +++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 77fea79..1ae1e46 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -148,6 +148,7 @@ def parse_args():
     preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True)
     preprocess_coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True)
+    preprocess_coverage_mcmc.add_argument("--faire_pickle", help="pickled dataframe containing FAIRE data", required=True)
     preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None)
     preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int,
                                           help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default",
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 091ff07..47d7a7c 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -22,6 +22,7 @@ def __init__(self,
                  f_segs,
                  ref_fasta,
                  f_repl,
+                 f_faire,
                  f_GC=None,
                  num_draws=50,
                  cluster_num=None,
@@ -31,6 +32,7 @@ def __init__(self,
         self.num_draws = num_draws
         self.cluster_num = cluster_num
         self.f_repl = f_repl
+        self.f_faire = f_faire
         self.f_GC = f_GC
         self.ref_fasta = ref_fasta
 
@@ -136,7 +138,18 @@ def load_covariates(self):
             self.generate_GC()
         
         self.full_cov_df["C_GC_z"] = zt(self.full_cov_df["C_GC"])
-        
+
+        ## FAIRE
+
+        F = pd.read_pickle(self.f_faire)
+        # map targets to FAIRE intervals
+        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start")
+        self.full_cov_df['C_FAIRE'] = np.nan
+        self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values
+
+        # z-transform
+        self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"])
+
         ## Fragment length
 
         # some bins have zero mean fragment length; these bins are bad and should be removed
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 651d22b..3ef01e8 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -133,6 +133,7 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task):
         "SNPs_pickle": None,
         "segmentations_pickle": None,
         "repl_pickle": None,
+        "faire_pickle": "/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE_GM12878.hg19.pickle", # TODO: make remote
         "gc_pickle":"",
         "allelic_sample":"",
         "ref_fasta": None
@@ -143,7 +144,8 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task):
     --allelic_clusters_object ${allelic_clusters_object} \
     --SNPs_pickle ${SNPs_pickle} \
     --segmentations_pickle ${segmentations_pickle} \
-    --repl_pickle ${repl_pickle}"""
+    --repl_pickle ${repl_pickle} \
+    --faire_pickle ${faire_pickle}"""
     
     def prolog(self):
         if self.conf["inputs"]["gc_pickle"] != "":

From 07fea1e500a4f81957cbc0bf2be2fcb75a0cd93a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 10 May 2022 18:20:23 -0400
Subject: [PATCH 173/222] Get rid of chrY; convert chrnames

---
 71_coverage_covariates.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py
index 7b4525c..264962c 100644
--- a/71_coverage_covariates.py
+++ b/71_coverage_covariates.py
@@ -131,13 +131,13 @@
 # bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal
 
 ## WGS
-from capy import fwb
+from capy import fwb, mut
 
 F = fwb.FWB("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb");
 
 clen = seq.get_chrlens()
 C = []
-for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]):
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]):
     bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
     tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "FAIRE" : 0 })
     for j, (st, en) in enumerate(tqdm.tqdm(bins)):
@@ -145,6 +145,7 @@
     C.append(tmp)
 
 FAIRE = pd.concat(C, ignore_index = True)
+FAIRE["chr"] = mut.convert_chr(FAIRE["chr"])
 FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle")
 
 # }}}

From c00c1247680fe808b85db417d3e930cc8141b802 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 10 May 2022 18:53:43 -0400
Subject: [PATCH 174/222] Compute fraglen covariate first (to remove bad bins
 immediately)

---
 hapaseg/run_coverage_MCMC.py | 40 ++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 47d7a7c..4081152 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -112,6 +112,26 @@ def load_covariates(self):
 
         zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
 
+        ## Fragment length
+
+        # some bins have zero mean fragment length; these bins are bad and should be removed
+        self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True)
+
+        self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
+        self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
+
+        # generate on 5x and 11x scales
+        swv = np.lib.stride_tricks.sliding_window_view
+        fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
+        wt = self.full_cov_df["num_reads"].values
+        for scale in [5, 11]:
+            fl_sw = swv(np.pad(fl, scale//2), scale)
+            wt_sw = swv(np.pad(wt, scale//2), scale)
+            conv = np.einsum('ij,ij->i', wt_sw, fl_sw)
+
+            self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1)
+            self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"])
+
         ## Replication timing
 
         # load repl timing
@@ -150,26 +170,6 @@ def load_covariates(self):
         # z-transform
         self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"])
 
-        ## Fragment length
-
-        # some bins have zero mean fragment length; these bins are bad and should be removed
-        self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True)
-
-        self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
-        self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
-
-        # generate on 5x and 11x scales
-        swv = np.lib.stride_tricks.sliding_window_view
-        fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
-        wt = self.full_cov_df["num_reads"].values
-        for scale in [5, 11]:
-            fl_sw = swv(np.pad(fl, scale//2), scale)
-            wt_sw = swv(np.pad(wt, scale//2), scale)
-            conv = np.einsum('ij,ij->i', wt_sw, fl_sw)
-
-            self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1)
-            self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"])
-
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned
     # method returns coverage df with only bins that overlap snps

From 3e79b9fe11b87ca066b9fbf5c77763c7b04e95c2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 12 May 2022 13:45:30 -0400
Subject: [PATCH 175/222] Use midpoint when mapping bins to covariates

---
 hapaseg/run_coverage_MCMC.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 4081152..0dcb1e8 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -132,12 +132,16 @@ def load_covariates(self):
             self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1)
             self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"])
 
+        ### track-based covariates
+        # use midpoint of coverage bins to map to intervals
+        self.full_cov_df["midpoint"] = ((self.full_cov_df["end"] + self.full_cov_df["start"])/2).astype(int)
+
         ## Replication timing
 
         # load repl timing
         F = pd.read_pickle(self.f_repl)
         # map targets to RT intervals
-        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start")
+        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
         self.full_cov_df['C_RT'] = np.nan
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values
 
@@ -163,7 +167,7 @@ def load_covariates(self):
 
         F = pd.read_pickle(self.f_faire)
         # map targets to FAIRE intervals
-        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start")
+        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
         self.full_cov_df['C_FAIRE'] = np.nan
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values
 

From 725b7b9691e2b700c63d6a4ff125ae45c921f64e Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 12 May 2022 13:56:25 -0400
Subject: [PATCH 176/222] Make FAIRE optional

---
 hapaseg/run_coverage_MCMC.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 0dcb1e8..4b56d02 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -165,14 +165,15 @@ def load_covariates(self):
 
         ## FAIRE
 
-        F = pd.read_pickle(self.f_faire)
-        # map targets to FAIRE intervals
-        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
-        self.full_cov_df['C_FAIRE'] = np.nan
-        self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values
-
-        # z-transform
-        self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"])
+        if self.f_faire is not None:
+            F = pd.read_pickle(self.f_faire)
+            # map targets to FAIRE intervals
+            tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
+            self.full_cov_df['C_FAIRE'] = np.nan
+            self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values
+
+            # z-transform
+            self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"])
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned

From ee1f6b56b58c1f42c68372370a152254fdbe76eb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 12 May 2022 16:01:44 -0400
Subject: [PATCH 177/222] Add log exposure to Poisson regression

---
 hapaseg/model_optimizers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index 43d54d4..76e95c1 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -2,14 +2,15 @@
 
 
 class PoissonRegression:
-    def __init__(self, r, C, Pi):
+    def __init__(self, r, C, Pi, log_exposure = 0):
         self.r = r
         self.C = C
         self.Pi = Pi
+        self.log_exposure = log_exposure
 
         self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1]))
         self.beta = np.ones([C.shape[1], 1])
-        self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu)
+        self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure)
 
     # mu gradient
     def gradmu(self):
@@ -33,7 +34,7 @@ def hessmubeta(self):
 
     def NR_poisson(self):
         for i in range(100):
-            self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu)
+            self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure)
             gmu = self.gradmu()
             gbeta = self.gradbeta()
             grad = np.r_[gmu, gbeta]

From ce09d1fc6d39e8038f4623363b73a83ca15fbc4f Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 12 May 2022 16:35:37 -0400
Subject: [PATCH 178/222] Add smoothed FAIRE

---
 71_coverage_covariates.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py
index 264962c..137dcad 100644
--- a/71_coverage_covariates.py
+++ b/71_coverage_covariates.py
@@ -148,6 +148,11 @@
 FAIRE["chr"] = mut.convert_chr(FAIRE["chr"])
 FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle")
 
+# smoothed version
+FAIRE_smooth = FAIRE.copy()
+FAIRE_smooth["FAIRE"] = np.convolve(FAIRE["FAIRE"], np.ones(5), mode = "same")/5
+FAIRE_smooth.to_pickle("covars/FAIRE_GM12878.smooth5.hg19.pickle")
+
 # }}}
 
 # }}}

From 3dcd389d3e67cc2bbb48e41645baa6e1ae882312 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 13 May 2022 14:00:52 -0400
Subject: [PATCH 179/222] Ignore FWBs and NPZs in build context

---
 .dockerignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.dockerignore b/.dockerignore
index a160cc0..50af6bb 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,5 @@
 **/*.bam
 **/*.call_stats.txt
+**/*.fw?
+**/*.bigWig
+**/*.npz

From fdd53bd5d6b53979628413ae0fa13a6af839ba43 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 12:25:06 -0400
Subject: [PATCH 180/222] Add cache_invalidate to Dockerfile to force updating
 Python modules

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index 85baf96..249c37e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,6 +4,7 @@ WORKDIR /build
 
 # install dependencies
 RUN pip install sortedcontainers
+ARG cache_invalidate=xxx
 RUN git clone https://github.com/getzlab/CApy.git && pip install ./CApy
 RUN pip install dask distributed
 RUN pip install distinctipy

From 723fb610f2b1cc2596868c2307275c37889f0961 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 12:35:14 -0400
Subject: [PATCH 181/222] Add bin width to cov MCMC prep task

---
 hapaseg/__main__.py          |  2 ++
 hapaseg/run_coverage_MCMC.py |  6 ++++--
 wolF/tasks.py                | 31 ++++++++++++++++++-------------
 wolF/workflow.py             |  3 ++-
 4 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 1ae1e46..b18671f 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -154,6 +154,7 @@ def parse_args():
                                           help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default",
                                           default=None)
     preprocess_coverage_mcmc.add_argument("--ref_fasta", required = True)
+    preprocess_coverage_mcmc.add_argument("--bin_width", help = "Coverage bin width (for WGS only)", default = 1, type = int)
 
     ## running coverage mcmc on single cluster for scatter task
     coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard",
@@ -515,6 +516,7 @@ def main():
                                              args.segmentations_pickle,
                                              args.ref_fasta,
                                              f_repl=args.repl_pickle,
+                                             f_faire=args.faire_pickle,
                                              f_GC=args.gc_pickle,
                                              allelic_sample=args.allelic_sample)
         Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster()
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 4b56d02..5704bf3 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -26,7 +26,8 @@ def __init__(self,
                  f_GC=None,
                  num_draws=50,
                  cluster_num=None,
-                 allelic_sample=None
+                 allelic_sample=None,
+                 bin_width=1,
                  ):
 
         self.num_draws = num_draws
@@ -35,6 +36,7 @@ def __init__(self,
         self.f_faire = f_faire
         self.f_GC = f_GC
         self.ref_fasta = ref_fasta
+        self.bin_width = bin_width
 
         self.allelic_clusters = np.load(f_allelic_clusters)
         with open(f_segs, "rb") as f:
@@ -66,7 +68,7 @@ def run_all_clusters(self):
     # Do preprocessing for running on each ADP cluster individually
     def prepare_single_cluster(self):
         Pi, r, C, filtered_cov_df = self.assign_clusters()
-        pois_regr = PoissonRegression(r, C, Pi)
+        pois_regr = PoissonRegression(r, C, Pi, log_exposure = np.log(self.bin_width))
         all_mu, global_beta = pois_regr.fit()
 
         # save these results to a numpy object
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 3ef01e8..4bd5c5f 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -136,22 +136,27 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task):
         "faire_pickle": "/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE_GM12878.hg19.pickle", # TODO: make remote
         "gc_pickle":"",
         "allelic_sample":"",
-        "ref_fasta": None
+        "ref_fasta": None,
+        "bin_width" : 1 # only for whole genomes; for exomes, target lengths are passed as a covariate via the coverage CSV
     }
-    script = """
-    hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \
-    --ref_fasta ${ref_fasta} \
-    --allelic_clusters_object ${allelic_clusters_object} \
-    --SNPs_pickle ${SNPs_pickle} \
-    --segmentations_pickle ${segmentations_pickle} \
-    --repl_pickle ${repl_pickle} \
-    --faire_pickle ${faire_pickle}"""
+    def script(self):
+        script = """
+        hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \
+        --ref_fasta ${ref_fasta} \
+        --allelic_clusters_object ${allelic_clusters_object} \
+        --SNPs_pickle ${SNPs_pickle} \
+        --segmentations_pickle ${segmentations_pickle} \
+        --repl_pickle ${repl_pickle} \
+        --faire_pickle ${faire_pickle} \
+        --bin_width ${bin_width}
+        """
     
-    def prolog(self):
         if self.conf["inputs"]["gc_pickle"] != "":
-            self.conf["script"][-1] += " --gc_pickle ${gc_pickle}"
+            script += " --gc_pickle ${gc_pickle} "
         if self.conf["inputs"]["allelic_sample"] != "":
-            self.conf["script"][-1] += " --allelic_sample ${allelic_sample}"
+            script += " --allelic_sample ${allelic_sample}"
+
+        return script
 
     output_patterns = {
         "preprocess_data": "preprocess_data.npz",
@@ -159,7 +164,7 @@ def prolog(self):
         "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v828"
     resources = { "mem" : "15G" }
 
 
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 0509db0..bc66002 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -507,7 +507,8 @@ def concat_arm_level_results(arm_results):
         "segmentations_pickle":hapaseg_allelic_DP_task['segmentation_breakpoints'],
         "repl_pickle":ref_config["repl_file"],
         "gc_pickle":ref_config["gc_file"],
-        "ref_fasta":localization_task["ref_fasta"]
+        "ref_fasta":localization_task["ref_fasta"],
+        "bin_width":bin_width if wgs else 1
         }
     )
 

From 136cfa33b1117bbdfed3c87bc576f2d3f30dfbaf Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 14:52:25 -0400
Subject: [PATCH 182/222] Fix bad allelic seg index bug

---
 hapaseg/__main__.py          | 2 +-
 hapaseg/run_coverage_MCMC.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index b18671f..aaa9a72 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -525,7 +525,7 @@ def main():
         cov_df = cov_df.sort_values("start_g", ignore_index = True)
 
         # indices of coverage bins 
-        seg_g = cov_df.groupby("seg_idx")
+        seg_g = cov_df.groupby("seg_idx") # NOTE: seg_idx may not be contiguous if any allelic segments were dropped 
         seg_g_idx = pd.Series(seg_g.indices).to_frame(name = "indices")
         seg_g_idx["allelic_cluster"] = seg_g["allelic_cluster"].first()
         seg_g_idx["n_cov_bins"] = seg_g.size()
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 5704bf3..7eca2cb 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -193,7 +193,7 @@ def assign_clusters(self):
         Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max])
 
         # get allelic segment boundaries
-        seg_bdy = np.r_[list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)]
+        seg_bdy = np.r_[0, list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)]
         seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
         self.SNPs["seg_idx"] = 0
         for i, (st, en) in enumerate(seg_bdy):

From 2a09a0f1b339daaf19e8c56d5c3f9d818e11b575 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 15:28:37 -0400
Subject: [PATCH 183/222] Pass args.num_draws to NB_MCMC_SingleCluster

---
 hapaseg/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index aaa9a72..e6e8782 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -569,7 +569,7 @@ def main():
         r = r[seg_indices["indices"], :]
         
         # run cov MCMC
-        cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.bin_width)
+        cov_mcmc = NB_MCMC_SingleCluster(args.num_draws, r, C, mu, beta, args.bin_width)
 
 #        # if we get a range argument well be doing burnin on a subset of the coverage bins
 #        if args.range is not None:

From d89964df4a556c0d08abf435cbd3e5188ffaa79c Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 15:44:25 -0400
Subject: [PATCH 184/222] Bump dockers, fix misc. workflow bugs

---
 wolF/tasks.py    |  8 ++++----
 wolF/workflow.py | 10 ++++------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 4bd5c5f..11496c9 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -164,7 +164,7 @@ def script(self):
         "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v828"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v832"
     resources = { "mem" : "15G" }
 
 
@@ -206,7 +206,7 @@ class Hapaseg_coverage_mcmc(wolf.Task):
     }
     script = """
     hapaseg coverage_mcmc_shard --preprocess_data ${preprocess_data} \
-    --allelic_seg_indices ${allelic_seg_idx} \
+    --allelic_seg_indices ${allelic_seg_indices} \
     --allelic_seg_idx ${allelic_seg_scatter_idx} \
     --num_draws ${num_draws} \
     --bin_width ${bin_width}"""
@@ -221,8 +221,8 @@ def prolog(self):
         "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png'
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815"
-    resources = {"mem" : "5G"}
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v830"
+    resources = {"mem" : "10G"}
 
 class Hapaseg_collect_coverage_mcmc(wolf.Task):
     inputs = {
diff --git a/wolF/workflow.py b/wolF/workflow.py
index bc66002..7bf7716 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -516,18 +516,16 @@ def concat_arm_level_results(arm_results):
     #   (coverage MCMC will be scattered over each allelic segment)
     @prefect.task
     def get_N_seg_groups(S):
-        return len(S)
+        return list(range(len(pd.read_pickle(S))))
 
-    N_cov_mcmc_shards = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"])
-
-    # TODO: modify burnin task to subset to these indices
+    cov_mcmc_shard_range = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"])
 
     # coverage MCMC burnin(?) <- do we still need to burnin separately?
-    cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin(
+    cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc(
         inputs={
             "preprocess_data":prep_cov_mcmc_task["preprocess_data"],
             "allelic_seg_indices":prep_cov_mcmc_task["allelic_seg_groups"],
-            "allelic_seg_scatter_idx":range(0, N_cov_mcmc_shards),
+            "allelic_seg_scatter_idx":cov_mcmc_shard_range,
             "num_draws":50,
             "bin_width":bin_width,
         }

From a9956b3306d2c278b79c7782136ca24c00637beb Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 16:09:03 -0400
Subject: [PATCH 185/222] Properly export segment-level covMCMC files

---
 hapaseg/__main__.py | 6 +++---
 wolF/tasks.py       | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index e6e8782..69b71ee 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -614,15 +614,15 @@ def main():
         segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results()
         
         # save samples
-        with open(os.path.join(output_dir, model_save_str), 'wb') as f:
+        with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f:
             pickle.dump(cov_mcmc, f)
 
-        np.savez(os.path.join(output_dir, data_save_str),
+        np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"),
                  seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples)
 
         # save visualization
         cov_mcmc.visualize_cluster_samples(
-            os.path.join(output_dir, figure_save_str))
+            os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png"))
 
     elif args.command == "collect_cov_mcmc":
         if args.coverage_dir:
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 11496c9..7668711 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -216,9 +216,9 @@ def prolog(self):
             self.conf["script"][-1] += " --burnin_files ${burnin_files}"
     
     output_patterns = {
-        "cov_segmentation_model": 'cov_mcmc_model_cluster_*.pickle',
-        "cov_segmentation_data": 'cov_mcmc_data_cluster_*.npz',
-        "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png'
+        "cov_segmentation_model": 'cov_mcmc_model_*.pickle',
+        "cov_segmentation_data": 'cov_mcmc_data_*.npz',
+        "cov_seg_figure": 'cov_mcmc_*_visual.png'
     }
 
     docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v830"

From 49ea3589b55705613b84606a00371a4f2be35924 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 16:22:48 -0400
Subject: [PATCH 186/222] Temporarily disable saving visualization

---
 hapaseg/__main__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 69b71ee..935ecbe 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -620,9 +620,9 @@ def main():
         np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"),
                  seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples)
 
-        # save visualization
-        cov_mcmc.visualize_cluster_samples(
-            os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png"))
+#        # save visualization
+#        cov_mcmc.visualize_cluster_samples(
+#            os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png"))
 
     elif args.command == "collect_cov_mcmc":
         if args.coverage_dir:

From fe309f31acd11329f54dc01d2c4b595f1660eea1 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 16 May 2022 17:41:27 -0400
Subject: [PATCH 187/222] ML_approx -> ML_gaussint

---
 hapaseg/NB_coverage_MCMC.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index 53efda7..d0838a5 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -413,7 +413,7 @@ def _detailed_sampling(self, ind, lls, split_indices, mus, lepsis, Hs):
     def _lls_to_MLs(self, lls, Hs):
         MLs = np.zeros(len(lls))
         for i, (ll, Hs) in enumerate(zip(lls, Hs)):
-            laplacian = self._get_log_ML_split(Hs[0], Hs[1])
+            laplacian = self._get_log_ML_gaussint_split(Hs[0], Hs[1])
             # the split results in a nan make it impossible to split there
             if np.isnan(laplacian):
                 laplacian = -1e50
@@ -445,12 +445,12 @@ def _get_split_liks(self, ind, debug=False):
 
         return split_indices, MLs, mus, lepsis
 
-    # computes ML component from hessian approximation for a single segment
-    def _get_log_ML_approx_join(self, Hess):
+    # computes Gaussian integral for ML Laplace approximation for a single segment
+    def _get_log_ML_gaussint_join(self, Hess):
         return np.log(2 * np.pi) - (np.log(np.linalg.det(-Hess))) / 2
 
-    # computes ML component from hessian approximation for two split segments
-    def _get_log_ML_split(self, H1, H2):
+    # computes Gaussian integral for ML Laplace approximation for two split segments
+    def _get_log_ML_gaussint_split(self, H1, H2):
         return 2*np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2
 
     # computes the log ML of joining two segments
@@ -461,9 +461,7 @@ def _log_ML_join(self, ind, ret_opt_params=False):
         tmp_lepsi = self.lepsi_i_arr.copy()
         tmp_lepsi[ind[0]:ind[1]] = lepsi_share
         ll_join = self.ll_cluster(tmp_mui, tmp_lepsi)
-        if ret_opt_params:
-            return mu_share, lepsi_share, self._get_log_ML_join(H_share) + ll_join
-        return mu_share, lepsi_share, self._get_log_ML_approx_join(H_share) + ll_join
+        return mu_share, lepsi_share, self._get_log_ML_gaussint_join(H_share) + ll_join
 
     """
     Split segment method. This method chooses a segment at random
@@ -546,7 +544,7 @@ def join(self, debug):
         ind = self.get_join_seg_ind(seg_l, seg_r)
 
         lls_split, _, _, Hs = self._calculate_splits(ind, [seg_r])
-        log_split_ML = lls_split[0] + self._get_log_ML_split(Hs[0][0], Hs[0][1])
+        log_split_ML = lls_split[0] + self._get_log_ML_gaussint_split(Hs[0][0], Hs[0][1])
         mu_share, lepsi_share, log_join_ML = self._log_ML_join(ind)
 
         log_MLs = np.r_[log_split_ML, log_join_ML]

From 3fdc0063df337b077948d4c640d1f30be59471f9 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 17 May 2022 11:44:16 -0400
Subject: [PATCH 188/222] Initial commit of covMCMC cache

---
 hapaseg/NB_coverage_MCMC.py | 73 +++++++++++++++++++++++++++++--------
 1 file changed, 58 insertions(+), 15 deletions(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index d0838a5..bdf927a 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -8,6 +8,7 @@
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 from scipy.signal import find_peaks
+import scipy.sparse as sp
 from .model_optimizers import PoissonRegression
 
 # turn off warnings for statsmodels fitting
@@ -49,8 +50,10 @@ def __init__(self, r, C, mu_0, beta_0, bin_width=1):
         self.segment_lens = sc.SortedDict([(0, len(self.r))])
         
         # keep cache of previously computed breakpoints for fast splitting
-        # these breakpoints keys are in the form (st, en, breakpoint)
-        self.breakpoint_cache = {}
+        self.cache_LL_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_LL = []
+        self.cache_mu_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_mu = []
+        self.cache_lepsi_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_lepsi = []
+        self.cache_hess_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_hess = []
 
         self.phase_history = []
         self.F = sc.SortedList()
@@ -138,12 +141,27 @@ def stats_init(self):
 
     # statsmodels NB BFGS optimizer is more stable than NR so we will use it until migration to LNP
     def stats_optimizer(self, ind, ret_hess=False):
+        # cache hit; look up values
+        if self.cache_mu_ptr[ind[0], ind[1]] != 0:
+            mu = self.cache_mu[self.cache_mu_ptr[ind[0], ind[1]]]
+            lepsi = self.cache_lepsi[self.cache_lepsi_ptr[ind[0], ind[1]]]
+            if ret_hess:
+                return mu, lepsi, self.cache_hess[self.cache_hess_ptr[ind[0], ind[1]]]
+            else:
+                return mu, lepsi
+
+        # cache miss; compute values
         endog = self.r[ind[0]:ind[1]].flatten()
         exog = np.ones(self.r[ind[0]:ind[1]].shape[0])
         exposure = np.ones(self.r[ind[0]:ind[1]].shape[0]) * self.bin_exposure
         sNB = statsNB(endog, exog, exposure=exposure, offset=(self.C[ind[0]:ind[1]] @ self.beta).flatten() + self.mu)
         res = sNB.fit(disp=0)
 
+        # save to cache
+        self.cache_mu.append(res.params[0]); self.cache_mu_ptr[ind[0], ind[1]] = len(self.cache_mu) - 1
+        self.cache_lepsi.append(-np.log(res.params[1])); self.cache_lepsi_ptr[ind[0], ind[1]] = len(self.cache_lepsi) - 1
+        self.cache_hess.append(sNB.hessian(res.params)); self.cache_hess_ptr[ind[0], ind[1]] = len(self.cache_hess) - 1
+
         if ret_hess:
             return res.params[0], -np.log(res.params[1]), sNB.hessian(res.params)
         else:
@@ -291,15 +309,31 @@ def _calculate_splits(self, ind, split_indices):
                 lepsis.append((lepsi_l, lepsi_r))
                 Hs.append((H_l, H_r))
 
-                tmp_mui = self.mu_i_arr.copy()
-                tmp_mui[ind[0]:ix] = mu_l
-                tmp_mui[ix: ind[1]] = mu_r
-                tmp_lepsi = self.lepsi_i_arr.copy()
-                tmp_lepsi[ind[0]:ix] = lepsi_l
-                tmp_lepsi[ix: ind[1]] = lepsi_r
+                # lookup likelihoods in cache
+                # left:
+                if (ptr := self.cache_LL_ptr[ind[0], ix]) != 0:
+                    ll_l = self.cache_LL[ptr]
+                else: 
+                    ll_l = self.ll_cluster(mu_l, lepsi_l)
+
+#                    tmp_mui = self.mu_i_arr.copy()
+#                    tmp_mui[ind[0]:ix] = mu_l
+#                    tmp_mui[ix: ind[1]] = mu_r
+#                    tmp_lepsi = self.lepsi_i_arr.copy()
+#                    tmp_lepsi[ind[0]:ix] = lepsi_l
+#                    tmp_lepsi[ix: ind[1]] = lepsi_r
+#                    ll = self.ll_cluster(tmp_mui, tmp_lepsi)
+
+                    self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1
+
+                # right:
+                if (ptr := self.cache_LL_ptr[ix, ind[1]]) != 0:
+                    ll_r = self.cache_LL[ptr]
+                else:
+                    ll_r = self.ll_cluster(mu_r, lepsi_r)
+                    self.cache_LL.append(ll_r); self.cache_LL_ptr[ix, ind[1]] = len(self.cache_LL) - 1
 
-                ll = self.ll_cluster(tmp_mui, tmp_lepsi)
-                lls.append(ll)
+                lls.append(ll_l + ll_r)
 
         return lls, mus, lepsis, Hs
 
@@ -456,11 +490,20 @@ def _get_log_ML_gaussint_split(self, H1, H2):
     # computes the log ML of joining two segments
     def _log_ML_join(self, ind, ret_opt_params=False):
         mu_share, lepsi_share, H_share = self.stats_optimizer(ind, True)
-        tmp_mui = self.mu_i_arr.copy()
-        tmp_mui[ind[0]:ind[1]] = mu_share
-        tmp_lepsi = self.lepsi_i_arr.copy()
-        tmp_lepsi[ind[0]:ind[1]] = lepsi_share
-        ll_join = self.ll_cluster(tmp_mui, tmp_lepsi)
+
+        # lookup cache
+        if (ptr := self.cache_LL_ptr[ind[0], ind[1]]) != 0:
+            ll_join = self.cache_LL[ptr]
+        else:
+#            tmp_mui = self.mu_i_arr.copy()
+#            tmp_mui[ind[0]:ind[1]] = mu_share
+#            tmp_lepsi = self.lepsi_i_arr.copy()
+#            tmp_lepsi[ind[0]:ind[1]] = lepsi_share
+#            ll_join = self.ll_cluster(tmp_mui, tmp_lepsi)
+            ll_join = self.ll_cluster(mu_share, lepsi_share)
+
+            # add to cache
+            self.cache_LL.append(ll_join); self.cache_LL_ptr[ind[0], ind[1]] = len(self.cache_LL) - 1
         return mu_share, lepsi_share, self._get_log_ML_gaussint_join(H_share) + ll_join
 
     """

From a5831faef6b4881a9aee081392cddf4e4c060bed Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 17 May 2022 11:57:22 -0400
Subject: [PATCH 189/222] Fix sparse matrix definitions

---
 hapaseg/NB_coverage_MCMC.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index bdf927a..707ea95 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -50,10 +50,11 @@ def __init__(self, r, C, mu_0, beta_0, bin_width=1):
         self.segment_lens = sc.SortedDict([(0, len(self.r))])
         
         # keep cache of previously computed breakpoints for fast splitting
-        self.cache_LL_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_LL = []
-        self.cache_mu_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_mu = []
-        self.cache_lepsi_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_lepsi = []
-        self.cache_hess_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_hess = []
+        sz = tuple(np.r_[1, 1]*(len(r) + 1))
+        self.cache_LL_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_LL = []
+        self.cache_mu_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_mu = []
+        self.cache_lepsi_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_lepsi = []
+        self.cache_hess_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_hess = []
 
         self.phase_history = []
         self.F = sc.SortedList()

From 1c7e09826b6172f0cd95fd3f79cefda55c8d425a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 17 May 2022 12:00:57 -0400
Subject: [PATCH 190/222] tmp commit of breakpoints

---
 hapaseg/NB_coverage_MCMC.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index 707ea95..24aeab1 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -144,6 +144,7 @@ def stats_init(self):
     def stats_optimizer(self, ind, ret_hess=False):
         # cache hit; look up values
         if self.cache_mu_ptr[ind[0], ind[1]] != 0:
+            #breakpoint()
             mu = self.cache_mu[self.cache_mu_ptr[ind[0], ind[1]]]
             lepsi = self.cache_lepsi[self.cache_lepsi_ptr[ind[0], ind[1]]]
             if ret_hess:
@@ -313,6 +314,7 @@ def _calculate_splits(self, ind, split_indices):
                 # lookup likelihoods in cache
                 # left:
                 if (ptr := self.cache_LL_ptr[ind[0], ix]) != 0:
+                    #breakpoint()
                     ll_l = self.cache_LL[ptr]
                 else: 
                     ll_l = self.ll_cluster(mu_l, lepsi_l)
@@ -329,6 +331,7 @@ def _calculate_splits(self, ind, split_indices):
 
                 # right:
                 if (ptr := self.cache_LL_ptr[ix, ind[1]]) != 0:
+                    #breakpoint()
                     ll_r = self.cache_LL[ptr]
                 else:
                     ll_r = self.ll_cluster(mu_r, lepsi_r)
@@ -494,6 +497,7 @@ def _log_ML_join(self, ind, ret_opt_params=False):
 
         # lookup cache
         if (ptr := self.cache_LL_ptr[ind[0], ind[1]]) != 0:
+            #breakpoint()
             ll_join = self.cache_LL[ptr]
         else:
 #            tmp_mui = self.mu_i_arr.copy()

From f974844040d1c30f84c7a85b62f03be73e3e41ff Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 17 May 2022 12:08:33 -0400
Subject: [PATCH 191/222] Remove cruft code

---
 hapaseg/NB_coverage_MCMC.py | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index 24aeab1..56a6d4c 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -318,15 +318,6 @@ def _calculate_splits(self, ind, split_indices):
                     ll_l = self.cache_LL[ptr]
                 else: 
                     ll_l = self.ll_cluster(mu_l, lepsi_l)
-
-#                    tmp_mui = self.mu_i_arr.copy()
-#                    tmp_mui[ind[0]:ix] = mu_l
-#                    tmp_mui[ix: ind[1]] = mu_r
-#                    tmp_lepsi = self.lepsi_i_arr.copy()
-#                    tmp_lepsi[ind[0]:ix] = lepsi_l
-#                    tmp_lepsi[ix: ind[1]] = lepsi_r
-#                    ll = self.ll_cluster(tmp_mui, tmp_lepsi)
-
                     self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1
 
                 # right:
@@ -500,11 +491,6 @@ def _log_ML_join(self, ind, ret_opt_params=False):
             #breakpoint()
             ll_join = self.cache_LL[ptr]
         else:
-#            tmp_mui = self.mu_i_arr.copy()
-#            tmp_mui[ind[0]:ind[1]] = mu_share
-#            tmp_lepsi = self.lepsi_i_arr.copy()
-#            tmp_lepsi[ind[0]:ind[1]] = lepsi_share
-#            ll_join = self.ll_cluster(tmp_mui, tmp_lepsi)
             ll_join = self.ll_cluster(mu_share, lepsi_share)
 
             # add to cache

From 65829ad30b8813f2b8ca1ba64a21ecd3225546f6 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 17 May 2022 15:53:37 -0400
Subject: [PATCH 192/222] Index cov MCMC shards WRT seg index, not job number

---
 hapaseg/__main__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 935ecbe..9a9a4ee 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -614,15 +614,15 @@ def main():
         segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results()
         
         # save samples
-        with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f:
+        with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{seg_indices['allelic_cluster']}.pickle"), 'wb') as f:
             pickle.dump(cov_mcmc, f)
 
-        np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"),
+        np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{seg_indices['allelic_cluster']}.npz"),
                  seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples)
 
 #        # save visualization
 #        cov_mcmc.visualize_cluster_samples(
-#            os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png"))
+#            os.path.join(output_dir, f"cov_mcmc_seg_{seg_indices['allelic_cluster']}_visual.png"))
 
     elif args.command == "collect_cov_mcmc":
         if args.coverage_dir:

From 9fc6ac3a903bb7e8ec3f86c224cfc1b53aa0b4c2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 17 May 2022 16:36:59 -0400
Subject: [PATCH 193/222] Fix segment likelihood computation function

---
 hapaseg/NB_coverage_MCMC.py | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index 56a6d4c..ac93aa7 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -84,8 +84,17 @@ def get_seg_ind(self, seg):
     def get_join_seg_ind(self, seg_l, seg_r):
         return seg_l, seg_r + self.segment_lens[seg_r]
 
+    # get overall likelihood for all segments
     def get_ll(self):
-        return self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr, True)
+        bdy = np.r_[list(self.segments), len(self.r)]; bdy = np.c_[bdy[:-1], bdy[1:]]
+        ll = 0
+        for st, en in bdy:
+            # lookup in cache
+            if (ptr := self.cache_LL_ptr[st, en]) != 0:
+                ll += self.cache_LL[ptr]
+            else:
+                ll += self.ll_cluster([st, en], self.mu_i_arr[st:en], self.lepsi_i_arr[st:en], True)
+        return ll
     
     # read in the merged cluster assignments from burnin scatter jobs and 
     # fill in data structures for cluster mcmc accordingly
@@ -186,16 +195,16 @@ def refit_beta(self):
             self.lepsi_i_arr[row[0]:row[1]] = lepsi_i
 
     # method for calculating the overall log likelihood of an allelic cluster given a hypothetical mu_i and lepsi arrays
-    def ll_cluster(self, mu_i_arr, lepsi_i_arr, take_sum=True):
-        mu_i_arr = mu_i_arr.flatten()
-        epsi_i_arr = np.exp(lepsi_i_arr).flatten()
+    def ll_cluster(self, ind, mu_i, lepsi_i, take_sum=True):
+        epsi_i = np.exp(lepsi_i)
         exposure= np.log(self.bin_exposure)
-        bc = (self.C @ self.beta).flatten() + exposure
-        exp = np.exp(self.mu + bc + mu_i_arr).flatten()
+        bc = (self.C[ind[0]:ind[1]] @ self.beta).flatten() + exposure
+        exp = np.exp(self.mu + bc + mu_i).flatten()
+        r_subset = self.r[ind[0]:ind[1]]
 
-        lls = (ss.gammaln(self.r + epsi_i_arr) - ss.gammaln(self.r + 1) - ss.gammaln(epsi_i_arr) +
-               (self.r * (self.mu + bc + mu_i_arr - np.log(epsi_i_arr + exp))) +
-               (epsi_i_arr * np.log(epsi_i_arr / (epsi_i_arr + exp))))
+        lls = (ss.gammaln(r_subset + epsi_i) - ss.gammaln(r_subset + 1) - ss.gammaln(epsi_i) +
+               (r_subset * (self.mu + bc + mu_i - np.log(epsi_i + exp))) +
+               (epsi_i * np.log(epsi_i / (epsi_i + exp))))
         if not take_sum:
             return lls
         return lls.sum()
@@ -297,6 +306,8 @@ def _calculate_splits(self, ind, split_indices):
         Hs = []
         for ix in split_indices:
             if ix < 0:
+                # what do we do here WRT ll_cluster indices? FIXME
+                breakpoint()
                 # no split proposal
                 ll_join = self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr)
                 lls.append(ll_join)
@@ -317,7 +328,7 @@ def _calculate_splits(self, ind, split_indices):
                     #breakpoint()
                     ll_l = self.cache_LL[ptr]
                 else: 
-                    ll_l = self.ll_cluster(mu_l, lepsi_l)
+                    ll_l = self.ll_cluster([ind[0], ix], mu_l, lepsi_l)
                     self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1
 
                 # right:
@@ -325,7 +336,7 @@ def _calculate_splits(self, ind, split_indices):
                     #breakpoint()
                     ll_r = self.cache_LL[ptr]
                 else:
-                    ll_r = self.ll_cluster(mu_r, lepsi_r)
+                    ll_r = self.ll_cluster([ix, ind[1]], mu_r, lepsi_r)
                     self.cache_LL.append(ll_r); self.cache_LL_ptr[ix, ind[1]] = len(self.cache_LL) - 1
 
                 lls.append(ll_l + ll_r)
@@ -491,7 +502,7 @@ def _log_ML_join(self, ind, ret_opt_params=False):
             #breakpoint()
             ll_join = self.cache_LL[ptr]
         else:
-            ll_join = self.ll_cluster(mu_share, lepsi_share)
+            ll_join = self.ll_cluster(ind, mu_share, lepsi_share)
 
             # add to cache
             self.cache_LL.append(ll_join); self.cache_LL_ptr[ind[0], ind[1]] = len(self.cache_LL) - 1
@@ -526,7 +537,7 @@ def split(self, debug):
 
         max_ML = max(log_MLs)
         k_probs = np.exp(log_MLs - max_ML) / np.exp(log_MLs - max_ML).sum()
-        
+
         if np.isnan(k_probs).any():
             print("skipping split iteration due to nan. log MLs: ", log_MLs, flush=True)
             return 0

From c818a01f330b2401a9d9dc7b45525ed74f7840c2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 19 May 2022 12:18:14 -0400
Subject: [PATCH 194/222] Fix bug in 65829ad

---
 hapaseg/__main__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 9a9a4ee..935ecbe 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -614,15 +614,15 @@ def main():
         segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results()
         
         # save samples
-        with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{seg_indices['allelic_cluster']}.pickle"), 'wb') as f:
+        with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f:
             pickle.dump(cov_mcmc, f)
 
-        np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{seg_indices['allelic_cluster']}.npz"),
+        np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"),
                  seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples)
 
 #        # save visualization
 #        cov_mcmc.visualize_cluster_samples(
-#            os.path.join(output_dir, f"cov_mcmc_seg_{seg_indices['allelic_cluster']}_visual.png"))
+#            os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png"))
 
     elif args.command == "collect_cov_mcmc":
         if args.coverage_dir:

From 36b823a3ea4990cc53df6622c02f0d441b977484 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 19 May 2022 12:42:34 -0400
Subject: [PATCH 195/222] Add back log transform to covariates

---
 hapaseg/run_coverage_MCMC.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 7eca2cb..3c4ca12 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -102,17 +102,18 @@ def generate_GC(self):
         
 
     def load_covariates(self):
+        zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
+
         ## Target size
 
         # we only need bin size if doing exomes but we can check by looking at the bin lengths
         self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1)
+        self.full_cov_df["C_log_len_z"] = zt(self.full_cov_df["C_log_len"])
             
         # in case we are doing wgs these will all be the same and we must remove
         if (np.diff(self.full_cov_df["C_log_len"]) == 0).all():
             #remove the len col since it will ruin beta fitting
-            self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1)
-
-        zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
+            self.full_cov_df = self.full_cov_df.drop(['C_log_len', 'C_log_len_z'], axis=1)
 
         ## Fragment length
 
@@ -120,7 +121,7 @@ def load_covariates(self):
         self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True)
 
         self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
-        self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"])
+        self.full_cov_df["C_frag_len_z"] = zt(np.log(self.full_cov_df["C_frag_len"]))
 
         # generate on 5x and 11x scales
         swv = np.lib.stride_tricks.sliding_window_view
@@ -132,7 +133,7 @@ def load_covariates(self):
             conv = np.einsum('ij,ij->i', wt_sw, fl_sw)
 
             self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1)
-            self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"])
+            self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(np.log(self.full_cov_df[f"C_frag_len_{scale}x"]))
 
         ### track-based covariates
         # use midpoint of coverage bins to map to intervals
@@ -148,7 +149,7 @@ def load_covariates(self):
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values
 
         # z-transform
-        self.full_cov_df["C_RT_z"] = zt(self.full_cov_df["C_RT"])
+        self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"]))
 
         ## GC content
 
@@ -175,7 +176,7 @@ def load_covariates(self):
             self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values
 
             # z-transform
-            self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"])
+            self.full_cov_df["C_FAIRE_z"] = zt(np.log(self.full_cov_df["C_FAIRE"] + 1))
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned

From 7f6eb1fcc794a4b63c044c8cd2b41d44671e241b Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 23 May 2022 10:16:54 -0400
Subject: [PATCH 196/222] Initial commit of covMCMC gather

---
 hapaseg/run_coverage_MCMC.py | 79 +++++++++++++++++++++++-------------
 1 file changed, 51 insertions(+), 28 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 3c4ca12..aa4f4f5 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -341,8 +341,11 @@ def nat_sort(lst):
         return sorted(lst, key=alphanum_key)
 
 
-# function for collecting coverage mcmc results from each ADP cluster
-def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1):
+# function for collecting coverage mcmc results from each ADP segment
+def aggregate_adp_segments(allelic_seg_groups_pickle, coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1):
+    S = pd.read_pickle(allelic_seg_groups_pickle)
+    S = S.rename_axis(index = "allelic_seg_idx").reset_index()
+
     if coverage_dir is None and f_file_list is None:
         raise ValueError("need to pass in either coverage_dir or file_list txt file!")
     if coverage_dir is not None and f_file_list is not None:
@@ -350,7 +353,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None,
 
     # get results files from the directory provided or from the file list provided
     if coverage_dir is not None:
-        cluster_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_cluster_*')))
+        adp_seg_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_*')))
         cov_df = pd.read_pickle(os.path.join(coverage_dir, 'cov_df.pickle'))
         
     else:
@@ -364,47 +367,67 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None,
                 to_add = l.rstrip('\n')
                 if to_add != "nan":
                     read_files.append(to_add)
-        cluster_files = nat_sort(read_files)
+        adp_seg_files = nat_sort(read_files)
         cov_df = pd.read_pickle(cov_df_pickle)
-    
-    clust_assignments = cov_df['allelic_cluster'].values
-    
+
+    # make sure that number of results shards is consistent with shard indices
+    if len(adp_seg_files) != len(S):
+        raise ValueError("Number of ADP seg files does not match scatter shards!")
+
+    # load in covMCMC segment boundaries and mu's for each ADP segment
     seg_results = []
     mu_i_results = []
     
-    # load data from each cluster
-    for data_path in cluster_files:
-        cluster_data = np.load(data_path)
-        seg_results.append(cluster_data['seg_samples'])
-        mu_i_results.append(cluster_data['mu_i_samples'])
-    
+    for f in adp_seg_files:
+        seg_data = np.load(f)
+        seg_results.append(seg_data['seg_samples'])
+        mu_i_results.append(seg_data['mu_i_samples'])
+
+    S["seg_results"] = seg_results
+    S["mu_i_results"] = mu_i_results
+
     num_draws = seg_results[0].shape[1]
-    num_clusters = len(seg_results)
 
-    # now we use these data to fill an overall coverage segmentation array
+    # create overall segmentation array
     coverage_segmentation = np.zeros((len(cov_df), num_draws))
     mu_i_values = np.zeros((len(cov_df), num_draws))
 
+    # loop over each cov MCMC draw
+    # TODO: only use maximum likelihood draw; CDP should be able to resegment
     for d in range(num_draws):
-        global_counter = 0
-        for c in range(num_clusters):
-            cluster_mask = (clust_assignments == c)
-            coverage_segmentation[cluster_mask, d] = seg_results[c][:,d] + global_counter
-            mu_i_values[cluster_mask, d] = mu_i_results[c][:, d]
-            global_counter += len(np.unique(seg_results[c][:,d]))
-    
-    # generate data to re-compute global beta
+        n_tot_segs = 0
+        # loop over ADP segments
+        for _, s in S.iterrows():
+            seg_idxs = s["seg_results"][:, d]
+            coverage_segmentation[s["indices"], d] = seg_idxs + n_tot_segs
+            n_tot_segs += seg_idxs[-1] + 1
+
+            mu_i_values[s["indices"], d] = s["mu_i_results"][:, d]
+
+    # TEMP HACK: for now, only take iteration with fewest number of segments
+    sidx = coverage_segmentation[-1, :].argmin()
+    coverage_segmentation = coverage_segmentation[:, [sidx]]
+    mu_i_values = mu_i_values[:, [sidx]]
+
+    # remove short segments (<200Kb)
+    # TODO: remove segments not well-modeled by covariates
+    cov_df["cov_seg_idx"] = coverage_segmentation.astype(int)
+    long_seg_idx = cov_df.groupby("cov_seg_idx").apply(lambda x : (x.iloc[-1]["end"] - x.iloc[0]["start"]) > 2e5).rename("seg_OK")
+    cov_df = cov_df.merge(long_seg_idx, left_on = "cov_seg_idx", right_index = True)
+
+    coverage_segmentation = coverage_segmentation[cov_df["seg_OK"], :]
+    mu_i_values = mu_i_values[cov_df["seg_OK"], :]
+    cov_df = cov_df.loc[cov_df["seg_OK"]]
+
+    # recompute global beta
     r = np.c_[cov_df["covcorr"]]
     # we'll use the mu_is from the last segmentation sample
-    mu_is = mu_i_values[:,-1]
-    # compute new edogenous targets by subtracking out the mu_i values of the segments
-    # along with the bin exposure
-    endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1)
+    mu_is = mu_i_values[:, [-1]]
     # generate covars
     covar_columns = sorted(cov_df.columns[cov_df.columns.str.contains("^C_.*_z$")])
     C = np.c_[cov_df[covar_columns]]
     # do regression
-    pois_regr = PoissonRegression(endog, C, np.ones(endog.shape))
+    pois_regr = PoissonRegression(r, C, np.ones(r.shape), np.log(bin_width) + mu_is)
     mu_refit, beta_refit = pois_regr.fit()
     
     return coverage_segmentation, beta_refit

From 6c43e331150d2036d054ff5d4d4f5b76debc4d50 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 2 Jun 2022 15:49:02 -0400
Subject: [PATCH 197/222] Save covMCMC likelihood samples

---
 hapaseg/NB_coverage_MCMC.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index ac93aa7..dd77611 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -802,6 +802,7 @@ def __init__(self, n_iter, r, C, mu, beta, bin_width=1):
         self.mu_i_samples = []
         self.lepsi_i_samples = []
         self.F_samples = []
+        self.ll_samples = []
 
         self.ll_cluster = 0
         self.ll_iter = []
@@ -827,6 +828,7 @@ def save_sample(self):
         self.mu_i_samples.append(self.cluster.mu_i_arr.copy())
         self.lepsi_i_samples.append(self.cluster.lepsi_i_arr.copy())
         self.F_samples.append(self.cluster.F.copy())
+        self.ll_samples.append(self.ll_cluster)
 
     def run(self,
             debug=False,

From f2d933be3bb3596c5aebf34c7d3340586f75cdf5 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 7 Jun 2022 15:08:24 -0400
Subject: [PATCH 198/222] Better starting values for mu/beta

---
 hapaseg/model_optimizers.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index 76e95c1..288d635 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -8,8 +8,8 @@ def __init__(self, r, C, Pi, log_exposure = 0):
         self.Pi = Pi
         self.log_exposure = log_exposure
 
-        self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1]))
-        self.beta = np.ones([C.shape[1], 1])
+        self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure
+        self.beta = np.zeros([C.shape[1], 1])
         self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure)
 
     # mu gradient

From dfa7681e5dc0ee2aed9306852c7bc79c4e034cec Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Mon, 23 May 2022 11:28:57 -0400
Subject: [PATCH 199/222] Return Poisson Hessian

---
 hapaseg/model_optimizers.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index 288d635..bad14ed 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -54,3 +54,9 @@ def NR_poisson(self):
     def fit(self):
         self.NR_poisson()
         return self.mu, self.beta
+
+    def hess(self):
+        hmu = self.hessmu()
+        hbeta = self.hessbeta()
+        hmubeta = self.hessmubeta()
+        return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]

From 000b7fa1150a81424e616c0994c67113db7065ec Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 10 Jun 2022 11:47:16 -0400
Subject: [PATCH 200/222] Add offset to Poisson regression

Offset is expected to have the same length as the data vector
---
 hapaseg/model_optimizers.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index bad14ed..f1fd32e 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -2,15 +2,16 @@
 
 
 class PoissonRegression:
-    def __init__(self, r, C, Pi, log_exposure = 0):
+    def __init__(self, r, C, Pi, log_exposure = 0, log_offset = 0):
         self.r = r
         self.C = C
         self.Pi = Pi
         self.log_exposure = log_exposure
+        self.log_offset = log_offset
 
         self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure
         self.beta = np.zeros([C.shape[1], 1])
-        self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure)
+        self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
 
     # mu gradient
     def gradmu(self):
@@ -34,7 +35,7 @@ def hessmubeta(self):
 
     def NR_poisson(self):
         for i in range(100):
-            self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure)
+            self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
             gmu = self.gradmu()
             gbeta = self.gradbeta()
             grad = np.r_[gmu, gbeta]

From d41cdd6145387879c23198c85be66c012d4c2e89 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 10 Jun 2022 11:47:40 -0400
Subject: [PATCH 201/222] Add simple normal prior to Poisson regression

---
 hapaseg/model_optimizers.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index f1fd32e..efed17f 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -13,21 +13,27 @@ def __init__(self, r, C, Pi, log_exposure = 0, log_offset = 0):
         self.beta = np.zeros([C.shape[1], 1])
         self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
 
+        # prior parameters
+        self.mumu = 0
+        self.musig2 = 1
+        self.betamu = np.zeros_like(self.beta)
+        self.betasiginv = np.eye(len(self.beta))
+
     # mu gradient
     def gradmu(self):
-        return self.Pi.T @ (self.r - self.e_s)
+        return self.Pi.T @ (self.r - self.e_s) - (self.mu - self.mumu)/self.musig2
 
     # mu Hessian
     def hessmu(self):
-        return (-self.Pi.T * self.e_s.T)  @ self.Pi
+        return (-self.Pi.T * self.e_s.T)  @ self.Pi - np.eye(self.Pi.shape[1])/self.musig2  # prior Hessian is -I/musig2: diagonal only, not every entry
 
     # beta gradient
     def gradbeta(self):
-        return self.C.T @ (self.r - self.e_s)
+        return self.C.T @ (self.r - self.e_s) - self.betasiginv@(self.beta - self.betamu)
 
     # beta Hessian
     def hessbeta(self):
-        return (-self.C.T * self.e_s.T) @ self.C
+        return (-self.C.T * self.e_s.T) @ self.C - self.betasiginv
 
     # mu,beta Hessian
     def hessmubeta(self):

From 5718c707f232112b708b21f565f16f4ee4d69954 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 12:58:21 -0400
Subject: [PATCH 202/222] Allow PoisRegr priors to be specified; allow running
 without intercept

---
 hapaseg/model_optimizers.py | 50 +++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 16 deletions(-)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index efed17f..b4472d1 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -2,22 +2,25 @@
 
 
 class PoissonRegression:
-    def __init__(self, r, C, Pi, log_exposure = 0, log_offset = 0):
+    def __init__(self, r, C, Pi,
+      log_exposure = 0, log_offset = 0, intercept = True,
+      mumu = 0, musig2 = 10, betamu = None, betasiginv = None):
         self.r = r
         self.C = C
         self.Pi = Pi
         self.log_exposure = log_exposure
         self.log_offset = log_offset
+        self.intercept = intercept
 
         self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure
         self.beta = np.zeros([C.shape[1], 1])
         self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
 
         # prior parameters
-        self.mumu = 0
-        self.musig2 = 1
-        self.betamu = np.zeros_like(self.beta)
-        self.betasiginv = np.eye(len(self.beta))
+        self.mumu = mumu
+        self.musig2 = musig2
+        self.betamu = np.zeros_like(self.beta) if betamu is None else betamu
+        self.betasiginv = 1/np.sqrt(10)*np.eye(len(self.beta)) if betasiginv is None else betasiginv
 
     # mu gradient
     def gradmu(self):
@@ -42,28 +45,43 @@ def hessmubeta(self):
     def NR_poisson(self):
         for i in range(100):
             self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
-            gmu = self.gradmu()
             gbeta = self.gradbeta()
-            grad = np.r_[gmu, gbeta]
+            if self.intercept:
+                gmu = self.gradmu()
+                grad = np.r_[gmu, gbeta]
+            else:
+                grad = gbeta
 
-            hmu = self.hessmu()
             hbeta = self.hessbeta()
-            hmubeta = self.hessmubeta()
-            H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+            if self.intercept:
+                hmubeta = self.hessmubeta()
+                hmu = self.hessmu()
+                H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+            else:
+                H = hbeta
 
             delta = np.linalg.inv(H) @ grad
-            self.mu -= delta[0:len(self.mu)]
-            self.beta -= delta[len(self.mu):]
+            if self.intercept:
+                self.mu -= delta[0:len(self.mu)]
+                self.beta -= delta[len(self.mu):]
+            else:
+                self.beta -= delta
 
             if np.linalg.norm(grad) < 1e-5:
                 break
 
     def fit(self):
         self.NR_poisson()
-        return self.mu, self.beta
+        if self.intercept:
+            return self.mu, self.beta
+        else:
+            return self.beta
 
     def hess(self):
-        hmu = self.hessmu()
         hbeta = self.hessbeta()
-        hmubeta = self.hessmubeta()
-        return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+        if self.intercept:
+            hmu = self.hessmu()
+            hmubeta = self.hessmubeta()
+            return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+        else:
+            return hbeta

From 57e551a527ff2575773ae66b9e4565bbc902378a Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 13:25:28 -0400
Subject: [PATCH 203/222] Pass Poisson Hessian to downstream tasks

---
 hapaseg/__main__.py          | 4 ++--
 hapaseg/run_coverage_MCMC.py | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 935ecbe..51f1400 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -519,7 +519,7 @@ def main():
                                              f_faire=args.faire_pickle,
                                              f_GC=args.gc_pickle,
                                              allelic_sample=args.allelic_sample)
-        Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster()
+        Pi, r, C, all_mu, global_beta, cov_df, adp_cluster, pois_hess = cov_mcmc_runner.prepare_single_cluster()
 
         ## create chunks for both burnin and scatter
         cov_df = cov_df.sort_values("start_g", ignore_index = True)
@@ -533,7 +533,7 @@ def main():
         ## save
         # regression matrices
         np.savez(os.path.join(output_dir, 'preprocess_data'), Pi=Pi, r=r, C=C, all_mu=all_mu,
-                 global_beta=global_beta, adp_cluster=adp_cluster)
+                 global_beta=global_beta, adp_cluster=adp_cluster, pois_hess=pois_hess)
         # coverage dataframe mapped 
         cov_df.to_pickle(os.path.join(output_dir, 'cov_df.pickle'))
         # allelic segment indices into coverage dataframe
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index aa4f4f5..d1fdf8d 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -70,9 +70,10 @@ def prepare_single_cluster(self):
         Pi, r, C, filtered_cov_df = self.assign_clusters()
         pois_regr = PoissonRegression(r, C, Pi, log_exposure = np.log(self.bin_width))
         all_mu, global_beta = pois_regr.fit()
+        pois_hess = pois_regr.hess()
 
         # save these results to a numpy object
-        return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample
+        return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample, pois_hess
 
     def load_coverage(self, coverage_csv):
         Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False)

From 6502b35e9e29c1ff3ad4b11d329ac06ae19a612d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 13:26:25 -0400
Subject: [PATCH 204/222] Compute initial Poisson regression on ADP segment
 level

Excised from 8f439ac
---
 hapaseg/run_coverage_MCMC.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index d1fdf8d..dfc5c7f 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -88,7 +88,7 @@ def load_coverage(self, coverage_csv):
 
     def load_SNPs(self, f_snps):
         SNPs = pd.read_pickle(f_snps)
-        SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False)
+        SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False).astype(int)
         return SNPs
 
     def generate_GC(self):
@@ -187,19 +187,19 @@ def assign_clusters(self):
         clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample]
         clust_u, clust_uj = np.unique(clust_choice, return_inverse=True)
         clust_uj = clust_uj.reshape(clust_choice.shape)
-        cuj_max = clust_uj.max() + 1
         self.SNPs["clust_choice"] = clust_uj
 
-        ## assign coverage intervals to allelic clusters and segments
-        # assignment probabilities of each coverage interval -> allelic cluster
-        Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max])
-
+        ## assign coverage intervals to allelic clusters and segments 
         # get allelic segment boundaries
         seg_bdy = np.r_[0, list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)]
         seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
         self.SNPs["seg_idx"] = 0
         for i, (st, en) in enumerate(seg_bdy):
             self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i
+        seg_max = self.SNPs["seg_idx"].max() + 1
+
+        # assignment probabilities of each coverage interval -> allelic segment
+        Cov_clust_probs = np.zeros([len(self.full_cov_df), seg_max])
 
         # first compute assignment probabilities based on the SNPs within each bin
         # segments just get assigned to the maximum probability
@@ -208,13 +208,13 @@ def assign_clusters(self):
         for targ, D in tqdm.tqdm(self.SNPs.groupby("tidx")[["clust_choice", "seg_idx"]]):
             clust_idx = D["clust_choice"].values
             seg_idx = D["seg_idx"].values
-            if len(clust_idx) == 1:
-                Cov_clust_probs[int(targ), clust_idx] = 1.0
-                self.full_cov_df.at[int(targ), "seg_idx"] = seg_idx[0]
+            if len(seg_idx) == 1:
+                Cov_clust_probs[targ, seg_idx] = 1.0
+                self.full_cov_df.at[targ, "seg_idx"] = seg_idx[0]
             else: 
-                targ_clust_hist = np.bincount(clust_idx, minlength = cuj_max) 
-                Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
-                self.full_cov_df.at[int(targ), "seg_idx"] = np.bincount(seg_idx).argmax()
+                targ_clust_hist = np.bincount(seg_idx, minlength = seg_max) 
+                Cov_clust_probs[targ, :] = targ_clust_hist / targ_clust_hist.sum()
+                self.full_cov_df.at[targ, "seg_idx"] = np.bincount(seg_idx).argmax()
 
         ## subset to targets containing SNPs
         overlap_idx = Cov_clust_probs.sum(1) > 0

From 2c69a1089e287f2392e414982e057e54eb829962 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 13:48:50 -0400
Subject: [PATCH 205/222] Bump covMCMC/prep dockers

---
 wolF/tasks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 7668711..6311919 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -164,7 +164,7 @@ def script(self):
         "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v832"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853"
     resources = { "mem" : "15G" }
 
 
@@ -221,7 +221,7 @@ def prolog(self):
         "cov_seg_figure": 'cov_mcmc_*_visual.png'
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v830"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853"
     resources = {"mem" : "10G"}
 
 class Hapaseg_collect_coverage_mcmc(wolf.Task):

From fc9f1b5f0ef6a76cf1a7ebe372378b0da39cc7c2 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 13:58:03 -0400
Subject: [PATCH 206/222] Rename import

---
 hapaseg/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 51f1400..9798ed0 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -21,7 +21,7 @@
 from . import utils as hs_utils
 
 from .NB_coverage_MCMC import NB_MCMC_SingleCluster
-from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_clusters, aggregate_burnin_files 
+from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_adp_segments, aggregate_burnin_files
 from .coverage_DP import Coverage_DP
 from .a_cov_DP import generate_acdp_df, AllelicCoverage_DP
 

From 602e432beef79c08ecfeade58a408834f5aa9cf3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 14:37:37 -0400
Subject: [PATCH 207/222] Fix SNP -> coverage bin mapping bug

---
 hapaseg/run_coverage_MCMC.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index dfc5c7f..8907b89 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -88,7 +88,7 @@ def load_coverage(self, coverage_csv):
 
     def load_SNPs(self, f_snps):
         SNPs = pd.read_pickle(f_snps)
-        SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False).astype(int)
+        mut.map_mutations_to_targets(SNPs, self.full_cov_df)
         return SNPs
 
     def generate_GC(self):
@@ -205,7 +205,9 @@ def assign_clusters(self):
         # segments just get assigned to the maximum probability
         self.full_cov_df["seg_idx"] = -1
         print("Mapping SNPs to targets ...", file = sys.stderr)
-        for targ, D in tqdm.tqdm(self.SNPs.groupby("tidx")[["clust_choice", "seg_idx"]]):
+        for targ, D in tqdm.tqdm(self.SNPs.groupby("targ_idx")[["clust_choice", "seg_idx"]]):
+            if targ == -1: # SNP does not overlap a coverage bin
+                continue
             clust_idx = D["clust_choice"].values
             seg_idx = D["seg_idx"].values
             if len(seg_idx) == 1:

From 5cd7cf2190e09422c753d40c10b6f8739b4c69e8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 14:41:10 -0400
Subject: [PATCH 208/222] Bump docker

---
 wolF/tasks.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index 6311919..e49324d 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -164,7 +164,7 @@ def script(self):
         "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856"
     resources = { "mem" : "15G" }
 
 
@@ -221,7 +221,7 @@ def prolog(self):
         "cov_seg_figure": 'cov_mcmc_*_visual.png'
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856"
     resources = {"mem" : "10G"}
 
 class Hapaseg_collect_coverage_mcmc(wolf.Task):

From ad094cdd7e64e5b644c867059679ae28c342bcdf Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 15:43:36 -0400
Subject: [PATCH 209/222] Forgot to pass bin_width to coverage preprocessor

---
 hapaseg/__main__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 9798ed0..583db50 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -518,7 +518,8 @@ def main():
                                              f_repl=args.repl_pickle,
                                              f_faire=args.faire_pickle,
                                              f_GC=args.gc_pickle,
-                                             allelic_sample=args.allelic_sample)
+                                             allelic_sample=args.allelic_sample,
+                                             bin_width=args.bin_width)
         Pi, r, C, all_mu, global_beta, cov_df, adp_cluster, pois_hess = cov_mcmc_runner.prepare_single_cluster()
 
         ## create chunks for both burnin and scatter

From a5487f14123b857ac0f6c89f6d44890f4a23d0bf Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Tue, 14 Jun 2022 15:45:09 -0400
Subject: [PATCH 210/222] Bump Docker

---
 wolF/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/tasks.py b/wolF/tasks.py
index e49324d..8688199 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -164,7 +164,7 @@ def script(self):
         "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v858"
     resources = { "mem" : "15G" }
 
 

From 0f2c8cdb083de5495431906f171cde46d4dcc1c8 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 30 Jun 2022 13:47:04 -0400
Subject: [PATCH 211/222] Add covariate scale factor

---
 hapaseg/model_optimizers.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index b4472d1..3284123 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -14,6 +14,7 @@ def __init__(self, r, C, Pi,
 
         self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure
         self.beta = np.zeros([C.shape[1], 1])
+        self.f = 1
         self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
 
         # prior parameters
@@ -85,3 +86,22 @@ def hess(self):
             return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
         else:
             return hbeta
+
+    # scale factor f multiplying the covariate term (C @ beta); fit by Newton-Raphson in NR_f
+    def gradf(self):
+        return (self.C@self.beta).T@(self.r - self.e_s)
+
+    def hessf(self):
+        CB = self.C@self.beta
+        return -(CB*self.e_s).T@CB
+
+    def NR_f(self):
+        for i in range(100):
+            self.e_s = np.exp(self.f*self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
+            gf = self.gradf()
+            hf = self.hessf()
+
+            self.f -= gf/hf
+
+            if np.linalg.norm(gf) < 1e-5:
+                break

From 93c53cdead0d1d226f8bd5189922454637f2eb68 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sun, 17 Jul 2022 08:34:25 -0400
Subject: [PATCH 212/222] Use nonlinear GC model

---
 hapaseg/run_coverage_MCMC.py | 44 ++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 8907b89..548d856 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -164,8 +164,11 @@ def load_covariates(self):
         else:
             print("Computing GC content", file = sys.stderr)
             self.generate_GC()
-        
-        self.full_cov_df["C_GC_z"] = zt(self.full_cov_df["C_GC"])
+
+        # bin GC content with resolution proportional to the sqrt of the number of bins
+        self.full_cov_df["GC_bin"] = np.round(self.full_cov_df["C_GC"]*np.sqrt(len(self.full_cov_df))).astype(int)
+
+        # we will subsequently transform GC content to reflect the coverage bias of that bin
 
         ## FAIRE
 
@@ -264,13 +267,36 @@ def assign_clusters(self):
 
         Cov_overlap = Cov_overlap.loc[~bad_bins, :]
         Pi = filtered.copy()
-        Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1)
-       
-        r = np.c_[Cov_overlap["covcorr"]]
-        
-        covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
 
-        ## making covariate matrix
+        ## making regressor vector/covariate matrix
+
+        # scale regressor to reflect fragment counts
+        Cov_overlap["fragcorr"] = np.round(Cov_overlap["covcorr"]/Cov_overlap["C_frag_len"].mean())
+        r = np.c_[Cov_overlap["fragcorr"]]
+
+        # fit empirical GC correction model (we do this here because we only consider "good" coverage bins)
+        GC_b = []   # GC bin
+        N_gc = [] # total number of coverage intervals within GC bin
+        F_gc = [] # total number of fragments within GC bin
+        for _, cidx in Cov_overlap.groupby("allelic_cluster").indices.items():
+            ngc = Cov_overlap.iloc[cidx].groupby("GC_bin").size()
+            fgc = Cov_overlap.iloc[cidx].groupby("GC_bin")["fragcorr"].sum()
+            GC_b.extend(ngc.index)
+            N_gc.extend(ngc)
+            F_gc.extend(fgc)
+        GC_b = np.r_[GC_b]
+        N_gc = np.r_[N_gc]
+        F_gc = np.r_[F_gc]
+
+        # use quadratic model
+        v = np.polyfit(GC_b/np.sqrt(len(self.full_cov_df)), F_gc/N_gc, 2)
+
+        Cov_overlap["C_GCtr"] = v[::-1]@((Cov_overlap["GC_bin"].values/np.sqrt(len(self.full_cov_df)))**np.c_[0:3])
+        Cov_overlap.loc[Cov_overlap["C_GCtr"] < 0, "C_GCtr"] = 1
+        #Cov_overlap["C_GCtr_z"] = (lambda x : (x - np.nanmean(x))/np.nanstd(x))(np.log(Cov_overlap["C_GCtr"]))
+        Cov_overlap["C_GCtr_z"] = np.log(Cov_overlap["C_GCtr"])
+
+        covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
         C = np.c_[Cov_overlap[covar_columns]]
 
         ## dropping Nans
@@ -294,7 +320,7 @@ def assign_clusters(self):
         Pi = Pi[:, Pi.sum(0) > 0]
  
         ## remove covariate outliers (+- 6 sigma)
-        covar_outlier_idx = (Cov_overlap.loc[:, covar_columns].abs() < 6).all(axis = 1)
+        covar_outlier_idx = (Cov_overlap.loc[:, [c for c in covar_columns if c != "C_GCtr_z"]].abs() < 6).all(axis = 1)  # list, not set: pandas rejects set indexers
         Cov_overlap = Cov_overlap.loc[covar_outlier_idx]
         Pi = Pi[covar_outlier_idx, :]
         r = r[covar_outlier_idx]

From bfd6fa8b38773691840dc32047a56b169501963d Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Sun, 17 Jul 2022 08:34:45 -0400
Subject: [PATCH 213/222] Save allelic clusters for each ADP segment

---
 hapaseg/run_coverage_MCMC.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 548d856..28f9d8f 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -207,6 +207,7 @@ def assign_clusters(self):
         # first compute assignment probabilities based on the SNPs within each bin
         # segments just get assigned to the maximum probability
         self.full_cov_df["seg_idx"] = -1
+        self.full_cov_df["allelic_cluster"] = -1
         print("Mapping SNPs to targets ...", file = sys.stderr)
         for targ, D in tqdm.tqdm(self.SNPs.groupby("targ_idx")[["clust_choice", "seg_idx"]]):
             if targ == -1: # SNP does not overlap a coverage bin
@@ -216,10 +217,12 @@ def assign_clusters(self):
             if len(seg_idx) == 1:
                 Cov_clust_probs[targ, seg_idx] = 1.0
                 self.full_cov_df.at[targ, "seg_idx"] = seg_idx[0]
+                self.full_cov_df.at[targ, "allelic_cluster"] = clust_idx[0]
             else: 
                 targ_clust_hist = np.bincount(seg_idx, minlength = seg_max) 
                 Cov_clust_probs[targ, :] = targ_clust_hist / targ_clust_hist.sum()
                 self.full_cov_df.at[targ, "seg_idx"] = np.bincount(seg_idx).argmax()
+                self.full_cov_df.at[targ, "allelic_cluster"] = np.bincount(clust_idx).argmax()
 
         ## subset to targets containing SNPs
         overlap_idx = Cov_clust_probs.sum(1) > 0

From 0858136066e55e1560862afb03e11ac4d2ccf8f4 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Fri, 22 Jul 2022 22:33:31 -0400
Subject: [PATCH 214/222] Add sim forcecalling workflow

---
 85_simFC.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 85_simFC.py

diff --git a/85_simFC.py b/85_simFC.py
new file mode 100644
index 0000000..b9ebcf9
--- /dev/null
+++ b/85_simFC.py
@@ -0,0 +1,66 @@
+import wolf
+
+mutect = wolf.ImportTask("github.com:getzlab/MuTect1_TOOL.git", "M1")
+
+def workflow( # split the VCF into shards, run MuTect1 with force calling on those shards, merge the callstats; no explicit return
+  bam, bai, vcf,
+  refFasta = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa",
+  refFastaIdx = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa.fai",
+  refFastaDict = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.dict"
+):
+    localize = wolf.LocalizeToDisk( # downstream tasks read these inputs back as localize["key"]
+      files = {
+        "bam" : bam,
+        "bai" : bai,
+        "vcf" : vcf,
+        "refFasta" : refFasta,
+        "refFastaIdx" : refFastaIdx,
+        "refFastaDict" : refFastaDict
+      }
+    )
+
+    split_vcf = wolf.Task( # script: save the '#' header lines, then cut the records into 10000-line chunks, each written with the header prepended
+      name = "split_vcf",
+      inputs = { "vcf" : localize["vcf"] },
+      script = """
+      grep '^#' ${vcf} > header
+      sed '/^#/d' ${vcf} | split -l 10000 -d -a 3 --filter='cat header /dev/stdin > $FILE' - VCF_chunk
+      """,
+      outputs = { "shards" : "VCF_chunk*" } # split -d -a 3 names the chunks VCF_chunk000, VCF_chunk001, ...
+    )
+
+    m1_scatter = mutect.mutect1( # NOTE(review): presumably wolf runs one job per shard passed as "intervals" -- confirm
+      inputs = {
+        "pairName" : "platinum",
+        "caseName" : "platinum",
+        "t_bam" : localize["bam"],
+        "t_bai" : localize["bai"],
+        "force_calling" : True,
+        "intervals" : split_vcf["shards"], # the VCF shards double as the interval lists
+        "fracContam" : 0,
+        "refFasta" : localize["refFasta"],
+        "refFastaIdx" : localize["refFastaIdx"],
+        "refFastaDict" :  localize["refFastaDict"]
+      }
+    )
+
+    m1_gather = wolf.Task( # script: keep the first two lines of the first callstats file as the header, strip them from every file, sort by column 1 (version order) then column 2 (numeric)
+      name = "m1_gather",
+      inputs = { "callstats_array" : [m1_scatter["mutect1_cs"]] }, # the script reads ${callstats_array} as a file listing one callstats path per line
+      script = """
+      head -n2 $(head -n1 ${callstats_array}) > header
+      while read -r i; do
+        sed '1,2d' $i
+      done < ${callstats_array} | sort -k1,1V -k2,2n > cs_sorted
+      cat header cs_sorted > cs_concat.tsv
+      """,
+      outputs = { "cs_gather" : "cs_concat.tsv" } # header followed by the sorted, concatenated records
+    )
+
+with wolf.Workflow(workflow = workflow, namespace = "HS_sim") as w:
+    w.run(
+      RUN_NAME = "NA12878_WGS_platinum_hg38",
+      bam = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam",
+      bai = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam.bai",
+      vcf = "gs://jh-xfer/NA12878.vcf"
+    )

From 181fff84e89d7ca4ed012bd1e41833f81ba5d0b1 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:09:44 -0400
Subject: [PATCH 215/222] Explorations on quadratic GC content estimator

---
 71_coverage_covariates.py | 122 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py
index 137dcad..7a5b2a3 100644
--- a/71_coverage_covariates.py
+++ b/71_coverage_covariates.py
@@ -72,6 +72,7 @@
 #
 # GC content {{{
 
+## precompute GC content {{{
 # note: this is obsolete; GC content is now computed on the fly
 
 B = pd.read_csv("/mnt/j/proj/cnv/20210326_coverage_collector/targets.bed", sep = "\t", header = None, names = ["chr", "start", "end"])
@@ -88,6 +89,127 @@
 
 # }}}
 
+# Terry Speed GC content estimator {{{
+
+import hapaseg.run_coverage_MCMC
+
+# load coverage
+
+args = lambda : None
+args.coverage_csv = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/coverage_cat.bed" 
+args.allelic_clusters_object  = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/allelic_DP_SNP_clusts_and_phase_assignments.npz" 
+args.SNPs_pickle  = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/all_SNPs.pickle"
+args.segmentations_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/segmentations.pickle"
+args.repl_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/GSE137764_H1.hg19_liftover.pickle"
+args.faire_pickle  = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/FAIRE_GM12878.hg19.pickle"
+args.ref_fasta = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/Homo_sapiens_assembly19.fasta"
+args.bin_width = 2000
+
+cov_mcmc_runner = hapaseg.run_coverage_MCMC.CoverageMCMCRunner(
+  args.coverage_csv,
+  args.allelic_clusters_object,
+  args.SNPs_pickle,
+  args.segmentations_pickle,
+  f_repl=args.repl_pickle,
+  f_faire=args.faire_pickle,
+ # ref_fasta = "/mnt/j/db/hg38/ref/hg38.analysisSet.fa", # ALCH
+  ref_fasta = args.ref_fasta, #"/mnt/j/db/hg19/ref/hs37d5.fa", # Richter's
+  bin_width = args.bin_width
+)
+C = cov_mcmc_runner.full_cov_df
+
+# bin intervals by GC content
+C["GC_bin"] = np.round(C["C_GC"]*1000).astype(int)
+C["num_frags_corr"] = C["covcorr"]/C["C_frag_len"].mean()
+
+N_gc = C.groupby("GC_bin").size()
+F_gc = C.groupby("GC_bin")["num_frags_corr"].sum()
+
+plt.figure(1); plt.clf()
+plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1)
+
+cov_df = pd.read_pickle("/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--15-35-16_040rmzi_pid3cty_0w3oyu5xxnfwe/jobs/0/workspace/cov_df.pickle")
+cov_df = cov_df.merge(C[["start_g", "C_GC"]], left_on = "start_g", right_on = "start_g")
+
+cov_df["GC_bin"] = np.round(cov_df["C_GC"]*1000).astype(int)
+cov_df["num_frags_corr"] = cov_df["covcorr"]/cov_df["C_frag_len"].mean()
+
+N_gc = cov_df.groupby("GC_bin").size()
+F_gc = cov_df.groupby("GC_bin")["num_frags_corr"].sum()
+
+cov_df = cov_df.merge((F_gc/N_gc).rename("C_GC_f"), left_on = cov_df["GC_bin"], right_index = True)
+
+from loess import loess_1d # binds the loess_1d submodule used below; a bare `import loess` leaves that name undefined
+_, y_l, _ = loess_1d.loess_1d(np.r_[N_gc.index], np.r_[F_gc/N_gc])
+
+plt.figure(2); plt.clf()
+plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1)
+#plt.plot(N_gc.index, y_l)
+r = np.linspace(0, 1000, 1000) # x grid in GC_bin units, i.e. the same axis as the scatter above
+v = np.polyfit(np.r_[N_gc.index]/1000, F_gc/N_gc, 2) # quadratic fitted against GC_bin/1000; coefficients are highest degree first
+plt.plot(r, v[::-1]@((r/1000)**np.c_[0:3])) # evaluate on the scale the fit used (r/1000) while drawing on the GC_bin axis
+plt.ylim([0, 500])
+
+from capy import plots
+
+plt.figure(3); plt.clf()
+plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11)
+plots.pixplot(v[::-1]@(cov_df["C_GC"].values**np.c_[0:3]), cov_df["num_frags_corr"], alpha = 0.11)
+
+gc_g = []
+N_gc_g = []
+F_gc_g = []
+plt.figure(4); plt.clf()
+for _, cidx in cov_df.groupby("allelic_cluster").indices.items():
+    N_gc = cov_df.iloc[cidx].groupby("GC_bin").size()
+    F_gc = cov_df.iloc[cidx].groupby("GC_bin")["num_frags_corr"].sum()
+    lplt = plt.scatter(N_gc.index, (F_gc/N_gc)/F_gc.sum(), marker = '.', s = 1)
+
+    v = np.polyfit(N_gc.index, F_gc/N_gc, 2)
+    rng = np.linspace(0, 1000, 200)
+    plt.plot(rng, v[::-1]@(rng**np.c_[0:3]), color = lplt.get_edgecolor())
+
+    gc_g.extend(N_gc.index)
+    N_gc_g.extend(N_gc)
+    F_gc_g.extend(F_gc)
+
+N_gc_g = np.r_[N_gc_g]
+F_gc_g = np.r_[F_gc_g]
+gc_g = np.r_[gc_g]
+
+v = np.polyfit(gc_g, F_gc_g/N_gc_g, 2)
+plt.plot(r, v[::-1]@(r**np.c_[0:3]))
+_, y_l, _ = loess_1d.loess_1d(gc_g, F_gc_g/N_gc_g, xnew = r, degree = 2)
+plt.plot(r, y_l)
+
+plt.figure(3); plt.clf()
+_, y_l, _ = loess_1d.loess_1d(gc_g/1000, F_gc_g/N_gc_g, xnew = cov_df["C_GC"], degree = 2)
+plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11)
+
+## simulate quadratic relationship
+seg_sim = np.r_[np.ones([500, 1]), 1.5*np.ones([500, 1])].T
+gc_sim = np.random.rand(1000)*0.6 + 0.2
+rng = np.linspace(0, 1, 100)
+x = stats.poisson.rvs(np.exp(-30*(gc_sim - 0.5)**2 + 5*seg_sim))[:, None]
+C = np.c_[gc_sim**2, gc_sim]
+
+import hapaseg.model_optimizers
+PR = hapaseg.model_optimizers.PoissonRegression
+
+Pi = np.r_[np.tile([1, 0], [500, 1]), np.tile([0, 1], [500, 1])]
+pois_regr = PR(x, C, Pi)
+pois_regr.fit()
+pois_regr2 = PR(x, C[:, [1]], Pi)
+pois_regr2.fit()
+plt.figure(2); plt.clf()
+plt.scatter(x, np.exp(Pi@pois_regr.mu + C@pois_regr.beta), marker = '.', s = 1)
+plt.scatter(x, np.exp(Pi@pois_regr2.mu + C[:, [1]]@pois_regr2.beta), marker = '.', s = 1)
+
+
+# }}}
+
+# }}}
+
 #
 # DNAse HS/FAIRE {{{
 

From 15796f4e42b526d0cba7ca8b2031380374bbcb98 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:10:31 -0400
Subject: [PATCH 216/222] Regenerate FAIRE tracks

---
 71_coverage_covariates.py | 191 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 191 insertions(+)

diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py
index 7a5b2a3..6012afa 100644
--- a/71_coverage_covariates.py
+++ b/71_coverage_covariates.py
@@ -275,6 +275,197 @@
 FAIRE_smooth["FAIRE"] = np.convolve(FAIRE["FAIRE"], np.ones(5), mode = "same")/5
 FAIRE_smooth.to_pickle("covars/FAIRE_GM12878.smooth5.hg19.pickle")
 
+#
+# re-process all FAIRE files using samtools
+import wolf, itertools, glob, prefect
+
+## make interval list
+clen = seq.get_chrlens()
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]):
+    bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
+    tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1] })
+    tmp.to_csv(f"FAIRE/intervals/{chrname}.bed", sep = "\t", header = None, index = False)
+
+## define coverage workflow
+
+class markdups(wolf.Task): # task definition: run `samtools markdup` on one BAM, then index the result
+    inputs = { "bamin" }
+    script = "samtools markdup ${bamin} $(basename ${bamin}).dedup.bam && samtools index *dedup.bam" # output lands in the working directory, named after the input's basename plus ".dedup.bam"
+    outputs = { "bam" : "*.bam", "bai" : "*.bai" } # globs that collect the deduplicated BAM and its index
+    docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5"
+
+intervals = glob.glob("/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE/intervals/*.bed")
+
+def BedCovFlow(bams, intervals): # workflow: markdups per BAM -> samtools bedcov over `intervals` -> one concatenated, sorted coverage table
+    # mark duplicates: one markdups task per input BAM
+    mark_dups = []
+    for b in bams:
+        mark_dups.append(markdups(
+          inputs = { "bamin" : b },
+          overrides = { "bamin" : "string" }, # NOTE(review): presumably hands the BAM path/URL to samtools as a plain string instead of localizing it -- confirm
+          use_scratch_disk = True,
+          scratch_disk_size = 10
+        ))
+
+    # run bedcov on all BAMs (gather)
+    @prefect.task(nout = 2)
+    def bl(md): # regroup the per-task outputs into two parallel lists: all BAMs, all BAIs
+        return [m["bam"] for m in md], [m["bai"] for m in md]
+    bam_list, bai_list = bl(mark_dups)
+
+    BedCov = wolf.Task(
+      name = "BedCov",
+      inputs = { "intervals" : intervals, "bams" : [bam_list], "bais" : [bai_list] }, # ${bams} is read as a file of paths; ${bais} is never referenced by the script (presumably supplied so the indexes are present -- confirm)
+      script = """
+      samtools bedcov -Q1 ${intervals} $(cat ${bams}) > coverage.bed
+      """,
+      outputs = { "coverage" : "coverage.bed" },
+      docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5"
+    )
+#    for b in bam_list:
+#        wolf.DeleteDisk(b, BedCov["coverage"])
+
+    # gather BedCovs: concatenate, sort by column 1 (version order) then column 2 (numeric), and append a final column holding the sum of columns 4..NF
+    BedCovGather = wolf.Task(
+      name = "BedCovGather",
+      inputs = { "beds" : [BedCov["coverage"]] }, # ${beds} is read as a file listing the per-run coverage.bed paths
+      script = """
+      cat $(cat ${beds}) | sort -k1,1V -k2,2n | \
+        awk -F'\t' 'BEGIN { OFS = FS } { tot = 0; for(i = 4; i <= NF; i++) { tot += $i }; print $0, tot }' > concat.bed
+      """,
+      outputs = { "concat" : "concat.bed" },
+    )
+
+## run workflow 
+
+base_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeOpenChromFaire/"
+
+BAMs = ["wgEncodeOpenChromFaireA549AlnRep1.bam", # {{{
+"wgEncodeOpenChromFaireA549AlnRep2.bam",
+"wgEncodeOpenChromFaireAstrocyAlnRep1.bam",
+"wgEncodeOpenChromFaireAstrocyAlnRep2.bam",
+"wgEncodeOpenChromFaireColonocAlnRep1.bam",
+"wgEncodeOpenChromFaireColonocAlnRep2.bam",
+"wgEncodeOpenChromFaireEndometriumocAlnRep1.bam",
+"wgEncodeOpenChromFaireEndometriumocAlnRep2.bam",
+"wgEncodeOpenChromFaireFrontalcortexocAlnRep1.bam",
+"wgEncodeOpenChromFaireFrontalcortexocAlnRep2.bam",
+"wgEncodeOpenChromFaireGlioblaAlnRep1.bam",
+"wgEncodeOpenChromFaireGlioblaAlnRep2.bam",
+"wgEncodeOpenChromFaireGlioblaAlnRep3.bam",
+"wgEncodeOpenChromFaireGm12878AlnRep1.bam",
+"wgEncodeOpenChromFaireGm12878AlnRep2.bam",
+"wgEncodeOpenChromFaireGm12878AlnRep3.bam",
+"wgEncodeOpenChromFaireGm12891AlnRep1.bam",
+"wgEncodeOpenChromFaireGm12891AlnRep2.bam",
+"wgEncodeOpenChromFaireGm12892AlnRep1.bam",
+"wgEncodeOpenChromFaireGm12892AlnRep2.bam",
+"wgEncodeOpenChromFaireGm18507AlnRep1.bam",
+"wgEncodeOpenChromFaireGm18507AlnRep2.bam",
+"wgEncodeOpenChromFaireGm19239AlnRep1.bam",
+"wgEncodeOpenChromFaireGm19239AlnRep2.bam",
+"wgEncodeOpenChromFaireH1hescAlnRep1.bam",
+"wgEncodeOpenChromFaireH1hescAlnRep2.bam",
+"wgEncodeOpenChromFaireHelas3AlnRep1.bam",
+"wgEncodeOpenChromFaireHelas3AlnRep2.bam",
+"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep1.bam",
+"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep2.bam",
+"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep1.bam",
+"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep2.bam",
+"wgEncodeOpenChromFaireHepg2AlnRep1.bam",
+"wgEncodeOpenChromFaireHepg2AlnRep2.bam",
+"wgEncodeOpenChromFaireHepg2AlnRep3.bam",
+"wgEncodeOpenChromFaireHtr8AlnRep1.bam",
+"wgEncodeOpenChromFaireHtr8AlnRep2.bam",
+"wgEncodeOpenChromFaireHuvecAlnRep1.bam",
+"wgEncodeOpenChromFaireHuvecAlnRep2.bam",
+"wgEncodeOpenChromFaireK562AlnRep1.bam",
+"wgEncodeOpenChromFaireK562AlnRep2.bam",
+"wgEncodeOpenChromFaireK562NabutAlnRep1.bam",
+"wgEncodeOpenChromFaireK562NabutAlnRep2.bam",
+"wgEncodeOpenChromFaireK562OhureaAlnRep1.bam",
+"wgEncodeOpenChromFaireK562OhureaAlnRep2.bam",
+"wgEncodeOpenChromFaireKidneyocAlnRep1.bam",
+"wgEncodeOpenChromFaireKidneyocAlnRep2.bam",
+"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep1.bam",
+"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep2.bam",
+"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep1.bam",
+"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep2.bam",
+"wgEncodeOpenChromFaireMcf7VehAlnRep1.bam",
+"wgEncodeOpenChromFaireMcf7VehAlnRep2.bam",
+"wgEncodeOpenChromFaireMedulloAlnRep1.bam",
+"wgEncodeOpenChromFaireMedulloAlnRep2.bam",
+"wgEncodeOpenChromFaireMrta2041AlnRep1.bam",
+"wgEncodeOpenChromFaireMrta2041AlnRep2.bam",
+"wgEncodeOpenChromFaireMrtg4016AlnRep1.bam",
+"wgEncodeOpenChromFaireMrtg4016AlnRep2.bam",
+"wgEncodeOpenChromFaireMrtttc549AlnRep1.bam",
+"wgEncodeOpenChromFaireMrtttc549AlnRep2.bam",
+"wgEncodeOpenChromFaireNhaAlnRep1.bam",
+"wgEncodeOpenChromFaireNhaAlnRep2.bam",
+"wgEncodeOpenChromFaireNhbeAlnRep1.bam",
+"wgEncodeOpenChromFaireNhbeAlnRep2.bam",
+"wgEncodeOpenChromFaireNhekAlnRep1.bam",
+"wgEncodeOpenChromFaireNhekAlnRep2.bam",
+"wgEncodeOpenChromFairePancreasocAlnRep1.bam",
+"wgEncodeOpenChromFairePancreasocAlnRep2.bam",
+"wgEncodeOpenChromFairePanisletsAlnRep1.bam",
+"wgEncodeOpenChromFaireRcc7860AlnRep1.bam",
+"wgEncodeOpenChromFaireRcc7860AlnRep2.bam",
+"wgEncodeOpenChromFaireSmallintestineocAlnRep1.bam",
+"wgEncodeOpenChromFaireSmallintestineocAlnRep2.bam",
+"wgEncodeOpenChromFaireUrotsaAlnRep1.bam",
+"wgEncodeOpenChromFaireUrotsaAlnRep2.bam",
+"wgEncodeOpenChromFaireUrotsaUt189AlnRep1.bam",
+"wgEncodeOpenChromFaireUrotsaUt189AlnRep2.bam"] # }}}
+
+B = pd.Series(BAMs).str.extract(r"(?P<bam>.*Faire(?P<cell_line>.*)AlnRep(?P<rep>\d+)\.bam)") # raw string: \d and \. are regex escapes, not Python string escapes
+
+with wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov") as w:
+    for cell_line, b in B.groupby("cell_line"):
+        w.run(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals)
+
+## parse in coverages; make covariate table
+from capy import mut
+
+w = wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov")
+for cell_line, b in B.groupby("cell_line"):
+    w.load_results(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals)
+
+T = w.tasks.loc[(slice(None), "BedCovGather"), ["results"]].droplevel(1)
+T["covpath"] = T["results"].apply(lambda x : x["concat"])
+
+for i, (cell_line, cov) in enumerate(T.iterrows()):
+    X = pd.read_csv(cov["covpath"], sep = "\t", header = None)
+    X = X.rename(columns = { len(X.columns) - 1 : cell_line })
+    # get common lines
+    if i == 0:
+        C = X.iloc[:, np.r_[0:3, -1]].rename(columns = { 0 : "chr", 1 : "start", 2 : "end" })
+    else:
+        C = pd.concat([C, X.iloc[:, -1]], axis = 1)
+
+C["chr"] = mut.convert_chr(C["chr"])
+
+C.to_pickle("covars/FAIRE/coverage.dedup.raw.pickle")
+
+# rebin to 10k
+C["index_r"] = C.index//5
+C10k = C.groupby(["chr", "index_r"]).agg({
+   "start" : min, "end" : max,
+   **{ k : sum for k in C.columns[3:] }
+}).droplevel(1).reset_index().drop(columns = "index_r")
+
+C10k.to_pickle("covars/FAIRE/coverage.dedup.raw.10kb.pickle")
+
+# 100k?
+C["index_r"] = C.index//50
+C100k = C.groupby(["chr", "index_r"]).agg({
+   "start" : min, "end" : max,
+   **{ k : sum for k in C.columns[3:] }
+}).droplevel(1).reset_index().drop(columns = "index_r")
+
+C100k.to_pickle("covars/FAIRE/coverage.dedup.raw.100kb.pickle")
+
 # }}}
 
 # }}}

From 92546dc4f0111584c34644df713a0b03ad4be4b3 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:12:50 -0400
Subject: [PATCH 217/222] Use quadratic GC content on raw data rather than
 binned data

---
 hapaseg/run_coverage_MCMC.py | 31 ++++---------------------------
 1 file changed, 4 insertions(+), 27 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 28f9d8f..13914ef 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -165,10 +165,8 @@ def load_covariates(self):
             print("Computing GC content", file = sys.stderr)
             self.generate_GC()
 
-        # bin GC content with resolution proportional to the sqrt of the number of bins
-        self.full_cov_df["GC_bin"] = np.round(self.full_cov_df["C_GC"]*np.sqrt(len(self.full_cov_df))).astype(int)
-
-        # we will subsequently transform GC content to reflect the coverage bias of that bin
+        # GC content follows a roughly quadratic relationship with coverage
+        self.full_cov_df["C_GC2"] = self.full_cov_df["C_GC"]**2
 
         ## FAIRE
 
@@ -277,29 +275,8 @@ def assign_clusters(self):
         Cov_overlap["fragcorr"] = np.round(Cov_overlap["covcorr"]/Cov_overlap["C_frag_len"].mean())
         r = np.c_[Cov_overlap["fragcorr"]]
 
-        # fit empirical GC correction model (we do this here because we only consider "good" coverage bins)
-        GC_b = []   # GC bin
-        N_gc = [] # total number of coverage intervals within GC bin
-        F_gc = [] # total number of fragments within GC bin
-        for _, cidx in Cov_overlap.groupby("allelic_cluster").indices.items():
-            ngc = Cov_overlap.iloc[cidx].groupby("GC_bin").size()
-            fgc = Cov_overlap.iloc[cidx].groupby("GC_bin")["fragcorr"].sum()
-            GC_b.extend(ngc.index)
-            N_gc.extend(ngc)
-            F_gc.extend(fgc)
-        GC_b = np.r_[GC_b]
-        N_gc = np.r_[N_gc]
-        F_gc = np.r_[F_gc]
-
-        # use quadratic model
-        v = np.polyfit(GC_b/np.sqrt(len(self.full_cov_df)), F_gc/N_gc, 2)
-
-        Cov_overlap["C_GCtr"] = v[::-1]@((Cov_overlap["GC_bin"].values/np.sqrt(len(self.full_cov_df)))**np.c_[0:3])
-        Cov_overlap.loc[Cov_overlap["C_GCtr"] < 0, "C_GCtr"] = 1
-        #Cov_overlap["C_GCtr_z"] = (lambda x : (x - np.nanmean(x))/np.nanstd(x))(np.log(Cov_overlap["C_GCtr"]))
-        Cov_overlap["C_GCtr_z"] = np.log(Cov_overlap["C_GCtr"])
-
-        covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")])
+        # make covariate matrix; use all z-transformed covariates + non-scaled GC content+GC^2
+        covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$") | Cov_overlap.columns.str.contains("^C_GC")])
         C = np.c_[Cov_overlap[covar_columns]]
 
         ## dropping Nans

From 5107c3c22548fdc62828314bbdafc49ebb4ceb56 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:13:24 -0400
Subject: [PATCH 218/222] Load multiple FAIRE tracks

---
 hapaseg/run_coverage_MCMC.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 13914ef..7f400a4 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -172,13 +172,17 @@ def load_covariates(self):
 
         if self.f_faire is not None:
             F = pd.read_pickle(self.f_faire)
+
             # map targets to FAIRE intervals
             tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
-            self.full_cov_df['C_FAIRE'] = np.nan
-            self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values
+            F = F.loc[tidx].set_index(tidx.index).iloc[:, 3:].rename(columns = lambda x : "C_" + x)
+            self.full_cov_df = pd.concat([self.full_cov_df, F], axis = 1)
 
             # z-transform
-            self.full_cov_df["C_FAIRE_z"] = zt(np.log(self.full_cov_df["C_FAIRE"] + 1))
+            self.full_cov_df = pd.concat([
+              self.full_cov_df,
+              self.full_cov_df.loc[:, F.columns].apply(lambda x : zt(np.log(x + 1))).rename(columns = lambda x : x + "_z")
+            ], axis = 1)
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned

From 4af5402289d49cbcbdfd3daeb34861db83415837 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:13:49 -0400
Subject: [PATCH 219/222] Load in updated covcollect format that counts bad
 reads

---
 hapaseg/run_coverage_MCMC.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 7f400a4..e880c13 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -76,7 +76,7 @@ def prepare_single_cluster(self):
         return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample, pois_hess
 
     def load_coverage(self, coverage_csv):
-        Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False)
+        Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_frags", "tot_reads", "fail_reads"], low_memory=False)
         Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions
         Cov["chr"] = mut.convert_chr(Cov["chr"])
         Cov = Cov.loc[Cov["chr"] != 0]
@@ -127,7 +127,7 @@ def load_covariates(self):
         # generate on 5x and 11x scales
         swv = np.lib.stride_tricks.sliding_window_view
         fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
-        wt = self.full_cov_df["num_reads"].values
+        wt = self.full_cov_df["num_frags"].values
         for scale in [5, 11]:
             fl_sw = swv(np.pad(fl, scale//2), scale)
             wt_sw = swv(np.pad(wt, scale//2), scale)

From 7bdb15e72c3b2d071a98d9b25253348c47380054 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:14:44 -0400
Subject: [PATCH 220/222] Use covcollect branch that tallies bad reads

---
 wolF/workflow.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index 7bf7716..b97c24b 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -44,7 +44,8 @@
 
 cov_collect = wolf.ImportTask(
   task_path = "git@github.com:getzlab/covcollect.git",
-  task_name = "covcollect"
+  task_name = "covcollect",
+  branch = "tot_reads"
 )
 
 ####

From 8695dc40a41a8b6ee136e736316e64971e3fac04 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:17:41 -0400
Subject: [PATCH 221/222] Draft code of force calling at het sites only

---
 wolF/workflow.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index b97c24b..c0b6038 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -261,6 +261,16 @@ def interval_gather(interval_files, primary_contigs):
 
     # otherwise, run M1 and get it from the BAM
     elif callstats_file is None and tumor_bam is not None and normal_bam is not None:
+        # split het sites file uniformly
+#        split_het_sites = wolf.Task(
+#          name = "split_het_sites",
+#          inputs = { "snp_list" : localization_task["common_snp_list"] },
+#          script = """
+#          sed '/^@/d' ${snp_list} | split -l 10000 -d -a 4 - snp_list_chunk
+#          """,
+#          outputs = { "snp_list_shards" : "snp_list_chunk*" }
+#        )
+
         m1_task = mutect1.mutect1(inputs=dict(
           pairName = "het_coverage",
           caseName = "tumor",
@@ -278,8 +288,11 @@ def interval_gather(interval_files, primary_contigs):
           refFastaDict = localization_task["ref_fasta_dict"],
 
           intervals = split_intervals_task["interval_files"],
+          #intervals = split_het_sites["snp_list_shards"],
+
+          exclude_chimeric = True#,
 
-          exclude_chimeric = True
+          #force_calling = True,
         ))
 
         hp_scatter = het_pulldown.get_het_coverage_from_callstats(

From a8e965b5f8a9c1a8ab18fab47547a3e498ef3306 Mon Sep 17 00:00:00 2001
From: Julian Hess <jhess@broadinstitute.org>
Date: Thu, 28 Jul 2022 17:18:43 -0400
Subject: [PATCH 222/222] Specify workflow path locally

---
 wolF/workflow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wolF/workflow.py b/wolF/workflow.py
index c0b6038..6ac1aa1 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -31,7 +31,7 @@
 
 # for Hapaseg itself
 hapaseg = wolf.ImportTask(
-  task_path = "../", # TODO: make remote
+  task_path = ".", # TODO: make remote
   task_name = "hapaseg"
 )