From e43dd17d86312932c087798c60f22e3fa2cc007c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 2 Dec 2021 15:18:24 -0500 Subject: [PATCH 001/222] Only compute clust_lik; don't change underlying likelihood data --- hapaseg/allelic_DP.py | 60 +++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 33 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 9bf2d2e..790b4be 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -559,48 +559,42 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None max_clust_idx = segs_to_clusters.max() + 1 - liks = np.full([segs_to_clusters.shape[0], 2], np.nan) + liks = np.full(segs_to_clusters.shape[0], np.nan) for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)): - # reset phases - # TODO: when we switch to faster phasing correction model that doesn't involve modifying self.S, this won't be necessary - S_ph = self.S.copy() - flip_idx = np.flatnonzero(ph_samp != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] - ## overall clustering likelihood - A = npg.aggregate(cl_samp, S_ph["min"], size = max_clust_idx) - B = npg.aggregate(cl_samp, S_ph["maj"], size = max_clust_idx) -# for when self.S is not modified -# A = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) + \ -# npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx) -# -# B = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) + \ -# npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx) + A1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) + A2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx) - clust_lik = ss.betaln(A + 1, B + 1).sum() - - ## segmentation likelihood - - # get segment boundaries - bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] + B1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) + B2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx) - # sum log-likelihoods of each segment - seg_lik = 0 - for st, en in bdy: - A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum() + count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double) + count_prior /= count_prior.sum() -# for when self.S is not modified -# A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ -# self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum() -# B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ -# self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum() + clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1) + np.log(count_prior)[count_prior > 0]).sum() - seg_lik += ss.betaln(A + 1, B + 1) +# ## segmentation likelihood +# +# # get segment boundaries +# bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1]) +# bdy = np.c_[bdy[:-1], bdy[1:]] +# +# # sum log-likelihoods of each segment +# seg_lik = 0 +# for st, en in bdy: +# A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum() +# +## for when self.S is not modified +## A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ +## self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum() +## B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ +## self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum() +# +# seg_lik += ss.betaln(A + 1, B + 1) - liks[i, :] = np.r_[clust_lik, seg_lik] + liks[i] = clust_lik return liks From 4f748ea898e7333abc928a154790a7c8e084bfdd Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 14:53:07 -0500 Subject: [PATCH 002/222] Split rephase into probability/realization --- hapaseg/allelic_DP.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 790b4be..85b07ae 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -354,29 +354,32 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage + def compute_rephase_prob(self, seg_idx): + A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1 + A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1 + B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1 + B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1 + + # use normal approximation to beta if conditions are right + if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20: + m_x = A_a/(A_a + A_b) + s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1)) + m_y = B_a/(B_a + B_b) + s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1)) + + return s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y)) + + # Monte Carlo simulate difference of betas + else: + x = s.beta.rvs(A_a, A_b, size = 1000) + y = s.beta.rvs(B_a, B_b, size = 1000) + + return (x > y).mean() def rephase(self, seg_idx, force = False): + do_rephase = False if not force: - A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1 - A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1 - B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1 - B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1 - - # use normal approximation to beta if conditions are right - if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20: - m_x = A_a/(A_a + A_b) - s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1)) - m_y = B_a/(B_a + B_b) - s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1)) - - do_rephase = np.random.rand() < s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y)) - - # Monte Carlo simulate difference of betas - else: - x = s.beta.rvs(A_a, A_b, size = 1000) - y = s.beta.rvs(B_a, B_b, size = 1000) - - do_rephase = np.random.rand() < (x > y).mean() + do_rephase = np.random.rand() < self.compute_rephase_prob(seg_idx) if force or do_rephase: self.S.iloc[seg_idx, [self.min_col, self.maj_col]] = self.S.iloc[seg_idx, [self.min_col, self.maj_col]].values[:, ::-1] From 69efc109d69472c4034a50018766820317d942cb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 14:53:59 -0500 Subject: [PATCH 003/222] Compute likelihood in both phase orientations --- hapaseg/allelic_DP.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 85b07ae..204119f 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -608,6 +608,7 @@ def run(self, n_iter = 50): if len(self.clust_prior) > 1: for seg_idx in range(len(self.S)): seg_idx = np.r_[seg_idx] + #self.rephase(seg_idx) # compute probability that segment belongs to each cluster prior element S_a = self.S.iloc[seg_idx[0], self.min_col] @@ -786,7 +787,7 @@ def run(self, n_iter = 50): # # perform phase correction on segment/cluster # flip min/maj with probability that alleles are oriented the "wrong" way - self.rephase(seg_idx) + rephase_prob = self.compute_rephase_prob(seg_idx) # # choose to join a cluster or make a new one @@ -821,18 +822,22 @@ def run(self, n_iter = 50): C = ss.betaln(C_ab[:, 0] + 1, C_ab[:, 1] + 1) # A is likelihood cluster B is part of, minus B #A = ss.betaln(A_a + 1, A_b + 1) - # B+C is likelihood of target cluster post-join - BC = ss.betaln(C_ab[:, 0] + B_a + 1, C_ab[:, 1] + B_b + 1) + # B+C is likelihood of target cluster post-join, with both phase orientations + BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1) + + MLs = BC - C[:, None] + np.log(np.r_[1 - rephase_prob, rephase_prob]) + # TODO: get adj_BC working again # L(join) L(split) #MLs = A + BC + adj_BC - (AB + C + adj_AB) # TODO: remove extraneous calculations (e.g. adj_AB, AB, A); # likelihood simplifies to this in the prior: - MLs = adj_BC + BC - C + #MLs = adj_BC + BC - C # if we are moving multiple contiguous segments assigned to the same # cluster, do not allow them to create a new cluster. this helps keep # cluster indices consistent + # TODO: if we don't care about keeping indices consistent, then we can probably remove this line if n_move > 1 and not move_clust: MLs[self.clust_sums.index(-1)] = -np.inf From fd60e522d1dbd549afb0f9afc7c0fe59b98ba5db Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 14:54:31 -0500 Subject: [PATCH 004/222] Compute prior in both phase orientations --- hapaseg/allelic_DP.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 204119f..8c5ddb8 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -872,16 +872,17 @@ def run(self, n_iter = 50): np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null | {0})]] ] + # prior marginal likelihoods for both phase orientations prior_MLs = ss.betaln( # prior clusters + segment - np.r_[self.clust_prior_mat[prior_idx, 0]] + B_a + 1, - np.r_[self.clust_prior_mat[prior_idx, 1]] + B_b + 1 + np.c_[self.clust_prior_mat[prior_idx, 0]] + np.c_[B_a, B_b] + 1, + np.c_[self.clust_prior_mat[prior_idx, 1]] + np.c_[B_b, B_a] + 1 ) \ - - (ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]) # prior clusters, segment + - np.c_[ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]] # prior clusters, segment clust_prior_p = np.maximum(np.exp(prior_MLs - prior_MLs.max())/np.exp(prior_MLs - prior_MLs.max()).sum(), 1e-300) # expand MLs to account for multiple new clusters - MLs = np.r_[np.full(len(prior_diff), MLs[0]), MLs[1:]] + MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]] # DP prior based on clusters sizes # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster) @@ -890,7 +891,7 @@ def run(self, n_iter = 50): count_prior /= count_prior.sum() # choose to join a cluster or make a new one (choice_idx = 0) - num = MLs + np.log(count_prior) + np.log(clust_prior_p) + num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() choice_idx = np.random.choice( np.r_[0:len(MLs)], From 95cd50b22232acde087ef3b29b93a11b28c27d33 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 15:25:14 -0500 Subject: [PATCH 005/222] Pick new cluster accounting for phasing state --- hapaseg/allelic_DP.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 8c5ddb8..a751406 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -894,12 +894,16 @@ def run(self, n_iter = 50): num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() choice_idx = np.random.choice( - np.r_[0:len(MLs)], - p = choice_p + np.r_[0:np.prod(choice_p.shape)], + p = choice_p.ravel() ) # -1 = brand new, -2, -3, ... = -(prior clust index) - 2 # 0 = garbage - choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx] + choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx//2] + + # save rephasing status + if choice_idx & 1: + self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] # create new cluster if choice < 0: From 81e446914bcf1c9d8c411867e6d0e06e0e33f915 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 15:32:44 -0500 Subject: [PATCH 006/222] Apply some commits from fastphase branch --- hapaseg/allelic_DP.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index a751406..6144871 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -332,6 +332,10 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.clust_count_prior = clust_count_prior.copy() self.alpha = alpha + self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed + self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") + self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F") + # # define column indices self.clust_col = self.S.columns.get_loc("clust") @@ -447,8 +451,8 @@ def compute_adj_liks(self, seg_idx, cur_clust): (self.clusts[st - j] == U_cl or self.clusts[st - j] == 0): # again, skip over segments in the garbage if self.clusts[st - j] != 0: - UD_counts[o, 0] += self.S.iloc[st - j, self.min_col] - UD_counts[o, 1] += self.S.iloc[st - j, self.maj_col] + UD_counts[o, 0] += self._Siat_ph(st - j, min = True) + UD_counts[o, 1] += self._Siat_ph(st - j, min = False) j += 1 @@ -464,8 +468,8 @@ def compute_adj_liks(self, seg_idx, cur_clust): while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \ (self.clusts[en + j] == D_cl or self.clusts[en + j] == 0): if self.clusts[en + j] != 0: - UD_counts[o, 2] += self.S.iloc[en + j, self.min_col] - UD_counts[o, 3] += self.S.iloc[en + j, self.maj_col] + UD_counts[o, 2] += self._Siat_ph(en + j, min = True) + UD_counts[o, 3] += self._Siat_ph(en + j, min = False) j += 1 @@ -497,8 +501,8 @@ def compute_adj_liks(self, seg_idx, cur_clust): # min/maj counts of the segment(s) being moved st = ordpairs[j, 0] en = ordpairs[j, 1] - S_a = self.S.iloc[:, self.min_col].values[st:(en + 1)].sum() - S_b = self.S.iloc[:, self.maj_col].values[st:(en + 1)].sum() + S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # XXX: why en + 1? + S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # XXX: why en + 1? # adjacency likelihood of this segment remaining where it is # adj_AB += self.SJliks( @@ -608,11 +612,10 @@ def run(self, n_iter = 50): if len(self.clust_prior) > 1: for seg_idx in range(len(self.S)): seg_idx = np.r_[seg_idx] - #self.rephase(seg_idx) # compute probability that segment belongs to each cluster prior element - S_a = self.S.iloc[seg_idx[0], self.min_col] - S_b = self.S.iloc[seg_idx[0], self.maj_col] + S_a = self._Siat_ph(seg_idx[0], min = True) + S_b = self._Siat_ph(seg_idx[0], min = False) P_a = self.clust_prior_mat[1:, 0] P_b = self.clust_prior_mat[1:, 1] @@ -751,7 +754,7 @@ def run(self, n_iter = 50): del self.clust_sums[cur_clust] del self.clust_members[cur_clust] else: - self.clust_sums[cur_clust] -= np.r_[self.S.iloc[seg_idx, self.min_col].sum(), self.S.iloc[seg_idx, self.maj_col].sum()] + self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] self.clust_members[cur_clust] -= set(seg_idx) unassigned_segs.update(seg_idx) @@ -798,8 +801,8 @@ def run(self, n_iter = 50): # C is all possible clusters to move to A_a = self.clust_sums[cur_clust][0] if cur_clust in self.clust_sums else 0 A_b = self.clust_sums[cur_clust][1] if cur_clust in self.clust_sums else 0 - B_a = self.S.iloc[seg_idx, self.min_col].sum() # TODO: slow if seg_idx contains many SNPs - B_b = self.S.iloc[seg_idx, self.maj_col].sum() + B_a = self._Ssum_ph(seg_idx, min = True) + B_b = self._Ssum_ph(seg_idx, min = False) C_ab = np.r_[self.clust_sums.values()] # first terms: (-1) = make new cluster, (0) = garbage cluster #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust From f7e360c0583877d0305251757972d909a9be1026 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 15:37:27 -0500 Subject: [PATCH 007/222] Apply more commits from fastphase --- hapaseg/allelic_DP.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 6144871..5860eb7 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -358,11 +358,31 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage + def _Siat_ph(self, ridx, min = True): + # min, flip => maj + # ~min, ~flip => maj + # min, ~flip => min + # ~min, flip => min + col = self.min_col if self.S.iat[ridx, self.flip_col] ^ min else self.maj_col + return self.S.iat[ridx, col] + + def _Ssum_ph(self, seg_idx, min = True): + #flip = self.flip_mat[seg_idx] + flip = self.S.iloc[seg_idx, self.flip_col] + flip_n = ~flip + if min: + return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + else: + return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + def compute_rephase_prob(self, seg_idx): - A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1 - A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1 - B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1 - B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1 + flip = self.S.iloc[seg_idx, self.flip_col] + flip_n = ~flip + + A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 # use normal approximation to beta if conditions are right if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20: @@ -380,17 +400,6 @@ def compute_rephase_prob(self, seg_idx): return (x > y).mean() - def rephase(self, seg_idx, force = False): - do_rephase = False - if not force: - do_rephase = np.random.rand() < self.compute_rephase_prob(seg_idx) - - if force or do_rephase: - self.S.iloc[seg_idx, [self.min_col, self.maj_col]] = self.S.iloc[seg_idx, [self.min_col, self.maj_col]].values[:, ::-1] - self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]] = self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]].values[:, ::-1] - self.S.iloc[seg_idx, [self.aref_col, self.bref_col]] = self.S.iloc[seg_idx, [self.aref_col, self.bref_col]].values[:, ::-1] - self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] - def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_b, D_a, D_b): # if st == en: # J_a = S.iat[st, min_col].sum() From e9a9f00443131008c4ef4d0f8a4ce26b58112d68 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 15:38:32 -0500 Subject: [PATCH 008/222] Eliminate self.rephase --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 5860eb7..c6a3241 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -651,7 +651,7 @@ def run(self, n_iter = 50): # rephase if choice < 0: - self.rephase(seg_idx, force = True) + self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] choice = -choice self.S.iloc[seg_idx, self.clust_col] = choice From b75ae38d1375f176b81b65211ce6c0fa7b33caa7 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 15:40:56 -0500 Subject: [PATCH 009/222] Add clarifying comment --- hapaseg/allelic_DP.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index c6a3241..ddcdbf1 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -905,6 +905,7 @@ def run(self, n_iter = 50): # choose to join a cluster or make a new one (choice_idx = 0) num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() + # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( np.r_[0:np.prod(choice_p.shape)], p = choice_p.ravel() From 6d904da642540886ad375460fa546c0c253a1219 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 6 Dec 2021 16:05:27 -0500 Subject: [PATCH 010/222] Draft commit of getting rid of garbage cluster --- hapaseg/allelic_DP.py | 94 +++++++++++-------------------------------- 1 file changed, 24 insertions(+), 70 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index ddcdbf1..4632b22 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -111,7 +111,7 @@ def load_seg_samp(self, samp_idx): # initial cluster assignments S["clust"] = -1 # initially, all segments are unassigned - S.iloc[0, S.columns.get_loc("clust")] = 1 # first segment is assigned to cluster 1 + S.iloc[0, S.columns.get_loc("clust")] = 0 # first segment is assigned to cluster 0 # initial phasing orientation S["flipped"] = False @@ -231,10 +231,6 @@ def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None): del clust_prior[kk] del clust_count_prior[kk] - # remove garbage cluster from priors - #del clust_prior[0] - #del clust_count_prior[0] - return self.snps_to_clusters, self.snps_to_phases, self.DP_likelihoods def visualize_segs(self, snps_to_clusters = None, f = None, n_vis_samp = None): @@ -356,7 +352,6 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.clust_prior_mat = np.r_[self.clust_prior.values()] self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster - self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage def _Siat_ph(self, ridx, min = True): # min, flip => maj @@ -408,20 +403,14 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ # J_a = S.iloc[st:(en + 1), min_col].sum() # J_b = S.iloc[st:(en + 1), maj_col].sum() SU_a = SU_b = SD_a = SD_b = 0 - # if target segments are being moved to the garbage, it is equivalent to making them their own segment, and joining the upstream and downstream segments - if targ_clust == 0: - SU_a = J_a - SU_b = J_b - J_a = 0 - J_b = 0 - - if targ_clust != - 1 and (targ_clust == upstream_clust or targ_clust == 0): + + if targ_clust != - 1 and targ_clust == upstream_clust: J_a += U_a J_b += U_b else: SU_a += U_a SU_b += U_b - if targ_clust != - 1 and (targ_clust == downstream_clust or targ_clust == 0): + if targ_clust != - 1 and targ_clust == downstream_clust: J_a += D_a J_b += D_b else: @@ -447,48 +436,32 @@ def compute_adj_liks(self, seg_idx, cur_clust): for o, (st, en) in enumerate(ordpairs): # maj/min counts of contiguous upstream segments belonging to the same cluster if st - 1 > 0: - # skip over adjacent segments that are in the garbage; - # we only care about adjacent segments actually assigned to clusters j = 1 - while st - j > 0 and self.clusts[st - j] == 0: - j += 1 U_cl = self.clusts[st - j] adj_clusters[o, 0] = U_cl while st - j > 0 and self.clusts[st - j] != -1 and \ - (self.clusts[st - j] == U_cl or self.clusts[st - j] == 0): - # again, skip over segments in the garbage - if self.clusts[st - j] != 0: - UD_counts[o, 0] += self._Siat_ph(st - j, min = True) - UD_counts[o, 1] += self._Siat_ph(st - j, min = False) + self.clusts[st - j] == U_cl: + UD_counts[o, 0] += self._Siat_ph(st - j, min = True) + UD_counts[o, 1] += self._Siat_ph(st - j, min = False) j += 1 # maj/min counts of contiguous downstream segments belonging to the same cluster if en + 1 < len(self.S): j = 1 - while en + j < len(self.S) - 1 and self.clusts[en + j] == 0: - j += 1 D_cl = self.clusts[en + j] adj_clusters[o, 1] = D_cl while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \ - (self.clusts[en + j] == D_cl or self.clusts[en + j] == 0): - if self.clusts[en + j] != 0: - UD_counts[o, 2] += self._Siat_ph(en + j, min = True) - UD_counts[o, 3] += self._Siat_ph(en + j, min = False) + self.clusts[en + j] == D_cl: + UD_counts[o, 2] += self._Siat_ph(en + j, min = True) + UD_counts[o, 3] += self._Siat_ph(en + j, min = False) j += 1 - # if we are looking at the segments at the very start or very end, set - # upstream/downstream cluster indices to garbage - if ordpairs[0, 0] == 0: - adj_clusters[0, 0] = 0 - if ordpairs[-1, 1] == len(self.S) - 1: - adj_clusters[-1, 1] = 0 - # if there are any segments being moved adjacent to already existing clusters, get local split/join likelihoods adj_idx = ~(adj_clusters == -1).all(1) @@ -527,8 +500,8 @@ def compute_adj_liks(self, seg_idx, cur_clust): # ) # adjacency likelihood of this segment joining each possible cluster: - # 1. those it is actually adjacent to (+ new cluster, garbage) - for cl in {-1, 0, cl_u, cl_d}: + # 1. those it is actually adjacent to (+ new cluster) + for cl in {-1, cl_u, cl_d}: idx = self.clust_sums.index(cl) adj_BC[idx] += self.SJliks( targ_clust = cl, @@ -541,12 +514,9 @@ def compute_adj_liks(self, seg_idx, cur_clust): D_a = D_a, D_b = D_b ) - # we cannot send a segment to the garbage adjacent to any unassigned segment - if cl == 0 and (cl_u == -1 or cl_d == -1): - adj_BC[idx] = -np.inf # 2. clusters it is not adjacent to (use default split value) - for cl in self.clust_sums.keys() - ({-1, 0} | set(adj_clusters[adj_idx].ravel())): + for cl in self.clust_sums.keys() - ({-1} | set(adj_clusters[adj_idx].ravel())): idx = self.clust_sums.index(cl) adj_BC[idx] += self.SJliks( targ_clust = -1, @@ -559,9 +529,6 @@ def compute_adj_liks(self, seg_idx, cur_clust): D_a = D_a, D_b = D_b ) - else: - # we cannot send a segment to the garbage adjacent to any unassigned segment - adj_BC[self.clust_sums.index(0)] = -np.inf return adj_AB, adj_BC @@ -635,7 +602,7 @@ def run(self, n_iter = 50): ] # get count prior - ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1 and k != 0]] + ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1]] # posterior numerator num = P_l + np.log(ccp) @@ -664,9 +631,9 @@ def run(self, n_iter = 50): # for the first round of clustering, this is { 1 : 1 } self.clust_sums = sc.SortedDict({ **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() }, - **{-1 : np.r_[0, 0], 0 : np.r_[0, 0]} + **{-1 : np.r_[0, 0]} }) - # for the first round, this is { -1/0 : np.r_[0, 0], 1 : np.r_[S[0, "min"], S[0, "maj"]] } + # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] } self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 and k != 0 }) # for the first round, this is { 1 : {0} } unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1]) @@ -694,7 +661,6 @@ def run(self, n_iter = 50): if not n_it % 1000: print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) print("n unassigned: {}".format((self.S["clust"] == -1).sum())) - print("n garbage: {}".format((self.S["clust"] == 0).sum())) # we are burned in (n_seg/n_clust) iterations after all segments have been touched if not n_it % 100: @@ -728,20 +694,18 @@ def run(self, n_iter = 50): # expand segment to include all adjacent segments in the same cluster, # if it has already been assigned to a cluster - if cur_clust > 0 and np.random.rand() < 0.5: + if cur_clust >= 0 and np.random.rand() < 0.5: si = seg_idx[0] j = 1 while si - j > 0 and \ (self.clusts[si - j] == cur_clust or self.clusts[si - j] == 0): - if self.clusts[si - j] != 0: - seg_idx.add(si - j) + seg_idx.add(si - j) j += 1 j = 1 while si + j < len(self.S) and \ (self.clusts[si + j] == cur_clust or self.clusts[si + j] == 0): - if self.clusts[si + j] != 0: - seg_idx.add(si + j) + seg_idx.add(si + j) j += 1 # if we've expanded to include a large fraction (>10%) of segments @@ -756,7 +720,7 @@ def run(self, n_iter = 50): n_move = len(seg_idx) # if segment was already assigned to a cluster, unassign it - if cur_clust > 0: + if cur_clust >= 0: self.clust_counts[cur_clust] -= n_move if self.clust_counts[cur_clust] == 0: del self.clust_counts[cur_clust] @@ -812,7 +776,7 @@ def run(self, n_iter = 50): A_b = self.clust_sums[cur_clust][1] if cur_clust in self.clust_sums else 0 B_a = self._Ssum_ph(seg_idx, min = True) B_b = self._Ssum_ph(seg_idx, min = False) - C_ab = np.r_[self.clust_sums.values()] # first terms: (-1) = make new cluster, (0) = garbage cluster + C_ab = np.r_[self.clust_sums.values()] # first terms: -1 = make new cluster #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust # @@ -881,7 +845,7 @@ def run(self, n_iter = 50): # [-1 (totally new cluster), , ] prior_idx = np.r_[ np.r_[[self.clust_prior.index(x) for x in prior_diff]], - np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null | {0})]] + np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null)]] ] # prior marginal likelihoods for both phase orientations @@ -899,7 +863,7 @@ def run(self, n_iter = 50): # DP prior based on clusters sizes # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster) ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]] - count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_count_prior[0], self.clust_counts.values()] + count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_counts.values()] count_prior /= count_prior.sum() # choose to join a cluster or make a new one (choice_idx = 0) @@ -911,8 +875,7 @@ def run(self, n_iter = 50): p = choice_p.ravel() ) # -1 = brand new, -2, -3, ... = -(prior clust index) - 2 - # 0 = garbage - choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx//2] + choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2] # save rephasing status if choice_idx & 1: @@ -937,11 +900,6 @@ def run(self, n_iter = 50): self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] self.clust_members[new_clust_idx] = set(seg_idx) - # send to garbage - elif choice == 0: - self.S.iloc[seg_idx, self.clust_col] = 0 - self.clusts[seg_idx] = 0 - # join existing cluster else: # if we are combining two clusters, take the index of the bigger one @@ -1024,10 +982,6 @@ def visualize_segs(self): S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] for i, r in enumerate(S_ph.itertuples()): - ## don't show garbage clusters - #if s2cu[s2c[i]] == 0: - # continue - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], r.min + 1, r.maj + 1) ax.add_patch(mpl.patches.Rectangle((r.start_gp, ci_lo), r.end_gp - r.start_gp, ci_hi - ci_lo, facecolor = colors[s2c[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000)) From 575ae560edd06d55c4ae5f411430270cee582ace Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Dec 2021 14:30:39 -0500 Subject: [PATCH 011/222] Fix clust_sums update bug --- hapaseg/allelic_DP.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 4632b22..bdc904d 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -897,7 +897,7 @@ def run(self, n_iter = 50): self.S.iloc[seg_idx, self.clust_col] = new_clust_idx self.clusts[seg_idx] = new_clust_idx - self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] + self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a] self.clust_members[new_clust_idx] = set(seg_idx) # join existing cluster @@ -915,7 +915,7 @@ def run(self, n_iter = 50): choice = cl_idx self.clust_counts[choice] += n_move - self.clust_sums[choice] += np.r_[B_a, B_b] + self.clust_sums[choice] += np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a] self.S.iloc[seg_idx, self.clust_col] = choice self.clusts[seg_idx] = choice From 6d1c991062922cbdc9674374c0459576f84aa37c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Dec 2021 14:36:06 -0500 Subject: [PATCH 012/222] Bump DP alpha --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index bdc904d..4382e67 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -322,7 +322,7 @@ def scerrorbar(idx, rev = False, alpha = 1, show_CI = True): scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color) class DPinstance: - def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 0.1): + def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1): self.S = S self.clust_prior = clust_prior.copy() self.clust_count_prior = clust_count_prior.copy() From 4e9a00a2006f85bb4a362a002fc8237fbbce4d70 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Dec 2021 15:16:17 -0500 Subject: [PATCH 013/222] Fix another bug related to getting rid of garbage cluster --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 4382e67..99fef40 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -627,7 +627,7 @@ def run(self, n_iter = 50): # # initialize cluster tracking hash tables - self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore")) + self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore")) # for the first round of clustering, this is { 1 : 1 } self.clust_sums = sc.SortedDict({ **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() }, From 627b05920067f4d03eb127864adbd7637fed35a8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Dec 2021 15:16:42 -0500 Subject: [PATCH 014/222] Avoid divide by zero warning --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 99fef40..c985996 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -801,7 +801,7 @@ def run(self, n_iter = 50): # B+C is likelihood of target cluster post-join, with both phase orientations BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1) - MLs = BC - C[:, None] + np.log(np.r_[1 - rephase_prob, rephase_prob]) + MLs = BC - C[:, None] + np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) # TODO: get adj_BC working again # L(join) L(split) From 13498be4a271e55e67f416308e4dc22dc3d89dce Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Dec 2021 16:41:03 -0500 Subject: [PATCH 015/222] Fix bug computing overall likelihood --- hapaseg/allelic_DP.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index c985996..18f9eae 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -534,12 +534,15 @@ def compute_adj_liks(self, seg_idx, cur_clust): def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None): if segs_to_clusters is None: - _, segs_to_clusters = self.get_unique_clust_idxs() + su, segs_to_clusters = self.get_unique_clust_idxs() else: - _, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters) + su, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters) if phase_orientations is None: phase_orientations = np.r_[self.phase_orientations] + # account for unassigned clusters + min_clust_idx = 1 if (su == -1).any() else 0 + max_clust_idx = segs_to_clusters.max() + 1 liks = np.full(segs_to_clusters.shape[0], np.nan) @@ -556,7 +559,10 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double) count_prior /= count_prior.sum() - clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1) + np.log(count_prior)[count_prior > 0]).sum() + clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:].sum() + # account for unassigned clusters, if present + if min_clust_idx == 1: + clust_lik += ss.betaln(self.S.loc[cl_samp == 0, "maj"] + 1, self.S.loc[cl_samp == 0, "min"] + 1).sum() # ## segmentation likelihood # From 2cd6eb467746acfd5ac002be1ef7134e423fbcfc Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Dec 2021 18:52:16 -0500 Subject: [PATCH 016/222] adj_BC working again, accounting for phasing? --- hapaseg/allelic_DP.py | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 18f9eae..0e214fa 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -421,9 +421,10 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ def compute_adj_liks(self, seg_idx, cur_clust): adj_AB = 0 - adj_BC = np.zeros(len(self.clust_sums)) + adj_BC = np.zeros([len(self.clust_sums), 2]) # start/end coordinates of consecutive runs of segments being moved + # NOTE: ordpairs represents closed intervals! ordpairs = np.c_[ [np.r_[list(x)][[0, -1]] for x in more_itertools.consecutive_groups( np.sort(seg_idx)) @@ -483,8 +484,8 @@ def compute_adj_liks(self, seg_idx, cur_clust): # min/maj counts of the segment(s) being moved st = ordpairs[j, 0] en = ordpairs[j, 1] - S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # XXX: why en + 1? - S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # XXX: why en + 1? + S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is closed + S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # adjacency likelihood of this segment remaining where it is # adj_AB += self.SJliks( @@ -503,7 +504,7 @@ def compute_adj_liks(self, seg_idx, cur_clust): # 1. those it is actually adjacent to (+ new cluster) for cl in {-1, cl_u, cl_d}: idx = self.clust_sums.index(cl) - adj_BC[idx] += self.SJliks( + adj_BC[idx, 0] += self.SJliks( targ_clust = cl, upstream_clust = cl_u, downstream_clust = cl_d, @@ -514,11 +515,22 @@ def compute_adj_liks(self, seg_idx, cur_clust): D_a = D_a, D_b = D_b ) + adj_BC[idx, 1] += self.SJliks( + targ_clust = cl, + upstream_clust = cl_u, + downstream_clust = cl_d, + J_a = S_b, + J_b = S_a, + U_a = U_a, + U_b = U_b, + D_a = D_a, + D_b = D_b + ) # 2. clusters it is not adjacent to (use default split value) for cl in self.clust_sums.keys() - ({-1} | set(adj_clusters[adj_idx].ravel())): idx = self.clust_sums.index(cl) - adj_BC[idx] += self.SJliks( + adj_BC[idx, 0] += self.SJliks( targ_clust = -1, upstream_clust = -1, downstream_clust = -1, @@ -529,6 +541,17 @@ def compute_adj_liks(self, seg_idx, cur_clust): D_a = D_a, D_b = D_b ) + adj_BC[idx, 1] += self.SJliks( + targ_clust = -1, + upstream_clust = -1, + downstream_clust = -1, + J_a = S_b, + J_b = S_a, + U_a = U_a, + U_b = U_b, + D_a = D_a, + D_b = D_b + ) return adj_AB, adj_BC @@ -789,12 +812,10 @@ def run(self, n_iter = 50): # adjacent segment likelihoods adj_AB = 0 - adj_BC = np.zeros(len(self.clust_sums)) + adj_BC = np.zeros([len(self.clust_sums), 2]) - if not move_clust or (all_assigned and move_clust and np.random.rand() < 0.01): + if not move_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust) - else: - adj_BC[self.clust_sums.index(0)] = -np.inf # A+B,C -> A,B+C @@ -808,7 +829,6 @@ def run(self, n_iter = 50): BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1) MLs = BC - C[:, None] + np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) - # TODO: get adj_BC working again # L(join) L(split) #MLs = A + BC + adj_BC - (AB + C + adj_AB) @@ -873,7 +893,7 @@ def run(self, n_iter = 50): count_prior /= count_prior.sum() # choose to join a cluster or make a new one (choice_idx = 0) - num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + num = MLs + adj_BC + np.log(count_prior[:, None]) + np.log(clust_prior_p) choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( From f6b5dc54411abe5324c03becb09acb5d84531c19 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 8 Dec 2021 14:37:19 -0500 Subject: [PATCH 017/222] Incrementally update relative posterior estimate --- hapaseg/allelic_DP.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 0e214fa..7a290c5 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -680,9 +680,9 @@ def run(self, n_iter = 50): all_assigned = False seg_touch_idx = np.zeros(len(self.S), dtype = np.uint16) -# # containers for saving debugging information (overall likelihoods/cluster assignments pre-burnin) -# self.lik_tmp = [] -# self.vc_tmp = [] + # likelihood trace + self.lik_tmp = [] + self.post = 0 n_it = 0 n_it_last = 0 @@ -903,6 +903,11 @@ def run(self, n_iter = 50): # -1 = brand new, -2, -3, ... = -(prior clust index) - 2 choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2] + # compute posterior delta between previous and current state + post_delta = num.ravel()[choice_idx] - \ + num[self.clust_sums.index(cur_clust if cur_clust in self.clust_sums else -1), 0] + self.post += post_delta + # save rephasing status if choice_idx & 1: self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] From b8155ad750f526dfb2c762a4698bf7f84b43b2cc Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 9 Dec 2021 13:01:43 -0500 Subject: [PATCH 018/222] Remove another vestige of garbage clusters --- hapaseg/allelic_DP.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 7a290c5..98e2506 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -663,7 +663,7 @@ def run(self, n_iter = 50): **{-1 : np.r_[0, 0]} }) # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] } - self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 and k != 0 }) + self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 }) # for the first round, this is { 1 : {0} } unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1]) @@ -727,13 +727,11 @@ def run(self, n_iter = 50): si = seg_idx[0] j = 1 - while si - j > 0 and \ - (self.clusts[si - j] == cur_clust or self.clusts[si - j] == 0): + while si - j > 0 and self.clusts[si - j] == cur_clust: seg_idx.add(si - j) j += 1 j = 1 - while si + j < len(self.S) and \ - (self.clusts[si + j] == cur_clust or self.clusts[si + j] == 0): + while si + j < len(self.S) and self.clusts[si + j] == cur_clust: seg_idx.add(si + j) j += 1 From d4418436f5a64b3b8f23391c67fa1d59e09983f2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 9 Dec 2021 15:44:27 -0500 Subject: [PATCH 019/222] Initialize clust_counts to respect new phasing indexing scheme --- hapaseg/allelic_DP.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 98e2506..dcbc543 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -658,13 +658,19 @@ def run(self, n_iter = 50): # initialize cluster tracking hash tables self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore")) # for the first round of clustering, this is { 1 : 1 } + + x = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum() + if (x.droplevel(0).index == True).any(): + x.loc[(slice(None), True), ["min", "maj"]] = x.loc[(slice(None), True), ["maj", "min"]].values self.clust_sums = sc.SortedDict({ - **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() }, + **{ k : np.r_[v["min"], v["maj"]] for k, v in x.groupby(level = "clust").sum().to_dict(orient = "index").items() }, **{-1 : np.r_[0, 0]} }) # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] } + self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 }) # for the first round, this is { 1 : {0} } + unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1]) # store this as numpy for speed From 02ee2035f3fbd2ee40cb4080f774a61d770ba42e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 9 Dec 2021 15:48:08 -0500 Subject: [PATCH 020/222] Initial commit of cluster splitting --- hapaseg/allelic_DP.py | 108 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index dcbc543..af50e07 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -370,6 +370,16 @@ def _Ssum_ph(self, seg_idx, min = True): else: return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + def _Scumsum_ph(self, seg_idx, min = True): + flip = self.S.iloc[seg_idx, self.flip_col] + flip_n = ~flip + if min: + si = np.argsort(np.r_[seg_idx[flip_n], seg_idx[flip]]) + return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]][si].cumsum() + else: + si = np.argsort(np.r_[seg_idx[flip], seg_idx[flip_n]]) + return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]][si].cumsum() + def compute_rephase_prob(self, seg_idx): flip = self.S.iloc[seg_idx, self.flip_col] flip_n = ~flip @@ -555,6 +565,63 @@ def compute_adj_liks(self, seg_idx, cur_clust): return adj_AB, adj_BC + def compute_cluster_splitpoints(self, seg_idx): + spl = [] + + # left bias + end = len(seg_idx) + i = 0 + while True: + seg_idx_sp = seg_idx[0:end] + if len(seg_idx_sp) < 2: + break + + min_cs = self._Scumsum_ph(seg_idx_sp, min = True) + min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs + maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) + maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs + + split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1) + # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) + # NOTE: instead of argmax, probabilistically choose? will this make a difference? + + end = split_lik.argmax() + spl.append(end) + + if end <= 1 or end == len(split_lik) - 1: + break + + i += 1 + + # right bias + start = 0 + i = 0 + while True: + seg_idx_sp = seg_idx[start:] + if len(seg_idx_sp) < 2: + break + + min_cs = self._Scumsum_ph(seg_idx_sp, min = True) + min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs + maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) + maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs + + split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1) + # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) + + start += split_lik.argmax() + 1 + spl.append(start - 1) + + if start > len(seg_idx) - 1 or split_lik.argmax() == 0: + break + + i += 1 + + bdy = np.unique(np.r_[0, spl, len(seg_idx)]) + bdy = np.c_[bdy[:-1], bdy[1:]] + + return bdy + def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None): if segs_to_clusters is None: su, segs_to_clusters = self.get_unique_clust_idxs() @@ -744,10 +811,48 @@ def run(self, n_iter = 50): # if we've expanded to include a large fraction (>10%) of segments # in this cluster, cluster indexing might become inconsistent. # skip this iteration - if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]: +# if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]: +# breakpoint() +# n_it += 1 +# continue + + # propose splitting out a contiguous interval of segments within the current cluster + split_clust = False + if all_assigned and np.random.rand() < 0.1: + # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable? + clust_segs = np.sort(np.r_[list(self.clust_members[cur_clust])]) + split_bdy = self.compute_cluster_splitpoints(clust_segs) + + A_tot, B_tot = self.clust_sums[cur_clust] + + lik0 = ss.betaln(A_tot + 1, B_tot + 1) + + liks = np.zeros(len(split_bdy) + 1) + liks[-1] = lik0 # don't split at all + + # likelihood ratios for splitting each region into a new cluster + for i, (st, en) in enumerate(split_bdy): + A = self._Ssum_ph(clust_segs[st:en], min = True) + B = self._Ssum_ph(clust_segs[st:en], min = False) + + liks[i] = ss.betaln(A_tot - A + 1, B_tot - B + 1) + ss.betaln(A + 1, B + 1) + + # pick a region to split + split_idx = np.random.choice( + len(split_bdy) + 1, + p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum() + ) + + # don't split at all + if split_idx == len(split_bdy): n_it += 1 continue + # seg_idx == segments to propose to split off + seg_idx = clust_segs[slice(*split_bdy[split_idx])] + + split_clust = True + seg_idx = np.r_[list(seg_idx)] n_move = len(seg_idx) @@ -819,6 +924,7 @@ def run(self, n_iter = 50): adj_BC = np.zeros([len(self.clust_sums), 2]) if not move_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): + if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust) # A+B,C -> A,B+C From e12d4a9b3e55827388e118d242573efe5c010d43 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 9 Dec 2021 20:46:11 -0500 Subject: [PATCH 021/222] Initial commit of new burnin criterion --- hapaseg/allelic_DP.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index af50e07..ee80704 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -751,7 +751,8 @@ def run(self, n_iter = 50): burned_in = False all_assigned = False - seg_touch_idx = np.zeros(len(self.S), dtype = np.uint16) + all_touched = False + seg_touch_idx = np.zeros(len(self.S), dtype = bool) # likelihood trace self.lik_tmp = [] @@ -764,20 +765,23 @@ def run(self, n_iter = 50): print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) print("n unassigned: {}".format((self.S["clust"] == -1).sum())) - # we are burned in (n_seg/n_clust) iterations after all segments have been touched + # poll every 100 iterations for burnin status if not n_it % 100: - if not all_assigned and (((seg_touch_idx > 0) | (self.clusts == 0)).all() or \ - # if there is only one cluster, then consider every segment to have been touched - # otherwise, waiting for every segment to actually be touched will take forever - len(unassigned_segs) == 0 and len(self.clust_counts) == 1): + self.lik_tmp.append(self.post) + if not all_assigned and len(unassigned_segs) == 0: all_assigned = True - n_it_last = n_it - if not burned_in and all_assigned and \ - n_it - n_it_last > len(self.S)/len(self.clust_counts): - burned_in = True - -# self.lik_tmp.append(self.compute_overall_lik()) -# self.vc_tmp.append(self.S["clust"].value_counts()) + if not burned_in and all_assigned: + # 1. have >90% of segments been adjacency corrected? + # print(seg_touch_idx.mean()) + if seg_touch_idx.mean() > 0.9: + all_touched = True + + # 2. if >90% of segments have been adjacency corrected, check for burnin + # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum + # TODO: make this check more efficient? + if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2: + burned_in = True + breakpoint() # # pick either a segment or a cluster at random (50:50 prob.) @@ -923,9 +927,10 @@ def run(self, n_iter = 50): adj_AB = 0 adj_BC = np.zeros([len(self.clust_sums), 2]) - if not move_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust) + if all_assigned: + seg_touch_idx[seg_idx] = True # A+B,C -> A,B+C From 275d9902f384f7f275dff705f2d7be1e877fcb23 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Dec 2021 18:53:22 -0500 Subject: [PATCH 022/222] Properly cycle through cluster colors --- hapaseg/allelic_DP.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index ee80704..9bab53c 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1108,7 +1108,11 @@ def get_colors(self): si = np.argsort(tot_terr)[::-1] terr_cs = np.cumsum(tot_terr[si])/tot_terr.sum() - return [mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())] + colors_to_use = np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())]) + colors = np.zeros([len(s2cu), 4]) + n_distinct = colors_to_use.shape[0] + colors[si[:n_distinct], :] = colors_to_use + colors[si[n_distinct:], :] = colors_to_use[:(len(si) - n_distinct), :] def visualize_segs(self): plt.figure() From a851663f9bd2d74295e8a3798b595fbb9fca98b8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 17 Dec 2021 10:28:49 -0500 Subject: [PATCH 023/222] Make adjacent segment penalty a prior, not a likelihood --- hapaseg/allelic_DP.py | 92 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 14 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 9bab53c..c7ee6ea 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -429,6 +429,73 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1) + def compute_adj_prob(self, seg_idx): + ## compute boundaries of adjacent segments + + # maj/min counts of contiguous upstream segments belonging to the same cluster + st = seg_idx[0] + U_A = 0 + U_B = 0 + U_cl = -1 + if st - 1 > 0: + U_cl = self.clusts[st - 1] + j = 1 + while st - j > 0 and self.clusts[st - j] != -1 and \ + self.clusts[st - j] == U_cl: + U_A += self._Siat_ph(st - j, min = True) + U_B += self._Siat_ph(st - j, min = False) + + j += 1 + + # maj/min counts of contiguous downstream segments belonging to the same cluster + en = seg_idx[-1] + D_A = 0 + D_B = 0 + D_cl = -1 + if en + 1 < len(self.S): + D_cl = self.clusts[en + 1] + j = 1 + while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \ + self.clusts[en + j] == D_cl: + D_A += self._Siat_ph(en + j, min = True) + D_B += self._Siat_ph(en + j, min = False) + + j += 1 + + # maj/min counts of segment(s) being moved + S_A = self._Ssum_ph(seg_idx, min = True) + S_B = self._Ssum_ph(seg_idx, min = False) + + ## compute all four possible segmentations relative to neighbor, in + ## both phasing orientations + MLs = np.c_[ + # UTD T U D + # -^_ or -_- (U != T & T != D) (00) + np.r_[self.SJliks(1, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(1, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + # -__ (U != T & T == D) (01) + np.r_[self.SJliks(0, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(0, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + # --_ (U == T & T != D) (10) + np.r_[self.SJliks(1, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(1, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + # --- (U == T & T == D) (11) + np.r_[self.SJliks(0, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(0, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + ] + + ## match probs to cluster choices (will match MLs matrix in main calculation) + probs = np.zeros([len(self.clust_sums), 2]) + probs_idx = np.zeros([len(self.clust_sums), 2]).astype(np.uint8) + for k in self.clust_sums.keys(): + MLs_idx = np.r_[k == U_cl, k == D_cl]@np.r_[2, 1] + probs[self.clust_sums.index(k), :] = MLs[:, MLs_idx] + probs_idx[self.clust_sums.index(k), :] = np.r_[0, 4] + MLs_idx + + ## convert to conditional likelihoods, by scaling each likelihood by number of + ## cluster candidates with that segmentation configuration + return probs - np.log(np.bincount(probs_idx.ravel())[probs_idx]) + def compute_adj_liks(self, seg_idx, cur_clust): adj_AB = 0 adj_BC = np.zeros([len(self.clust_sums), 2]) @@ -921,17 +988,6 @@ def run(self, n_iter = 50): C_ab = np.r_[self.clust_sums.values()] # first terms: -1 = make new cluster #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust - # - # adjacent segment likelihoods - - adj_AB = 0 - adj_BC = np.zeros([len(self.clust_sums), 2]) - - if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): - adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust) - if all_assigned: - seg_touch_idx[seg_idx] = True - # A+B,C -> A,B+C # A+B is likelihood of current cluster B is part of @@ -961,7 +1017,7 @@ def run(self, n_iter = 50): # # priors - # prior on previous cluster fractions + ## prior on previous cluster fractions prior_diff = [] prior_com = [] @@ -1001,14 +1057,22 @@ def run(self, n_iter = 50): # expand MLs to account for multiple new clusters MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]] - # DP prior based on clusters sizes + ## DP prior based on clusters sizes # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster) ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]] count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_counts.values()] count_prior /= count_prior.sum() + # adjacent segment prior + + log_adj_prior = 0 + if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): + log_adj_prior = self.compute_adj_prob(seg_idx) + if all_assigned: + seg_touch_idx[seg_idx] = True + # choose to join a cluster or make a new one (choice_idx = 0) - num = MLs + adj_BC + np.log(count_prior[:, None]) + np.log(clust_prior_p) + num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + log_adj_prior choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( From 9b78b348cc4b4cb0b9889a9b9ff7164163d6cd07 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 22 Dec 2021 14:16:58 -0500 Subject: [PATCH 024/222] Add temperature parameter --- hapaseg/allelic_DP.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index c7ee6ea..b44e71d 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -322,11 +322,12 @@ def scerrorbar(idx, rev = False, alpha = 1, show_CI = True): scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color) class DPinstance: - def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1): + def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1): self.S = S self.clust_prior = clust_prior.copy() self.clust_count_prior = clust_count_prior.copy() self.alpha = alpha + self.temperature = temperature self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") @@ -1073,6 +1074,7 @@ def run(self, n_iter = 50): # choose to join a cluster or make a new one (choice_idx = 0) num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + log_adj_prior + num /= self.temperature choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( From 3ddffcd65dfea1ba4525848af3e7af5c771c10bc Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 22 Dec 2021 14:17:54 -0500 Subject: [PATCH 025/222] Add simple overall likelihood calculation --- hapaseg/allelic_DP.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index b44e71d..63a7aa9 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -344,6 +344,12 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.bref_col = self.S.columns.get_loc("B_ref") self.flip_col = self.S.columns.get_loc("flipped") + # + # compute rephase probabilities for each segment + self.S["rephase_prob"] = np.nan + for i in range(0, len(self.S)): + self.S.at[i, "rephase_prob"] = self.compute_rephase_prob(np.r_[i]) + # # initialize priors @@ -690,7 +696,33 @@ def compute_cluster_splitpoints(self, seg_idx): return bdy - def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None): + def compute_overall_lik_simple(self): + ## overall clustering likelihood + clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum() + + ## overall phasing likelihood + phase_lik = 1 - self.S["rephase_prob"].copy() + phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]] + phase_lik = np.log(phase_lik).sum() + + ## count prior + count_prior = np.r_[self.clust_counts.values()].astype(float) + count_prior /= count_prior.sum() + + ## segmentation likelihood + bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1]) + bdy = np.c_[bdy[:-1], bdy[1:]] + + seg_lik = 0.0 + for st, en in bdy: + seg_lik += ss.betaln( + self._Ssum_ph(np.r_[st:en], min = True) + 1, + self._Ssum_ph(np.r_[st:en], min = False) + 1 + ) + + return clust_lik + phase_lik + np.log(count_prior).sum() + seg_lik + + def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False): if segs_to_clusters is None: su, segs_to_clusters = self.get_unique_clust_idxs() else: From 3468a33b00fb4d6ebec3ced87836598ea3bfd2dc Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 5 Jan 2022 15:39:18 -0500 Subject: [PATCH 026/222] Use Dir-Cat marg. lik. in overall posterior --- hapaseg/allelic_DP.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 63a7aa9..8e71208 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -705,9 +705,10 @@ def compute_overall_lik_simple(self): phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]] phase_lik = np.log(phase_lik).sum() - ## count prior - count_prior = np.r_[self.clust_counts.values()].astype(float) - count_prior /= count_prior.sum() + ## Dirichlet count prior (Dirichlet-categorical marginal likelihood) + dirvec = np.r_[self.clust_counts.values()].astype(float) + k = len(dirvec) + count_prior = ss.gammaln(dirvec + self.alpha/k).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - k*ss.gammaln(self.alpha/k) ## segmentation likelihood bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1]) @@ -720,7 +721,7 @@ def compute_overall_lik_simple(self): self._Ssum_ph(np.r_[st:en], min = False) + 1 ) - return clust_lik + phase_lik + np.log(count_prior).sum() + seg_lik + return clust_lik + phase_lik + count_prior + seg_lik def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False): if segs_to_clusters is None: From c9bb0ad21fa65b59abfbf70054b1d3755f1badc4 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 11 Jan 2022 17:07:23 -0500 Subject: [PATCH 027/222] Don't rescale adjacency likelihoods --- hapaseg/allelic_DP.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 8e71208..b05f4b1 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -492,16 +492,12 @@ def compute_adj_prob(self, seg_idx): ] ## match probs to cluster choices (will match MLs matrix in main calculation) - probs = np.zeros([len(self.clust_sums), 2]) - probs_idx = np.zeros([len(self.clust_sums), 2]).astype(np.uint8) + probs = np.full([len(self.clust_sums), 2], -np.inf) for k in self.clust_sums.keys(): MLs_idx = np.r_[k == U_cl, k == D_cl]@np.r_[2, 1] probs[self.clust_sums.index(k), :] = MLs[:, MLs_idx] - probs_idx[self.clust_sums.index(k), :] = np.r_[0, 4] + MLs_idx - ## convert to conditional likelihoods, by scaling each likelihood by number of - ## cluster candidates with that segmentation configuration - return probs - np.log(np.bincount(probs_idx.ravel())[probs_idx]) + return probs def compute_adj_liks(self, seg_idx, cur_clust): adj_AB = 0 From acc88e04611050a2139248786a6e1491407a1005 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 11 Jan 2022 17:09:19 -0500 Subject: [PATCH 028/222] Use multi DP prior; use correct joint probability for p(clust,phase|X) --- hapaseg/allelic_DP.py | 53 ++++++++++++++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index b05f4b1..31f2116 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1029,8 +1029,9 @@ def run(self, n_iter = 50): # B+C is likelihood of target cluster post-join, with both phase orientations BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1) - MLs = BC - C[:, None] + np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) + MLs = BC - C[:, None] + # {{{ # L(join) L(split) #MLs = A + BC + adj_BC - (AB + C + adj_AB) # TODO: remove extraneous calculations (e.g. adj_AB, AB, A); @@ -1044,10 +1045,12 @@ def run(self, n_iter = 50): if n_move > 1 and not move_clust: MLs[self.clust_sums.index(-1)] = -np.inf + # }}} + # # priors - ## prior on previous cluster fractions + ## prior on previous cluster fractions {{{ prior_diff = [] prior_com = [] @@ -1086,25 +1089,49 @@ def run(self, n_iter = 50): # expand MLs to account for multiple new clusters MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]] + + # }}} ## DP prior based on clusters sizes - # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster) - ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]] - count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_counts.values()] - count_prior /= count_prior.sum() + n_c = np.c_[self.clust_counts.values()] + N = n_c.sum() + n_move + log_count_prior = np.full([len(self.clust_sums), 1], np.nan) + log_count_prior[1:] = ss.gammaln(n_move + n_c) + ss.gammaln(N + self.alpha - n_move) \ + - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha)) + # probability of opening a new cluster + # TODO: accommodate prior clusters here + log_count_prior[0] = ss.gammaln(n_move) + np.log(self.alpha) + ss.gammaln(N + self.alpha - n_move) - ss.gammaln(N + self.alpha) - # adjacent segment prior + # + # adjacent segment likelihood + + #adj_AB = 0 + #adj_BC = np.zeros([len(self.clust_sums), 2]) - log_adj_prior = 0 + log_adj_lik = 0 if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): - log_adj_prior = self.compute_adj_prob(seg_idx) + log_adj_lik = self.compute_adj_prob(seg_idx) if all_assigned: seg_touch_idx[seg_idx] = True - # choose to join a cluster or make a new one (choice_idx = 0) - num = MLs + np.log(count_prior[:, None]) + np.log(clust_prior_p) + log_adj_prior - num /= self.temperature - choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() + # p(X|clust,phase)p(X|seg,phase)p(clust) + num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) + + log_adj_lik # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B}) + + log_count_prior) # p(clust) (DP prior on clust counts) + + num /= self.temperature # scale by temperature for replica-exchange + + num -= num.max(0) # avoid underflow in sum-exp + + # p(clust|X,phase) + log_clust_post = num - np.log(np.exp(num).sum(0)) + + # p(phase|X) + log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) + + # p(clust,phase|X) = p(clust|X,phase)p(phase|X) + choice_p = np.exp(log_clust_post + log_phase_prob) + # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( np.r_[0:np.prod(choice_p.shape)], From f4ce65eca22340e9763f100a2210e65b952c2da0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 8 Feb 2022 09:33:59 -0500 Subject: [PATCH 029/222] Add note --- hapaseg/allelic_DP.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 31f2116..63e3990 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -500,6 +500,13 @@ def compute_adj_prob(self, seg_idx): return probs def compute_adj_liks(self, seg_idx, cur_clust): + # idea to simplify this code: + # - strip out logic for working with noncontiguous seg_idx's + # - compute all four possibile segmentations: + # ABC, AAB, ABB, AAA + # - associate those segmentations with each cluster choice, in order + # to return `adj_BC` with same size as `MLs` + adj_AB = 0 adj_BC = np.zeros([len(self.clust_sums), 2]) From 5f08b8925f0bb1593ca2d57828711bc553a90753 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 8 Feb 2022 09:35:23 -0500 Subject: [PATCH 030/222] Temporarily return individual components of overall joint likelihood --- hapaseg/allelic_DP.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 63e3990..4f5780a 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -701,14 +701,17 @@ def compute_cluster_splitpoints(self, seg_idx): def compute_overall_lik_simple(self): ## overall clustering likelihood + # p({a_i, b_i} | {c_k}, {phase_i}) clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum() ## overall phasing likelihood + # p({phase_i} | {a_i, b_i}) phase_lik = 1 - self.S["rephase_prob"].copy() phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]] phase_lik = np.log(phase_lik).sum() ## Dirichlet count prior (Dirichlet-categorical marginal likelihood) + # p({c_k}) dirvec = np.r_[self.clust_counts.values()].astype(float) k = len(dirvec) count_prior = ss.gammaln(dirvec + self.alpha/k).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - k*ss.gammaln(self.alpha/k) @@ -724,7 +727,9 @@ def compute_overall_lik_simple(self): self._Ssum_ph(np.r_[st:en], min = False) + 1 ) - return clust_lik + phase_lik + count_prior + seg_lik + # p({c_k}, {s}, {phase_i} | {a_i, b_i}) + #return clust_lik + phase_lik + count_prior + seg_lik + return np.r_[clust_lik, phase_lik, count_prior, seg_lik] def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False): if segs_to_clusters is None: From 6a10f4745b0fcf266139b977081cb6630a2e07a3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 8 Feb 2022 09:37:57 -0500 Subject: [PATCH 031/222] More flexible stopping criteria --- hapaseg/allelic_DP.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 4f5780a..3e18f33 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -786,7 +786,7 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None return liks - def run(self, n_iter = 50): + def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): # # assign segments to likeliest prior component {{{ @@ -869,11 +869,19 @@ def run(self, n_iter = 50): n_it = 0 n_it_last = 0 - while len(self.segs_to_clusters) < n_iter: + while True: if not n_it % 1000: print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) print("n unassigned: {}".format((self.S["clust"] == -1).sum())) + # stop after a raw number of iterations + if n_iter > 0 and n_it > n_iter: + return + +# # stop after a number of samples have been taken +# if n_samps > 0 and len() > n_samps: +# break + # poll every 100 iterations for burnin status if not n_it % 100: self.lik_tmp.append(self.post) From d464c7cdd92bc1a436b744308018bad2cc0b6480 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 8 Feb 2022 09:47:18 -0500 Subject: [PATCH 032/222] Don't track unassigned segs; unnecessary for warm start --- hapaseg/allelic_DP.py | 33 ++++++++------------------------- 1 file changed, 8 insertions(+), 25 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 3e18f33..52e1f99 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -786,7 +786,7 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None return liks - def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): + def run(self, n_iter = 0, n_samps = 0): # # assign segments to likeliest prior component {{{ @@ -847,8 +847,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 }) # for the first round, this is { 1 : {0} } - unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1]) - # store this as numpy for speed self.clusts = self.S["clust"].values @@ -859,7 +857,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): self.phase_orientations = [] burned_in = False - all_assigned = False all_touched = False seg_touch_idx = np.zeros(len(self.S), dtype = bool) @@ -884,10 +881,10 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): # poll every 100 iterations for burnin status if not n_it % 100: - self.lik_tmp.append(self.post) - if not all_assigned and len(unassigned_segs) == 0: - all_assigned = True - if not burned_in and all_assigned: + + # have most segments been adjacency corrected? + # if so, has the overall likelihood stabilized enough that we're burned in? + if not burned_in: # 1. have >90% of segments been adjacency corrected? # print(seg_touch_idx.mean()) if seg_touch_idx.mean() > 0.9: @@ -906,13 +903,7 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): # pick a segment at random if np.random.rand() < 0.5: - #if np.random.rand() < 1: - # bias picking unassigned segments if >90% of segments have been assigned - if len(unassigned_segs) > 0 and len(unassigned_segs)/len(self.S) < 0.1 and np.random.rand() < 0.5: - seg_idx = sc.SortedSet({np.random.choice(unassigned_segs)}) - else: - seg_idx = sc.SortedSet({np.random.choice(len(self.S))}) - + seg_idx = sc.SortedSet({np.random.choice(len(self.S))}) cur_clust = int(self.clusts[seg_idx]) # expand segment to include all adjacent segments in the same cluster, @@ -989,7 +980,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] self.clust_members[cur_clust] -= set(seg_idx) - unassigned_segs.update(seg_idx) self.clusts[seg_idx] = -1 # pick a cluster at random @@ -1011,14 +1001,10 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): del self.clust_counts[cl_idx] del self.clust_sums[cl_idx] del self.clust_members[cl_idx] - unassigned_segs.update(seg_idx) self.clusts[seg_idx] = -1 move_clust = True - if not all_assigned: - seg_touch_idx[seg_idx] += 1 - # # perform phase correction on segment/cluster # flip min/maj with probability that alleles are oriented the "wrong" way @@ -1129,10 +1115,9 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): #adj_BC = np.zeros([len(self.clust_sums), 2]) log_adj_lik = 0 - if not move_clust and not split_clust: # or (all_assigned and move_clust and np.random.rand() < 0.01): + if not move_clust and not split_clust: # or (move_clust and np.random.rand() < 0.01): log_adj_lik = self.compute_adj_prob(seg_idx) - if all_assigned: - seg_touch_idx[seg_idx] = True + seg_touch_idx[seg_idx] = True # p(X|clust,phase)p(X|seg,phase)p(clust) num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) @@ -1209,8 +1194,6 @@ def run(self, n_iter = 0, n_samps = 0, stop_after_assignment = False): self.clust_members[choice].update(set(seg_idx)) - for si in seg_idx: - unassigned_segs.discard(si) # track global state of cluster assignments # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations From b6b2348e1cc49731083393146d5af909f9c3a4a6 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 8 Feb 2022 09:55:15 -0500 Subject: [PATCH 033/222] Update comments accounting for warm start --- hapaseg/allelic_DP.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 52e1f99..98a3c78 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -833,7 +833,7 @@ def run(self, n_iter = 0, n_samps = 0): # # initialize cluster tracking hash tables self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore")) - # for the first round of clustering, this is { 1 : 1 } + # for the first round of clustering, this is { 0 : 1, 1 : 1, ..., N - 1 : 1 } x = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum() if (x.droplevel(0).index == True).any(): @@ -842,10 +842,10 @@ def run(self, n_iter = 0, n_samps = 0): **{ k : np.r_[v["min"], v["maj"]] for k, v in x.groupby(level = "clust").sum().to_dict(orient = "index").items() }, **{-1 : np.r_[0, 0]} }) - # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]] } + # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]], 1 : S[1, "min"], S[1, "maj"], ..., N : S[N - 1, "min"], S[N - 1, "maj"] } self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 }) - # for the first round, this is { 1 : {0} } + # for the first round, this is { 0 : {0}, 1 : {1}, ..., N - 1 : {N - 1} } # store this as numpy for speed self.clusts = self.S["clust"].values From 78059b940ffb12451307d20d25a464db3db5f605 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:36:45 -0500 Subject: [PATCH 034/222] Print abbreviated cluster summary for warm start --- hapaseg/allelic_DP.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 98a3c78..bde7f8e 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -861,15 +861,20 @@ def run(self, n_iter = 0, n_samps = 0): seg_touch_idx = np.zeros(len(self.S), dtype = bool) # likelihood trace - self.lik_tmp = [] + self.lik_tmp = [-np.inf] self.post = 0 n_it = 0 n_it_last = 0 while True: if not n_it % 1000: - print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) - print("n unassigned: {}".format((self.S["clust"] == -1).sum())) + if len(self.clust_counts) > 20: + print(pd.Series(self.clust_counts.values()).value_counts().sort_index()) + else: + print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1])) + print(self.lik_tmp[-1]) + #print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) + #print("n unassigned: {}".format((self.S["clust"] == -1).sum())) # stop after a raw number of iterations if n_iter > 0 and n_it > n_iter: From 99ceff430dff32dea929202a867482404bd14eb3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:40:43 -0500 Subject: [PATCH 035/222] Use correct count marginal likelihood --- hapaseg/allelic_DP.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index bde7f8e..7cfee62 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -714,9 +714,10 @@ def compute_overall_lik_simple(self): # p({c_k}) dirvec = np.r_[self.clust_counts.values()].astype(float) k = len(dirvec) - count_prior = ss.gammaln(dirvec + self.alpha/k).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - k*ss.gammaln(self.alpha/k) + count_prior = k*np.log(self.alpha) + ss.gammaln(dirvec).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) ## segmentation likelihood + # p({a_i, b_i} | {s}, {phase_i}) bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1]) bdy = np.c_[bdy[:-1], bdy[1:]] From eb1abc54e069737b814185c41cfab21092ff5046 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:41:05 -0500 Subject: [PATCH 036/222] Misc scrap commits to old overall likelihood function --- hapaseg/allelic_DP.py | 51 ++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 7cfee62..64d31ad 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -732,6 +732,7 @@ def compute_overall_lik_simple(self): #return clust_lik + phase_lik + count_prior + seg_lik return np.r_[clust_lik, phase_lik, count_prior, seg_lik] + # {{{ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False): if segs_to_clusters is None: su, segs_to_clusters = self.get_unique_clust_idxs() @@ -745,10 +746,11 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None max_clust_idx = segs_to_clusters.max() + 1 - liks = np.full(segs_to_clusters.shape[0], np.nan) + liks = np.full([segs_to_clusters.shape[0], 2], np.nan) for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)): ## overall clustering likelihood + clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum() A1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) A2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx) @@ -756,36 +758,45 @@ def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None B1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) B2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx) - count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double) + # print(A1[1:].sum(), B1[1:].sum(), A2[1:].sum(), B2[1:].sum()) + + count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)[min_clust_idx:] count_prior /= count_prior.sum() - clust_lik = (ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:].sum() + #breakpoint() + + clust_lik = ((ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:] + np.log(count_prior)).sum() # account for unassigned clusters, if present if min_clust_idx == 1: clust_lik += ss.betaln(self.S.loc[cl_samp == 0, "maj"] + 1, self.S.loc[cl_samp == 0, "min"] + 1).sum() -# ## segmentation likelihood -# -# # get segment boundaries -# bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1]) -# bdy = np.c_[bdy[:-1], bdy[1:]] -# -# # sum log-likelihoods of each segment -# seg_lik = 0 -# for st, en in bdy: -# A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum() + if debug: + breakpoint() + + ## segmentation likelihood + + seg_lik = np.nan +# if min_clust_idx == 0: +# # get segment boundaries +# bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1]) +# bdy = np.c_[bdy[:-1], bdy[1:]] # -## for when self.S is not modified -## A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ -## self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum() -## B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ -## self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum() +# # sum log-likelihoods of each segment +# seg_lik = 0 +# for st, en in bdy: +# A1 = self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum() +# A2 = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() +# B1 = self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum() +# B2 = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() # -# seg_lik += ss.betaln(A + 1, B + 1) +# seg_lik += ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1) +# else: +# seg_lik = np.nan - liks[i] = clust_lik + liks[i, :] = np.r_[clust_lik, seg_lik] return liks +# }}} def run(self, n_iter = 0, n_samps = 0): # From 5cf3e2d67d764ffbcfefdb4065fba0a7c6c15527 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:42:45 -0500 Subject: [PATCH 037/222] Remove outdated TODO --- hapaseg/allelic_DP.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 64d31ad..1295b09 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -982,6 +982,8 @@ def run(self, n_iter = 0, n_samps = 0): split_clust = True + # }}} + seg_idx = np.r_[list(seg_idx)] n_move = len(seg_idx) @@ -1122,7 +1124,6 @@ def run(self, n_iter = 0, n_samps = 0): log_count_prior[1:] = ss.gammaln(n_move + n_c) + ss.gammaln(N + self.alpha - n_move) \ - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha)) # probability of opening a new cluster - # TODO: accommodate prior clusters here log_count_prior[0] = ss.gammaln(n_move) + np.log(self.alpha) + ss.gammaln(N + self.alpha - n_move) - ss.gammaln(N + self.alpha) # From d88d7d90488b5ffcd30fdfcb1e0fe6e1f12ec4a0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:46:39 -0500 Subject: [PATCH 038/222] Remove cruft --- hapaseg/allelic_DP.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 1295b09..92883d1 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -225,7 +225,6 @@ def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None): if k != -1 and k not in next_clust_prior: clust_count_prior[k] -= clust_count_prior[k]/cur_samp_iter - # remove improbable clusters from prior for kk in [k for k, v in clust_count_prior.items() if v < 1]: del clust_prior[kk] @@ -1056,22 +1055,6 @@ def run(self, n_iter = 0, n_samps = 0): MLs = BC - C[:, None] - # {{{ - # L(join) L(split) - #MLs = A + BC + adj_BC - (AB + C + adj_AB) - # TODO: remove extraneous calculations (e.g. adj_AB, AB, A); - # likelihood simplifies to this in the prior: - #MLs = adj_BC + BC - C - - # if we are moving multiple contiguous segments assigned to the same - # cluster, do not allow them to create a new cluster. this helps keep - # cluster indices consistent - # TODO: if we don't care about keeping indices consistent, then we can probably remove this line - if n_move > 1 and not move_clust: - MLs[self.clust_sums.index(-1)] = -np.inf - - # }}} - # # priors @@ -1163,11 +1146,6 @@ def run(self, n_iter = 0, n_samps = 0): # -1 = brand new, -2, -3, ... = -(prior clust index) - 2 choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2] - # compute posterior delta between previous and current state - post_delta = num.ravel()[choice_idx] - \ - num[self.clust_sums.index(cur_clust if cur_clust in self.clust_sums else -1), 0] - self.post += post_delta - # save rephasing status if choice_idx & 1: self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] From d0e6e7d12bde60209947355945a354e454bcfe6b Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:48:31 -0500 Subject: [PATCH 039/222] Unfinished burnin-related code --- hapaseg/allelic_DP.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 92883d1..3dddcc4 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -897,7 +897,6 @@ def run(self, n_iter = 0, n_samps = 0): # poll every 100 iterations for burnin status if not n_it % 100: - # have most segments been adjacency corrected? # if so, has the overall likelihood stabilized enough that we're burned in? if not burned_in: @@ -909,9 +908,16 @@ def run(self, n_iter = 0, n_samps = 0): # 2. if >90% of segments have been adjacency corrected, check for burnin # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum # TODO: make this check more efficient? - if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2: - burned_in = True - breakpoint() +# if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2: +# pass +# burned_in = True +# n_it_last = n_it +# seg_touch_idx[:] = False + + if burned_in and seg_touch_idx.mean() > 0.3: + self.segs_to_clusters.append(self.S["clust"].copy()) + self.phase_orientations.append(self.S["flipped"].copy()) + seg_touch_idx[:] = False # # pick either a segment or a cluster at random (50:50 prob.) @@ -1190,7 +1196,6 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_members[choice].update(set(seg_idx)) - # track global state of cluster assignments # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2): From 1f43c5934836092ef5b7ca3009495802fe1b56a8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:37:28 -0500 Subject: [PATCH 040/222] Try clustering on the SNP level --- hapaseg/allelic_DP.py | 105 +++++++++++++----------------------------- 1 file changed, 33 insertions(+), 72 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 3dddcc4..97ae8fe 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -44,84 +44,45 @@ def load_seg_samp(self, samp_idx): if samp_idx > self.n_samp: raise ValueError(f"Only {self.n_samp} MCMC samples were taken!") - all_segs = [] - all_SNPs = [] - - maj_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MAJ_COUNT") - min_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MIN_COUNT") - - alt_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("ALT_COUNT") - ref_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("REF_COUNT") - - chunk_offset = 0 + SNPs = [] + clust_offset = 0 for _, H in self.allelic_segs.dropna(subset = ["results"]).iterrows(): - r = copy.deepcopy(H["results"]) - - # set phasing orientation back to original - for st, en in r.F.intervals(): - # code excised from flip_hap - x = r.P.iloc[st:en, maj_idx].copy() - r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx] - r.P.iloc[st:en, min_idx] = x - - # save SNPs for this chunk - if self.SNPs is None: - all_SNPs.append(pd.DataFrame({ - "maj" : r.P["MAJ_COUNT"], - "min" : r.P["MIN_COUNT"], - # TODO: gpos should be computed earlier, so that that we don't need to pass ref_fasta here - "gpos" : seq.chrpos2gpos(r.P.loc[0, "chr"], r.P["pos"], ref = self.ref_fasta), - "allele" : r.P["allele_A"] - })) - - # draw breakpoint, phasing, and SNP inclusion sample from segmentation MCMC trace - bp_samp, pi_samp, inc_samp = (r.breakpoint_list[samp_idx], r.phase_interval_list[samp_idx] if r.phase_correct else None, r.include[samp_idx]) - # flip everything according to sample - if r.phase_correct: - for st, en in pi_samp.intervals(): - x = r.P.iloc[st:en, maj_idx].copy() - r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx] - r.P.iloc[st:en, min_idx] = x - - bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]] - - # get major/minor sums for each segment - # also get {alt, ref} x {aidx, bidx} - for st, en in bpl: - all_segs.append([ - st + chunk_offset, en + chunk_offset, # SNP index for seg - r.P.loc[st, "chr"], r.P.loc[st, "pos"], r.P.loc[en, "pos"], # chromosomal position of seg - r._Piloc(st, en, min_idx, inc_samp).sum(), # min/maj counts - r._Piloc(st, en, maj_idx, inc_samp).sum(), - - r._Piloc(st, en, alt_idx, inc_samp & r.P["aidx"]).sum(), # allele A alt/ref - r._Piloc(st, en, ref_idx, inc_samp & r.P["aidx"]).sum(), - r._Piloc(st, en, alt_idx, inc_samp & ~r.P["aidx"]).sum(), # allele B alt/ref - r._Piloc(st, en, ref_idx, inc_samp & ~r.P["aidx"]).sum() - ]) - - chunk_offset += len(r.P) - - # convert samples into dataframe - S = pd.DataFrame(all_segs, columns = ["SNP_st", "SNP_en", "chr", "start", "end", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"]) + S = copy.deepcopy(H["results"].P) + S["A_alt"] = 0 + S.loc[S["aidx"], "A_alt"] = S.loc[S["aidx"], "ALT_COUNT"] + S["A_ref"] = 0 + S.loc[S["aidx"], "A_ref"] = S.loc[S["aidx"], "REF_COUNT"] + S["B_alt"] = 0 + S.loc[~S["aidx"], "B_alt"] = S.loc[~S["aidx"], "ALT_COUNT"] + S["B_ref"] = 0 + S.loc[~S["aidx"], "B_ref"] = S.loc[~S["aidx"], "REF_COUNT"] + + S = S.rename(columns = { "MIN_COUNT" : "min", "MAJ_COUNT" : "maj" }) + S = S.loc[:, ["chr", "pos", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"]] + + # set initial cluster assignments based on segmentation + S["clust"] = -1 + # TODO: use ML segmentation + bpl = np.array(H["results"].breakpoint_list[samp_idx]); bpl = np.c_[bpl[0:-1], bpl[1:]] + for i, (st, en) in enumerate(bpl): + S.iloc[st:en, S.columns.get_loc("clust")] = i + clust_offset + clust_offset += i + + # bug in segmentation omits final SNP? + S = S.iloc[:-1] + assert (S["clust"] != -1).all() + + SNPs.append(S) + + SNPs = pd.concat(SNPs, ignore_index = True) # convert chr-relative positions to absolute genomic coordinates - S["start_gp"] = seq.chrpos2gpos(S["chr"], S["start"], ref = self.ref_fasta) - S["end_gp"] = seq.chrpos2gpos(S["chr"], S["end"], ref = self.ref_fasta) - - # initial cluster assignments - S["clust"] = -1 # initially, all segments are unassigned - S.iloc[0, S.columns.get_loc("clust")] = 0 # first segment is assigned to cluster 0 + SNPs["pos_gp"] = seq.chrpos2gpos(SNPs["chr"], SNPs["pos"], ref = self.ref_fasta) # initial phasing orientation - S["flipped"] = False - - if self.SNPs is None: - self.SNPs = pd.concat(all_SNPs, ignore_index = True) - CI = s.beta.ppf([0.05, 0.5, 0.95], self.SNPs["min"].values[:, None] + 1, self.SNPs["maj"].values[:, None] + 1) - self.SNPs[["f_CI_lo", "f", "f_CI_hi"]] = CI + SNPs["flipped"] = False - return S, self.SNPs + return SNPs, None # map trace of segment cluster assignments to the SNPs within @staticmethod From 5d963fd122ef9fdd33ac5102eeaee7bc5d2179fc Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:38:33 -0500 Subject: [PATCH 041/222] Try moving segments by (almost) default --- hapaseg/allelic_DP.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 97ae8fe..0f4ced8 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -304,12 +304,6 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.bref_col = self.S.columns.get_loc("B_ref") self.flip_col = self.S.columns.get_loc("flipped") - # - # compute rephase probabilities for each segment - self.S["rephase_prob"] = np.nan - for i in range(0, len(self.S)): - self.S.at[i, "rephase_prob"] = self.compute_rephase_prob(np.r_[i]) - # # initialize priors @@ -891,7 +885,7 @@ def run(self, n_iter = 0, n_samps = 0): # expand segment to include all adjacent segments in the same cluster, # if it has already been assigned to a cluster - if cur_clust >= 0 and np.random.rand() < 0.5: + if cur_clust >= 0 and np.random.rand() < 0.95: si = seg_idx[0] j = 1 From 6d40fc30906b0dbc05e5dd684c235d3526c6a4eb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:39:45 -0500 Subject: [PATCH 042/222] Can't split segs of length 1 --- hapaseg/allelic_DP.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 0f4ced8..1d86ff1 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -905,11 +905,17 @@ def run(self, n_iter = 0, n_samps = 0): # n_it += 1 # continue - # propose splitting out a contiguous interval of segments within the current cluster + # propose splitting out a contiguous interval of segments within the current cluster {{{ split_clust = False - if all_assigned and np.random.rand() < 0.1: + if np.random.rand() < 0.1: # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable? clust_segs = np.sort(np.r_[list(self.clust_members[cur_clust])]) + + # can't split clusters of length 1 + if len(clust_segs) == 1: + n_it += 1 + continue + split_bdy = self.compute_cluster_splitpoints(clust_segs) A_tot, B_tot = self.clust_sums[cur_clust] From 81686cbbdfe3d73683ddb18a6f60d47f5ec2cc5d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 9 Feb 2022 06:51:17 -0500 Subject: [PATCH 043/222] Diagnostic code for printing each transition --- hapaseg/allelic_DP.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 1d86ff1..fd6d9e4 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1117,6 +1117,11 @@ def run(self, n_iter = 0, n_samps = 0): if choice_idx & 1: self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] + if not move_clust: + print(f"{cur_clust}->{choice} ({len(seg_idx)}, s, [{seg_idx[0]}, {seg_idx[-1]}])") + else: + print(f"{cl_idx}->{choice} ({len(seg_idx)}, c, [{seg_idx[0]}, {seg_idx[-1]}])") + # create new cluster if choice < 0: # if we are moving an entire cluster, give it the same index it used to have From 673a17259e619c1c69048c837559d342d75ad9a2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 08:41:50 -0500 Subject: [PATCH 044/222] Rescale DP counts by average segment length --- hapaseg/allelic_DP.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index fd6d9e4..25d68de 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -282,12 +282,13 @@ def scerrorbar(idx, rev = False, alpha = 1, show_CI = True): scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color) class DPinstance: - def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1): + def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1, dp_count_scale_factor = 1): self.S = S self.clust_prior = clust_prior.copy() self.clust_count_prior = clust_count_prior.copy() self.alpha = alpha self.temperature = temperature + self.dp_count_scale_factor = dp_count_scale_factor self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") @@ -1068,13 +1069,14 @@ def run(self, n_iter = 0, n_samps = 0): # }}} ## DP prior based on clusters sizes - n_c = np.c_[self.clust_counts.values()] - N = n_c.sum() + n_move + n_c = np.c_[self.clust_counts.values()]/self.dp_count_scale_factor + M = n_move/self.dp_count_scale_factor + N = n_c.sum() + M log_count_prior = np.full([len(self.clust_sums), 1], np.nan) - log_count_prior[1:] = ss.gammaln(n_move + n_c) + ss.gammaln(N + self.alpha - n_move) \ + log_count_prior[1:] = ss.gammaln(M + n_c) + ss.gammaln(N + self.alpha - M) \ - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha)) # probability of opening a new cluster - log_count_prior[0] = ss.gammaln(n_move) + np.log(self.alpha) + ss.gammaln(N + self.alpha - n_move) - ss.gammaln(N + self.alpha) + log_count_prior[0] = ss.gammaln(M) + np.log(self.alpha) + ss.gammaln(N + self.alpha - M) - ss.gammaln(N + self.alpha) # # adjacent segment likelihood From 0d51cce02fdbd0b99d14c42e465e1383f2e3ace9 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 10:08:07 -0500 Subject: [PATCH 045/222] Keep track of segmentation breakpoints --- hapaseg/allelic_DP.py | 64 +++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 25d68de..bea48d3 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -395,34 +395,42 @@ def compute_adj_prob(self, seg_idx): ## compute boundaries of adjacent segments # maj/min counts of contiguous upstream segments belonging to the same cluster - st = seg_idx[0] U_A = 0 U_B = 0 U_cl = -1 - if st - 1 > 0: - U_cl = self.clusts[st - 1] - j = 1 - while st - j > 0 and self.clusts[st - j] != -1 and \ - self.clusts[st - j] == U_cl: - U_A += self._Siat_ph(st - j, min = True) - U_B += self._Siat_ph(st - j, min = False) - - j += 1 + if seg_idx[0] - 1 > 0: + break_idx = self.breakpoints.index(seg_idx[0]) - 1 + seg_st = self.breakpoints[break_idx] + seg_en = self.breakpoints[break_idx + 1] + U_cl = self.clusts[seg_st] + while break_idx > 0 and self.clusts[seg_st] != -1 and \ + self.clusts[seg_st] == U_cl: + # TODO: segment sums will eventually be memoized + U_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True) + U_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False) + + break_idx = self.breakpoints.index(seg_st) - 1 + seg_st = self.breakpoints[break_idx] + seg_en = self.breakpoints[break_idx + 1] # maj/min counts of contiguous downstream segments belonging to the same cluster - en = seg_idx[-1] D_A = 0 D_B = 0 D_cl = -1 - if en + 1 < len(self.S): - D_cl = self.clusts[en + 1] - j = 1 - while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \ - self.clusts[en + j] == D_cl: - D_A += self._Siat_ph(en + j, min = True) - D_B += self._Siat_ph(en + j, min = False) - - j += 1 + if seg_idx[-1] + 1 < len(self.S): + break_idx = self.breakpoints.index(seg_idx[0]) + 1 + seg_st = self.breakpoints[break_idx] + seg_en = self.breakpoints[break_idx + 1] + D_cl = self.clusts[seg_st] + while break_idx < len(self.breakpoints) - 1 and self.clusts[seg_st] != -1 and \ + self.clusts[seg_st] == D_cl: + # TODO: segment sums will eventually be memoized + D_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True) + D_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False) + + break_idx = self.breakpoints.index(seg_st) + 1 + seg_st = self.breakpoints[break_idx] + seg_en = self.breakpoints[break_idx + 1] # maj/min counts of segment(s) being moved S_A = self._Ssum_ph(seg_idx, min = True) @@ -526,7 +534,7 @@ def compute_adj_liks(self, seg_idx, cur_clust): # min/maj counts of the segment(s) being moved st = ordpairs[j, 0] en = ordpairs[j, 1] - S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is closed + S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is a closed interval S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # adjacency likelihood of this segment remaining where it is @@ -819,6 +827,10 @@ def run(self, n_iter = 0, n_samps = 0): max_clust_idx = np.max(self.clust_members.keys() | self.clust_prior.keys() if self.clust_prior is not None else {}) + # segmentation breakpoints + self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)} + # TODO: memoize min/maj counts for each segment + # containers for saving the MCMC trace self.segs_to_clusters = [] self.phase_orientations = [] @@ -880,9 +892,13 @@ def run(self, n_iter = 0, n_samps = 0): move_clust = False # pick a segment at random - if np.random.rand() < 0.5: - seg_idx = sc.SortedSet({np.random.choice(len(self.S))}) - cur_clust = int(self.clusts[seg_idx]) + if True or np.random.rand() < 0.5: + # get all SNPs within this segment + # TODO: why is seg_idx a sortedset? we don't use that functionality elsewhere + break_idx = np.random.choice(len(self.breakpoints) - 1) + seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx]:self.breakpoints[break_idx + 1]]) + + cur_clust = int(self.clusts[seg_idx[0]]) # expand segment to include all adjacent segments in the same cluster, # if it has already been assigned to a cluster From 719fc7a7074e60a49281c57532d5b7b18e1d2944 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 10:43:39 -0500 Subject: [PATCH 046/222] Memoize segment min/maj counts --- hapaseg/allelic_DP.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index bea48d3..5716976 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -405,9 +405,8 @@ def compute_adj_prob(self, seg_idx): U_cl = self.clusts[seg_st] while break_idx > 0 and self.clusts[seg_st] != -1 and \ self.clusts[seg_st] == U_cl: - # TODO: segment sums will eventually be memoized - U_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True) - U_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False) + U_A += self.seg_sums[seg_st][0] + U_B += self.seg_sums[seg_st][1] break_idx = self.breakpoints.index(seg_st) - 1 seg_st = self.breakpoints[break_idx] @@ -424,17 +423,16 @@ def compute_adj_prob(self, seg_idx): D_cl = self.clusts[seg_st] while break_idx < len(self.breakpoints) - 1 and self.clusts[seg_st] != -1 and \ self.clusts[seg_st] == D_cl: - # TODO: segment sums will eventually be memoized - D_A += self._Ssum_ph(np.r_[seg_st:seg_en], min = True) - D_B += self._Ssum_ph(np.r_[seg_st:seg_en], min = False) + D_A += self.seg_sums[seg_st][0] + D_B += self.seg_sums[seg_st][1] break_idx = self.breakpoints.index(seg_st) + 1 seg_st = self.breakpoints[break_idx] seg_en = self.breakpoints[break_idx + 1] # maj/min counts of segment(s) being moved - S_A = self._Ssum_ph(seg_idx, min = True) - S_B = self._Ssum_ph(seg_idx, min = False) + S_A = self.seg_sums[seg_idx[0]][0] + S_B = self.seg_sums[seg_idx[0]][1] ## compute all four possible segmentations relative to neighbor, in ## both phasing orientations @@ -810,11 +808,11 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore")) # for the first round of clustering, this is { 0 : 1, 1 : 1, ..., N - 1 : 1 } - x = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum() - if (x.droplevel(0).index == True).any(): - x.loc[(slice(None), True), ["min", "maj"]] = x.loc[(slice(None), True), ["maj", "min"]].values + Sgc = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum() + if (Sgc.droplevel(0).index == True).any(): + Sgc.loc[(slice(None), True), ["min", "maj"]] = Sgc.loc[(slice(None), True), ["maj", "min"]].values self.clust_sums = sc.SortedDict({ - **{ k : np.r_[v["min"], v["maj"]] for k, v in x.groupby(level = "clust").sum().to_dict(orient = "index").items() }, + **{ k : np.r_[v["min"], v["maj"]] for k, v in Sgc.groupby(level = "clust").sum().to_dict(orient = "index").items() }, **{-1 : np.r_[0, 0]} }) # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]], 1 : S[1, "min"], S[1, "maj"], ..., N : S[N - 1, "min"], S[N - 1, "maj"] } @@ -829,7 +827,13 @@ def run(self, n_iter = 0, n_samps = 0): # segmentation breakpoints self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)} - # TODO: memoize min/maj counts for each segment + # min/maj counts in each segment + self.seg_sums = sc.SortedDict() + bpl = np.r_[self.breakpoints] + for st, en in np.c_[bpl[:-1], bpl[1:]]: + mn = self._Ssum_ph(np.r_[st:en], min = True) + mj = self._Ssum_ph(np.r_[st:en], min = False) + self.seg_sums[st] = np.r_[mn, mj] # containers for saving the MCMC trace self.segs_to_clusters = [] From bf337b6e88a45cd05edf73ce18c7ec8d85148ad3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 11:05:30 -0500 Subject: [PATCH 047/222] Fix indexing bug --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 5716976..97ba693 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -421,7 +421,7 @@ def compute_adj_prob(self, seg_idx): seg_st = self.breakpoints[break_idx] seg_en = self.breakpoints[break_idx + 1] D_cl = self.clusts[seg_st] - while break_idx < len(self.breakpoints) - 1 and self.clusts[seg_st] != -1 and \ + while break_idx < len(self.breakpoints) - 2 and self.clusts[seg_st] != -1 and \ self.clusts[seg_st] == D_cl: D_A += self.seg_sums[seg_st][0] D_B += self.seg_sums[seg_st][1] From a63f5deb442290c6e0414452d879a24c7ba2f9bb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 11:48:55 -0500 Subject: [PATCH 048/222] Merge adjacent segments if they're assigned to the same cluster --- hapaseg/allelic_DP.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 97ba693..9563781 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -898,9 +898,8 @@ def run(self, n_iter = 0, n_samps = 0): # pick a segment at random if True or np.random.rand() < 0.5: # get all SNPs within this segment - # TODO: why is seg_idx a sortedset? we don't use that functionality elsewhere - break_idx = np.random.choice(len(self.breakpoints) - 1) - seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx]:self.breakpoints[break_idx + 1]]) + break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) + seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]]) cur_clust = int(self.clusts[seg_idx[0]]) @@ -1184,6 +1183,21 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_members[choice].update(set(seg_idx)) + # update breakpoints + snp_idx = [self.breakpoints[b] for b in break_idx] + update_idx = sc.SortedSet() + for snp in snp_idx: + if self.clusts[snp - 1] == self.clusts[snp]: + self.breakpoints.remove(snp) + self.seg_sums.pop(snp) + update_idx.add(self.breakpoints.bisect_left(snp) - 1) + for bp_idx in update_idx: + st = self.breakpoints[bp_idx] + en = self.breakpoints[bp_idx + 1] + mn = self._Ssum_ph(np.r_[st:en], min = True) + mj = self._Ssum_ph(np.r_[st:en], min = False) + self.seg_sums[st] = np.r_[mn, mj] + # track global state of cluster assignments # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2): From daa625f0dbddd13b330760d7c71041e202a01c18 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 13:12:00 -0500 Subject: [PATCH 049/222] Need to consider both ends of segments --- hapaseg/allelic_DP.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 9563781..62162fb 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1184,10 +1184,10 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_members[choice].update(set(seg_idx)) # update breakpoints - snp_idx = [self.breakpoints[b] for b in break_idx] + snp_idx = [self.breakpoints[b] for b in break_idx | { x + 1 for x in break_idx }] update_idx = sc.SortedSet() for snp in snp_idx: - if self.clusts[snp - 1] == self.clusts[snp]: + if snp < len(self.S) and self.clusts[snp - 1] == self.clusts[snp]: self.breakpoints.remove(snp) self.seg_sums.pop(snp) update_idx.add(self.breakpoints.bisect_left(snp) - 1) From d7c9bd88c2658ff3fc5157b1d356be17d692098a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 14:19:51 -0500 Subject: [PATCH 050/222] Allow segments to be broken Also remove code for expanding into adjacent segments; this is no longer necessary since we keep track of breakpoints now --- hapaseg/allelic_DP.py | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 62162fb..97c8915 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -899,31 +899,27 @@ def run(self, n_iter = 0, n_samps = 0): if True or np.random.rand() < 0.5: # get all SNPs within this segment break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) - seg_idx = sc.SortedSet(np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]]) + seg_idx = np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]] cur_clust = int(self.clusts[seg_idx[0]]) - # expand segment to include all adjacent segments in the same cluster, - # if it has already been assigned to a cluster - if cur_clust >= 0 and np.random.rand() < 0.95: - si = seg_idx[0] - - j = 1 - while si - j > 0 and self.clusts[si - j] == cur_clust: - seg_idx.add(si - j) - j += 1 - j = 1 - while si + j < len(self.S) and self.clusts[si + j] == cur_clust: - seg_idx.add(si + j) - j += 1 - - # if we've expanded to include a large fraction (>10%) of segments - # in this cluster, cluster indexing might become inconsistent. - # skip this iteration -# if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]: -# breakpoint() -# n_it += 1 -# continue + # propose breaking this segment + if np.random.rand() < 0.1: + # can't split segments of length 1 + if len(seg_idx) == 1: + n_it += 1 + continue + + # TODO: memoize cumsums? + min_cs = self._Scumsum_ph(seg_idx, min = True) + min_csr = self.seg_sums[seg_idx[0]][0] - min_cs + maj_cs = self._Scumsum_ph(seg_idx, min = False) + maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs + + split_lik = ss.betaln(min_cs + 1, maj_cs + 1) + ss.betaln(min_csr + 1, maj_csr + 1) + split_lik -= split_lik.max() + split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) + seg_idx = seg_idx[:(split_point + 1)] # propose splitting out a contiguous interval of segments within the current cluster {{{ split_clust = False @@ -970,8 +966,6 @@ def run(self, n_iter = 0, n_samps = 0): # }}} - seg_idx = np.r_[list(seg_idx)] - n_move = len(seg_idx) # if segment was already assigned to a cluster, unassign it From afa35efe86abdec94efcfce54e9218bf1781b77a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 15:20:25 -0500 Subject: [PATCH 051/222] Add breakpoints when splitting segment --- hapaseg/allelic_DP.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 97c8915..173a6cf 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -899,7 +899,9 @@ def run(self, n_iter = 0, n_samps = 0): if True or np.random.rand() < 0.5: # get all SNPs within this segment break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) - seg_idx = np.r_[self.breakpoints[break_idx[0]]:self.breakpoints[break_idx[0] + 1]] + seg_st = self.breakpoints[break_idx[0]] + seg_en = self.breakpoints[break_idx[0] + 1] + seg_idx = np.r_[seg_st:seg_en] cur_clust = int(self.clusts[seg_idx[0]]) @@ -921,6 +923,13 @@ def run(self, n_iter = 0, n_samps = 0): split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) seg_idx = seg_idx[:(split_point + 1)] + # add breakpoint (can be erased subsequently if segment rejoins original cluster) + new_bp = seg_idx[-1] + 1 + if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment + self.breakpoints.add(new_bp) + self.seg_sums[new_bp] = np.r_[self._Ssum_ph(np.r_[new_bp:seg_en], min = True), self._Ssum_ph(np.r_[new_bp:seg_en], min = False)] + self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp] + # propose splitting out a contiguous interval of segments within the current cluster {{{ split_clust = False if np.random.rand() < 0.1: From 10629c34f012e1e45b1243f08825db5b70b334b6 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 25 Feb 2022 16:26:45 -0500 Subject: [PATCH 052/222] Update seg_sums for phase flips --- hapaseg/allelic_DP.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 173a6cf..f92bc28 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1140,6 +1140,10 @@ def run(self, n_iter = 0, n_samps = 0): # save rephasing status if choice_idx & 1: self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] + for b in break_idx: + st = self.breakpoints[b] + en = self.breakpoints[b + 1] + self.seg_sums[st] = self.seg_sums[st][::-1] if not move_clust: print(f"{cur_clust}->{choice} ({len(seg_idx)}, s, [{seg_idx[0]}, {seg_idx[-1]}])") @@ -1197,9 +1201,10 @@ def run(self, n_iter = 0, n_samps = 0): for bp_idx in update_idx: st = self.breakpoints[bp_idx] en = self.breakpoints[bp_idx + 1] - mn = self._Ssum_ph(np.r_[st:en], min = True) - mj = self._Ssum_ph(np.r_[st:en], min = False) - self.seg_sums[st] = np.r_[mn, mj] + self.seg_sums[st] = np.r_[ + self._Ssum_ph(np.r_[st:en], min = True), + self._Ssum_ph(np.r_[st:en], min = False) + ] # track global state of cluster assignments # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations From dd7ffdf5aece3f50f2797798637d7119adcd5bf6 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sun, 27 Feb 2022 10:48:59 -0500 Subject: [PATCH 053/222] Fix style --- hapaseg/allelic_DP.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index f92bc28..3e211f0 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -376,13 +376,13 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ # J_b = S.iloc[st:(en + 1), maj_col].sum() SU_a = SU_b = SD_a = SD_b = 0 - if targ_clust != - 1 and targ_clust == upstream_clust: + if targ_clust != -1 and targ_clust == upstream_clust: J_a += U_a J_b += U_b else: SU_a += U_a SU_b += U_b - if targ_clust != - 1 and targ_clust == downstream_clust: + if targ_clust != -1 and targ_clust == downstream_clust: J_a += D_a J_b += D_b else: From ff508a9107187f8752fa78878295724bc0e0265b Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sun, 27 Feb 2022 14:25:07 -0500 Subject: [PATCH 054/222] Add hash mapping cluster indices -> breakpoints --- hapaseg/allelic_DP.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 3e211f0..914e790 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -834,6 +834,11 @@ def run(self, n_iter = 0, n_samps = 0): mn = self._Ssum_ph(np.r_[st:en], min = True) mj = self._Ssum_ph(np.r_[st:en], min = False) self.seg_sums[st] = np.r_[mn, mj] + # breakpoints for each cluster + self.clust_members_bps = sc.SortedDict({ + k : sc.SortedSet(v) for k, v in \ + self.S.loc[self.breakpoints[:-1], ["clust"]].groupby("clust").groups.items() + }) # containers for saving the MCMC trace self.segs_to_clusters = [] @@ -929,6 +934,7 @@ def run(self, n_iter = 0, n_samps = 0): self.breakpoints.add(new_bp) self.seg_sums[new_bp] = np.r_[self._Ssum_ph(np.r_[new_bp:seg_en], min = True), self._Ssum_ph(np.r_[new_bp:seg_en], min = False)] self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp] + self.clust_members_bps[cur_clust].add(new_bp) # propose splitting out a contiguous interval of segments within the current cluster {{{ split_clust = False @@ -984,9 +990,11 @@ def run(self, n_iter = 0, n_samps = 0): del self.clust_counts[cur_clust] del self.clust_sums[cur_clust] del self.clust_members[cur_clust] + del self.clust_members_bps[cur_clust] else: self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] self.clust_members[cur_clust] -= set(seg_idx) + self.clust_members_bps[cur_clust].remove(self.breakpoints[break_idx[0]]) self.clusts[seg_idx] = -1 @@ -1206,6 +1214,11 @@ def run(self, n_iter = 0, n_samps = 0): self._Ssum_ph(np.r_[st:en], min = False) ] + if choice < 0: + self.clust_members_bps[new_clust_idx] = sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx]) + else: + self.clust_members_bps[choice] |= sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx]) + # track global state of cluster assignments # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2): From c135bec43b35916118f53670d96083c314402c53 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 2 Mar 2022 14:38:41 -0500 Subject: [PATCH 055/222] Track breakpoints within each cluster --- hapaseg/allelic_DP.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 914e790..e5f5ed9 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1007,6 +1007,10 @@ def run(self, n_iter = 0, n_samps = 0): cl_idx = np.random.choice(self.clust_counts.keys()) seg_idx = np.r_[list(self.clust_members[cl_idx])] + + # get all breakpoints corresponding to this cluster + break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cl_idx]]) + n_move = len(seg_idx) cur_clust = -1 # only applicable for individual segments, so we set to -1 here # (this is so that subsequent references to clust_sums[cur_clust] @@ -1017,6 +1021,7 @@ def run(self, n_iter = 0, n_samps = 0): del self.clust_counts[cl_idx] del self.clust_sums[cl_idx] del self.clust_members[cl_idx] + del self.clust_members_bps[cl_idx] self.clusts[seg_idx] = -1 move_clust = True @@ -1199,13 +1204,25 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_members[choice].update(set(seg_idx)) # update breakpoints - snp_idx = [self.breakpoints[b] for b in break_idx | { x + 1 for x in break_idx }] + + # B->A + # . . . break_idx + 1 + # A B A B A C B A + # + + + break_idx + # * * update_idx + + break_idx_bi = break_idx | { x + 1 for x in break_idx } + snp_idx_bi = sc.SortedSet([self.breakpoints[b] for b in break_idx_bi]) + snp_idx = sc.SortedSet([self.breakpoints[b] for b in break_idx]) update_idx = sc.SortedSet() - for snp in snp_idx: + for snp in snp_idx_bi: if snp < len(self.S) and self.clusts[snp - 1] == self.clusts[snp]: + snp_idx.discard(snp) # discard rather than remvoe because this could be in snp_idx + 1 self.breakpoints.remove(snp) self.seg_sums.pop(snp) + self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster update_idx.add(self.breakpoints.bisect_left(snp) - 1) + snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1]) for bp_idx in update_idx: st = self.breakpoints[bp_idx] en = self.breakpoints[bp_idx + 1] @@ -1215,9 +1232,9 @@ def run(self, n_iter = 0, n_samps = 0): ] if choice < 0: - self.clust_members_bps[new_clust_idx] = sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx]) + self.clust_members_bps[new_clust_idx] = snp_idx else: - self.clust_members_bps[choice] |= sc.SortedSet([self.breakpoints[b] for b in break_idx | update_idx]) + self.clust_members_bps[choice] |= snp_idx # track global state of cluster assignments # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations From 4bccf1a4575ed73576d737987cf58eeb24b13998 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 3 Mar 2022 12:29:05 -0500 Subject: [PATCH 056/222] Improve efficiency of computing adjacency likelihood --- hapaseg/allelic_DP.py | 78 ++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 50 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index e5f5ed9..51d83ed 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -391,48 +391,21 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1) - def compute_adj_prob(self, seg_idx): - ## compute boundaries of adjacent segments - - # maj/min counts of contiguous upstream segments belonging to the same cluster - U_A = 0 - U_B = 0 - U_cl = -1 - if seg_idx[0] - 1 > 0: - break_idx = self.breakpoints.index(seg_idx[0]) - 1 - seg_st = self.breakpoints[break_idx] - seg_en = self.breakpoints[break_idx + 1] - U_cl = self.clusts[seg_st] - while break_idx > 0 and self.clusts[seg_st] != -1 and \ - self.clusts[seg_st] == U_cl: - U_A += self.seg_sums[seg_st][0] - U_B += self.seg_sums[seg_st][1] - - break_idx = self.breakpoints.index(seg_st) - 1 - seg_st = self.breakpoints[break_idx] - seg_en = self.breakpoints[break_idx + 1] - - # maj/min counts of contiguous downstream segments belonging to the same cluster - D_A = 0 - D_B = 0 - D_cl = -1 - if seg_idx[-1] + 1 < len(self.S): - break_idx = self.breakpoints.index(seg_idx[0]) + 1 - seg_st = self.breakpoints[break_idx] - seg_en = self.breakpoints[break_idx + 1] - D_cl = self.clusts[seg_st] - while break_idx < len(self.breakpoints) - 2 and self.clusts[seg_st] != -1 and \ - self.clusts[seg_st] == D_cl: - D_A += self.seg_sums[seg_st][0] - D_B += self.seg_sums[seg_st][1] - - break_idx = self.breakpoints.index(seg_st) + 1 - seg_st = self.breakpoints[break_idx] - seg_en = self.breakpoints[break_idx + 1] - - # maj/min counts of segment(s) being moved - S_A = self.seg_sums[seg_idx[0]][0] - S_B = self.seg_sums[seg_idx[0]][1] + def compute_adj_prob(self, break_idx): + if break_idx > 1: + U_A, U_B = self.seg_sums[self.breakpoints[break_idx - 1]] + U_cl = self.clusts[self.breakpoints[break_idx - 1]] + else: + U_A = U_B = 0 + U_cl = -1 + if break_idx + 2 < len(self.breakpoints): + D_A, D_B = self.seg_sums[self.breakpoints[break_idx + 1]] + D_cl = self.clusts[self.breakpoints[break_idx + 1]] + else: + D_A = D_B = 0 + D_cl = -1 + + S_A, S_B = self.seg_sums[self.breakpoints[break_idx]] ## compute all four possible segmentations relative to neighbor, in ## both phasing orientations @@ -453,10 +426,15 @@ def compute_adj_prob(self, seg_idx): ] ## match probs to cluster choices (will match MLs matrix in main calculation) - probs = np.full([len(self.clust_sums), 2], -np.inf) - for k in self.clust_sums.keys(): - MLs_idx = np.r_[k == U_cl, k == D_cl]@np.r_[2, 1] - probs[self.clust_sums.index(k), :] = MLs[:, MLs_idx] + probs = np.full([len(self.clust_sums), 2], MLs[0, 0]) + if U_cl == D_cl and U_cl != -1 and D_cl != -1: + probs[self.clust_sums.index(U_cl), :] = MLs[:, 3] + probs[self.clust_sums.index(D_cl), :] = MLs[:, 3] + else: + if U_cl != -1: + probs[self.clust_sums.index(U_cl), :] = MLs[:, 2] + if D_cl != -1: + probs[self.clust_sums.index(D_cl), :] = MLs[:, 1] return probs @@ -1120,10 +1098,10 @@ def run(self, n_iter = 0, n_samps = 0): #adj_BC = np.zeros([len(self.clust_sums), 2]) log_adj_lik = 0 - if not move_clust and not split_clust: # or (move_clust and np.random.rand() < 0.01): - log_adj_lik = self.compute_adj_prob(seg_idx) - seg_touch_idx[seg_idx] = True - + if not move_clust: # or (move_clust and np.random.rand() < 0.01): + log_adj_lik = self.compute_adj_prob(break_idx[0]) + #seg_touch_idx[seg_idx] = True + # p(X|clust,phase)p(X|seg,phase)p(clust) num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) + log_adj_lik # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B}) From 3ea0fc8764603deb6c9265c324eb1816bfc20fab Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 4 Mar 2022 11:19:04 -0500 Subject: [PATCH 057/222] Allow beta hyperparam to be specified --- hapaseg/allelic_DP.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 51d83ed..45ae343 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -294,6 +294,8 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F") + self.betahyp = 1 + # # define column indices self.clust_col = self.S.columns.get_loc("clust") @@ -310,7 +312,7 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort # store likelihoods for each cluster in the prior (from previous iterations) self.clust_prior[-1] = np.r_[0, 0] - self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_prior.items()}) + self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_prior.items()}) self.clust_prior_mat = np.r_[self.clust_prior.values()] self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster @@ -389,7 +391,7 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ SD_a += D_a SD_b += D_b - return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1) + return ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) + ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp) def compute_adj_prob(self, break_idx): if break_idx > 1: @@ -597,7 +599,7 @@ def compute_cluster_splitpoints(self, seg_idx): maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs - split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1) + split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp) # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) # NOTE: instead of argmax, probabilistically choose? will this make a difference? @@ -622,7 +624,7 @@ def compute_cluster_splitpoints(self, seg_idx): maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs - split_lik = ss.betaln(min_cs[:-1] + 1, maj_cs[:-1] + 1) + ss.betaln(min_csr[1:] + 1, maj_csr[1:] + 1) + split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp) # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) start += split_lik.argmax() + 1 @@ -641,7 +643,7 @@ def compute_cluster_splitpoints(self, seg_idx): def compute_overall_lik_simple(self): ## overall clustering likelihood # p({a_i, b_i} | {c_k}, {phase_i}) - clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum() + clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) + self.betahyp for k, v in self.clust_sums.items() if k >= 0]].sum() ## overall phasing likelihood # p({phase_i} | {a_i, b_i}) @@ -901,7 +903,7 @@ def run(self, n_iter = 0, n_samps = 0): maj_cs = self._Scumsum_ph(seg_idx, min = False) maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs - split_lik = ss.betaln(min_cs + 1, maj_cs + 1) + ss.betaln(min_csr + 1, maj_csr + 1) + split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) split_lik -= split_lik.max() split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) seg_idx = seg_idx[:(split_point + 1)] @@ -929,7 +931,7 @@ def run(self, n_iter = 0, n_samps = 0): A_tot, B_tot = self.clust_sums[cur_clust] - lik0 = ss.betaln(A_tot + 1, B_tot + 1) + lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp) liks = np.zeros(len(split_bdy) + 1) liks[-1] = lik0 # don't split at all @@ -939,7 +941,7 @@ def run(self, n_iter = 0, n_samps = 0): A = self._Ssum_ph(clust_segs[st:en], min = True) B = self._Ssum_ph(clust_segs[st:en], min = False) - liks[i] = ss.betaln(A_tot - A + 1, B_tot - B + 1) + ss.betaln(A + 1, B + 1) + liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) # pick a region to split split_idx = np.random.choice( @@ -1028,11 +1030,11 @@ def run(self, n_iter = 0, n_samps = 0): # A+B is likelihood of current cluster B is part of #AB = ss.betaln(A_a + B_a + 1, A_b + B_b + 1) # C is likelihood of target cluster pre-join - C = ss.betaln(C_ab[:, 0] + 1, C_ab[:, 1] + 1) + C = ss.betaln(C_ab[:, 0] + 1 + self.betahyp, C_ab[:, 1] + 1 + self.betahyp) # A is likelihood cluster B is part of, minus B #A = ss.betaln(A_a + 1, A_b + 1) # B+C is likelihood of target cluster post-join, with both phase orientations - BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1, C_ab[:, [1]] + np.c_[B_b, B_a] + 1) + BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1 + self.betahyp, C_ab[:, [1]] + np.c_[B_b, B_a] + 1 + self.betahyp) MLs = BC - C[:, None] From 97d3d7a8921f06073177437f617bd0b79ede6909 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 7 Mar 2022 14:29:49 -0500 Subject: [PATCH 058/222] Add beta hyperparameter to rephasing --- hapaseg/allelic_DP.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 45ae343..8e042c0 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -348,10 +348,10 @@ def compute_rephase_prob(self, seg_idx): flip = self.S.iloc[seg_idx, self.flip_col] flip_n = ~flip - A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 - A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 - B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 - B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp + A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp + B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp + B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp # use normal approximation to beta if conditions are right if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20: From dc5f9c79890a5ec6d4784c1a0aa771b6ca779a85 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 7 Mar 2022 14:31:47 -0500 Subject: [PATCH 059/222] Compute segmentation likelihood on the fly --- hapaseg/allelic_DP.py | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 8e042c0..11ca47d 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -659,15 +659,7 @@ def compute_overall_lik_simple(self): ## segmentation likelihood # p({a_i, b_i} | {s}, {phase_i}) - bdy = np.flatnonzero(np.r_[1, np.diff(self.S["clust"]) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] - - seg_lik = 0.0 - for st, en in bdy: - seg_lik += ss.betaln( - self._Ssum_ph(np.r_[st:en], min = True) + 1, - self._Ssum_ph(np.r_[st:en], min = False) + 1 - ) + seg_lik = np.r_[self.seg_liks].sum() # p({c_k}, {s}, {phase_i} | {a_i, b_i}) #return clust_lik + phase_lik + count_prior + seg_lik @@ -805,8 +797,12 @@ def run(self, n_iter = 0, n_samps = 0): max_clust_idx = np.max(self.clust_members.keys() | self.clust_prior.keys() if self.clust_prior is not None else {}) + # + # breakpoint tracking + # segmentation breakpoints self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)} + # min/maj counts in each segment self.seg_sums = sc.SortedDict() bpl = np.r_[self.breakpoints] @@ -814,6 +810,12 @@ def run(self, n_iter = 0, n_samps = 0): mn = self._Ssum_ph(np.r_[st:en], min = True) mj = self._Ssum_ph(np.r_[st:en], min = False) self.seg_sums[st] = np.r_[mn, mj] + + # likelihoods for each segment + self.seg_liks = sc.SortedDict() + for k, (a, b) in self.seg_sums.items(): + self.seg_liks[k] = ss.betaln(a + 1 + self.betahyp, b + 1 + self.betahyp) + # breakpoints for each cluster self.clust_members_bps = sc.SortedDict({ k : sc.SortedSet(v) for k, v in \ @@ -912,8 +914,16 @@ def run(self, n_iter = 0, n_samps = 0): new_bp = seg_idx[-1] + 1 if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment self.breakpoints.add(new_bp) - self.seg_sums[new_bp] = np.r_[self._Ssum_ph(np.r_[new_bp:seg_en], min = True), self._Ssum_ph(np.r_[new_bp:seg_en], min = False)] + + A = self._Ssum_ph(np.r_[new_bp:seg_en], min = True) + B = self._Ssum_ph(np.r_[new_bp:seg_en], min = False) + + self.seg_sums[new_bp] = np.r_[A, B] self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp] + + self.seg_liks[new_bp] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + self.seg_liks[seg_idx[0]] -= self.seg_liks[new_bp] + self.clust_members_bps[cur_clust].add(new_bp) # propose splitting out a contiguous interval of segments within the current cluster {{{ @@ -1200,16 +1210,17 @@ def run(self, n_iter = 0, n_samps = 0): snp_idx.discard(snp) # discard rather than remvoe because this could be in snp_idx + 1 self.breakpoints.remove(snp) self.seg_sums.pop(snp) + self.seg_liks.pop(snp) self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster update_idx.add(self.breakpoints.bisect_left(snp) - 1) snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1]) for bp_idx in update_idx: st = self.breakpoints[bp_idx] en = self.breakpoints[bp_idx + 1] - self.seg_sums[st] = np.r_[ - self._Ssum_ph(np.r_[st:en], min = True), - self._Ssum_ph(np.r_[st:en], min = False) - ] + A = self._Ssum_ph(np.r_[st:en], min = True) + B = self._Ssum_ph(np.r_[st:en], min = False) + self.seg_sums[st] = np.r_[A, B] + self.seg_liks[st] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) if choice < 0: self.clust_members_bps[new_clust_idx] = snp_idx From 0b510e7e50dcc025c36d633928ad85d011f0e15e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 17 Mar 2022 14:52:23 -0400 Subject: [PATCH 060/222] Explicitly remove empty segment from nonsplit likelihood With beta hyperparameter = 0 this was fine, since Beta(A + 1, B + 1) + Beta(1, 1) == Beta(A + 1, B + 1) With beta hyp != 0, this is not fine, since Beta(A + 1 + h, B + 1 + h) + Beta(1 + h, 1 + h) != Beta(A + 1, B + 1) --- hapaseg/allelic_DP.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 11ca47d..7a3222b 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -906,6 +906,7 @@ def run(self, n_iter = 0, n_samps = 0): maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) + split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp) split_lik -= split_lik.max() split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) seg_idx = seg_idx[:(split_point + 1)] From 366ab1fd9e3b62961a83efb346e45a876bccfbd6 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 17 Mar 2022 16:03:31 -0400 Subject: [PATCH 061/222] Return proper adjacency likelihood with betahyp > 0 --- hapaseg/allelic_DP.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 7a3222b..4b13f72 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -391,7 +391,9 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ SD_a += D_a SD_b += D_b - return ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) + ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp) + return (ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) if SU_a > 0 or SU_b > 0 else 0) + \ + ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + \ + (ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp) if SD_a > 0 or SD_b > 0 else 0) def compute_adj_prob(self, break_idx): if break_idx > 1: From 5c01088aec136a86f72b470ec60b95e74ed10e58 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 5 Apr 2022 17:57:08 -0400 Subject: [PATCH 062/222] Fix up overall likelihood function --- hapaseg/allelic_DP.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 4b13f72..d058e1e 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -645,26 +645,28 @@ def compute_cluster_splitpoints(self, seg_idx): def compute_overall_lik_simple(self): ## overall clustering likelihood # p({a_i, b_i} | {c_k}, {phase_i}) - clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) + self.betahyp for k, v in self.clust_sums.items() if k >= 0]].sum() + clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_sums.items() if k >= 0]].sum() - ## overall phasing likelihood - # p({phase_i} | {a_i, b_i}) - phase_lik = 1 - self.S["rephase_prob"].copy() - phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]] - phase_lik = np.log(phase_lik).sum() +# ## overall phasing likelihood +# # p({phase_i} | {a_i, b_i}) +# TODO: memoize +# phase_lik = 1 - self.S["rephase_prob"].copy() +# phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]] +# phase_lik = np.log(phase_lik).sum() + phase_lik = 0 ## Dirichlet count prior (Dirichlet-categorical marginal likelihood) # p({c_k}) - dirvec = np.r_[self.clust_counts.values()].astype(float) + dirvec = np.r_[self.clust_counts.values()].astype(float)/self.dp_count_scale_factor k = len(dirvec) count_prior = k*np.log(self.alpha) + ss.gammaln(dirvec).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) ## segmentation likelihood # p({a_i, b_i} | {s}, {phase_i}) - seg_lik = np.r_[self.seg_liks].sum() + # TODO: memoize + seg_lik = np.r_[self.seg_liks.values()].sum() # p({c_k}, {s}, {phase_i} | {a_i, b_i}) - #return clust_lik + phase_lik + count_prior + seg_lik return np.r_[clust_lik, phase_lik, count_prior, seg_lik] # {{{ @@ -833,7 +835,7 @@ def run(self, n_iter = 0, n_samps = 0): seg_touch_idx = np.zeros(len(self.S), dtype = bool) # likelihood trace - self.lik_tmp = [-np.inf] + self.lik_trace = [] self.post = 0 n_it = 0 From 6ed69eb0dfdc5030fc6561ff930cf43c10c786f0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 5 Apr 2022 17:58:18 -0400 Subject: [PATCH 063/222] Sequentially scan over segments if >90% have been touched --- hapaseg/allelic_DP.py | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index d058e1e..b015533 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -831,8 +831,6 @@ def run(self, n_iter = 0, n_samps = 0): self.phase_orientations = [] burned_in = False - all_touched = False - seg_touch_idx = np.zeros(len(self.S), dtype = bool) # likelihood trace self.lik_trace = [] @@ -840,13 +838,17 @@ def run(self, n_iter = 0, n_samps = 0): n_it = 0 n_it_last = 0 + + brk = 0 + touch90 = False + while True: if not n_it % 1000: if len(self.clust_counts) > 20: print(pd.Series(self.clust_counts.values()).value_counts().sort_index()) else: print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1])) - print(self.lik_tmp[-1]) + print(brk % (len(self.breakpoints) - 1)) #print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) #print("n unassigned: {}".format((self.S["clust"] == -1).sum())) @@ -860,14 +862,14 @@ def run(self, n_iter = 0, n_samps = 0): # poll every 100 iterations for burnin status if not n_it % 100: + # have >90% of segments been touched? + if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.9: + touch90 = True + # have most segments been adjacency corrected? # if so, has the overall likelihood stabilized enough that we're burned in? if not burned_in: - # 1. have >90% of segments been adjacency corrected? - # print(seg_touch_idx.mean()) - if seg_touch_idx.mean() > 0.9: - all_touched = True - + pass # 2. if >90% of segments have been adjacency corrected, check for burnin # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum # TODO: make this check more efficient? @@ -877,19 +879,33 @@ def run(self, n_iter = 0, n_samps = 0): # n_it_last = n_it # seg_touch_idx[:] = False - if burned_in and seg_touch_idx.mean() > 0.3: + # start computing likelihoods + if touch90: + print(self.compute_overall_lik_simple()) + print(self.compute_overall_lik_simple().sum()) + self.lik_trace.append(self.compute_overall_lik_simple()) + + # save cluster assignments and phase orientations once burned in + if burned_in: self.segs_to_clusters.append(self.S["clust"].copy()) self.phase_orientations.append(self.S["flipped"].copy()) - seg_touch_idx[:] = False # # pick either a segment or a cluster at random (50:50 prob.) move_clust = False - # pick a segment at random - if True or np.random.rand() < 0.5: + # move a segment + #if not touch90 or np.random.rand() < 0.9: + if True or np.random.rand() < 0.9: + # >90% of segments have been moved; we are iterating over segments sequentially + if touch90: + break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)}) + brk += 1 + # we are picking segments at random + else: + break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) + # get all SNPs within this segment - break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) seg_st = self.breakpoints[break_idx[0]] seg_en = self.breakpoints[break_idx[0] + 1] seg_idx = np.r_[seg_st:seg_en] From 09fa7ca62bb9f074cd65f1c2ef3b0a3177645974 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 5 Apr 2022 19:05:24 -0400 Subject: [PATCH 064/222] Fix up cluster splitting --- hapaseg/allelic_DP.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index b015533..5fc6126 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -601,7 +601,7 @@ def compute_cluster_splitpoints(self, seg_idx): maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs - split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp) + split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) # NOTE: instead of argmax, probabilistically choose? will this make a difference? @@ -637,7 +637,7 @@ def compute_cluster_splitpoints(self, seg_idx): i += 1 - bdy = np.unique(np.r_[0, spl, len(seg_idx)]) + bdy = seg_idx[np.unique(np.r_[0, spl, len(seg_idx) - 1])] bdy = np.c_[bdy[:-1], bdy[1:]] return bdy From 90b474d3ded6e00c554520b41c5bf972d016c62f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 7 Apr 2022 13:50:17 -0400 Subject: [PATCH 065/222] Can't update likelihood by subtracting it off like that --- hapaseg/allelic_DP.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 5fc6126..cc9114e 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -943,7 +943,9 @@ def run(self, n_iter = 0, n_samps = 0): self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp] self.seg_liks[new_bp] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) - self.seg_liks[seg_idx[0]] -= self.seg_liks[new_bp] + A = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = True) + B = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = False) + self.seg_liks[seg_idx[0]] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) self.clust_members_bps[cur_clust].add(new_bp) From 5e7dce19411b4374fd262a749f914d93310ff331 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 7 Apr 2022 14:30:38 -0400 Subject: [PATCH 066/222] Fix bug introduced in 09fa7ca --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index cc9114e..8cd20a0 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -637,7 +637,7 @@ def compute_cluster_splitpoints(self, seg_idx): i += 1 - bdy = seg_idx[np.unique(np.r_[0, spl, len(seg_idx) - 1])] + bdy = np.unique(np.r_[0, spl, len(seg_idx)]) bdy = np.c_[bdy[:-1], bdy[1:]] return bdy From c5dd58d354fb1521d28dbdc90b24c46d0ef86eb4 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 7 Apr 2022 14:32:12 -0400 Subject: [PATCH 067/222] Put breakpoint adding into its own function Add breakpoints when splitting out contiguous range of SNPs within a cluster --- hapaseg/allelic_DP.py | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 8cd20a0..5d5dbc5 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -642,6 +642,24 @@ def compute_cluster_splitpoints(self, seg_idx): return bdy + def add_breakpoint(self, start, mid, end, clust_idx): + """ + Add breakpoint at mid belonging to clust_idx, between start and end + """ + self.breakpoints.add(mid) + self.clust_members_bps[clust_idx].add(mid) + + A = self._Ssum_ph(np.r_[mid:end], min = True) + B = self._Ssum_ph(np.r_[mid:end], min = False) + + self.seg_sums[mid] = np.r_[A, B] + self.seg_sums[st] -= self.seg_sums[mid] + + self.seg_liks[mid] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + A = self._Ssum_ph(np.r_[start:mid], min = True) + B = self._Ssum_ph(np.r_[start:mid], min = False) + self.seg_liks[start] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + def compute_overall_lik_simple(self): ## overall clustering likelihood # p({a_i, b_i} | {c_k}, {phase_i}) @@ -934,20 +952,7 @@ def run(self, n_iter = 0, n_samps = 0): # add breakpoint (can be erased subsequently if segment rejoins original cluster) new_bp = seg_idx[-1] + 1 if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment - self.breakpoints.add(new_bp) - - A = self._Ssum_ph(np.r_[new_bp:seg_en], min = True) - B = self._Ssum_ph(np.r_[new_bp:seg_en], min = False) - - self.seg_sums[new_bp] = np.r_[A, B] - self.seg_sums[seg_idx[0]] -= self.seg_sums[new_bp] - - self.seg_liks[new_bp] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) - A = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = True) - B = self._Ssum_ph(np.r_[seg_idx[0]:new_bp], min = False) - self.seg_liks[seg_idx[0]] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) - - self.clust_members_bps[cur_clust].add(new_bp) + self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust) # propose splitting out a contiguous interval of segments within the current cluster {{{ split_clust = False @@ -992,6 +997,16 @@ def run(self, n_iter = 0, n_samps = 0): split_clust = True + # add breakpoints + for si in [seg_idx[0], seg_idx[-1]]: + if si not in self.breakpoints: + seg_st_idx = self.breakpoints.bisect_left(si) - 1 + seg_st = self.breakpoints[seg_st_idx] + seg_en_idx = self.breakpoints.bisect_left(si) + seg_en = self.breakpoints[seg_en_idx] + + self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust) + # }}} n_move = len(seg_idx) From 56a654349a3b3e69afad7f05d9c6d136e3966b7a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sat, 9 Apr 2022 17:21:30 -0400 Subject: [PATCH 068/222] segs -> snps --- hapaseg/allelic_DP.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 5d5dbc5..76dcd1f 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -958,14 +958,14 @@ def run(self, n_iter = 0, n_samps = 0): split_clust = False if np.random.rand() < 0.1: # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable? - clust_segs = np.sort(np.r_[list(self.clust_members[cur_clust])]) + clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])]) # can't split clusters of length 1 - if len(clust_segs) == 1: + if len(clust_snps) == 1: n_it += 1 continue - split_bdy = self.compute_cluster_splitpoints(clust_segs) + split_bdy = self.compute_cluster_splitpoints(clust_snps) A_tot, B_tot = self.clust_sums[cur_clust] @@ -976,8 +976,8 @@ def run(self, n_iter = 0, n_samps = 0): # likelihood ratios for splitting each region into a new cluster for i, (st, en) in enumerate(split_bdy): - A = self._Ssum_ph(clust_segs[st:en], min = True) - B = self._Ssum_ph(clust_segs[st:en], min = False) + A = self._Ssum_ph(clust_snps[st:en], min = True) + B = self._Ssum_ph(clust_snps[st:en], min = False) liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) @@ -992,8 +992,8 @@ def run(self, n_iter = 0, n_samps = 0): n_it += 1 continue - # seg_idx == segments to propose to split off - seg_idx = clust_segs[slice(*split_bdy[split_idx])] + # seg_idx == SNPs to propose to split off + seg_idx = clust_snps[slice(*split_bdy[split_idx])] split_clust = True From 941f05194337d26267a876b1107adb659465797c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sat, 9 Apr 2022 17:22:00 -0400 Subject: [PATCH 069/222] fix typo --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 76dcd1f..1b2d906 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -653,7 +653,7 @@ def add_breakpoint(self, start, mid, end, clust_idx): B = self._Ssum_ph(np.r_[mid:end], min = False) self.seg_sums[mid] = np.r_[A, B] - self.seg_sums[st] -= self.seg_sums[mid] + self.seg_sums[start] -= self.seg_sums[mid] self.seg_liks[mid] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) A = self._Ssum_ph(np.r_[start:mid], min = True) From 6faa2b178366e8bf4e44c0ac5339091353499d64 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sat, 9 Apr 2022 17:22:50 -0400 Subject: [PATCH 070/222] Properly update breakpoints when splitting cluster --- hapaseg/allelic_DP.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 1b2d906..22e6998 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1007,6 +1007,11 @@ def run(self, n_iter = 0, n_samps = 0): self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust) + # get all breakpoints within this cluster/interval + left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0]) + right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1]) + break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]]) + # }}} n_move = len(seg_idx) @@ -1022,7 +1027,8 @@ def run(self, n_iter = 0, n_samps = 0): else: self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] self.clust_members[cur_clust] -= set(seg_idx) - self.clust_members_bps[cur_clust].remove(self.breakpoints[break_idx[0]]) + for b in break_idx: + self.clust_members_bps[cur_clust].remove(self.breakpoints[b]) self.clusts[seg_idx] = -1 From 5fe512bce552a51be361854c609212819a254439 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 12:05:36 -0400 Subject: [PATCH 071/222] Clarify which breakpoints get updated --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 22e6998..7f1217c 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1243,7 +1243,7 @@ def run(self, n_iter = 0, n_samps = 0): # . . . break_idx + 1 # A B A B A C B A # + + + break_idx - # * * update_idx + #* * update_idx break_idx_bi = break_idx | { x + 1 for x in break_idx } snp_idx_bi = sc.SortedSet([self.breakpoints[b] for b in break_idx_bi]) From b6e5d05d75f5249fe54258583fe7d867ea0b0e9d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 14:16:38 -0400 Subject: [PATCH 072/222] Memoize segment misphase probabilities --- hapaseg/allelic_DP.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 7f1217c..7379e6f 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -660,6 +660,9 @@ def add_breakpoint(self, start, mid, end, clust_idx): B = self._Ssum_ph(np.r_[start:mid], min = False) self.seg_liks[start] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + self.seg_phase_probs[start] = self.compute_rephase_prob(np.r_[start:mid]) + self.seg_phase_probs[mid] = self.compute_rephase_prob(np.r_[mid:end]) + def compute_overall_lik_simple(self): ## overall clustering likelihood # p({a_i, b_i} | {c_k}, {phase_i}) @@ -844,6 +847,9 @@ def run(self, n_iter = 0, n_samps = 0): self.S.loc[self.breakpoints[:-1], ["clust"]].groupby("clust").groups.items() }) + # misphase probabilities for each segment + self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints }) + # containers for saving the MCMC trace self.segs_to_clusters = [] self.phase_orientations = [] @@ -1063,7 +1069,7 @@ def run(self, n_iter = 0, n_samps = 0): # # perform phase correction on segment/cluster # flip min/maj with probability that alleles are oriented the "wrong" way - rephase_prob = self.compute_rephase_prob(seg_idx) + rephase_prob = self.seg_phase_probs[seg_idx[0]] if not np.isnan(self.seg_phase_probs[seg_idx[0]]) else self.compute_rephase_prob(seg_idx) # # choose to join a cluster or make a new one @@ -1237,6 +1243,13 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_members[choice].update(set(seg_idx)) + # if segment was rephased, update saved phasing probabilities + if choice_idx & 1: + for bp_idx in break_idx: + st = self.breakpoints[bp_idx] + en = self.breakpoints[bp_idx + 1] + self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en]) + # update breakpoints # B->A @@ -1255,6 +1268,7 @@ def run(self, n_iter = 0, n_samps = 0): self.breakpoints.remove(snp) self.seg_sums.pop(snp) self.seg_liks.pop(snp) + self.seg_phase_probs.pop(snp) self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster update_idx.add(self.breakpoints.bisect_left(snp) - 1) snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1]) @@ -1265,6 +1279,7 @@ def run(self, n_iter = 0, n_samps = 0): B = self._Ssum_ph(np.r_[st:en], min = False) self.seg_sums[st] = np.r_[A, B] self.seg_liks[st] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en]) if choice < 0: self.clust_members_bps[new_clust_idx] = snp_idx From 0bee5ff055434426fce1f33c13f5bad10c7b19fe Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 16:39:04 -0400 Subject: [PATCH 073/222] Fix a couple phase tracking bugs --- hapaseg/allelic_DP.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 7379e6f..a9438f8 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -848,7 +848,7 @@ def run(self, n_iter = 0, n_samps = 0): }) # misphase probabilities for each segment - self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints }) + self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints[:-1] }) # containers for saving the MCMC trace self.segs_to_clusters = [] @@ -1069,7 +1069,9 @@ def run(self, n_iter = 0, n_samps = 0): # # perform phase correction on segment/cluster # flip min/maj with probability that alleles are oriented the "wrong" way - rephase_prob = self.seg_phase_probs[seg_idx[0]] if not np.isnan(self.seg_phase_probs[seg_idx[0]]) else self.compute_rephase_prob(seg_idx) + if np.isnan(self.seg_phase_probs[seg_idx[0]]): + self.seg_phase_probs[seg_idx[0]] = self.compute_rephase_prob(seg_idx) + rephase_prob = self.seg_phase_probs[seg_idx[0]] # # choose to join a cluster or make a new one From 8899a5bb879ac7c733d1e42dca5c0386a7d3e312 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 16:40:33 -0400 Subject: [PATCH 074/222] Roughly(?) compute overall phasing likelihood --- hapaseg/allelic_DP.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index a9438f8..35055ef 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -668,13 +668,9 @@ def compute_overall_lik_simple(self): # p({a_i, b_i} | {c_k}, {phase_i}) clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_sums.items() if k >= 0]].sum() -# ## overall phasing likelihood -# # p({phase_i} | {a_i, b_i}) -# TODO: memoize -# phase_lik = 1 - self.S["rephase_prob"].copy() -# phase_lik[self.S["flipped"]] = 1 - phase_lik[self.S["flipped"]] -# phase_lik = np.log(phase_lik).sum() - phase_lik = 0 + ## overall phasing likelihood + # p({phase_i} | {a_i, b_i}) + phase_lik = np.log1p(-np.r_[self.seg_phase_probs.values()]).sum() ## Dirichlet count prior (Dirichlet-categorical marginal likelihood) # p({c_k}) From bf61793eda7dcc14b852a6840cc41eff36edb7b7 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 21:56:39 -0400 Subject: [PATCH 075/222] Remove phase correction from initial MCMC --- hapaseg/allelic_MCMC.py | 345 +--------------------------------------- 1 file changed, 3 insertions(+), 342 deletions(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index d38d67e..8f7b983 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -24,8 +24,6 @@ def __init__(self, P, quit_after_burnin = False, n_iter = 100000, ref_bias = 1.0, - misphase_prior = 0.001, - phase_correct = False ): # # dataframe stuff @@ -59,22 +57,11 @@ def __init__(self, P, self.quit_after_burnin = quit_after_burnin - self.misphase_prior = misphase_prior - - # whether to perform phasing correction iterations - self.phase_correct = phase_correct - - # how many post-burnin samples to use to infer phase switches - self.n_phase_correct_samples = 40 - # # chain state self.iter = 1 self.burned_in = False - # whether phase correction has been performed - self.phase_correction_ready = False - # # breakpoint storage @@ -89,17 +76,6 @@ def __init__(self, P, # list of all breakpoints at nth iteration self.breakpoint_list = [] - # - # misphase interval storage - - # candidate intervals that were misphased - self.B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int) - - # current state of interval assignments (relative to B_ct) - self.F = MySortedList() - - # state of interval assignments at nth iteration - self.phase_interval_list = [] # # cumsum arrays for each segment @@ -141,25 +117,14 @@ def _Piloc(self, st, en, col_idx, incl_idx = None): def run(self): while self.iter < self.n_iter: - # perform a split, combine, phase correct, or prune operation - op = np.random.choice(4) + # perform a split or combine + op = np.random.choice(2) if op == 0: if self.combine(np.random.choice(self.breakpoints[:-1]), force = False) == -1: continue elif op == 1: if self.split(b_idx = np.random.choice(len(self.breakpoints))) == -1: continue - elif op == 2: - if self.phase_correct and self.phase_correction_ready: - self.rephase() - else: - continue - elif op == 3: - continue - if np.random.rand() < 0.01: - self.prune() - else: - continue # if we're only running up to burnin, bail if self.quit_after_burnin and self.burned_in: @@ -172,27 +137,11 @@ def run(self): ) + colorama.Fore.RESET) return self - # correct phases after some post-burnin iterations - if not self.phase_correction_ready and self.phase_correct and \ - self.burned_in and len(self.breakpoint_list) >= 2*self.n_phase_correct_samples: - self.correct_phases() - self.phase_correction_ready = True - - # breakpoint/prune lists are liable to change after phase correction, so clear them - self.breakpoint_list = [] - self.include = [] - - # save set of breakpoints, phase intervals, and prune states if burned in - if self.burned_in and not self.iter % 100: - self.breakpoint_list.append(self.breakpoints.copy()) - self.include.append(self.P["include"].copy()) - if self.phase_correction_ready: - self.phase_interval_list.append(self.F.copy()) # print status if not self.iter % 100: if self.burned_in: - color = colorama.Fore.MAGENTA if not self.phase_correction_ready else colorama.Fore.RESET + color = colorama.Fore.RESET else: color = colorama.Fore.YELLOW print("{color}[{st},{en}]\t{n}/{tot}\tn_bp = {n_bp}\tlik = {lik}".format( @@ -276,294 +225,6 @@ def combine(self, st = None, b_idx = None, force = True): return mid - def flip_hap(self, st, en): - """ - Flips the SNPs from st to en - """ - - x = self.P.iloc[st:en, self.maj_idx].copy() - self.P.iloc[st:en, self.maj_idx] = self.P.iloc[st:en, self.min_idx] - self.P.iloc[st:en, self.min_idx] = x - - def prob_misphase(self, bdy1, bdy2): - """ - Compute probability of misphase - """ - # TODO: change invocation to st, mid, en -- we don't need to correct - # phasing of noncontiguous segments - - # prior on misphasing probability - p_mis = self.misphase_prior if np.isnan(self.P.loc[bdy1[1] - 1, "misphase_prob"]) else self.P.loc[bdy1[1] - 1, "misphase_prob"] - if p_mis == 0: - return -np.inf, 0 - - # haps = x/y, segs = 1/2, beta params. = A/B - - # seg 1 - rng_idx = (self.P.index >= bdy1[0]) & (self.P.index < bdy1[1]) - - idx = rng_idx & self.P["aidx"] & self.P["include"] - x1_A = self.P.loc[idx, "ALT_COUNT"].sum() - x1_B = self.P.loc[idx, "REF_COUNT"].sum() - - idx = rng_idx & ~self.P["aidx"] & self.P["include"] - y1_A = self.P.loc[idx, "ALT_COUNT"].sum() - y1_B = self.P.loc[idx, "REF_COUNT"].sum() - - # seg 2 - rng_idx = (self.P.index >= bdy2[0]) & (self.P.index < bdy2[1]) - - idx = rng_idx & self.P["aidx"] & self.P["include"] - x2_A = self.P.loc[idx, "ALT_COUNT"].sum() - x2_B = self.P.loc[idx, "REF_COUNT"].sum() - - idx = rng_idx & ~self.P["aidx"] & self.P["include"] - y2_A = self.P.loc[idx, "ALT_COUNT"].sum() - y2_B = self.P.loc[idx, "REF_COUNT"].sum() - - lik_mis = ss.betaln(x1_A + y1_B + y2_A + x2_B + 1, y1_A + x1_B + x2_A + y2_B + 1) - lik_nomis = ss.betaln(x1_A + y1_B + x2_A + y2_B + 1, y1_A + x1_B + y2_A + x2_B + 1) - - # logsumexp - m = np.maximum(lik_mis, lik_nomis) - denom = m + np.log(np.exp(lik_mis - m)*p_mis + np.exp(lik_nomis - m)*(1 - p_mis)) - - return lik_mis + np.log(p_mis) - denom, lik_nomis + np.log(1 - p_mis) - denom - - def correct_phases(self): - """ - Compute potentially misphased intervals, given some segmentation samples - """ - if not self.burned_in or len(self.breakpoint_list) == 0: - raise RuntimeError("Breakpoint sample list must be populated (chain must be burned in)") - - #A_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int) - #B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int) - - for bp_idx in np.random.choice(len(self.breakpoint_list), self.n_phase_correct_samples, replace = False): - bpl = np.array(self.breakpoint_list[bp_idx]); bpl = np.c_[bpl[:-1], bpl[1:]] - - p_mis = np.full(len(bpl) - 1, np.nan) - p_A = np.full(len(bpl) - 1, np.nan) - p_B = np.full(len(bpl) - 1, np.nan) - - V = np.full([len(bpl) - 1, 2], np.nan) - B = np.zeros([len(bpl) - 1, 2], dtype = np.uint8) - - for i, (st, mid, _, en) in enumerate(np.c_[bpl[:-1], bpl[1:]]): - p_mis, p_nomis = self.prob_misphase([st, mid], [mid, en]) - - # TODO: memoize partial sums - - # prob. that left segment is on hap. A - p_A1 = s.beta.logsf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1) - # prob. that right segment is on hap. A - p_A2 = s.beta.logsf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1) - - # prob. that left segment is on hap. B - p_B1 = s.beta.logcdf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1) - # prob. that right segment is on hap. B - p_B2 = s.beta.logcdf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1) - - if i == 0: - V[i, :] = [p_A1, p_B1] - continue - - p_AB = p_mis + p_A1 + p_B2 - p_BA = p_mis + p_B1 + p_A2 - p_AA = p_nomis + p_A1 + p_A2 - p_BB = p_nomis + p_B1 + p_B2 - - V[i, 0] = np.max(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]]) - V[i, 1] = np.max(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]]) - - B[i, 0] = np.argmax(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]]) - B[i, 1] = np.argmax(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]]) - - # backtrace - BT = np.full(len(B), -1, dtype = np.uint8) - ix = np.argmax(V[-1]) - BT[-1] = ix - for i, b in reversed(list(enumerate(B[:-1]))): - ix = b[ix] - BT[i] = ix - - # join contiguous segments assigned to hap. B - d = np.diff(BT, append = 0, prepend = 0) - ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1] - b_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]] - -# # join contiguous segments assigned to hap. A -# d = np.diff(1 - BT, append = 0, prepend = 0) -# ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1] -# a_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]] - - # plot - #for x in np.flatnonzero(BT): - # plt.plot(self.P.loc[bpl[x], "pos"], np.r_[j + 1, j + 1]*0.01) - - # record - for x in b_segs_j: - self.B_ct[x[0], x[1]] += 1 -# for x in a_segs_j: -# A_ct[x[0], x[1]] += 1 - -# # plot -# for k, v in B_ct.items(): -# for _ in range(0, v): -# plt.plot(self.P.iloc[np.r_[k], self.P.columns.get_loc("pos")], 0.2*np.random.rand()*np.r_[1, 1]) - - # MCMC iteration that corrects a phase - def rephase(self): # TODO: add parameters to force an interval? - # TODO: prerequisite checks; has correct_phases() been run? - choice = list(self.B_ct.keys()) - probs = np.r_[list(self.B_ct.values())] - - # - # propose an interval to flip from B->A - st, en = choice[np.random.choice(np.r_[0:len(choice)], p = probs/probs.sum())] - - # - # check if this overlaps any other regions that were already flipped B->A. - - # any previously flipped regions contained within will be left alone - - # return range of flipped region array that [st, en) overlaps - # TODO: rename this; f_o is a terrible name - def f_o(st = st, en = en): - st_idx = self.F.bisect_left(st + 1); st_idx -= st_idx % 2 - en_idx = self.F.bisect_right(en - 1); en_idx += en_idx % 2 - return slice(st_idx, en_idx) - - overlaps = np.array(self.F[f_o()]).reshape(-1, 2) - o_S = sc.SortedSet({st, en}) - for o in overlaps: - o_S.add(o[0]) - o_S.add(o[1]) - - # somewhere we ought to assert that the length of self.F is even - - # get list of regions to flip - flip_candidates = np.r_[o_S] # all possible regions to flip - flip_idx = np.zeros(len(flip_candidates) - 1, dtype = np.bool) # index of regions that haven't been flipped yet - A_flag = True # whether st:en consists entirely of regions that were flipped to A - for i, (st_seg, en_seg) in enumerate(np.c_[flip_candidates[:-1], flip_candidates[1:]]): - # this region was not already flipped B->A - if not self.F[f_o(st_seg, en_seg)]: - flip_idx[i] = True - A_flag = False - - flips = np.c_[flip_candidates[:-1], flip_candidates[1:]][flip_idx, :] - - # - # get full range of CNV breakpoints this region spans - st_reg = self.breakpoints.bisect_left(o_S[0]) - en_reg = self.breakpoints.bisect_right(o_S[-1]) - breakpoints0 = sc.SortedSet(self.breakpoints[(st_reg - 1):(en_reg + 1)]) - - # - # get initial marginal likelihood of this configuration - ML_orig = 0 - for b in breakpoints0[:-1]: - ML_orig += self.seg_marg_liks[b] - - # - # perform flips; update breakpoint list accordingly - for st_seg, en_seg in flips: - # if flip boundary corresponds to an extant breakpoint, remove it - # (we will propose joining these segments after flip) - if st_seg in breakpoints0: - breakpoints0 -= {st_seg} - # otherwise, add the flip boundary as a new breakpoint - # (we will propose introducing a new segment after flip) - else: - breakpoints0.add(st_seg) - if en_seg in breakpoints0: - breakpoints0 -= {en_seg} - else: - breakpoints0.add(en_seg) - - self.flip_hap(st_seg, en_seg) - - # - # if st:en is entirely assigned to A, try to flip it back to B (i.e. it was a false flip) - if A_flag: - if flip_candidates[0] in breakpoints0: - breakpoints0 -= {flip_candidates[0]} - else: - breakpoints0.add(flip_candidates[0]) - if en_reg in breakpoints0: - breakpoints0 -= {flip_candidates[-1]} - else: - breakpoints0.add(flip_candidates[-1]) - - self.flip_hap(flip_candidates[0], flip_candidates[-1]) - - # - # get marginal likelihood post-flip and breakpoint adjustment - bps = np.r_[breakpoints0] - ML = 0 - for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]: - ML += ss.betaln( - self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1, - self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1 - ) - - # - # probabilistically accept new configuration - if np.log(np.random.rand()) < np.minimum(0, ML - ML_orig): - # - # update F array - - # we could have either flipped a region from B->A ... - if not A_flag: - for st_seg, en_seg in flips: - self.F.update([st_seg, en_seg]) - - # ... or reverted a flip - else: - for p in self.F[f_o(flip_candidates[0], flip_candidates[-1])]: - self.F.remove(p) - - # - # combine contiguous intervals in F array - # TODO - - # - # update breakpoint list and seg. marg. liks - bps_to_del = list(self.breakpoints.islice( - self.breakpoints.bisect_left(breakpoints0[0]), - self.breakpoints.bisect_right(breakpoints0[-1]) - )) - for x in bps_to_del: - self.breakpoints.remove(x) - self.breakpoints.update(breakpoints0) - - # - # update seg. marg. liks - # TODO: recomputing each sum (even if in the future we use memoization) - # is wasteful. intelligently pick which seg_marg_liks keys to update. - for x in bps_to_del[:-1]: - self.seg_marg_liks.__delitem__(x) - for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]: - self.seg_marg_liks[st_bp] = ss.betaln( - self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1, - self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1 - ) - - self.marg_lik[self.iter] = self.marg_lik[self.iter - 1] - ML_orig + ML - - # - # revert - else: - # flip each region back - for st_seg, en_seg in flips: - self.flip_hap(st_seg, en_seg) - if A_flag: - self.flip_hap(flip_candidates[0], flip_candidates[-1]) - - self.marg_lik[self.iter] = self.marg_lik[self.iter - 1] - def compute_all_cumsums(self): bpl = np.array(self.breakpoints); bpl = np.c_[bpl[0:-1], bpl[1:]] for st, en in bpl: From 287b0a8d628afe61982c3fe7fe06dd39f450e642 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 21:56:54 -0400 Subject: [PATCH 076/222] Save only MLE breakpoint --- hapaseg/allelic_MCMC.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 8f7b983..de7f765 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -76,6 +76,8 @@ def __init__(self, P, # list of all breakpoints at nth iteration self.breakpoint_list = [] + # MLE breakpoint + self.breakpoints_MLE = None # # cumsum arrays for each segment @@ -137,6 +139,10 @@ def run(self): ) + colorama.Fore.RESET) return self + # save MLE breakpoint if we've burned in + if self.burned_in: + if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]: + self.breakpoints_MLE = self.breakpoints.copy() # print status if not self.iter % 100: From a14b8d9791e6671d5c978d459cc6ba2554d329e7 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 21:57:04 -0400 Subject: [PATCH 077/222] Make burnin criterion more stringent --- hapaseg/allelic_MCMC.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index de7f765..28d1dd2 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -162,8 +162,8 @@ def run(self): # check if we've burned in # TODO: use a faster method of computing rolling average - if not self.burned_in and self.iter > 500: - if np.diff(self.marg_lik[(self.iter - 500):self.iter]).mean() < 0: + if not self.burned_in and self.iter > 1000: + if np.diff(self.marg_lik[(self.iter - 1000):self.iter]).mean() < 0: self.burned_in = True self.iter += 1 From 56afcd8edb938d06be4181073dd5cd1aecb8281f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 22:11:02 -0400 Subject: [PATCH 078/222] Remove more cruft --- hapaseg/allelic_MCMC.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 28d1dd2..eef040c 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -88,9 +88,6 @@ def __init__(self, P, self.cs_MAJ = sc.SortedDict() self.cs_MIN = sc.SortedDict() - # probability of picking a breakpoint - self.split_prob = sc.SortedDict() - # # marginal likelihoods @@ -231,11 +228,6 @@ def combine(self, st = None, b_idx = None, force = True): return mid - def compute_all_cumsums(self): - bpl = np.array(self.breakpoints); bpl = np.c_[bpl[0:-1], bpl[1:]] - for st, en in bpl: - self.cs_MAJ[st], self.cs_MIN[st], self.split_prob[st] = self.compute_cumsum(st, en) - def compute_cumsum(self, st, en): # major cs_MAJ = np.zeros(en - st, dtype = np.int) From 73430e35787ddc9bcc440aca9a5ea52c08b8d513 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 22:11:19 -0400 Subject: [PATCH 079/222] Add beta hyperparameter to A_MCMC --- hapaseg/allelic_MCMC.py | 60 +++++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index eef040c..5505518 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -91,13 +91,15 @@ def __init__(self, P, # # marginal likelihoods + self.betahyp = 1 + # log marginal likelihoods for each segment # initialize with each SNP comprising its own segment. self.seg_marg_liks = sc.SortedDict(zip( range(0, len(self.P)), ss.betaln( - self.P.iloc[0:len(self.P), self.min_idx] + 1, - self.P.iloc[0:len(self.P), self.maj_idx] + 1 + self.P.iloc[0:len(self.P), self.min_idx] + 1 + self.betahyp, + self.P.iloc[0:len(self.P), self.maj_idx] + 1 + self.betahyp ) )) @@ -197,8 +199,8 @@ def combine(self, st = None, b_idx = None, force = True): ML_split = self.seg_marg_liks[st] + self.seg_marg_liks[mid] ML_join = ss.betaln( - self._Piloc(st, en, self.min_idx).sum() + 1, - self._Piloc(st, en, self.maj_idx).sum() + 1 + self._Piloc(st, en, self.min_idx).sum() + 1 + self.betahyp, + self._Piloc(st, en, self.maj_idx).sum() + 1 + self.betahyp ) # proposal dist. ratio @@ -241,7 +243,7 @@ def compute_cumsum(self, st, en): cs_MIN[i - st] = cs_MIN[i - st - 1] + (self.P.iat[i, self.min_idx] if self.P.iat[i, self.P.columns.get_loc("include")] else 0) # marginal likelihoods - ml = ss.betaln(cs_MAJ + 1, cs_MIN + 1) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1, cs_MIN[-1] - cs_MIN + 1) + ml = ss.betaln(cs_MAJ + 1 + self.betahyp, cs_MIN + 1 + self.betahyp) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1 + self.betahyp, cs_MIN[-1] - cs_MIN + 1 + self.betahyp) # prior # TODO: allow user to specify @@ -290,12 +292,12 @@ def split(self, st = None, b_idx = None): # M-H acceptance seg_lik_1 = ss.betaln( - self._Piloc(st, mid, self.min_idx).sum() + 1, - self._Piloc(st, mid, self.maj_idx).sum() + 1 + self._Piloc(st, mid, self.min_idx).sum() + 1 + self.betahyp, + self._Piloc(st, mid, self.maj_idx).sum() + 1 + self.betahyp ) seg_lik_2 = ss.betaln( - self._Piloc(mid, en, self.min_idx).sum() + 1, - self._Piloc(mid, en, self.maj_idx).sum() + 1 + self._Piloc(mid, en, self.min_idx).sum() + 1 + self.betahyp, + self._Piloc(mid, en, self.maj_idx).sum() + 1 + self.betahyp ) ML_split = seg_lik_1 + seg_lik_2 @@ -350,21 +352,21 @@ def prune(self): # q_i = seg(A - A_i, B - B_i) + garbage(A_i, B_i) + (1 - include prior_i) # - (seg(A, B) + (include prior_i)) r_exc = ss.betaln( - A_inc_s - I["MIN_COUNT"] + 1, - B_inc_s - I["MAJ_COUNT"] + 1 - ) + ss.betaln(I["MIN_COUNT"] + 1, I["MAJ_COUNT"] + 1) \ + A_inc_s - I["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s - I["MAJ_COUNT"] + 1 + self.betahyp + ) + ss.betaln(I["MIN_COUNT"] + 1 + self.betahyp, I["MAJ_COUNT"] + 1 + self.betahyp) \ + np.log(1 - I["include_prior"]) \ - - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + np.log(I["include_prior"])) + - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + np.log(I["include_prior"])) # 2. probability to include SNPs (that were previously excluded) # q_i = seg(A + A_i, B + B_i) + (include prior_i) # - (seg(A, B) + garbage(A_i, B_i) + (1 - include prior_i)) r_inc = ss.betaln( - A_inc_s + E["MIN_COUNT"] + 1, - B_inc_s + E["MAJ_COUNT"] + 1 + A_inc_s + E["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s + E["MAJ_COUNT"] + 1 + self.betahyp ) + np.log(E["include_prior"]) \ - - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + \ - ss.betaln(E["MIN_COUNT"] + 1, E["MAJ_COUNT"] + 1) + \ + - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + \ + ss.betaln(E["MIN_COUNT"] + 1 + self.betahyp, E["MAJ_COUNT"] + 1 + self.betahyp) + \ np.log(1 - E["include_prior"])) r_cat = pd.concat([r_inc, r_exc]).sort_index() @@ -398,18 +400,18 @@ def prune(self): # regardless, code for computing q_star is the same r_exc_star = ss.betaln( - A_inc_s_star - I_star["MIN_COUNT"] + 1, - B_inc_s_star - I_star["MAJ_COUNT"] + 1 - ) + ss.betaln(I_star["MIN_COUNT"] + 1, I_star["MAJ_COUNT"] + 1) \ + A_inc_s_star - I_star["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s_star - I_star["MAJ_COUNT"] + 1 + self.betahyp + ) + ss.betaln(I_star["MIN_COUNT"] + 1 + self.betahyp, I_star["MAJ_COUNT"] + 1 + self.betahyp) \ + np.log(1 - I_star["include_prior"]) \ - - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + np.log(I_star["include_prior"])) + - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + np.log(I_star["include_prior"])) r_inc_star = ss.betaln( - A_inc_s_star + E_star["MIN_COUNT"] + 1, - B_inc_s_star + E_star["MAJ_COUNT"] + 1 + A_inc_s_star + E_star["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s_star + E_star["MAJ_COUNT"] + 1 + self.betahyp ) + np.log(E_star["include_prior"]) \ - - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + \ - ss.betaln(E_star["MIN_COUNT"] + 1, E_star["MAJ_COUNT"] + 1) + \ + - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + \ + ss.betaln(E_star["MIN_COUNT"] + 1 + self.betahyp, E_star["MAJ_COUNT"] + 1 + self.betahyp) + \ np.log(1 - E_star["include_prior"])) r_cat_star = pd.concat([r_inc_star, r_exc_star]).sort_index() @@ -430,8 +432,8 @@ def prune(self): self.marg_lik[self.iter] -= self.seg_marg_liks[st] self.seg_marg_liks[st] = ss.betaln( - T.loc[T["include"], "MIN_COUNT"].sum() + 1, - T.loc[T["include"], "MAJ_COUNT"].sum() + 1, + T.loc[T["include"], "MIN_COUNT"].sum() + 1 + self.betahyp, + T.loc[T["include"], "MAJ_COUNT"].sum() + 1 + self.betahyp, ) self.marg_lik[self.iter] += self.seg_marg_liks[st] @@ -439,8 +441,8 @@ def prune(self): # effectively their own segments) self.marg_lik[self.iter] += (1 if ~self.P.at[choice_idx, "include"] else -1)* \ ss.betaln( - self.P.at[choice_idx, "MIN_COUNT"] + 1, - self.P.at[choice_idx, "MAJ_COUNT"] + 1 + self.P.at[choice_idx, "MIN_COUNT"] + 1 + self.betahyp, + self.P.at[choice_idx, "MAJ_COUNT"] + 1 + self.betahyp ) # TODO: update segment partial sums (when we actually use these) From a2a7c97961bd0b7ae26a8baf91a711f0ee0d085c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 22:48:54 -0400 Subject: [PATCH 080/222] Set segmentation betahyp based on coverage --- hapaseg/allelic_MCMC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 5505518..4da203b 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -91,7 +91,7 @@ def __init__(self, P, # # marginal likelihoods - self.betahyp = 1 + self.betahyp = (self.P["REF_COUNT"] + self.P["ALT_COUNT"]).mean()/4 # log marginal likelihoods for each segment # initialize with each SNP comprising its own segment. From 14c736cfae53405c4753f669a39c5a230f203b6f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 13 Apr 2022 22:49:12 -0400 Subject: [PATCH 081/222] Simplify initial segmentation visualization --- hapaseg/allelic_MCMC.py | 59 ++++++++++------------------------------- 1 file changed, 14 insertions(+), 45 deletions(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 4da203b..7f9bbff 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -473,20 +473,16 @@ def incr_bp_counter(self, st, en, mid = None): self.breakpoint_counter[(mid + 1):en] += np.r_[0, 1] def visualize(self, show_CIs = False): - Ph = self.P.copy() - CI = s.beta.ppf([0.05, 0.5, 0.95], Ph["MIN_COUNT"][:, None] + 1, Ph["MAJ_COUNT"][:, None] + 1) - Ph[["CI_lo_hap", "median_hap", "CI_hi_hap"]] = CI - - plt.figure(); plt.clf() + plt.figure(figsize = [16, 4]); plt.clf() ax = plt.gca() # SNPs - ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4) + ax.scatter(self.P["pos"], self.P["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)], alpha = 0.5, s = 4, marker = '.') if show_CIs: - ax.errorbar(Ph["pos"], y = Ph["median_hap"], yerr = np.c_[Ph["median_hap"] - Ph["CI_lo_hap"], Ph["CI_hi_hap"] - Ph["median_hap"]].T, fmt = 'none', alpha = 0.5, color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)]) + ax.errorbar(self.P["pos"], y = self.P["median_hap"], yerr = np.c_[self.P["median_hap"] - self.P["CI_lo_hap"], self.P["CI_hi_hap"] - self.P["median_hap"]].T, fmt = 'none', alpha = 0.1, ecolor = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)]) # mask excluded SNPs - ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values) + # ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values) # breakpoints # bp_prob = self.breakpoint_counter[:, 0]/self.breakpoint_counter[:, 1] @@ -501,47 +497,20 @@ def visualize(self, show_CIs = False): # ax2.set_xlim(ax.get_xlim()); # ax2.set_xlabel("Breakpoint number in current MCMC iteration") - # beta CI's weighted by breakpoints - # flip current rephases back to baseline - for st, en in self.F.intervals(): - # code excised from flip_hap - x = Ph.iloc[st:en, self.maj_idx].copy() - Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx] - Ph.iloc[st:en, self.min_idx] = x - - pos_col = Ph.columns.get_loc("pos") - for bp_samp, pi_samp, inc_samp in itertools.zip_longest(self.breakpoint_list, self.phase_interval_list, self.include): - # flip everything according to sample - # if we did not perform phase correction, pi_samp will be none (hence - # the use of zip_longest above) - if pi_samp is not None: - for st, en in pi_samp.intervals(): - # TODO: can replace with flip_hap()? - x = Ph.iloc[st:en, self.maj_idx].copy() - Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx] - Ph.iloc[st:en, self.min_idx] = x - - # SNPs TODO: plot only those that flipped, in a diff. color? - #ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4) - - bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]] - for st, en in bpl: - Phi = Ph.iloc[st:en]; Phi = Phi.loc[inc_samp] - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], Phi.iloc[:, self.min_idx].sum() + 1, Phi.iloc[:, self.maj_idx].sum() + 1) - ax.add_patch(mpl.patches.Rectangle((Ph.iloc[st, pos_col], ci_lo), Ph.iloc[en, pos_col] - Ph.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'k', alpha = 1/len(self.breakpoint_list), zorder = 1000)) - - # flip everything back - if pi_samp is not None: - for st, en in pi_samp.intervals(): - # TODO: can replace with flip_hap()? - x = Ph.iloc[st:en, self.maj_idx].copy() - Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx] - Ph.iloc[st:en, self.min_idx] = x + bpl = self.breakpoints if self.breakpoints_MLE is None else self.breakpoints_MLE + bpl = np.array(bpl); bpl = np.c_[bpl[0:-1], bpl[1:]] + + pos_col = self.P.columns.get_loc("pos") + for st, en in bpl: + ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], self.P.iloc[st:en, self.maj_idx].sum() + 1, self.P.iloc[st:en, self.min_idx].sum() + 1) + ax.add_patch(mpl.patches.Rectangle((self.P.iloc[st, pos_col], ci_lo), self.P.iloc[en, pos_col] - self.P.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'lime', alpha = 0.4, zorder = 1000)) # 50:50 line ax.axhline(0.5, color = 'k', linestyle = ":") ax.set_xticks(np.linspace(*plt.xlim(), 20)); - ax.set_xticklabels(Ph["pos"].searchsorted(np.linspace(*plt.xlim(), 20))); + ax.set_xticklabels(self.P["pos"].searchsorted(np.linspace(*plt.xlim(), 20))); ax.set_xlabel("SNP index") ax.set_ylim([0, 1]) + + plt.tight_layout() From 8ad2500c3eb822b53461b2c7fd80139d5230328e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 14:36:50 -0400 Subject: [PATCH 082/222] Treat p(clust,phase) jointly, not p(clust|phase) --- hapaseg/allelic_DP.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 35055ef..886beba 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1151,6 +1151,9 @@ def run(self, n_iter = 0, n_samps = 0): # probability of opening a new cluster log_count_prior[0] = ss.gammaln(M) + np.log(self.alpha) + ss.gammaln(N + self.alpha - M) - ss.gammaln(N + self.alpha) + # p(phase|X) + log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) + # # adjacent segment likelihood @@ -1162,23 +1165,18 @@ def run(self, n_iter = 0, n_samps = 0): log_adj_lik = self.compute_adj_prob(break_idx[0]) #seg_touch_idx[seg_idx] = True - # p(X|clust,phase)p(X|seg,phase)p(clust) + # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase) num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) + log_adj_lik # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B}) - + log_count_prior) # p(clust) (DP prior on clust counts) + + log_count_prior # p(clust) (DP prior on clust counts) + + log_phase_prob) # p(phase) num /= self.temperature # scale by temperature for replica-exchange - num -= num.max(0) # avoid underflow in sum-exp - - # p(clust|X,phase) - log_clust_post = num - np.log(np.exp(num).sum(0)) - - # p(phase|X) - log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) + num -= num.max() # avoid underflow in sum-exp - # p(clust,phase|X) = p(clust|X,phase)p(phase|X) - choice_p = np.exp(log_clust_post + log_phase_prob) + # p(clust,phase|X) + choice_p = np.exp(num - np.log(np.exp(num).sum())) # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( From 296b5f0efaa07bceaf7e9e3d947c0d39283309c8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 15:04:44 -0400 Subject: [PATCH 083/222] Exclude chimeric reads by default --- wolF/workflow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index da9b3bb..ebb9a33 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -189,7 +189,9 @@ def interval_gather(interval_files): refFastaIdx = localization_task["ref_fasta_idx"], refFastaDict = localization_task["ref_fasta_dict"], - intervals = split_intervals_task["interval_files"] + intervals = split_intervals_task["interval_files"], + + exclude_chimeric = True )) hp_scatter = het_pulldown.get_het_coverage_from_callstats( From 2a81227ae72a505bc60a4642463f8535c10472ba Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 15:05:24 -0400 Subject: [PATCH 084/222] Only need to return dataframe, since we are using MLE from initial segmentation --- wolF/workflow.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index ebb9a33..6ae5505 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -351,7 +351,7 @@ def get_chunks(scatter_chunks): ) # concat arm level results - @prefect.task(nout = 2) + @prefect.task def concat_arm_level_results(arm_results): A = [] for arm_file in arm_results: @@ -366,12 +366,9 @@ def concat_arm_level_results(arm_results): _, tmpfile = tempfile.mkstemp( ) A.to_pickle(tmpfile) - # get number of MCMC samples - n_samps = int(np.minimum(np.inf, A.loc[~A["results"].isna(), "results"].apply(lambda x : len(x.breakpoint_list))).min()) - - return tmpfile, list(range(0, n_samps)) + return tmpfile - arm_concat, n_samps_range = concat_arm_level_results(hapaseg_arm_AMCMC_task["arm_level_MCMC"]) + arm_concat = concat_arm_level_results(hapaseg_arm_AMCMC_task["arm_level_MCMC"]) ## run DP From cf97fa76d5ddbfbe266e6426486fc4ea178ad7c0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 15:11:09 -0400 Subject: [PATCH 085/222] Remove cruft --- hapaseg/allelic_DP.py | 110 ------------------------------------------ 1 file changed, 110 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 886beba..fe13120 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -686,116 +686,7 @@ def compute_overall_lik_simple(self): # p({c_k}, {s}, {phase_i} | {a_i, b_i}) return np.r_[clust_lik, phase_lik, count_prior, seg_lik] - # {{{ - def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None, debug = False): - if segs_to_clusters is None: - su, segs_to_clusters = self.get_unique_clust_idxs() - else: - su, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters) - if phase_orientations is None: - phase_orientations = np.r_[self.phase_orientations] - - # account for unassigned clusters - min_clust_idx = 1 if (su == -1).any() else 0 - - max_clust_idx = segs_to_clusters.max() + 1 - - liks = np.full([segs_to_clusters.shape[0], 2], np.nan) - - for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)): - ## overall clustering likelihood - clust_lik = np.r_[[ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_sums.items() if k >= 0]].sum() - - A1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) - A2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx) - - B1 = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) - B2 = npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx) - - # print(A1[1:].sum(), B1[1:].sum(), A2[1:].sum(), B2[1:].sum()) - - count_prior = np.bincount(cl_samp, minlength = max_clust_idx).astype(np.double)[min_clust_idx:] - count_prior /= count_prior.sum() - - #breakpoint() - - clust_lik = ((ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1))[min_clust_idx:] + np.log(count_prior)).sum() - # account for unassigned clusters, if present - if min_clust_idx == 1: - clust_lik += ss.betaln(self.S.loc[cl_samp == 0, "maj"] + 1, self.S.loc[cl_samp == 0, "min"] + 1).sum() - - if debug: - breakpoint() - - ## segmentation likelihood - - seg_lik = np.nan -# if min_clust_idx == 0: -# # get segment boundaries -# bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1]) -# bdy = np.c_[bdy[:-1], bdy[1:]] -# -# # sum log-likelihoods of each segment -# seg_lik = 0 -# for st, en in bdy: -# A1 = self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum() -# A2 = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() -# B1 = self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum() -# B2 = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() -# -# seg_lik += ss.betaln(A1 + 1, B1 + 1) + ss.betaln(A2 + 1, B2 + 1) -# else: -# seg_lik = np.nan - - liks[i, :] = np.r_[clust_lik, seg_lik] - - return liks -# }}} - def run(self, n_iter = 0, n_samps = 0): - # - # assign segments to likeliest prior component {{{ - - if len(self.clust_prior) > 1: - for seg_idx in range(len(self.S)): - seg_idx = np.r_[seg_idx] - - # compute probability that segment belongs to each cluster prior element - S_a = self._Siat_ph(seg_idx[0], min = True) - S_b = self._Siat_ph(seg_idx[0], min = False) - P_a = self.clust_prior_mat[1:, 0] - P_b = self.clust_prior_mat[1:, 1] - - # prior likelihood ratios for both phase orientations - P_l = np.c_[ - ss.betaln(S_a + P_a + 1, S_b + P_b + 1) - (ss.betaln(S_a + 1, S_b + 1) + ss.betaln(P_a + 1, P_b + 1)), - ss.betaln(S_b + P_a + 1, S_a + P_b + 1) - (ss.betaln(S_b + 1, S_a + 1) + ss.betaln(P_a + 1, P_b + 1)), - ] - - # get count prior - ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1]] - - # posterior numerator - num = P_l + np.log(ccp) - num -= num.max() - - # probabilistically choose a cluster - probs = np.exp(num)/np.exp(num).sum() - idx = np.tile(np.r_[self.clust_prior.keys()][1:], [2, 1]).T*[1, -1] - choice = np.random.choice( - idx.ravel(), - p = probs.ravel() - ) - - # rephase - if choice < 0: - self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] - choice = -choice - - self.S.iloc[seg_idx, self.clust_col] = choice - - # }}} - # # initialize cluster tracking hash tables self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore")) @@ -1163,7 +1054,6 @@ def run(self, n_iter = 0, n_samps = 0): log_adj_lik = 0 if not move_clust: # or (move_clust and np.random.rand() < 0.01): log_adj_lik = self.compute_adj_prob(break_idx[0]) - #seg_touch_idx[seg_idx] = True # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase) num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) From eb1e7eb06755ec4139a1b28b7966100e901a6994 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 16:04:04 -0400 Subject: [PATCH 086/222] Burnin check --- hapaseg/allelic_DP.py | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index fe13120..f6023dd 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -670,7 +670,8 @@ def compute_overall_lik_simple(self): ## overall phasing likelihood # p({phase_i} | {a_i, b_i}) - phase_lik = np.log1p(-np.r_[self.seg_phase_probs.values()]).sum() + phase_probs = np.r_[self.seg_phase_probs.values()] + phase_lik = np.log1p(phase_probs).sum() if not np.isnan(phase_probs).any() else np.nan ## Dirichlet count prior (Dirichlet-categorical marginal likelihood) # p({c_k}) @@ -752,6 +753,7 @@ def run(self, n_iter = 0, n_samps = 0): brk = 0 touch90 = False + likelihood_ready = False while True: if not n_it % 1000: @@ -771,30 +773,28 @@ def run(self, n_iter = 0, n_samps = 0): # if n_samps > 0 and len() > n_samps: # break - # poll every 100 iterations for burnin status + # poll every 100 iterations for various statuses if not n_it % 100: # have >90% of segments been touched? if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.9: touch90 = True - # have most segments been adjacency corrected? - # if so, has the overall likelihood stabilized enough that we're burned in? - if not burned_in: - pass - # 2. if >90% of segments have been adjacency corrected, check for burnin - # does the smoothed derivative of the posterior numerator go below zero? this would indicate that we've solidly reached an optimum - # TODO: make this check more efficient? -# if all_touched and (np.convolve(np.diff(self.lik_tmp), np.ones(50)/50, mode = "same") < 0).sum() > 2: -# pass -# burned_in = True -# n_it_last = n_it -# seg_touch_idx[:] = False - # start computing likelihoods if touch90: - print(self.compute_overall_lik_simple()) - print(self.compute_overall_lik_simple().sum()) - self.lik_trace.append(self.compute_overall_lik_simple()) + lik = self.compute_overall_lik_simple() + # phasing likelihood will be NaN until we've touched every singlesegment + if not np.isnan(lik).any(): + self.lik_trace.append(lik) + self.seg_track.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]}) + likelihood_ready = True + + # check if likelihood has stabilized enough to consider us "burned in" + if likelihood_ready and not burned_in and len(self.lik_trace) > 100: + lt = np.vstack(self.lik_trace).sum(1) + if (np.convolve(np.diff(lt), np.ones(50)/50, mode = "same") < 0).sum() > 2: + breakpoint() + burned_in = True + n_it_last = n_it # save cluster assignments and phase orientations once burned in if burned_in: From 5309979c5e654c2417aeadf3ec9edc0580743d73 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 16:16:18 -0400 Subject: [PATCH 087/222] Add contingency to AMCMC if burnin criterion is never met due to early convergence --- hapaseg/allelic_MCMC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 7f9bbff..95a349a 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -139,7 +139,7 @@ def run(self): return self # save MLE breakpoint if we've burned in - if self.burned_in: + if self.burned_in or self.iter >= self.n_iter - 100: # contingency in case we've converged on an optimum early and the chain hasn't moved at all if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]: self.breakpoints_MLE = self.breakpoints.copy() From f37451ef04323fcafd228d22331a142975f13c2f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 16:59:27 -0400 Subject: [PATCH 088/222] Save samples after DP burnin --- hapaseg/allelic_DP.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index f6023dd..b106d88 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -769,9 +769,9 @@ def run(self, n_iter = 0, n_samps = 0): if n_iter > 0 and n_it > n_iter: return -# # stop after a number of samples have been taken -# if n_samps > 0 and len() > n_samps: -# break + # stop after a number of samples have been taken + if n_samps > 0 and len(self.segs_to_clusters) > n_samps: + break # poll every 100 iterations for various statuses if not n_it % 100: @@ -792,7 +792,6 @@ def run(self, n_iter = 0, n_samps = 0): if likelihood_ready and not burned_in and len(self.lik_trace) > 100: lt = np.vstack(self.lik_trace).sum(1) if (np.convolve(np.diff(lt), np.ones(50)/50, mode = "same") < 0).sum() > 2: - breakpoint() burned_in = True n_it_last = n_it @@ -1172,9 +1171,8 @@ def run(self, n_iter = 0, n_samps = 0): else: self.clust_members_bps[choice] |= snp_idx - # track global state of cluster assignments - # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations - if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2): + # save a sample from the MCMC when >95% of segments have been touched since the last iteration + if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95: self.segs_to_clusters.append(self.S["clust"].copy()) self.phase_orientations.append(self.S["flipped"].copy()) n_it_last = n_it From 9ae68fc588c8b489de96b16431bf6361b2b73d29 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 14 Apr 2022 17:24:08 -0400 Subject: [PATCH 089/222] segs_to_clusters -> snps_to_clusters --- hapaseg/allelic_DP.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index b106d88..b796ef4 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -739,7 +739,7 @@ def run(self, n_iter = 0, n_samps = 0): self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints[:-1] }) # containers for saving the MCMC trace - self.segs_to_clusters = [] + self.snps_to_clusters = [] self.phase_orientations = [] burned_in = False @@ -770,7 +770,7 @@ def run(self, n_iter = 0, n_samps = 0): return # stop after a number of samples have been taken - if n_samps > 0 and len(self.segs_to_clusters) > n_samps: + if n_samps > 0 and len(self.snps_to_clusters) > n_samps: break # poll every 100 iterations for various statuses @@ -795,11 +795,6 @@ def run(self, n_iter = 0, n_samps = 0): burned_in = True n_it_last = n_it - # save cluster assignments and phase orientations once burned in - if burned_in: - self.segs_to_clusters.append(self.S["clust"].copy()) - self.phase_orientations.append(self.S["flipped"].copy()) - # # pick either a segment or a cluster at random (50:50 prob.) move_clust = False @@ -1173,13 +1168,13 @@ def run(self, n_iter = 0, n_samps = 0): # save a sample from the MCMC when >95% of segments have been touched since the last iteration if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95: - self.segs_to_clusters.append(self.S["clust"].copy()) + self.snps_to_clusters.append(self.S["clust"].copy()) self.phase_orientations.append(self.S["flipped"].copy()) n_it_last = n_it n_it += 1 - return np.r_[self.segs_to_clusters], np.r_[self.phase_orientations] + return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations] #_colors = mpl.cm.get_cmap("tab10").colors _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int) @@ -1191,11 +1186,11 @@ def run(self, n_iter = 0, n_samps = 0): # np.c_[0, 23, 204], # np.c_[75, 172, 227]]/255 - def get_unique_clust_idxs(self, segs_to_clusters = None): - if segs_to_clusters is None: - segs_to_clusters = np.r_[self.segs_to_clusters] - s2cu, s2cu_j = np.unique(segs_to_clusters, return_inverse = True) - return s2cu, s2cu_j.reshape(segs_to_clusters.shape) + def get_unique_clust_idxs(self, snps_to_clusters = None): + if snps_to_clusters is None: + snps_to_clusters = np.r_[self.snps_to_clusters] + s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True) + return s2cu, s2cu_j.reshape(snps_to_clusters.shape) def get_colors(self): s2cu, s2cu_j = self.get_unique_clust_idxs() @@ -1223,7 +1218,7 @@ def visualize_segs(self): colors = self.get_colors() s2cu, s2cu_j = self.get_unique_clust_idxs() - n_samp = len(self.segs_to_clusters) + n_samp = len(self.snps_to_clusters) for s2c, s2ph in zip(s2cu_j, self.phase_orientations): # rephase segments according to phase orientation sample @@ -1244,7 +1239,7 @@ def visualize_adjacent_segs(self, f = None, n_samp = None): colors = self.get_colors() s2cu, s2cu_j = self.get_unique_clust_idxs() - n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp + n_samp = len(self.snps_to_clusters) if n_samp is None else n_samp for s2c, s2ph in zip(s2cu_j, self.phase_orientations): # rephase segments according to phase orientation sample From b1c38f8ae2a68b3bd63b32dcc57c6ebc9d0eb020 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 15 Apr 2022 14:11:25 -0400 Subject: [PATCH 090/222] Don't count prior twice when opening new cluster --- hapaseg/allelic_DP.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index b796ef4..0225ed4 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -974,6 +974,7 @@ def run(self, n_iter = 0, n_samps = 0): #AB = ss.betaln(A_a + B_a + 1, A_b + B_b + 1) # C is likelihood of target cluster pre-join C = ss.betaln(C_ab[:, 0] + 1 + self.betahyp, C_ab[:, 1] + 1 + self.betahyp) + C[0] = 0 # don't count prior twice when opening a new cluster # A is likelihood cluster B is part of, minus B #A = ss.betaln(A_a + 1, A_b + 1) # B+C is likelihood of target cluster post-join, with both phase orientations From 977173945c57b1d7279aa471af386ec0dc0e8944 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sun, 17 Apr 2022 10:47:17 -0400 Subject: [PATCH 091/222] Overhaul plotting segs from DP iterations --- hapaseg/allelic_DP.py | 86 ++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 50 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 0225ed4..f176d95 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -746,6 +746,7 @@ def run(self, n_iter = 0, n_samps = 0): # likelihood trace self.lik_trace = [] + self.segment_trace = [] self.post = 0 n_it = 0 @@ -785,7 +786,6 @@ def run(self, n_iter = 0, n_samps = 0): # phasing likelihood will be NaN until we've touched every singlesegment if not np.isnan(lik).any(): self.lik_trace.append(lik) - self.seg_track.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]}) likelihood_ready = True # check if likelihood has stabilized enough to consider us "burned in" @@ -1171,6 +1171,7 @@ def run(self, n_iter = 0, n_samps = 0): if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95: self.snps_to_clusters.append(self.S["clust"].copy()) self.phase_orientations.append(self.S["flipped"].copy()) + self.segment_trace.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]}) n_it_last = n_it n_it += 1 @@ -1196,24 +1197,24 @@ def get_unique_clust_idxs(self, snps_to_clusters = None): def get_colors(self): s2cu, s2cu_j = self.get_unique_clust_idxs() - seg_terr = self.S["end_gp"] - self.S["start_gp"] - tot_terr = np.zeros(len(s2cu)) - for r in s2cu_j: - tot_terr += npg.aggregate(r, seg_terr, size = len(tot_terr)) + T = pd.DataFrame(np.c_[np.r_[self.breakpoints[:-2]], np.r_[self.breakpoints[1:-1]]], columns = ["snp_st", "snp_end"]) + T["gp_st"] = self.S.loc[T["snp_st"], "pos_gp"].values + T["gp_end"] = self.S.loc[T["snp_end"], "pos_gp"].values + T["terr"] = T["gp_end"] - T["gp_st"] + T["clust"] = self.S.loc[T["snp_st"], "clust"].values - si = np.argsort(tot_terr)[::-1] - terr_cs = np.cumsum(tot_terr[si])/tot_terr.sum() + clust_terr = T.groupby("clust")["terr"].sum().sort_values(ascending = False) - colors_to_use = np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())]) - colors = np.zeros([len(s2cu), 4]) - n_distinct = colors_to_use.shape[0] - colors[si[:n_distinct], :] = colors_to_use - colors[si[n_distinct:], :] = colors_to_use[:(len(si) - n_distinct), :] + # color any cluster larger than 10Mb (~0.003 of total genomic territory) + return np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (clust_terr/clust_terr.sum() >= 0.003).sum())]) - def visualize_segs(self): - plt.figure() + def visualize_snps(self, f = None): + pass + + def visualize_segs(self, f = None): + f = plt.figure(figsize = [16, 4]) if f is None else f ax = plt.gca() - ax.set_xlim([0, self.S["end_gp"].max()]) + ax.set_xlim([0, self.S["pos_gp"].max()]) ax.set_ylim([0, 1]) colors = self.get_colors() @@ -1221,46 +1222,31 @@ def visualize_segs(self): n_samp = len(self.snps_to_clusters) - for s2c, s2ph in zip(s2cu_j, self.phase_orientations): - # rephase segments according to phase orientation sample - S_ph = self.S.copy() - flip_idx = np.flatnonzero(s2ph != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] - - for i, r in enumerate(S_ph.itertuples()): - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], r.min + 1, r.maj + 1) - ax.add_patch(mpl.patches.Rectangle((r.start_gp, ci_lo), r.end_gp - r.start_gp, ci_hi - ci_lo, facecolor = colors[s2c[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000)) - - def visualize_adjacent_segs(self, f = None, n_samp = None): - plt.figure(num = f, figsize = [17.56, 5.67]) - ax = plt.gca() - ax.set_xlim([0, self.S["end_gp"].max()]) - ax.set_ylim([0, 1]) - - colors = self.get_colors() - s2cu, s2cu_j = self.get_unique_clust_idxs() + selff = copy.deepcopy(self) - n_samp = len(self.snps_to_clusters) if n_samp is None else n_samp + for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations): + # get uniqued clust indices for each segment start + seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())]) - for s2c, s2ph in zip(s2cu_j, self.phase_orientations): # rephase segments according to phase orientation sample - S_ph = self.S.copy() - flip_idx = np.flatnonzero(s2ph != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] + selff.S["flipped"] = s2ph - bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] - -# s2c_nz = s2c.copy() -# zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0) -# for z in zidx: -# s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]] -# bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1]) -# bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]] + seg_bdy = np.r_[list(seg2c.keys()), len(selff.S)] + seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] - for st, en in bdy: - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], S_ph.iloc[st:en, self.min_col].sum() + 1, S_ph.iloc[st:en, self.maj_col].sum() + 1) - ax.add_patch(mpl.patches.Rectangle((S_ph.iloc[st]["start_gp"], ci_lo), S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[s2c[st] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000)) + for i, (st, en) in enumerate(seg_bdy): + ci_lo, med, ci_hi = s.beta.ppf( + [0.05, 0.5, 0.95], + selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp, + selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp, + ) + ax.add_patch(mpl.patches.Rectangle( + (selff.S.iloc[st]["pos_gp"], ci_lo), + selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"], + np.maximum(0, ci_hi - ci_lo), + facecolor = colors[seg_cu[i] % len(colors)], + fill = True, alpha = 1/n_samp, zorder = 1000 + )) def visualize_clusts(self, f = None, n_samp = None, thick = False, nocolor = False): plt.figure(num = f, figsize = [17.56, 5.67]) From a42df536dc50b06838ce0969dea9ac4ec9fd73b2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 09:52:41 -0400 Subject: [PATCH 092/222] Overhaul clust viz. code --- hapaseg/allelic_DP.py | 87 ++++++++++++------------------------------- 1 file changed, 23 insertions(+), 64 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index f176d95..104aa19 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1211,7 +1211,7 @@ def get_colors(self): def visualize_snps(self, f = None): pass - def visualize_segs(self, f = None): + def visualize_segs(self, f = None, use_clust = False): f = plt.figure(figsize = [16, 4]) if f is None else f ax = plt.gca() ax.set_xlim([0, self.S["pos_gp"].max()]) @@ -1235,72 +1235,31 @@ def visualize_segs(self, f = None): seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] for i, (st, en) in enumerate(seg_bdy): - ci_lo, med, ci_hi = s.beta.ppf( - [0.05, 0.5, 0.95], - selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp, - selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp, - ) - ax.add_patch(mpl.patches.Rectangle( - (selff.S.iloc[st]["pos_gp"], ci_lo), + if use_clust: + ci_lo, med, ci_hi = s.beta.ppf( + [0.05, 0.5, 0.95], + selff.clust_sums[seg2c[st]][0] + 1 + self.betahyp, + selff.clust_sums[seg2c[st]][1] + 1 + self.betahyp, + ) + else: + ci_lo, med, ci_hi = s.beta.ppf( + [0.05, 0.5, 0.95], + selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp, + selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp, + ) + ax.add_patch(mpl.patches.Rectangle(( + selff.S.iloc[st]["pos_gp"], ci_lo), selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[seg_cu[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000 )) - - def visualize_clusts(self, f = None, n_samp = None, thick = False, nocolor = False): - plt.figure(num = f, figsize = [17.56, 5.67]) - ax = plt.gca() - ax.set_xlim([0, self.S["end_gp"].max()]) - ax.set_ylim([0, 1]) - - colors = self.get_colors() - s2cu, s2cu_j = self.get_unique_clust_idxs() - - n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp - - for s2c, s2ph in zip(s2cu_j, self.phase_orientations): - # rephase segments according to phase orientation sample - S_ph = self.S.copy() - flip_idx = np.flatnonzero(s2ph != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] - - # get overall cluster sums - clust_min = npg.aggregate(s2c, S_ph["min"]) - clust_maj = npg.aggregate(s2c, S_ph["maj"]) - CIs = s.beta.ppf([0.05, 0.5, 0.95], clust_min[:, None] + 1, clust_maj[:, None] + 1) - - # get boundaries of contiguous segments - bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] - -# s2c_nz = s2c.copy() -# zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0) -# for z in zidx: -# s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]] -# bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1]) -# bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]] - - for st, en in bdy: - if thick: - b = CIs[s2c[st], 1] - 0.01 - t = CIs[s2c[st], 1] + 0.01 - else: - color = colors[s2c[st] % len(colors)] - b = CIs[s2c[st], 0] - t = CIs[s2c[st], 2] - - if nocolor: - color = [0, 1, 0] - else: - color = colors[s2c[st] % len(colors)] - - ax.add_patch(mpl.patches.Rectangle( - xy = (S_ph.iloc[st]["start_gp"], b), - width = S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"], - height = t - b, - facecolor = color, - fill = True, - alpha = 1/n_samp, - zorder = 1000) + plt.scatter( + (selff.S.iloc[en - 1]["pos_gp"] + selff.S.iloc[st]["pos_gp"])/2, + med, + color = colors[seg_cu[i] % len(colors)], + marker = '.', s = 1, alpha = 1/n_samp ) + + def visualize_clusts(self, f = None): + self.visualize_segs(f = f, use_clust = True) From 6c2e149d86228a4a21d528fb7361782c90ec4f14 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 10:15:38 -0400 Subject: [PATCH 093/222] Visualize SNPs --- hapaseg/allelic_DP.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 104aa19..0a80903 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1208,10 +1208,7 @@ def get_colors(self): # color any cluster larger than 10Mb (~0.003 of total genomic territory) return np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (clust_terr/clust_terr.sum() >= 0.003).sum())]) - def visualize_snps(self, f = None): - pass - - def visualize_segs(self, f = None, use_clust = False): + def visualize_segs(self, f = None, use_clust = False, show_snps = False): f = plt.figure(figsize = [16, 4]) if f is None else f ax = plt.gca() ax.set_xlim([0, self.S["pos_gp"].max()]) @@ -1224,6 +1221,36 @@ def visualize_segs(self, f = None, use_clust = False): selff = copy.deepcopy(self) + if show_snps: + # set SNP alpha based on number of SNPs + logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M))) + default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.S)) + + ph_prob = np.r_[self.phase_orientations].mean(0) + + # only plot unambiguous SNPs once + uidx = np.flatnonzero((ph_prob == 0) | (ph_prob == 1)) + selff.S["flipped"] = ph_prob == 1 + ax.scatter( + selff.S.loc[uidx, "pos_gp"], + selff._Sloc_ph(uidx)/(selff._Sloc_ph(uidx) + selff._Sloc_ph(uidx, min = False)), + color = 'k', marker = '.', alpha = default_alpha, s = 1 + ) + + # plot ambiguous SNPs with opacity weighted by phase probability + selff.S["flipped"] = True + nuidx = np.flatnonzero(~((ph_prob == 0) | (ph_prob == 1))) + ax.scatter( + selff.S.loc[nuidx, "pos_gp"], + selff._Sloc_ph(nuidx)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)), + color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1 + ) + ax.scatter( + selff.S.loc[nuidx, "pos_gp"], + selff._Sloc_ph(nuidx, min = False)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)), + color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1 + ) + for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations): # get uniqued clust indices for each segment start seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())]) @@ -1261,5 +1288,5 @@ def visualize_segs(self, f = None, use_clust = False): marker = '.', s = 1, alpha = 1/n_samp ) - def visualize_clusts(self, f = None): - self.visualize_segs(f = f, use_clust = True) + def visualize_clusts(self, **kwargs): + self.visualize_segs(use_clust = True, **kwargs) From 6e5f80d57cdbd2a5076ce08386eda1b494defeba Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 11:26:42 -0400 Subject: [PATCH 094/222] Fix bugs with plotting SNPs --- hapaseg/allelic_DP.py | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 0a80903..3968834 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1229,26 +1229,30 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): ph_prob = np.r_[self.phase_orientations].mean(0) # only plot unambiguous SNPs once - uidx = np.flatnonzero((ph_prob == 0) | (ph_prob == 1)) - selff.S["flipped"] = ph_prob == 1 + uidx = ph_prob == 0 ax.scatter( - selff.S.loc[uidx, "pos_gp"], - selff._Sloc_ph(uidx)/(selff._Sloc_ph(uidx) + selff._Sloc_ph(uidx, min = False)), + self.S.loc[uidx, "pos_gp"], + self.S.loc[uidx, "min"]/self.S.loc[uidx, ["min", "maj"]].sum(1), + color = 'k', marker = '.', alpha = default_alpha, s = 1 + ) + uidx = ph_prob == 1 + ax.scatter( + self.S.loc[uidx, "pos_gp"], + self.S.loc[uidx, "maj"]/self.S.loc[uidx, ["min", "maj"]].sum(1), color = 'k', marker = '.', alpha = default_alpha, s = 1 ) # plot ambiguous SNPs with opacity weighted by phase probability - selff.S["flipped"] = True - nuidx = np.flatnonzero(~((ph_prob == 0) | (ph_prob == 1))) + nuidx = (ph_prob != 0) & (ph_prob != 1) ax.scatter( selff.S.loc[nuidx, "pos_gp"], - selff._Sloc_ph(nuidx)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)), - color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1 + self.S.loc[nuidx, "min"]/self.S.loc[nuidx, ["min", "maj"]].sum(1), + color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1 ) ax.scatter( selff.S.loc[nuidx, "pos_gp"], - selff._Sloc_ph(nuidx, min = False)/(selff._Sloc_ph(nuidx) + selff._Sloc_ph(nuidx, min = False)), - color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1 + self.S.loc[nuidx, "maj"]/self.S.loc[nuidx, ["min", "maj"]].sum(1), + color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1 ) for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations): @@ -1274,19 +1278,22 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp, selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp, ) - ax.add_patch(mpl.patches.Rectangle(( - selff.S.iloc[st]["pos_gp"], ci_lo), + ax.add_patch(mpl.patches.Rectangle( + (selff.S.iloc[st]["pos_gp"], ci_lo), selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[seg_cu[i] % len(colors)], - fill = True, alpha = 1/n_samp, zorder = 1000 + fill = True, alpha = 1 if show_snps else 1/n_samp, zorder = 1000 )) - plt.scatter( + ax.scatter( (selff.S.iloc[en - 1]["pos_gp"] + selff.S.iloc[st]["pos_gp"])/2, med, color = colors[seg_cu[i] % len(colors)], - marker = '.', s = 1, alpha = 1/n_samp + marker = '.', s = 1, alpha = 1 if show_snps else 1/n_samp ) + if show_snps: + break + def visualize_clusts(self, **kwargs): self.visualize_segs(use_clust = True, **kwargs) From 69828c2333ca9ae61c1d8f7ec89451b0a4238134 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 11:40:04 -0400 Subject: [PATCH 095/222] Clean up DP wrapper code --- hapaseg/allelic_DP.py | 259 +++++------------------------------------- 1 file changed, 28 insertions(+), 231 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 3968834..6d31fe9 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -17,36 +17,16 @@ class A_DP: def __init__(self, allelic_segs_pickle, ref_fasta = None): # dataframe of allelic imbalance segmentation samples for each chromosome arm - self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(0) - self.allelic_segs = self.allelic_segs.loc[self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)) > 0] - - # number of total segmentation samples - self.n_samp = self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)).min() - self.ref_fasta = ref_fasta - - # DP run objects for each segmentation sample - self.DP_runs = None - - # dataframe of SNPs - self.SNPs = None - - # number of segmentation samples used for DP run - self.N_seg_samps = None - # number of DP samples per segmentation sample - self.N_clust_samps = None - - # assignment of SNPs to DP clusters for each MCMC sample - self.snps_to_clusters = None - # phase correction of SNPs for each MCMC sample - self.snps_to_phases = None - - def load_seg_samp(self, samp_idx): - if samp_idx > self.n_samp: - raise ValueError(f"Only {self.n_samp} MCMC samples were taken!") - - SNPs = [] + self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(axis = 0) + # if some chromsome arms couldn't find the MLE, just use current state of chain + none_idx = self.allelic_segs["results"].apply(lambda x : x.breakpoints_MLE is None) + for i in none_idx[none_idx].index: + self.allelic_segs.iloc[i]["results"].breakpoints_MLE = self.allelic_segs.iloc[i]["results"].breakpoints + + # load SNPs + self.SNPs = [] clust_offset = 0 - for _, H in self.allelic_segs.dropna(subset = ["results"]).iterrows(): + for _, H in self.allelic_segs.iterrows(): S = copy.deepcopy(H["results"].P) S["A_alt"] = 0 S.loc[S["aidx"], "A_alt"] = S.loc[S["aidx"], "ALT_COUNT"] @@ -62,8 +42,7 @@ def load_seg_samp(self, samp_idx): # set initial cluster assignments based on segmentation S["clust"] = -1 - # TODO: use ML segmentation - bpl = np.array(H["results"].breakpoint_list[samp_idx]); bpl = np.c_[bpl[0:-1], bpl[1:]] + bpl = np.array(H["results"].breakpoints_MLE); bpl = np.c_[bpl[0:-1], bpl[1:]] for i, (st, en) in enumerate(bpl): S.iloc[st:en, S.columns.get_loc("clust")] = i + clust_offset clust_offset += i @@ -72,217 +51,35 @@ def load_seg_samp(self, samp_idx): S = S.iloc[:-1] assert (S["clust"] != -1).all() - SNPs.append(S) + self.SNPs.append(S) - SNPs = pd.concat(SNPs, ignore_index = True) + self.SNPs = pd.concat(self.SNPs, ignore_index = True) # convert chr-relative positions to absolute genomic coordinates - SNPs["pos_gp"] = seq.chrpos2gpos(SNPs["chr"], SNPs["pos"], ref = self.ref_fasta) + self.ref_fasta = ref_fasta + self.SNPs["pos_gp"] = seq.chrpos2gpos(self.SNPs["chr"], self.SNPs["pos"], ref = self.ref_fasta) # initial phasing orientation - SNPs["flipped"] = False - - return SNPs, None - - # map trace of segment cluster assignments to the SNPs within - @staticmethod - def map_seg_clust_assignments_to_SNPs(segs_to_clusters, S): - st_col = S.columns.get_loc("SNP_st") - en_col = S.columns.get_loc("SNP_en") - snps_to_clusters = np.zeros((segs_to_clusters.shape[0], S.iloc[-1, en_col] + 1), dtype = int) - for i, seg_assign in enumerate(segs_to_clusters): - for j, seg in enumerate(seg_assign): - snps_to_clusters[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = seg - - return snps_to_clusters - - @staticmethod - def map_seg_phases_to_SNPs(phase, S): - st_col = S.columns.get_loc("SNP_st") - en_col = S.columns.get_loc("SNP_en") - snps_to_phase = np.zeros((phase.shape[0], S.iloc[-1, en_col] + 1), dtype = int) - for i, phase_orient in enumerate(phase): - for j, ph in enumerate(phase_orient): - snps_to_phase[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = ph - - return snps_to_phase - - def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None): - self.N_seg_samps = N_seg_samps if seg_sample_idx is None else 1 - self.N_clust_samps = N_clust_samps - - seg_sample_idx = np.random.choice(self.n_samp - 1, self.N_seg_samps, replace = False) if seg_sample_idx is None else [seg_sample_idx] - S, SNPs = self.load_seg_samp(seg_sample_idx[0]) - N_SNPs = len(SNPs) - - self.snps_to_clusters = -1*np.ones((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = np.int16) - self.snps_to_phases = np.zeros((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = bool) - self.DP_likelihoods = np.zeros((self.N_clust_samps*self.N_seg_samps, 2)) - - self.DP_runs = [None]*self.N_seg_samps - - clust_prior = sc.SortedDict() - clust_count_prior = sc.SortedDict() - n_iter_clust_exist = sc.SortedDict() - cur_samp_iter = 0 - - for n_it in range(self.N_seg_samps): - if n_it > 0: - S, SNPs = self.load_seg_samp(seg_sample_idx[n_it]) - - # run clustering - self.DP_runs[n_it] = DPinstance(S, clust_prior = clust_prior, clust_count_prior = clust_count_prior) - segs_to_clusters, segs_to_phases = self.DP_runs[n_it].run(n_iter = self.N_clust_samps) - - # compute likelihoods for each clustering - self.DP_likelihoods[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.DP_runs[n_it].compute_overall_lik() - - # assign clusters to individual SNPs, to use as segment assignment prior for next DP iteration - self.snps_to_clusters[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_clust_assignments_to_SNPs(segs_to_clusters, S) - - # assign phase orientations to individual SNPs - self.snps_to_phases[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_phases_to_SNPs(segs_to_phases, S) - - # compute prior on cluster locations/counts - max_clust_idx = segs_to_clusters.max() - for seg_assignments, seg_phases in zip(segs_to_clusters, segs_to_phases): - # reset phases - S2 = S.copy() - S2.loc[S2["flipped"], ["min", "maj"]] = S2.loc[S2["flipped"], ["min", "maj"]].values[:, ::-1] - - # match phases to current sample - S2.loc[seg_phases, ["min", "maj"]] = S2.loc[seg_phases, ["min", "maj"]].values[:, ::-1] - - # minor/major counts for each cluster in this iteration - S_a = npg.aggregate(seg_assignments, S2["min"], size = max_clust_idx + 1) - S_b = npg.aggregate(seg_assignments, S2["maj"], size = max_clust_idx + 1) - c = np.c_[S_a, S_b] - - # total numer of SNPs for each cluster in this iteration - #N_c = npg.aggregate(seg_assignments, S2["SNP_en"] - S2["SNP_st"], size = max_clust_idx + 1) - N_c = npg.aggregate(seg_assignments, 1, size = max_clust_idx + 1) - - # iteratively update priors - next_clust_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), c[c.sum(1) > 0])) - next_clust_count_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), N_c[N_c > 0])) - - for cl in np.unique(seg_assignments): - if cl in n_iter_clust_exist: - n_iter_clust_exist[cl] += 1 - else: - n_iter_clust_exist[cl] = 1 - cur_samp_iter += 1 - - for k, v in next_clust_prior.items(): - nccp = next_clust_count_prior[k] - if k in clust_prior: - clust_prior[k] += (v - clust_prior[k])/n_iter_clust_exist[k] - clust_count_prior[k] += (nccp - clust_count_prior[k])/cur_samp_iter - else: - clust_prior[k] = v - clust_count_prior[k] = nccp/cur_samp_iter - # for clusters that don't exist in this iteration, average counts with zero - for k, v in clust_prior.items(): - if k != -1 and k not in next_clust_prior: - clust_count_prior[k] -= clust_count_prior[k]/cur_samp_iter - - # remove improbable clusters from prior - for kk in [k for k, v in clust_count_prior.items() if v < 1]: - del clust_prior[kk] - del clust_count_prior[kk] - - return self.snps_to_clusters, self.snps_to_phases, self.DP_likelihoods - - def visualize_segs(self, snps_to_clusters = None, f = None, n_vis_samp = None): - f = plt.figure(figsize = [17.56, 5.67]) if f is None else f - - snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters - - # plot all samples from DP - if n_vis_samp is None: - run_idx = np.r_[0:self.N_seg_samps] - N_seg_samps = self.N_seg_samps + self.SNPs["flipped"] = False - # only plot up to n_vis_samp _segmentation samples_ from DP - # (all DP samples for a given segmentation sample will be plotted) - else: - run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False) - N_seg_samps = n_vis_samp - - for d in [self.DP_runs[x] for x in run_idx]: - d.visualize_adjacent_segs(f = f.number, n_samp = N_seg_samps*self.N_clust_samps) - - def visualize_clusts(self, snps_to_clusters = None, f = None, thick = False, nocolor = False, n_vis_samp = None): - f = plt.figure(figsize = [17.56, 5.67]) if f is None else f - - snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters - - # plot all samples from DP - if n_vis_samp is None: - run_idx = np.r_[0:self.N_seg_samps] - N_seg_samps = self.N_seg_samps - - # only plot up to n_vis_samp _segmentation samples_ from DP - # (all DP samples for a given segmentation sample will be plotted) - else: - run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False) - N_seg_samps = n_vis_samp + self.N_clust_samps = 100 - for d in [self.DP_runs[x] for x in run_idx]: - d.visualize_clusts(f = f.number, n_samp = N_seg_samps*self.N_clust_samps, thick = thick, nocolor = nocolor) - - def visualize_SNPs(self, snps_to_phases = None, color = True, f = None): - snps_to_phases = snps_to_phases if snps_to_phases is not None else self.snps_to_phases - ph_prob = snps_to_phases.mean(0) - - if color: - rb = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]] - else: - rb = np.full([2, 3], 0) - - logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M))) - - def scerrorbar(idx, rev = False, alpha = 1, show_CI = True): - if rev: - f = 1 - self.SNPs.loc[idx, "f"] - eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_hi"] - eb_top = self.SNPs.loc[idx, "f_CI_lo"] - self.SNPs.loc[idx, "f"] - else: - f = self.SNPs.loc[idx, "f"] - eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_lo"] - eb_top = self.SNPs.loc[idx, "f_CI_hi"] - self.SNPs.loc[idx, "f"] - - if show_CI: - plt.errorbar( - x = self.SNPs.loc[idx, "gpos"], - y = f, - yerr = np.c_[ - eb_bot, - eb_top - ].T, - fmt = 'none', ecolor = np.c_[rb[self.SNPs.loc[idx, "allele"]], (alpha if isinstance(alpha, np.ndarray) else alpha*np.ones(idx.sum()))**2] - ) - - plt.scatter( - self.SNPs.loc[idx, "gpos"], - f, - color = rb[self.SNPs.loc[idx, "allele"]], - marker = '.', - s = 1, - alpha = alpha if show_CI else alpha - ) + # assignment of SNPs to DP clusters for each MCMC sample + self.snps_to_clusters = None + # phase correction of SNPs for each MCMC sample + self.snps_to_phases = None - default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.SNPs)) + def run(self): + self.DP_run = DPinstance( + self.SNPs, + dp_count_scale_factor = self.SNPs["clust"].value_counts().mean() + ) + self.snps_to_clusters, self.snps_to_phases = self.DP_run.run(n_samps = self.N_clust_samps) - f = plt.figure(figsize = [17.56, 5.67]) if f is None else f - scerrorbar(ph_prob == 0, alpha = default_alpha, show_CI = color) - scerrorbar(ph_prob == 1, rev = True, alpha = default_alpha, show_CI = color) - idx = (ph_prob > 0) & (ph_prob < 1) - scerrorbar(idx, alpha = (1 - ph_prob[idx])*default_alpha, show_CI = color) - scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color) + return self.snps_to_clusters, self.snps_to_phases class DPinstance: - def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 1, temperature = 1, dp_count_scale_factor = 1): + def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, temperature = 1, dp_count_scale_factor = 1): self.S = S self.clust_prior = clust_prior.copy() self.clust_count_prior = clust_count_prior.copy() From 44a42abda652d47a239c84823ad491d6389134b8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 18:13:52 -0400 Subject: [PATCH 096/222] Return likelihood trace --- hapaseg/allelic_DP.py | 48 ++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 6d31fe9..65731e1 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -68,15 +68,17 @@ def __init__(self, allelic_segs_pickle, ref_fasta = None): self.snps_to_clusters = None # phase correction of SNPs for each MCMC sample self.snps_to_phases = None + # likelihoods of each clustering + self.likelihoods = None def run(self): self.DP_run = DPinstance( self.SNPs, dp_count_scale_factor = self.SNPs["clust"].value_counts().mean() ) - self.snps_to_clusters, self.snps_to_phases = self.DP_run.run(n_samps = self.N_clust_samps) + self.snps_to_clusters, self.snps_to_phases, self.likelihoods = self.DP_run.run(n_samps = self.N_clust_samps) - return self.snps_to_clusters, self.snps_to_phases + return self.snps_to_clusters, self.snps_to_phases, self.likelihoods class DPinstance: def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, temperature = 1, dp_count_scale_factor = 1): @@ -538,20 +540,20 @@ def run(self, n_iter = 0, n_samps = 0): # containers for saving the MCMC trace self.snps_to_clusters = [] self.phase_orientations = [] + self.segment_trace = [] + self.likelihood_trace = [] - burned_in = False - - # likelihood trace + # likelihood trace for checking burnin status self.lik_trace = [] - self.segment_trace = [] - self.post = 0 + burned_in = False + self.burnin_iteration = -1 + touch90 = False + likelihood_ready = False n_it = 0 n_it_last = 0 brk = 0 - touch90 = False - likelihood_ready = False while True: if not n_it % 1000: @@ -559,9 +561,8 @@ def run(self, n_iter = 0, n_samps = 0): print(pd.Series(self.clust_counts.values()).value_counts().sort_index()) else: print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1])) - print(brk % (len(self.breakpoints) - 1)) - #print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) - #print("n unassigned: {}".format((self.S["clust"] == -1).sum())) + if likelihood_ready: + print("[{}] Likelihood: {}".format("*" if burned_in else " ", self.lik_trace[-1].sum())) # stop after a raw number of iterations if n_iter > 0 and n_it > n_iter: @@ -588,8 +589,10 @@ def run(self, n_iter = 0, n_samps = 0): # check if likelihood has stabilized enough to consider us "burned in" if likelihood_ready and not burned_in and len(self.lik_trace) > 100: lt = np.vstack(self.lik_trace).sum(1) - if (np.convolve(np.diff(lt), np.ones(50)/50, mode = "same") < 0).sum() > 2: + if (np.convolve(np.diff(lt), np.ones(100)/100, mode = "same") < 0).sum() > 2: + print("BURNED IN") burned_in = True + self.burnin_iteration = len(self.lik_trace) n_it_last = n_it # @@ -969,11 +972,12 @@ def run(self, n_iter = 0, n_samps = 0): self.snps_to_clusters.append(self.S["clust"].copy()) self.phase_orientations.append(self.S["flipped"].copy()) self.segment_trace.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]}) + self.likelihood_trace.append(self.compute_overall_lik_simple().sum()) n_it_last = n_it n_it += 1 - return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations] + return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations], np.r_[self.likelihood_trace] #_colors = mpl.cm.get_cmap("tab10").colors _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int) @@ -1094,3 +1098,19 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): def visualize_clusts(self, **kwargs): self.visualize_segs(use_clust = True, **kwargs) + + def plot_likelihood_trace(self): + lt = np.vstack(self.lik_trace) + lt = lt[np.isnan(lt).sum(1) == 0, :] + + lt = lt[self.burnin_iteration:, :] + + plt.figure(); plt.clf() + plt.scatter(np.r_[0:len(lt)], lt[:, 0] - lt[:, 0].max()) + #plt.scatter(np.r_[0:len(lt)], lt[:, 1] - lt[:, 1].max()) + plt.scatter(np.r_[0:len(lt)], lt[:, 2] - lt[:, 2].max()) + plt.scatter(np.r_[0:len(lt)], lt[:, 3] - lt[:, 3].max()) + plt.scatter(np.r_[0:len(lt)], lt.sum(1) - lt.sum(1).max(), marker = '+', color = 'k') + plt.legend(["Clust", "DP", "Seg", "Total"]) + plt.xlabel(r"Post-burnin iteration ($\times 100$)") + plt.ylabel(r"$\Delta$ likelihood") From 662b9b1b27134e234ccab9d935e9631adf7ce8e3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 19:25:11 -0400 Subject: [PATCH 097/222] Overhaul plot colorization --- hapaseg/allelic_DP.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 65731e1..a81c96d 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1,5 +1,6 @@ import colorama import copy +import distinctipy import itertools import matplotlib.pyplot as plt import matplotlib as mpl @@ -1005,9 +1006,16 @@ def get_colors(self): T["clust"] = self.S.loc[T["snp_st"], "clust"].values clust_terr = T.groupby("clust")["terr"].sum().sort_values(ascending = False) + si = clust_terr.index.argsort() # color any cluster larger than 10Mb (~0.003 of total genomic territory) - return np.array([mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (clust_terr/clust_terr.sum() >= 0.003).sum())]) + extra_colors = np.array( + distinctipy.distinctipy.get_colors( + (clust_terr/clust_terr.sum() >= 0.003).sum() - _colors.shape[0], + exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], _colors]]) + ) + + return np.r_[_colors, extra_colors][si] def visualize_segs(self, f = None, use_clust = False, show_snps = False): f = plt.figure(figsize = [16, 4]) if f is None else f @@ -1029,18 +1037,20 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): ph_prob = np.r_[self.phase_orientations].mean(0) + cu = np.searchsorted(s2cu, self.S["clust"]) + # only plot unambiguous SNPs once uidx = ph_prob == 0 ax.scatter( self.S.loc[uidx, "pos_gp"], self.S.loc[uidx, "min"]/self.S.loc[uidx, ["min", "maj"]].sum(1), - color = 'k', marker = '.', alpha = default_alpha, s = 1 + color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1 ) uidx = ph_prob == 1 ax.scatter( self.S.loc[uidx, "pos_gp"], self.S.loc[uidx, "maj"]/self.S.loc[uidx, ["min", "maj"]].sum(1), - color = 'k', marker = '.', alpha = default_alpha, s = 1 + color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1 ) # plot ambiguous SNPs with opacity weighted by phase probability @@ -1048,15 +1058,20 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): ax.scatter( selff.S.loc[nuidx, "pos_gp"], self.S.loc[nuidx, "min"]/self.S.loc[nuidx, ["min", "maj"]].sum(1), - color = 'k', marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1 + color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1 ) ax.scatter( selff.S.loc[nuidx, "pos_gp"], self.S.loc[nuidx, "maj"]/self.S.loc[nuidx, ["min", "maj"]].sum(1), - color = 'k', marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1 + color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1 ) for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations): + # only show maximum likelihood if we're overlaying SNPs + if show_snps: + mlidx = np.r_[self.likelihood_trace].argmax() + seg2c, s2ph = self.segment_trace[mlidx], self.phase_orientations[mlidx] + # get uniqued clust indices for each segment start seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())]) @@ -1084,6 +1099,7 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[seg_cu[i] % len(colors)], + edgecolor = 'k' if show_snps else None, linewidth = 0.5 if show_snps else None, fill = True, alpha = 1 if show_snps else 1/n_samp, zorder = 1000 )) ax.scatter( From c8490fd6a9406891c0a4257ab4937f81a4152522 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 20:58:19 -0400 Subject: [PATCH 098/222] No magenta --- hapaseg/allelic_DP.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index a81c96d..01ce4da 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -980,22 +980,14 @@ def run(self, n_iter = 0, n_samps = 0): return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations], np.r_[self.likelihood_trace] - #_colors = mpl.cm.get_cmap("tab10").colors - _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int) -# _colors = np.r_[np.c_[87, 182, 55], -# np.c_[253, 245, 81], -# np.c_[238, 109, 45], -# np.c_[204, 43, 30], -# np.c_[221, 50, 132], -# np.c_[0, 23, 204], -# np.c_[75, 172, 227]]/255 - def get_unique_clust_idxs(self, snps_to_clusters = None): if snps_to_clusters is None: snps_to_clusters = np.r_[self.snps_to_clusters] s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True) return s2cu, s2cu_j.reshape(snps_to_clusters.shape) + _colors = ((np.r_[np.c_[1:5], np.c_[6:7]] & np.r_[4, 2, 1]) > 0).astype(int) + def get_colors(self): s2cu, s2cu_j = self.get_unique_clust_idxs() @@ -1012,7 +1004,9 @@ def get_colors(self): extra_colors = np.array( distinctipy.distinctipy.get_colors( (clust_terr/clust_terr.sum() >= 0.003).sum() - _colors.shape[0], - exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], _colors]]) + exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], _colors]], + rng = 1234 + ) ) return np.r_[_colors, extra_colors][si] From c3bf48dc0333b2b1a8ac3790b3b14004c646b7bb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:02:58 -0400 Subject: [PATCH 099/222] Use new colorscheme --- hapaseg/allelic_DP.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 01ce4da..603aa35 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -986,8 +986,6 @@ def get_unique_clust_idxs(self, snps_to_clusters = None): s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True) return s2cu, s2cu_j.reshape(snps_to_clusters.shape) - _colors = ((np.r_[np.c_[1:5], np.c_[6:7]] & np.r_[4, 2, 1]) > 0).astype(int) - def get_colors(self): s2cu, s2cu_j = self.get_unique_clust_idxs() @@ -1001,15 +999,32 @@ def get_colors(self): si = clust_terr.index.argsort() # color any cluster larger than 10Mb (~0.003 of total genomic territory) + base_colors = np.array([ + [0.368417, 0.506779, 0.709798], + [0.880722, 0.611041, 0.142051], + [0.560181, 0.691569, 0.194885], + [0.922526, 0.385626, 0.209179], + [0.528488, 0.470624, 0.701351], + [0.772079, 0.431554, 0.102387], + [0.363898, 0.618501, 0.782349], + [1, 0.75, 0], + [0.647624, 0.37816, 0.614037], + [0.571589, 0.586483, 0.], + [0.915, 0.3325, 0.2125], + [0.400822, 0.522007, 0.85], + [0.972829, 0.621644, 0.073362], + [0.736783, 0.358, 0.503027], + [0.280264, 0.715, 0.429209] + ]) extra_colors = np.array( distinctipy.distinctipy.get_colors( - (clust_terr/clust_terr.sum() >= 0.003).sum() - _colors.shape[0], - exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], _colors]], + (clust_terr/clust_terr.sum() >= 0.003).sum() - base_colors.shape[0], + exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], base_colors]], rng = 1234 ) ) - return np.r_[_colors, extra_colors][si] + return np.r_[base_colors, extra_colors if extra_colors.size > 0 else np.empty([0, 3])][si] def visualize_segs(self, f = None, use_clust = False, show_snps = False): f = plt.figure(figsize = [16, 4]) if f is None else f From dc6887d3abe36492590bf0fa4136dd829f839707 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:11:29 -0400 Subject: [PATCH 100/222] Update ADP wrapper --- hapaseg/__main__.py | 41 ++++++++++++----------------------------- 1 file changed, 12 insertions(+), 29 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index d110bc4..7821f5d 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -101,8 +101,6 @@ def parse_args(): ## DP dp = subparsers.add_parser("dp", help = "Run DP clustering on allelic imbalance segments") dp.add_argument("--seg_dataframe", required = True) - dp.add_argument("--n_dp_iter", default = 10) - dp.add_argument("--seg_samp_idx", default = 0) dp.add_argument("--ref_fasta", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load dp.add_argument("--cytoband_file", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load @@ -310,14 +308,7 @@ def main(): A = A_DP(args.seg_dataframe, ref_fasta = args.ref_fasta) # run DP - # TODO: when we have better type checking, drop the int coersion here - #N_seg_samps = A.n_samp - 1 if int(args.n_seg_samps) == 0 else int(args.n_seg_samps) - # TODO: if we decide to drop support for chained sampling altogether, remove N_seg_samps logic altogether - snps_to_clusters, snps_to_phases, likelihoods = A.run( - seg_sample_idx = int(args.seg_samp_idx), - #N_seg_samps = N_seg_samps, - N_clust_samps = int(args.n_dp_iter) - ) + snps_to_clusters, snps_to_phases, likelihoods = A.run() # save DP results np.savez(output_dir + "/allelic_DP_SNP_clusts_and_phase_assignments.npz", @@ -331,34 +322,26 @@ def main(): # # plot DP results - # 1. phased SNP visualization - f = plt.figure(figsize = [17.56, 5.67]) - hs_utils.plot_chrbdy(args.cytoband_file) - A.visualize_SNPs(snps_to_phases, color = True, f = f) - A.visualize_clusts(snps_to_clusters, f = f, thick = True, nocolor = True) - plt.ylabel("Haplotypic imbalance") - plt.title("SNP phasing/segmentation") - plt.savefig(output_dir + "/figures/SNPs.png", dpi = 300) - plt.close() + # 0. likelihood trace + A.DP_run.plot_likelihood_trace() + plt.savefig(output_dir + "/figures/likelihood_trace.png", dpi = 300) - # 2. pre-clustering segments + # 1. SNPs + segments f = plt.figure(figsize = [17.56, 5.67]) hs_utils.plot_chrbdy(args.cytoband_file) - A.visualize_SNPs(snps_to_phases, color = False, f = f) - A.visualize_segs(snps_to_clusters, f = f) + A.DP_run.visualize_segs(f = f, show_snps = True) plt.ylabel("Haplotypic imbalance") - plt.title("Allelic segmentation, pre-DP clustering") - plt.savefig(output_dir + "/figures/allelic_imbalance_preDP.png", dpi = 300) + plt.title("SNPs + allelic segmentation (MAP)") + plt.savefig(output_dir + "/figures/SNPs.png", dpi = 300) plt.close() - # 3. post-clustering segments + # 2. segments alone f = plt.figure(figsize = [17.56, 5.67]) hs_utils.plot_chrbdy(args.cytoband_file) - A.visualize_SNPs(snps_to_phases, color = False, f = f) - A.visualize_clusts(snps_to_clusters, f = f, thick = True) + A.DP_run.visualize_segs(f = f, show_snps = False) plt.ylabel("Haplotypic imbalance") - plt.title("Allelic segmentation, post-DP clustering") - plt.savefig(output_dir + "/figures/allelic_imbalance_postDP.png", dpi = 300) + plt.title("Allelic segmentation (posterior)") + plt.savefig(output_dir + "/figures/segs_only.png", dpi = 300) plt.close() if __name__ == "__main__": From 4c758741bbd322bf5b2e6c1afd60e9df5f7d9261 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:12:46 -0400 Subject: [PATCH 101/222] Bump betahyp --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 603aa35..43cc742 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -94,7 +94,7 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F") - self.betahyp = 1 + self.betahyp = 10 # # define column indices From a5182babdc6f43dc0f1fbf742e108a58d9f433be Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:13:15 -0400 Subject: [PATCH 102/222] Burn in for longer --- hapaseg/allelic_DP.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 43cc742..ca7b8f2 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -588,9 +588,9 @@ def run(self, n_iter = 0, n_samps = 0): likelihood_ready = True # check if likelihood has stabilized enough to consider us "burned in" - if likelihood_ready and not burned_in and len(self.lik_trace) > 100: + if likelihood_ready and not burned_in and len(self.lik_trace) > 500: lt = np.vstack(self.lik_trace).sum(1) - if (np.convolve(np.diff(lt), np.ones(100)/100, mode = "same") < 0).sum() > 2: + if (np.convolve(np.diff(lt), np.ones(500)/500, mode = "same") < 0).sum() > 2: print("BURNED IN") burned_in = True self.burnin_iteration = len(self.lik_trace) From f4bd8e56da827edd6118ee65df12dbf2d74048ae Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:13:57 -0400 Subject: [PATCH 103/222] Keep cluster indices more consistent --- hapaseg/allelic_DP.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index ca7b8f2..e0bb0a1 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -889,13 +889,11 @@ def run(self, n_iter = 0, n_samps = 0): if choice < 0: # if we are moving an entire cluster, give it the same index it used to have # otherwise, cluster indices will be inconsistent - if move_clust: - new_clust_idx = cl_idx - elif choice == -1: # totally new cluster + if cur_clust not in self.clust_counts: + new_clust_idx = cur_clust + else: # totally new cluster max_clust_idx += 1 new_clust_idx = max_clust_idx - else: # match index of cluster in prior - new_clust_idx = -choice - 2 self.clust_counts[new_clust_idx] = n_move self.S.iloc[seg_idx, self.clust_col] = new_clust_idx From 4bef4e03293e2d83f7a64d4c2992fcbbb1952489 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:18:11 -0400 Subject: [PATCH 104/222] Get rid of code to move clusters --- hapaseg/allelic_DP.py | 254 +++++++++++++++++------------------------- 1 file changed, 100 insertions(+), 154 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index e0bb0a1..6d89d57 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -597,156 +597,124 @@ def run(self, n_iter = 0, n_samps = 0): n_it_last = n_it # - # pick either a segment or a cluster at random (50:50 prob.) - move_clust = False + # pick a segment to move - # move a segment - #if not touch90 or np.random.rand() < 0.9: - if True or np.random.rand() < 0.9: - # >90% of segments have been moved; we are iterating over segments sequentially - if touch90: - break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)}) - brk += 1 - # we are picking segments at random - else: - break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) - - # get all SNPs within this segment - seg_st = self.breakpoints[break_idx[0]] - seg_en = self.breakpoints[break_idx[0] + 1] - seg_idx = np.r_[seg_st:seg_en] - - cur_clust = int(self.clusts[seg_idx[0]]) - - # propose breaking this segment - if np.random.rand() < 0.1: - # can't split segments of length 1 - if len(seg_idx) == 1: - n_it += 1 - continue - - # TODO: memoize cumsums? - min_cs = self._Scumsum_ph(seg_idx, min = True) - min_csr = self.seg_sums[seg_idx[0]][0] - min_cs - maj_cs = self._Scumsum_ph(seg_idx, min = False) - maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs - - split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) - split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp) - split_lik -= split_lik.max() - split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) - seg_idx = seg_idx[:(split_point + 1)] - - # add breakpoint (can be erased subsequently if segment rejoins original cluster) - new_bp = seg_idx[-1] + 1 - if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment - self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust) - - # propose splitting out a contiguous interval of segments within the current cluster {{{ - split_clust = False - if np.random.rand() < 0.1: - # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable? - clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])]) - - # can't split clusters of length 1 - if len(clust_snps) == 1: - n_it += 1 - continue - - split_bdy = self.compute_cluster_splitpoints(clust_snps) - - A_tot, B_tot = self.clust_sums[cur_clust] - - lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp) - - liks = np.zeros(len(split_bdy) + 1) - liks[-1] = lik0 # don't split at all - - # likelihood ratios for splitting each region into a new cluster - for i, (st, en) in enumerate(split_bdy): - A = self._Ssum_ph(clust_snps[st:en], min = True) - B = self._Ssum_ph(clust_snps[st:en], min = False) - - liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) - - # pick a region to split - split_idx = np.random.choice( - len(split_bdy) + 1, - p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum() - ) + # >90% of segments have been moved; we are iterating over segments sequentially + if touch90: + break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)}) + brk += 1 + # we are picking segments at random + else: + break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) + + # get all SNPs within this segment + seg_st = self.breakpoints[break_idx[0]] + seg_en = self.breakpoints[break_idx[0] + 1] + seg_idx = np.r_[seg_st:seg_en] - # don't split at all - if split_idx == len(split_bdy): - n_it += 1 - continue + cur_clust = int(self.clusts[seg_idx[0]]) - # seg_idx == SNPs to propose to split off - seg_idx = clust_snps[slice(*split_bdy[split_idx])] + # propose breaking this segment + if np.random.rand() < 0.1: + # can't split segments of length 1 + if len(seg_idx) == 1: + n_it += 1 + continue - split_clust = True + # TODO: memoize cumsums? + min_cs = self._Scumsum_ph(seg_idx, min = True) + min_csr = self.seg_sums[seg_idx[0]][0] - min_cs + maj_cs = self._Scumsum_ph(seg_idx, min = False) + maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs + + split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) + split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp) + split_lik -= split_lik.max() + split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) + seg_idx = seg_idx[:(split_point + 1)] + + # add breakpoint (can be erased subsequently if segment rejoins original cluster) + new_bp = seg_idx[-1] + 1 + if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment + self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust) + + # propose splitting out a contiguous interval of segments within the current cluster {{{ + split_clust = False + if False and touch90 and np.random.rand() < 0.1: + # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable? + clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])]) + + # can't split clusters of length 1 + if len(clust_snps) == 1: + n_it += 1 + continue - # add breakpoints - for si in [seg_idx[0], seg_idx[-1]]: - if si not in self.breakpoints: - seg_st_idx = self.breakpoints.bisect_left(si) - 1 - seg_st = self.breakpoints[seg_st_idx] - seg_en_idx = self.breakpoints.bisect_left(si) - seg_en = self.breakpoints[seg_en_idx] + split_bdy = self.compute_cluster_splitpoints(clust_snps) - self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust) + A_tot, B_tot = self.clust_sums[cur_clust] - # get all breakpoints within this cluster/interval - left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0]) - right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1]) - break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]]) + lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp) - # }}} + liks = np.zeros(len(split_bdy) + 1) + liks[-1] = lik0 # don't split at all - n_move = len(seg_idx) + # likelihood ratios for splitting each region into a new cluster + for i, (st, en) in enumerate(split_bdy): + A = self._Ssum_ph(clust_snps[st:en], min = True) + B = self._Ssum_ph(clust_snps[st:en], min = False) - # if segment was already assigned to a cluster, unassign it - if cur_clust >= 0: - self.clust_counts[cur_clust] -= n_move - if self.clust_counts[cur_clust] == 0: - del self.clust_counts[cur_clust] - del self.clust_sums[cur_clust] - del self.clust_members[cur_clust] - del self.clust_members_bps[cur_clust] - else: - self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] - self.clust_members[cur_clust] -= set(seg_idx) - for b in break_idx: - self.clust_members_bps[cur_clust].remove(self.breakpoints[b]) + liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) - self.clusts[seg_idx] = -1 + # pick a region to split + split_idx = np.random.choice( + len(split_bdy) + 1, + p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum() + ) - # pick a cluster at random - else: - # it only makes sense to try joining two clusters if there are at least two of them! - if len(self.clust_counts) < 2: + # don't split at all + if split_idx == len(split_bdy): n_it += 1 continue - cl_idx = np.random.choice(self.clust_counts.keys()) - seg_idx = np.r_[list(self.clust_members[cl_idx])] + # seg_idx == SNPs to propose to split off + seg_idx = clust_snps[slice(*split_bdy[split_idx])] - # get all breakpoints corresponding to this cluster - break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cl_idx]]) + split_clust = True - n_move = len(seg_idx) - cur_clust = -1 # only applicable for individual segments, so we set to -1 here - # (this is so that subsequent references to clust_sums[cur_clust] - # will return (0, 0)) + # add breakpoints + for si in [seg_idx[0], seg_idx[-1]]: + if si not in self.breakpoints: + seg_st_idx = self.breakpoints.bisect_left(si) - 1 + seg_st = self.breakpoints[seg_st_idx] + seg_en_idx = self.breakpoints.bisect_left(si) + seg_en = self.breakpoints[seg_en_idx] - # unassign all segments within this cluster - # (it will either be joined with a new cluster, or remade again into its own cluster) - del self.clust_counts[cl_idx] - del self.clust_sums[cl_idx] - del self.clust_members[cl_idx] - del self.clust_members_bps[cl_idx] - self.clusts[seg_idx] = -1 + self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust) - move_clust = True + # get all breakpoints within this cluster/interval + left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0]) + right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1]) + break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]]) + + # }}} + + n_move = len(seg_idx) + + # if segment was already assigned to a cluster, unassign it + if cur_clust >= 0: + self.clust_counts[cur_clust] -= n_move + if self.clust_counts[cur_clust] == 0: + del self.clust_counts[cur_clust] + del self.clust_sums[cur_clust] + del self.clust_members[cur_clust] + del self.clust_members_bps[cur_clust] + else: + self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] + self.clust_members[cur_clust] -= set(seg_idx) + for b in break_idx: + self.clust_members_bps[cur_clust].remove(self.breakpoints[b]) + + self.clusts[seg_idx] = -1 # # perform phase correction on segment/cluster @@ -844,12 +812,7 @@ def run(self, n_iter = 0, n_samps = 0): # # adjacent segment likelihood - #adj_AB = 0 - #adj_BC = np.zeros([len(self.clust_sums), 2]) - - log_adj_lik = 0 - if not move_clust: # or (move_clust and np.random.rand() < 0.01): - log_adj_lik = self.compute_adj_prob(break_idx[0]) + log_adj_lik = self.compute_adj_prob(break_idx[0]) # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase) num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) @@ -880,11 +843,6 @@ def run(self, n_iter = 0, n_samps = 0): en = self.breakpoints[b + 1] self.seg_sums[st] = self.seg_sums[st][::-1] - if not move_clust: - print(f"{cur_clust}->{choice} ({len(seg_idx)}, s, [{seg_idx[0]}, {seg_idx[-1]}])") - else: - print(f"{cl_idx}->{choice} ({len(seg_idx)}, c, [{seg_idx[0]}, {seg_idx[-1]}])") - # create new cluster if choice < 0: # if we are moving an entire cluster, give it the same index it used to have @@ -904,18 +862,6 @@ def run(self, n_iter = 0, n_samps = 0): # join existing cluster else: - # if we are combining two clusters, take the index of the bigger one - # this helps to keep cluster indices consistent - if move_clust and self.clust_counts[choice] < n_move: - self.clust_counts[cl_idx] = self.clust_counts[choice] - self.clust_sums[cl_idx] = self.clust_sums[choice] - self.clust_members[cl_idx] = self.clust_members[choice] - self.S.iloc[np.flatnonzero(self.S["clust"] == choice), self.clust_col] = cl_idx - del self.clust_counts[choice] - del self.clust_sums[choice] - del self.clust_members[choice] - choice = cl_idx - self.clust_counts[choice] += n_move self.clust_sums[choice] += np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a] self.S.iloc[seg_idx, self.clust_col] = choice From 37fde590b307fdc9d8c9f31a78552f86c26473e5 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:18:51 -0400 Subject: [PATCH 105/222] Commit some diagnostic code just in case --- hapaseg/allelic_DP.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 6d89d57..3f4d92c 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -599,6 +599,13 @@ def run(self, n_iter = 0, n_samps = 0): # # pick a segment to move +# diagnostic code to compute overall likelihood before move +# compute_lik = False +# lik_before = np.nan +# if touch90 and np.random.rand() < 0.1: +# compute_lik = True +# lik_before = self.compute_overall_lik_simple() + # >90% of segments have been moved; we are iterating over segments sequentially if touch90: break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)}) @@ -719,6 +726,12 @@ def run(self, n_iter = 0, n_samps = 0): # # perform phase correction on segment/cluster # flip min/maj with probability that alleles are oriented the "wrong" way +# if not np.isnan(self.seg_phase_probs[seg_idx[0]]): +# rfp = self.compute_rephase_prob(seg_idx) +# rfp_mem = self.seg_phase_probs[seg_idx[0]] +# if np.abs(rfp - rfp_mem) > 0.05: +# print(rfp_mem, rfp) +# breakpoint() if np.isnan(self.seg_phase_probs[seg_idx[0]]): self.seg_phase_probs[seg_idx[0]] = self.compute_rephase_prob(seg_idx) rephase_prob = self.seg_phase_probs[seg_idx[0]] @@ -898,6 +911,14 @@ def run(self, n_iter = 0, n_samps = 0): self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster update_idx.add(self.breakpoints.bisect_left(snp) - 1) snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1]) +# if len(update_idx): +# usnp = self.breakpoints[self.breakpoints.bisect_left(seg_idx[0]) - 1] +# print(f"{usnp}: {self.clusts[usnp]}") +# print(f"{snp_idx[0]}: {self.clusts[snp_idx[0]]} <") +# print(f"{snp_idx[1]}: {self.clusts[snp_idx[1]]} <") +# dsnp = self.breakpoints[self.breakpoints.bisect_right(seg_idx[0])] +# print(f"{dsnp}: {self.clusts[dsnp]}") +# print(f"Update: {self.breakpoints[update_idx[0]]}") for bp_idx in update_idx: st = self.breakpoints[bp_idx] en = self.breakpoints[bp_idx + 1] @@ -912,6 +933,24 @@ def run(self, n_iter = 0, n_samps = 0): else: self.clust_members_bps[choice] |= snp_idx +# diagnostic code to check if breakpoint list is properly updated +# if touch90: +# x = sc.SortedSet() +# for y in self.clust_members_bps.values(): +# x |= y +# if len(x) != len(self.breakpoints) - 1: +# breakpoint() + +# diagnostic code to compute overall likelihood delta for iteration +# if compute_lik: +# lik_after = self.compute_overall_lik_simple() +# lik_delta = lik_after.sum() - lik_before.sum() +# ML_choice = num.ravel()[choice_idx] +# if not np.isnan(lik_delta) and (lik_delta != 0 or ML_choice != 0): +# print("lik: {}; MLs: {}".format(lik_delta, ML_choice)) +## if lik_delta < 0 and ML_choice == 0: +## breakpoint() + # save a sample from the MCMC when >95% of segments have been touched since the last iteration if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95: self.snps_to_clusters.append(self.S["clust"].copy()) From 2ab56291d921cf8bbaac82d8e6dd47860df3d173 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:19:45 -0400 Subject: [PATCH 106/222] Bump touch90->95% --- hapaseg/allelic_DP.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 3f4d92c..af7958c 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -575,8 +575,8 @@ def run(self, n_iter = 0, n_samps = 0): # poll every 100 iterations for various statuses if not n_it % 100: - # have >90% of segments been touched? - if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.9: + # have >95% of segments been touched? + if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.95: touch90 = True # start computing likelihoods From c4662ec571c7c679fbd599809696f530caac54a5 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:19:53 -0400 Subject: [PATCH 107/222] Add TODO note --- hapaseg/allelic_DP.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index af7958c..4e0c9bc 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -145,6 +145,7 @@ def _Scumsum_ph(self, seg_idx, min = True): return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]][si].cumsum() def compute_rephase_prob(self, seg_idx): + # TODO: compute logcdf/logsf directly flip = self.S.iloc[seg_idx, self.flip_col] flip_n = ~flip From 8235ff02ec7bf45ff7d1a4d6a1be831a1ad31857 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:33:44 -0400 Subject: [PATCH 108/222] Print MCMC trace progress --- hapaseg/allelic_DP.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 4e0c9bc..5d6d24c 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -565,6 +565,8 @@ def run(self, n_iter = 0, n_samps = 0): print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1])) if likelihood_ready: print("[{}] Likelihood: {}".format("*" if burned_in else " ", self.lik_trace[-1].sum())) + if burned_in: + print("{}/{} MCMC samples collected".format(len(self.snps_to_clusters), n_samps)) # stop after a raw number of iterations if n_iter > 0 and n_it > n_iter: From 05f408da8519835f28285df9c12be0cf8f99125c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:42:14 -0400 Subject: [PATCH 109/222] Bump AMCMC segmentation docker --- wolF/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 35a365a..02e4358 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -50,7 +50,7 @@ class Hapaseg_burnin(wolf.Task): output_patterns = { "burnin_MCMC" : "amcmc_results.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:v458" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581" class Hapaseg_concat(wolf.Task): inputs = { @@ -65,7 +65,7 @@ class Hapaseg_concat(wolf.Task): "arms" : "AMCMC-arm*.pickle", "ref_bias" : ("ref_bias.txt", wolf.read_file) } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:v458" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581" class Hapaseg_amcmc(wolf.Task): inputs = { @@ -81,7 +81,7 @@ class Hapaseg_amcmc(wolf.Task): output_patterns = { "arm_level_MCMC" : "amcmc_results.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:v458" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581" class Hapaseg_allelic_DP(wolf.Task): inputs = { From c3b58d7f2aa2fd2ddc390899b060698fa32ba86f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:42:38 -0400 Subject: [PATCH 110/222] Update ADP workflow --- wolF/tasks.py | 6 +++--- wolF/workflow.py | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 02e4358..acb75b2 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -101,9 +101,9 @@ class Hapaseg_allelic_DP(wolf.Task): output_patterns = { "cluster_and_phase_assignments" : "allelic_DP_SNP_clusts_and_phase_assignments.npz", "all_SNPs" : "all_SNPs.pickle", + "likelihood_trace_plot" : "figures/likelihood_trace.png", "SNP_plot" : "figures/SNPs.png", - "seg_plot" : "figures/allelic_imbalance_preDP.png", - "clust_plot" : "figures/allelic_imbalance_postDP.png", + "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:v499" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v608" resources = { "mem" : "5G" } diff --git a/wolF/workflow.py b/wolF/workflow.py index 6ae5505..1d12f16 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -376,8 +376,6 @@ def concat_arm_level_results(arm_results): hapaseg_allelic_DP_task = hapaseg.Hapaseg_allelic_DP( inputs = { "seg_dataframe" : arm_concat, - "n_dp_iter" : 10, # TODO: allow to be specified? - "seg_samp_idx" : n_samps_range, "cytoband_file" : "/mnt/j/db/hg38/ref/cytoBand_primary.txt", # TODO: allow to be specified "ref_fasta" : localization_task["ref_fasta"], "ref_fasta_idx" : localization_task["ref_fasta_idx"], # not used; just supplied for symlink From 2c8bc4092967dd22d887f14d07c3889773d931b1 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:51:26 -0400 Subject: [PATCH 111/222] Install distinctipy --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 9234955..85baf96 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,7 @@ WORKDIR /build RUN pip install sortedcontainers RUN git clone https://github.com/getzlab/CApy.git && pip install ./CApy RUN pip install dask distributed +RUN pip install distinctipy # install hapaseg COPY setup.py . From 5fcee32136c6cdf737450512c02480745b00a673 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 18 Apr 2022 23:52:13 -0400 Subject: [PATCH 112/222] Bump docker --- wolF/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index acb75b2..5dfc61a 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -105,5 +105,5 @@ class Hapaseg_allelic_DP(wolf.Task): "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v608" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v611" resources = { "mem" : "5G" } From 3d96d2720deb15e28cbade84b0cf94013bbe2dc4 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 14:54:05 -0400 Subject: [PATCH 113/222] Initial commit of het selection notebook --- 40_het_selection.py | 79 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 40_het_selection.py diff --git a/40_het_selection.py b/40_het_selection.py new file mode 100644 index 0000000..06991e0 --- /dev/null +++ b/40_het_selection.py @@ -0,0 +1,79 @@ +import colorama +import copy +import itertools +import matplotlib.pyplot as plt +import matplotlib as mpl +import ncls +import numpy as np +import numpy_groupies as npg +import pandas as pd +import scipy.stats as s +import scipy.sparse as sp +import scipy.special as ss +import sortedcontainers as sc + +plt.figure(1); plt.clf() +plt.figure(2); plt.clf() +plt.figure(30); plt.clf() +cut20_dens = {} +cut20_lod = {} +cut80_dens = {} +cut80_lod = {} +for depth in [15, 20, 30, 60, 80, 200]: + # simulate good hets + cov = s.poisson.rvs(depth, size = 10000) + A = s.binom.rvs(cov, 0.5) + B = cov - A + + # simulate bad hets + bad_cov = s.poisson.rvs(depth, size = 10000) + bad_frac = np.ones_like(bad_cov).astype(float) + for i in range(len(bad_frac)): + bad_frac[i] = np.random.choice([0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]) + A_bad = s.binom.rvs(bad_cov, bad_frac) + B_bad = bad_cov - A_bad + + # old criterion: beta density between 0.6 and 0.4 + betafrac = np.diff(s.beta.cdf([0.4, 0.6], A[:, None] + 1, B[:, None] + 1)) + betafrac_bad = np.diff(s.beta.cdf([0.4, 0.6], A_bad[:, None] + 1, B_bad[:, None] + 1)) + + # new criterion: log-odds ratio + betalod = s.beta.logsf(0.5, A + 1, B + 1) - s.beta.logcdf(0.5, A + 1, B + 1) + betalod_bad = s.beta.logsf(0.5, A_bad + 1, B_bad + 1) - s.beta.logcdf(0.5, A_bad + 1, B_bad + 1) + + # ROC curves + dens_cdf = np.zeros([1000, 2]) + for i, cut in enumerate(np.linspace(0, 1, 1000)): + dens_cdf[i, 0] = (betafrac >= cut).mean() + dens_cdf[i, 1] = (betafrac_bad >= cut).mean() + + lod_cdf = np.zeros([1000, 2]) + for i, cut in enumerate(np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)): + lod_cdf[i, 0] = (np.abs(betalod) <= cut).mean() + lod_cdf[i, 1] = (np.abs(betalod_bad) <= cut).mean() + + plt.figure(30) + st = plt.step(dens_cdf[:, 1], dens_cdf[:, 0]) + color = st[0].get_color() + plt.step(lod_cdf[:, 1], lod_cdf[:, 0], color = color, linestyle = ":") + + cut20_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 1] <= 0.2)[0]] + cut80_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 0] <= 0.8)[0]] + cut20_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[np.flatnonzero(lod_cdf[:, 1] >= 0.2)[0]] + cut80_lod_idx = np.flatnonzero(lod_cdf[:, 0] >= 0.8)[0] + cut80_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[cut80_lod_idx] + + plt.scatter(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], marker = 'x', color = color) + plt.text(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], "{0:.2f}".format(cut80_lod[depth]), color = color) + + plt.figure(1) + sc = plt.scatter(cov, betafrac, alpha = 0.1, s = 10) + plt.scatter(depth, np.diff(s.beta.cdf([0.4, 0.6], depth/2 + 1, depth/2 + 1)), color = color, marker = "x") + + cov_range = np.r_[cov.min():cov.max()] + cov_cum = np.nan*np.ones_like(cov_range) + for i, c in enumerate(cov_range): + cov_cum[i] = betafrac[cov >= c].mean() + + plt.figure(2) + plt.scatter(cov_range, cov_cum) From 1da6f43ed6a3def682a467668a5f2083a6ed02a0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 15:46:22 -0400 Subject: [PATCH 114/222] Speed up AMCMC early convergence contingency --- hapaseg/allelic_MCMC.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 95a349a..7928541 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -139,7 +139,7 @@ def run(self): return self # save MLE breakpoint if we've burned in - if self.burned_in or self.iter >= self.n_iter - 100: # contingency in case we've converged on an optimum early and the chain hasn't moved at all + if self.burned_in: if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]: self.breakpoints_MLE = self.breakpoints.copy() @@ -159,11 +159,23 @@ def run(self): color = color )) - # check if we've burned in + # check if we've burned in -- chain is oscillating around some + # optimium (and thus mean differences between marginal likelihoods might + # be slightly negative) # TODO: use a faster method of computing rolling average if not self.burned_in and self.iter > 1000: if np.diff(self.marg_lik[(self.iter - 1000):self.iter]).mean() < 0: self.burned_in = True + # contingency if we've unambiguously converged on an optimum and chain has not moved at all + # exit early to save time + if (np.diff(self.marg_lik[(self.iter - 1000):self.iter]) == 0).all(): + self.breakpoints_MLE = self.breakpoints.copy() + print(colorama.Fore.GREEN + "Chain has unambiguously converged on an optimum; stopping early in {n} iterations. n_bp = {n_bp}, lik = {lik}".format( + n = self.iter, + n_bp = len(self.breakpoints), + lik = self.marg_lik[self.iter] + ) + colorama.Fore.RESET) + return self self.iter += 1 From 8b0e569adb7013adbc46efd061b945c193ae9eef Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 15:47:37 -0400 Subject: [PATCH 115/222] Use PoD genotyper for het pulldown --- wolF/workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 1d12f16..4af593f 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -168,7 +168,7 @@ def interval_gather(interval_files): ref_fasta = localization_task["ref_fasta"], ref_fasta_idx = localization_task["ref_fasta_idx"], ref_fasta_dict = localization_task["ref_fasta_dict"], - dens_cutoff = 0.58 # TODO: set dynamically + use_pod_genotyper = True ) # otherwise, run M1 and get it from the BAM @@ -200,7 +200,7 @@ def interval_gather(interval_files): ref_fasta = localization_task["ref_fasta"], ref_fasta_idx = localization_task["ref_fasta_idx"], ref_fasta_dict = localization_task["ref_fasta_dict"], - dens_cutoff = 0.58 # TODO: set dynamically + use_pod_genotyper = True ) # gather het pulldown From aabd727d6cd11e8650c38840338c94c68bbc7f02 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 16:07:59 -0400 Subject: [PATCH 116/222] Visualize AMCMC --- hapaseg/__main__.py | 8 ++++++++ hapaseg/allelic_MCMC.py | 2 ++ 2 files changed, 10 insertions(+) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 7821f5d..1526d63 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -9,6 +9,7 @@ import scipy.stats as s import scipy.special as ss import sortedcontainers as sc +import traceback from capy import mut @@ -219,6 +220,13 @@ def main(): with open(output_dir + "/amcmc_results.pickle", "wb") as f: pickle.dump(H.run(), f) + try: + H.visualize() + plt.savefig(output_dir + "/figures/MLE_segmentation.png", dpi = 300) + except Exception: + print("Error plotting segments; see stack trace for details:") + print(traceback.format_exc()) + elif args.command == "concat": # # load scatter intervals diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index 7928541..32e4313 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -525,4 +525,6 @@ def visualize(self, show_CIs = False): ax.set_xlabel("SNP index") ax.set_ylim([0, 1]) + ax.set_title(f"{self.P.iloc[0]['chr']}:{self.P.iloc[0]['pos']}-{self.P.iloc[-1]['pos']}") + plt.tight_layout() From c094f6af200f25169758e4b85d7df0a759262436 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 16:15:32 -0400 Subject: [PATCH 117/222] Increase eagle threads --- wolF/workflow.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 4af593f..0218bf9 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -287,8 +287,10 @@ def order_indices(bcf_path, bcf_idx_path, localization_task): vcf_idx_in = F["bcf_idx_path"], vcf_ref = F["ref_bcf"], vcf_ref_idx = F["ref_bcf_idx"], - output_file_prefix = "foo" - ) + output_file_prefix = "foo", + num_threads = 4, + ), + resources = { "cpus-per-task" : 4 } ) # TODO: run whatshap From c703ffb8fe158163b15daeb0c438f878c8694c2b Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 16:50:49 -0400 Subject: [PATCH 118/222] Save segmentation plot from AMCMC --- wolF/tasks.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 5dfc61a..3aa55f5 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -50,7 +50,7 @@ class Hapaseg_burnin(wolf.Task): output_patterns = { "burnin_MCMC" : "amcmc_results.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" class Hapaseg_concat(wolf.Task): inputs = { @@ -65,7 +65,7 @@ class Hapaseg_concat(wolf.Task): "arms" : "AMCMC-arm*.pickle", "ref_bias" : ("ref_bias.txt", wolf.read_file) } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" class Hapaseg_amcmc(wolf.Task): inputs = { @@ -79,9 +79,10 @@ class Hapaseg_amcmc(wolf.Task): --n_iter ${n_iter} """ output_patterns = { - "arm_level_MCMC" : "amcmc_results.pickle" + "arm_level_MCMC" : "amcmc_results.pickle", + "segmentation_plot" : "figures/MLE_segmentation.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v581" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" class Hapaseg_allelic_DP(wolf.Task): inputs = { @@ -105,5 +106,5 @@ class Hapaseg_allelic_DP(wolf.Task): "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v611" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" resources = { "mem" : "5G" } From f6e0c7b3302d9828072ca7c158e25cc666fba79d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 20:59:03 -0400 Subject: [PATCH 119/222] Update ADP task definition --- wolF/tasks.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 3aa55f5..7a0bbd4 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -87,15 +87,11 @@ class Hapaseg_amcmc(wolf.Task): class Hapaseg_allelic_DP(wolf.Task): inputs = { "seg_dataframe" : None, - "n_dp_iter" : 10, - "seg_samp_idx" : 0, "ref_fasta" : None, "cytoband_file" : None } script = """ hapaseg dp --seg_dataframe ${seg_dataframe} \ - --n_dp_iter ${n_dp_iter} \ - --seg_samp_idx ${seg_samp_idx} \ --ref_fasta ${ref_fasta} \ --cytoband_file ${cytoband_file} """ From b619cd0e42295cdbeb905d8cc2b884a868463a80 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 21:51:56 -0400 Subject: [PATCH 120/222] Increase SNP opacity --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 5d6d24c..a7624a5 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1028,7 +1028,7 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): if show_snps: # set SNP alpha based on number of SNPs logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M))) - default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.S)) + default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S)) ph_prob = np.r_[self.phase_orientations].mean(0) From a66e2e7db65499ff791a86f07674228bbf5743b3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 22:33:49 -0400 Subject: [PATCH 121/222] Bump mandatory pandas version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 670d4c6..b4501b5 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ #long_description = long_description, #long_description_content_type = 'text/markdown', install_requires = [ - 'pandas>=0.24.1', + 'pandas>=1.4.1', 'numpy>=1.18.0', 'more-itertools>=8.10.0', 'numpy_groupies>=0.9.14', From e999013cbd893bdc831c140c59ac2c7fef37e5cb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 19 Apr 2022 22:44:31 -0400 Subject: [PATCH 122/222] Bump ADP docker --- wolF/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 7a0bbd4..8039cb5 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -102,5 +102,5 @@ class Hapaseg_allelic_DP(wolf.Task): "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v621" resources = { "mem" : "5G" } From a142ce97f92fe3f09cc9cba2d4df83bc34f44c5e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 20 Apr 2022 08:23:20 -0400 Subject: [PATCH 123/222] Restrict ADP plot to chromosome, if specified --- hapaseg/allelic_DP.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index a7624a5..de8ddf3 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1012,10 +1012,13 @@ def get_colors(self): return np.r_[base_colors, extra_colors if extra_colors.size > 0 else np.empty([0, 3])][si] - def visualize_segs(self, f = None, use_clust = False, show_snps = False): + def visualize_segs(self, f = None, use_clust = False, show_snps = False, chrom = None): f = plt.figure(figsize = [16, 4]) if f is None else f ax = plt.gca() - ax.set_xlim([0, self.S["pos_gp"].max()]) + if chrom is None: + ax.set_xlim([0, self.S["pos_gp"].max()]) + else: + ax.set_xlim([*self.S.loc[self.S["chr"] == chrom, "pos_gp"].iloc[[0, -1]]]) ax.set_ylim([0, 1]) colors = self.get_colors() @@ -1028,7 +1031,7 @@ def visualize_segs(self, f = None, use_clust = False, show_snps = False): if show_snps: # set SNP alpha based on number of SNPs logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M))) - default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S)) + default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S) if chrom is None else (self.S["chr"] == chrom).sum()) ph_prob = np.r_[self.phase_orientations].mean(0) From ef38d99947f3474129ea83b0fbf1dbda45bdc995 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 20 Apr 2022 09:04:31 -0400 Subject: [PATCH 124/222] Bump ADP docker --- wolF/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 8039cb5..b5bba8a 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -102,5 +102,5 @@ class Hapaseg_allelic_DP(wolf.Task): "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v621" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v623" resources = { "mem" : "5G" } From 4aca76ab350a07c40bed4636ea20bf17b1f4f715 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 20 Apr 2022 12:16:15 -0400 Subject: [PATCH 125/222] Add legend --- 40_het_selection.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/40_het_selection.py b/40_het_selection.py index 06991e0..852de78 100644 --- a/40_het_selection.py +++ b/40_het_selection.py @@ -19,6 +19,7 @@ cut20_lod = {} cut80_dens = {} cut80_lod = {} +leg = [] for depth in [15, 20, 30, 60, 80, 200]: # simulate good hets cov = s.poisson.rvs(depth, size = 10000) @@ -57,6 +58,8 @@ color = st[0].get_color() plt.step(lod_cdf[:, 1], lod_cdf[:, 0], color = color, linestyle = ":") + leg.append(st) + cut20_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 1] <= 0.2)[0]] cut80_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 0] <= 0.8)[0]] cut20_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[np.flatnonzero(lod_cdf[:, 1] >= 0.2)[0]] @@ -77,3 +80,6 @@ plt.figure(2) plt.scatter(cov_range, cov_cum) + +plt.figure(3) +plt.legend([x[0] for x in leg], ["15x", "20x", "30x", "60x", "80x", "200x"]) From 1c818a1f0710c0a94d3f9e739f3f813eacc02ff2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 25 Apr 2022 00:24:24 -0400 Subject: [PATCH 126/222] Add TODO for parsing cytoband file --- hapaseg/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hapaseg/utils.py b/hapaseg/utils.py index e9dc502..c216f4c 100644 --- a/hapaseg/utils.py +++ b/hapaseg/utils.py @@ -5,7 +5,8 @@ _chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25))) def parse_cytoband(cytoband): - cband = pd.read_csv(cytoband, sep = "\t") + # TODO: do some cytoband files have a header? check if so! + cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"]) cband["chr"] = cband["chr"].apply(lambda x : _chrmap[x]) chrs = cband["chr"].unique() From adfc8500b03331a3ef45498b0c9a6ba0d8ca4bcf Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 10:24:51 -0400 Subject: [PATCH 127/222] Remove temperature --- hapaseg/allelic_DP.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index de8ddf3..6c6cb86 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -82,12 +82,11 @@ def run(self): return self.snps_to_clusters, self.snps_to_phases, self.likelihoods class DPinstance: - def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, temperature = 1, dp_count_scale_factor = 1): + def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, dp_count_scale_factor = 1): self.S = S self.clust_prior = clust_prior.copy() self.clust_count_prior = clust_count_prior.copy() self.alpha = alpha - self.temperature = temperature self.dp_count_scale_factor = dp_count_scale_factor self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed @@ -836,8 +835,6 @@ def run(self, n_iter = 0, n_samps = 0): + log_count_prior # p(clust) (DP prior on clust counts) + log_phase_prob) # p(phase) - num /= self.temperature # scale by temperature for replica-exchange - num -= num.max() # avoid underflow in sum-exp # p(clust,phase|X) From a75de21aa5ec0668c1e9cd228cd5c131b52bc918 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 10:45:08 -0400 Subject: [PATCH 128/222] persistant->persistent --- wolF/workflow.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 446b3c5..f9d02b1 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -112,10 +112,10 @@ def workflow( num_cov_seg_samples=5, - persistant_dry_run = False + persistent_dry_run = False ): - # alert for persistant dry run - if persistant_dry_run: + # alert for persistent dry run + if persistent_dry_run: #TODO push this message to canine print("WARNING: Skipping file localization in dry run!") @@ -160,7 +160,7 @@ def workflow( "t_bai" : tumor_bai, }, token=localization_token, - persistent_disk_dry_run = persistant_dry_run + persistent_disk_dry_run = persistent_dry_run ) collect_tumor_coverage = True elif tumor_coverage_bed is not None: @@ -176,7 +176,7 @@ def workflow( "n_bai" : normal_bai }, token=localization_token, - persistent_disk_dry_run = persistant_dry_run + persistent_disk_dry_run = persistent_dry_run ) collect_normal_coverage = True elif normal_coverage_bed is not None: @@ -604,14 +604,14 @@ def _get_ADP_draw_num(preprocess_data_obj): ) #cleanup by deleting bam disks. we make seperate tasks for the bams - if not persistant_dry_run and t_bam is not None and t_bai is not None: + if not persistent_dry_run and t_bam is not None and t_bai is not None: delete_tbams_task = DeleteDisk( inputs = { "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]], "upstream" : m1_task["mutect1_cs"] if callstats_file is None else tumor_cov_gather_task["coverage"] ) - if not persistant_dry_run and n_bam is not None and n_bai is not None: + if not persistent_dry_run and n_bam is not None and n_bai is not None: delete_nbams_task = DeleteDisk( inputs = { "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]], From ff23bc95b76156c1a87dd7bb25abffbcd903fbfd Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 10:45:16 -0400 Subject: [PATCH 129/222] Add forgetten brackets] --- wolF/workflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/wolF/workflow.py b/wolF/workflow.py index f9d02b1..3842801 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -609,6 +609,7 @@ def _get_ADP_draw_num(preprocess_data_obj): inputs = { "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]], "upstream" : m1_task["mutect1_cs"] if callstats_file is None else tumor_cov_gather_task["coverage"] + } ) if not persistent_dry_run and n_bam is not None and n_bai is not None: @@ -616,6 +617,7 @@ def _get_ADP_draw_num(preprocess_data_obj): inputs = { "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]], "upstream" : m1_task["mutect1_cs"] + } ) #also delete the cached files disk delete_file_disk_task = DeleteDisk( From 35b269037096e90e4ee53e5013c2faec32743378 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 10:51:33 -0400 Subject: [PATCH 130/222] t/n_bam -> tumor/normal_bam --- wolF/workflow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 3842801..bb64a87 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -604,7 +604,7 @@ def _get_ADP_draw_num(preprocess_data_obj): ) #cleanup by deleting bam disks. we make seperate tasks for the bams - if not persistent_dry_run and t_bam is not None and t_bai is not None: + if not persistent_dry_run and tumor_bam is not None and tumor_bai is not None: delete_tbams_task = DeleteDisk( inputs = { "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]], @@ -612,7 +612,7 @@ def _get_ADP_draw_num(preprocess_data_obj): } ) - if not persistent_dry_run and n_bam is not None and n_bai is not None: + if not persistent_dry_run and normal_bam is not None and normal_bai is not None: delete_nbams_task = DeleteDisk( inputs = { "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]], From 301d1df325e75e3a5ecb1be80451f64356ef94c5 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 11:06:57 -0400 Subject: [PATCH 131/222] Run on Richter's --- 21_genome.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/21_genome.py b/21_genome.py index 2f6d76b..c364ac2 100644 --- a/21_genome.py +++ b/21_genome.py @@ -58,3 +58,26 @@ normal_bai = "gs://fc-secure-e2772064-386d-4911-b242-d6ade82bf172/360c5959-3827-4b24-92e3-d57dbc5de2f6/gdc_api_file_download/15788922-9cf8-4c83-8040-47fa60b7d374/call-download_file/98e061cd-0586-4e56-85fb-c6cc6688dbff_wgs_gdc_realn.bai", target_list = 200 ) + +# Richter's test (hg19) +import wolf +from wolF import workflow + +import dalmatian +wm = dalmatian.WorkspaceManager("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK") + +wic = wolf.fc.WorkspaceInputConnector("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK") +Pj = wic.get_pairs_as_joint_samples() + +with wolf.Workflow(workflow = workflow.workflow, namespace = "HapASeg_Richters") as w: + for pair, p in Pj.loc[Pj["sample_type_T"] == "Richter"].iterrows(): + w.run( + RUN_NAME = pair, + tumor_bam = p["output_bam_T"], + tumor_bai = p["output_bam_index_T"], + normal_bam = p["output_bam_N"], + normal_bai = p["output_bam_index_N"], + target_list = 2000, + ref_genome_build = "hg19" + ) + break From 35c46ac84a854cd4fe69de5aff4d8cb2e6a7e6ec Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 11:14:44 -0400 Subject: [PATCH 132/222] Allow cytoband file to be specified in ADP --- wolF/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index bb64a87..5abad32 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -473,7 +473,7 @@ def concat_arm_level_results(arm_results): inputs = { "seg_dataframe" : arm_concat, #"seg_dataframe" : hapaseg_arm_concat_task["arm_cat_results_pickle"], - "cytoband_file" : "/mnt/j/db/hg38/ref/cytoBand_primary.txt", # TODO: allow to be specified + "cytoband_file" : localization_task["cytoband_file"], "ref_fasta" : localization_task["ref_fasta"], "ref_fasta_idx" : localization_task["ref_fasta_idx"], # not used; just supplied for symlink "ref_fasta_dict" : localization_task["ref_fasta_dict"] # not used; just supplied for symlink From c8a1652f6852ca6b8b1b99b98d4e722118e783e8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 11:15:30 -0400 Subject: [PATCH 133/222] Minor formatting --- wolF/workflow.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 5abad32..4751a2d 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -479,7 +479,7 @@ def concat_arm_level_results(arm_results): "ref_fasta_dict" : localization_task["ref_fasta_dict"] # not used; just supplied for symlink } ) - + ##collect DP results collect_adp_task = hapaseg.Hapaseg_collect_adp( inputs={"dp_results":[hapaseg_allelic_DP_task["cluster_and_phase_assignments"]] @@ -528,9 +528,9 @@ def _get_ADP_cluster_list(preprocess_data_obj): cluster_idxs = [i for i in np.arange(num_clusters)] print(cluster_idxs, cluster_list, range_list) return len(cluster_idxs), cluster_idxs, cluster_list, range_list - + num_clusters, cluster_idxs, cluster_list, range_list = _get_ADP_cluster_list(prep_cov_mcmc_task["preprocess_data"]) - + # coverage MCMC burnin cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin( inputs={ @@ -541,7 +541,7 @@ def _get_ADP_cluster_list(preprocess_data_obj): "range":range_list } ) - + # coverage MCMC scatter post-burnin cov_mcmc_scatter_task = hapaseg.Hapaseg_coverage_mcmc( inputs={ @@ -552,7 +552,7 @@ def _get_ADP_cluster_list(preprocess_data_obj): "burnin_files":[cov_mcmc_burnin_task["burnin_data"]] * num_clusters # this is to account for a wolf input len bug } ) - + # collect coverage MCMC cov_mcmc_gather_task = hapaseg.Hapaseg_collect_coverage_mcmc( inputs = { @@ -561,6 +561,7 @@ def _get_ADP_cluster_list(preprocess_data_obj): "bin_width":bin_width } ) + # coverage DP cov_dp_task = hapaseg.Hapaseg_coverage_dp( inputs = { @@ -572,16 +573,15 @@ def _get_ADP_cluster_list(preprocess_data_obj): "bin_width":bin_width } ) - + #get the adp draw number from the preprocess data object @prefect.task def _get_ADP_draw_num(preprocess_data_obj): return int(np.load(preprocess_data_obj)["adp_cluster"]) adp_draw_num = _get_ADP_draw_num(prep_cov_mcmc_task["preprocess_data"]) - - # generate acdp dataframe + # generate acdp dataframe gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df( inputs = { "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same @@ -592,7 +592,7 @@ def _get_ADP_draw_num(preprocess_data_obj): "bin_width":bin_width } ) - + # run acdp acdp_task = hapaseg.Hapaseg_run_acdp( inputs = { From aa5676db31c69c4b7d804ec7d57b25bae4fc0352 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 13:20:36 -0400 Subject: [PATCH 134/222] Don't need separate script to collect ADP shards --- hapaseg/__main__.py | 21 --------------------- wolF/tasks.py | 15 --------------- wolF/workflow.py | 16 ++++++---------- 3 files changed, 6 insertions(+), 46 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 29ba9bb..207adac 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -476,29 +476,8 @@ def main(): plt.title("Allelic segmentation (posterior)") plt.savefig(output_dir + "/figures/segs_only.png", dpi = 300) plt.close() - - #collect adp run data - elif args.command == "collect_adp": - with open(args.dp_results, 'r') as f: - dp_results = f.readlines() - accum_clusts = [] - accum_phases = [] - accum_liks = [] - - for dp_shard in dp_results: - obj = np.load(dp_shard.rstrip('\n')) - accum_clusts.append(obj['snps_to_clusters']) - accum_phases.append(obj['snps_to_phases']) - accum_liks.append(obj['likelihoods']) - all_clusts = np.vstack(accum_clusts) - all_phases = np.vstack(accum_phases) - all_liks = np.vstack(accum_liks) - # save - np.savez(os.path.join(output_dir, "full_dp_results"), snps_to_clusters=all_clusts, snps_to_phases=all_phases, likelihoods=all_liks) - ## running coverage mcmc on all clusters - elif args.command == "coverage_mcmc": cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv, args.allelic_clusters_object, diff --git a/wolF/tasks.py b/wolF/tasks.py index 327481a..4f18da8 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -125,21 +125,6 @@ class Hapaseg_allelic_DP(wolf.Task): docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v623" resources = { "mem" : "5G" } -class Hapaseg_collect_adp(wolf.Task): - inputs = { - "dp_results":None - } - - script = """ - hapaseg collect_adp --dp_results ${dp_results} - """ - output_patterns = { - "full_dp_results":"full_dp_results.npz" - } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" - resources = { "mem" : "5G" } - - class Hapaseg_prepare_coverage_mcmc(wolf.Task): inputs = { "coverage_csv": None, diff --git a/wolF/workflow.py b/wolF/workflow.py index 4751a2d..08b7978 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -480,20 +480,16 @@ def concat_arm_level_results(arm_results): } ) - ##collect DP results - collect_adp_task = hapaseg.Hapaseg_collect_adp( - inputs={"dp_results":[hapaseg_allelic_DP_task["cluster_and_phase_assignments"]] - } - ) - - ### coverage tasks #### + # + # coverage tasks + # # prepare coverage MCMC prep_cov_mcmc_task = hapaseg.Hapaseg_prepare_coverage_mcmc( inputs={ "coverage_csv":tumor_cov_gather_task["coverage"], #each scatter result is the same - "allelic_clusters_object":collect_adp_task["full_dp_results"], - "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same + "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"], + "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'], "repl_pickle":ref_config["repl_file"], "gc_pickle":ref_config["gc_file"], "ref_file_path":localization_task["ref_fasta"] @@ -585,7 +581,7 @@ def _get_ADP_draw_num(preprocess_data_obj): gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df( inputs = { "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same - "allelic_clusters_object":collect_adp_task["full_dp_results"], + "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"], "cdp_filepaths":[cov_dp_task["cov_dp_object"]], "allelic_draw_index":adp_draw_num, "ref_file_path":localization_task["ref_fasta"], From d0affc1d547f52a764d8736c22c8471f68a7f55f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 13:21:22 -0400 Subject: [PATCH 135/222] Use likelihoods computed in ADP --- hapaseg/run_coverage_MCMC.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 5370708..d8fff39 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -37,7 +37,7 @@ def __init__(self, if allelic_sample is not None: self.allelic_sample = allelic_sample else: - self.allelic_sample = self.select_ADP_cluster() + self.allelic_sample = np.argmax(self.allelic_clusters["likelihoods"]) self.model = None @@ -68,25 +68,6 @@ def dp_prior(cluster_counts_arr, alpha): N = cluster_counts_arr.sum() m = len(cluster_counts_arr) return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha) - - # method for selecting ADP clustering based on likelihoods - def select_ADP_cluster(self): - ADP_draws = self.allelic_clusters["snps_to_clusters"] - tmp_snps = self.SNPs.copy() - lls = [] - for ADP_draw in ADP_draws: - tmp_snps['cluster_assignment'] = ADP_draw - count_arr = tmp_snps.groupby(by='cluster_assignment').agg({"maj":sum, "min":sum}).values - count_arr += 1 - beta_ll = ss.betaln(count_arr[:, 0], count_arr[:, 1]).sum() - cluster_counts = tmp_snps['cluster_assignment'].value_counts().values - dp_ll = self.dp_prior(cluster_counts, 0.5) - lls.append(beta_ll + dp_ll) - lls = np.array(lls) - lls_max = np.max(lls) - choice_p = np.exp(lls - lls_max) / np.exp(lls - lls_max).sum() - return np.random.choice(len(ADP_draws), p=choice_p) - @staticmethod def load_coverage(coverage_csv): From 54ec4b1b4c0010684c7b3c3d350fcb93c8111f36 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 14:02:00 -0400 Subject: [PATCH 136/222] Explicitly pass in ref_fasta to capy.seq --- hapaseg/__main__.py | 1 + hapaseg/run_coverage_MCMC.py | 12 +++++++----- wolF/tasks.py | 4 ++-- wolF/workflow.py | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 207adac..2474c9e 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -501,6 +501,7 @@ def main(): cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv, args.allelic_clusters_object, args.SNPs_pickle, + args.ref_fasta, f_repl=args.repl_pickle, f_GC=args.gc_pickle, allelic_sample=args.allelic_sample) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index d8fff39..18ffefc 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -17,6 +17,7 @@ def __init__(self, f_allelic_clusters, f_SNPs, f_repl, + ref_fasta, f_GC=None, num_draws=50, cluster_num=None, @@ -27,6 +28,7 @@ def __init__(self, self.cluster_num = cluster_num self.f_repl = f_repl self.f_GC = f_GC + self.ref_fasta = ref_fasta self.allelic_clusters = np.load(f_allelic_clusters) # coverage input is expected to be a df file with columns: ["chr", "start", "end", "covcorr", "covraw"] @@ -69,27 +71,27 @@ def dp_prior(cluster_counts_arr, alpha): m = len(cluster_counts_arr) return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha) - @staticmethod - def load_coverage(coverage_csv): + def load_coverage(self, coverage_csv): Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False) Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions Cov["chr"] = mut.convert_chr(Cov["chr"]) Cov = Cov.loc[Cov["chr"] != 0] Cov=Cov.reset_index(drop=True) - Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"]) - Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"]) + Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"], ref = self.ref_fasta) + Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"], ref = self.ref_fasta) return Cov def load_SNPs(self, f_snps): SNPs = pd.read_pickle(f_snps) - SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"]) + SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"], ref = self.ref_fasta) SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False) return SNPs def generate_GC(self): #grab fasta object from seq to avoid rebuilding + seq.set_reference(self.ref_fasta) F = seq._fa.ref_fa_obj self.full_cov_df['C_GC'] = np.nan diff --git a/wolF/tasks.py b/wolF/tasks.py index 4f18da8..d2f0f5a 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -133,11 +133,11 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task): "repl_pickle": None, "gc_pickle":"", "allelic_sample":"", - "ref_file_path": None + "ref_fasta": None } script = """ - export CAPY_REF_FA=${ref_file_path} hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \ + --ref_fasta ${ref_fasta} \ --allelic_clusters_object ${allelic_clusters_object} \ --SNPs_pickle ${SNPs_pickle} \ --repl_pickle ${repl_pickle}""" diff --git a/wolF/workflow.py b/wolF/workflow.py index 08b7978..b5e1e14 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -492,7 +492,7 @@ def concat_arm_level_results(arm_results): "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'], "repl_pickle":ref_config["repl_file"], "gc_pickle":ref_config["gc_file"], - "ref_file_path":localization_task["ref_fasta"] + "ref_fasta":localization_task["ref_fasta"] } ) From 1b84ae08b31d12ede9544766e432581a1a4da711 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 15:03:02 -0400 Subject: [PATCH 137/222] Don't need this anymore --- hapaseg/run_coverage_MCMC.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 18ffefc..f683000 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -84,8 +84,6 @@ def load_coverage(self, coverage_csv): def load_SNPs(self, f_snps): SNPs = pd.read_pickle(f_snps) - SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"], ref = self.ref_fasta) - SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False) return SNPs From 42e79f86e876cd5a36b1979b4931534f8f5b9a23 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 15:07:13 -0400 Subject: [PATCH 138/222] Add progress bar --- hapaseg/run_coverage_MCMC.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index f683000..95f53b7 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -4,6 +4,7 @@ import re import os import scipy.special as ss +import tqdm from capy import mut, seq import scipy.stats as stats from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB @@ -94,7 +95,7 @@ def generate_GC(self): self.full_cov_df['C_GC'] = np.nan #this indexing assumes 0-indexed start and end cols - for (i, chrm, start, end) in self.full_cov_df[['chr', 'start','end']].itertuples(): + for (i, chrm, start, end) in tqdm.tqdm(self.full_cov_df[['chr', 'start','end']].itertuples(), total = len(self.full_cov_df)): self.full_cov_df.iat[i, -1] = F[chrm-1][start:end+1].gc From 904053f4ad1e480dabeaad78f06377c1e1386cbb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 15:26:26 -0400 Subject: [PATCH 139/222] Remove unused function --- hapaseg/run_coverage_MCMC.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 95f53b7..ddd5c7d 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -65,13 +65,6 @@ def prepare_single_cluster(self): # save these results to a numpy object return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample - # method for calculating DP prior likelihood of an ADP cluster - @staticmethod - def dp_prior(cluster_counts_arr, alpha): - N = cluster_counts_arr.sum() - m = len(cluster_counts_arr) - return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha) - def load_coverage(self, coverage_csv): Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False) Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions From fd55e134f1b015ea14ad26e2d912116647b80bb4 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 26 Apr 2022 15:40:53 -0400 Subject: [PATCH 140/222] Hack to fix contig names for hg19 --- wolF/workflow.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index b5e1e14..21ea515 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -192,8 +192,13 @@ def workflow( # collect or load coverage # tumor if collect_tumor_coverage: - primary_contigs = ['chr{}'.format(i) for i in range(1,23)] - primary_contigs.extend(['chrX','chrY','chrM']) + # FIXME: hack to account for "chr" in hg38 but not in hg19 + if ref_genome_build == "hg38": + primary_contigs = ['chr{}'.format(i) for i in range(1,23)] + primary_contigs.extend(['chrX','chrY','chrM']) + else: + primary_contigs = [str(x) for x in range(1, 23)] + ["X", "Y", "M"] + # create scatter intervals split_intervals_task = split_intervals.split_intervals( bam = tumor_bam_localization_task["t_bam"], @@ -204,18 +209,19 @@ def workflow( # shim task to transform split_intervals files into subset parameters for covcollect task @prefect.task - def interval_gather(interval_files): + def interval_gather(interval_files, primary_contigs): ints = [] for f in interval_files: ints.append(pd.read_csv(f, sep = "\t", header = None, names = ["chr", "start", "end"])) #filter non-primary contigs - primary_contigs = ['chr{}'.format(i) for i in range(1,23)] - primary_contigs.extend(['chrX','chrY','chrM']) - full_bed = pd.concat(ints).sort_values(["chr", "start", "end"]) + full_bed = pd.concat(ints).sort_values(["chr", "start", "end"]).astype({ "chr" : str }) filtered_bed = full_bed.loc[full_bed.chr.isin(primary_contigs)] return filtered_bed - subset_intervals = interval_gather(split_intervals_task["interval_files"]) + subset_intervals = interval_gather( + split_intervals_task["interval_files"], + primary_contigs + ) # dispatch coverage scatter tumor_cov_collect_task = cov_collect.Covcollect( @@ -349,6 +355,13 @@ def order_indices(bcf_path, bcf_idx_path, localization_task): F = F.join(F2) + # prepend "chr" to F's index if it's missing + idx = ~F.index.str.contains("^chr") + if idx.any(): + new_index = F.index.values + new_index[idx] = "chr" + F.index[idx] + F = F.set_index(new_index) + # reference panel BCFs R = pd.DataFrame({ "path" : localization_task } ).reset_index() F = F.join(R.join(R.loc[R["index"].str.contains("^chr.*_bcf$"), "index"].str.extract(r"(?Pchr[^_]+)"), how = "right").set_index("chr").drop(columns = ["index"]).rename(columns = { "path" : "ref_bcf" }), how = "inner") From 694284908d1505be34af9b7ffa24ea2192d6f9df Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 13:00:38 -0400 Subject: [PATCH 141/222] Add cytoband header check --- hapaseg/utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/hapaseg/utils.py b/hapaseg/utils.py index c216f4c..46712d3 100644 --- a/hapaseg/utils.py +++ b/hapaseg/utils.py @@ -5,8 +5,13 @@ _chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25))) def parse_cytoband(cytoband): - # TODO: do some cytoband files have a header? check if so! - cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"]) + # some cytoband files have a header, some don't; we need to check + has_header = False + with open(cytoband, "r") as f: + if f.readline().startswith("chr\t"): + has_header = True + + cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"] if not has_header else None) cband["chr"] = cband["chr"].apply(lambda x : _chrmap[x]) chrs = cband["chr"].unique() From 104953d65a6ab54e77feafbd42dda3e134175870 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 13:10:55 -0400 Subject: [PATCH 142/222] Bump ADP Docker --- wolF/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index d2f0f5a..d2213fb 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -122,7 +122,7 @@ class Hapaseg_allelic_DP(wolf.Task): "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v623" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v789" resources = { "mem" : "5G" } class Hapaseg_prepare_coverage_mcmc(wolf.Task): From 0dd0dd618b1ab7a14c566710472e3221925ade74 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 14:38:41 -0400 Subject: [PATCH 143/222] Speed up assigning targets to ADP clusters --- hapaseg/run_coverage_MCMC.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index ddd5c7d..1e25b28 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -148,15 +148,19 @@ def assign_clusters(self): clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample] clust_u, clust_uj = np.unique(clust_choice, return_inverse=True) clust_uj = clust_uj.reshape(clust_choice.shape) + cuj_max = clust_uj.max() + 1 + self.SNPs["clust_choice"] = clust_uj # assign coverage intervals to clusters Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1]) # first compute assignment probabilities based on the SNPs within each bin - for targ, snp_idx in self.SNPs.groupby("tidx").indices.items(): - targ_clust_hist = np.bincount(clust_uj[snp_idx].ravel(), minlength=clust_uj.max()+1) - - Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() + for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["clust_choice"]): + if len(snp_idx) == 1: + Cov_clust_probs[int(targ), snp_idx] = 1.0 + else: + targ_clust_hist = np.bincount(snp_idx, minlength = cuj_max) + Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() # subset intervals containing SNPs overlap_idx = Cov_clust_probs.sum(1) > 0 From 4922536420eba1ab73e0370ddc1fe73cb8ad19be Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 14:43:49 -0400 Subject: [PATCH 144/222] Print log messages to stderr --- hapaseg/run_coverage_MCMC.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 1e25b28..b1f70dd 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -4,6 +4,7 @@ import re import os import scipy.special as ss +import sys import tqdm from capy import mut, seq import scipy.stats as stats @@ -117,13 +118,13 @@ def load_covariates(self): # load GC content if we have it precomputed, otherwise generate it if wgs and self.f_GC is not None and os.path.exists(self.f_GC): - print("Using precomputed GC content") + print("Using precomputed GC content", file = sys.stderr) B = pd.read_pickle(self.f_GC) self.full_cov_df = self.full_cov_df.merge(B.rename(columns={"gc": "C_GC"}), left_on=["chr", "start", "end"], right_on=["chr", "start", "end"], how="left") else: - print("Computing GC content") + print("Computing GC content", file = sys.stderr) self.generate_GC() self.full_cov_df["C_GC_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))( @@ -155,6 +156,7 @@ def assign_clusters(self): Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1]) # first compute assignment probabilities based on the SNPs within each bin + print("Mapping SNPs to targets ...", file = sys.stderr) for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["clust_choice"]): if len(snp_idx) == 1: Cov_clust_probs[int(targ), snp_idx] = 1.0 From e7673f0115eaf0dd7ebec9ed9aa1d853e31ba177 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 16:34:07 -0400 Subject: [PATCH 145/222] Don't use such small value for covars=0 --- hapaseg/run_coverage_MCMC.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index b1f70dd..dd48a4b 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -94,6 +94,8 @@ def generate_GC(self): def load_covariates(self): + ## Target size + #check if we are doing wgs, in which case we will have uniform 200 bp bins wgs = True if self.f_GC is not None or len(self.full_cov_df) > 100000 else False @@ -105,6 +107,10 @@ def load_covariates(self): if (np.diff(self.full_cov_df["C_log_len"]) == 0).all(): #remove the len col since it will ruin beta fitting self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1) + + ## Replication timing + zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) + # load repl timing F = pd.read_pickle(self.f_repl) # map targets to RT intervals @@ -113,8 +119,9 @@ def load_covariates(self): self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values # z-transform - self.full_cov_df["C_RT_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))( - np.log(self.full_cov_df["C_RT"] + 1e-20)) + self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"] + 0.01)) + + ## GC content # load GC content if we have it precomputed, otherwise generate it if wgs and self.f_GC is not None and os.path.exists(self.f_GC): @@ -127,8 +134,7 @@ def load_covariates(self): print("Computing GC content", file = sys.stderr) self.generate_GC() - self.full_cov_df["C_GC_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))( - np.log(self.full_cov_df["C_GC"] + 1e-20)) + self.full_cov_df["C_GC_z"] = zt(np.log(self.full_cov_df["C_GC"] + 0.01)) #set zero coverage bins to nan self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan) From 8d3917c1190e68fd2b2daac297149129b5e1967f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 16:34:29 -0400 Subject: [PATCH 146/222] Don't use fragment std as a covar --- hapaseg/run_coverage_MCMC.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index dd48a4b..4bcd793 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -136,16 +136,16 @@ def load_covariates(self): self.full_cov_df["C_GC_z"] = zt(np.log(self.full_cov_df["C_GC"] + 0.01)) - #set zero coverage bins to nan + ## Fragment length + + # some bins have zero mean fragment length(!?); NaN these out self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan) - - # add fragment based covars - self.full_cov_df["C_frag_len"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["mean_frag_len"] + 1e-20)) - self.full_cov_df["C_frag_std"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["std_frag_len"] + 1e-20)) + + self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) + self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) # drop non z-cetered cols - self.full_cov_df = self.full_cov_df.drop(['C_GC', 'C_RT'], axis=1) - + self.full_cov_df = self.full_cov_df.drop(columns = self.full_cov_df.columns[self.full_cov_df.columns.str.contains("C_.*[^z]$")], axis=1) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned From add24cb6ad70d6ba9c3ccf8e14a01c2ff708d349 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 17:04:21 -0400 Subject: [PATCH 147/222] Remove empty clusters; remove extreme outlier targets --- hapaseg/run_coverage_MCMC.py | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 4bcd793..d3905ca 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -151,7 +151,7 @@ def load_covariates(self): # clusters with snps from different clusters are probabliztically assigned # method returns coverage df with only bins that overlap snps def assign_clusters(self): - # generate unique clust assignments + ## generate unique clust assignments clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample] clust_u, clust_uj = np.unique(clust_choice, return_inverse=True) clust_uj = clust_uj.reshape(clust_choice.shape) @@ -177,14 +177,15 @@ def assign_clusters(self): # zero out improbable assignments and re-normalilze Cov_clust_probs_overlap[Cov_clust_probs_overlap < 0.05] = 0 Cov_clust_probs_overlap /= Cov_clust_probs_overlap.sum(1)[:, None] - # prune garbage clusters + # prune empty clusters prune_idx = Cov_clust_probs_overlap.sum(0) > 0 Cov_clust_probs_overlap = Cov_clust_probs_overlap[:, prune_idx] num_pruned_clusters = Cov_clust_probs_overlap.shape[1] - # subsetting to only targets that overlap SNPs + + ## subsetting to only targets that overlap SNPs Cov_overlap = self.full_cov_df.loc[overlap_idx, :] - # probabilistically assign each ambiguous coverage bin to a cluster + ## probabilistically assign each ambiguous coverage bin to a cluster # for now we will take maximum instead amb_mask = np.max(Cov_clust_probs_overlap, 1) != 1 amb_assgn_probs = Cov_clust_probs_overlap[amb_mask, :] @@ -197,7 +198,7 @@ def assign_clusters(self): # update with assigned values Cov_clust_probs_overlap[amb_mask, :] = new_onehot - #downsampling for wgs + ## downsampling for wgs if len(Cov_clust_probs_overlap) > 20000: downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2 Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask] @@ -210,14 +211,16 @@ def assign_clusters(self): Cov_overlap = Cov_overlap.loc[~bad_bins, :] Pi = filtered.copy() + Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1) r = np.c_[Cov_overlap["covcorr"]] covar_columns = sorted([c for c in Cov_overlap.columns if 'C_' in c]) - # making covariate matrix + + ## making covariate matrix C = np.c_[Cov_overlap[covar_columns]] - # dropping Nans + ## dropping Nans naidx = np.isnan(C).any(axis=1) # drop zero coverage bins as well (this is to account for a bug in coverage collector) TODO: remove need for this naidx = np.logical_or(naidx, (r==0).flatten()) @@ -227,14 +230,22 @@ def assign_clusters(self): Cov_overlap = Cov_overlap.iloc[~naidx] - #removing outliers + ## removing coverage outliers outlier_mask = find_outliers(r) r = r[~outlier_mask] C = C[~outlier_mask] Pi = Pi[~outlier_mask] Cov_overlap = Cov_overlap.iloc[~outlier_mask] - - Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1) + + # some clusters may have been eliminated by this point; prune them from Pi + Pi = Pi[:, Pi.sum(0) > 0] + + ## remove covariate outliers (+- 6 sigma) + covar_outlier_idx = (Cov_overlap.loc[:, covar_columns].abs() < 6).all(axis = 1) + Cov_overlap = Cov_overlap.loc[covar_outlier_idx] + Pi = Pi[covar_outlier_idx, :] + r = r[covar_outlier_idx] + C = C[covar_outlier_idx, :] return Pi, r, C, Cov_overlap From 3b5fb405c7607ed15ae155444d8bc7544b26d6da Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Wed, 27 Apr 2022 22:46:51 -0400 Subject: [PATCH 148/222] Keep non-Z transformed columns --- hapaseg/run_coverage_MCMC.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index d3905ca..fc791a2 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -144,8 +144,6 @@ def load_covariates(self): self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) - # drop non z-cetered cols - self.full_cov_df = self.full_cov_df.drop(columns = self.full_cov_df.columns[self.full_cov_df.columns.str.contains("C_.*[^z]$")], axis=1) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned @@ -215,7 +213,7 @@ def assign_clusters(self): r = np.c_[Cov_overlap["covcorr"]] - covar_columns = sorted([c for c in Cov_overlap.columns if 'C_' in c]) + covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) ## making covariate matrix C = np.c_[Cov_overlap[covar_columns]] From d8b7e221bd16da205e5af3a6e3ff40a05b035ec1 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Apr 2022 07:19:07 -0400 Subject: [PATCH 149/222] Don't log transform covariates --- hapaseg/run_coverage_MCMC.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index fc791a2..b7ddf39 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -119,7 +119,7 @@ def load_covariates(self): self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values # z-transform - self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"] + 0.01)) + self.full_cov_df["C_RT_z"] = zt(self.full_cov_df["C_RT"]) ## GC content @@ -134,7 +134,7 @@ def load_covariates(self): print("Computing GC content", file = sys.stderr) self.generate_GC() - self.full_cov_df["C_GC_z"] = zt(np.log(self.full_cov_df["C_GC"] + 0.01)) + self.full_cov_df["C_GC_z"] = zt(self.full_cov_df["C_GC"]) ## Fragment length From 2a3b1fb6d1df6b6ad710ae0475610d591eb8e5be Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Apr 2022 14:33:52 -0400 Subject: [PATCH 150/222] Minor typo fix --- hapaseg/run_coverage_MCMC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index b7ddf39..05a40dc 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -157,7 +157,7 @@ def assign_clusters(self): self.SNPs["clust_choice"] = clust_uj # assign coverage intervals to clusters - Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1]) + Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max]) # first compute assignment probabilities based on the SNPs within each bin print("Mapping SNPs to targets ...", file = sys.stderr) From 70d924c88fbb8be5e6a17af082a1e2381261a06b Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Apr 2022 14:37:05 -0400 Subject: [PATCH 151/222] Save ADP segmentation samples --- hapaseg/__main__.py | 6 ++++++ wolF/tasks.py | 1 + 2 files changed, 7 insertions(+) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 2474c9e..aebf06c 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -444,12 +444,18 @@ def main(): snps_to_clusters, snps_to_phases, likelihoods = A.run() # save DP results + # SNP assignment/phasing samples, likelihoods of each sample np.savez(output_dir + "/allelic_DP_SNP_clusts_and_phase_assignments.npz", snps_to_clusters=snps_to_clusters, snps_to_phases=snps_to_phases, likelihoods=likelihoods ) + # segmentation breakpoints for each sample + with open(output_dir + "/segmentations.pickle", "wb") as f: + pickle.dump(A.DP_run.segment_trace, f) + + # full SNP dataframe A.SNPs.to_pickle(output_dir + "/all_SNPs.pickle") # diff --git a/wolF/tasks.py b/wolF/tasks.py index d2213fb..42179fa 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -118,6 +118,7 @@ class Hapaseg_allelic_DP(wolf.Task): output_patterns = { "cluster_and_phase_assignments" : "allelic_DP_SNP_clusts_and_phase_assignments.npz", "all_SNPs" : "all_SNPs.pickle", + "segmentation_breakpoints" : "segmentations.pickle", "likelihood_trace_plot" : "figures/likelihood_trace.png", "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", From 87cafef7fbeb967a329e8ee34ed4528bece30aa0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Apr 2022 14:43:51 -0400 Subject: [PATCH 152/222] Bump ADP memory --- wolF/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 42179fa..6076e0a 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -124,7 +124,7 @@ class Hapaseg_allelic_DP(wolf.Task): "seg_plot" : "figures/segs_only.png", } docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v789" - resources = { "mem" : "5G" } + resources = { "mem" : "8G" } class Hapaseg_prepare_coverage_mcmc(wolf.Task): inputs = { From 68036f03a487f3d82a59606d91ab122596364890 Mon Sep 17 00:00:00 2001 From: Oliver Priebe Date: Thu, 28 Apr 2022 22:19:34 +0000 Subject: [PATCH 153/222] remove arbitrary wgs threshold for using bin lens also rename columns so covariates don't get pulled in repeatedly --- hapaseg/run_coverage_MCMC.py | 37 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index b7ddf39..a6325f2 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -96,17 +96,13 @@ def generate_GC(self): def load_covariates(self): ## Target size - #check if we are doing wgs, in which case we will have uniform 200 bp bins - wgs = True if self.f_GC is not None or len(self.full_cov_df) > 100000 else False - - #we only need bin size if doing exomes - if not wgs: - self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1) + # we only need bin size if doing exomes but we can check by looking at the bin lengths + self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1) - #this is a safety in case we are doing wgs but have few bins - if (np.diff(self.full_cov_df["C_log_len"]) == 0).all(): - #remove the len col since it will ruin beta fitting - self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1) + # in case we are doing wgs these will all be the same and we must remove + if (np.diff(self.full_cov_df["C_log_len"]) == 0).all(): + #remove the len col since it will ruin beta fitting + self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1) ## Replication timing zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) @@ -143,7 +139,9 @@ def load_covariates(self): self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) - + + #rename non z-centered columns so that they arent pulled in as covariates + self.full_cov_df.rename({'C_frag_len':'frag_len', 'C_RT':'RT', 'C_GC':'GC'}, axis=1) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned @@ -187,14 +185,15 @@ def assign_clusters(self): # for now we will take maximum instead amb_mask = np.max(Cov_clust_probs_overlap, 1) != 1 amb_assgn_probs = Cov_clust_probs_overlap[amb_mask, :] - #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters], - # p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) - new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) - new_onehot = np.zeros((new_assgn.size, num_pruned_clusters)) - new_onehot[np.arange(new_assgn.size), new_assgn] = 1 - - # update with assigned values - Cov_clust_probs_overlap[amb_mask, :] = new_onehot + if amb_mask.sum() > 0: + #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters], + # p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) + new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) + new_onehot = np.zeros((new_assgn.size, num_pruned_clusters)) + new_onehot[np.arange(new_assgn.size), new_assgn] = 1 + + # update with assigned values + Cov_clust_probs_overlap[amb_mask, :] = new_onehot ## downsampling for wgs if len(Cov_clust_probs_overlap) > 20000: From 3fc423d734a6b362c164908fb7d2cd805e09206d Mon Sep 17 00:00:00 2001 From: Oliver Priebe Date: Thu, 28 Apr 2022 22:47:48 +0000 Subject: [PATCH 154/222] committing to ^C_.*_z$ covar convention --- hapaseg/a_cov_DP.py | 2 +- hapaseg/run_coverage_MCMC.py | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/hapaseg/a_cov_DP.py b/hapaseg/a_cov_DP.py index 1db9b17..1c2e0a5 100644 --- a/hapaseg/a_cov_DP.py +++ b/hapaseg/a_cov_DP.py @@ -63,7 +63,7 @@ def generate_acdp_df(SNP_path, # path to SNP df print('concatenating dp run ', draw_num) a_cov_seg_df = dp_run.cov_df.copy() - covar_cols = sorted([c for c in a_cov_seg_df.columns if "C_" in c]) + covar_cols = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) # add minor and major allele counts for each bin to the cov_seg_df here to allow for beta draws on the fly for each segment a_cov_seg_df['min_count'] = 0 a_cov_seg_df['maj_count'] = 0 diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index a6325f2..78ac27f 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -120,7 +120,7 @@ def load_covariates(self): ## GC content # load GC content if we have it precomputed, otherwise generate it - if wgs and self.f_GC is not None and os.path.exists(self.f_GC): + if self.f_GC is not None and os.path.exists(self.f_GC): print("Using precomputed GC content", file = sys.stderr) B = pd.read_pickle(self.f_GC) @@ -140,9 +140,6 @@ def load_covariates(self): self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) - #rename non z-centered columns so that they arent pulled in as covariates - self.full_cov_df.rename({'C_frag_len':'frag_len', 'C_RT':'RT', 'C_GC':'GC'}, axis=1) - # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned # method returns coverage df with only bins that overlap snps @@ -348,7 +345,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, # along with the bin exposure endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1) # generate covars - covar_columns = sorted([c for c in cov_df.columns if 'C_' in c]) + covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) C = np.c_[cov_df[covar_columns]] # do regression pois_regr = PoissonRegression(endog, C, np.ones(endog.shape)) From 4f27e28be30a51974df56d1c0e1712da7aefe346 Mon Sep 17 00:00:00 2001 From: Oliver Priebe Date: Fri, 29 Apr 2022 13:19:22 +0000 Subject: [PATCH 155/222] fix cov_df naming issue and update cov_DP covar gather --- hapaseg/a_cov_DP.py | 2 +- hapaseg/coverage_DP.py | 2 +- hapaseg/run_coverage_MCMC.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hapaseg/a_cov_DP.py b/hapaseg/a_cov_DP.py index 1c2e0a5..975908b 100644 --- a/hapaseg/a_cov_DP.py +++ b/hapaseg/a_cov_DP.py @@ -63,7 +63,7 @@ def generate_acdp_df(SNP_path, # path to SNP df print('concatenating dp run ', draw_num) a_cov_seg_df = dp_run.cov_df.copy() - covar_cols = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) + covar_cols = sorted(a_cov_seg_df.columns[a_cov_seg_df.columns.str.contains("^C_.*_z$")]) # add minor and major allele counts for each bin to the cov_seg_df here to allow for beta draws on the fly for each segment a_cov_seg_df['min_count'] = 0 a_cov_seg_df['maj_count'] = 0 diff --git a/hapaseg/coverage_DP.py b/hapaseg/coverage_DP.py index 07a85e8..17687f2 100644 --- a/hapaseg/coverage_DP.py +++ b/hapaseg/coverage_DP.py @@ -116,7 +116,7 @@ def __init__(self, cov_df, beta, bin_exposure, prior_run=None, count_prior_sum=N self.seg_id_col = self.cov_df.columns.get_loc('segment_ID') self.beta = beta self.bin_exposure=bin_exposure - self.covar_cols = sorted([c for c in self.cov_df.columns if "C_" in c]) + self.covar_cols = sorted(self.cov_df.columns[self.cov_df.columns.str.contains("^C_.*_z$")]) self.num_segments = self.cov_df.iloc[:, self.seg_id_col].max() + 1 self.segment_r_list = [None] * self.num_segments diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 78ac27f..d58a06f 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -345,7 +345,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, # along with the bin exposure endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1) # generate covars - covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) + covar_columns = sorted(cov_df.columns[cov_df.columns.str.contains("^C_.*_z$")]) C = np.c_[cov_df[covar_columns]] # do regression pois_regr = PoissonRegression(endog, C, np.ones(endog.shape)) From 53627cc8ebaa077ad41bfe274f72b554a2f48b4d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 29 Apr 2022 10:21:12 -0400 Subject: [PATCH 156/222] Draft code for binning fraglen --- hapaseg/run_coverage_MCMC.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 05a40dc..0c36f41 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -144,6 +144,19 @@ def load_covariates(self): self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) + # generate on 10x and 50x scales + # TODO: use rolling window rather than disjoint bins + for scale in [10, 50]: + fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 + wt = self.full_cov_df["num_reads"].values + fl = np.pad(fl, (0, scale - (len(fl) % scale))).reshape(-1, scale) + wt = np.pad(wt, (0, scale - (len(wt) % scale))).reshape(-1, scale) + wt = wt/wt.sum(1, keepdims = True) + self.full_cov_df[f"C_frag_len_{scale}x"] = np.tile( + np.einsum('ij,ij->i', wt, fl), + [scale, 1] + ).T.ravel()[:len(self.full_cov_df)] + self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"]) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned From bfa98a99deb271edb6c729d038b450e14b87cd85 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 29 Apr 2022 10:21:28 -0400 Subject: [PATCH 157/222] Unused code for mapping intervals to segments --- hapaseg/run_coverage_MCMC.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 0c36f41..5259cfb 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -181,8 +181,33 @@ def assign_clusters(self): targ_clust_hist = np.bincount(snp_idx, minlength = cuj_max) Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() - # subset intervals containing SNPs +# # assign coverage intervals to allelic segments +# # TODO: segmentation boundary will be passed directly in, so we don't have to recompute it +# seg_bdy = np.flatnonzero(np.r_[1, np.diff(self.SNPs["clust_choice"]), 1] != 0) +# seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] +# self.SNPs["seg_idx"] = 0 +# for i, (st, en) in enumerate(seg_bdy): +# self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i +# seg_idx_max = self.SNPs["seg_idx"].max() + 1 +# +# Cov_clust_probs_seg = np.zeros([len(self.full_cov_df), seg_idx_max]) +# +# for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["seg_idx"]): +# if len(snp_idx) == 1: +# Cov_clust_probs_seg[int(targ), snp_idx] = 1.0 +# else: +# targ_clust_hist = np.bincount(snp_idx, minlength = seg_idx_max) +# Cov_clust_probs_seg[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() +# +# # XXX: temporary +# Cov_clust_probs = Cov_clust_probs_seg + + ## subset to targets containing SNPs overlap_idx = Cov_clust_probs.sum(1) > 0 +# # add targets within a 2 targ radius +# overlap_idx = np.flatnonzero(Cov_clust_probs.sum(1) > 0)[:, None] +# overlap_idx = overlap_idx + np.c_[-2:3].T +# overlap_idx = np.sort(np.unique((overlap_idx + np.c_[-2:3].T).ravel())) Cov_clust_probs_overlap = Cov_clust_probs[overlap_idx, :] # zero out improbable assignments and re-normalilze From 1374932713929bae834aff2d6712f61ac4c4286a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 29 Apr 2022 10:21:38 -0400 Subject: [PATCH 158/222] Temporarily quit downsampling --- hapaseg/run_coverage_MCMC.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 5259cfb..79cb756 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -235,10 +235,10 @@ def assign_clusters(self): Cov_clust_probs_overlap[amb_mask, :] = new_onehot ## downsampling for wgs - if len(Cov_clust_probs_overlap) > 20000: - downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2 - Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask] - Cov_overlap = Cov_overlap.iloc[downsample_mask] +# if len(Cov_clust_probs_overlap) > 20000: +# downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2 +# Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask] +# Cov_overlap = Cov_overlap.iloc[downsample_mask] # remove clusters with fewer than 4 assigned coverage bins (remove these coverage bins as well) bad_clusters = Cov_clust_probs_overlap.sum(0) < 4 From 472aa6ae584e648c6cff7c35f641df0554f9880c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 2 May 2022 13:23:44 -0400 Subject: [PATCH 159/222] Set ADP betahyp dynamically --- hapaseg/allelic_DP.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 6c6cb86..0ae9eb5 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -93,7 +93,7 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F") - self.betahyp = 10 + self.betahyp = self.S.loc[:, ["min", "maj"]].sum(1).mean()/2 # # define column indices From 4ce8fac1d69d31f0fb4e69dec07a861063b88cb3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 2 May 2022 14:49:30 -0400 Subject: [PATCH 160/222] Map coverage intervals to allelic segments --- hapaseg/run_coverage_MCMC.py | 49 +++++++++++++++++------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index cbcd51e..810a49d 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -1,5 +1,6 @@ import numpy as np import pandas as pd +import pickle import glob import re import os @@ -18,6 +19,7 @@ def __init__(self, coverage_csv, f_allelic_clusters, f_SNPs, + f_segs, f_repl, ref_fasta, f_GC=None, @@ -33,6 +35,8 @@ def __init__(self, self.ref_fasta = ref_fasta self.allelic_clusters = np.load(f_allelic_clusters) + with open(f_segs, "rb") as f: + self.segmentations = pickle.load(f) # coverage input is expected to be a df file with columns: ["chr", "start", "end", "covcorr", "covraw"] self.full_cov_df = self.load_coverage(coverage_csv) self.load_covariates() @@ -165,38 +169,31 @@ def assign_clusters(self): cuj_max = clust_uj.max() + 1 self.SNPs["clust_choice"] = clust_uj - # assign coverage intervals to clusters + ## assign coverage intervals to allelic clusters and segments + # assignment probabilities of each coverage interval -> allelic cluster Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max]) + # get allelic segment boundaries + seg_bdy = np.r_[list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)] + seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] + self.SNPs["seg_idx"] = 0 + for i, (st, en) in enumerate(seg_bdy): + self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i + # first compute assignment probabilities based on the SNPs within each bin + # segments just get assigned to the maximum probability + self.full_cov_df["seg_idx"] = -1 print("Mapping SNPs to targets ...", file = sys.stderr) - for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["clust_choice"]): - if len(snp_idx) == 1: - Cov_clust_probs[int(targ), snp_idx] = 1.0 + for targ, D in tqdm.tqdm(self.SNPs.groupby("tidx")[["clust_choice", "seg_idx"]]): + clust_idx = D["clust_choice"].values + seg_idx = D["seg_idx"].values + if len(clust_idx) == 1: + Cov_clust_probs[int(targ), clust_idx] = 1.0 + self.full_cov_df.at[int(targ), "seg_idx"] = seg_idx[0] else: - targ_clust_hist = np.bincount(snp_idx, minlength = cuj_max) + targ_clust_hist = np.bincount(clust_idx, minlength = cuj_max) Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() - -# # assign coverage intervals to allelic segments -# # TODO: segmentation boundary will be passed directly in, so we don't have to recompute it -# seg_bdy = np.flatnonzero(np.r_[1, np.diff(self.SNPs["clust_choice"]), 1] != 0) -# seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] -# self.SNPs["seg_idx"] = 0 -# for i, (st, en) in enumerate(seg_bdy): -# self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i -# seg_idx_max = self.SNPs["seg_idx"].max() + 1 -# -# Cov_clust_probs_seg = np.zeros([len(self.full_cov_df), seg_idx_max]) -# -# for targ, snp_idx in tqdm.tqdm(self.SNPs.groupby("tidx")["seg_idx"]): -# if len(snp_idx) == 1: -# Cov_clust_probs_seg[int(targ), snp_idx] = 1.0 -# else: -# targ_clust_hist = np.bincount(snp_idx, minlength = seg_idx_max) -# Cov_clust_probs_seg[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() -# -# # XXX: temporary -# Cov_clust_probs = Cov_clust_probs_seg + self.full_cov_df.at[int(targ), "seg_idx"] = np.bincount(seg_idx).argmax() ## subset to targets containing SNPs overlap_idx = Cov_clust_probs.sum(1) > 0 From 57032aefce9b896a6b5d37acc4128cd5188d42a0 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 2 May 2022 17:11:08 -0400 Subject: [PATCH 161/222] Expand chrbdy plot to fill ylim --- hapaseg/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hapaseg/utils.py b/hapaseg/utils.py index 46712d3..5b78213 100644 --- a/hapaseg/utils.py +++ b/hapaseg/utils.py @@ -46,11 +46,14 @@ def plot_chrbdy(cytoband_file): chrbdy = parse_cytoband(cytoband_file) # plot chromosome boundaries + yl_0 = plt.ylim()[0] + yl_1 = plt.ylim()[1] chr_ends = chrbdy.loc[1::2, "end"].cumsum() for end in chr_ends[:-1]: plt.axvline(end, color = 'k') for st, en in np.c_[chr_ends[:-1:2], chr_ends[1::2]]: - plt.fill_between([st, en], 0, 1, color = [0.9, 0.9, 0.9], zorder = 0) + plt.fill_between([st, en], yl_0, yl_1, color = [0.9, 0.9, 0.9], zorder = 0) + plt.ylim([yl_0, yl_1]) # plot centromere locations for cent in (np.c_[chrbdy.loc[1::2, "start"], chrbdy.loc[::2, "end"]] + np.c_[np.r_[0, chr_ends[:-1]]]).ravel(): From 352b93ad3b9bbabe9e182f83013cc16bf97662c2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 2 May 2022 17:12:34 -0400 Subject: [PATCH 162/222] Save allelic segmentation boundaries WRT coverage dataframe --- hapaseg/__main__.py | 19 +++++++++++++++++++ wolF/tasks.py | 3 ++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index aebf06c..aa84d26 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -125,6 +125,7 @@ def parse_args(): coverage_mcmc.add_argument("--allelic_clusters_object", help="npy file containing allelic dp segs-to-clusters results") coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs") + coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) coverage_mcmc.add_argument("--covariate_dir", help="path to covariate directory with covariates all in pickled files") coverage_mcmc.add_argument("--num_draws", type=int, @@ -145,6 +146,7 @@ def parse_args(): preprocess_coverage_mcmc.add_argument("--allelic_clusters_object", help="npy file containing allelic dp segs-to-clusters results", required=True) preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True) + preprocess_coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True) preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None) preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int, @@ -504,17 +506,34 @@ def main(): ## preprocess ADP data to run scattered coverage mcmc jobs on each ADP cluster elif args.command == "coverage_mcmc_preprocess": + ## perform initial Poisson regression cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv, args.allelic_clusters_object, args.SNPs_pickle, + args.segmentations, args.ref_fasta, f_repl=args.repl_pickle, f_GC=args.gc_pickle, allelic_sample=args.allelic_sample) Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster() + + ## create chunks for both burnin and scatter + cov_df = cov_df.sort_values("start_g", ignore_index = True) + + # indices of coverage bins + seg_g = cov_df.groupby("seg_idx") + seg_g_idx = pd.Series(seg_g.indices).to_frame(name = "indices") + seg_g_idx["allelic_cluster"] = seg_g["allelic_cluster"].first() + seg_g_idx["n_cov_bins"] = seg_g.size() + + ## save + # regression matrices np.savez(os.path.join(output_dir, 'preprocess_data'), Pi=Pi, r=r, C=C, all_mu=all_mu, global_beta=global_beta, adp_cluster=adp_cluster) + # coverage dataframe mapped cov_df.to_pickle(os.path.join(output_dir, 'cov_df.pickle')) + # allelic segment indices into coverage dataframe + seg_g_idx.to_pickle(os.path.join(output_dir, 'allelic_seg_groups.pickle')) ## run scattered coverage mcmc job using preprocessed data elif args.command == "coverage_mcmc_shard": diff --git a/wolF/tasks.py b/wolF/tasks.py index 6076e0a..96afa7c 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -151,7 +151,8 @@ def prolog(self): output_patterns = { "preprocess_data": "preprocess_data.npz", - "cov_df_pickle": "cov_df.pickle" + "cov_df_pickle": "cov_df.pickle", + "allelic_seg_groups": "allelic_seg_groups.pickle" } docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" From 7aa624a93c9b20c7e6e4f0f8f04c9aa0dc8afe7b Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 2 May 2022 17:21:26 -0400 Subject: [PATCH 163/222] Add segmentation pickle to wolF task --- hapaseg/__main__.py | 6 +++--- wolF/tasks.py | 2 ++ wolF/workflow.py | 1 + 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index aa84d26..ee57783 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -125,7 +125,7 @@ def parse_args(): coverage_mcmc.add_argument("--allelic_clusters_object", help="npy file containing allelic dp segs-to-clusters results") coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs") - coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) + coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) coverage_mcmc.add_argument("--covariate_dir", help="path to covariate directory with covariates all in pickled files") coverage_mcmc.add_argument("--num_draws", type=int, @@ -146,7 +146,7 @@ def parse_args(): preprocess_coverage_mcmc.add_argument("--allelic_clusters_object", help="npy file containing allelic dp segs-to-clusters results", required=True) preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True) - preprocess_coverage_mcmc.add_argument("--segmentations", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) + preprocess_coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True) preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None) preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int, @@ -510,7 +510,7 @@ def main(): cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv, args.allelic_clusters_object, args.SNPs_pickle, - args.segmentations, + args.segmentations_pickle, args.ref_fasta, f_repl=args.repl_pickle, f_GC=args.gc_pickle, diff --git a/wolF/tasks.py b/wolF/tasks.py index 96afa7c..3b234f6 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -131,6 +131,7 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task): "coverage_csv": None, "allelic_clusters_object": None, "SNPs_pickle": None, + "segmentations_pickle": None, "repl_pickle": None, "gc_pickle":"", "allelic_sample":"", @@ -141,6 +142,7 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task): --ref_fasta ${ref_fasta} \ --allelic_clusters_object ${allelic_clusters_object} \ --SNPs_pickle ${SNPs_pickle} \ + --segmentations_pickle ${segmentations_pickle} \ --repl_pickle ${repl_pickle}""" def prolog(self): diff --git a/wolF/workflow.py b/wolF/workflow.py index 21ea515..65a0fc9 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -503,6 +503,7 @@ def concat_arm_level_results(arm_results): "coverage_csv":tumor_cov_gather_task["coverage"], #each scatter result is the same "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"], "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'], + "segmentations_pickle":hapaseg_allelic_DP_task['segmentation_breakpoints'], "repl_pickle":ref_config["repl_file"], "gc_pickle":ref_config["gc_file"], "ref_fasta":localization_task["ref_fasta"] From 520364116a3768a7277106515d58de4f289205be Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 5 May 2022 11:21:35 -0400 Subject: [PATCH 164/222] Initial commit of scattering cov MCMC over allelic segments --- hapaseg/NB_coverage_MCMC.py | 6 +- hapaseg/__main__.py | 115 ++++++++++++++++++++---------------- wolF/tasks.py | 8 ++- wolF/workflow.py | 23 +++++++- 4 files changed, 95 insertions(+), 57 deletions(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index 6275f39..b2473cc 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -1,6 +1,7 @@ import numpy as np import scipy.special as ss import sortedcontainers as sc +import sys from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB import warnings from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning @@ -738,13 +739,12 @@ def prepare_results(self): """ class NB_MCMC_SingleCluster: - def __init__(self, n_iter, r, C, mu, beta, cluster_num, bin_width=1): + def __init__(self, n_iter, r, C, mu, beta, bin_width=1): self.n_iter = n_iter self.r = r self.C = C self.beta = beta self.mu = mu - self.cluster_num = cluster_num self.bin_width = bin_width # for now assume that the Pi vector assigns each bin to exactly one cluster @@ -788,7 +788,7 @@ def save_sample(self): def run(self, debug=False, stop_after_burnin=False): - print("starting MCMC coverage segmentation for cluster {}...".format(self.cluster_num), flush=True) + print("Starting MCMC coverage segmentation ...", flush=True, file=sys.stderr) past_it = 0 n_it = 0 diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index ee57783..6c4d574 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -156,15 +156,15 @@ def parse_args(): ## running coverage mcmc on single cluster for scatter task coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard", help="run coverage mcmc on single ADP cluster") - coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data', + coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data: covariate matrix (C), global beta, ADP cluster mu\'s, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r)', required=True) + coverage_mcmc_shard.add_argument("--allelic_seg_indices", help='path to pickled pandas dataframe containing coverage bin indices for each alleic segment', + required=True) + coverage_mcmc_shard.add_argument("--allelic_seg_idx", help='which allelic segment to perform coverage segmentation on.', + required=True, type=int) coverage_mcmc_shard.add_argument("--num_draws", type=int, help="number of draws to take from coverage segmentation MCMC", default=50) - coverage_mcmc_shard.add_argument("--cluster_num", type=int, - help="cluster index for this worker to run on. If unspecified method will simulate " - "all clusters on the same machine", default=None) coverage_mcmc_shard.add_argument("--bin_width", type=int, default=1, help="size of uniform bins if using. Otherwise 1.") - coverage_mcmc_shard.add_argument("--range", type=str, help="range of coverage bins within the cluster to burnin. should be in start-end form. Note that this will cause num draws to be overridden to 1") coverage_mcmc_shard.add_argument("--burnin_files", type=str, help="txt file containing burnt in segment assignments") ## collect coverage MCMC shards @@ -539,55 +539,70 @@ def main(): elif args.command == "coverage_mcmc_shard": # load preprocessed data preprocess_data = np.load(args.preprocess_data) - # check to make sure that the cluster index is within the range - Pi = preprocess_data['Pi'] - if args.cluster_num > Pi.shape[1] - 1: - raise ValueError("Received cluster number {}, which is out of range".format(args.cluster_num)) - + # extract preprocessed data from this cluster - mu = preprocess_data["all_mu"][args.cluster_num] + Pi = preprocess_data['Pi'] + mu = preprocess_data["all_mu"]#[args.cluster_num] beta = preprocess_data["global_beta"] c_assignments = np.argmax(Pi, axis=1) - cluster_mask = (c_assignments == args.cluster_num) - r = preprocess_data['r'][cluster_mask] - C = preprocess_data['C'][cluster_mask] - - # if we get a range argument well be doing burnin on a subset of the coverage bins - if args.range is not None: - #parse range from string - range_lst = args.range.split('-') - st,en = int(range_lst[0]), int(range_lst[1]) - if st > en or st < 0 or en > len(r): - raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r))) - - #trim data to our desired range - r = r[st:en] - C = C[st:en] - num_draws = 1 - - # if we're just burning in a subset use different save strings - model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range) - data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range) - figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range) - - else: - #if not in burnin use the specified number of draws - num_draws = args.num_draws - - - model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num) - data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num) - figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num) - - # run on the specified cluster - cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width) + #cluster_mask = (c_assignments == args.cluster_num) + r = preprocess_data['r']#[cluster_mask] + C = preprocess_data['C']#[cluster_mask] + + # load and (weakly) verify allelic segment indices + seg_g_idx = pd.read_pickle(args.allelic_seg_indices) + if len(np.hstack(seg_g_idx["indices"])) != C.shape[0]: + raise ValueError("Size mismatch between allelic segment assignments and coverage bin data!") + + # subset to a single allelic segment + if args.allelic_seg_idx > len(seg_g_idx) - 1: + raise ValueError("Allelic segment index out of bounds!") + + seg_indices = seg_g_idx.iloc[args.allelic_seg_idx] + + mu = mu[seg_indices["allelic_cluster"]] + C = C[seg_indices["indices"], :] + r = r[seg_indices["indices"], :] - # if we're using burnin results load them now - if args.burnin_files is not None: - with open(args.burnin_files, 'r') as f: - file_list = f.read().splitlines() - assignments_arr = aggregate_burnin_files(file_list, args.cluster_num) - cov_mcmc.init_burnin(assignments_arr) + # run cov MCMC + cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.bin_width) + +# # if we get a range argument well be doing burnin on a subset of the coverage bins +# if args.range is not None: +# #parse range from string +# range_lst = args.range.split('-') +# st,en = int(range_lst[0]), int(range_lst[1]) +# if st > en or st < 0 or en > len(r): +# raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r))) +# +# #trim data to our desired range +# r = r[st:en] +# C = C[st:en] +# num_draws = 1 +# +# # if we're just burning in a subset use different save strings +# model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range) +# data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range) +# figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range) +# +# else: +# #if not in burnin use the specified number of draws +# num_draws = args.num_draws +# +# +# model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num) +# data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num) +# figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num) +# +# # run on the specified cluster +# cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width) +# +# # if we're using burnin results load them now +# if args.burnin_files is not None: +# with open(args.burnin_files, 'r') as f: +# file_list = f.read().splitlines() +# assignments_arr = aggregate_burnin_files(file_list, args.cluster_num) +# cov_mcmc.init_burnin(assignments_arr) cov_mcmc.run() diff --git a/wolF/tasks.py b/wolF/tasks.py index 3b234f6..860acfb 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -190,16 +190,18 @@ def prolog(self): class Hapaseg_coverage_mcmc(wolf.Task): inputs = { - "preprocess_data": None, + "preprocess_data": None, # npz of covariate matrix (C), global beta, ADP cluster mu's, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r) + "allelic_seg_indices": None, # dataframe containing indicies into C/r/all_mu for each allelic segment + "allelic_seg_scatter_idx": None, # allelic segment to operate on (for scatter) "num_draws": 50, - "cluster_num": None, "bin_width":None, "burnin_files":"" } script = """ hapaseg coverage_mcmc_shard --preprocess_data ${preprocess_data} \ + --allelic_seg_indices ${allelic_seg_idx} \ + --allelic_seg_idx ${allelic_seg_scatter_idx} \ --num_draws ${num_draws} \ - --cluster_num ${cluster_num} \ --bin_width ${bin_width}""" def prolog(self): diff --git a/wolF/workflow.py b/wolF/workflow.py index 65a0fc9..784e3ed 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -509,7 +509,28 @@ def concat_arm_level_results(arm_results): "ref_fasta":localization_task["ref_fasta"] } ) - + + # shim task to get number of allelic segments + # (coverage MCMC will be scattered over each allelic segment) + @prefect.task + def get_N_seg_groups(S): + return len(S) + + N_cov_mcmc_shards = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"]) + + # TODO: modify burnin task to subset to these indices + + # coverage MCMC burnin(?) <- do we still need to burnin separately? + cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin( + inputs={ + "preprocess_data":prep_cov_mcmc_task["preprocess_data"], + "allelic_seg_indices":prep_cov_mcmc_task["allelic_seg_groups"], + "allelic_seg_scatter_idx":range(0, N_cov_mcmc_shards), + "num_draws":50, + "bin_width":bin_width, + } + ) + #get the cluster indices from the preprocess data and generate the burnin indices @prefect.task(nout=4) def _get_ADP_cluster_list(preprocess_data_obj): From 11fa172c290d1f78756d3490f88be40391d556c2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 5 May 2022 13:17:40 -0400 Subject: [PATCH 165/222] Use specific version of interval splitter (for now) --- wolF/workflow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 784e3ed..0509db0 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -38,7 +38,8 @@ # for coverage collection split_intervals = wolf.ImportTask( task_path = "git@github.com:getzlab/split_intervals_TOOL.git", - task_name = "split_intervals" + task_name = "split_intervals", + commit = "dc102d8" ) cov_collect = wolf.ImportTask( From 217ee269d77dbf668a21a6999d08443e7c8b8c3d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 5 May 2022 14:05:40 -0400 Subject: [PATCH 166/222] Misc bugs in run_coverage_MCMC call from __main__ --- hapaseg/__main__.py | 1 + hapaseg/run_coverage_MCMC.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 6c4d574..77fea79 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -152,6 +152,7 @@ def parse_args(): preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int, help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default", default=None) + preprocess_coverage_mcmc.add_argument("--ref_fasta", required = True) ## running coverage mcmc on single cluster for scatter task coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard", diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 810a49d..b093d69 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -20,8 +20,8 @@ def __init__(self, f_allelic_clusters, f_SNPs, f_segs, - f_repl, ref_fasta, + f_repl, f_GC=None, num_draws=50, cluster_num=None, From c0942d646b3064fd2e2f7f61a0ef60e26700bb39 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 5 May 2022 14:33:00 -0400 Subject: [PATCH 167/222] Bump some dockers --- wolF/tasks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 860acfb..651d22b 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -123,7 +123,7 @@ class Hapaseg_allelic_DP(wolf.Task): "SNP_plot" : "figures/SNPs.png", "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v789" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v813" resources = { "mem" : "8G" } class Hapaseg_prepare_coverage_mcmc(wolf.Task): @@ -157,7 +157,7 @@ def prolog(self): "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815" resources = { "mem" : "15G" } @@ -214,7 +214,7 @@ def prolog(self): "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png' } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815" resources = {"mem" : "5G"} class Hapaseg_collect_coverage_mcmc(wolf.Task): From 44344b2f59e98d731451246eec6d0bcbf854d278 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 5 May 2022 14:41:54 -0400 Subject: [PATCH 168/222] Added missing factor of 2 --- hapaseg/NB_coverage_MCMC.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index b2473cc..53efda7 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -451,7 +451,7 @@ def _get_log_ML_approx_join(self, Hess): # computes ML component from hessian approximation for two split segments def _get_log_ML_split(self, H1, H2): - return np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2 + return 2*np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2 # computes the log ML of joining two segments def _log_ML_join(self, ind, ret_opt_params=False): From 0e33a82363b9a76a6e7a74fb2fd4f5fb52af05db Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 10 May 2022 17:13:33 -0400 Subject: [PATCH 169/222] Use poscol instead of rename --- hapaseg/run_coverage_MCMC.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index b093d69..670da2f 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -108,13 +108,14 @@ def load_covariates(self): #remove the len col since it will ruin beta fitting self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1) - ## Replication timing zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) + ## Replication timing + # load repl timing F = pd.read_pickle(self.f_repl) # map targets to RT intervals - tidx = mut.map_mutations_to_targets(self.full_cov_df.rename(columns={"start": "pos"}), F, inplace=False) + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start") self.full_cov_df['C_RT'] = np.nan self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values From b9bbf18f20376ea2e7a9c3297420e37836545354 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 10 May 2022 17:27:47 -0400 Subject: [PATCH 170/222] Use rolling convolution for smoothing fragment length --- hapaseg/run_coverage_MCMC.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 670da2f..091ff07 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -139,24 +139,22 @@ def load_covariates(self): ## Fragment length - # some bins have zero mean fragment length(!?); NaN these out - self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan) + # some bins have zero mean fragment length; these bins are bad and should be removed + self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True) self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) - # generate on 10x and 50x scales - # TODO: use rolling window rather than disjoint bins - for scale in [10, 50]: - fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 - wt = self.full_cov_df["num_reads"].values - fl = np.pad(fl, (0, scale - (len(fl) % scale))).reshape(-1, scale) - wt = np.pad(wt, (0, scale - (len(wt) % scale))).reshape(-1, scale) - wt = wt/wt.sum(1, keepdims = True) - self.full_cov_df[f"C_frag_len_{scale}x"] = np.tile( - np.einsum('ij,ij->i', wt, fl), - [scale, 1] - ).T.ravel()[:len(self.full_cov_df)] + # generate on 5x and 11x scales + swv = np.lib.stride_tricks.sliding_window_view + fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 + wt = self.full_cov_df["num_reads"].values + for scale in [5, 11]: + fl_sw = swv(np.pad(fl, scale//2), scale) + wt_sw = swv(np.pad(wt, scale//2), scale) + conv = np.einsum('ij,ij->i', wt_sw, fl_sw) + + self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1) self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"]) # use SNP cluster assignments from the given draw assign coverage bins to clusters From 3b44cef7fbad72aa38883e21c2f0bf044514120e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 10 May 2022 17:56:51 -0400 Subject: [PATCH 171/222] Generate DNAse/FAIRE covariates --- 71_coverage_covariates.py | 80 +++++++++++++++++++++++++++++++++++++-- covars/getmax.c | 15 ++++++++ 2 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 covars/getmax.c diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py index 6c2bb1b..7b4525c 100644 --- a/71_coverage_covariates.py +++ b/71_coverage_covariates.py @@ -1,11 +1,13 @@ import liftover +import numpy as np import pandas as pd import pyfaidx +import pyBigWig import tqdm -from capy import mut +from capy import mut, seq # -# replication timing +# replication timing {{{ F = pd.read_csv("/mnt/j/proj/cnv/20201018_hapseg2/covars/GSE137764_H1_GaussiansGSE137764_mooth_scaled_autosome.mat", sep = "\t", header = None).T.rename(columns = { 0 : "chr", 1 : "start", 2 : "end" }) F.iloc[:, 3:] = F.loc[:, 3:].astype(float) @@ -13,7 +15,7 @@ F["chr"] = mut.convert_chr(F["chr"]) F.to_pickle("covars/GSE137764_H1.hg38.pickle") -# liftover to hg19 +# liftover to hg19 {{{ F["chr_start_lift"] = 0 F["chr_end_lift"] = 0 F["start_lift"] = 0 @@ -63,8 +65,14 @@ (F["start_strand_lift"].notin(["+", "?"])) | \ (F["start_lift"] > F["end_lift"]) +# }}} + +# }}} + # -# GC content +# GC content {{{ + +# note: this is obsolete; GC content is now computed on the fly B = pd.read_csv("/mnt/j/proj/cnv/20210326_coverage_collector/targets.bed", sep = "\t", header = None, names = ["chr", "start", "end"]) B["chr"] = mut.convert_chr(B["chr"]) @@ -78,3 +86,67 @@ B.to_pickle("covars/GC.pickle") +# }}} + +# +# DNAse HS/FAIRE {{{ + +## DNAse {{{ + +bw = pyBigWig.open("covars/wgEncodeUwDnaseGm12878RawRep1.bigWig") + +# WGS (2kb chunks) +clen = seq.get_chrlens() +C = [] +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]): + bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] + tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "DNAse" : 0 }) + for j, (st, en) in enumerate(tqdm.tqdm(bins)): + tmp.loc[j, "DNAse"] = np.nanmean(np.r_[bw.values(chrname, st, en)]) + C.append(tmp) + +# preliminary results not so great; stick with FAIRE for now + +# TODO: liftover to hg38 + +# WES + +# }}} + +## FAIRE {{{ + +## convert bigWig to FWB + +# for some reason pyBigWig can't process this file +# bw = pyBigWig.open("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigwig") + +# use bigWig2FWB instead +# git clone git@github.com:getzlab/bigWig2FWB.git + +# figure out range of file +# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/bwtest +# ./getmax +# -> max = 5478 +# set scale factor to 11 +# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal + +## WGS +from capy import fwb + +F = fwb.FWB("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb"); + +clen = seq.get_chrlens() +C = [] +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]): + bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] + tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "FAIRE" : 0 }) + for j, (st, en) in enumerate(tqdm.tqdm(bins)): + tmp.loc[j, "FAIRE"] = F.get(chrname, np.r_[st:en] + 1).mean() + C.append(tmp) + +FAIRE = pd.concat(C, ignore_index = True) +FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle") + +# }}} + +# }}} diff --git a/covars/getmax.c b/covars/getmax.c new file mode 100644 index 0000000..5bcc865 --- /dev/null +++ b/covars/getmax.c @@ -0,0 +1,15 @@ +#include +#include +#include +#include + +int main() { + FILE* x = fopen("wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb", "r"); + uint16_t max = 0; + uint16_t buf; + while(fread(&buf, 2, 1, x)) { + buf = __bswap_16(buf); + if(buf > max) { max = buf; printf("%d\n", max); } + } + return 0; +} From 97030468faca07bd9e7f35fbd140a5b8365c6112 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 10 May 2022 18:20:07 -0400 Subject: [PATCH 172/222] Add FAIRE covariate to cov MCMC --- hapaseg/__main__.py | 1 + hapaseg/run_coverage_MCMC.py | 15 ++++++++++++++- wolF/tasks.py | 4 +++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 77fea79..1ae1e46 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -148,6 +148,7 @@ def parse_args(): preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True) preprocess_coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True) + preprocess_coverage_mcmc.add_argument("--faire_pickle", help="pickled dataframe containing FAIRE data", required=True) preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None) preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int, help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default", diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 091ff07..47d7a7c 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -22,6 +22,7 @@ def __init__(self, f_segs, ref_fasta, f_repl, + f_faire, f_GC=None, num_draws=50, cluster_num=None, @@ -31,6 +32,7 @@ def __init__(self, self.num_draws = num_draws self.cluster_num = cluster_num self.f_repl = f_repl + self.f_faire = f_faire self.f_GC = f_GC self.ref_fasta = ref_fasta @@ -136,7 +138,18 @@ def load_covariates(self): self.generate_GC() self.full_cov_df["C_GC_z"] = zt(self.full_cov_df["C_GC"]) - + + ## FAIRE + + F = pd.read_pickle(self.f_faire) + # map targets to FAIRE intervals + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start") + self.full_cov_df['C_FAIRE'] = np.nan + self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values + + # z-transform + self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"]) + ## Fragment length # some bins have zero mean fragment length; these bins are bad and should be removed diff --git a/wolF/tasks.py b/wolF/tasks.py index 651d22b..3ef01e8 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -133,6 +133,7 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task): "SNPs_pickle": None, "segmentations_pickle": None, "repl_pickle": None, + "faire_pickle": "/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE_GM12878.hg19.pickle", # TODO: make remote "gc_pickle":"", "allelic_sample":"", "ref_fasta": None @@ -143,7 +144,8 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task): --allelic_clusters_object ${allelic_clusters_object} \ --SNPs_pickle ${SNPs_pickle} \ --segmentations_pickle ${segmentations_pickle} \ - --repl_pickle ${repl_pickle}""" + --repl_pickle ${repl_pickle} \ + --faire_pickle ${faire_pickle}""" def prolog(self): if self.conf["inputs"]["gc_pickle"] != "": From 07fea1e500a4f81957cbc0bf2be2fcb75a0cd93a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 10 May 2022 18:20:23 -0400 Subject: [PATCH 173/222] Get rid of chrY; convert chrnames --- 71_coverage_covariates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py index 7b4525c..264962c 100644 --- a/71_coverage_covariates.py +++ b/71_coverage_covariates.py @@ -131,13 +131,13 @@ # bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal ## WGS -from capy import fwb +from capy import fwb, mut F = fwb.FWB("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb"); clen = seq.get_chrlens() C = [] -for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]): +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]): bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "FAIRE" : 0 }) for j, (st, en) in enumerate(tqdm.tqdm(bins)): @@ -145,6 +145,7 @@ C.append(tmp) FAIRE = pd.concat(C, ignore_index = True) +FAIRE["chr"] = mut.convert_chr(FAIRE["chr"]) FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle") # }}} From c00c1247680fe808b85db417d3e930cc8141b802 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 10 May 2022 18:53:43 -0400 Subject: [PATCH 174/222] Compute fraglen covariate first (to remove bad bins immediately) --- hapaseg/run_coverage_MCMC.py | 40 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 47d7a7c..4081152 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -112,6 +112,26 @@ def load_covariates(self): zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) + ## Fragment length + + # some bins have zero mean fragment length; these bins are bad and should be removed + self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True) + + self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) + self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) + + # generate on 5x and 11x scales + swv = np.lib.stride_tricks.sliding_window_view + fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 + wt = self.full_cov_df["num_reads"].values + for scale in [5, 11]: + fl_sw = swv(np.pad(fl, scale//2), scale) + wt_sw = swv(np.pad(wt, scale//2), scale) + conv = np.einsum('ij,ij->i', wt_sw, fl_sw) + + self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1) + self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"]) + ## Replication timing # load repl timing @@ -150,26 +170,6 @@ def load_covariates(self): # z-transform self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"]) - ## Fragment length - - # some bins have zero mean fragment length; these bins are bad and should be removed - self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True) - - self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) - self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) - - # generate on 5x and 11x scales - swv = np.lib.stride_tricks.sliding_window_view - fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 - wt = self.full_cov_df["num_reads"].values - for scale in [5, 11]: - fl_sw = swv(np.pad(fl, scale//2), scale) - wt_sw = swv(np.pad(wt, scale//2), scale) - conv = np.einsum('ij,ij->i', wt_sw, fl_sw) - - self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1) - self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"]) - # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned # method returns coverage df with only bins that overlap snps From 3e79b9fe11b87ca066b9fbf5c77763c7b04e95c2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 12 May 2022 13:45:30 -0400 Subject: [PATCH 175/222] Use midpoint when mapping bins to covariates --- hapaseg/run_coverage_MCMC.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 4081152..0dcb1e8 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -132,12 +132,16 @@ def load_covariates(self): self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1) self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"]) + ### track-based covariates + # use midpoint of coverage bins to map to intervals + self.full_cov_df["midpoint"] = ((self.full_cov_df["end"] + self.full_cov_df["start"])/2).astype(int) + ## Replication timing # load repl timing F = pd.read_pickle(self.f_repl) # map targets to RT intervals - tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start") + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") self.full_cov_df['C_RT'] = np.nan self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values @@ -163,7 +167,7 @@ def load_covariates(self): F = pd.read_pickle(self.f_faire) # map targets to FAIRE intervals - tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "start") + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") self.full_cov_df['C_FAIRE'] = np.nan self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values From 725b7b9691e2b700c63d6a4ff125ae45c921f64e Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 12 May 2022 13:56:25 -0400 Subject: [PATCH 176/222] Make FAIRE optional --- hapaseg/run_coverage_MCMC.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 0dcb1e8..4b56d02 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -165,14 +165,15 @@ def load_covariates(self): ## FAIRE - F = pd.read_pickle(self.f_faire) - # map targets to FAIRE intervals - tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") - self.full_cov_df['C_FAIRE'] = np.nan - self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values - - # z-transform - self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"]) + if self.f_faire is not None: + F = pd.read_pickle(self.f_faire) + # map targets to FAIRE intervals + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") + self.full_cov_df['C_FAIRE'] = np.nan + self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values + + # z-transform + self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"]) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned From ee1f6b56b58c1f42c68372370a152254fdbe76eb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 12 May 2022 16:01:44 -0400 Subject: [PATCH 177/222] Add log exposure to Poisson regression --- hapaseg/model_optimizers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index 43d54d4..76e95c1 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -2,14 +2,15 @@ class PoissonRegression: - def __init__(self, r, C, Pi): + def __init__(self, r, C, Pi, log_exposure = 0): self.r = r self.C = C self.Pi = Pi + self.log_exposure = log_exposure self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) self.beta = np.ones([C.shape[1], 1]) - self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu) + self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure) # mu gradient def gradmu(self): @@ -33,7 +34,7 @@ def hessmubeta(self): def NR_poisson(self): for i in range(100): - self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu) + self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure) gmu = self.gradmu() gbeta = self.gradbeta() grad = np.r_[gmu, gbeta] From ce09d1fc6d39e8038f4623363b73a83ca15fbc4f Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 12 May 2022 16:35:37 -0400 Subject: [PATCH 178/222] Add smoothed FAIRE --- 71_coverage_covariates.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py index 264962c..137dcad 100644 --- a/71_coverage_covariates.py +++ b/71_coverage_covariates.py @@ -148,6 +148,11 @@ FAIRE["chr"] = mut.convert_chr(FAIRE["chr"]) FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle") +# smoothed version +FAIRE_smooth = FAIRE.copy() +FAIRE_smooth["FAIRE"] = np.convolve(FAIRE["FAIRE"], np.ones(5), mode = "same")/5 +FAIRE_smooth.to_pickle("covars/FAIRE_GM12878.smooth5.hg19.pickle") + # }}} # }}} From 3dcd389d3e67cc2bbb48e41645baa6e1ae882312 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 13 May 2022 14:00:52 -0400 Subject: [PATCH 179/222] Ignore FWBs and NPZs in build context --- .dockerignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.dockerignore b/.dockerignore index a160cc0..50af6bb 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,5 @@ **/*.bam **/*.call_stats.txt +**/*.fw? +**/*.bigWig +**/*.npz From fdd53bd5d6b53979628413ae0fa13a6af839ba43 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 12:25:06 -0400 Subject: [PATCH 180/222] Add cache_invalidate to Dockerfile to force updating Python modules --- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 85baf96..249c37e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,6 +4,7 @@ WORKDIR /build # install dependencies RUN pip install sortedcontainers +ARG cache_invalidate=xxx RUN git clone https://github.com/getzlab/CApy.git && pip install ./CApy RUN pip install dask distributed RUN pip install distinctipy From 723fb610f2b1cc2596868c2307275c37889f0961 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 12:35:14 -0400 Subject: [PATCH 181/222] Add bin width to cov MCMC prep task --- hapaseg/__main__.py | 2 ++ hapaseg/run_coverage_MCMC.py | 6 ++++-- wolF/tasks.py | 31 ++++++++++++++++++------------- wolF/workflow.py | 3 ++- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 1ae1e46..b18671f 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -154,6 +154,7 @@ def parse_args(): help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default", default=None) preprocess_coverage_mcmc.add_argument("--ref_fasta", required = True) + preprocess_coverage_mcmc.add_argument("--bin_width", help = "Coverage bin width (for WGS only)", default = 1, type = int) ## running coverage mcmc on single cluster for scatter task coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard", @@ -515,6 +516,7 @@ def main(): args.segmentations_pickle, args.ref_fasta, f_repl=args.repl_pickle, + f_faire=args.faire_pickle, f_GC=args.gc_pickle, allelic_sample=args.allelic_sample) Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster() diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 4b56d02..5704bf3 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -26,7 +26,8 @@ def __init__(self, f_GC=None, num_draws=50, cluster_num=None, - allelic_sample=None + allelic_sample=None, + bin_width=1, ): self.num_draws = num_draws @@ -35,6 +36,7 @@ def __init__(self, self.f_faire = f_faire self.f_GC = f_GC self.ref_fasta = ref_fasta + self.bin_width = bin_width self.allelic_clusters = np.load(f_allelic_clusters) with open(f_segs, "rb") as f: @@ -66,7 +68,7 @@ def run_all_clusters(self): # Do preprocessing for running on each ADP cluster individually def prepare_single_cluster(self): Pi, r, C, filtered_cov_df = self.assign_clusters() - pois_regr = PoissonRegression(r, C, Pi) + pois_regr = PoissonRegression(r, C, Pi, log_exposure = np.log(self.bin_width)) all_mu, global_beta = pois_regr.fit() # save these results to a numpy object diff --git a/wolF/tasks.py b/wolF/tasks.py index 3ef01e8..4bd5c5f 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -136,22 +136,27 @@ class Hapaseg_prepare_coverage_mcmc(wolf.Task): "faire_pickle": "/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE_GM12878.hg19.pickle", # TODO: make remote "gc_pickle":"", "allelic_sample":"", - "ref_fasta": None + "ref_fasta": None, + "bin_width" : 1 # only for whole genomes; for exomes, target lengths are passed as a covariate via the coverage CSV } - script = """ - hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \ - --ref_fasta ${ref_fasta} \ - --allelic_clusters_object ${allelic_clusters_object} \ - --SNPs_pickle ${SNPs_pickle} \ - --segmentations_pickle ${segmentations_pickle} \ - --repl_pickle ${repl_pickle} \ - --faire_pickle ${faire_pickle}""" + def script(self): + script = """ + hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \ + --ref_fasta ${ref_fasta} \ + --allelic_clusters_object ${allelic_clusters_object} \ + --SNPs_pickle ${SNPs_pickle} \ + --segmentations_pickle ${segmentations_pickle} \ + --repl_pickle ${repl_pickle} \ + --faire_pickle ${faire_pickle} \ + --bin_width ${bin_width} + """ - def prolog(self): if self.conf["inputs"]["gc_pickle"] != "": - self.conf["script"][-1] += " --gc_pickle ${gc_pickle}" + script += " --gc_pickle ${gc_pickle} " if self.conf["inputs"]["allelic_sample"] != "": - self.conf["script"][-1] += " --allelic_sample ${allelic_sample}" + script += " --allelic_sample ${allelic_sample}" + + return script output_patterns = { "preprocess_data": "preprocess_data.npz", @@ -159,7 +164,7 @@ def prolog(self): "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v828" resources = { "mem" : "15G" } diff --git a/wolF/workflow.py b/wolF/workflow.py index 0509db0..bc66002 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -507,7 +507,8 @@ def concat_arm_level_results(arm_results): "segmentations_pickle":hapaseg_allelic_DP_task['segmentation_breakpoints'], "repl_pickle":ref_config["repl_file"], "gc_pickle":ref_config["gc_file"], - "ref_fasta":localization_task["ref_fasta"] + "ref_fasta":localization_task["ref_fasta"], + "bin_width":bin_width if wgs else 1 } ) From 136cfa33b1117bbdfed3c87bc576f2d3f30dfbaf Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 14:52:25 -0400 Subject: [PATCH 182/222] Fix bad allelic seg index bug --- hapaseg/__main__.py | 2 +- hapaseg/run_coverage_MCMC.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index b18671f..aaa9a72 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -525,7 +525,7 @@ def main(): cov_df = cov_df.sort_values("start_g", ignore_index = True) # indices of coverage bins - seg_g = cov_df.groupby("seg_idx") + seg_g = cov_df.groupby("seg_idx") # NOTE: seg_idx may not be contiguous if any allelic segments were dropped seg_g_idx = pd.Series(seg_g.indices).to_frame(name = "indices") seg_g_idx["allelic_cluster"] = seg_g["allelic_cluster"].first() seg_g_idx["n_cov_bins"] = seg_g.size() diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 5704bf3..7eca2cb 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -193,7 +193,7 @@ def assign_clusters(self): Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max]) # get allelic segment boundaries - seg_bdy = np.r_[list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)] + seg_bdy = np.r_[0, list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)] seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] self.SNPs["seg_idx"] = 0 for i, (st, en) in enumerate(seg_bdy): From 2a09a0f1b339daaf19e8c56d5c3f9d818e11b575 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 15:28:37 -0400 Subject: [PATCH 183/222] Need args --- hapaseg/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index aaa9a72..e6e8782 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -569,7 +569,7 @@ def main(): r = r[seg_indices["indices"], :] # run cov MCMC - cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.bin_width) + cov_mcmc = NB_MCMC_SingleCluster(args.num_draws, r, C, mu, beta, args.bin_width) # # if we get a range argument well be doing burnin on a subset of the coverage bins # if args.range is not None: From d89964df4a556c0d08abf435cbd3e5188ffaa79c Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 15:44:25 -0400 Subject: [PATCH 184/222] Bump dockers, fix misc. workflow bugs --- wolF/tasks.py | 8 ++++---- wolF/workflow.py | 10 ++++------ 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 4bd5c5f..11496c9 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -164,7 +164,7 @@ def script(self): "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v828" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v832" resources = { "mem" : "15G" } @@ -206,7 +206,7 @@ class Hapaseg_coverage_mcmc(wolf.Task): } script = """ hapaseg coverage_mcmc_shard --preprocess_data ${preprocess_data} \ - --allelic_seg_indices ${allelic_seg_idx} \ + --allelic_seg_indices ${allelic_seg_indices} \ --allelic_seg_idx ${allelic_seg_scatter_idx} \ --num_draws ${num_draws} \ --bin_width ${bin_width}""" @@ -221,8 +221,8 @@ def prolog(self): "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png' } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v815" - resources = {"mem" : "5G"} + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v830" + resources = {"mem" : "10G"} class Hapaseg_collect_coverage_mcmc(wolf.Task): inputs = { diff --git a/wolF/workflow.py b/wolF/workflow.py index bc66002..7bf7716 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -516,18 +516,16 @@ def concat_arm_level_results(arm_results): # (coverage MCMC will be scattered over each allelic segment) @prefect.task def get_N_seg_groups(S): - return len(S) + return list(range(len(pd.read_pickle(S)))) - N_cov_mcmc_shards = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"]) - - # TODO: modify burnin task to subset to these indices + cov_mcmc_shard_range = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"]) # coverage MCMC burnin(?) <- do we still need to burnin separately? - cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin( + cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc( inputs={ "preprocess_data":prep_cov_mcmc_task["preprocess_data"], "allelic_seg_indices":prep_cov_mcmc_task["allelic_seg_groups"], - "allelic_seg_scatter_idx":range(0, N_cov_mcmc_shards), + "allelic_seg_scatter_idx":cov_mcmc_shard_range, "num_draws":50, "bin_width":bin_width, } From a9956b3306d2c278b79c7782136ca24c00637beb Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 16:09:03 -0400 Subject: [PATCH 185/222] Properly export segment-level covMCMC files --- hapaseg/__main__.py | 6 +++--- wolF/tasks.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index e6e8782..69b71ee 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -614,15 +614,15 @@ def main(): segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results() # save samples - with open(os.path.join(output_dir, model_save_str), 'wb') as f: + with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f: pickle.dump(cov_mcmc, f) - np.savez(os.path.join(output_dir, data_save_str), + np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"), seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples) # save visualization cov_mcmc.visualize_cluster_samples( - os.path.join(output_dir, figure_save_str)) + os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png")) elif args.command == "collect_cov_mcmc": if args.coverage_dir: diff --git a/wolF/tasks.py b/wolF/tasks.py index 11496c9..7668711 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -216,9 +216,9 @@ def prolog(self): self.conf["script"][-1] += " --burnin_files ${burnin_files}" output_patterns = { - "cov_segmentation_model": 'cov_mcmc_model_cluster_*.pickle', - "cov_segmentation_data": 'cov_mcmc_data_cluster_*.npz', - "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png' + "cov_segmentation_model": 'cov_mcmc_model_*.pickle', + "cov_segmentation_data": 'cov_mcmc_data_*.npz', + "cov_seg_figure": 'cov_mcmc_*_visual.png' } docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v830" From 49ea3589b55705613b84606a00371a4f2be35924 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 16:22:48 -0400 Subject: [PATCH 186/222] Temporarily disable saving visualization --- hapaseg/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 69b71ee..935ecbe 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -620,9 +620,9 @@ def main(): np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"), seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples) - # save visualization - cov_mcmc.visualize_cluster_samples( - os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png")) +# # save visualization +# cov_mcmc.visualize_cluster_samples( +# os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png")) elif args.command == "collect_cov_mcmc": if args.coverage_dir: From fe309f31acd11329f54dc01d2c4b595f1660eea1 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 16 May 2022 17:41:27 -0400 Subject: [PATCH 187/222] ML_approx -> ML_gaussint --- hapaseg/NB_coverage_MCMC.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index 53efda7..d0838a5 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -413,7 +413,7 @@ def _detailed_sampling(self, ind, lls, split_indices, mus, lepsis, Hs): def _lls_to_MLs(self, lls, Hs): MLs = np.zeros(len(lls)) for i, (ll, Hs) in enumerate(zip(lls, Hs)): - laplacian = self._get_log_ML_split(Hs[0], Hs[1]) + laplacian = self._get_log_ML_gaussint_split(Hs[0], Hs[1]) # the split results in a nan make it impossible to split there if np.isnan(laplacian): laplacian = -1e50 @@ -445,12 +445,12 @@ def _get_split_liks(self, ind, debug=False): return split_indices, MLs, mus, lepsis - # computes ML component from hessian approximation for a single segment - def _get_log_ML_approx_join(self, Hess): + # computes Gaussian integral for ML Laplace approximation for a single segment + def _get_log_ML_gaussint_join(self, Hess): return np.log(2 * np.pi) - (np.log(np.linalg.det(-Hess))) / 2 - # computes ML component from hessian approximation for two split segments - def _get_log_ML_split(self, H1, H2): + # computes Gaussian integral for ML Laplace approximation for two split segments + def _get_log_ML_gaussint_split(self, H1, H2): return 2*np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2 # computes the log ML of joining two segments @@ -461,9 +461,7 @@ def _log_ML_join(self, ind, ret_opt_params=False): tmp_lepsi = self.lepsi_i_arr.copy() tmp_lepsi[ind[0]:ind[1]] = lepsi_share ll_join = self.ll_cluster(tmp_mui, tmp_lepsi) - if ret_opt_params: - return mu_share, lepsi_share, self._get_log_ML_join(H_share) + ll_join - return mu_share, lepsi_share, self._get_log_ML_approx_join(H_share) + ll_join + return mu_share, lepsi_share, self._get_log_ML_gaussint_join(H_share) + ll_join """ Split segment method. This method chooses a segment at random @@ -546,7 +544,7 @@ def join(self, debug): ind = self.get_join_seg_ind(seg_l, seg_r) lls_split, _, _, Hs = self._calculate_splits(ind, [seg_r]) - log_split_ML = lls_split[0] + self._get_log_ML_split(Hs[0][0], Hs[0][1]) + log_split_ML = lls_split[0] + self._get_log_ML_gaussint_split(Hs[0][0], Hs[0][1]) mu_share, lepsi_share, log_join_ML = self._log_ML_join(ind) log_MLs = np.r_[log_split_ML, log_join_ML] From 3fdc0063df337b077948d4c640d1f30be59471f9 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 17 May 2022 11:44:16 -0400 Subject: [PATCH 188/222] Initial commit of covMCMC cache --- hapaseg/NB_coverage_MCMC.py | 73 +++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 15 deletions(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index d0838a5..bdf927a 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -8,6 +8,7 @@ import matplotlib as mpl import matplotlib.pyplot as plt from scipy.signal import find_peaks +import scipy.sparse as sp from .model_optimizers import PoissonRegression # turn off warnings for statsmodels fitting @@ -49,8 +50,10 @@ def __init__(self, r, C, mu_0, beta_0, bin_width=1): self.segment_lens = sc.SortedDict([(0, len(self.r))]) # keep cache of previously computed breakpoints for fast splitting - # these breakpoints keys are in the form (st, en, breakpoint) - self.breakpoint_cache = {} + self.cache_LL_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_LL = [] + self.cache_mu_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_mu = [] + self.cache_lepsi_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_lepsi = [] + self.cache_hess_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_hess = [] self.phase_history = [] self.F = sc.SortedList() @@ -138,12 +141,27 @@ def stats_init(self): # statsmodels NB BFGS optimizer is more stable than NR so we will use it until migration to LNP def stats_optimizer(self, ind, ret_hess=False): + # cache hit; look up values + if self.cache_mu_ptr[ind[0], ind[1]] != 0: + mu = self.cache_mu[self.cache_mu_ptr[ind[0], ind[1]]] + lepsi = self.cache_lepsi[self.cache_lepsi_ptr[ind[0], ind[1]]] + if ret_hess: + return mu, lepsi, self.cache_hess[self.cache_hess_ptr[ind[0], ind[1]]] + else: + return mu, lepsi + + # cache miss; compute values endog = self.r[ind[0]:ind[1]].flatten() exog = np.ones(self.r[ind[0]:ind[1]].shape[0]) exposure = np.ones(self.r[ind[0]:ind[1]].shape[0]) * self.bin_exposure sNB = statsNB(endog, exog, exposure=exposure, offset=(self.C[ind[0]:ind[1]] @ self.beta).flatten() + self.mu) res = sNB.fit(disp=0) + # save to cache + self.cache_mu.append(res.params[0]); self.cache_mu_ptr[ind[0], ind[1]] = len(self.cache_mu) - 1 + self.cache_lepsi.append(-np.log(res.params[1])); self.cache_lepsi_ptr[ind[0], ind[1]] = len(self.cache_lepsi) - 1 + self.cache_hess.append(sNB.hessian(res.params)); self.cache_hess_ptr[ind[0], ind[1]] = len(self.cache_hess) - 1 + if ret_hess: return res.params[0], -np.log(res.params[1]), sNB.hessian(res.params) else: @@ -291,15 +309,31 @@ def _calculate_splits(self, ind, split_indices): lepsis.append((lepsi_l, lepsi_r)) Hs.append((H_l, H_r)) - tmp_mui = self.mu_i_arr.copy() - tmp_mui[ind[0]:ix] = mu_l - tmp_mui[ix: ind[1]] = mu_r - tmp_lepsi = self.lepsi_i_arr.copy() - tmp_lepsi[ind[0]:ix] = lepsi_l - tmp_lepsi[ix: ind[1]] = lepsi_r + # lookup likelihoods in cache + # left: + if (ptr := self.cache_LL_ptr[ind[0], ix]) != 0: + ll_l = self.cache_LL[ptr] + else: + ll_l = self.ll_cluster(mu_l, lepsi_l) + +# tmp_mui = self.mu_i_arr.copy() +# tmp_mui[ind[0]:ix] = mu_l +# tmp_mui[ix: ind[1]] = mu_r +# tmp_lepsi = self.lepsi_i_arr.copy() +# tmp_lepsi[ind[0]:ix] = lepsi_l +# tmp_lepsi[ix: ind[1]] = lepsi_r +# ll = self.ll_cluster(tmp_mui, tmp_lepsi) + + self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1 + + # right: + if (ptr := self.cache_LL_ptr[ix, ind[1]]) != 0: + ll_r = self.cache_LL[ptr] + else: + ll_r = self.ll_cluster(mu_r, lepsi_r) + self.cache_LL.append(ll_r); self.cache_LL_ptr[ix, ind[1]] = len(self.cache_LL) - 1 - ll = self.ll_cluster(tmp_mui, tmp_lepsi) - lls.append(ll) + lls.append(ll_l + ll_r) return lls, mus, lepsis, Hs @@ -456,11 +490,20 @@ def _get_log_ML_gaussint_split(self, H1, H2): # computes the log ML of joining two segments def _log_ML_join(self, ind, ret_opt_params=False): mu_share, lepsi_share, H_share = self.stats_optimizer(ind, True) - tmp_mui = self.mu_i_arr.copy() - tmp_mui[ind[0]:ind[1]] = mu_share - tmp_lepsi = self.lepsi_i_arr.copy() - tmp_lepsi[ind[0]:ind[1]] = lepsi_share - ll_join = self.ll_cluster(tmp_mui, tmp_lepsi) + + # lookup cache + if (ptr := self.cache_LL_ptr[ind[0], ind[1]]) != 0: + ll_join = self.cache_LL[ptr] + else: +# tmp_mui = self.mu_i_arr.copy() +# tmp_mui[ind[0]:ind[1]] = mu_share +# tmp_lepsi = self.lepsi_i_arr.copy() +# tmp_lepsi[ind[0]:ind[1]] = lepsi_share +# ll_join = self.ll_cluster(tmp_mui, tmp_lepsi) + ll_join = self.ll_cluster(mu_share, lepsi_share) + + # add to cache + self.cache_LL.append(ll_join); self.cache_LL_ptr[ind[0], ind[1]] = len(self.cache_LL) - 1 return mu_share, lepsi_share, self._get_log_ML_gaussint_join(H_share) + ll_join """ From a5831faef6b4881a9aee081392cddf4e4c060bed Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 17 May 2022 11:57:22 -0400 Subject: [PATCH 189/222] Fix sparse matrix definitions --- hapaseg/NB_coverage_MCMC.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index bdf927a..707ea95 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -50,10 +50,11 @@ def __init__(self, r, C, mu_0, beta_0, bin_width=1): self.segment_lens = sc.SortedDict([(0, len(self.r))]) # keep cache of previously computed breakpoints for fast splitting - self.cache_LL_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_LL = [] - self.cache_mu_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_mu = [] - self.cache_lepsi_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_lepsi = [] - self.cache_hess_ptr = sp.dok_matrix((r.shape, r.shape)); self.cache_hess = [] + sz = tuple(np.r_[1, 1]*(len(r) + 1)) + self.cache_LL_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_LL = [] + self.cache_mu_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_mu = [] + self.cache_lepsi_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_lepsi = [] + self.cache_hess_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_hess = [] self.phase_history = [] self.F = sc.SortedList() From 1c7e09826b6172f0cd95fd3f79cefda55c8d425a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 17 May 2022 12:00:57 -0400 Subject: [PATCH 190/222] tmp commit of breakpoints --- hapaseg/NB_coverage_MCMC.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index 707ea95..24aeab1 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -144,6 +144,7 @@ def stats_init(self): def stats_optimizer(self, ind, ret_hess=False): # cache hit; look up values if self.cache_mu_ptr[ind[0], ind[1]] != 0: + #breakpoint() mu = self.cache_mu[self.cache_mu_ptr[ind[0], ind[1]]] lepsi = self.cache_lepsi[self.cache_lepsi_ptr[ind[0], ind[1]]] if ret_hess: @@ -313,6 +314,7 @@ def _calculate_splits(self, ind, split_indices): # lookup likelihoods in cache # left: if (ptr := self.cache_LL_ptr[ind[0], ix]) != 0: + #breakpoint() ll_l = self.cache_LL[ptr] else: ll_l = self.ll_cluster(mu_l, lepsi_l) @@ -329,6 +331,7 @@ def _calculate_splits(self, ind, split_indices): # right: if (ptr := self.cache_LL_ptr[ix, ind[1]]) != 0: + #breakpoint() ll_r = self.cache_LL[ptr] else: ll_r = self.ll_cluster(mu_r, lepsi_r) @@ -494,6 +497,7 @@ def _log_ML_join(self, ind, ret_opt_params=False): # lookup cache if (ptr := self.cache_LL_ptr[ind[0], ind[1]]) != 0: + #breakpoint() ll_join = self.cache_LL[ptr] else: # tmp_mui = self.mu_i_arr.copy() From f974844040d1c30f84c7a85b62f03be73e3e41ff Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 17 May 2022 12:08:33 -0400 Subject: [PATCH 191/222] Remove cruft code --- hapaseg/NB_coverage_MCMC.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index 24aeab1..56a6d4c 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -318,15 +318,6 @@ def _calculate_splits(self, ind, split_indices): ll_l = self.cache_LL[ptr] else: ll_l = self.ll_cluster(mu_l, lepsi_l) - -# tmp_mui = self.mu_i_arr.copy() -# tmp_mui[ind[0]:ix] = mu_l -# tmp_mui[ix: ind[1]] = mu_r -# tmp_lepsi = self.lepsi_i_arr.copy() -# tmp_lepsi[ind[0]:ix] = lepsi_l -# tmp_lepsi[ix: ind[1]] = lepsi_r -# ll = self.ll_cluster(tmp_mui, tmp_lepsi) - self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1 # right: @@ -500,11 +491,6 @@ def _log_ML_join(self, ind, ret_opt_params=False): #breakpoint() ll_join = self.cache_LL[ptr] else: -# tmp_mui = self.mu_i_arr.copy() -# tmp_mui[ind[0]:ind[1]] = mu_share -# tmp_lepsi = self.lepsi_i_arr.copy() -# tmp_lepsi[ind[0]:ind[1]] = lepsi_share -# ll_join = self.ll_cluster(tmp_mui, tmp_lepsi) ll_join = self.ll_cluster(mu_share, lepsi_share) # add to cache From 65829ad30b8813f2b8ca1ba64a21ecd3225546f6 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 17 May 2022 15:53:37 -0400 Subject: [PATCH 192/222] Index cov MCMC shards WRT seg index, not job number --- hapaseg/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 935ecbe..9a9a4ee 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -614,15 +614,15 @@ def main(): segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results() # save samples - with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f: + with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{seg_indices['allelic_cluster']}.pickle"), 'wb') as f: pickle.dump(cov_mcmc, f) - np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"), + np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{seg_indices['allelic_cluster']}.npz"), seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples) # # save visualization # cov_mcmc.visualize_cluster_samples( -# os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png")) +# os.path.join(output_dir, f"cov_mcmc_seg_{seg_indices['allelic_cluster']}_visual.png")) elif args.command == "collect_cov_mcmc": if args.coverage_dir: From 9fc6ac3a903bb7e8ec3f86c224cfc1b53aa0b4c2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 17 May 2022 16:36:59 -0400 Subject: [PATCH 193/222] Fix segment likelihood computation function --- hapaseg/NB_coverage_MCMC.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index 56a6d4c..ac93aa7 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -84,8 +84,17 @@ def get_seg_ind(self, seg): def get_join_seg_ind(self, seg_l, seg_r): return seg_l, seg_r + self.segment_lens[seg_r] + # get overall likelihood for all segments def get_ll(self): - return self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr, True) + bdy = np.r_[list(self.segments), len(self.r)]; bdy = np.c_[bdy[:-1], bdy[1:]] + ll = 0 + for st, en in bdy: + # lookup in cache + if (ptr := self.cache_LL_ptr[st, en]) != 0: + ll += self.cache_LL[ptr] + else: + ll += self.ll_cluster([st, en], self.mu_i_arr[st:en], self.lepsi_i_arr[st:en], True) + return ll # read in the merged cluster assignments from burnin scatter jobs and # fill in data structures for cluster mcmc accordingly @@ -186,16 +195,16 @@ def refit_beta(self): self.lepsi_i_arr[row[0]:row[1]] = lepsi_i # method for calculating the overall log likelihood of an allelic cluster given a hypothetical mu_i and lepsi arrays - def ll_cluster(self, mu_i_arr, lepsi_i_arr, take_sum=True): - mu_i_arr = mu_i_arr.flatten() - epsi_i_arr = np.exp(lepsi_i_arr).flatten() + def ll_cluster(self, ind, mu_i, lepsi_i, take_sum=True): + epsi_i = np.exp(lepsi_i) exposure= np.log(self.bin_exposure) - bc = (self.C @ self.beta).flatten() + exposure - exp = np.exp(self.mu + bc + mu_i_arr).flatten() + bc = (self.C[ind[0]:ind[1]] @ self.beta).flatten() + exposure + exp = np.exp(self.mu + bc + mu_i).flatten() + r_subset = self.r[ind[0]:ind[1]] - lls = (ss.gammaln(self.r + epsi_i_arr) - ss.gammaln(self.r + 1) - ss.gammaln(epsi_i_arr) + - (self.r * (self.mu + bc + mu_i_arr - np.log(epsi_i_arr + exp))) + - (epsi_i_arr * np.log(epsi_i_arr / (epsi_i_arr + exp)))) + lls = (ss.gammaln(r_subset + epsi_i) - ss.gammaln(r_subset + 1) - ss.gammaln(epsi_i) + + (r_subset * (self.mu + bc + mu_i - np.log(epsi_i + exp))) + + (epsi_i * np.log(epsi_i / (epsi_i + exp)))) if not take_sum: return lls return lls.sum() @@ -297,6 +306,8 @@ def _calculate_splits(self, ind, split_indices): Hs = [] for ix in split_indices: if ix < 0: + # what do we do here WRT ll_cluster indices? FIXME + breakpoint() # no split proposal ll_join = self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr) lls.append(ll_join) @@ -317,7 +328,7 @@ def _calculate_splits(self, ind, split_indices): #breakpoint() ll_l = self.cache_LL[ptr] else: - ll_l = self.ll_cluster(mu_l, lepsi_l) + ll_l = self.ll_cluster([ind[0], ix], mu_l, lepsi_l) self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1 # right: @@ -325,7 +336,7 @@ def _calculate_splits(self, ind, split_indices): #breakpoint() ll_r = self.cache_LL[ptr] else: - ll_r = self.ll_cluster(mu_r, lepsi_r) + ll_r = self.ll_cluster([ix, ind[1]], mu_r, lepsi_r) self.cache_LL.append(ll_r); self.cache_LL_ptr[ix, ind[1]] = len(self.cache_LL) - 1 lls.append(ll_l + ll_r) @@ -491,7 +502,7 @@ def _log_ML_join(self, ind, ret_opt_params=False): #breakpoint() ll_join = self.cache_LL[ptr] else: - ll_join = self.ll_cluster(mu_share, lepsi_share) + ll_join = self.ll_cluster(ind, mu_share, lepsi_share) # add to cache self.cache_LL.append(ll_join); self.cache_LL_ptr[ind[0], ind[1]] = len(self.cache_LL) - 1 @@ -526,7 +537,7 @@ def split(self, debug): max_ML = max(log_MLs) k_probs = np.exp(log_MLs - max_ML) / np.exp(log_MLs - max_ML).sum() - + if np.isnan(k_probs).any(): print("skipping split iteration due to nan. log MLs: ", log_MLs, flush=True) return 0 From c818a01f330b2401a9d9dc7b45525ed74f7840c2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 19 May 2022 12:18:14 -0400 Subject: [PATCH 194/222] Fix bug in 65829ad --- hapaseg/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 9a9a4ee..935ecbe 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -614,15 +614,15 @@ def main(): segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results() # save samples - with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{seg_indices['allelic_cluster']}.pickle"), 'wb') as f: + with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f: pickle.dump(cov_mcmc, f) - np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{seg_indices['allelic_cluster']}.npz"), + np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"), seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples) # # save visualization # cov_mcmc.visualize_cluster_samples( -# os.path.join(output_dir, f"cov_mcmc_seg_{seg_indices['allelic_cluster']}_visual.png")) +# os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png")) elif args.command == "collect_cov_mcmc": if args.coverage_dir: From 36b823a3ea4990cc53df6622c02f0d441b977484 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 19 May 2022 12:42:34 -0400 Subject: [PATCH 195/222] Add back log transform to covariates --- hapaseg/run_coverage_MCMC.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 7eca2cb..3c4ca12 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -102,17 +102,18 @@ def generate_GC(self): def load_covariates(self): + zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) + ## Target size # we only need bin size if doing exomes but we can check by looking at the bin lengths self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1) + self.full_cov_df["C_log_len_z"] = zt(self.full_cov_df["C_log_len"]) # in case we are doing wgs these will all be the same and we must remove if (np.diff(self.full_cov_df["C_log_len"]) == 0).all(): #remove the len col since it will ruin beta fitting - self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1) - - zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) + self.full_cov_df = self.full_cov_df.drop(['C_log_len', 'C_log_len_z'], axis=1) ## Fragment length @@ -120,7 +121,7 @@ def load_covariates(self): self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True) self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) - self.full_cov_df["C_frag_len_z"] = zt(self.full_cov_df["C_frag_len"]) + self.full_cov_df["C_frag_len_z"] = zt(np.log(self.full_cov_df["C_frag_len"])) # generate on 5x and 11x scales swv = np.lib.stride_tricks.sliding_window_view @@ -132,7 +133,7 @@ def load_covariates(self): conv = np.einsum('ij,ij->i', wt_sw, fl_sw) self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1) - self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(self.full_cov_df[f"C_frag_len_{scale}x"]) + self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(np.log(self.full_cov_df[f"C_frag_len_{scale}x"])) ### track-based covariates # use midpoint of coverage bins to map to intervals @@ -148,7 +149,7 @@ def load_covariates(self): self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values # z-transform - self.full_cov_df["C_RT_z"] = zt(self.full_cov_df["C_RT"]) + self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"])) ## GC content @@ -175,7 +176,7 @@ def load_covariates(self): self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values # z-transform - self.full_cov_df["C_FAIRE_z"] = zt(self.full_cov_df["C_FAIRE"]) + self.full_cov_df["C_FAIRE_z"] = zt(np.log(self.full_cov_df["C_FAIRE"] + 1)) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned From 7f6eb1fcc794a4b63c044c8cd2b41d44671e241b Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 23 May 2022 10:16:54 -0400 Subject: [PATCH 196/222] Initial commit of covMCMC gather --- hapaseg/run_coverage_MCMC.py | 79 +++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 3c4ca12..aa4f4f5 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -341,8 +341,11 @@ def nat_sort(lst): return sorted(lst, key=alphanum_key) -# function for collecting coverage mcmc results from each ADP cluster -def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1): +# function for collecting coverage mcmc results from each ADP segment +def aggregate_adp_segments(allelic_seg_groups_pickle, coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1): + S = pd.read_pickle(allelic_seg_groups_pickle) + S = S.rename_axis(index = "allelic_seg_idx").reset_index() + if coverage_dir is None and f_file_list is None: raise ValueError("need to pass in either coverage_dir or file_list txt file!") if coverage_dir is not None and f_file_list is not None: @@ -350,7 +353,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, # get results files from the directory provided or from the file list provided if coverage_dir is not None: - cluster_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_cluster_*'))) + adp_seg_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_*'))) cov_df = pd.read_pickle(os.path.join(coverage_dir, 'cov_df.pickle')) else: @@ -364,47 +367,67 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, to_add = l.rstrip('\n') if to_add != "nan": read_files.append(to_add) - cluster_files = nat_sort(read_files) + adp_seg_files = nat_sort(read_files) cov_df = pd.read_pickle(cov_df_pickle) - - clust_assignments = cov_df['allelic_cluster'].values - + + # make sure that number of results shards is consistent with shard indices + if len(adp_seg_files) != len(S): + raise ValueError("Number of ADP seg files does not match scatter shards!") + + # load in covMCMC segment boundaries and mu's for each ADP segment seg_results = [] mu_i_results = [] - # load data from each cluster - for data_path in cluster_files: - cluster_data = np.load(data_path) - seg_results.append(cluster_data['seg_samples']) - mu_i_results.append(cluster_data['mu_i_samples']) - + for f in adp_seg_files: + seg_data = np.load(f) + seg_results.append(seg_data['seg_samples']) + mu_i_results.append(seg_data['mu_i_samples']) + + S["seg_results"] = seg_results + S["mu_i_results"] = mu_i_results + num_draws = seg_results[0].shape[1] - num_clusters = len(seg_results) - # now we use these data to fill an overall coverage segmentation array + # create overall segmentation array coverage_segmentation = np.zeros((len(cov_df), num_draws)) mu_i_values = np.zeros((len(cov_df), num_draws)) + # loop over each cov MCMC draw + # TODO: only use maximum likelihood draw; CDP should be able to resegment for d in range(num_draws): - global_counter = 0 - for c in range(num_clusters): - cluster_mask = (clust_assignments == c) - coverage_segmentation[cluster_mask, d] = seg_results[c][:,d] + global_counter - mu_i_values[cluster_mask, d] = mu_i_results[c][:, d] - global_counter += len(np.unique(seg_results[c][:,d])) - - # generate data to re-compute global beta + n_tot_segs = 0 + # loop over ADP segments + for _, s in S.iterrows(): + seg_idxs = s["seg_results"][:, d] + coverage_segmentation[s["indices"], d] = seg_idxs + n_tot_segs + n_tot_segs += seg_idxs[-1] + 1 + + mu_i_values[s["indices"], d] = s["mu_i_results"][:, d] + + # TEMP HACK: for now, only take iteration with fewest number of segments + sidx = coverage_segmentation[-1, :].argmin() + coverage_segmentation = coverage_segmentation[:, [sidx]] + mu_i_values = mu_i_values[:, [sidx]] + + # remove short segments (<200Kb) + # TODO: remove segments not well-modeled by covariates + cov_df["cov_seg_idx"] = coverage_segmentation.astype(int) + long_seg_idx = cov_df.groupby("cov_seg_idx").apply(lambda x : (x.iloc[-1]["end"] - x.iloc[0]["start"]) > 2e5).rename("seg_OK") + cov_df = cov_df.merge(long_seg_idx, left_on = "cov_seg_idx", right_index = True) + + coverage_segmentation = coverage_segmentation[cov_df["seg_OK"], :] + mu_i_values = mu_i_values[cov_df["seg_OK"], :] + cov_df = cov_df.loc[cov_df["seg_OK"]] + + # recompute global beta r = np.c_[cov_df["covcorr"]] # we'll use the mu_is from the last segmentation sample - mu_is = mu_i_values[:,-1] - # compute new edogenous targets by subtracking out the mu_i values of the segments - # along with the bin exposure - endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1) + mu_is = mu_i_values[:, [-1]] # generate covars covar_columns = sorted(cov_df.columns[cov_df.columns.str.contains("^C_.*_z$")]) C = np.c_[cov_df[covar_columns]] # do regression - pois_regr = PoissonRegression(endog, C, np.ones(endog.shape)) + pois_regr = PoissonRegression(r, C, np.ones(r.shape), np.log(bin_width) + mu_is) mu_refit, beta_refit = pois_regr.fit() return coverage_segmentation, beta_refit From 6c43e331150d2036d054ff5d4d4f5b76debc4d50 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 2 Jun 2022 15:49:02 -0400 Subject: [PATCH 197/222] Save covMCMC likelihood samples --- hapaseg/NB_coverage_MCMC.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index ac93aa7..dd77611 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -802,6 +802,7 @@ def __init__(self, n_iter, r, C, mu, beta, bin_width=1): self.mu_i_samples = [] self.lepsi_i_samples = [] self.F_samples = [] + self.ll_samples = [] self.ll_cluster = 0 self.ll_iter = [] @@ -827,6 +828,7 @@ def save_sample(self): self.mu_i_samples.append(self.cluster.mu_i_arr.copy()) self.lepsi_i_samples.append(self.cluster.lepsi_i_arr.copy()) self.F_samples.append(self.cluster.F.copy()) + self.ll_samples.append(self.ll_cluster) def run(self, debug=False, From f2d933be3bb3596c5aebf34c7d3340586f75cdf5 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 7 Jun 2022 15:08:24 -0400 Subject: [PATCH 198/222] Better starting values for mu/beta --- hapaseg/model_optimizers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index 76e95c1..288d635 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -8,8 +8,8 @@ def __init__(self, r, C, Pi, log_exposure = 0): self.Pi = Pi self.log_exposure = log_exposure - self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.beta = np.ones([C.shape[1], 1]) + self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure + self.beta = np.zeros([C.shape[1], 1]) self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure) # mu gradient From dfa7681e5dc0ee2aed9306852c7bc79c4e034cec Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Mon, 23 May 2022 11:28:57 -0400 Subject: [PATCH 199/222] Return Poisson Hessian --- hapaseg/model_optimizers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index 288d635..bad14ed 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -54,3 +54,9 @@ def NR_poisson(self): def fit(self): self.NR_poisson() return self.mu, self.beta + + def hess(self): + hmu = self.hessmu() + hbeta = self.hessbeta() + hmubeta = self.hessmubeta() + return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] From 000b7fa1150a81424e616c0994c67113db7065ec Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 10 Jun 2022 11:47:16 -0400 Subject: [PATCH 200/222] Add offset to Poisson regression Offset is expected to be length of data vector --- hapaseg/model_optimizers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index bad14ed..f1fd32e 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -2,15 +2,16 @@ class PoissonRegression: - def __init__(self, r, C, Pi, log_exposure = 0): + def __init__(self, r, C, Pi, log_exposure = 0, log_offset = 0): self.r = r self.C = C self.Pi = Pi self.log_exposure = log_exposure + self.log_offset = log_offset self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure self.beta = np.zeros([C.shape[1], 1]) - self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure) + self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) # mu gradient def gradmu(self): @@ -34,7 +35,7 @@ def hessmubeta(self): def NR_poisson(self): for i in range(100): - self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure) + self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) gmu = self.gradmu() gbeta = self.gradbeta() grad = np.r_[gmu, gbeta] From d41cdd6145387879c23198c85be66c012d4c2e89 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 10 Jun 2022 11:47:40 -0400 Subject: [PATCH 201/222] Add simple normal prior to Poisson regression --- hapaseg/model_optimizers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index f1fd32e..efed17f 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -13,21 +13,27 @@ def __init__(self, r, C, Pi, log_exposure = 0, log_offset = 0): self.beta = np.zeros([C.shape[1], 1]) self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) + # prior parameters + self.mumu = 0 + self.musig2 = 1 + self.betamu = np.zeros_like(self.beta) + self.betasiginv = np.eye(len(self.beta)) + # mu gradient def gradmu(self): - return self.Pi.T @ (self.r - self.e_s) + return self.Pi.T @ (self.r - self.e_s) - (self.mu - self.mumu)/self.musig2 # mu Hessian def hessmu(self): - return (-self.Pi.T * self.e_s.T) @ self.Pi + return (-self.Pi.T * self.e_s.T) @ self.Pi - 1/self.musig2 # beta gradient def gradbeta(self): - return self.C.T @ (self.r - self.e_s) + return self.C.T @ (self.r - self.e_s) - self.betasiginv@(self.beta - self.betamu) # beta Hessian def hessbeta(self): - return (-self.C.T * self.e_s.T) @ self.C + return (-self.C.T * self.e_s.T) @ self.C - self.betasiginv # mu,beta Hessian def hessmubeta(self): From 5718c707f232112b708b21f565f16f4ee4d69954 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 12:58:21 -0400 Subject: [PATCH 202/222] Allow PoisRegr priors to be specified; allow running without intercept --- hapaseg/model_optimizers.py | 50 +++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 16 deletions(-) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index efed17f..b4472d1 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -2,22 +2,25 @@ class PoissonRegression: - def __init__(self, r, C, Pi, log_exposure = 0, log_offset = 0): + def __init__(self, r, C, Pi, + log_exposure = 0, log_offset = 0, intercept = True, + mumu = 0, musig2 = 10, betamu = None, betasiginv = None): self.r = r self.C = C self.Pi = Pi self.log_exposure = log_exposure self.log_offset = log_offset + self.intercept = intercept self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure self.beta = np.zeros([C.shape[1], 1]) self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) # prior parameters - self.mumu = 0 - self.musig2 = 1 - self.betamu = np.zeros_like(self.beta) - self.betasiginv = np.eye(len(self.beta)) + self.mumu = mumu + self.musig2 = musig2 + self.betamu = np.zeros_like(self.beta) if betamu is None else betamu + self.betasiginv = 1/np.sqrt(10)*np.eye(len(self.beta)) if betasiginv is None else betasiginv # mu gradient def gradmu(self): @@ -42,28 +45,43 @@ def hessmubeta(self): def NR_poisson(self): for i in range(100): self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) - gmu = self.gradmu() gbeta = self.gradbeta() - grad = np.r_[gmu, gbeta] + if self.intercept: + gmu = self.gradmu() + grad = np.r_[gmu, gbeta] + else: + grad = gbeta - hmu = self.hessmu() hbeta = self.hessbeta() - hmubeta = self.hessmubeta() - H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + if self.intercept: + hmubeta = self.hessmubeta() + hmu = self.hessmu() + H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + else: + H = hbeta delta = np.linalg.inv(H) @ grad - self.mu -= delta[0:len(self.mu)] - self.beta -= delta[len(self.mu):] + if self.intercept: + self.mu -= delta[0:len(self.mu)] + self.beta -= delta[len(self.mu):] + else: + self.beta -= delta if np.linalg.norm(grad) < 1e-5: break def fit(self): self.NR_poisson() - return self.mu, self.beta + if self.intercept: + return self.mu, self.beta + else: + return self.beta def hess(self): - hmu = self.hessmu() hbeta = self.hessbeta() - hmubeta = self.hessmubeta() - return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + if self.intercept: + hmu = self.hessmu() + hmubeta = self.hessmubeta() + return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + else: + return hbeta From 57e551a527ff2575773ae66b9e4565bbc902378a Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 13:25:28 -0400 Subject: [PATCH 203/222] Pass Poisson Hessian to downstream tasks --- hapaseg/__main__.py | 4 ++-- hapaseg/run_coverage_MCMC.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 935ecbe..51f1400 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -519,7 +519,7 @@ def main(): f_faire=args.faire_pickle, f_GC=args.gc_pickle, allelic_sample=args.allelic_sample) - Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster() + Pi, r, C, all_mu, global_beta, cov_df, adp_cluster, pois_hess = cov_mcmc_runner.prepare_single_cluster() ## create chunks for both burnin and scatter cov_df = cov_df.sort_values("start_g", ignore_index = True) @@ -533,7 +533,7 @@ def main(): ## save # regression matrices np.savez(os.path.join(output_dir, 'preprocess_data'), Pi=Pi, r=r, C=C, all_mu=all_mu, - global_beta=global_beta, adp_cluster=adp_cluster) + global_beta=global_beta, adp_cluster=adp_cluster, pois_hess=pois_hess) # coverage dataframe mapped cov_df.to_pickle(os.path.join(output_dir, 'cov_df.pickle')) # allelic segment indices into coverage dataframe diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index aa4f4f5..d1fdf8d 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -70,9 +70,10 @@ def prepare_single_cluster(self): Pi, r, C, filtered_cov_df = self.assign_clusters() pois_regr = PoissonRegression(r, C, Pi, log_exposure = np.log(self.bin_width)) all_mu, global_beta = pois_regr.fit() + pois_hess = pois_regr.hess() # save these results to a numpy object - return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample + return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample, pois_hess def load_coverage(self, coverage_csv): Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False) From 6502b35e9e29c1ff3ad4b11d329ac06ae19a612d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 13:26:25 -0400 Subject: [PATCH 204/222] Compute initial Poisson regression on ADP segment level Excised from 8f439ac --- hapaseg/run_coverage_MCMC.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index d1fdf8d..dfc5c7f 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -88,7 +88,7 @@ def load_coverage(self, coverage_csv): def load_SNPs(self, f_snps): SNPs = pd.read_pickle(f_snps) - SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False) + SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False).astype(int) return SNPs def generate_GC(self): @@ -187,19 +187,19 @@ def assign_clusters(self): clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample] clust_u, clust_uj = np.unique(clust_choice, return_inverse=True) clust_uj = clust_uj.reshape(clust_choice.shape) - cuj_max = clust_uj.max() + 1 self.SNPs["clust_choice"] = clust_uj - ## assign coverage intervals to allelic clusters and segments - # assignment probabilities of each coverage interval -> allelic cluster - Cov_clust_probs = np.zeros([len(self.full_cov_df), cuj_max]) - + ## assign coverage intervals to allelic clusters and segments # get allelic segment boundaries seg_bdy = np.r_[0, list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)] seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] self.SNPs["seg_idx"] = 0 for i, (st, en) in enumerate(seg_bdy): self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i + seg_max = self.SNPs["seg_idx"].max() + 1 + + # assignment probabilities of each coverage interval -> allelic segment + Cov_clust_probs = np.zeros([len(self.full_cov_df), seg_max]) # first compute assignment probabilities based on the SNPs within each bin # segments just get assigned to the maximum probability @@ -208,13 +208,13 @@ def assign_clusters(self): for targ, D in tqdm.tqdm(self.SNPs.groupby("tidx")[["clust_choice", "seg_idx"]]): clust_idx = D["clust_choice"].values seg_idx = D["seg_idx"].values - if len(clust_idx) == 1: - Cov_clust_probs[int(targ), clust_idx] = 1.0 - self.full_cov_df.at[int(targ), "seg_idx"] = seg_idx[0] + if len(seg_idx) == 1: + Cov_clust_probs[targ, seg_idx] = 1.0 + self.full_cov_df.at[targ, "seg_idx"] = seg_idx[0] else: - targ_clust_hist = np.bincount(clust_idx, minlength = cuj_max) - Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() - self.full_cov_df.at[int(targ), "seg_idx"] = np.bincount(seg_idx).argmax() + targ_clust_hist = np.bincount(seg_idx, minlength = seg_max) + Cov_clust_probs[targ, :] = targ_clust_hist / targ_clust_hist.sum() + self.full_cov_df.at[targ, "seg_idx"] = np.bincount(seg_idx).argmax() ## subset to targets containing SNPs overlap_idx = Cov_clust_probs.sum(1) > 0 From 2c69a1089e287f2392e414982e057e54eb829962 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 13:48:50 -0400 Subject: [PATCH 205/222] Bump covMCMC/prep dockers --- wolF/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 7668711..6311919 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -164,7 +164,7 @@ def script(self): "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v832" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853" resources = { "mem" : "15G" } @@ -221,7 +221,7 @@ def prolog(self): "cov_seg_figure": 'cov_mcmc_*_visual.png' } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v830" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853" resources = {"mem" : "10G"} class Hapaseg_collect_coverage_mcmc(wolf.Task): From fc9f1b5f0ef6a76cf1a7ebe372378b0da39cc7c2 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 13:58:03 -0400 Subject: [PATCH 206/222] Rename import --- hapaseg/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 51f1400..9798ed0 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -21,7 +21,7 @@ from . import utils as hs_utils from .NB_coverage_MCMC import NB_MCMC_SingleCluster -from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_clusters, aggregate_burnin_files +from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_adp_segments, aggregate_burnin_files from .coverage_DP import Coverage_DP from .a_cov_DP import generate_acdp_df, AllelicCoverage_DP From 602e432beef79c08ecfeade58a408834f5aa9cf3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 14:37:37 -0400 Subject: [PATCH 207/222] Fix SNP -> coverage bin mapping bug --- hapaseg/run_coverage_MCMC.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index dfc5c7f..8907b89 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -88,7 +88,7 @@ def load_coverage(self, coverage_csv): def load_SNPs(self, f_snps): SNPs = pd.read_pickle(f_snps) - SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False).astype(int) + mut.map_mutations_to_targets(SNPs, self.full_cov_df) return SNPs def generate_GC(self): @@ -205,7 +205,9 @@ def assign_clusters(self): # segments just get assigned to the maximum probability self.full_cov_df["seg_idx"] = -1 print("Mapping SNPs to targets ...", file = sys.stderr) - for targ, D in tqdm.tqdm(self.SNPs.groupby("tidx")[["clust_choice", "seg_idx"]]): + for targ, D in tqdm.tqdm(self.SNPs.groupby("targ_idx")[["clust_choice", "seg_idx"]]): + if targ == -1: # SNP does not overlap a coverage bin + continue clust_idx = D["clust_choice"].values seg_idx = D["seg_idx"].values if len(seg_idx) == 1: From 5cd7cf2190e09422c753d40c10b6f8739b4c69e8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 14:41:10 -0400 Subject: [PATCH 208/222] Bump docker --- wolF/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index 6311919..e49324d 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -164,7 +164,7 @@ def script(self): "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856" resources = { "mem" : "15G" } @@ -221,7 +221,7 @@ def prolog(self): "cov_seg_figure": 'cov_mcmc_*_visual.png' } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v853" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856" resources = {"mem" : "10G"} class Hapaseg_collect_coverage_mcmc(wolf.Task): From ad094cdd7e64e5b644c867059679ae28c342bcdf Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 15:43:36 -0400 Subject: [PATCH 209/222] Forgot to pass bin_width to coverage preprocessor --- hapaseg/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 9798ed0..583db50 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -518,7 +518,8 @@ def main(): f_repl=args.repl_pickle, f_faire=args.faire_pickle, f_GC=args.gc_pickle, - allelic_sample=args.allelic_sample) + allelic_sample=args.allelic_sample, + bin_width=args.bin_width) Pi, r, C, all_mu, global_beta, cov_df, adp_cluster, pois_hess = cov_mcmc_runner.prepare_single_cluster() ## create chunks for both burnin and scatter From a5487f14123b857ac0f6c89f6d44890f4a23d0bf Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Tue, 14 Jun 2022 15:45:09 -0400 Subject: [PATCH 210/222] Bump Docker --- wolF/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/tasks.py b/wolF/tasks.py index e49324d..8688199 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -164,7 +164,7 @@ def script(self): "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v858" resources = { "mem" : "15G" } From 0f2c8cdb083de5495431906f171cde46d4dcc1c8 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 30 Jun 2022 13:47:04 -0400 Subject: [PATCH 211/222] Add covariate scale factor --- hapaseg/model_optimizers.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index b4472d1..3284123 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -14,6 +14,7 @@ def __init__(self, r, C, Pi, self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure self.beta = np.zeros([C.shape[1], 1]) + self.f = 1 self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) # prior parameters @@ -85,3 +86,22 @@ def hess(self): return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] else: return hbeta + + # scale factor + def gradf(self): + return (self.C@self.beta).T@(self.r - self.e_s) + + def hessf(self): + CB = self.C@self.beta + return -(CB*self.e_s).T@CB + + def NR_f(self): + for i in range(100): + self.e_s = np.exp(self.f*self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) + gf = self.gradf() + hf = self.hessf() + + self.f -= gf/hf + + if np.linalg.norm(gf) < 1e-5: + break From 93c53cdead0d1d226f8bd5189922454637f2eb68 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sun, 17 Jul 2022 08:34:25 -0400 Subject: [PATCH 212/222] Use nonlinear GC model --- hapaseg/run_coverage_MCMC.py | 44 ++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 8907b89..548d856 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -164,8 +164,11 @@ def load_covariates(self): else: print("Computing GC content", file = sys.stderr) self.generate_GC() - - self.full_cov_df["C_GC_z"] = zt(self.full_cov_df["C_GC"]) + + # bin GC content with resolution proportional to the sqrt of the number of bins + self.full_cov_df["GC_bin"] = np.round(self.full_cov_df["C_GC"]*np.sqrt(len(self.full_cov_df))).astype(int) + + # we will subsequently transform GC content to reflect the coverage bias of that bin ## FAIRE @@ -264,13 +267,36 @@ def assign_clusters(self): Cov_overlap = Cov_overlap.loc[~bad_bins, :] Pi = filtered.copy() - Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1) - - r = np.c_[Cov_overlap["covcorr"]] - - covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) - ## making covariate matrix + ## making regressor vector/covariate matrix + + # scale regressor to reflect fragment counts + Cov_overlap["fragcorr"] = np.round(Cov_overlap["covcorr"]/Cov_overlap["C_frag_len"].mean()) + r = np.c_[Cov_overlap["fragcorr"]] + + # fit empirical GC correction model (we do this here because we only consider "good" coverage bins) + GC_b = [] # GC bin + N_gc = [] # total number of coverage intervals within GC bin + F_gc = [] # total number of fragments within GC bin + for _, cidx in Cov_overlap.groupby("allelic_cluster").indices.items(): + ngc = Cov_overlap.iloc[cidx].groupby("GC_bin").size() + fgc = Cov_overlap.iloc[cidx].groupby("GC_bin")["fragcorr"].sum() + GC_b.extend(ngc.index) + N_gc.extend(ngc) + F_gc.extend(fgc) + GC_b = np.r_[GC_b] + N_gc = np.r_[N_gc] + F_gc = np.r_[F_gc] + + # use quadratic model + v = np.polyfit(GC_b/np.sqrt(len(self.full_cov_df)), F_gc/N_gc, 2) + + Cov_overlap["C_GCtr"] = v[::-1]@((Cov_overlap["GC_bin"].values/np.sqrt(len(self.full_cov_df)))**np.c_[0:3]) + Cov_overlap.loc[Cov_overlap["C_GCtr"] < 0, "C_GCtr"] = 1 + #Cov_overlap["C_GCtr_z"] = (lambda x : (x - np.nanmean(x))/np.nanstd(x))(np.log(Cov_overlap["C_GCtr"])) + Cov_overlap["C_GCtr_z"] = np.log(Cov_overlap["C_GCtr"]) + + covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) C = np.c_[Cov_overlap[covar_columns]] ## dropping Nans @@ -294,7 +320,7 @@ def assign_clusters(self): Pi = Pi[:, Pi.sum(0) > 0] ## remove covariate outliers (+- 6 sigma) - covar_outlier_idx = (Cov_overlap.loc[:, covar_columns].abs() < 6).all(axis = 1) + covar_outlier_idx = (Cov_overlap.loc[:, set(covar_columns) - {"C_GCtr_z"}].abs() < 6).all(axis = 1) Cov_overlap = Cov_overlap.loc[covar_outlier_idx] Pi = Pi[covar_outlier_idx, :] r = r[covar_outlier_idx] From bfd6fa8b38773691840dc32047a56b169501963d Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Sun, 17 Jul 2022 08:34:45 -0400 Subject: [PATCH 213/222] Save allelic clusters for each ADP segment --- hapaseg/run_coverage_MCMC.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 548d856..28f9d8f 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -207,6 +207,7 @@ def assign_clusters(self): # first compute assignment probabilities based on the SNPs within each bin # segments just get assigned to the maximum probability self.full_cov_df["seg_idx"] = -1 + self.full_cov_df["allelic_cluster"] = -1 print("Mapping SNPs to targets ...", file = sys.stderr) for targ, D in tqdm.tqdm(self.SNPs.groupby("targ_idx")[["clust_choice", "seg_idx"]]): if targ == -1: # SNP does not overlap a coverage bin @@ -216,10 +217,12 @@ def assign_clusters(self): if len(seg_idx) == 1: Cov_clust_probs[targ, seg_idx] = 1.0 self.full_cov_df.at[targ, "seg_idx"] = seg_idx[0] + self.full_cov_df.at[targ, "allelic_cluster"] = clust_idx[0] else: targ_clust_hist = np.bincount(seg_idx, minlength = seg_max) Cov_clust_probs[targ, :] = targ_clust_hist / targ_clust_hist.sum() self.full_cov_df.at[targ, "seg_idx"] = np.bincount(seg_idx).argmax() + self.full_cov_df.at[targ, "allelic_cluster"] = np.bincount(clust_idx).argmax() ## subset to targets containing SNPs overlap_idx = Cov_clust_probs.sum(1) > 0 From 0858136066e55e1560862afb03e11ac4d2ccf8f4 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Fri, 22 Jul 2022 22:33:31 -0400 Subject: [PATCH 214/222] Add sim forcecalling workflow --- 85_simFC.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 85_simFC.py diff --git a/85_simFC.py b/85_simFC.py new file mode 100644 index 0000000..b9ebcf9 --- /dev/null +++ b/85_simFC.py @@ -0,0 +1,66 @@ +import wolf + +mutect = wolf.ImportTask("github.com:getzlab/MuTect1_TOOL.git", "M1") + +def workflow( + bam, bai, vcf, + refFasta = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa", + refFastaIdx = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa.fai", + refFastaDict = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.dict" +): + localize = wolf.LocalizeToDisk( + files = { + "bam" : bam, + "bai" : bai, + "vcf" : vcf, + "refFasta" : refFasta, + "refFastaIdx" : refFastaIdx, + "refFastaDict" : refFastaDict + } + ) + + split_vcf = wolf.Task( + name = "split_vcf", + inputs = { "vcf" : localize["vcf"] }, + script = """ + grep '^#' ${vcf} > header + sed '/^#/d' ${vcf} | split -l 10000 -d -a 3 --filter='cat header /dev/stdin > $FILE' - VCF_chunk + """, + outputs = { "shards" : "VCF_chunk*" } + ) + + m1_scatter = mutect.mutect1( + inputs = { + "pairName" : "platinum", + "caseName" : "platinum", + "t_bam" : localize["bam"], + "t_bai" : localize["bai"], + "force_calling" : True, + "intervals" : split_vcf["shards"], + "fracContam" : 0, + "refFasta" : localize["refFasta"], + "refFastaIdx" : localize["refFastaIdx"], + "refFastaDict" : localize["refFastaDict"] + } + ) + + m1_gather = wolf.Task( + name = "m1_gather", + inputs = { "callstats_array" : [m1_scatter["mutect1_cs"]] }, + script = """ + head -n2 $(head -n1 ${callstats_array}) > header + while read -r i; do + sed '1,2d' $i + done < ${callstats_array} | sort -k1,1V -k2,2n > cs_sorted + cat header cs_sorted > cs_concat.tsv + """, + outputs = { "cs_gather" : "cs_concat.tsv" } + ) + +with wolf.Workflow(workflow = workflow, namespace = "HS_sim") as w: + w.run( + RUN_NAME = "NA12878_WGS_platinum_hg38", + bam = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam", + bai = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam.bai", + vcf = "gs://jh-xfer/NA12878.vcf" + ) From 181fff84e89d7ca4ed012bd1e41833f81ba5d0b1 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:09:44 -0400 Subject: [PATCH 215/222] Explorations on quadratic GC content estimator --- 71_coverage_covariates.py | 122 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py index 137dcad..7a5b2a3 100644 --- a/71_coverage_covariates.py +++ b/71_coverage_covariates.py @@ -72,6 +72,7 @@ # # GC content {{{ +## precompute GC content {{{ # note: this is obsolete; GC content is now computed on the fly B = pd.read_csv("/mnt/j/proj/cnv/20210326_coverage_collector/targets.bed", sep = "\t", header = None, names = ["chr", "start", "end"]) @@ -88,6 +89,127 @@ # }}} +# Terry Speed GC content estimator {{{ + +import hapaseg.run_coverage_MCMC + +# load coverage + +args = lambda : None +args.coverage_csv = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/coverage_cat.bed" +args.allelic_clusters_object = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/allelic_DP_SNP_clusts_and_phase_assignments.npz" +args.SNPs_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/all_SNPs.pickle" +args.segmentations_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/segmentations.pickle" +args.repl_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/GSE137764_H1.hg19_liftover.pickle" +args.faire_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/FAIRE_GM12878.hg19.pickle" +args.ref_fasta = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/Homo_sapiens_assembly19.fasta" +args.bin_width = 2000 + +cov_mcmc_runner = hapaseg.run_coverage_MCMC.CoverageMCMCRunner( + args.coverage_csv, + args.allelic_clusters_object, + args.SNPs_pickle, + args.segmentations_pickle, + f_repl=args.repl_pickle, + f_faire=args.faire_pickle, + # ref_fasta = "/mnt/j/db/hg38/ref/hg38.analysisSet.fa", # ALCH + ref_fasta = args.ref_fasta, #"/mnt/j/db/hg19/ref/hs37d5.fa", # Richter's + bin_width = args.bin_width +) +C = cov_mcmc_runner.full_cov_df + +# bin intervals by GC content +C["GC_bin"] = np.round(C["C_GC"]*1000).astype(int) +C["num_frags_corr"] = C["covcorr"]/C["C_frag_len"].mean() + +N_gc = C.groupby("GC_bin").size() +F_gc = C.groupby("GC_bin")["num_frags_corr"].sum() + +plt.figure(1); plt.clf() +plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1) + +cov_df = pd.read_pickle("/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--15-35-16_040rmzi_pid3cty_0w3oyu5xxnfwe/jobs/0/workspace/cov_df.pickle") +cov_df = cov_df.merge(C[["start_g", "C_GC"]], left_on = "start_g", right_on = "start_g") + +cov_df["GC_bin"] = np.round(cov_df["C_GC"]*1000).astype(int) +cov_df["num_frags_corr"] = cov_df["covcorr"]/cov_df["C_frag_len"].mean() + +N_gc = cov_df.groupby("GC_bin").size() +F_gc = cov_df.groupby("GC_bin")["num_frags_corr"].sum() + +cov_df = cov_df.merge((F_gc/N_gc).rename("C_GC_f"), left_on = cov_df["GC_bin"], right_index = True) + +import loess +_, y_l, _ = loess_1d.loess_1d(np.r_[N_gc.index], np.r_[F_gc/N_gc]) + +plt.figure(2); plt.clf() +plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1) +#plt.plot(N_gc.index, y_l) +r = np.linspace(0, 1000, 1000) +v = np.polyfit(np.r_[N_gc.index]/1000, F_gc/N_gc, 2) +plt.plot(r, v[::-1]@(r**np.c_[0:3])) +plt.ylim([0, 500]) + +from capy import plots + +plt.figure(3); plt.clf() +plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11) +plots.pixplot(v[::-1]@(cov_df["C_GC"].values**np.c_[0:3]), cov_df["num_frags_corr"], alpha = 0.11) + +gc_g = [] +N_gc_g = [] +F_gc_g = [] +plt.figure(4); plt.clf() +for _, cidx in cov_df.groupby("allelic_cluster").indices.items(): + N_gc = cov_df.iloc[cidx].groupby("GC_bin").size() + F_gc = cov_df.iloc[cidx].groupby("GC_bin")["num_frags_corr"].sum() + lplt = plt.scatter(N_gc.index, (F_gc/N_gc)/F_gc.sum(), marker = '.', s = 1) + + v = np.polyfit(N_gc.index, F_gc/N_gc, 2) + rng = np.linspace(0, 1000, 200) + plt.plot(rng, v[::-1]@(rng**np.c_[0:3]), color = lplt.get_edgecolor()) + + gc_g.extend(N_gc.index) + N_gc_g.extend(N_gc) + F_gc_g.extend(F_gc) + +N_gc_g = np.r_[N_gc_g] +F_gc_g = np.r_[F_gc_g] +gc_g = np.r_[gc_g] + +v = np.polyfit(gc_g, F_gc_g/N_gc_g, 2) +plt.plot(r, v[::-1]@(r**np.c_[0:3])) +_, y_l, _ = loess_1d.loess_1d(gc_g, F_gc_g/N_gc_g, xnew = r, degree = 2) +plt.plot(r, y_l) + +plt.figure(3); plt.clf() +_, y_l, _ = loess_1d.loess_1d(gc_g/1000, F_gc_g/N_gc_g, xnew = cov_df["C_GC"], degree = 2) +plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11) + +## simulate quadratic relationship +seg_sim = np.r_[np.ones([500, 1]), 1.5*np.ones([500, 1])].T +gc_sim = np.random.rand(1000)*0.6 + 0.2 +rng = np.linspace(0, 1, 100) +x = stats.poisson.rvs(np.exp(-30*(gc_sim - 0.5)**2 + 5*seg_sim))[:, None] +C = np.c_[gc_sim**2, gc_sim] + +import hapaseg.model_optimizers +PR = hapaseg.model_optimizers.PoissonRegression + +Pi = np.r_[np.tile([1, 0], [500, 1]), np.tile([0, 1], [500, 1])] +pois_regr = PR(x, C, Pi) +pois_regr.fit() +pois_regr2 = PR(x, C[:, [1]], Pi) +pois_regr2.fit() +plt.figure(2); plt.clf() +plt.scatter(x, np.exp(Pi@pois_regr.mu + C@pois_regr.beta), marker = '.', s = 1) +plt.scatter(x, np.exp(Pi@pois_regr2.mu + C[:, [1]]@pois_regr2.beta), marker = '.', s = 1) + + +# }}} + +# }}} + # # DNAse HS/FAIRE {{{ From 15796f4e42b526d0cba7ca8b2031380374bbcb98 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:10:31 -0400 Subject: [PATCH 216/222] Regenerate FAIRE tracks --- 71_coverage_covariates.py | 191 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py index 7a5b2a3..6012afa 100644 --- a/71_coverage_covariates.py +++ b/71_coverage_covariates.py @@ -275,6 +275,197 @@ FAIRE_smooth["FAIRE"] = np.convolve(FAIRE["FAIRE"], np.ones(5), mode = "same")/5 FAIRE_smooth.to_pickle("covars/FAIRE_GM12878.smooth5.hg19.pickle") +# +# re-process all FAIRE files using samtools +import wolf, itertools, glob, prefect + +## make interval list +clen = seq.get_chrlens() +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]): + bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] + tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1] }) + tmp.to_csv(f"FAIRE/intervals/{chrname}.bed", sep = "\t", header = None, index = False) + +## define coverage workflow + +class markdups(wolf.Task): + inputs = { "bamin" } + script = "samtools markdup ${bamin} $(basename ${bamin}).dedup.bam && samtools index *dedup.bam" + outputs = { "bam" : "*.bam", "bai" : "*.bai" } + docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5" + +intervals = glob.glob("/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE/intervals/*.bed") + +def BedCovFlow(bams, intervals): + # mark duplicates + mark_dups = [] + for b in bams: + mark_dups.append(markdups( + inputs = { "bamin" : b }, + overrides = { "bamin" : "string" }, + use_scratch_disk = True, + scratch_disk_size = 10 + )) + + # run bedcov on all BAMs (gather) + @prefect.task(nout = 2) + def bl(md): + return [m["bam"] for m in md], [m["bai"] for m in md] + bam_list, bai_list = bl(mark_dups) + + BedCov = wolf.Task( + name = "BedCov", + inputs = { "intervals" : intervals, "bams" : [bam_list], "bais" : [bai_list] }, + script = """ + samtools bedcov -Q1 ${intervals} $(cat ${bams}) > coverage.bed + """, + outputs = { "coverage" : "coverage.bed" }, + docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5" + ) +# for b in bam_list: +# wolf.DeleteDisk(b, BedCov["coverage"]) + + # gather BedCovs + BedCovGather = wolf.Task( + name = "BedCovGather", + inputs = { "beds" : [BedCov["coverage"]] }, + script = """ + cat $(cat ${beds}) | sort -k1,1V -k2,2n | \ + awk -F'\t' 'BEGIN { OFS = FS } { tot = 0; for(i = 4; i <= NF; i++) { tot += $i }; print $0, tot }' > concat.bed + """, + outputs = { "concat" : "concat.bed" }, + ) + +## run workflow + +base_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeOpenChromFaire/" + +BAMs = ["wgEncodeOpenChromFaireA549AlnRep1.bam", # {{{ +"wgEncodeOpenChromFaireA549AlnRep2.bam", +"wgEncodeOpenChromFaireAstrocyAlnRep1.bam", +"wgEncodeOpenChromFaireAstrocyAlnRep2.bam", +"wgEncodeOpenChromFaireColonocAlnRep1.bam", +"wgEncodeOpenChromFaireColonocAlnRep2.bam", +"wgEncodeOpenChromFaireEndometriumocAlnRep1.bam", +"wgEncodeOpenChromFaireEndometriumocAlnRep2.bam", +"wgEncodeOpenChromFaireFrontalcortexocAlnRep1.bam", +"wgEncodeOpenChromFaireFrontalcortexocAlnRep2.bam", +"wgEncodeOpenChromFaireGlioblaAlnRep1.bam", +"wgEncodeOpenChromFaireGlioblaAlnRep2.bam", +"wgEncodeOpenChromFaireGlioblaAlnRep3.bam", +"wgEncodeOpenChromFaireGm12878AlnRep1.bam", +"wgEncodeOpenChromFaireGm12878AlnRep2.bam", +"wgEncodeOpenChromFaireGm12878AlnRep3.bam", +"wgEncodeOpenChromFaireGm12891AlnRep1.bam", +"wgEncodeOpenChromFaireGm12891AlnRep2.bam", +"wgEncodeOpenChromFaireGm12892AlnRep1.bam", +"wgEncodeOpenChromFaireGm12892AlnRep2.bam", +"wgEncodeOpenChromFaireGm18507AlnRep1.bam", +"wgEncodeOpenChromFaireGm18507AlnRep2.bam", +"wgEncodeOpenChromFaireGm19239AlnRep1.bam", +"wgEncodeOpenChromFaireGm19239AlnRep2.bam", +"wgEncodeOpenChromFaireH1hescAlnRep1.bam", +"wgEncodeOpenChromFaireH1hescAlnRep2.bam", +"wgEncodeOpenChromFaireHelas3AlnRep1.bam", +"wgEncodeOpenChromFaireHelas3AlnRep2.bam", +"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep1.bam", +"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep2.bam", +"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep1.bam", +"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep2.bam", +"wgEncodeOpenChromFaireHepg2AlnRep1.bam", +"wgEncodeOpenChromFaireHepg2AlnRep2.bam", +"wgEncodeOpenChromFaireHepg2AlnRep3.bam", +"wgEncodeOpenChromFaireHtr8AlnRep1.bam", +"wgEncodeOpenChromFaireHtr8AlnRep2.bam", +"wgEncodeOpenChromFaireHuvecAlnRep1.bam", +"wgEncodeOpenChromFaireHuvecAlnRep2.bam", +"wgEncodeOpenChromFaireK562AlnRep1.bam", +"wgEncodeOpenChromFaireK562AlnRep2.bam", +"wgEncodeOpenChromFaireK562NabutAlnRep1.bam", +"wgEncodeOpenChromFaireK562NabutAlnRep2.bam", +"wgEncodeOpenChromFaireK562OhureaAlnRep1.bam", +"wgEncodeOpenChromFaireK562OhureaAlnRep2.bam", +"wgEncodeOpenChromFaireKidneyocAlnRep1.bam", +"wgEncodeOpenChromFaireKidneyocAlnRep2.bam", +"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep1.bam", +"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep2.bam", +"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep1.bam", +"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep2.bam", +"wgEncodeOpenChromFaireMcf7VehAlnRep1.bam", +"wgEncodeOpenChromFaireMcf7VehAlnRep2.bam", +"wgEncodeOpenChromFaireMedulloAlnRep1.bam", +"wgEncodeOpenChromFaireMedulloAlnRep2.bam", +"wgEncodeOpenChromFaireMrta2041AlnRep1.bam", +"wgEncodeOpenChromFaireMrta2041AlnRep2.bam", +"wgEncodeOpenChromFaireMrtg4016AlnRep1.bam", +"wgEncodeOpenChromFaireMrtg4016AlnRep2.bam", +"wgEncodeOpenChromFaireMrtttc549AlnRep1.bam", +"wgEncodeOpenChromFaireMrtttc549AlnRep2.bam", +"wgEncodeOpenChromFaireNhaAlnRep1.bam", +"wgEncodeOpenChromFaireNhaAlnRep2.bam", +"wgEncodeOpenChromFaireNhbeAlnRep1.bam", +"wgEncodeOpenChromFaireNhbeAlnRep2.bam", +"wgEncodeOpenChromFaireNhekAlnRep1.bam", +"wgEncodeOpenChromFaireNhekAlnRep2.bam", +"wgEncodeOpenChromFairePancreasocAlnRep1.bam", +"wgEncodeOpenChromFairePancreasocAlnRep2.bam", +"wgEncodeOpenChromFairePanisletsAlnRep1.bam", +"wgEncodeOpenChromFaireRcc7860AlnRep1.bam", +"wgEncodeOpenChromFaireRcc7860AlnRep2.bam", +"wgEncodeOpenChromFaireSmallintestineocAlnRep1.bam", +"wgEncodeOpenChromFaireSmallintestineocAlnRep2.bam", +"wgEncodeOpenChromFaireUrotsaAlnRep1.bam", +"wgEncodeOpenChromFaireUrotsaAlnRep2.bam", +"wgEncodeOpenChromFaireUrotsaUt189AlnRep1.bam", +"wgEncodeOpenChromFaireUrotsaUt189AlnRep2.bam"] # }}} + +B = pd.Series(BAMs).str.extract("(?P.*Faire(?P.*)AlnRep(?P\d+)\.bam)") + +with wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov") as w: + for cell_line, b in B.groupby("cell_line"): + w.run(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals) + +## parse in coverages; make covariate table +from capy import mut + +w = wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov") +for cell_line, b in B.groupby("cell_line"): + w.load_results(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals) + +T = w.tasks.loc[(slice(None), "BedCovGather"), ["results"]].droplevel(1) +T["covpath"] = T["results"].apply(lambda x : x["concat"]) + +for i, (cell_line, cov) in enumerate(T.iterrows()): + X = pd.read_csv(cov["covpath"], sep = "\t", header = None) + X = X.rename(columns = { len(X.columns) - 1 : cell_line }) + # get common lines + if i == 0: + C = X.iloc[:, np.r_[0:3, -1]].rename(columns = { 0 : "chr", 1 : "start", 2 : "end" }) + else: + C = pd.concat([C, X.iloc[:, -1]], axis = 1) + +C["chr"] = mut.convert_chr(C["chr"]) + +C.to_pickle("covars/FAIRE/coverage.dedup.raw.pickle") + +# rebin to 10k +C["index_r"] = C.index//5 +C10k = C.groupby(["chr", "index_r"]).agg({ + "start" : min, "end" : max, + **{ k : sum for k in C.columns[3:] } +}).droplevel(1).reset_index().drop(columns = "index_r") + +C10k.to_pickle("covars/FAIRE/coverage.dedup.raw.10kb.pickle") + +# 100k? +C["index_r"] = C.index//50 +C100k = C.groupby(["chr", "index_r"]).agg({ + "start" : min, "end" : max, + **{ k : sum for k in C.columns[3:] } +}).droplevel(1).reset_index().drop(columns = "index_r") + +C100k.to_pickle("covars/FAIRE/coverage.dedup.raw.100kb.pickle") + # }}} # }}} From 92546dc4f0111584c34644df713a0b03ad4be4b3 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:12:50 -0400 Subject: [PATCH 217/222] Use quadratic GC content on raw data rather than binned data --- hapaseg/run_coverage_MCMC.py | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 28f9d8f..13914ef 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -165,10 +165,8 @@ def load_covariates(self): print("Computing GC content", file = sys.stderr) self.generate_GC() - # bin GC content with resolution proportional to the sqrt of the number of bins - self.full_cov_df["GC_bin"] = np.round(self.full_cov_df["C_GC"]*np.sqrt(len(self.full_cov_df))).astype(int) - - # we will subsequently transform GC content to reflect the coverage bias of that bin + # GC content follows a roughly quadratic relationship with coverage + self.full_cov_df["C_GC2"] = self.full_cov_df["C_GC"]**2 ## FAIRE @@ -277,29 +275,8 @@ def assign_clusters(self): Cov_overlap["fragcorr"] = np.round(Cov_overlap["covcorr"]/Cov_overlap["C_frag_len"].mean()) r = np.c_[Cov_overlap["fragcorr"]] - # fit empirical GC correction model (we do this here because we only consider "good" coverage bins) - GC_b = [] # GC bin - N_gc = [] # total number of coverage intervals within GC bin - F_gc = [] # total number of fragments within GC bin - for _, cidx in Cov_overlap.groupby("allelic_cluster").indices.items(): - ngc = Cov_overlap.iloc[cidx].groupby("GC_bin").size() - fgc = Cov_overlap.iloc[cidx].groupby("GC_bin")["fragcorr"].sum() - GC_b.extend(ngc.index) - N_gc.extend(ngc) - F_gc.extend(fgc) - GC_b = np.r_[GC_b] - N_gc = np.r_[N_gc] - F_gc = np.r_[F_gc] - - # use quadratic model - v = np.polyfit(GC_b/np.sqrt(len(self.full_cov_df)), F_gc/N_gc, 2) - - Cov_overlap["C_GCtr"] = v[::-1]@((Cov_overlap["GC_bin"].values/np.sqrt(len(self.full_cov_df)))**np.c_[0:3]) - Cov_overlap.loc[Cov_overlap["C_GCtr"] < 0, "C_GCtr"] = 1 - #Cov_overlap["C_GCtr_z"] = (lambda x : (x - np.nanmean(x))/np.nanstd(x))(np.log(Cov_overlap["C_GCtr"])) - Cov_overlap["C_GCtr_z"] = np.log(Cov_overlap["C_GCtr"]) - - covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$")]) + # make covariate matrix; use all z-transformed covariates + non-scaled GC content+GC^2 + covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$") | Cov_overlap.columns.str.contains("^C_GC")]) C = np.c_[Cov_overlap[covar_columns]] ## dropping Nans From 5107c3c22548fdc62828314bbdafc49ebb4ceb56 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:13:24 -0400 Subject: [PATCH 218/222] Load multiple FAIRE tracks --- hapaseg/run_coverage_MCMC.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 13914ef..7f400a4 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -172,13 +172,17 @@ def load_covariates(self): if self.f_faire is not None: F = pd.read_pickle(self.f_faire) + # map targets to FAIRE intervals tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") - self.full_cov_df['C_FAIRE'] = np.nan - self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, -1].values + F = F.loc[tidx].set_index(tidx.index).iloc[:, 3:].rename(columns = lambda x : "C_" + x) + self.full_cov_df = pd.concat([self.full_cov_df, F], axis = 1) # z-transform - self.full_cov_df["C_FAIRE_z"] = zt(np.log(self.full_cov_df["C_FAIRE"] + 1)) + self.full_cov_df = pd.concat([ + self.full_cov_df, + self.full_cov_df.loc[:, F.columns].apply(lambda x : zt(np.log(x + 1))).rename(columns = lambda x : x + "_z") + ], axis = 1) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned From 4af5402289d49cbcbdfd3daeb34861db83415837 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:13:49 -0400 Subject: [PATCH 219/222] Load in update covcollect format that counts bad reads --- hapaseg/run_coverage_MCMC.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 7f400a4..e880c13 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -76,7 +76,7 @@ def prepare_single_cluster(self): return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample, pois_hess def load_coverage(self, coverage_csv): - Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False) + Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_frags", "tot_reads", "fail_reads"], low_memory=False) Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions Cov["chr"] = mut.convert_chr(Cov["chr"]) Cov = Cov.loc[Cov["chr"] != 0] @@ -127,7 +127,7 @@ def load_covariates(self): # generate on 5x and 11x scales swv = np.lib.stride_tricks.sliding_window_view fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 - wt = self.full_cov_df["num_reads"].values + wt = self.full_cov_df["num_frags"].values for scale in [5, 11]: fl_sw = swv(np.pad(fl, scale//2), scale) wt_sw = swv(np.pad(wt, scale//2), scale) From 7bdb15e72c3b2d071a98d9b25253348c47380054 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:14:44 -0400 Subject: [PATCH 220/222] Use covcollect branch that tallies bad reads --- wolF/workflow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index 7bf7716..b97c24b 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -44,7 +44,8 @@ cov_collect = wolf.ImportTask( task_path = "git@github.com:getzlab/covcollect.git", - task_name = "covcollect" + task_name = "covcollect", + branch = "tot_reads" ) #### From 8695dc40a41a8b6ee136e736316e64971e3fac04 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:17:41 -0400 Subject: [PATCH 221/222] Draft code of force calling at het sites only --- wolF/workflow.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index b97c24b..c0b6038 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -261,6 +261,16 @@ def interval_gather(interval_files, primary_contigs): # otherwise, run M1 and get it from the BAM elif callstats_file is None and tumor_bam is not None and normal_bam is not None: + # split het sites file uniformly +# split_het_sites = wolf.Task( +# name = "split_het_sites", +# inputs = { "snp_list" : localization_task["common_snp_list"] }, +# script = """ +# sed '/^@/d' ${snp_list} | split -l 10000 -d -a 4 - snp_list_chunk +# """, +# outputs = { "snp_list_shards" : "snp_list_chunk*" } +# ) + m1_task = mutect1.mutect1(inputs=dict( pairName = "het_coverage", caseName = "tumor", @@ -278,8 +288,11 @@ def interval_gather(interval_files, primary_contigs): refFastaDict = localization_task["ref_fasta_dict"], intervals = split_intervals_task["interval_files"], + #intervals = split_het_sites["snp_list_shards"], + + exclude_chimeric = True#, - exclude_chimeric = True + #force_calling = True, )) hp_scatter = het_pulldown.get_het_coverage_from_callstats( From a8e965b5f8a9c1a8ab18fab47547a3e498ef3306 Mon Sep 17 00:00:00 2001 From: Julian Hess Date: Thu, 28 Jul 2022 17:18:43 -0400 Subject: [PATCH 222/222] Specify workflow path locally --- wolF/workflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wolF/workflow.py b/wolF/workflow.py index c0b6038..6ac1aa1 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -31,7 +31,7 @@ # for Hapaseg itself hapaseg = wolf.ImportTask( - task_path = "../", # TODO: make remote + task_path = ".", # TODO: make remote task_name = "hapaseg" )