diff --git a/.dockerignore b/.dockerignore index a160cc0..50af6bb 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,2 +1,5 @@ **/*.bam **/*.call_stats.txt +**/*.fw? +**/*.bigWig +**/*.npz diff --git a/21_genome.py b/21_genome.py index 2f6d76b..c364ac2 100644 --- a/21_genome.py +++ b/21_genome.py @@ -58,3 +58,26 @@ normal_bai = "gs://fc-secure-e2772064-386d-4911-b242-d6ade82bf172/360c5959-3827-4b24-92e3-d57dbc5de2f6/gdc_api_file_download/15788922-9cf8-4c83-8040-47fa60b7d374/call-download_file/98e061cd-0586-4e56-85fb-c6cc6688dbff_wgs_gdc_realn.bai", target_list = 200 ) + +# Richter's test (hg19) +import wolf +from wolF import workflow + +import dalmatian +wm = dalmatian.WorkspaceManager("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK") + +wic = wolf.fc.WorkspaceInputConnector("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK") +Pj = wic.get_pairs_as_joint_samples() + +with wolf.Workflow(workflow = workflow.workflow, namespace = "HapASeg_Richters") as w: + for pair, p in Pj.loc[Pj["sample_type_T"] == "Richter"].iterrows(): + w.run( + RUN_NAME = pair, + tumor_bam = p["output_bam_T"], + tumor_bai = p["output_bam_index_T"], + normal_bam = p["output_bam_N"], + normal_bai = p["output_bam_index_N"], + target_list = 2000, + ref_genome_build = "hg19" + ) + break diff --git a/40_het_selection.py b/40_het_selection.py new file mode 100644 index 0000000..852de78 --- /dev/null +++ b/40_het_selection.py @@ -0,0 +1,85 @@ +import colorama +import copy +import itertools +import matplotlib.pyplot as plt +import matplotlib as mpl +import ncls +import numpy as np +import numpy_groupies as npg +import pandas as pd +import scipy.stats as s +import scipy.sparse as sp +import scipy.special as ss +import sortedcontainers as sc + +plt.figure(1); plt.clf() +plt.figure(2); plt.clf() +plt.figure(30); plt.clf() +cut20_dens = {} +cut20_lod = {} +cut80_dens = {} +cut80_lod = {} +leg = [] +for depth in [15, 20, 30, 60, 80, 200]: + # simulate good hets + cov = s.poisson.rvs(depth, size = 10000) + A = s.binom.rvs(cov, 0.5) + B = cov - A + + # simulate bad hets + bad_cov = s.poisson.rvs(depth, size = 10000) + bad_frac = np.ones_like(bad_cov).astype(float) + for i in range(len(bad_frac)): + bad_frac[i] = np.random.choice([0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9]) + A_bad = s.binom.rvs(bad_cov, bad_frac) + B_bad = bad_cov - A_bad + + # old criterion: beta density between 0.6 and 0.4 + betafrac = np.diff(s.beta.cdf([0.4, 0.6], A[:, None] + 1, B[:, None] + 1)) + betafrac_bad = np.diff(s.beta.cdf([0.4, 0.6], A_bad[:, None] + 1, B_bad[:, None] + 1)) + + # new criterion: log-odds ratio + betalod = s.beta.logsf(0.5, A + 1, B + 1) - s.beta.logcdf(0.5, A + 1, B + 1) + betalod_bad = s.beta.logsf(0.5, A_bad + 1, B_bad + 1) - s.beta.logcdf(0.5, A_bad + 1, B_bad + 1) + + # ROC curves + dens_cdf = np.zeros([1000, 2]) + for i, cut in enumerate(np.linspace(0, 1, 1000)): + dens_cdf[i, 0] = (betafrac >= cut).mean() + dens_cdf[i, 1] = (betafrac_bad >= cut).mean() + + lod_cdf = np.zeros([1000, 2]) + for i, cut in enumerate(np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)): + lod_cdf[i, 0] = (np.abs(betalod) <= cut).mean() + lod_cdf[i, 1] = (np.abs(betalod_bad) <= cut).mean() + + plt.figure(30) + st = plt.step(dens_cdf[:, 1], dens_cdf[:, 0]) + color = st[0].get_color() + plt.step(lod_cdf[:, 1], lod_cdf[:, 0], color = color, linestyle = ":") + + leg.append(st) + + cut20_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 1] <= 0.2)[0]] + cut80_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 0] <= 0.8)[0]] + cut20_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[np.flatnonzero(lod_cdf[:, 1] >= 0.2)[0]] + cut80_lod_idx = np.flatnonzero(lod_cdf[:, 0] >= 0.8)[0] + cut80_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[cut80_lod_idx] + + plt.scatter(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], marker = 'x', color = color) + plt.text(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], "{0:.2f}".format(cut80_lod[depth]), color = color) + + plt.figure(1) + sc = plt.scatter(cov, betafrac, alpha = 0.1, s = 10) + plt.scatter(depth, np.diff(s.beta.cdf([0.4, 0.6], depth/2 + 1, depth/2 + 1)), color = color, marker = "x") + + cov_range = np.r_[cov.min():cov.max()] + cov_cum = np.nan*np.ones_like(cov_range) + for i, c in enumerate(cov_range): + cov_cum[i] = betafrac[cov >= c].mean() + + plt.figure(2) + plt.scatter(cov_range, cov_cum) + +plt.figure(3) +plt.legend([x[0] for x in leg], ["15x", "20x", "30x", "60x", "80x", "200x"]) diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py index 6c2bb1b..6012afa 100644 --- a/71_coverage_covariates.py +++ b/71_coverage_covariates.py @@ -1,11 +1,13 @@ import liftover +import numpy as np import pandas as pd import pyfaidx +import pyBigWig import tqdm -from capy import mut +from capy import mut, seq # -# replication timing +# replication timing {{{ F = pd.read_csv("/mnt/j/proj/cnv/20201018_hapseg2/covars/GSE137764_H1_GaussiansGSE137764_mooth_scaled_autosome.mat", sep = "\t", header = None).T.rename(columns = { 0 : "chr", 1 : "start", 2 : "end" }) F.iloc[:, 3:] = F.loc[:, 3:].astype(float) @@ -13,7 +15,7 @@ F["chr"] = mut.convert_chr(F["chr"]) F.to_pickle("covars/GSE137764_H1.hg38.pickle") -# liftover to hg19 +# liftover to hg19 {{{ F["chr_start_lift"] = 0 F["chr_end_lift"] = 0 F["start_lift"] = 0 @@ -63,8 +65,15 @@ (F["start_strand_lift"].notin(["+", "?"])) | \ (F["start_lift"] > F["end_lift"]) +# }}} + +# }}} + # -# GC content +# GC content {{{ + +## precompute GC content {{{ +# note: this is obsolete; GC content is now computed on the fly B = pd.read_csv("/mnt/j/proj/cnv/20210326_coverage_collector/targets.bed", sep = "\t", header = None, names = ["chr", "start", "end"]) B["chr"] = mut.convert_chr(B["chr"]) @@ -78,3 +87,385 @@ B.to_pickle("covars/GC.pickle") +# }}} + +# Terry Speed GC content estimator {{{ + +import hapaseg.run_coverage_MCMC + +# load coverage + +args = lambda : None +args.coverage_csv = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/coverage_cat.bed" +args.allelic_clusters_object = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/allelic_DP_SNP_clusts_and_phase_assignments.npz" +args.SNPs_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/all_SNPs.pickle" +args.segmentations_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/segmentations.pickle" +args.repl_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/GSE137764_H1.hg19_liftover.pickle" +args.faire_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/FAIRE_GM12878.hg19.pickle" +args.ref_fasta = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/Homo_sapiens_assembly19.fasta" +args.bin_width = 2000 + +cov_mcmc_runner = hapaseg.run_coverage_MCMC.CoverageMCMCRunner( + args.coverage_csv, + args.allelic_clusters_object, + args.SNPs_pickle, + args.segmentations_pickle, + f_repl=args.repl_pickle, + f_faire=args.faire_pickle, + # ref_fasta = "/mnt/j/db/hg38/ref/hg38.analysisSet.fa", # ALCH + ref_fasta = args.ref_fasta, #"/mnt/j/db/hg19/ref/hs37d5.fa", # Richter's + bin_width = args.bin_width +) +C = cov_mcmc_runner.full_cov_df + +# bin intervals by GC content +C["GC_bin"] = np.round(C["C_GC"]*1000).astype(int) +C["num_frags_corr"] = C["covcorr"]/C["C_frag_len"].mean() + +N_gc = C.groupby("GC_bin").size() +F_gc = C.groupby("GC_bin")["num_frags_corr"].sum() + +plt.figure(1); plt.clf() +plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1) + +cov_df = pd.read_pickle("/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--15-35-16_040rmzi_pid3cty_0w3oyu5xxnfwe/jobs/0/workspace/cov_df.pickle") +cov_df = cov_df.merge(C[["start_g", "C_GC"]], left_on = "start_g", right_on = "start_g") + +cov_df["GC_bin"] = np.round(cov_df["C_GC"]*1000).astype(int) +cov_df["num_frags_corr"] = cov_df["covcorr"]/cov_df["C_frag_len"].mean() + +N_gc = cov_df.groupby("GC_bin").size() +F_gc = cov_df.groupby("GC_bin")["num_frags_corr"].sum() + +cov_df = cov_df.merge((F_gc/N_gc).rename("C_GC_f"), left_on = cov_df["GC_bin"], right_index = True) + +import loess +_, y_l, _ = loess_1d.loess_1d(np.r_[N_gc.index], np.r_[F_gc/N_gc]) + +plt.figure(2); plt.clf() +plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1) +#plt.plot(N_gc.index, y_l) +r = np.linspace(0, 1000, 1000) +v = np.polyfit(np.r_[N_gc.index]/1000, F_gc/N_gc, 2) +plt.plot(r, v[::-1]@(r**np.c_[0:3])) +plt.ylim([0, 500]) + +from capy import plots + +plt.figure(3); plt.clf() +plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11) +plots.pixplot(v[::-1]@(cov_df["C_GC"].values**np.c_[0:3]), cov_df["num_frags_corr"], alpha = 0.11) + +gc_g = [] +N_gc_g = [] +F_gc_g = [] +plt.figure(4); plt.clf() +for _, cidx in cov_df.groupby("allelic_cluster").indices.items(): + N_gc = cov_df.iloc[cidx].groupby("GC_bin").size() + F_gc = cov_df.iloc[cidx].groupby("GC_bin")["num_frags_corr"].sum() + lplt = plt.scatter(N_gc.index, (F_gc/N_gc)/F_gc.sum(), marker = '.', s = 1) + + v = np.polyfit(N_gc.index, F_gc/N_gc, 2) + rng = np.linspace(0, 1000, 200) + plt.plot(rng, v[::-1]@(rng**np.c_[0:3]), color = lplt.get_edgecolor()) + + gc_g.extend(N_gc.index) + N_gc_g.extend(N_gc) + F_gc_g.extend(F_gc) + +N_gc_g = np.r_[N_gc_g] +F_gc_g = np.r_[F_gc_g] +gc_g = np.r_[gc_g] + +v = np.polyfit(gc_g, F_gc_g/N_gc_g, 2) +plt.plot(r, v[::-1]@(r**np.c_[0:3])) +_, y_l, _ = loess_1d.loess_1d(gc_g, F_gc_g/N_gc_g, xnew = r, degree = 2) +plt.plot(r, y_l) + +plt.figure(3); plt.clf() +_, y_l, _ = loess_1d.loess_1d(gc_g/1000, F_gc_g/N_gc_g, xnew = cov_df["C_GC"], degree = 2) +plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11) + +## simulate quadratic relationship +seg_sim = np.r_[np.ones([500, 1]), 1.5*np.ones([500, 1])].T +gc_sim = np.random.rand(1000)*0.6 + 0.2 +rng = np.linspace(0, 1, 100) +x = stats.poisson.rvs(np.exp(-30*(gc_sim - 0.5)**2 + 5*seg_sim))[:, None] +C = np.c_[gc_sim**2, gc_sim] + +import hapaseg.model_optimizers +PR = hapaseg.model_optimizers.PoissonRegression + +Pi = np.r_[np.tile([1, 0], [500, 1]), np.tile([0, 1], [500, 1])] +pois_regr = PR(x, C, Pi) +pois_regr.fit() +pois_regr2 = PR(x, C[:, [1]], Pi) +pois_regr2.fit() +plt.figure(2); plt.clf() +plt.scatter(x, np.exp(Pi@pois_regr.mu + C@pois_regr.beta), marker = '.', s = 1) +plt.scatter(x, np.exp(Pi@pois_regr2.mu + C[:, [1]]@pois_regr2.beta), marker = '.', s = 1) + + +# }}} + +# }}} + +# +# DNAse HS/FAIRE {{{ + +## DNAse {{{ + +bw = pyBigWig.open("covars/wgEncodeUwDnaseGm12878RawRep1.bigWig") + +# WGS (2kb chunks) +clen = seq.get_chrlens() +C = [] +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]): + bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] + tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "DNAse" : 0 }) + for j, (st, en) in enumerate(tqdm.tqdm(bins)): + tmp.loc[j, "DNAse"] = np.nanmean(np.r_[bw.values(chrname, st, en)]) + C.append(tmp) + +# preliminary results not so great; stick with FAIRE for now + +# TODO: liftover to hg38 + +# WES + +# }}} + +## FAIRE {{{ + +## convert bigWig to FWB + +# for some reason pyBigWig can't process this file +# bw = pyBigWig.open("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigwig") + +# use bigWig2FWB instead +# git clone git@github.com:getzlab/bigWig2FWB.git + +# figure out range of file +# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/bwtest +# ./getmax +# -> max = 5478 +# set scale factor to 11 +# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal + +## WGS +from capy import fwb, mut + +F = fwb.FWB("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb"); + +clen = seq.get_chrlens() +C = [] +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]): + bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] + tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "FAIRE" : 0 }) + for j, (st, en) in enumerate(tqdm.tqdm(bins)): + tmp.loc[j, "FAIRE"] = F.get(chrname, np.r_[st:en] + 1).mean() + C.append(tmp) + +FAIRE = pd.concat(C, ignore_index = True) +FAIRE["chr"] = mut.convert_chr(FAIRE["chr"]) +FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle") + +# smoothed version +FAIRE_smooth = FAIRE.copy() +FAIRE_smooth["FAIRE"] = np.convolve(FAIRE["FAIRE"], np.ones(5), mode = "same")/5 +FAIRE_smooth.to_pickle("covars/FAIRE_GM12878.smooth5.hg19.pickle") + +# +# re-process all FAIRE files using samtools +import wolf, itertools, glob, prefect + +## make interval list +clen = seq.get_chrlens() +for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]): + bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]] + tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1] }) + tmp.to_csv(f"FAIRE/intervals/{chrname}.bed", sep = "\t", header = None, index = False) + +## define coverage workflow + +class markdups(wolf.Task): + inputs = { "bamin" } + script = "samtools markdup ${bamin} $(basename ${bamin}).dedup.bam && samtools index *dedup.bam" + outputs = { "bam" : "*.bam", "bai" : "*.bai" } + docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5" + +intervals = glob.glob("/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE/intervals/*.bed") + +def BedCovFlow(bams, intervals): + # mark duplicates + mark_dups = [] + for b in bams: + mark_dups.append(markdups( + inputs = { "bamin" : b }, + overrides = { "bamin" : "string" }, + use_scratch_disk = True, + scratch_disk_size = 10 + )) + + # run bedcov on all BAMs (gather) + @prefect.task(nout = 2) + def bl(md): + return [m["bam"] for m in md], [m["bai"] for m in md] + bam_list, bai_list = bl(mark_dups) + + BedCov = wolf.Task( + name = "BedCov", + inputs = { "intervals" : intervals, "bams" : [bam_list], "bais" : [bai_list] }, + script = """ + samtools bedcov -Q1 ${intervals} $(cat ${bams}) > coverage.bed + """, + outputs = { "coverage" : "coverage.bed" }, + docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5" + ) +# for b in bam_list: +# wolf.DeleteDisk(b, BedCov["coverage"]) + + # gather BedCovs + BedCovGather = wolf.Task( + name = "BedCovGather", + inputs = { "beds" : [BedCov["coverage"]] }, + script = """ + cat $(cat ${beds}) | sort -k1,1V -k2,2n | \ + awk -F'\t' 'BEGIN { OFS = FS } { tot = 0; for(i = 4; i <= NF; i++) { tot += $i }; print $0, tot }' > concat.bed + """, + outputs = { "concat" : "concat.bed" }, + ) + +## run workflow + +base_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeOpenChromFaire/" + +BAMs = ["wgEncodeOpenChromFaireA549AlnRep1.bam", # {{{ +"wgEncodeOpenChromFaireA549AlnRep2.bam", +"wgEncodeOpenChromFaireAstrocyAlnRep1.bam", +"wgEncodeOpenChromFaireAstrocyAlnRep2.bam", +"wgEncodeOpenChromFaireColonocAlnRep1.bam", +"wgEncodeOpenChromFaireColonocAlnRep2.bam", +"wgEncodeOpenChromFaireEndometriumocAlnRep1.bam", +"wgEncodeOpenChromFaireEndometriumocAlnRep2.bam", +"wgEncodeOpenChromFaireFrontalcortexocAlnRep1.bam", +"wgEncodeOpenChromFaireFrontalcortexocAlnRep2.bam", +"wgEncodeOpenChromFaireGlioblaAlnRep1.bam", +"wgEncodeOpenChromFaireGlioblaAlnRep2.bam", +"wgEncodeOpenChromFaireGlioblaAlnRep3.bam", +"wgEncodeOpenChromFaireGm12878AlnRep1.bam", +"wgEncodeOpenChromFaireGm12878AlnRep2.bam", +"wgEncodeOpenChromFaireGm12878AlnRep3.bam", +"wgEncodeOpenChromFaireGm12891AlnRep1.bam", +"wgEncodeOpenChromFaireGm12891AlnRep2.bam", +"wgEncodeOpenChromFaireGm12892AlnRep1.bam", +"wgEncodeOpenChromFaireGm12892AlnRep2.bam", +"wgEncodeOpenChromFaireGm18507AlnRep1.bam", +"wgEncodeOpenChromFaireGm18507AlnRep2.bam", +"wgEncodeOpenChromFaireGm19239AlnRep1.bam", +"wgEncodeOpenChromFaireGm19239AlnRep2.bam", +"wgEncodeOpenChromFaireH1hescAlnRep1.bam", +"wgEncodeOpenChromFaireH1hescAlnRep2.bam", +"wgEncodeOpenChromFaireHelas3AlnRep1.bam", +"wgEncodeOpenChromFaireHelas3AlnRep2.bam", +"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep1.bam", +"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep2.bam", +"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep1.bam", +"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep2.bam", +"wgEncodeOpenChromFaireHepg2AlnRep1.bam", +"wgEncodeOpenChromFaireHepg2AlnRep2.bam", +"wgEncodeOpenChromFaireHepg2AlnRep3.bam", +"wgEncodeOpenChromFaireHtr8AlnRep1.bam", +"wgEncodeOpenChromFaireHtr8AlnRep2.bam", +"wgEncodeOpenChromFaireHuvecAlnRep1.bam", +"wgEncodeOpenChromFaireHuvecAlnRep2.bam", +"wgEncodeOpenChromFaireK562AlnRep1.bam", +"wgEncodeOpenChromFaireK562AlnRep2.bam", +"wgEncodeOpenChromFaireK562NabutAlnRep1.bam", +"wgEncodeOpenChromFaireK562NabutAlnRep2.bam", +"wgEncodeOpenChromFaireK562OhureaAlnRep1.bam", +"wgEncodeOpenChromFaireK562OhureaAlnRep2.bam", +"wgEncodeOpenChromFaireKidneyocAlnRep1.bam", +"wgEncodeOpenChromFaireKidneyocAlnRep2.bam", +"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep1.bam", +"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep2.bam", +"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep1.bam", +"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep2.bam", +"wgEncodeOpenChromFaireMcf7VehAlnRep1.bam", +"wgEncodeOpenChromFaireMcf7VehAlnRep2.bam", +"wgEncodeOpenChromFaireMedulloAlnRep1.bam", +"wgEncodeOpenChromFaireMedulloAlnRep2.bam", +"wgEncodeOpenChromFaireMrta2041AlnRep1.bam", +"wgEncodeOpenChromFaireMrta2041AlnRep2.bam", +"wgEncodeOpenChromFaireMrtg4016AlnRep1.bam", +"wgEncodeOpenChromFaireMrtg4016AlnRep2.bam", +"wgEncodeOpenChromFaireMrtttc549AlnRep1.bam", +"wgEncodeOpenChromFaireMrtttc549AlnRep2.bam", +"wgEncodeOpenChromFaireNhaAlnRep1.bam", +"wgEncodeOpenChromFaireNhaAlnRep2.bam", +"wgEncodeOpenChromFaireNhbeAlnRep1.bam", +"wgEncodeOpenChromFaireNhbeAlnRep2.bam", +"wgEncodeOpenChromFaireNhekAlnRep1.bam", +"wgEncodeOpenChromFaireNhekAlnRep2.bam", +"wgEncodeOpenChromFairePancreasocAlnRep1.bam", +"wgEncodeOpenChromFairePancreasocAlnRep2.bam", +"wgEncodeOpenChromFairePanisletsAlnRep1.bam", +"wgEncodeOpenChromFaireRcc7860AlnRep1.bam", +"wgEncodeOpenChromFaireRcc7860AlnRep2.bam", +"wgEncodeOpenChromFaireSmallintestineocAlnRep1.bam", +"wgEncodeOpenChromFaireSmallintestineocAlnRep2.bam", +"wgEncodeOpenChromFaireUrotsaAlnRep1.bam", +"wgEncodeOpenChromFaireUrotsaAlnRep2.bam", +"wgEncodeOpenChromFaireUrotsaUt189AlnRep1.bam", +"wgEncodeOpenChromFaireUrotsaUt189AlnRep2.bam"] # }}} + +B = pd.Series(BAMs).str.extract("(?P.*Faire(?P.*)AlnRep(?P\d+)\.bam)") + +with wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov") as w: + for cell_line, b in B.groupby("cell_line"): + w.run(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals) + +## parse in coverages; make covariate table +from capy import mut + +w = wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov") +for cell_line, b in B.groupby("cell_line"): + w.load_results(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals) + +T = w.tasks.loc[(slice(None), "BedCovGather"), ["results"]].droplevel(1) +T["covpath"] = T["results"].apply(lambda x : x["concat"]) + +for i, (cell_line, cov) in enumerate(T.iterrows()): + X = pd.read_csv(cov["covpath"], sep = "\t", header = None) + X = X.rename(columns = { len(X.columns) - 1 : cell_line }) + # get common lines + if i == 0: + C = X.iloc[:, np.r_[0:3, -1]].rename(columns = { 0 : "chr", 1 : "start", 2 : "end" }) + else: + C = pd.concat([C, X.iloc[:, -1]], axis = 1) + +C["chr"] = mut.convert_chr(C["chr"]) + +C.to_pickle("covars/FAIRE/coverage.dedup.raw.pickle") + +# rebin to 10k +C["index_r"] = C.index//5 +C10k = C.groupby(["chr", "index_r"]).agg({ + "start" : min, "end" : max, + **{ k : sum for k in C.columns[3:] } +}).droplevel(1).reset_index().drop(columns = "index_r") + +C10k.to_pickle("covars/FAIRE/coverage.dedup.raw.10kb.pickle") + +# 100k? +C["index_r"] = C.index//50 +C100k = C.groupby(["chr", "index_r"]).agg({ + "start" : min, "end" : max, + **{ k : sum for k in C.columns[3:] } +}).droplevel(1).reset_index().drop(columns = "index_r") + +C100k.to_pickle("covars/FAIRE/coverage.dedup.raw.100kb.pickle") + +# }}} + +# }}} diff --git a/85_simFC.py b/85_simFC.py new file mode 100644 index 0000000..b9ebcf9 --- /dev/null +++ b/85_simFC.py @@ -0,0 +1,66 @@ +import wolf + +mutect = wolf.ImportTask("github.com:getzlab/MuTect1_TOOL.git", "M1") + +def workflow( + bam, bai, vcf, + refFasta = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa", + refFastaIdx = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa.fai", + refFastaDict = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.dict" +): + localize = wolf.LocalizeToDisk( + files = { + "bam" : bam, + "bai" : bai, + "vcf" : vcf, + "refFasta" : refFasta, + "refFastaIdx" : refFastaIdx, + "refFastaDict" : refFastaDict + } + ) + + split_vcf = wolf.Task( + name = "split_vcf", + inputs = { "vcf" : localize["vcf"] }, + script = """ + grep '^#' ${vcf} > header + sed '/^#/d' ${vcf} | split -l 10000 -d -a 3 --filter='cat header /dev/stdin > $FILE' - VCF_chunk + """, + outputs = { "shards" : "VCF_chunk*" } + ) + + m1_scatter = mutect.mutect1( + inputs = { + "pairName" : "platinum", + "caseName" : "platinum", + "t_bam" : localize["bam"], + "t_bai" : localize["bai"], + "force_calling" : True, + "intervals" : split_vcf["shards"], + "fracContam" : 0, + "refFasta" : localize["refFasta"], + "refFastaIdx" : localize["refFastaIdx"], + "refFastaDict" : localize["refFastaDict"] + } + ) + + m1_gather = wolf.Task( + name = "m1_gather", + inputs = { "callstats_array" : [m1_scatter["mutect1_cs"]] }, + script = """ + head -n2 $(head -n1 ${callstats_array}) > header + while read -r i; do + sed '1,2d' $i + done < ${callstats_array} | sort -k1,1V -k2,2n > cs_sorted + cat header cs_sorted > cs_concat.tsv + """, + outputs = { "cs_gather" : "cs_concat.tsv" } + ) + +with wolf.Workflow(workflow = workflow, namespace = "HS_sim") as w: + w.run( + RUN_NAME = "NA12878_WGS_platinum_hg38", + bam = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam", + bai = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam.bai", + vcf = "gs://jh-xfer/NA12878.vcf" + ) diff --git a/Dockerfile b/Dockerfile index 9234955..249c37e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,8 +4,10 @@ WORKDIR /build # install dependencies RUN pip install sortedcontainers +ARG cache_invalidate=xxx RUN git clone https://github.com/getzlab/CApy.git && pip install ./CApy RUN pip install dask distributed +RUN pip install distinctipy # install hapaseg COPY setup.py . diff --git a/covars/getmax.c b/covars/getmax.c new file mode 100644 index 0000000..5bcc865 --- /dev/null +++ b/covars/getmax.c @@ -0,0 +1,15 @@ +#include +#include +#include +#include + +int main() { + FILE* x = fopen("wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb", "r"); + uint16_t max = 0; + uint16_t buf; + while(fread(&buf, 2, 1, x)) { + buf = __bswap_16(buf); + if(buf > max) { max = buf; printf("%d\n", max); } + } + return 0; +} diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py index 6275f39..dd77611 100644 --- a/hapaseg/NB_coverage_MCMC.py +++ b/hapaseg/NB_coverage_MCMC.py @@ -1,12 +1,14 @@ import numpy as np import scipy.special as ss import sortedcontainers as sc +import sys from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB import warnings from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning import matplotlib as mpl import matplotlib.pyplot as plt from scipy.signal import find_peaks +import scipy.sparse as sp from .model_optimizers import PoissonRegression # turn off warnings for statsmodels fitting @@ -48,8 +50,11 @@ def __init__(self, r, C, mu_0, beta_0, bin_width=1): self.segment_lens = sc.SortedDict([(0, len(self.r))]) # keep cache of previously computed breakpoints for fast splitting - # these breakpoints keys are in the form (st, en, breakpoint) - self.breakpoint_cache = {} + sz = tuple(np.r_[1, 1]*(len(r) + 1)) + self.cache_LL_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_LL = [] + self.cache_mu_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_mu = [] + self.cache_lepsi_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_lepsi = [] + self.cache_hess_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_hess = [] self.phase_history = [] self.F = sc.SortedList() @@ -79,8 +84,17 @@ def get_seg_ind(self, seg): def get_join_seg_ind(self, seg_l, seg_r): return seg_l, seg_r + self.segment_lens[seg_r] + # get overall likelihood for all segments def get_ll(self): - return self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr, True) + bdy = np.r_[list(self.segments), len(self.r)]; bdy = np.c_[bdy[:-1], bdy[1:]] + ll = 0 + for st, en in bdy: + # lookup in cache + if (ptr := self.cache_LL_ptr[st, en]) != 0: + ll += self.cache_LL[ptr] + else: + ll += self.ll_cluster([st, en], self.mu_i_arr[st:en], self.lepsi_i_arr[st:en], True) + return ll # read in the merged cluster assignments from burnin scatter jobs and # fill in data structures for cluster mcmc accordingly @@ -137,12 +151,28 @@ def stats_init(self): # statsmodels NB BFGS optimizer is more stable than NR so we will use it until migration to LNP def stats_optimizer(self, ind, ret_hess=False): + # cache hit; look up values + if self.cache_mu_ptr[ind[0], ind[1]] != 0: + #breakpoint() + mu = self.cache_mu[self.cache_mu_ptr[ind[0], ind[1]]] + lepsi = self.cache_lepsi[self.cache_lepsi_ptr[ind[0], ind[1]]] + if ret_hess: + return mu, lepsi, self.cache_hess[self.cache_hess_ptr[ind[0], ind[1]]] + else: + return mu, lepsi + + # cache miss; compute values endog = self.r[ind[0]:ind[1]].flatten() exog = np.ones(self.r[ind[0]:ind[1]].shape[0]) exposure = np.ones(self.r[ind[0]:ind[1]].shape[0]) * self.bin_exposure sNB = statsNB(endog, exog, exposure=exposure, offset=(self.C[ind[0]:ind[1]] @ self.beta).flatten() + self.mu) res = sNB.fit(disp=0) + # save to cache + self.cache_mu.append(res.params[0]); self.cache_mu_ptr[ind[0], ind[1]] = len(self.cache_mu) - 1 + self.cache_lepsi.append(-np.log(res.params[1])); self.cache_lepsi_ptr[ind[0], ind[1]] = len(self.cache_lepsi) - 1 + self.cache_hess.append(sNB.hessian(res.params)); self.cache_hess_ptr[ind[0], ind[1]] = len(self.cache_hess) - 1 + if ret_hess: return res.params[0], -np.log(res.params[1]), sNB.hessian(res.params) else: @@ -165,16 +195,16 @@ def refit_beta(self): self.lepsi_i_arr[row[0]:row[1]] = lepsi_i # method for calculating the overall log likelihood of an allelic cluster given a hypothetical mu_i and lepsi arrays - def ll_cluster(self, mu_i_arr, lepsi_i_arr, take_sum=True): - mu_i_arr = mu_i_arr.flatten() - epsi_i_arr = np.exp(lepsi_i_arr).flatten() + def ll_cluster(self, ind, mu_i, lepsi_i, take_sum=True): + epsi_i = np.exp(lepsi_i) exposure= np.log(self.bin_exposure) - bc = (self.C @ self.beta).flatten() + exposure - exp = np.exp(self.mu + bc + mu_i_arr).flatten() + bc = (self.C[ind[0]:ind[1]] @ self.beta).flatten() + exposure + exp = np.exp(self.mu + bc + mu_i).flatten() + r_subset = self.r[ind[0]:ind[1]] - lls = (ss.gammaln(self.r + epsi_i_arr) - ss.gammaln(self.r + 1) - ss.gammaln(epsi_i_arr) + - (self.r * (self.mu + bc + mu_i_arr - np.log(epsi_i_arr + exp))) + - (epsi_i_arr * np.log(epsi_i_arr / (epsi_i_arr + exp)))) + lls = (ss.gammaln(r_subset + epsi_i) - ss.gammaln(r_subset + 1) - ss.gammaln(epsi_i) + + (r_subset * (self.mu + bc + mu_i - np.log(epsi_i + exp))) + + (epsi_i * np.log(epsi_i / (epsi_i + exp)))) if not take_sum: return lls return lls.sum() @@ -276,6 +306,8 @@ def _calculate_splits(self, ind, split_indices): Hs = [] for ix in split_indices: if ix < 0: + # what do we do here WRT ll_cluster indices? FIXME + breakpoint() # no split proposal ll_join = self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr) lls.append(ll_join) @@ -290,15 +322,24 @@ def _calculate_splits(self, ind, split_indices): lepsis.append((lepsi_l, lepsi_r)) Hs.append((H_l, H_r)) - tmp_mui = self.mu_i_arr.copy() - tmp_mui[ind[0]:ix] = mu_l - tmp_mui[ix: ind[1]] = mu_r - tmp_lepsi = self.lepsi_i_arr.copy() - tmp_lepsi[ind[0]:ix] = lepsi_l - tmp_lepsi[ix: ind[1]] = lepsi_r + # lookup likelihoods in cache + # left: + if (ptr := self.cache_LL_ptr[ind[0], ix]) != 0: + #breakpoint() + ll_l = self.cache_LL[ptr] + else: + ll_l = self.ll_cluster([ind[0], ix], mu_l, lepsi_l) + self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1 + + # right: + if (ptr := self.cache_LL_ptr[ix, ind[1]]) != 0: + #breakpoint() + ll_r = self.cache_LL[ptr] + else: + ll_r = self.ll_cluster([ix, ind[1]], mu_r, lepsi_r) + self.cache_LL.append(ll_r); self.cache_LL_ptr[ix, ind[1]] = len(self.cache_LL) - 1 - ll = self.ll_cluster(tmp_mui, tmp_lepsi) - lls.append(ll) + lls.append(ll_l + ll_r) return lls, mus, lepsis, Hs @@ -412,7 +453,7 @@ def _detailed_sampling(self, ind, lls, split_indices, mus, lepsis, Hs): def _lls_to_MLs(self, lls, Hs): MLs = np.zeros(len(lls)) for i, (ll, Hs) in enumerate(zip(lls, Hs)): - laplacian = self._get_log_ML_split(Hs[0], Hs[1]) + laplacian = self._get_log_ML_gaussint_split(Hs[0], Hs[1]) # the split results in a nan make it impossible to split there if np.isnan(laplacian): laplacian = -1e50 @@ -444,25 +485,28 @@ def _get_split_liks(self, ind, debug=False): return split_indices, MLs, mus, lepsis - # computes ML component from hessian approximation for a single segment - def _get_log_ML_approx_join(self, Hess): + # computes Gaussian integral for ML Laplace approximation for a single segment + def _get_log_ML_gaussint_join(self, Hess): return np.log(2 * np.pi) - (np.log(np.linalg.det(-Hess))) / 2 - # computes ML component from hessian approximation for two split segments - def _get_log_ML_split(self, H1, H2): - return np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2 + # computes Gaussian integral for ML Laplace approximation for two split segments + def _get_log_ML_gaussint_split(self, H1, H2): + return 2*np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2 # computes the log ML of joining two segments def _log_ML_join(self, ind, ret_opt_params=False): mu_share, lepsi_share, H_share = self.stats_optimizer(ind, True) - tmp_mui = self.mu_i_arr.copy() - tmp_mui[ind[0]:ind[1]] = mu_share - tmp_lepsi = self.lepsi_i_arr.copy() - tmp_lepsi[ind[0]:ind[1]] = lepsi_share - ll_join = self.ll_cluster(tmp_mui, tmp_lepsi) - if ret_opt_params: - return mu_share, lepsi_share, self._get_log_ML_join(H_share) + ll_join - return mu_share, lepsi_share, self._get_log_ML_approx_join(H_share) + ll_join + + # lookup cache + if (ptr := self.cache_LL_ptr[ind[0], ind[1]]) != 0: + #breakpoint() + ll_join = self.cache_LL[ptr] + else: + ll_join = self.ll_cluster(ind, mu_share, lepsi_share) + + # add to cache + self.cache_LL.append(ll_join); self.cache_LL_ptr[ind[0], ind[1]] = len(self.cache_LL) - 1 + return mu_share, lepsi_share, self._get_log_ML_gaussint_join(H_share) + ll_join """ Split segment method. This method chooses a segment at random @@ -493,7 +537,7 @@ def split(self, debug): max_ML = max(log_MLs) k_probs = np.exp(log_MLs - max_ML) / np.exp(log_MLs - max_ML).sum() - + if np.isnan(k_probs).any(): print("skipping split iteration due to nan. log MLs: ", log_MLs, flush=True) return 0 @@ -545,7 +589,7 @@ def join(self, debug): ind = self.get_join_seg_ind(seg_l, seg_r) lls_split, _, _, Hs = self._calculate_splits(ind, [seg_r]) - log_split_ML = lls_split[0] + self._get_log_ML_split(Hs[0][0], Hs[0][1]) + log_split_ML = lls_split[0] + self._get_log_ML_gaussint_split(Hs[0][0], Hs[0][1]) mu_share, lepsi_share, log_join_ML = self._log_ML_join(ind) log_MLs = np.r_[log_split_ML, log_join_ML] @@ -738,13 +782,12 @@ def prepare_results(self): """ class NB_MCMC_SingleCluster: - def __init__(self, n_iter, r, C, mu, beta, cluster_num, bin_width=1): + def __init__(self, n_iter, r, C, mu, beta, bin_width=1): self.n_iter = n_iter self.r = r self.C = C self.beta = beta self.mu = mu - self.cluster_num = cluster_num self.bin_width = bin_width # for now assume that the Pi vector assigns each bin to exactly one cluster @@ -759,6 +802,7 @@ def __init__(self, n_iter, r, C, mu, beta, cluster_num, bin_width=1): self.mu_i_samples = [] self.lepsi_i_samples = [] self.F_samples = [] + self.ll_samples = [] self.ll_cluster = 0 self.ll_iter = [] @@ -784,11 +828,12 @@ def save_sample(self): self.mu_i_samples.append(self.cluster.mu_i_arr.copy()) self.lepsi_i_samples.append(self.cluster.lepsi_i_arr.copy()) self.F_samples.append(self.cluster.F.copy()) + self.ll_samples.append(self.ll_cluster) def run(self, debug=False, stop_after_burnin=False): - print("starting MCMC coverage segmentation for cluster {}...".format(self.cluster_num), flush=True) + print("Starting MCMC coverage segmentation ...", flush=True, file=sys.stderr) past_it = 0 n_it = 0 diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py index 32b19d4..583db50 100644 --- a/hapaseg/__main__.py +++ b/hapaseg/__main__.py @@ -9,6 +9,7 @@ import scipy.stats as s import scipy.special as ss import sortedcontainers as sc +import traceback from capy import mut @@ -20,7 +21,7 @@ from . import utils as hs_utils from .NB_coverage_MCMC import NB_MCMC_SingleCluster -from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_clusters, aggregate_burnin_files +from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_adp_segments, aggregate_burnin_files from .coverage_DP import Coverage_DP from .a_cov_DP import generate_acdp_df, AllelicCoverage_DP @@ -112,13 +113,9 @@ def parse_args(): required=True) ## DP dp = subparsers.add_parser("dp", help="Run DP clustering on allelic imbalance segments") - dp.add_argument("--seg_dataframe", required=True) - dp.add_argument("--n_dp_iter", default=10) - dp.add_argument("--seg_samp_idx", default=0) - dp.add_argument("--ref_fasta", - required=True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load - dp.add_argument("--cytoband_file", - required=True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load + dp.add_argument("--seg_dataframe", required = True) + dp.add_argument("--ref_fasta", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load + dp.add_argument("--cytoband_file", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load ## coverage MCMC coverage_mcmc = subparsers.add_parser("coverage_mcmc", @@ -128,6 +125,7 @@ def parse_args(): coverage_mcmc.add_argument("--allelic_clusters_object", help="npy file containing allelic dp segs-to-clusters results") coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs") + coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) coverage_mcmc.add_argument("--covariate_dir", help="path to covariate directory with covariates all in pickled files") coverage_mcmc.add_argument("--num_draws", type=int, @@ -148,24 +146,28 @@ def parse_args(): preprocess_coverage_mcmc.add_argument("--allelic_clusters_object", help="npy file containing allelic dp segs-to-clusters results", required=True) preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True) + preprocess_coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True) preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True) + preprocess_coverage_mcmc.add_argument("--faire_pickle", help="pickled dataframe containing FAIRE data", required=True) preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None) preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int, help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default", default=None) + preprocess_coverage_mcmc.add_argument("--ref_fasta", required = True) + preprocess_coverage_mcmc.add_argument("--bin_width", help = "Coverage bin width (for WGS only)", default = 1, type = int) ## running coverage mcmc on single cluster for scatter task coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard", help="run coverage mcmc on single ADP cluster") - coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data', + coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data: covariate matrix (C), global beta, ADP cluster mu\'s, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r)', required=True) + coverage_mcmc_shard.add_argument("--allelic_seg_indices", help='path to pickled pandas dataframe containing coverage bin indices for each alleic segment', + required=True) + coverage_mcmc_shard.add_argument("--allelic_seg_idx", help='which allelic segment to perform coverage segmentation on.', + required=True, type=int) coverage_mcmc_shard.add_argument("--num_draws", type=int, help="number of draws to take from coverage segmentation MCMC", default=50) - coverage_mcmc_shard.add_argument("--cluster_num", type=int, - help="cluster index for this worker to run on. If unspecified method will simulate " - "all clusters on the same machine", default=None) coverage_mcmc_shard.add_argument("--bin_width", type=int, default=1, help="size of uniform bins if using. Otherwise 1.") - coverage_mcmc_shard.add_argument("--range", type=str, help="range of coverage bins within the cluster to burnin. should be in start-end form. Note that this will cause num draws to be overridden to 1") coverage_mcmc_shard.add_argument("--burnin_files", type=str, help="txt file containing burnt in segment assignments") ## collect coverage MCMC shards @@ -208,6 +210,7 @@ def parse_args(): ac_dp.add_argument("--num_samples", type=int, help="number of samples to take") ac_dp.add_argument("--cytoband_file", help="path to cytoband txt file") ac_dp.add_argument("--warmstart", type=bool, default=True, help="run clustering with warmstart") + args = parser.parse_args() # validate arguments @@ -324,6 +327,13 @@ def main(): with open(output_dir + "/amcmc_results.pickle", "wb") as f: pickle.dump(H.run(), f) + try: + H.visualize() + plt.savefig(output_dir + "/figures/MLE_segmentation.png", dpi = 300) + except Exception: + print("Error plotting segments; see stack trace for details:") + print(traceback.format_exc()) + elif args.command == "concat": # # load scatter intervals @@ -436,79 +446,49 @@ def main(): A = A_DP(args.seg_dataframe, ref_fasta=args.ref_fasta) # run DP - # TODO: when we have better type checking, drop the int coersion here - # N_seg_samps = A.n_samp - 1 if int(args.n_seg_samps) == 0 else int(args.n_seg_samps) - # TODO: if we decide to drop support for chained sampling altogether, remove N_seg_samps logic altogether - snps_to_clusters, snps_to_phases, likelihoods = A.run( - seg_sample_idx=int(args.seg_samp_idx), - # N_seg_samps = N_seg_samps, - N_clust_samps=int(args.n_dp_iter) - ) + snps_to_clusters, snps_to_phases, likelihoods = A.run() # save DP results + # SNP assignment/phasing samples, likelihoods of each sample np.savez(output_dir + "/allelic_DP_SNP_clusts_and_phase_assignments.npz", snps_to_clusters=snps_to_clusters, snps_to_phases=snps_to_phases, likelihoods=likelihoods ) + # segmentation breakpoints for each sample + with open(output_dir + "/segmentations.pickle", "wb") as f: + pickle.dump(A.DP_run.segment_trace, f) + + # full SNP dataframe A.SNPs.to_pickle(output_dir + "/all_SNPs.pickle") # # plot DP results - # 1. phased SNP visualization - f = plt.figure(figsize=[17.56, 5.67]) - hs_utils.plot_chrbdy(args.cytoband_file) - A.visualize_SNPs(snps_to_phases, color=True, f=f) - A.visualize_clusts(snps_to_clusters, f=f, thick=True, nocolor=True) - plt.ylabel("Haplotypic imbalance") - plt.title("SNP phasing/segmentation") - plt.savefig(output_dir + "/figures/SNPs.png", dpi=300) - plt.close() + # 0. likelihood trace + A.DP_run.plot_likelihood_trace() + plt.savefig(output_dir + "/figures/likelihood_trace.png", dpi = 300) - # 2. pre-clustering segments - f = plt.figure(figsize=[17.56, 5.67]) + # 1. SNPs + segments + f = plt.figure(figsize = [17.56, 5.67]) hs_utils.plot_chrbdy(args.cytoband_file) - A.visualize_SNPs(snps_to_phases, color=False, f=f) - A.visualize_segs(snps_to_clusters, f=f) + A.DP_run.visualize_segs(f = f, show_snps = True) plt.ylabel("Haplotypic imbalance") - plt.title("Allelic segmentation, pre-DP clustering") - plt.savefig(output_dir + "/figures/allelic_imbalance_preDP.png", dpi=300) + plt.title("SNPs + allelic segmentation (MAP)") + plt.savefig(output_dir + "/figures/SNPs.png", dpi = 300) plt.close() - # 3. post-clustering segments - f = plt.figure(figsize=[17.56, 5.67]) + # 2. segments alone + f = plt.figure(figsize = [17.56, 5.67]) hs_utils.plot_chrbdy(args.cytoband_file) - A.visualize_SNPs(snps_to_phases, color=False, f=f) - A.visualize_clusts(snps_to_clusters, f=f, thick=True) + A.DP_run.visualize_segs(f = f, show_snps = False) plt.ylabel("Haplotypic imbalance") - plt.title("Allelic segmentation, post-DP clustering") - plt.savefig(output_dir + "/figures/allelic_imbalance_postDP.png", dpi=300) + plt.title("Allelic segmentation (posterior)") + plt.savefig(output_dir + "/figures/segs_only.png", dpi = 300) plt.close() - - #collect adp run data - elif args.command == "collect_adp": - with open(args.dp_results, 'r') as f: - dp_results = f.readlines() - accum_clusts = [] - accum_phases = [] - accum_liks = [] - - for dp_shard in dp_results: - obj = np.load(dp_shard.rstrip('\n')) - accum_clusts.append(obj['snps_to_clusters']) - accum_phases.append(obj['snps_to_phases']) - accum_liks.append(obj['likelihoods']) - all_clusts = np.vstack(accum_clusts) - all_phases = np.vstack(accum_phases) - all_liks = np.vstack(accum_liks) - # save - np.savez(os.path.join(output_dir, "full_dp_results"), snps_to_clusters=all_clusts, snps_to_phases=all_phases, likelihoods=all_liks) - ## running coverage mcmc on all clusters - elif args.command == "coverage_mcmc": cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv, args.allelic_clusters_object, @@ -529,70 +509,105 @@ def main(): ## preprocess ADP data to run scattered coverage mcmc jobs on each ADP cluster elif args.command == "coverage_mcmc_preprocess": + ## perform initial Poisson regression cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv, args.allelic_clusters_object, args.SNPs_pickle, + args.segmentations_pickle, + args.ref_fasta, f_repl=args.repl_pickle, + f_faire=args.faire_pickle, f_GC=args.gc_pickle, - allelic_sample=args.allelic_sample) - Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster() + allelic_sample=args.allelic_sample, + bin_width=args.bin_width) + Pi, r, C, all_mu, global_beta, cov_df, adp_cluster, pois_hess = cov_mcmc_runner.prepare_single_cluster() + + ## create chunks for both burnin and scatter + cov_df = cov_df.sort_values("start_g", ignore_index = True) + + # indices of coverage bins + seg_g = cov_df.groupby("seg_idx") # NOTE: seg_idx may not be contiguous if any allelic segments were dropped + seg_g_idx = pd.Series(seg_g.indices).to_frame(name = "indices") + seg_g_idx["allelic_cluster"] = seg_g["allelic_cluster"].first() + seg_g_idx["n_cov_bins"] = seg_g.size() + + ## save + # regression matrices np.savez(os.path.join(output_dir, 'preprocess_data'), Pi=Pi, r=r, C=C, all_mu=all_mu, - global_beta=global_beta, adp_cluster=adp_cluster) + global_beta=global_beta, adp_cluster=adp_cluster, pois_hess=pois_hess) + # coverage dataframe mapped cov_df.to_pickle(os.path.join(output_dir, 'cov_df.pickle')) + # allelic segment indices into coverage dataframe + seg_g_idx.to_pickle(os.path.join(output_dir, 'allelic_seg_groups.pickle')) ## run scattered coverage mcmc job using preprocessed data elif args.command == "coverage_mcmc_shard": # load preprocessed data preprocess_data = np.load(args.preprocess_data) - # check to make sure that the cluster index is within the range - Pi = preprocess_data['Pi'] - if args.cluster_num > Pi.shape[1] - 1: - raise ValueError("Received cluster number {}, which is out of range".format(args.cluster_num)) - + # extract preprocessed data from this cluster - mu = preprocess_data["all_mu"][args.cluster_num] + Pi = preprocess_data['Pi'] + mu = preprocess_data["all_mu"]#[args.cluster_num] beta = preprocess_data["global_beta"] c_assignments = np.argmax(Pi, axis=1) - cluster_mask = (c_assignments == args.cluster_num) - r = preprocess_data['r'][cluster_mask] - C = preprocess_data['C'][cluster_mask] - - # if we get a range argument well be doing burnin on a subset of the coverage bins - if args.range is not None: - #parse range from string - range_lst = args.range.split('-') - st,en = int(range_lst[0]), int(range_lst[1]) - if st > en or st < 0 or en > len(r): - raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r))) - - #trim data to our desired range - r = r[st:en] - C = C[st:en] - num_draws = 1 - - # if we're just burning in a subset use different save strings - model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range) - data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range) - figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range) - - else: - #if not in burnin use the specified number of draws - num_draws = args.num_draws - - - model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num) - data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num) - figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num) - - # run on the specified cluster - cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width) + #cluster_mask = (c_assignments == args.cluster_num) + r = preprocess_data['r']#[cluster_mask] + C = preprocess_data['C']#[cluster_mask] + + # load and (weakly) verify allelic segment indices + seg_g_idx = pd.read_pickle(args.allelic_seg_indices) + if len(np.hstack(seg_g_idx["indices"])) != C.shape[0]: + raise ValueError("Size mismatch between allelic segment assignments and coverage bin data!") + + # subset to a single allelic segment + if args.allelic_seg_idx > len(seg_g_idx) - 1: + raise ValueError("Allelic segment index out of bounds!") + + seg_indices = seg_g_idx.iloc[args.allelic_seg_idx] + + mu = mu[seg_indices["allelic_cluster"]] + C = C[seg_indices["indices"], :] + r = r[seg_indices["indices"], :] - # if we're using burnin results load them now - if args.burnin_files is not None: - with open(args.burnin_files, 'r') as f: - file_list = f.read().splitlines() - assignments_arr = aggregate_burnin_files(file_list, args.cluster_num) - cov_mcmc.init_burnin(assignments_arr) + # run cov MCMC + cov_mcmc = NB_MCMC_SingleCluster(args.num_draws, r, C, mu, beta, args.bin_width) + +# # if we get a range argument well be doing burnin on a subset of the coverage bins +# if args.range is not None: +# #parse range from string +# range_lst = args.range.split('-') +# st,en = int(range_lst[0]), int(range_lst[1]) +# if st > en or st < 0 or en > len(r): +# raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r))) +# +# #trim data to our desired range +# r = r[st:en] +# C = C[st:en] +# num_draws = 1 +# +# # if we're just burning in a subset use different save strings +# model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range) +# data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range) +# figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range) +# +# else: +# #if not in burnin use the specified number of draws +# num_draws = args.num_draws +# +# +# model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num) +# data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num) +# figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num) +# +# # run on the specified cluster +# cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width) +# +# # if we're using burnin results load them now +# if args.burnin_files is not None: +# with open(args.burnin_files, 'r') as f: +# file_list = f.read().splitlines() +# assignments_arr = aggregate_burnin_files(file_list, args.cluster_num) +# cov_mcmc.init_burnin(assignments_arr) cov_mcmc.run() @@ -600,15 +615,15 @@ def main(): segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results() # save samples - with open(os.path.join(output_dir, model_save_str), 'wb') as f: + with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f: pickle.dump(cov_mcmc, f) - np.savez(os.path.join(output_dir, data_save_str), + np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"), seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples) - # save visualization - cov_mcmc.visualize_cluster_samples( - os.path.join(output_dir, figure_save_str)) +# # save visualization +# cov_mcmc.visualize_cluster_samples( +# os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png")) elif args.command == "collect_cov_mcmc": if args.coverage_dir: diff --git a/hapaseg/a_cov_DP.py b/hapaseg/a_cov_DP.py index 1db9b17..975908b 100644 --- a/hapaseg/a_cov_DP.py +++ b/hapaseg/a_cov_DP.py @@ -63,7 +63,7 @@ def generate_acdp_df(SNP_path, # path to SNP df print('concatenating dp run ', draw_num) a_cov_seg_df = dp_run.cov_df.copy() - covar_cols = sorted([c for c in a_cov_seg_df.columns if "C_" in c]) + covar_cols = sorted(a_cov_seg_df.columns[a_cov_seg_df.columns.str.contains("^C_.*_z$")]) # add minor and major allele counts for each bin to the cov_seg_df here to allow for beta draws on the fly for each segment a_cov_seg_df['min_count'] = 0 a_cov_seg_df['maj_count'] = 0 diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py index 9bf2d2e..0ae9eb5 100644 --- a/hapaseg/allelic_DP.py +++ b/hapaseg/allelic_DP.py @@ -1,5 +1,6 @@ import colorama import copy +import distinctipy import itertools import matplotlib.pyplot as plt import matplotlib as mpl @@ -17,320 +18,82 @@ class A_DP: def __init__(self, allelic_segs_pickle, ref_fasta = None): # dataframe of allelic imbalance segmentation samples for each chromosome arm - self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(0) - self.allelic_segs = self.allelic_segs.loc[self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)) > 0] + self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(axis = 0) + # if some chromsome arms couldn't find the MLE, just use current state of chain + none_idx = self.allelic_segs["results"].apply(lambda x : x.breakpoints_MLE is None) + for i in none_idx[none_idx].index: + self.allelic_segs.iloc[i]["results"].breakpoints_MLE = self.allelic_segs.iloc[i]["results"].breakpoints + + # load SNPs + self.SNPs = [] + clust_offset = 0 + for _, H in self.allelic_segs.iterrows(): + S = copy.deepcopy(H["results"].P) + S["A_alt"] = 0 + S.loc[S["aidx"], "A_alt"] = S.loc[S["aidx"], "ALT_COUNT"] + S["A_ref"] = 0 + S.loc[S["aidx"], "A_ref"] = S.loc[S["aidx"], "REF_COUNT"] + S["B_alt"] = 0 + S.loc[~S["aidx"], "B_alt"] = S.loc[~S["aidx"], "ALT_COUNT"] + S["B_ref"] = 0 + S.loc[~S["aidx"], "B_ref"] = S.loc[~S["aidx"], "REF_COUNT"] + + S = S.rename(columns = { "MIN_COUNT" : "min", "MAJ_COUNT" : "maj" }) + S = S.loc[:, ["chr", "pos", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"]] + + # set initial cluster assignments based on segmentation + S["clust"] = -1 + bpl = np.array(H["results"].breakpoints_MLE); bpl = np.c_[bpl[0:-1], bpl[1:]] + for i, (st, en) in enumerate(bpl): + S.iloc[st:en, S.columns.get_loc("clust")] = i + clust_offset + clust_offset += i + + # bug in segmentation omits final SNP? + S = S.iloc[:-1] + assert (S["clust"] != -1).all() + + self.SNPs.append(S) + + self.SNPs = pd.concat(self.SNPs, ignore_index = True) - # number of total segmentation samples - self.n_samp = self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)).min() + # convert chr-relative positions to absolute genomic coordinates self.ref_fasta = ref_fasta + self.SNPs["pos_gp"] = seq.chrpos2gpos(self.SNPs["chr"], self.SNPs["pos"], ref = self.ref_fasta) - # DP run objects for each segmentation sample - self.DP_runs = None - - # dataframe of SNPs - self.SNPs = None + # initial phasing orientation + self.SNPs["flipped"] = False - # number of segmentation samples used for DP run - self.N_seg_samps = None - # number of DP samples per segmentation sample - self.N_clust_samps = None + self.N_clust_samps = 100 # assignment of SNPs to DP clusters for each MCMC sample self.snps_to_clusters = None # phase correction of SNPs for each MCMC sample self.snps_to_phases = None + # likelihoods of each clustering + self.likelihoods = None - def load_seg_samp(self, samp_idx): - if samp_idx > self.n_samp: - raise ValueError(f"Only {self.n_samp} MCMC samples were taken!") - - all_segs = [] - all_SNPs = [] - - maj_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MAJ_COUNT") - min_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MIN_COUNT") - - alt_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("ALT_COUNT") - ref_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("REF_COUNT") - - chunk_offset = 0 - for _, H in self.allelic_segs.dropna(subset = ["results"]).iterrows(): - r = copy.deepcopy(H["results"]) - - # set phasing orientation back to original - for st, en in r.F.intervals(): - # code excised from flip_hap - x = r.P.iloc[st:en, maj_idx].copy() - r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx] - r.P.iloc[st:en, min_idx] = x - - # save SNPs for this chunk - if self.SNPs is None: - all_SNPs.append(pd.DataFrame({ - "maj" : r.P["MAJ_COUNT"], - "min" : r.P["MIN_COUNT"], - # TODO: gpos should be computed earlier, so that that we don't need to pass ref_fasta here - "gpos" : seq.chrpos2gpos(r.P.loc[0, "chr"], r.P["pos"], ref = self.ref_fasta), - "allele" : r.P["allele_A"] - })) - - # draw breakpoint, phasing, and SNP inclusion sample from segmentation MCMC trace - bp_samp, pi_samp, inc_samp = (r.breakpoint_list[samp_idx], r.phase_interval_list[samp_idx] if r.phase_correct else None, r.include[samp_idx]) - # flip everything according to sample - if r.phase_correct: - for st, en in pi_samp.intervals(): - x = r.P.iloc[st:en, maj_idx].copy() - r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx] - r.P.iloc[st:en, min_idx] = x - - bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]] - - # get major/minor sums for each segment - # also get {alt, ref} x {aidx, bidx} - for st, en in bpl: - all_segs.append([ - st + chunk_offset, en + chunk_offset, # SNP index for seg - r.P.loc[st, "chr"], r.P.loc[st, "pos"], r.P.loc[en, "pos"], # chromosomal position of seg - r._Piloc(st, en, min_idx, inc_samp).sum(), # min/maj counts - r._Piloc(st, en, maj_idx, inc_samp).sum(), - - r._Piloc(st, en, alt_idx, inc_samp & r.P["aidx"]).sum(), # allele A alt/ref - r._Piloc(st, en, ref_idx, inc_samp & r.P["aidx"]).sum(), - r._Piloc(st, en, alt_idx, inc_samp & ~r.P["aidx"]).sum(), # allele B alt/ref - r._Piloc(st, en, ref_idx, inc_samp & ~r.P["aidx"]).sum() - ]) - - chunk_offset += len(r.P) - - # convert samples into dataframe - S = pd.DataFrame(all_segs, columns = ["SNP_st", "SNP_en", "chr", "start", "end", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"]) - - # convert chr-relative positions to absolute genomic coordinates - S["start_gp"] = seq.chrpos2gpos(S["chr"], S["start"], ref = self.ref_fasta) - S["end_gp"] = seq.chrpos2gpos(S["chr"], S["end"], ref = self.ref_fasta) - - # initial cluster assignments - S["clust"] = -1 # initially, all segments are unassigned - S.iloc[0, S.columns.get_loc("clust")] = 1 # first segment is assigned to cluster 1 - - # initial phasing orientation - S["flipped"] = False - - if self.SNPs is None: - self.SNPs = pd.concat(all_SNPs, ignore_index = True) - CI = s.beta.ppf([0.05, 0.5, 0.95], self.SNPs["min"].values[:, None] + 1, self.SNPs["maj"].values[:, None] + 1) - self.SNPs[["f_CI_lo", "f", "f_CI_hi"]] = CI - - return S, self.SNPs - - # map trace of segment cluster assignments to the SNPs within - @staticmethod - def map_seg_clust_assignments_to_SNPs(segs_to_clusters, S): - st_col = S.columns.get_loc("SNP_st") - en_col = S.columns.get_loc("SNP_en") - snps_to_clusters = np.zeros((segs_to_clusters.shape[0], S.iloc[-1, en_col] + 1), dtype = int) - for i, seg_assign in enumerate(segs_to_clusters): - for j, seg in enumerate(seg_assign): - snps_to_clusters[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = seg - - return snps_to_clusters - - @staticmethod - def map_seg_phases_to_SNPs(phase, S): - st_col = S.columns.get_loc("SNP_st") - en_col = S.columns.get_loc("SNP_en") - snps_to_phase = np.zeros((phase.shape[0], S.iloc[-1, en_col] + 1), dtype = int) - for i, phase_orient in enumerate(phase): - for j, ph in enumerate(phase_orient): - snps_to_phase[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = ph - - return snps_to_phase - - def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None): - self.N_seg_samps = N_seg_samps if seg_sample_idx is None else 1 - self.N_clust_samps = N_clust_samps - - seg_sample_idx = np.random.choice(self.n_samp - 1, self.N_seg_samps, replace = False) if seg_sample_idx is None else [seg_sample_idx] - S, SNPs = self.load_seg_samp(seg_sample_idx[0]) - N_SNPs = len(SNPs) - - self.snps_to_clusters = -1*np.ones((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = np.int16) - self.snps_to_phases = np.zeros((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = bool) - self.DP_likelihoods = np.zeros((self.N_clust_samps*self.N_seg_samps, 2)) - - self.DP_runs = [None]*self.N_seg_samps - - clust_prior = sc.SortedDict() - clust_count_prior = sc.SortedDict() - n_iter_clust_exist = sc.SortedDict() - cur_samp_iter = 0 - - for n_it in range(self.N_seg_samps): - if n_it > 0: - S, SNPs = self.load_seg_samp(seg_sample_idx[n_it]) - - # run clustering - self.DP_runs[n_it] = DPinstance(S, clust_prior = clust_prior, clust_count_prior = clust_count_prior) - segs_to_clusters, segs_to_phases = self.DP_runs[n_it].run(n_iter = self.N_clust_samps) - - # compute likelihoods for each clustering - self.DP_likelihoods[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.DP_runs[n_it].compute_overall_lik() - - # assign clusters to individual SNPs, to use as segment assignment prior for next DP iteration - self.snps_to_clusters[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_clust_assignments_to_SNPs(segs_to_clusters, S) - - # assign phase orientations to individual SNPs - self.snps_to_phases[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_phases_to_SNPs(segs_to_phases, S) - - # compute prior on cluster locations/counts - max_clust_idx = segs_to_clusters.max() - for seg_assignments, seg_phases in zip(segs_to_clusters, segs_to_phases): - # reset phases - S2 = S.copy() - S2.loc[S2["flipped"], ["min", "maj"]] = S2.loc[S2["flipped"], ["min", "maj"]].values[:, ::-1] - - # match phases to current sample - S2.loc[seg_phases, ["min", "maj"]] = S2.loc[seg_phases, ["min", "maj"]].values[:, ::-1] - - # minor/major counts for each cluster in this iteration - S_a = npg.aggregate(seg_assignments, S2["min"], size = max_clust_idx + 1) - S_b = npg.aggregate(seg_assignments, S2["maj"], size = max_clust_idx + 1) - c = np.c_[S_a, S_b] - - # total numer of SNPs for each cluster in this iteration - #N_c = npg.aggregate(seg_assignments, S2["SNP_en"] - S2["SNP_st"], size = max_clust_idx + 1) - N_c = npg.aggregate(seg_assignments, 1, size = max_clust_idx + 1) - - # iteratively update priors - next_clust_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), c[c.sum(1) > 0])) - next_clust_count_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), N_c[N_c > 0])) - - for cl in np.unique(seg_assignments): - if cl in n_iter_clust_exist: - n_iter_clust_exist[cl] += 1 - else: - n_iter_clust_exist[cl] = 1 - cur_samp_iter += 1 - - for k, v in next_clust_prior.items(): - nccp = next_clust_count_prior[k] - if k in clust_prior: - clust_prior[k] += (v - clust_prior[k])/n_iter_clust_exist[k] - clust_count_prior[k] += (nccp - clust_count_prior[k])/cur_samp_iter - else: - clust_prior[k] = v - clust_count_prior[k] = nccp/cur_samp_iter - # for clusters that don't exist in this iteration, average counts with zero - for k, v in clust_prior.items(): - if k != -1 and k not in next_clust_prior: - clust_count_prior[k] -= clust_count_prior[k]/cur_samp_iter - - - # remove improbable clusters from prior - for kk in [k for k, v in clust_count_prior.items() if v < 1]: - del clust_prior[kk] - del clust_count_prior[kk] - - # remove garbage cluster from priors - #del clust_prior[0] - #del clust_count_prior[0] - - return self.snps_to_clusters, self.snps_to_phases, self.DP_likelihoods - - def visualize_segs(self, snps_to_clusters = None, f = None, n_vis_samp = None): - f = plt.figure(figsize = [17.56, 5.67]) if f is None else f - - snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters - - # plot all samples from DP - if n_vis_samp is None: - run_idx = np.r_[0:self.N_seg_samps] - N_seg_samps = self.N_seg_samps - - # only plot up to n_vis_samp _segmentation samples_ from DP - # (all DP samples for a given segmentation sample will be plotted) - else: - run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False) - N_seg_samps = n_vis_samp - - for d in [self.DP_runs[x] for x in run_idx]: - d.visualize_adjacent_segs(f = f.number, n_samp = N_seg_samps*self.N_clust_samps) - - def visualize_clusts(self, snps_to_clusters = None, f = None, thick = False, nocolor = False, n_vis_samp = None): - f = plt.figure(figsize = [17.56, 5.67]) if f is None else f - - snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters - - # plot all samples from DP - if n_vis_samp is None: - run_idx = np.r_[0:self.N_seg_samps] - N_seg_samps = self.N_seg_samps - - # only plot up to n_vis_samp _segmentation samples_ from DP - # (all DP samples for a given segmentation sample will be plotted) - else: - run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False) - N_seg_samps = n_vis_samp - - for d in [self.DP_runs[x] for x in run_idx]: - d.visualize_clusts(f = f.number, n_samp = N_seg_samps*self.N_clust_samps, thick = thick, nocolor = nocolor) - - def visualize_SNPs(self, snps_to_phases = None, color = True, f = None): - snps_to_phases = snps_to_phases if snps_to_phases is not None else self.snps_to_phases - ph_prob = snps_to_phases.mean(0) - - if color: - rb = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]] - else: - rb = np.full([2, 3], 0) - - logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M))) - - def scerrorbar(idx, rev = False, alpha = 1, show_CI = True): - if rev: - f = 1 - self.SNPs.loc[idx, "f"] - eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_hi"] - eb_top = self.SNPs.loc[idx, "f_CI_lo"] - self.SNPs.loc[idx, "f"] - else: - f = self.SNPs.loc[idx, "f"] - eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_lo"] - eb_top = self.SNPs.loc[idx, "f_CI_hi"] - self.SNPs.loc[idx, "f"] - - if show_CI: - plt.errorbar( - x = self.SNPs.loc[idx, "gpos"], - y = f, - yerr = np.c_[ - eb_bot, - eb_top - ].T, - fmt = 'none', ecolor = np.c_[rb[self.SNPs.loc[idx, "allele"]], (alpha if isinstance(alpha, np.ndarray) else alpha*np.ones(idx.sum()))**2] - ) - - plt.scatter( - self.SNPs.loc[idx, "gpos"], - f, - color = rb[self.SNPs.loc[idx, "allele"]], - marker = '.', - s = 1, - alpha = alpha if show_CI else alpha - ) - - default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.SNPs)) + def run(self): + self.DP_run = DPinstance( + self.SNPs, + dp_count_scale_factor = self.SNPs["clust"].value_counts().mean() + ) + self.snps_to_clusters, self.snps_to_phases, self.likelihoods = self.DP_run.run(n_samps = self.N_clust_samps) - f = plt.figure(figsize = [17.56, 5.67]) if f is None else f - scerrorbar(ph_prob == 0, alpha = default_alpha, show_CI = color) - scerrorbar(ph_prob == 1, rev = True, alpha = default_alpha, show_CI = color) - idx = (ph_prob > 0) & (ph_prob < 1) - scerrorbar(idx, alpha = (1 - ph_prob[idx])*default_alpha, show_CI = color) - scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color) + return self.snps_to_clusters, self.snps_to_phases, self.likelihoods class DPinstance: - def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 0.1): + def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, dp_count_scale_factor = 1): self.S = S self.clust_prior = clust_prior.copy() self.clust_count_prior = clust_count_prior.copy() self.alpha = alpha + self.dp_count_scale_factor = dp_count_scale_factor + + self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed + self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F") + self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F") + + self.betahyp = self.S.loc[:, ["min", "maj"]].sum(1).mean()/2 # # define column indices @@ -348,41 +111,63 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort # store likelihoods for each cluster in the prior (from previous iterations) self.clust_prior[-1] = np.r_[0, 0] - self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_prior.items()}) + self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_prior.items()}) self.clust_prior_mat = np.r_[self.clust_prior.values()] self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster - self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage + def _Siat_ph(self, ridx, min = True): + # min, flip => maj + # ~min, ~flip => maj + # min, ~flip => min + # ~min, flip => min + col = self.min_col if self.S.iat[ridx, self.flip_col] ^ min else self.maj_col + return self.S.iat[ridx, col] + + def _Ssum_ph(self, seg_idx, min = True): + #flip = self.flip_mat[seg_idx] + flip = self.S.iloc[seg_idx, self.flip_col] + flip_n = ~flip + if min: + return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + else: + return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + + def _Scumsum_ph(self, seg_idx, min = True): + flip = self.S.iloc[seg_idx, self.flip_col] + flip_n = ~flip + if min: + si = np.argsort(np.r_[seg_idx[flip_n], seg_idx[flip]]) + return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]][si].cumsum() + else: + si = np.argsort(np.r_[seg_idx[flip], seg_idx[flip_n]]) + return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]][si].cumsum() - def rephase(self, seg_idx, force = False): - if not force: - A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1 - A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1 - B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1 - B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1 + def compute_rephase_prob(self, seg_idx): + # TODO: compute logcdf/logsf directly + flip = self.S.iloc[seg_idx, self.flip_col] + flip_n = ~flip - # use normal approximation to beta if conditions are right - if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20: - m_x = A_a/(A_a + A_b) - s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1)) - m_y = B_a/(B_a + B_b) - s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1)) + A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp + A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp + B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp + B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp - do_rephase = np.random.rand() < s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y)) + # use normal approximation to beta if conditions are right + if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20: + m_x = A_a/(A_a + A_b) + s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1)) + m_y = B_a/(B_a + B_b) + s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1)) - # Monte Carlo simulate difference of betas - else: - x = s.beta.rvs(A_a, A_b, size = 1000) - y = s.beta.rvs(B_a, B_b, size = 1000) + return s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y)) - do_rephase = np.random.rand() < (x > y).mean() + # Monte Carlo simulate difference of betas + else: + x = s.beta.rvs(A_a, A_b, size = 1000) + y = s.beta.rvs(B_a, B_b, size = 1000) - if force or do_rephase: - self.S.iloc[seg_idx, [self.min_col, self.maj_col]] = self.S.iloc[seg_idx, [self.min_col, self.maj_col]].values[:, ::-1] - self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]] = self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]].values[:, ::-1] - self.S.iloc[seg_idx, [self.aref_col, self.bref_col]] = self.S.iloc[seg_idx, [self.aref_col, self.bref_col]].values[:, ::-1] - self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] + return (x > y).mean() def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_b, D_a, D_b): # if st == en: @@ -392,33 +177,84 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_ # J_a = S.iloc[st:(en + 1), min_col].sum() # J_b = S.iloc[st:(en + 1), maj_col].sum() SU_a = SU_b = SD_a = SD_b = 0 - # if target segments are being moved to the garbage, it is equivalent to making them their own segment, and joining the upstream and downstream segments - if targ_clust == 0: - SU_a = J_a - SU_b = J_b - J_a = 0 - J_b = 0 - - if targ_clust != - 1 and (targ_clust == upstream_clust or targ_clust == 0): + + if targ_clust != -1 and targ_clust == upstream_clust: J_a += U_a J_b += U_b else: SU_a += U_a SU_b += U_b - if targ_clust != - 1 and (targ_clust == downstream_clust or targ_clust == 0): + if targ_clust != -1 and targ_clust == downstream_clust: J_a += D_a J_b += D_b else: SD_a += D_a SD_b += D_b - return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1) + return (ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) if SU_a > 0 or SU_b > 0 else 0) + \ + ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + \ + (ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp) if SD_a > 0 or SD_b > 0 else 0) + + def compute_adj_prob(self, break_idx): + if break_idx > 1: + U_A, U_B = self.seg_sums[self.breakpoints[break_idx - 1]] + U_cl = self.clusts[self.breakpoints[break_idx - 1]] + else: + U_A = U_B = 0 + U_cl = -1 + if break_idx + 2 < len(self.breakpoints): + D_A, D_B = self.seg_sums[self.breakpoints[break_idx + 1]] + D_cl = self.clusts[self.breakpoints[break_idx + 1]] + else: + D_A = D_B = 0 + D_cl = -1 + + S_A, S_B = self.seg_sums[self.breakpoints[break_idx]] + + ## compute all four possible segmentations relative to neighbor, in + ## both phasing orientations + MLs = np.c_[ + # UTD T U D + # -^_ or -_- (U != T & T != D) (00) + np.r_[self.SJliks(1, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(1, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + # -__ (U != T & T == D) (01) + np.r_[self.SJliks(0, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(0, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + # --_ (U == T & T != D) (10) + np.r_[self.SJliks(1, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(1, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + # --- (U == T & T == D) (11) + np.r_[self.SJliks(0, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B), + self.SJliks(0, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)], + ] + + ## match probs to cluster choices (will match MLs matrix in main calculation) + probs = np.full([len(self.clust_sums), 2], MLs[0, 0]) + if U_cl == D_cl and U_cl != -1 and D_cl != -1: + probs[self.clust_sums.index(U_cl), :] = MLs[:, 3] + probs[self.clust_sums.index(D_cl), :] = MLs[:, 3] + else: + if U_cl != -1: + probs[self.clust_sums.index(U_cl), :] = MLs[:, 2] + if D_cl != -1: + probs[self.clust_sums.index(D_cl), :] = MLs[:, 1] + + return probs def compute_adj_liks(self, seg_idx, cur_clust): + # idea to simplify this code: + # - strip out logic for working with noncontiguous seg_idx's + # - compute all four possibile segmentations: + # ABC, AAB, ABB, AAA + # - associate those segmentations with each cluster choice, in order + # to return `adj_BC` with same size as `MLs` + adj_AB = 0 - adj_BC = np.zeros(len(self.clust_sums)) + adj_BC = np.zeros([len(self.clust_sums), 2]) # start/end coordinates of consecutive runs of segments being moved + # NOTE: ordpairs represents closed intervals! ordpairs = np.c_[ [np.r_[list(x)][[0, -1]] for x in more_itertools.consecutive_groups( np.sort(seg_idx)) @@ -431,48 +267,32 @@ def compute_adj_liks(self, seg_idx, cur_clust): for o, (st, en) in enumerate(ordpairs): # maj/min counts of contiguous upstream segments belonging to the same cluster if st - 1 > 0: - # skip over adjacent segments that are in the garbage; - # we only care about adjacent segments actually assigned to clusters j = 1 - while st - j > 0 and self.clusts[st - j] == 0: - j += 1 U_cl = self.clusts[st - j] adj_clusters[o, 0] = U_cl while st - j > 0 and self.clusts[st - j] != -1 and \ - (self.clusts[st - j] == U_cl or self.clusts[st - j] == 0): - # again, skip over segments in the garbage - if self.clusts[st - j] != 0: - UD_counts[o, 0] += self.S.iloc[st - j, self.min_col] - UD_counts[o, 1] += self.S.iloc[st - j, self.maj_col] + self.clusts[st - j] == U_cl: + UD_counts[o, 0] += self._Siat_ph(st - j, min = True) + UD_counts[o, 1] += self._Siat_ph(st - j, min = False) j += 1 # maj/min counts of contiguous downstream segments belonging to the same cluster if en + 1 < len(self.S): j = 1 - while en + j < len(self.S) - 1 and self.clusts[en + j] == 0: - j += 1 D_cl = self.clusts[en + j] adj_clusters[o, 1] = D_cl while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \ - (self.clusts[en + j] == D_cl or self.clusts[en + j] == 0): - if self.clusts[en + j] != 0: - UD_counts[o, 2] += self.S.iloc[en + j, self.min_col] - UD_counts[o, 3] += self.S.iloc[en + j, self.maj_col] + self.clusts[en + j] == D_cl: + UD_counts[o, 2] += self._Siat_ph(en + j, min = True) + UD_counts[o, 3] += self._Siat_ph(en + j, min = False) j += 1 - # if we are looking at the segments at the very start or very end, set - # upstream/downstream cluster indices to garbage - if ordpairs[0, 0] == 0: - adj_clusters[0, 0] = 0 - if ordpairs[-1, 1] == len(self.S) - 1: - adj_clusters[-1, 1] = 0 - # if there are any segments being moved adjacent to already existing clusters, get local split/join likelihoods adj_idx = ~(adj_clusters == -1).all(1) @@ -494,8 +314,8 @@ def compute_adj_liks(self, seg_idx, cur_clust): # min/maj counts of the segment(s) being moved st = ordpairs[j, 0] en = ordpairs[j, 1] - S_a = self.S.iloc[:, self.min_col].values[st:(en + 1)].sum() - S_b = self.S.iloc[:, self.maj_col].values[st:(en + 1)].sum() + S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is a closed interval + S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) # adjacency likelihood of this segment remaining where it is # adj_AB += self.SJliks( @@ -511,10 +331,10 @@ def compute_adj_liks(self, seg_idx, cur_clust): # ) # adjacency likelihood of this segment joining each possible cluster: - # 1. those it is actually adjacent to (+ new cluster, garbage) - for cl in {-1, 0, cl_u, cl_d}: + # 1. those it is actually adjacent to (+ new cluster) + for cl in {-1, cl_u, cl_d}: idx = self.clust_sums.index(cl) - adj_BC[idx] += self.SJliks( + adj_BC[idx, 0] += self.SJliks( targ_clust = cl, upstream_clust = cl_u, downstream_clust = cl_d, @@ -525,14 +345,22 @@ def compute_adj_liks(self, seg_idx, cur_clust): D_a = D_a, D_b = D_b ) - # we cannot send a segment to the garbage adjacent to any unassigned segment - if cl == 0 and (cl_u == -1 or cl_d == -1): - adj_BC[idx] = -np.inf + adj_BC[idx, 1] += self.SJliks( + targ_clust = cl, + upstream_clust = cl_u, + downstream_clust = cl_d, + J_a = S_b, + J_b = S_a, + U_a = U_a, + U_b = U_b, + D_a = D_a, + D_b = D_b + ) # 2. clusters it is not adjacent to (use default split value) - for cl in self.clust_sums.keys() - ({-1, 0} | set(adj_clusters[adj_idx].ravel())): + for cl in self.clust_sums.keys() - ({-1} | set(adj_clusters[adj_idx].ravel())): idx = self.clust_sums.index(cl) - adj_BC[idx] += self.SJliks( + adj_BC[idx, 0] += self.SJliks( targ_clust = -1, upstream_clust = -1, downstream_clust = -1, @@ -543,253 +371,372 @@ def compute_adj_liks(self, seg_idx, cur_clust): D_a = D_a, D_b = D_b ) - else: - # we cannot send a segment to the garbage adjacent to any unassigned segment - adj_BC[self.clust_sums.index(0)] = -np.inf + adj_BC[idx, 1] += self.SJliks( + targ_clust = -1, + upstream_clust = -1, + downstream_clust = -1, + J_a = S_b, + J_b = S_a, + U_a = U_a, + U_b = U_b, + D_a = D_a, + D_b = D_b + ) return adj_AB, adj_BC - def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None): - if segs_to_clusters is None: - _, segs_to_clusters = self.get_unique_clust_idxs() - else: - _, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters) - if phase_orientations is None: - phase_orientations = np.r_[self.phase_orientations] + def compute_cluster_splitpoints(self, seg_idx): + spl = [] - max_clust_idx = segs_to_clusters.max() + 1 + # left bias + end = len(seg_idx) + i = 0 + while True: + seg_idx_sp = seg_idx[0:end] + if len(seg_idx_sp) < 2: + break - liks = np.full([segs_to_clusters.shape[0], 2], np.nan) + min_cs = self._Scumsum_ph(seg_idx_sp, min = True) + min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs + maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) + maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs - for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)): - # reset phases - # TODO: when we switch to faster phasing correction model that doesn't involve modifying self.S, this won't be necessary - S_ph = self.S.copy() - flip_idx = np.flatnonzero(ph_samp != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] + split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) + # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) + # NOTE: instead of argmax, probabilistically choose? will this make a difference? - ## overall clustering likelihood - A = npg.aggregate(cl_samp, S_ph["min"], size = max_clust_idx) - B = npg.aggregate(cl_samp, S_ph["maj"], size = max_clust_idx) + end = split_lik.argmax() + spl.append(end) -# for when self.S is not modified -# A = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) + \ -# npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx) -# -# B = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) + \ -# npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx) + if end <= 1 or end == len(split_lik) - 1: + break - clust_lik = ss.betaln(A + 1, B + 1).sum() + i += 1 - ## segmentation likelihood + # right bias + start = 0 + i = 0 + while True: + seg_idx_sp = seg_idx[start:] + if len(seg_idx_sp) < 2: + break - # get segment boundaries - bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] + min_cs = self._Scumsum_ph(seg_idx_sp, min = True) + min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs + maj_cs = self._Scumsum_ph(seg_idx_sp, min = False) + maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs - # sum log-likelihoods of each segment - seg_lik = 0 - for st, en in bdy: - A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum() + split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp) + # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum()) -# for when self.S is not modified -# A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ -# self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum() -# B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \ -# self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum() + start += split_lik.argmax() + 1 + spl.append(start - 1) - seg_lik += ss.betaln(A + 1, B + 1) + if start > len(seg_idx) - 1 or split_lik.argmax() == 0: + break - liks[i, :] = np.r_[clust_lik, seg_lik] + i += 1 - return liks + bdy = np.unique(np.r_[0, spl, len(seg_idx)]) + bdy = np.c_[bdy[:-1], bdy[1:]] - def run(self, n_iter = 50): - # - # assign segments to likeliest prior component {{{ - - if len(self.clust_prior) > 1: - for seg_idx in range(len(self.S)): - seg_idx = np.r_[seg_idx] - - # compute probability that segment belongs to each cluster prior element - S_a = self.S.iloc[seg_idx[0], self.min_col] - S_b = self.S.iloc[seg_idx[0], self.maj_col] - P_a = self.clust_prior_mat[1:, 0] - P_b = self.clust_prior_mat[1:, 1] - - # prior likelihood ratios for both phase orientations - P_l = np.c_[ - ss.betaln(S_a + P_a + 1, S_b + P_b + 1) - (ss.betaln(S_a + 1, S_b + 1) + ss.betaln(P_a + 1, P_b + 1)), - ss.betaln(S_b + P_a + 1, S_a + P_b + 1) - (ss.betaln(S_b + 1, S_a + 1) + ss.betaln(P_a + 1, P_b + 1)), - ] + return bdy - # get count prior - ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1 and k != 0]] + def add_breakpoint(self, start, mid, end, clust_idx): + """ + Add breakpoint at mid belonging to clust_idx, between start and end + """ + self.breakpoints.add(mid) + self.clust_members_bps[clust_idx].add(mid) + + A = self._Ssum_ph(np.r_[mid:end], min = True) + B = self._Ssum_ph(np.r_[mid:end], min = False) - # posterior numerator - num = P_l + np.log(ccp) - num -= num.max() + self.seg_sums[mid] = np.r_[A, B] + self.seg_sums[start] -= self.seg_sums[mid] - # probabilistically choose a cluster - probs = np.exp(num)/np.exp(num).sum() - idx = np.tile(np.r_[self.clust_prior.keys()][1:], [2, 1]).T*[1, -1] - choice = np.random.choice( - idx.ravel(), - p = probs.ravel() - ) + self.seg_liks[mid] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + A = self._Ssum_ph(np.r_[start:mid], min = True) + B = self._Ssum_ph(np.r_[start:mid], min = False) + self.seg_liks[start] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) - # rephase - if choice < 0: - self.rephase(seg_idx, force = True) - choice = -choice + self.seg_phase_probs[start] = self.compute_rephase_prob(np.r_[start:mid]) + self.seg_phase_probs[mid] = self.compute_rephase_prob(np.r_[mid:end]) - self.S.iloc[seg_idx, self.clust_col] = choice + def compute_overall_lik_simple(self): + ## overall clustering likelihood + # p({a_i, b_i} | {c_k}, {phase_i}) + clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_sums.items() if k >= 0]].sum() + + ## overall phasing likelihood + # p({phase_i} | {a_i, b_i}) + phase_probs = np.r_[self.seg_phase_probs.values()] + phase_lik = np.log1p(phase_probs).sum() if not np.isnan(phase_probs).any() else np.nan + + ## Dirichlet count prior (Dirichlet-categorical marginal likelihood) + # p({c_k}) + dirvec = np.r_[self.clust_counts.values()].astype(float)/self.dp_count_scale_factor + k = len(dirvec) + count_prior = k*np.log(self.alpha) + ss.gammaln(dirvec).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha) - # }}} + ## segmentation likelihood + # p({a_i, b_i} | {s}, {phase_i}) + # TODO: memoize + seg_lik = np.r_[self.seg_liks.values()].sum() + # p({c_k}, {s}, {phase_i} | {a_i, b_i}) + return np.r_[clust_lik, phase_lik, count_prior, seg_lik] + + def run(self, n_iter = 0, n_samps = 0): # # initialize cluster tracking hash tables - self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore")) - # for the first round of clustering, this is { 1 : 1 } + self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore")) + # for the first round of clustering, this is { 0 : 1, 1 : 1, ..., N - 1 : 1 } + + Sgc = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum() + if (Sgc.droplevel(0).index == True).any(): + Sgc.loc[(slice(None), True), ["min", "maj"]] = Sgc.loc[(slice(None), True), ["maj", "min"]].values self.clust_sums = sc.SortedDict({ - **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() }, - **{-1 : np.r_[0, 0], 0 : np.r_[0, 0]} + **{ k : np.r_[v["min"], v["maj"]] for k, v in Sgc.groupby(level = "clust").sum().to_dict(orient = "index").items() }, + **{-1 : np.r_[0, 0]} }) - # for the first round, this is { -1/0 : np.r_[0, 0], 1 : np.r_[S[0, "min"], S[0, "maj"]] } - self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 and k != 0 }) - # for the first round, this is { 1 : {0} } - unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1]) + # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]], 1 : S[1, "min"], S[1, "maj"], ..., N : S[N - 1, "min"], S[N - 1, "maj"] } + + self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 }) + # for the first round, this is { 0 : {0}, 1 : {1}, ..., N - 1 : {N - 1} } # store this as numpy for speed self.clusts = self.S["clust"].values max_clust_idx = np.max(self.clust_members.keys() | self.clust_prior.keys() if self.clust_prior is not None else {}) + # + # breakpoint tracking + + # segmentation breakpoints + self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)} + + # min/maj counts in each segment + self.seg_sums = sc.SortedDict() + bpl = np.r_[self.breakpoints] + for st, en in np.c_[bpl[:-1], bpl[1:]]: + mn = self._Ssum_ph(np.r_[st:en], min = True) + mj = self._Ssum_ph(np.r_[st:en], min = False) + self.seg_sums[st] = np.r_[mn, mj] + + # likelihoods for each segment + self.seg_liks = sc.SortedDict() + for k, (a, b) in self.seg_sums.items(): + self.seg_liks[k] = ss.betaln(a + 1 + self.betahyp, b + 1 + self.betahyp) + + # breakpoints for each cluster + self.clust_members_bps = sc.SortedDict({ + k : sc.SortedSet(v) for k, v in \ + self.S.loc[self.breakpoints[:-1], ["clust"]].groupby("clust").groups.items() + }) + + # misphase probabilities for each segment + self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints[:-1] }) + # containers for saving the MCMC trace - self.segs_to_clusters = [] + self.snps_to_clusters = [] self.phase_orientations = [] + self.segment_trace = [] + self.likelihood_trace = [] + # likelihood trace for checking burnin status + self.lik_trace = [] burned_in = False - all_assigned = False - seg_touch_idx = np.zeros(len(self.S), dtype = np.uint16) - -# # containers for saving debugging information (overall likelihoods/cluster assignments pre-burnin) -# self.lik_tmp = [] -# self.vc_tmp = [] + self.burnin_iteration = -1 + touch90 = False + likelihood_ready = False n_it = 0 n_it_last = 0 - while len(self.segs_to_clusters) < n_iter: + + brk = 0 + + while True: if not n_it % 1000: - print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index()) - print("n unassigned: {}".format((self.S["clust"] == -1).sum())) - print("n garbage: {}".format((self.S["clust"] == 0).sum())) + if len(self.clust_counts) > 20: + print(pd.Series(self.clust_counts.values()).value_counts().sort_index()) + else: + print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1])) + if likelihood_ready: + print("[{}] Likelihood: {}".format("*" if burned_in else " ", self.lik_trace[-1].sum())) + if burned_in: + print("{}/{} MCMC samples collected".format(len(self.snps_to_clusters), n_samps)) + + # stop after a raw number of iterations + if n_iter > 0 and n_it > n_iter: + return - # we are burned in (n_seg/n_clust) iterations after all segments have been touched + # stop after a number of samples have been taken + if n_samps > 0 and len(self.snps_to_clusters) > n_samps: + break + + # poll every 100 iterations for various statuses if not n_it % 100: - if not all_assigned and (((seg_touch_idx > 0) | (self.clusts == 0)).all() or \ - # if there is only one cluster, then consider every segment to have been touched - # otherwise, waiting for every segment to actually be touched will take forever - len(unassigned_segs) == 0 and len(self.clust_counts) == 1): - all_assigned = True - n_it_last = n_it - if not burned_in and all_assigned and \ - n_it - n_it_last > len(self.S)/len(self.clust_counts): - burned_in = True - -# self.lik_tmp.append(self.compute_overall_lik()) -# self.vc_tmp.append(self.S["clust"].value_counts()) + # have >95% of segments been touched? + if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.95: + touch90 = True + + # start computing likelihoods + if touch90: + lik = self.compute_overall_lik_simple() + # phasing likelihood will be NaN until we've touched every singlesegment + if not np.isnan(lik).any(): + self.lik_trace.append(lik) + likelihood_ready = True + + # check if likelihood has stabilized enough to consider us "burned in" + if likelihood_ready and not burned_in and len(self.lik_trace) > 500: + lt = np.vstack(self.lik_trace).sum(1) + if (np.convolve(np.diff(lt), np.ones(500)/500, mode = "same") < 0).sum() > 2: + print("BURNED IN") + burned_in = True + self.burnin_iteration = len(self.lik_trace) + n_it_last = n_it # - # pick either a segment or a cluster at random (50:50 prob.) - move_clust = False - - # pick a segment at random - if np.random.rand() < 0.5: - #if np.random.rand() < 1: - # bias picking unassigned segments if >90% of segments have been assigned - if len(unassigned_segs) > 0 and len(unassigned_segs)/len(self.S) < 0.1 and np.random.rand() < 0.5: - seg_idx = sc.SortedSet({np.random.choice(unassigned_segs)}) - else: - seg_idx = sc.SortedSet({np.random.choice(len(self.S))}) - - cur_clust = int(self.clusts[seg_idx]) - - # expand segment to include all adjacent segments in the same cluster, - # if it has already been assigned to a cluster - if cur_clust > 0 and np.random.rand() < 0.5: - si = seg_idx[0] - - j = 1 - while si - j > 0 and \ - (self.clusts[si - j] == cur_clust or self.clusts[si - j] == 0): - if self.clusts[si - j] != 0: - seg_idx.add(si - j) - j += 1 - j = 1 - while si + j < len(self.S) and \ - (self.clusts[si + j] == cur_clust or self.clusts[si + j] == 0): - if self.clusts[si + j] != 0: - seg_idx.add(si + j) - j += 1 - - # if we've expanded to include a large fraction (>10%) of segments - # in this cluster, cluster indexing might become inconsistent. - # skip this iteration - if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]: - n_it += 1 - continue - - seg_idx = np.r_[list(seg_idx)] - - n_move = len(seg_idx) - - # if segment was already assigned to a cluster, unassign it - if cur_clust > 0: - self.clust_counts[cur_clust] -= n_move - if self.clust_counts[cur_clust] == 0: - del self.clust_counts[cur_clust] - del self.clust_sums[cur_clust] - del self.clust_members[cur_clust] - else: - self.clust_sums[cur_clust] -= np.r_[self.S.iloc[seg_idx, self.min_col].sum(), self.S.iloc[seg_idx, self.maj_col].sum()] - self.clust_members[cur_clust] -= set(seg_idx) - - unassigned_segs.update(seg_idx) - self.clusts[seg_idx] = -1 - - # pick a cluster at random + # pick a segment to move + +# diagnostic code to compute overall likelihood before move +# compute_lik = False +# lik_before = np.nan +# if touch90 and np.random.rand() < 0.1: +# compute_lik = True +# lik_before = self.compute_overall_lik_simple() + + # >90% of segments have been moved; we are iterating over segments sequentially + if touch90: + break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)}) + brk += 1 + # we are picking segments at random else: - # it only makes sense to try joining two clusters if there are at least two of them! - if len(self.clust_counts) < 2: + break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)}) + + # get all SNPs within this segment + seg_st = self.breakpoints[break_idx[0]] + seg_en = self.breakpoints[break_idx[0] + 1] + seg_idx = np.r_[seg_st:seg_en] + + cur_clust = int(self.clusts[seg_idx[0]]) + + # propose breaking this segment + if np.random.rand() < 0.1: + # can't split segments of length 1 + if len(seg_idx) == 1: n_it += 1 continue - cl_idx = np.random.choice(self.clust_counts.keys()) - seg_idx = np.r_[list(self.clust_members[cl_idx])] - n_move = len(seg_idx) - cur_clust = -1 # only applicable for individual segments, so we set to -1 here - # (this is so that subsequent references to clust_sums[cur_clust] - # will return (0, 0)) - - # unassign all segments within this cluster - # (it will either be joined with a new cluster, or remade again into its own cluster) - del self.clust_counts[cl_idx] - del self.clust_sums[cl_idx] - del self.clust_members[cl_idx] - unassigned_segs.update(seg_idx) - self.clusts[seg_idx] = -1 + # TODO: memoize cumsums? + min_cs = self._Scumsum_ph(seg_idx, min = True) + min_csr = self.seg_sums[seg_idx[0]][0] - min_cs + maj_cs = self._Scumsum_ph(seg_idx, min = False) + maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs + + split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp) + split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp) + split_lik -= split_lik.max() + split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum()) + seg_idx = seg_idx[:(split_point + 1)] + + # add breakpoint (can be erased subsequently if segment rejoins original cluster) + new_bp = seg_idx[-1] + 1 + if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment + self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust) + + # propose splitting out a contiguous interval of segments within the current cluster {{{ + split_clust = False + if False and touch90 and np.random.rand() < 0.1: + # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable? + clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])]) + + # can't split clusters of length 1 + if len(clust_snps) == 1: + n_it += 1 + continue + + split_bdy = self.compute_cluster_splitpoints(clust_snps) + + A_tot, B_tot = self.clust_sums[cur_clust] + + lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp) + + liks = np.zeros(len(split_bdy) + 1) + liks[-1] = lik0 # don't split at all + + # likelihood ratios for splitting each region into a new cluster + for i, (st, en) in enumerate(split_bdy): + A = self._Ssum_ph(clust_snps[st:en], min = True) + B = self._Ssum_ph(clust_snps[st:en], min = False) + + liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + + # pick a region to split + split_idx = np.random.choice( + len(split_bdy) + 1, + p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum() + ) + + # don't split at all + if split_idx == len(split_bdy): + n_it += 1 + continue + + # seg_idx == SNPs to propose to split off + seg_idx = clust_snps[slice(*split_bdy[split_idx])] + + split_clust = True + + # add breakpoints + for si in [seg_idx[0], seg_idx[-1]]: + if si not in self.breakpoints: + seg_st_idx = self.breakpoints.bisect_left(si) - 1 + seg_st = self.breakpoints[seg_st_idx] + seg_en_idx = self.breakpoints.bisect_left(si) + seg_en = self.breakpoints[seg_en_idx] - move_clust = True + self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust) + + # get all breakpoints within this cluster/interval + left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0]) + right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1]) + break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]]) + + # }}} + + n_move = len(seg_idx) + + # if segment was already assigned to a cluster, unassign it + if cur_clust >= 0: + self.clust_counts[cur_clust] -= n_move + if self.clust_counts[cur_clust] == 0: + del self.clust_counts[cur_clust] + del self.clust_sums[cur_clust] + del self.clust_members[cur_clust] + del self.clust_members_bps[cur_clust] + else: + self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)] + self.clust_members[cur_clust] -= set(seg_idx) + for b in break_idx: + self.clust_members_bps[cur_clust].remove(self.breakpoints[b]) - if not all_assigned: - seg_touch_idx[seg_idx] += 1 + self.clusts[seg_idx] = -1 # # perform phase correction on segment/cluster # flip min/maj with probability that alleles are oriented the "wrong" way - self.rephase(seg_idx) +# if not np.isnan(self.seg_phase_probs[seg_idx[0]]): +# rfp = self.compute_rephase_prob(seg_idx) +# rfp_mem = self.seg_phase_probs[seg_idx[0]] +# if np.abs(rfp - rfp_mem) > 0.05: +# print(rfp_mem, rfp) +# breakpoint() + if np.isnan(self.seg_phase_probs[seg_idx[0]]): + self.seg_phase_probs[seg_idx[0]] = self.compute_rephase_prob(seg_idx) + rephase_prob = self.seg_phase_probs[seg_idx[0]] # # choose to join a cluster or make a new one @@ -800,49 +747,29 @@ def run(self, n_iter = 50): # C is all possible clusters to move to A_a = self.clust_sums[cur_clust][0] if cur_clust in self.clust_sums else 0 A_b = self.clust_sums[cur_clust][1] if cur_clust in self.clust_sums else 0 - B_a = self.S.iloc[seg_idx, self.min_col].sum() # TODO: slow if seg_idx contains many SNPs - B_b = self.S.iloc[seg_idx, self.maj_col].sum() - C_ab = np.r_[self.clust_sums.values()] # first terms: (-1) = make new cluster, (0) = garbage cluster + B_a = self._Ssum_ph(seg_idx, min = True) + B_b = self._Ssum_ph(seg_idx, min = False) + C_ab = np.r_[self.clust_sums.values()] # first terms: -1 = make new cluster #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust - # - # adjacent segment likelihoods - - adj_AB = 0 - adj_BC = np.zeros(len(self.clust_sums)) - - if not move_clust or (all_assigned and move_clust and np.random.rand() < 0.01): - adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust) - else: - adj_BC[self.clust_sums.index(0)] = -np.inf - # A+B,C -> A,B+C # A+B is likelihood of current cluster B is part of #AB = ss.betaln(A_a + B_a + 1, A_b + B_b + 1) # C is likelihood of target cluster pre-join - C = ss.betaln(C_ab[:, 0] + 1, C_ab[:, 1] + 1) + C = ss.betaln(C_ab[:, 0] + 1 + self.betahyp, C_ab[:, 1] + 1 + self.betahyp) + C[0] = 0 # don't count prior twice when opening a new cluster # A is likelihood cluster B is part of, minus B #A = ss.betaln(A_a + 1, A_b + 1) - # B+C is likelihood of target cluster post-join - BC = ss.betaln(C_ab[:, 0] + B_a + 1, C_ab[:, 1] + B_b + 1) - - # L(join) L(split) - #MLs = A + BC + adj_BC - (AB + C + adj_AB) - # TODO: remove extraneous calculations (e.g. adj_AB, AB, A); - # likelihood simplifies to this in the prior: - MLs = adj_BC + BC - C + # B+C is likelihood of target cluster post-join, with both phase orientations + BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1 + self.betahyp, C_ab[:, [1]] + np.c_[B_b, B_a] + 1 + self.betahyp) - # if we are moving multiple contiguous segments assigned to the same - # cluster, do not allow them to create a new cluster. this helps keep - # cluster indices consistent - if n_move > 1 and not move_clust: - MLs[self.clust_sums.index(-1)] = -np.inf + MLs = BC - C[:, None] # # priors - # prior on previous cluster fractions + ## prior on previous cluster fractions {{{ prior_diff = [] prior_com = [] @@ -867,234 +794,334 @@ def run(self, n_iter = 50): # [-1 (totally new cluster), , ] prior_idx = np.r_[ np.r_[[self.clust_prior.index(x) for x in prior_diff]], - np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null | {0})]] + np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null)]] ] + # prior marginal likelihoods for both phase orientations prior_MLs = ss.betaln( # prior clusters + segment - np.r_[self.clust_prior_mat[prior_idx, 0]] + B_a + 1, - np.r_[self.clust_prior_mat[prior_idx, 1]] + B_b + 1 + np.c_[self.clust_prior_mat[prior_idx, 0]] + np.c_[B_a, B_b] + 1, + np.c_[self.clust_prior_mat[prior_idx, 1]] + np.c_[B_b, B_a] + 1 ) \ - - (ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]) # prior clusters, segment + - np.c_[ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]] # prior clusters, segment clust_prior_p = np.maximum(np.exp(prior_MLs - prior_MLs.max())/np.exp(prior_MLs - prior_MLs.max()).sum(), 1e-300) # expand MLs to account for multiple new clusters - MLs = np.r_[np.full(len(prior_diff), MLs[0]), MLs[1:]] + MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]] + + # }}} - # DP prior based on clusters sizes - # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster) - ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]] - count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_count_prior[0], self.clust_counts.values()] - count_prior /= count_prior.sum() - - # choose to join a cluster or make a new one (choice_idx = 0) - num = MLs + np.log(count_prior) + np.log(clust_prior_p) - choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum() + ## DP prior based on clusters sizes + n_c = np.c_[self.clust_counts.values()]/self.dp_count_scale_factor + M = n_move/self.dp_count_scale_factor + N = n_c.sum() + M + log_count_prior = np.full([len(self.clust_sums), 1], np.nan) + log_count_prior[1:] = ss.gammaln(M + n_c) + ss.gammaln(N + self.alpha - M) \ + - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha)) + # probability of opening a new cluster + log_count_prior[0] = ss.gammaln(M) + np.log(self.alpha) + ss.gammaln(N + self.alpha - M) - ss.gammaln(N + self.alpha) + + # p(phase|X) + log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob])) + + # + # adjacent segment likelihood + + log_adj_lik = self.compute_adj_prob(break_idx[0]) + + # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase) + num = (MLs # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B}) + + log_adj_lik # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B}) + + log_count_prior # p(clust) (DP prior on clust counts) + + log_phase_prob) # p(phase) + + num -= num.max() # avoid underflow in sum-exp + + # p(clust,phase|X) + choice_p = np.exp(num - np.log(np.exp(num).sum())) + + # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true choice_idx = np.random.choice( - np.r_[0:len(MLs)], - p = choice_p + np.r_[0:np.prod(choice_p.shape)], + p = choice_p.ravel() ) # -1 = brand new, -2, -3, ... = -(prior clust index) - 2 - # 0 = garbage - choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx] + choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2] + + # save rephasing status + if choice_idx & 1: + self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col] + for b in break_idx: + st = self.breakpoints[b] + en = self.breakpoints[b + 1] + self.seg_sums[st] = self.seg_sums[st][::-1] # create new cluster if choice < 0: # if we are moving an entire cluster, give it the same index it used to have # otherwise, cluster indices will be inconsistent - if move_clust: - new_clust_idx = cl_idx - elif choice == -1: # totally new cluster + if cur_clust not in self.clust_counts: + new_clust_idx = cur_clust + else: # totally new cluster max_clust_idx += 1 new_clust_idx = max_clust_idx - else: # match index of cluster in prior - new_clust_idx = -choice - 2 self.clust_counts[new_clust_idx] = n_move self.S.iloc[seg_idx, self.clust_col] = new_clust_idx self.clusts[seg_idx] = new_clust_idx - self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] + self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a] self.clust_members[new_clust_idx] = set(seg_idx) - # send to garbage - elif choice == 0: - self.S.iloc[seg_idx, self.clust_col] = 0 - self.clusts[seg_idx] = 0 - # join existing cluster else: - # if we are combining two clusters, take the index of the bigger one - # this helps to keep cluster indices consistent - if move_clust and self.clust_counts[choice] < n_move: - self.clust_counts[cl_idx] = self.clust_counts[choice] - self.clust_sums[cl_idx] = self.clust_sums[choice] - self.clust_members[cl_idx] = self.clust_members[choice] - self.S.iloc[np.flatnonzero(self.S["clust"] == choice), self.clust_col] = cl_idx - del self.clust_counts[choice] - del self.clust_sums[choice] - del self.clust_members[choice] - choice = cl_idx - self.clust_counts[choice] += n_move - self.clust_sums[choice] += np.r_[B_a, B_b] + self.clust_sums[choice] += np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a] self.S.iloc[seg_idx, self.clust_col] = choice self.clusts[seg_idx] = choice self.clust_members[choice].update(set(seg_idx)) - for si in seg_idx: - unassigned_segs.discard(si) + # if segment was rephased, update saved phasing probabilities + if choice_idx & 1: + for bp_idx in break_idx: + st = self.breakpoints[bp_idx] + en = self.breakpoints[bp_idx + 1] + self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en]) + + # update breakpoints + + # B->A + # . . . break_idx + 1 + # A B A B A C B A + # + + + break_idx + #* * update_idx + + break_idx_bi = break_idx | { x + 1 for x in break_idx } + snp_idx_bi = sc.SortedSet([self.breakpoints[b] for b in break_idx_bi]) + snp_idx = sc.SortedSet([self.breakpoints[b] for b in break_idx]) + update_idx = sc.SortedSet() + for snp in snp_idx_bi: + if snp < len(self.S) and self.clusts[snp - 1] == self.clusts[snp]: + snp_idx.discard(snp) # discard rather than remvoe because this could be in snp_idx + 1 + self.breakpoints.remove(snp) + self.seg_sums.pop(snp) + self.seg_liks.pop(snp) + self.seg_phase_probs.pop(snp) + self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster + update_idx.add(self.breakpoints.bisect_left(snp) - 1) + snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1]) +# if len(update_idx): +# usnp = self.breakpoints[self.breakpoints.bisect_left(seg_idx[0]) - 1] +# print(f"{usnp}: {self.clusts[usnp]}") +# print(f"{snp_idx[0]}: {self.clusts[snp_idx[0]]} <") +# print(f"{snp_idx[1]}: {self.clusts[snp_idx[1]]} <") +# dsnp = self.breakpoints[self.breakpoints.bisect_right(seg_idx[0])] +# print(f"{dsnp}: {self.clusts[dsnp]}") +# print(f"Update: {self.breakpoints[update_idx[0]]}") + for bp_idx in update_idx: + st = self.breakpoints[bp_idx] + en = self.breakpoints[bp_idx + 1] + A = self._Ssum_ph(np.r_[st:en], min = True) + B = self._Ssum_ph(np.r_[st:en], min = False) + self.seg_sums[st] = np.r_[A, B] + self.seg_liks[st] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp) + self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en]) - # track global state of cluster assignments - # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations - if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2): - self.segs_to_clusters.append(self.S["clust"].copy()) + if choice < 0: + self.clust_members_bps[new_clust_idx] = snp_idx + else: + self.clust_members_bps[choice] |= snp_idx + +# diagnostic code to check if breakpoint list is properly updated +# if touch90: +# x = sc.SortedSet() +# for y in self.clust_members_bps.values(): +# x |= y +# if len(x) != len(self.breakpoints) - 1: +# breakpoint() + +# diagnostic code to compute overall likelihood delta for iteration +# if compute_lik: +# lik_after = self.compute_overall_lik_simple() +# lik_delta = lik_after.sum() - lik_before.sum() +# ML_choice = num.ravel()[choice_idx] +# if not np.isnan(lik_delta) and (lik_delta != 0 or ML_choice != 0): +# print("lik: {}; MLs: {}".format(lik_delta, ML_choice)) +## if lik_delta < 0 and ML_choice == 0: +## breakpoint() + + # save a sample from the MCMC when >95% of segments have been touched since the last iteration + if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95: + self.snps_to_clusters.append(self.S["clust"].copy()) self.phase_orientations.append(self.S["flipped"].copy()) + self.segment_trace.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]}) + self.likelihood_trace.append(self.compute_overall_lik_simple().sum()) n_it_last = n_it n_it += 1 - return np.r_[self.segs_to_clusters], np.r_[self.phase_orientations] - - #_colors = mpl.cm.get_cmap("tab10").colors - _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int) -# _colors = np.r_[np.c_[87, 182, 55], -# np.c_[253, 245, 81], -# np.c_[238, 109, 45], -# np.c_[204, 43, 30], -# np.c_[221, 50, 132], -# np.c_[0, 23, 204], -# np.c_[75, 172, 227]]/255 + return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations], np.r_[self.likelihood_trace] - def get_unique_clust_idxs(self, segs_to_clusters = None): - if segs_to_clusters is None: - segs_to_clusters = np.r_[self.segs_to_clusters] - s2cu, s2cu_j = np.unique(segs_to_clusters, return_inverse = True) - return s2cu, s2cu_j.reshape(segs_to_clusters.shape) + def get_unique_clust_idxs(self, snps_to_clusters = None): + if snps_to_clusters is None: + snps_to_clusters = np.r_[self.snps_to_clusters] + s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True) + return s2cu, s2cu_j.reshape(snps_to_clusters.shape) def get_colors(self): s2cu, s2cu_j = self.get_unique_clust_idxs() - seg_terr = self.S["end_gp"] - self.S["start_gp"] - tot_terr = np.zeros(len(s2cu)) - for r in s2cu_j: - tot_terr += npg.aggregate(r, seg_terr, size = len(tot_terr)) - - si = np.argsort(tot_terr)[::-1] - terr_cs = np.cumsum(tot_terr[si])/tot_terr.sum() - - return [mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())] - - def visualize_segs(self): - plt.figure() + T = pd.DataFrame(np.c_[np.r_[self.breakpoints[:-2]], np.r_[self.breakpoints[1:-1]]], columns = ["snp_st", "snp_end"]) + T["gp_st"] = self.S.loc[T["snp_st"], "pos_gp"].values + T["gp_end"] = self.S.loc[T["snp_end"], "pos_gp"].values + T["terr"] = T["gp_end"] - T["gp_st"] + T["clust"] = self.S.loc[T["snp_st"], "clust"].values + + clust_terr = T.groupby("clust")["terr"].sum().sort_values(ascending = False) + si = clust_terr.index.argsort() + + # color any cluster larger than 10Mb (~0.003 of total genomic territory) + base_colors = np.array([ + [0.368417, 0.506779, 0.709798], + [0.880722, 0.611041, 0.142051], + [0.560181, 0.691569, 0.194885], + [0.922526, 0.385626, 0.209179], + [0.528488, 0.470624, 0.701351], + [0.772079, 0.431554, 0.102387], + [0.363898, 0.618501, 0.782349], + [1, 0.75, 0], + [0.647624, 0.37816, 0.614037], + [0.571589, 0.586483, 0.], + [0.915, 0.3325, 0.2125], + [0.400822, 0.522007, 0.85], + [0.972829, 0.621644, 0.073362], + [0.736783, 0.358, 0.503027], + [0.280264, 0.715, 0.429209] + ]) + extra_colors = np.array( + distinctipy.distinctipy.get_colors( + (clust_terr/clust_terr.sum() >= 0.003).sum() - base_colors.shape[0], + exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], base_colors]], + rng = 1234 + ) + ) + + return np.r_[base_colors, extra_colors if extra_colors.size > 0 else np.empty([0, 3])][si] + + def visualize_segs(self, f = None, use_clust = False, show_snps = False, chrom = None): + f = plt.figure(figsize = [16, 4]) if f is None else f ax = plt.gca() - ax.set_xlim([0, self.S["end_gp"].max()]) + if chrom is None: + ax.set_xlim([0, self.S["pos_gp"].max()]) + else: + ax.set_xlim([*self.S.loc[self.S["chr"] == chrom, "pos_gp"].iloc[[0, -1]]]) ax.set_ylim([0, 1]) colors = self.get_colors() s2cu, s2cu_j = self.get_unique_clust_idxs() - n_samp = len(self.segs_to_clusters) - - for s2c, s2ph in zip(s2cu_j, self.phase_orientations): - # rephase segments according to phase orientation sample - S_ph = self.S.copy() - flip_idx = np.flatnonzero(s2ph != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] + n_samp = len(self.snps_to_clusters) - for i, r in enumerate(S_ph.itertuples()): - ## don't show garbage clusters - #if s2cu[s2c[i]] == 0: - # continue + selff = copy.deepcopy(self) - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], r.min + 1, r.maj + 1) - ax.add_patch(mpl.patches.Rectangle((r.start_gp, ci_lo), r.end_gp - r.start_gp, ci_hi - ci_lo, facecolor = colors[s2c[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000)) + if show_snps: + # set SNP alpha based on number of SNPs + logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M))) + default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S) if chrom is None else (self.S["chr"] == chrom).sum()) - def visualize_adjacent_segs(self, f = None, n_samp = None): - plt.figure(num = f, figsize = [17.56, 5.67]) - ax = plt.gca() - ax.set_xlim([0, self.S["end_gp"].max()]) - ax.set_ylim([0, 1]) + ph_prob = np.r_[self.phase_orientations].mean(0) - colors = self.get_colors() - s2cu, s2cu_j = self.get_unique_clust_idxs() + cu = np.searchsorted(s2cu, self.S["clust"]) - n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp + # only plot unambiguous SNPs once + uidx = ph_prob == 0 + ax.scatter( + self.S.loc[uidx, "pos_gp"], + self.S.loc[uidx, "min"]/self.S.loc[uidx, ["min", "maj"]].sum(1), + color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1 + ) + uidx = ph_prob == 1 + ax.scatter( + self.S.loc[uidx, "pos_gp"], + self.S.loc[uidx, "maj"]/self.S.loc[uidx, ["min", "maj"]].sum(1), + color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1 + ) - for s2c, s2ph in zip(s2cu_j, self.phase_orientations): - # rephase segments according to phase orientation sample - S_ph = self.S.copy() - flip_idx = np.flatnonzero(s2ph != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] - - bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] - -# s2c_nz = s2c.copy() -# zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0) -# for z in zidx: -# s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]] -# bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1]) -# bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]] - - for st, en in bdy: - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], S_ph.iloc[st:en, self.min_col].sum() + 1, S_ph.iloc[st:en, self.maj_col].sum() + 1) - ax.add_patch(mpl.patches.Rectangle((S_ph.iloc[st]["start_gp"], ci_lo), S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[s2c[st] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000)) - - def visualize_clusts(self, f = None, n_samp = None, thick = False, nocolor = False): - plt.figure(num = f, figsize = [17.56, 5.67]) - ax = plt.gca() - ax.set_xlim([0, self.S["end_gp"].max()]) - ax.set_ylim([0, 1]) + # plot ambiguous SNPs with opacity weighted by phase probability + nuidx = (ph_prob != 0) & (ph_prob != 1) + ax.scatter( + selff.S.loc[nuidx, "pos_gp"], + self.S.loc[nuidx, "min"]/self.S.loc[nuidx, ["min", "maj"]].sum(1), + color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1 + ) + ax.scatter( + selff.S.loc[nuidx, "pos_gp"], + self.S.loc[nuidx, "maj"]/self.S.loc[nuidx, ["min", "maj"]].sum(1), + color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1 + ) - colors = self.get_colors() - s2cu, s2cu_j = self.get_unique_clust_idxs() + for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations): + # only show maximum likelihood if we're overlaying SNPs + if show_snps: + mlidx = np.r_[self.likelihood_trace].argmax() + seg2c, s2ph = self.segment_trace[mlidx], self.phase_orientations[mlidx] - n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp + # get uniqued clust indices for each segment start + seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())]) - for s2c, s2ph in zip(s2cu_j, self.phase_orientations): # rephase segments according to phase orientation sample - S_ph = self.S.copy() - flip_idx = np.flatnonzero(s2ph != S_ph["flipped"]) - S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]] - - # get overall cluster sums - clust_min = npg.aggregate(s2c, S_ph["min"]) - clust_maj = npg.aggregate(s2c, S_ph["maj"]) - CIs = s.beta.ppf([0.05, 0.5, 0.95], clust_min[:, None] + 1, clust_maj[:, None] + 1) - - # get boundaries of contiguous segments - bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1]) - bdy = np.c_[bdy[:-1], bdy[1:]] - -# s2c_nz = s2c.copy() -# zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0) -# for z in zidx: -# s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]] -# bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1]) -# bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]] - - for st, en in bdy: - if thick: - b = CIs[s2c[st], 1] - 0.01 - t = CIs[s2c[st], 1] + 0.01 - else: - color = colors[s2c[st] % len(colors)] - b = CIs[s2c[st], 0] - t = CIs[s2c[st], 2] + selff.S["flipped"] = s2ph - if nocolor: - color = [0, 1, 0] - else: - color = colors[s2c[st] % len(colors)] + seg_bdy = np.r_[list(seg2c.keys()), len(selff.S)] + seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] + for i, (st, en) in enumerate(seg_bdy): + if use_clust: + ci_lo, med, ci_hi = s.beta.ppf( + [0.05, 0.5, 0.95], + selff.clust_sums[seg2c[st]][0] + 1 + self.betahyp, + selff.clust_sums[seg2c[st]][1] + 1 + self.betahyp, + ) + else: + ci_lo, med, ci_hi = s.beta.ppf( + [0.05, 0.5, 0.95], + selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp, + selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp, + ) ax.add_patch(mpl.patches.Rectangle( - xy = (S_ph.iloc[st]["start_gp"], b), - width = S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"], - height = t - b, - facecolor = color, - fill = True, - alpha = 1/n_samp, - zorder = 1000) + (selff.S.iloc[st]["pos_gp"], ci_lo), + selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"], + np.maximum(0, ci_hi - ci_lo), + facecolor = colors[seg_cu[i] % len(colors)], + edgecolor = 'k' if show_snps else None, linewidth = 0.5 if show_snps else None, + fill = True, alpha = 1 if show_snps else 1/n_samp, zorder = 1000 + )) + ax.scatter( + (selff.S.iloc[en - 1]["pos_gp"] + selff.S.iloc[st]["pos_gp"])/2, + med, + color = colors[seg_cu[i] % len(colors)], + marker = '.', s = 1, alpha = 1 if show_snps else 1/n_samp ) + + if show_snps: + break + + def visualize_clusts(self, **kwargs): + self.visualize_segs(use_clust = True, **kwargs) + + def plot_likelihood_trace(self): + lt = np.vstack(self.lik_trace) + lt = lt[np.isnan(lt).sum(1) == 0, :] + + lt = lt[self.burnin_iteration:, :] + + plt.figure(); plt.clf() + plt.scatter(np.r_[0:len(lt)], lt[:, 0] - lt[:, 0].max()) + #plt.scatter(np.r_[0:len(lt)], lt[:, 1] - lt[:, 1].max()) + plt.scatter(np.r_[0:len(lt)], lt[:, 2] - lt[:, 2].max()) + plt.scatter(np.r_[0:len(lt)], lt[:, 3] - lt[:, 3].max()) + plt.scatter(np.r_[0:len(lt)], lt.sum(1) - lt.sum(1).max(), marker = '+', color = 'k') + plt.legend(["Clust", "DP", "Seg", "Total"]) + plt.xlabel(r"Post-burnin iteration ($\times 100$)") + plt.ylabel(r"$\Delta$ likelihood") diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py index d38d67e..32e4313 100644 --- a/hapaseg/allelic_MCMC.py +++ b/hapaseg/allelic_MCMC.py @@ -24,8 +24,6 @@ def __init__(self, P, quit_after_burnin = False, n_iter = 100000, ref_bias = 1.0, - misphase_prior = 0.001, - phase_correct = False ): # # dataframe stuff @@ -59,22 +57,11 @@ def __init__(self, P, self.quit_after_burnin = quit_after_burnin - self.misphase_prior = misphase_prior - - # whether to perform phasing correction iterations - self.phase_correct = phase_correct - - # how many post-burnin samples to use to infer phase switches - self.n_phase_correct_samples = 40 - # # chain state self.iter = 1 self.burned_in = False - # whether phase correction has been performed - self.phase_correction_ready = False - # # breakpoint storage @@ -89,17 +76,8 @@ def __init__(self, P, # list of all breakpoints at nth iteration self.breakpoint_list = [] - # - # misphase interval storage - - # candidate intervals that were misphased - self.B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int) - - # current state of interval assignments (relative to B_ct) - self.F = MySortedList() - - # state of interval assignments at nth iteration - self.phase_interval_list = [] + # MLE breakpoint + self.breakpoints_MLE = None # # cumsum arrays for each segment @@ -110,19 +88,18 @@ def __init__(self, P, self.cs_MAJ = sc.SortedDict() self.cs_MIN = sc.SortedDict() - # probability of picking a breakpoint - self.split_prob = sc.SortedDict() - # # marginal likelihoods + self.betahyp = (self.P["REF_COUNT"] + self.P["ALT_COUNT"]).mean()/4 + # log marginal likelihoods for each segment # initialize with each SNP comprising its own segment. self.seg_marg_liks = sc.SortedDict(zip( range(0, len(self.P)), ss.betaln( - self.P.iloc[0:len(self.P), self.min_idx] + 1, - self.P.iloc[0:len(self.P), self.maj_idx] + 1 + self.P.iloc[0:len(self.P), self.min_idx] + 1 + self.betahyp, + self.P.iloc[0:len(self.P), self.maj_idx] + 1 + self.betahyp ) )) @@ -141,25 +118,14 @@ def _Piloc(self, st, en, col_idx, incl_idx = None): def run(self): while self.iter < self.n_iter: - # perform a split, combine, phase correct, or prune operation - op = np.random.choice(4) + # perform a split or combine + op = np.random.choice(2) if op == 0: if self.combine(np.random.choice(self.breakpoints[:-1]), force = False) == -1: continue elif op == 1: if self.split(b_idx = np.random.choice(len(self.breakpoints))) == -1: continue - elif op == 2: - if self.phase_correct and self.phase_correction_ready: - self.rephase() - else: - continue - elif op == 3: - continue - if np.random.rand() < 0.01: - self.prune() - else: - continue # if we're only running up to burnin, bail if self.quit_after_burnin and self.burned_in: @@ -172,27 +138,15 @@ def run(self): ) + colorama.Fore.RESET) return self - # correct phases after some post-burnin iterations - if not self.phase_correction_ready and self.phase_correct and \ - self.burned_in and len(self.breakpoint_list) >= 2*self.n_phase_correct_samples: - self.correct_phases() - self.phase_correction_ready = True - - # breakpoint/prune lists are liable to change after phase correction, so clear them - self.breakpoint_list = [] - self.include = [] - - # save set of breakpoints, phase intervals, and prune states if burned in - if self.burned_in and not self.iter % 100: - self.breakpoint_list.append(self.breakpoints.copy()) - self.include.append(self.P["include"].copy()) - if self.phase_correction_ready: - self.phase_interval_list.append(self.F.copy()) + # save MLE breakpoint if we've burned in + if self.burned_in: + if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]: + self.breakpoints_MLE = self.breakpoints.copy() # print status if not self.iter % 100: if self.burned_in: - color = colorama.Fore.MAGENTA if not self.phase_correction_ready else colorama.Fore.RESET + color = colorama.Fore.RESET else: color = colorama.Fore.YELLOW print("{color}[{st},{en}]\t{n}/{tot}\tn_bp = {n_bp}\tlik = {lik}".format( @@ -205,11 +159,23 @@ def run(self): color = color )) - # check if we've burned in + # check if we've burned in -- chain is oscillating around some + # optimium (and thus mean differences between marginal likelihoods might + # be slightly negative) # TODO: use a faster method of computing rolling average - if not self.burned_in and self.iter > 500: - if np.diff(self.marg_lik[(self.iter - 500):self.iter]).mean() < 0: + if not self.burned_in and self.iter > 1000: + if np.diff(self.marg_lik[(self.iter - 1000):self.iter]).mean() < 0: self.burned_in = True + # contingency if we've unambiguously converged on an optimum and chain has not moved at all + # exit early to save time + if (np.diff(self.marg_lik[(self.iter - 1000):self.iter]) == 0).all(): + self.breakpoints_MLE = self.breakpoints.copy() + print(colorama.Fore.GREEN + "Chain has unambiguously converged on an optimum; stopping early in {n} iterations. n_bp = {n_bp}, lik = {lik}".format( + n = self.iter, + n_bp = len(self.breakpoints), + lik = self.marg_lik[self.iter] + ) + colorama.Fore.RESET) + return self self.iter += 1 @@ -245,8 +211,8 @@ def combine(self, st = None, b_idx = None, force = True): ML_split = self.seg_marg_liks[st] + self.seg_marg_liks[mid] ML_join = ss.betaln( - self._Piloc(st, en, self.min_idx).sum() + 1, - self._Piloc(st, en, self.maj_idx).sum() + 1 + self._Piloc(st, en, self.min_idx).sum() + 1 + self.betahyp, + self._Piloc(st, en, self.maj_idx).sum() + 1 + self.betahyp ) # proposal dist. ratio @@ -276,299 +242,6 @@ def combine(self, st = None, b_idx = None, force = True): return mid - def flip_hap(self, st, en): - """ - Flips the SNPs from st to en - """ - - x = self.P.iloc[st:en, self.maj_idx].copy() - self.P.iloc[st:en, self.maj_idx] = self.P.iloc[st:en, self.min_idx] - self.P.iloc[st:en, self.min_idx] = x - - def prob_misphase(self, bdy1, bdy2): - """ - Compute probability of misphase - """ - # TODO: change invocation to st, mid, en -- we don't need to correct - # phasing of noncontiguous segments - - # prior on misphasing probability - p_mis = self.misphase_prior if np.isnan(self.P.loc[bdy1[1] - 1, "misphase_prob"]) else self.P.loc[bdy1[1] - 1, "misphase_prob"] - if p_mis == 0: - return -np.inf, 0 - - # haps = x/y, segs = 1/2, beta params. = A/B - - # seg 1 - rng_idx = (self.P.index >= bdy1[0]) & (self.P.index < bdy1[1]) - - idx = rng_idx & self.P["aidx"] & self.P["include"] - x1_A = self.P.loc[idx, "ALT_COUNT"].sum() - x1_B = self.P.loc[idx, "REF_COUNT"].sum() - - idx = rng_idx & ~self.P["aidx"] & self.P["include"] - y1_A = self.P.loc[idx, "ALT_COUNT"].sum() - y1_B = self.P.loc[idx, "REF_COUNT"].sum() - - # seg 2 - rng_idx = (self.P.index >= bdy2[0]) & (self.P.index < bdy2[1]) - - idx = rng_idx & self.P["aidx"] & self.P["include"] - x2_A = self.P.loc[idx, "ALT_COUNT"].sum() - x2_B = self.P.loc[idx, "REF_COUNT"].sum() - - idx = rng_idx & ~self.P["aidx"] & self.P["include"] - y2_A = self.P.loc[idx, "ALT_COUNT"].sum() - y2_B = self.P.loc[idx, "REF_COUNT"].sum() - - lik_mis = ss.betaln(x1_A + y1_B + y2_A + x2_B + 1, y1_A + x1_B + x2_A + y2_B + 1) - lik_nomis = ss.betaln(x1_A + y1_B + x2_A + y2_B + 1, y1_A + x1_B + y2_A + x2_B + 1) - - # logsumexp - m = np.maximum(lik_mis, lik_nomis) - denom = m + np.log(np.exp(lik_mis - m)*p_mis + np.exp(lik_nomis - m)*(1 - p_mis)) - - return lik_mis + np.log(p_mis) - denom, lik_nomis + np.log(1 - p_mis) - denom - - def correct_phases(self): - """ - Compute potentially misphased intervals, given some segmentation samples - """ - if not self.burned_in or len(self.breakpoint_list) == 0: - raise RuntimeError("Breakpoint sample list must be populated (chain must be burned in)") - - #A_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int) - #B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int) - - for bp_idx in np.random.choice(len(self.breakpoint_list), self.n_phase_correct_samples, replace = False): - bpl = np.array(self.breakpoint_list[bp_idx]); bpl = np.c_[bpl[:-1], bpl[1:]] - - p_mis = np.full(len(bpl) - 1, np.nan) - p_A = np.full(len(bpl) - 1, np.nan) - p_B = np.full(len(bpl) - 1, np.nan) - - V = np.full([len(bpl) - 1, 2], np.nan) - B = np.zeros([len(bpl) - 1, 2], dtype = np.uint8) - - for i, (st, mid, _, en) in enumerate(np.c_[bpl[:-1], bpl[1:]]): - p_mis, p_nomis = self.prob_misphase([st, mid], [mid, en]) - - # TODO: memoize partial sums - - # prob. that left segment is on hap. A - p_A1 = s.beta.logsf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1) - # prob. that right segment is on hap. A - p_A2 = s.beta.logsf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1) - - # prob. that left segment is on hap. B - p_B1 = s.beta.logcdf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1) - # prob. that right segment is on hap. B - p_B2 = s.beta.logcdf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1) - - if i == 0: - V[i, :] = [p_A1, p_B1] - continue - - p_AB = p_mis + p_A1 + p_B2 - p_BA = p_mis + p_B1 + p_A2 - p_AA = p_nomis + p_A1 + p_A2 - p_BB = p_nomis + p_B1 + p_B2 - - V[i, 0] = np.max(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]]) - V[i, 1] = np.max(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]]) - - B[i, 0] = np.argmax(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]]) - B[i, 1] = np.argmax(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]]) - - # backtrace - BT = np.full(len(B), -1, dtype = np.uint8) - ix = np.argmax(V[-1]) - BT[-1] = ix - for i, b in reversed(list(enumerate(B[:-1]))): - ix = b[ix] - BT[i] = ix - - # join contiguous segments assigned to hap. B - d = np.diff(BT, append = 0, prepend = 0) - ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1] - b_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]] - -# # join contiguous segments assigned to hap. A -# d = np.diff(1 - BT, append = 0, prepend = 0) -# ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1] -# a_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]] - - # plot - #for x in np.flatnonzero(BT): - # plt.plot(self.P.loc[bpl[x], "pos"], np.r_[j + 1, j + 1]*0.01) - - # record - for x in b_segs_j: - self.B_ct[x[0], x[1]] += 1 -# for x in a_segs_j: -# A_ct[x[0], x[1]] += 1 - -# # plot -# for k, v in B_ct.items(): -# for _ in range(0, v): -# plt.plot(self.P.iloc[np.r_[k], self.P.columns.get_loc("pos")], 0.2*np.random.rand()*np.r_[1, 1]) - - # MCMC iteration that corrects a phase - def rephase(self): # TODO: add parameters to force an interval? - # TODO: prerequisite checks; has correct_phases() been run? - choice = list(self.B_ct.keys()) - probs = np.r_[list(self.B_ct.values())] - - # - # propose an interval to flip from B->A - st, en = choice[np.random.choice(np.r_[0:len(choice)], p = probs/probs.sum())] - - # - # check if this overlaps any other regions that were already flipped B->A. - - # any previously flipped regions contained within will be left alone - - # return range of flipped region array that [st, en) overlaps - # TODO: rename this; f_o is a terrible name - def f_o(st = st, en = en): - st_idx = self.F.bisect_left(st + 1); st_idx -= st_idx % 2 - en_idx = self.F.bisect_right(en - 1); en_idx += en_idx % 2 - return slice(st_idx, en_idx) - - overlaps = np.array(self.F[f_o()]).reshape(-1, 2) - o_S = sc.SortedSet({st, en}) - for o in overlaps: - o_S.add(o[0]) - o_S.add(o[1]) - - # somewhere we ought to assert that the length of self.F is even - - # get list of regions to flip - flip_candidates = np.r_[o_S] # all possible regions to flip - flip_idx = np.zeros(len(flip_candidates) - 1, dtype = np.bool) # index of regions that haven't been flipped yet - A_flag = True # whether st:en consists entirely of regions that were flipped to A - for i, (st_seg, en_seg) in enumerate(np.c_[flip_candidates[:-1], flip_candidates[1:]]): - # this region was not already flipped B->A - if not self.F[f_o(st_seg, en_seg)]: - flip_idx[i] = True - A_flag = False - - flips = np.c_[flip_candidates[:-1], flip_candidates[1:]][flip_idx, :] - - # - # get full range of CNV breakpoints this region spans - st_reg = self.breakpoints.bisect_left(o_S[0]) - en_reg = self.breakpoints.bisect_right(o_S[-1]) - breakpoints0 = sc.SortedSet(self.breakpoints[(st_reg - 1):(en_reg + 1)]) - - # - # get initial marginal likelihood of this configuration - ML_orig = 0 - for b in breakpoints0[:-1]: - ML_orig += self.seg_marg_liks[b] - - # - # perform flips; update breakpoint list accordingly - for st_seg, en_seg in flips: - # if flip boundary corresponds to an extant breakpoint, remove it - # (we will propose joining these segments after flip) - if st_seg in breakpoints0: - breakpoints0 -= {st_seg} - # otherwise, add the flip boundary as a new breakpoint - # (we will propose introducing a new segment after flip) - else: - breakpoints0.add(st_seg) - if en_seg in breakpoints0: - breakpoints0 -= {en_seg} - else: - breakpoints0.add(en_seg) - - self.flip_hap(st_seg, en_seg) - - # - # if st:en is entirely assigned to A, try to flip it back to B (i.e. it was a false flip) - if A_flag: - if flip_candidates[0] in breakpoints0: - breakpoints0 -= {flip_candidates[0]} - else: - breakpoints0.add(flip_candidates[0]) - if en_reg in breakpoints0: - breakpoints0 -= {flip_candidates[-1]} - else: - breakpoints0.add(flip_candidates[-1]) - - self.flip_hap(flip_candidates[0], flip_candidates[-1]) - - # - # get marginal likelihood post-flip and breakpoint adjustment - bps = np.r_[breakpoints0] - ML = 0 - for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]: - ML += ss.betaln( - self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1, - self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1 - ) - - # - # probabilistically accept new configuration - if np.log(np.random.rand()) < np.minimum(0, ML - ML_orig): - # - # update F array - - # we could have either flipped a region from B->A ... - if not A_flag: - for st_seg, en_seg in flips: - self.F.update([st_seg, en_seg]) - - # ... or reverted a flip - else: - for p in self.F[f_o(flip_candidates[0], flip_candidates[-1])]: - self.F.remove(p) - - # - # combine contiguous intervals in F array - # TODO - - # - # update breakpoint list and seg. marg. liks - bps_to_del = list(self.breakpoints.islice( - self.breakpoints.bisect_left(breakpoints0[0]), - self.breakpoints.bisect_right(breakpoints0[-1]) - )) - for x in bps_to_del: - self.breakpoints.remove(x) - self.breakpoints.update(breakpoints0) - - # - # update seg. marg. liks - # TODO: recomputing each sum (even if in the future we use memoization) - # is wasteful. intelligently pick which seg_marg_liks keys to update. - for x in bps_to_del[:-1]: - self.seg_marg_liks.__delitem__(x) - for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]: - self.seg_marg_liks[st_bp] = ss.betaln( - self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1, - self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1 - ) - - self.marg_lik[self.iter] = self.marg_lik[self.iter - 1] - ML_orig + ML - - # - # revert - else: - # flip each region back - for st_seg, en_seg in flips: - self.flip_hap(st_seg, en_seg) - if A_flag: - self.flip_hap(flip_candidates[0], flip_candidates[-1]) - - self.marg_lik[self.iter] = self.marg_lik[self.iter - 1] - - def compute_all_cumsums(self): - bpl = np.array(self.breakpoints); bpl = np.c_[bpl[0:-1], bpl[1:]] - for st, en in bpl: - self.cs_MAJ[st], self.cs_MIN[st], self.split_prob[st] = self.compute_cumsum(st, en) - def compute_cumsum(self, st, en): # major cs_MAJ = np.zeros(en - st, dtype = np.int) @@ -582,7 +255,7 @@ def compute_cumsum(self, st, en): cs_MIN[i - st] = cs_MIN[i - st - 1] + (self.P.iat[i, self.min_idx] if self.P.iat[i, self.P.columns.get_loc("include")] else 0) # marginal likelihoods - ml = ss.betaln(cs_MAJ + 1, cs_MIN + 1) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1, cs_MIN[-1] - cs_MIN + 1) + ml = ss.betaln(cs_MAJ + 1 + self.betahyp, cs_MIN + 1 + self.betahyp) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1 + self.betahyp, cs_MIN[-1] - cs_MIN + 1 + self.betahyp) # prior # TODO: allow user to specify @@ -631,12 +304,12 @@ def split(self, st = None, b_idx = None): # M-H acceptance seg_lik_1 = ss.betaln( - self._Piloc(st, mid, self.min_idx).sum() + 1, - self._Piloc(st, mid, self.maj_idx).sum() + 1 + self._Piloc(st, mid, self.min_idx).sum() + 1 + self.betahyp, + self._Piloc(st, mid, self.maj_idx).sum() + 1 + self.betahyp ) seg_lik_2 = ss.betaln( - self._Piloc(mid, en, self.min_idx).sum() + 1, - self._Piloc(mid, en, self.maj_idx).sum() + 1 + self._Piloc(mid, en, self.min_idx).sum() + 1 + self.betahyp, + self._Piloc(mid, en, self.maj_idx).sum() + 1 + self.betahyp ) ML_split = seg_lik_1 + seg_lik_2 @@ -691,21 +364,21 @@ def prune(self): # q_i = seg(A - A_i, B - B_i) + garbage(A_i, B_i) + (1 - include prior_i) # - (seg(A, B) + (include prior_i)) r_exc = ss.betaln( - A_inc_s - I["MIN_COUNT"] + 1, - B_inc_s - I["MAJ_COUNT"] + 1 - ) + ss.betaln(I["MIN_COUNT"] + 1, I["MAJ_COUNT"] + 1) \ + A_inc_s - I["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s - I["MAJ_COUNT"] + 1 + self.betahyp + ) + ss.betaln(I["MIN_COUNT"] + 1 + self.betahyp, I["MAJ_COUNT"] + 1 + self.betahyp) \ + np.log(1 - I["include_prior"]) \ - - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + np.log(I["include_prior"])) + - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + np.log(I["include_prior"])) # 2. probability to include SNPs (that were previously excluded) # q_i = seg(A + A_i, B + B_i) + (include prior_i) # - (seg(A, B) + garbage(A_i, B_i) + (1 - include prior_i)) r_inc = ss.betaln( - A_inc_s + E["MIN_COUNT"] + 1, - B_inc_s + E["MAJ_COUNT"] + 1 + A_inc_s + E["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s + E["MAJ_COUNT"] + 1 + self.betahyp ) + np.log(E["include_prior"]) \ - - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + \ - ss.betaln(E["MIN_COUNT"] + 1, E["MAJ_COUNT"] + 1) + \ + - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + \ + ss.betaln(E["MIN_COUNT"] + 1 + self.betahyp, E["MAJ_COUNT"] + 1 + self.betahyp) + \ np.log(1 - E["include_prior"])) r_cat = pd.concat([r_inc, r_exc]).sort_index() @@ -739,18 +412,18 @@ def prune(self): # regardless, code for computing q_star is the same r_exc_star = ss.betaln( - A_inc_s_star - I_star["MIN_COUNT"] + 1, - B_inc_s_star - I_star["MAJ_COUNT"] + 1 - ) + ss.betaln(I_star["MIN_COUNT"] + 1, I_star["MAJ_COUNT"] + 1) \ + A_inc_s_star - I_star["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s_star - I_star["MAJ_COUNT"] + 1 + self.betahyp + ) + ss.betaln(I_star["MIN_COUNT"] + 1 + self.betahyp, I_star["MAJ_COUNT"] + 1 + self.betahyp) \ + np.log(1 - I_star["include_prior"]) \ - - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + np.log(I_star["include_prior"])) + - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + np.log(I_star["include_prior"])) r_inc_star = ss.betaln( - A_inc_s_star + E_star["MIN_COUNT"] + 1, - B_inc_s_star + E_star["MAJ_COUNT"] + 1 + A_inc_s_star + E_star["MIN_COUNT"] + 1 + self.betahyp, + B_inc_s_star + E_star["MAJ_COUNT"] + 1 + self.betahyp ) + np.log(E_star["include_prior"]) \ - - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + \ - ss.betaln(E_star["MIN_COUNT"] + 1, E_star["MAJ_COUNT"] + 1) + \ + - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + \ + ss.betaln(E_star["MIN_COUNT"] + 1 + self.betahyp, E_star["MAJ_COUNT"] + 1 + self.betahyp) + \ np.log(1 - E_star["include_prior"])) r_cat_star = pd.concat([r_inc_star, r_exc_star]).sort_index() @@ -771,8 +444,8 @@ def prune(self): self.marg_lik[self.iter] -= self.seg_marg_liks[st] self.seg_marg_liks[st] = ss.betaln( - T.loc[T["include"], "MIN_COUNT"].sum() + 1, - T.loc[T["include"], "MAJ_COUNT"].sum() + 1, + T.loc[T["include"], "MIN_COUNT"].sum() + 1 + self.betahyp, + T.loc[T["include"], "MAJ_COUNT"].sum() + 1 + self.betahyp, ) self.marg_lik[self.iter] += self.seg_marg_liks[st] @@ -780,8 +453,8 @@ def prune(self): # effectively their own segments) self.marg_lik[self.iter] += (1 if ~self.P.at[choice_idx, "include"] else -1)* \ ss.betaln( - self.P.at[choice_idx, "MIN_COUNT"] + 1, - self.P.at[choice_idx, "MAJ_COUNT"] + 1 + self.P.at[choice_idx, "MIN_COUNT"] + 1 + self.betahyp, + self.P.at[choice_idx, "MAJ_COUNT"] + 1 + self.betahyp ) # TODO: update segment partial sums (when we actually use these) @@ -812,20 +485,16 @@ def incr_bp_counter(self, st, en, mid = None): self.breakpoint_counter[(mid + 1):en] += np.r_[0, 1] def visualize(self, show_CIs = False): - Ph = self.P.copy() - CI = s.beta.ppf([0.05, 0.5, 0.95], Ph["MIN_COUNT"][:, None] + 1, Ph["MAJ_COUNT"][:, None] + 1) - Ph[["CI_lo_hap", "median_hap", "CI_hi_hap"]] = CI - - plt.figure(); plt.clf() + plt.figure(figsize = [16, 4]); plt.clf() ax = plt.gca() # SNPs - ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4) + ax.scatter(self.P["pos"], self.P["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)], alpha = 0.5, s = 4, marker = '.') if show_CIs: - ax.errorbar(Ph["pos"], y = Ph["median_hap"], yerr = np.c_[Ph["median_hap"] - Ph["CI_lo_hap"], Ph["CI_hi_hap"] - Ph["median_hap"]].T, fmt = 'none', alpha = 0.5, color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)]) + ax.errorbar(self.P["pos"], y = self.P["median_hap"], yerr = np.c_[self.P["median_hap"] - self.P["CI_lo_hap"], self.P["CI_hi_hap"] - self.P["median_hap"]].T, fmt = 'none', alpha = 0.1, ecolor = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)]) # mask excluded SNPs - ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values) + # ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values) # breakpoints # bp_prob = self.breakpoint_counter[:, 0]/self.breakpoint_counter[:, 1] @@ -840,47 +509,22 @@ def visualize(self, show_CIs = False): # ax2.set_xlim(ax.get_xlim()); # ax2.set_xlabel("Breakpoint number in current MCMC iteration") - # beta CI's weighted by breakpoints - # flip current rephases back to baseline - for st, en in self.F.intervals(): - # code excised from flip_hap - x = Ph.iloc[st:en, self.maj_idx].copy() - Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx] - Ph.iloc[st:en, self.min_idx] = x - - pos_col = Ph.columns.get_loc("pos") - for bp_samp, pi_samp, inc_samp in itertools.zip_longest(self.breakpoint_list, self.phase_interval_list, self.include): - # flip everything according to sample - # if we did not perform phase correction, pi_samp will be none (hence - # the use of zip_longest above) - if pi_samp is not None: - for st, en in pi_samp.intervals(): - # TODO: can replace with flip_hap()? - x = Ph.iloc[st:en, self.maj_idx].copy() - Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx] - Ph.iloc[st:en, self.min_idx] = x - - # SNPs TODO: plot only those that flipped, in a diff. color? - #ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4) - - bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]] - for st, en in bpl: - Phi = Ph.iloc[st:en]; Phi = Phi.loc[inc_samp] - ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], Phi.iloc[:, self.min_idx].sum() + 1, Phi.iloc[:, self.maj_idx].sum() + 1) - ax.add_patch(mpl.patches.Rectangle((Ph.iloc[st, pos_col], ci_lo), Ph.iloc[en, pos_col] - Ph.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'k', alpha = 1/len(self.breakpoint_list), zorder = 1000)) - - # flip everything back - if pi_samp is not None: - for st, en in pi_samp.intervals(): - # TODO: can replace with flip_hap()? - x = Ph.iloc[st:en, self.maj_idx].copy() - Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx] - Ph.iloc[st:en, self.min_idx] = x + bpl = self.breakpoints if self.breakpoints_MLE is None else self.breakpoints_MLE + bpl = np.array(bpl); bpl = np.c_[bpl[0:-1], bpl[1:]] + + pos_col = self.P.columns.get_loc("pos") + for st, en in bpl: + ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], self.P.iloc[st:en, self.maj_idx].sum() + 1, self.P.iloc[st:en, self.min_idx].sum() + 1) + ax.add_patch(mpl.patches.Rectangle((self.P.iloc[st, pos_col], ci_lo), self.P.iloc[en, pos_col] - self.P.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'lime', alpha = 0.4, zorder = 1000)) # 50:50 line ax.axhline(0.5, color = 'k', linestyle = ":") ax.set_xticks(np.linspace(*plt.xlim(), 20)); - ax.set_xticklabels(Ph["pos"].searchsorted(np.linspace(*plt.xlim(), 20))); + ax.set_xticklabels(self.P["pos"].searchsorted(np.linspace(*plt.xlim(), 20))); ax.set_xlabel("SNP index") ax.set_ylim([0, 1]) + + ax.set_title(f"{self.P.iloc[0]['chr']}:{self.P.iloc[0]['pos']}-{self.P.iloc[-1]['pos']}") + + plt.tight_layout() diff --git a/hapaseg/coverage_DP.py b/hapaseg/coverage_DP.py index 07a85e8..17687f2 100644 --- a/hapaseg/coverage_DP.py +++ b/hapaseg/coverage_DP.py @@ -116,7 +116,7 @@ def __init__(self, cov_df, beta, bin_exposure, prior_run=None, count_prior_sum=N self.seg_id_col = self.cov_df.columns.get_loc('segment_ID') self.beta = beta self.bin_exposure=bin_exposure - self.covar_cols = sorted([c for c in self.cov_df.columns if "C_" in c]) + self.covar_cols = sorted(self.cov_df.columns[self.cov_df.columns.str.contains("^C_.*_z$")]) self.num_segments = self.cov_df.iloc[:, self.seg_id_col].max() + 1 self.segment_r_list = [None] * self.num_segments diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py index 43d54d4..3284123 100644 --- a/hapaseg/model_optimizers.py +++ b/hapaseg/model_optimizers.py @@ -2,30 +2,42 @@ class PoissonRegression: - def __init__(self, r, C, Pi): + def __init__(self, r, C, Pi, + log_exposure = 0, log_offset = 0, intercept = True, + mumu = 0, musig2 = 10, betamu = None, betasiginv = None): self.r = r self.C = C self.Pi = Pi + self.log_exposure = log_exposure + self.log_offset = log_offset + self.intercept = intercept - self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.beta = np.ones([C.shape[1], 1]) - self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu) + self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure + self.beta = np.zeros([C.shape[1], 1]) + self.f = 1 + self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) + + # prior parameters + self.mumu = mumu + self.musig2 = musig2 + self.betamu = np.zeros_like(self.beta) if betamu is None else betamu + self.betasiginv = 1/np.sqrt(10)*np.eye(len(self.beta)) if betasiginv is None else betasiginv # mu gradient def gradmu(self): - return self.Pi.T @ (self.r - self.e_s) + return self.Pi.T @ (self.r - self.e_s) - (self.mu - self.mumu)/self.musig2 # mu Hessian def hessmu(self): - return (-self.Pi.T * self.e_s.T) @ self.Pi + return (-self.Pi.T * self.e_s.T) @ self.Pi - 1/self.musig2 # beta gradient def gradbeta(self): - return self.C.T @ (self.r - self.e_s) + return self.C.T @ (self.r - self.e_s) - self.betasiginv@(self.beta - self.betamu) # beta Hessian def hessbeta(self): - return (-self.C.T * self.e_s.T) @ self.C + return (-self.C.T * self.e_s.T) @ self.C - self.betasiginv # mu,beta Hessian def hessmubeta(self): @@ -33,23 +45,63 @@ def hessmubeta(self): def NR_poisson(self): for i in range(100): - self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu) - gmu = self.gradmu() + self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) gbeta = self.gradbeta() - grad = np.r_[gmu, gbeta] + if self.intercept: + gmu = self.gradmu() + grad = np.r_[gmu, gbeta] + else: + grad = gbeta - hmu = self.hessmu() hbeta = self.hessbeta() - hmubeta = self.hessmubeta() - H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + if self.intercept: + hmubeta = self.hessmubeta() + hmu = self.hessmu() + H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + else: + H = hbeta delta = np.linalg.inv(H) @ grad - self.mu -= delta[0:len(self.mu)] - self.beta -= delta[len(self.mu):] + if self.intercept: + self.mu -= delta[0:len(self.mu)] + self.beta -= delta[len(self.mu):] + else: + self.beta -= delta if np.linalg.norm(grad) < 1e-5: break def fit(self): self.NR_poisson() - return self.mu, self.beta + if self.intercept: + return self.mu, self.beta + else: + return self.beta + + def hess(self): + hbeta = self.hessbeta() + if self.intercept: + hmu = self.hessmu() + hmubeta = self.hessmubeta() + return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]] + else: + return hbeta + + # scale factor + def gradf(self): + return (self.C@self.beta).T@(self.r - self.e_s) + + def hessf(self): + CB = self.C@self.beta + return -(CB*self.e_s).T@CB + + def NR_f(self): + for i in range(100): + self.e_s = np.exp(self.f*self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset) + gf = self.gradf() + hf = self.hessf() + + self.f -= gf/hf + + if np.linalg.norm(gf) < 1e-5: + break diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py index 5370708..e880c13 100644 --- a/hapaseg/run_coverage_MCMC.py +++ b/hapaseg/run_coverage_MCMC.py @@ -1,9 +1,12 @@ import numpy as np import pandas as pd +import pickle import glob import re import os import scipy.special as ss +import sys +import tqdm from capy import mut, seq import scipy.stats as stats from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB @@ -16,19 +19,28 @@ def __init__(self, coverage_csv, f_allelic_clusters, f_SNPs, + f_segs, + ref_fasta, f_repl, + f_faire, f_GC=None, num_draws=50, cluster_num=None, - allelic_sample=None + allelic_sample=None, + bin_width=1, ): self.num_draws = num_draws self.cluster_num = cluster_num self.f_repl = f_repl + self.f_faire = f_faire self.f_GC = f_GC + self.ref_fasta = ref_fasta + self.bin_width = bin_width self.allelic_clusters = np.load(f_allelic_clusters) + with open(f_segs, "rb") as f: + self.segmentations = pickle.load(f) # coverage input is expected to be a df file with columns: ["chr", "start", "end", "covcorr", "covraw"] self.full_cov_df = self.load_coverage(coverage_csv) self.load_covariates() @@ -37,7 +49,7 @@ def __init__(self, if allelic_sample is not None: self.allelic_sample = allelic_sample else: - self.allelic_sample = self.select_ADP_cluster() + self.allelic_sample = np.argmax(self.allelic_clusters["likelihoods"]) self.model = None @@ -56,165 +68,202 @@ def run_all_clusters(self): # Do preprocessing for running on each ADP cluster individually def prepare_single_cluster(self): Pi, r, C, filtered_cov_df = self.assign_clusters() - pois_regr = PoissonRegression(r, C, Pi) + pois_regr = PoissonRegression(r, C, Pi, log_exposure = np.log(self.bin_width)) all_mu, global_beta = pois_regr.fit() + pois_hess = pois_regr.hess() # save these results to a numpy object - return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample - - # method for calculating DP prior likelihood of an ADP cluster - @staticmethod - def dp_prior(cluster_counts_arr, alpha): - N = cluster_counts_arr.sum() - m = len(cluster_counts_arr) - return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha) - - # method for selecting ADP clustering based on likelihoods - def select_ADP_cluster(self): - ADP_draws = self.allelic_clusters["snps_to_clusters"] - tmp_snps = self.SNPs.copy() - lls = [] - for ADP_draw in ADP_draws: - tmp_snps['cluster_assignment'] = ADP_draw - count_arr = tmp_snps.groupby(by='cluster_assignment').agg({"maj":sum, "min":sum}).values - count_arr += 1 - beta_ll = ss.betaln(count_arr[:, 0], count_arr[:, 1]).sum() - cluster_counts = tmp_snps['cluster_assignment'].value_counts().values - dp_ll = self.dp_prior(cluster_counts, 0.5) - lls.append(beta_ll + dp_ll) - lls = np.array(lls) - lls_max = np.max(lls) - choice_p = np.exp(lls - lls_max) / np.exp(lls - lls_max).sum() - return np.random.choice(len(ADP_draws), p=choice_p) - - - @staticmethod - def load_coverage(coverage_csv): - Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False) + return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample, pois_hess + + def load_coverage(self, coverage_csv): + Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_frags", "tot_reads", "fail_reads"], low_memory=False) Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions Cov["chr"] = mut.convert_chr(Cov["chr"]) Cov = Cov.loc[Cov["chr"] != 0] Cov=Cov.reset_index(drop=True) - Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"]) - Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"]) + Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"], ref = self.ref_fasta) + Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"], ref = self.ref_fasta) return Cov def load_SNPs(self, f_snps): SNPs = pd.read_pickle(f_snps) - SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"]) - - SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False) + mut.map_mutations_to_targets(SNPs, self.full_cov_df) return SNPs def generate_GC(self): #grab fasta object from seq to avoid rebuilding + seq.set_reference(self.ref_fasta) F = seq._fa.ref_fa_obj self.full_cov_df['C_GC'] = np.nan #this indexing assumes 0-indexed start and end cols - for (i, chrm, start, end) in self.full_cov_df[['chr', 'start','end']].itertuples(): + for (i, chrm, start, end) in tqdm.tqdm(self.full_cov_df[['chr', 'start','end']].itertuples(), total = len(self.full_cov_df)): self.full_cov_df.iat[i, -1] = F[chrm-1][start:end+1].gc def load_covariates(self): - #check if we are doing wgs, in which case we will have uniform 200 bp bins - wgs = True if self.f_GC is not None or len(self.full_cov_df) > 100000 else False - - #we only need bin size if doing exomes - if not wgs: - self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1) + zt = lambda x : (x - np.nanmean(x))/np.nanstd(x) + + ## Target size + + # we only need bin size if doing exomes but we can check by looking at the bin lengths + self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1) + self.full_cov_df["C_log_len_z"] = zt(self.full_cov_df["C_log_len"]) - #this is a safety in case we are doing wgs but have few bins - if (np.diff(self.full_cov_df["C_log_len"]) == 0).all(): - #remove the len col since it will ruin beta fitting - self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1) + # in case we are doing wgs these will all be the same and we must remove + if (np.diff(self.full_cov_df["C_log_len"]) == 0).all(): + #remove the len col since it will ruin beta fitting + self.full_cov_df = self.full_cov_df.drop(['C_log_len', 'C_log_len_z'], axis=1) + + ## Fragment length + + # some bins have zero mean fragment length; these bins are bad and should be removed + self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True) + + self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" }) + self.full_cov_df["C_frag_len_z"] = zt(np.log(self.full_cov_df["C_frag_len"])) + + # generate on 5x and 11x scales + swv = np.lib.stride_tricks.sliding_window_view + fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0 + wt = self.full_cov_df["num_frags"].values + for scale in [5, 11]: + fl_sw = swv(np.pad(fl, scale//2), scale) + wt_sw = swv(np.pad(wt, scale//2), scale) + conv = np.einsum('ij,ij->i', wt_sw, fl_sw) + + self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1) + self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(np.log(self.full_cov_df[f"C_frag_len_{scale}x"])) + + ### track-based covariates + # use midpoint of coverage bins to map to intervals + self.full_cov_df["midpoint"] = ((self.full_cov_df["end"] + self.full_cov_df["start"])/2).astype(int) + + ## Replication timing + # load repl timing F = pd.read_pickle(self.f_repl) # map targets to RT intervals - tidx = mut.map_mutations_to_targets(self.full_cov_df.rename(columns={"start": "pos"}), F, inplace=False) + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") self.full_cov_df['C_RT'] = np.nan self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values # z-transform - self.full_cov_df["C_RT_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))( - np.log(self.full_cov_df["C_RT"] + 1e-20)) + self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"])) + + ## GC content # load GC content if we have it precomputed, otherwise generate it - if wgs and self.f_GC is not None and os.path.exists(self.f_GC): - print("Using precomputed GC content") + if self.f_GC is not None and os.path.exists(self.f_GC): + print("Using precomputed GC content", file = sys.stderr) B = pd.read_pickle(self.f_GC) self.full_cov_df = self.full_cov_df.merge(B.rename(columns={"gc": "C_GC"}), left_on=["chr", "start", "end"], right_on=["chr", "start", "end"], how="left") else: - print("Computing GC content") + print("Computing GC content", file = sys.stderr) self.generate_GC() - - self.full_cov_df["C_GC_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))( - np.log(self.full_cov_df["C_GC"] + 1e-20)) - - #set zero coverage bins to nan - self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan) - - # add fragment based covars - self.full_cov_df["C_frag_len"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["mean_frag_len"] + 1e-20)) - self.full_cov_df["C_frag_std"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["std_frag_len"] + 1e-20)) - # drop non z-cetered cols - self.full_cov_df = self.full_cov_df.drop(['C_GC', 'C_RT'], axis=1) - + # GC content follows a roughly quadratic relationship with coverage + self.full_cov_df["C_GC2"] = self.full_cov_df["C_GC"]**2 + + ## FAIRE + + if self.f_faire is not None: + F = pd.read_pickle(self.f_faire) + + # map targets to FAIRE intervals + tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint") + F = F.loc[tidx].set_index(tidx.index).iloc[:, 3:].rename(columns = lambda x : "C_" + x) + self.full_cov_df = pd.concat([self.full_cov_df, F], axis = 1) + + # z-transform + self.full_cov_df = pd.concat([ + self.full_cov_df, + self.full_cov_df.loc[:, F.columns].apply(lambda x : zt(np.log(x + 1))).rename(columns = lambda x : x + "_z") + ], axis = 1) # use SNP cluster assignments from the given draw assign coverage bins to clusters # clusters with snps from different clusters are probabliztically assigned # method returns coverage df with only bins that overlap snps def assign_clusters(self): - # generate unique clust assignments + ## generate unique clust assignments clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample] clust_u, clust_uj = np.unique(clust_choice, return_inverse=True) clust_uj = clust_uj.reshape(clust_choice.shape) + self.SNPs["clust_choice"] = clust_uj - # assign coverage intervals to clusters - Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1]) - - # first compute assignment probabilities based on the SNPs within each bin - for targ, snp_idx in self.SNPs.groupby("tidx").indices.items(): - targ_clust_hist = np.bincount(clust_uj[snp_idx].ravel(), minlength=clust_uj.max()+1) + ## assign coverage intervals to allelic clusters and segments + # get allelic segment boundaries + seg_bdy = np.r_[0, list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)] + seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]] + self.SNPs["seg_idx"] = 0 + for i, (st, en) in enumerate(seg_bdy): + self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i + seg_max = self.SNPs["seg_idx"].max() + 1 - Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum() + # assignment probabilities of each coverage interval -> allelic segment + Cov_clust_probs = np.zeros([len(self.full_cov_df), seg_max]) - # subset intervals containing SNPs + # first compute assignment probabilities based on the SNPs within each bin + # segments just get assigned to the maximum probability + self.full_cov_df["seg_idx"] = -1 + self.full_cov_df["allelic_cluster"] = -1 + print("Mapping SNPs to targets ...", file = sys.stderr) + for targ, D in tqdm.tqdm(self.SNPs.groupby("targ_idx")[["clust_choice", "seg_idx"]]): + if targ == -1: # SNP does not overlap a coverage bin + continue + clust_idx = D["clust_choice"].values + seg_idx = D["seg_idx"].values + if len(seg_idx) == 1: + Cov_clust_probs[targ, seg_idx] = 1.0 + self.full_cov_df.at[targ, "seg_idx"] = seg_idx[0] + self.full_cov_df.at[targ, "allelic_cluster"] = clust_idx[0] + else: + targ_clust_hist = np.bincount(seg_idx, minlength = seg_max) + Cov_clust_probs[targ, :] = targ_clust_hist / targ_clust_hist.sum() + self.full_cov_df.at[targ, "seg_idx"] = np.bincount(seg_idx).argmax() + self.full_cov_df.at[targ, "allelic_cluster"] = np.bincount(clust_idx).argmax() + + ## subset to targets containing SNPs overlap_idx = Cov_clust_probs.sum(1) > 0 +# # add targets within a 2 targ radius +# overlap_idx = np.flatnonzero(Cov_clust_probs.sum(1) > 0)[:, None] +# overlap_idx = overlap_idx + np.c_[-2:3].T +# overlap_idx = np.sort(np.unique((overlap_idx + np.c_[-2:3].T).ravel())) Cov_clust_probs_overlap = Cov_clust_probs[overlap_idx, :] # zero out improbable assignments and re-normalilze Cov_clust_probs_overlap[Cov_clust_probs_overlap < 0.05] = 0 Cov_clust_probs_overlap /= Cov_clust_probs_overlap.sum(1)[:, None] - # prune garbage clusters + # prune empty clusters prune_idx = Cov_clust_probs_overlap.sum(0) > 0 Cov_clust_probs_overlap = Cov_clust_probs_overlap[:, prune_idx] num_pruned_clusters = Cov_clust_probs_overlap.shape[1] - # subsetting to only targets that overlap SNPs + + ## subsetting to only targets that overlap SNPs Cov_overlap = self.full_cov_df.loc[overlap_idx, :] - # probabilistically assign each ambiguous coverage bin to a cluster + ## probabilistically assign each ambiguous coverage bin to a cluster # for now we will take maximum instead amb_mask = np.max(Cov_clust_probs_overlap, 1) != 1 amb_assgn_probs = Cov_clust_probs_overlap[amb_mask, :] - #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters], - # p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) - new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) - new_onehot = np.zeros((new_assgn.size, num_pruned_clusters)) - new_onehot[np.arange(new_assgn.size), new_assgn] = 1 - - # update with assigned values - Cov_clust_probs_overlap[amb_mask, :] = new_onehot - - #downsampling for wgs - if len(Cov_clust_probs_overlap) > 20000: - downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2 - Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask] - Cov_overlap = Cov_overlap.iloc[downsample_mask] + if amb_mask.sum() > 0: + #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters], + # p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) + new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))]) + new_onehot = np.zeros((new_assgn.size, num_pruned_clusters)) + new_onehot[np.arange(new_assgn.size), new_assgn] = 1 + + # update with assigned values + Cov_clust_probs_overlap[amb_mask, :] = new_onehot + + ## downsampling for wgs +# if len(Cov_clust_probs_overlap) > 20000: +# downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2 +# Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask] +# Cov_overlap = Cov_overlap.iloc[downsample_mask] # remove clusters with fewer than 4 assigned coverage bins (remove these coverage bins as well) bad_clusters = Cov_clust_probs_overlap.sum(0) < 4 @@ -223,14 +272,18 @@ def assign_clusters(self): Cov_overlap = Cov_overlap.loc[~bad_bins, :] Pi = filtered.copy() - - r = np.c_[Cov_overlap["covcorr"]] - - covar_columns = sorted([c for c in Cov_overlap.columns if 'C_' in c]) - # making covariate matrix + + ## making regressor vector/covariate matrix + + # scale regressor to reflect fragment counts + Cov_overlap["fragcorr"] = np.round(Cov_overlap["covcorr"]/Cov_overlap["C_frag_len"].mean()) + r = np.c_[Cov_overlap["fragcorr"]] + + # make covariate matrix; use all z-transformed covariates + non-scaled GC content+GC^2 + covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$") | Cov_overlap.columns.str.contains("^C_GC")]) C = np.c_[Cov_overlap[covar_columns]] - # dropping Nans + ## dropping Nans naidx = np.isnan(C).any(axis=1) # drop zero coverage bins as well (this is to account for a bug in coverage collector) TODO: remove need for this naidx = np.logical_or(naidx, (r==0).flatten()) @@ -240,14 +293,22 @@ def assign_clusters(self): Cov_overlap = Cov_overlap.iloc[~naidx] - #removing outliers + ## removing coverage outliers outlier_mask = find_outliers(r) r = r[~outlier_mask] C = C[~outlier_mask] Pi = Pi[~outlier_mask] Cov_overlap = Cov_overlap.iloc[~outlier_mask] - - Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1) + + # some clusters may have been eliminated by this point; prune them from Pi + Pi = Pi[:, Pi.sum(0) > 0] + + ## remove covariate outliers (+- 6 sigma) + covar_outlier_idx = (Cov_overlap.loc[:, set(covar_columns) - {"C_GCtr_z"}].abs() < 6).all(axis = 1) + Cov_overlap = Cov_overlap.loc[covar_outlier_idx] + Pi = Pi[covar_outlier_idx, :] + r = r[covar_outlier_idx] + C = C[covar_outlier_idx, :] return Pi, r, C, Cov_overlap @@ -293,8 +354,11 @@ def nat_sort(lst): return sorted(lst, key=alphanum_key) -# function for collecting coverage mcmc results from each ADP cluster -def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1): +# function for collecting coverage mcmc results from each ADP segment +def aggregate_adp_segments(allelic_seg_groups_pickle, coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1): + S = pd.read_pickle(allelic_seg_groups_pickle) + S = S.rename_axis(index = "allelic_seg_idx").reset_index() + if coverage_dir is None and f_file_list is None: raise ValueError("need to pass in either coverage_dir or file_list txt file!") if coverage_dir is not None and f_file_list is not None: @@ -302,7 +366,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, # get results files from the directory provided or from the file list provided if coverage_dir is not None: - cluster_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_cluster_*'))) + adp_seg_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_*'))) cov_df = pd.read_pickle(os.path.join(coverage_dir, 'cov_df.pickle')) else: @@ -316,47 +380,67 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, to_add = l.rstrip('\n') if to_add != "nan": read_files.append(to_add) - cluster_files = nat_sort(read_files) + adp_seg_files = nat_sort(read_files) cov_df = pd.read_pickle(cov_df_pickle) - - clust_assignments = cov_df['allelic_cluster'].values - + + # make sure that number of results shards is consistent with shard indices + if len(adp_seg_files) != len(S): + raise ValueError("Number of ADP seg files does not match scatter shards!") + + # load in covMCMC segment boundaries and mu's for each ADP segment seg_results = [] mu_i_results = [] - # load data from each cluster - for data_path in cluster_files: - cluster_data = np.load(data_path) - seg_results.append(cluster_data['seg_samples']) - mu_i_results.append(cluster_data['mu_i_samples']) - + for f in adp_seg_files: + seg_data = np.load(f) + seg_results.append(seg_data['seg_samples']) + mu_i_results.append(seg_data['mu_i_samples']) + + S["seg_results"] = seg_results + S["mu_i_results"] = mu_i_results + num_draws = seg_results[0].shape[1] - num_clusters = len(seg_results) - # now we use these data to fill an overall coverage segmentation array + # create overall segmentation array coverage_segmentation = np.zeros((len(cov_df), num_draws)) mu_i_values = np.zeros((len(cov_df), num_draws)) + # loop over each cov MCMC draw + # TODO: only use maximum likelihood draw; CDP should be able to resegment for d in range(num_draws): - global_counter = 0 - for c in range(num_clusters): - cluster_mask = (clust_assignments == c) - coverage_segmentation[cluster_mask, d] = seg_results[c][:,d] + global_counter - mu_i_values[cluster_mask, d] = mu_i_results[c][:, d] - global_counter += len(np.unique(seg_results[c][:,d])) - - # generate data to re-compute global beta + n_tot_segs = 0 + # loop over ADP segments + for _, s in S.iterrows(): + seg_idxs = s["seg_results"][:, d] + coverage_segmentation[s["indices"], d] = seg_idxs + n_tot_segs + n_tot_segs += seg_idxs[-1] + 1 + + mu_i_values[s["indices"], d] = s["mu_i_results"][:, d] + + # TEMP HACK: for now, only take iteration with fewest number of segments + sidx = coverage_segmentation[-1, :].argmin() + coverage_segmentation = coverage_segmentation[:, [sidx]] + mu_i_values = mu_i_values[:, [sidx]] + + # remove short segments (<200Kb) + # TODO: remove segments not well-modeled by covariates + cov_df["cov_seg_idx"] = coverage_segmentation.astype(int) + long_seg_idx = cov_df.groupby("cov_seg_idx").apply(lambda x : (x.iloc[-1]["end"] - x.iloc[0]["start"]) > 2e5).rename("seg_OK") + cov_df = cov_df.merge(long_seg_idx, left_on = "cov_seg_idx", right_index = True) + + coverage_segmentation = coverage_segmentation[cov_df["seg_OK"], :] + mu_i_values = mu_i_values[cov_df["seg_OK"], :] + cov_df = cov_df.loc[cov_df["seg_OK"]] + + # recompute global beta r = np.c_[cov_df["covcorr"]] # we'll use the mu_is from the last segmentation sample - mu_is = mu_i_values[:,-1] - # compute new edogenous targets by subtracking out the mu_i values of the segments - # along with the bin exposure - endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1) + mu_is = mu_i_values[:, [-1]] # generate covars - covar_columns = sorted([c for c in cov_df.columns if 'C_' in c]) + covar_columns = sorted(cov_df.columns[cov_df.columns.str.contains("^C_.*_z$")]) C = np.c_[cov_df[covar_columns]] # do regression - pois_regr = PoissonRegression(endog, C, np.ones(endog.shape)) + pois_regr = PoissonRegression(r, C, np.ones(r.shape), np.log(bin_width) + mu_is) mu_refit, beta_refit = pois_regr.fit() return coverage_segmentation, beta_refit diff --git a/hapaseg/utils.py b/hapaseg/utils.py index e9dc502..5b78213 100644 --- a/hapaseg/utils.py +++ b/hapaseg/utils.py @@ -5,7 +5,13 @@ _chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25))) def parse_cytoband(cytoband): - cband = pd.read_csv(cytoband, sep = "\t") + # some cytoband files have a header, some don't; we need to check + has_header = False + with open(cytoband, "r") as f: + if f.readline().startswith("chr\t"): + has_header = True + + cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"] if not has_header else None) cband["chr"] = cband["chr"].apply(lambda x : _chrmap[x]) chrs = cband["chr"].unique() @@ -40,11 +46,14 @@ def plot_chrbdy(cytoband_file): chrbdy = parse_cytoband(cytoband_file) # plot chromosome boundaries + yl_0 = plt.ylim()[0] + yl_1 = plt.ylim()[1] chr_ends = chrbdy.loc[1::2, "end"].cumsum() for end in chr_ends[:-1]: plt.axvline(end, color = 'k') for st, en in np.c_[chr_ends[:-1:2], chr_ends[1::2]]: - plt.fill_between([st, en], 0, 1, color = [0.9, 0.9, 0.9], zorder = 0) + plt.fill_between([st, en], yl_0, yl_1, color = [0.9, 0.9, 0.9], zorder = 0) + plt.ylim([yl_0, yl_1]) # plot centromere locations for cent in (np.c_[chrbdy.loc[1::2, "start"], chrbdy.loc[::2, "end"]] + np.c_[np.r_[0, chr_ends[:-1]]]).ravel(): diff --git a/setup.py b/setup.py index 670d4c6..b4501b5 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,7 @@ #long_description = long_description, #long_description_content_type = 'text/markdown', install_requires = [ - 'pandas>=0.24.1', + 'pandas>=1.4.1', 'numpy>=1.18.0', 'more-itertools>=8.10.0', 'numpy_groupies>=0.9.14', diff --git a/wolF/tasks.py b/wolF/tasks.py index 194bf94..8688199 100644 --- a/wolF/tasks.py +++ b/wolF/tasks.py @@ -53,7 +53,7 @@ class Hapaseg_burnin(wolf.Task): output_patterns = { "burnin_MCMC" : "amcmc_results.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" class Hapaseg_concat(wolf.Task): inputs = { @@ -68,7 +68,7 @@ class Hapaseg_concat(wolf.Task): "arms" : "AMCMC-arm*.pickle", "ref_bias" : ("ref_bias.txt", wolf.read_file) } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" class Hapaseg_amcmc(wolf.Task): inputs = { @@ -82,10 +82,10 @@ class Hapaseg_amcmc(wolf.Task): --n_iter ${n_iter} """ output_patterns = { - "arm_level_MCMC" : "amcmc_results.pickle" + "arm_level_MCMC" : "amcmc_results.pickle", + "segmentation_plot" : "figures/MLE_segmentation.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" - + docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617" class Hapaseg_concat_arms(wolf.Task): inputs = { @@ -103,76 +103,68 @@ class Hapaseg_concat_arms(wolf.Task): } docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" - class Hapaseg_allelic_DP(wolf.Task): inputs = { "seg_dataframe" : None, - "n_dp_iter" : 10, - "seg_samp_idx" : 0, "ref_fasta" : None, "cytoband_file" : None } script = """ export CAPY_REF_FA=${ref_fasta} hapaseg dp --seg_dataframe ${seg_dataframe} \ - --n_dp_iter ${n_dp_iter} \ - --seg_samp_idx ${seg_samp_idx} \ --ref_fasta ${ref_fasta} \ --cytoband_file ${cytoband_file} """ output_patterns = { "cluster_and_phase_assignments" : "allelic_DP_SNP_clusts_and_phase_assignments.npz", "all_SNPs" : "all_SNPs.pickle", + "segmentation_breakpoints" : "segmentations.pickle", + "likelihood_trace_plot" : "figures/likelihood_trace.png", "SNP_plot" : "figures/SNPs.png", - "seg_plot" : "figures/allelic_imbalance_preDP.png", - "clust_plot" : "figures/allelic_imbalance_postDP.png", - } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" - resources = { "mem" : "5G" } -class Hapaseg_collect_adp(wolf.Task): - inputs = { - "dp_results":None - } - - script = """ - hapaseg collect_adp --dp_results ${dp_results} - """ - output_patterns = { - "full_dp_results":"full_dp_results.npz" + "seg_plot" : "figures/segs_only.png", } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" - resources = { "mem" : "5G" } - + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v813" + resources = { "mem" : "8G" } class Hapaseg_prepare_coverage_mcmc(wolf.Task): inputs = { "coverage_csv": None, "allelic_clusters_object": None, "SNPs_pickle": None, + "segmentations_pickle": None, "repl_pickle": None, + "faire_pickle": "/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE_GM12878.hg19.pickle", # TODO: make remote "gc_pickle":"", "allelic_sample":"", - "ref_file_path": None + "ref_fasta": None, + "bin_width" : 1 # only for whole genomes; for exomes, target lengths are passed as a covariate via the coverage CSV } - script = """ - export CAPY_REF_FA=${ref_file_path} - hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \ - --allelic_clusters_object ${allelic_clusters_object} \ - --SNPs_pickle ${SNPs_pickle} \ - --repl_pickle ${repl_pickle}""" + def script(self): + script = """ + hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \ + --ref_fasta ${ref_fasta} \ + --allelic_clusters_object ${allelic_clusters_object} \ + --SNPs_pickle ${SNPs_pickle} \ + --segmentations_pickle ${segmentations_pickle} \ + --repl_pickle ${repl_pickle} \ + --faire_pickle ${faire_pickle} \ + --bin_width ${bin_width} + """ - def prolog(self): if self.conf["inputs"]["gc_pickle"] != "": - self.conf["script"][-1] += " --gc_pickle ${gc_pickle}" + script += " --gc_pickle ${gc_pickle} " if self.conf["inputs"]["allelic_sample"] != "": - self.conf["script"][-1] += " --allelic_sample ${allelic_sample}" + script += " --allelic_sample ${allelic_sample}" + + return script output_patterns = { "preprocess_data": "preprocess_data.npz", - "cov_df_pickle": "cov_df.pickle" + "cov_df_pickle": "cov_df.pickle", + "allelic_seg_groups": "allelic_seg_groups.pickle" } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v858" resources = { "mem" : "15G" } @@ -205,16 +197,18 @@ def prolog(self): class Hapaseg_coverage_mcmc(wolf.Task): inputs = { - "preprocess_data": None, + "preprocess_data": None, # npz of covariate matrix (C), global beta, ADP cluster mu's, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r) + "allelic_seg_indices": None, # dataframe containing indicies into C/r/all_mu for each allelic segment + "allelic_seg_scatter_idx": None, # allelic segment to operate on (for scatter) "num_draws": 50, - "cluster_num": None, "bin_width":None, "burnin_files":"" } script = """ hapaseg coverage_mcmc_shard --preprocess_data ${preprocess_data} \ + --allelic_seg_indices ${allelic_seg_indices} \ + --allelic_seg_idx ${allelic_seg_scatter_idx} \ --num_draws ${num_draws} \ - --cluster_num ${cluster_num} \ --bin_width ${bin_width}""" def prolog(self): @@ -222,13 +216,13 @@ def prolog(self): self.conf["script"][-1] += " --burnin_files ${burnin_files}" output_patterns = { - "cov_segmentation_model": 'cov_mcmc_model_cluster_*.pickle', - "cov_segmentation_data": 'cov_mcmc_data_cluster_*.npz', - "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png' + "cov_segmentation_model": 'cov_mcmc_model_*.pickle', + "cov_segmentation_data": 'cov_mcmc_data_*.npz', + "cov_seg_figure": 'cov_mcmc_*_visual.png' } - docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623" - resources = {"mem" : "5G"} + docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856" + resources = {"mem" : "10G"} class Hapaseg_collect_coverage_mcmc(wolf.Task): inputs = { diff --git a/wolF/workflow.py b/wolF/workflow.py index 340636a..6ac1aa1 100644 --- a/wolF/workflow.py +++ b/wolF/workflow.py @@ -31,19 +31,21 @@ # for Hapaseg itself hapaseg = wolf.ImportTask( - task_path = "../", # TODO: make remote + task_path = ".", # TODO: make remote task_name = "hapaseg" ) # for coverage collection split_intervals = wolf.ImportTask( task_path = "git@github.com:getzlab/split_intervals_TOOL.git", - task_name = "split_intervals" + task_name = "split_intervals", + commit = "dc102d8" ) cov_collect = wolf.ImportTask( task_path = "git@github.com:getzlab/covcollect.git", - task_name = "covcollect" + task_name = "covcollect", + branch = "tot_reads" ) #### @@ -112,10 +114,10 @@ def workflow( num_cov_seg_samples=5, - persistant_dry_run = False + persistent_dry_run = False ): - # alert for persistant dry run - if persistant_dry_run: + # alert for persistent dry run + if persistent_dry_run: #TODO push this message to canine print("WARNING: Skipping file localization in dry run!") @@ -160,7 +162,7 @@ def workflow( "t_bai" : tumor_bai, }, token=localization_token, - persistent_disk_dry_run = persistant_dry_run + persistent_disk_dry_run = persistent_dry_run ) collect_tumor_coverage = True elif tumor_coverage_bed is not None: @@ -176,7 +178,7 @@ def workflow( "n_bai" : normal_bai }, token=localization_token, - persistent_disk_dry_run = persistant_dry_run + persistent_disk_dry_run = persistent_dry_run ) collect_normal_coverage = True elif normal_coverage_bed is not None: @@ -192,8 +194,13 @@ def workflow( # collect or load coverage # tumor if collect_tumor_coverage: - primary_contigs = ['chr{}'.format(i) for i in range(1,23)] - primary_contigs.extend(['chrX','chrY','chrM']) + # FIXME: hack to account for "chr" in hg38 but not in hg19 + if ref_genome_build == "hg38": + primary_contigs = ['chr{}'.format(i) for i in range(1,23)] + primary_contigs.extend(['chrX','chrY','chrM']) + else: + primary_contigs = [str(x) for x in range(1, 23)] + ["X", "Y", "M"] + # create scatter intervals split_intervals_task = split_intervals.split_intervals( bam = tumor_bam_localization_task["t_bam"], @@ -204,18 +211,19 @@ def workflow( # shim task to transform split_intervals files into subset parameters for covcollect task @prefect.task - def interval_gather(interval_files): + def interval_gather(interval_files, primary_contigs): ints = [] for f in interval_files: ints.append(pd.read_csv(f, sep = "\t", header = None, names = ["chr", "start", "end"])) #filter non-primary contigs - primary_contigs = ['chr{}'.format(i) for i in range(1,23)] - primary_contigs.extend(['chrX','chrY','chrM']) - full_bed = pd.concat(ints).sort_values(["chr", "start", "end"]) + full_bed = pd.concat(ints).sort_values(["chr", "start", "end"]).astype({ "chr" : str }) filtered_bed = full_bed.loc[full_bed.chr.isin(primary_contigs)] return filtered_bed - subset_intervals = interval_gather(split_intervals_task["interval_files"]) + subset_intervals = interval_gather( + split_intervals_task["interval_files"], + primary_contigs + ) # dispatch coverage scatter tumor_cov_collect_task = cov_collect.Covcollect( @@ -248,11 +256,21 @@ def interval_gather(interval_files): ref_fasta = localization_task["ref_fasta"], ref_fasta_idx = localization_task["ref_fasta_idx"], ref_fasta_dict = localization_task["ref_fasta_dict"], - dens_cutoff = 0.58 # TODO: set dynamically + use_pod_genotyper = True ) # otherwise, run M1 and get it from the BAM elif callstats_file is None and tumor_bam is not None and normal_bam is not None: + # split het sites file uniformly +# split_het_sites = wolf.Task( +# name = "split_het_sites", +# inputs = { "snp_list" : localization_task["common_snp_list"] }, +# script = """ +# sed '/^@/d' ${snp_list} | split -l 10000 -d -a 4 - snp_list_chunk +# """, +# outputs = { "snp_list_shards" : "snp_list_chunk*" } +# ) + m1_task = mutect1.mutect1(inputs=dict( pairName = "het_coverage", caseName = "tumor", @@ -269,7 +287,12 @@ def interval_gather(interval_files): refFastaIdx = localization_task["ref_fasta_idx"], refFastaDict = localization_task["ref_fasta_dict"], - intervals = split_intervals_task["interval_files"] + intervals = split_intervals_task["interval_files"], + #intervals = split_het_sites["snp_list_shards"], + + exclude_chimeric = True#, + + #force_calling = True, )) hp_scatter = het_pulldown.get_het_coverage_from_callstats( @@ -278,7 +301,7 @@ def interval_gather(interval_files): ref_fasta = localization_task["ref_fasta"], ref_fasta_idx = localization_task["ref_fasta_idx"], ref_fasta_dict = localization_task["ref_fasta_dict"], - dens_cutoff = 0.58 # TODO: set dynamically + use_pod_genotyper = True ) # gather het pulldown @@ -347,6 +370,13 @@ def order_indices(bcf_path, bcf_idx_path, localization_task): F = F.join(F2) + # prepend "chr" to F's index if it's missing + idx = ~F.index.str.contains("^chr") + if idx.any(): + new_index = F.index.values + new_index[idx] = "chr" + F.index[idx] + F = F.set_index(new_index) + # reference panel BCFs R = pd.DataFrame({ "path" : localization_task } ).reset_index() F = F.join(R.join(R.loc[R["index"].str.contains("^chr.*_bcf$"), "index"].str.extract(r"(?Pchr[^_]+)"), how = "right").set_index("chr").drop(columns = ["index"]).rename(columns = { "path" : "ref_bcf" }), how = "inner") @@ -365,8 +395,10 @@ def order_indices(bcf_path, bcf_idx_path, localization_task): vcf_idx_in = F["bcf_idx_path"], vcf_ref = F["ref_bcf"], vcf_ref_idx = F["ref_bcf_idx"], - output_file_prefix = "foo" - ) + output_file_prefix = "foo", + num_threads = 4, + ), + resources = { "cpus-per-task" : 4 } ) # TODO: run whatshap @@ -429,55 +461,90 @@ def get_chunks(scatter_chunks): } ) - hapaseg_arm_concat_task = hapaseg.Hapaseg_concat_arms( - inputs={ - "arm_results":[hapaseg_arm_AMCMC_task["arm_level_MCMC"]], - "ref_fasta":localization_task["ref_fasta"] #pickle load will import capy - } - ) +# hapaseg_arm_concat_task = hapaseg.Hapaseg_concat_arms( +# inputs={ +# "arm_results":[hapaseg_arm_AMCMC_task["arm_level_MCMC"]], +# "ref_fasta":localization_task["ref_fasta"] #pickle load will import capy +# } +# ) +# @prefect.task +# def get_arm_samples_range(arm_concat_object): +# obj = np.load(arm_concat_object) +# n_samples_range = list(range(int(obj["n_samps"]))) +# return n_samples_range +# +# n_samps_range = get_arm_samples_range(hapaseg_arm_concat_task["num_samples_obj"]) + + # concat arm level results @prefect.task - def get_arm_samples_range(arm_concat_object): - obj = np.load(arm_concat_object) - n_samples_range = list(range(int(obj["n_samps"]))) - return n_samples_range - - n_samps_range = get_arm_samples_range(hapaseg_arm_concat_task["num_samples_obj"]) - + def concat_arm_level_results(arm_results): + A = [] + for arm_file in arm_results: + with open(arm_file, "rb") as f: + H = pickle.load(f) + A.append(pd.Series({ "chr" : H.P["chr"].iloc[0], "start" : H.P["pos"].iloc[0], "end" : H.P["pos"].iloc[-1], "results" : H })) + + # get into order + A = pd.concat(A, axis = 1).T.sort_values(["chr", "start", "end"]).reset_index(drop = True) + + # save + _, tmpfile = tempfile.mkstemp( ) + A.to_pickle(tmpfile) + + return tmpfile + + arm_concat = concat_arm_level_results(hapaseg_arm_AMCMC_task["arm_level_MCMC"]) + ## run DP - # scatter DP hapaseg_allelic_DP_task = hapaseg.Hapaseg_allelic_DP( inputs = { - "seg_dataframe" : hapaseg_arm_concat_task["arm_cat_results_pickle"], - "n_dp_iter" : 10, # TODO: allow to be specified? - "seg_samp_idx" : n_samps_range, - "cytoband_file" : localization_task["cytoband_file"], # TODO: allow to be specified + "seg_dataframe" : arm_concat, + #"seg_dataframe" : hapaseg_arm_concat_task["arm_cat_results_pickle"], + "cytoband_file" : localization_task["cytoband_file"], "ref_fasta" : localization_task["ref_fasta"], "ref_fasta_idx" : localization_task["ref_fasta_idx"], # not used; just supplied for symlink "ref_fasta_dict" : localization_task["ref_fasta_dict"] # not used; just supplied for symlink } ) - - ##collect DP results - collect_adp_task = hapaseg.Hapaseg_collect_adp( - inputs={"dp_results":[hapaseg_allelic_DP_task["cluster_and_phase_assignments"]] - } - ) - - ### coverage tasks #### + + # + # coverage tasks + # # prepare coverage MCMC prep_cov_mcmc_task = hapaseg.Hapaseg_prepare_coverage_mcmc( inputs={ "coverage_csv":tumor_cov_gather_task["coverage"], #each scatter result is the same - "allelic_clusters_object":collect_adp_task["full_dp_results"], - "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same + "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"], + "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'], + "segmentations_pickle":hapaseg_allelic_DP_task['segmentation_breakpoints'], "repl_pickle":ref_config["repl_file"], "gc_pickle":ref_config["gc_file"], - "ref_file_path":localization_task["ref_fasta"] + "ref_fasta":localization_task["ref_fasta"], + "bin_width":bin_width if wgs else 1 } ) - + + # shim task to get number of allelic segments + # (coverage MCMC will be scattered over each allelic segment) + @prefect.task + def get_N_seg_groups(S): + return list(range(len(pd.read_pickle(S)))) + + cov_mcmc_shard_range = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"]) + + # coverage MCMC burnin(?) <- do we still need to burnin separately? + cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc( + inputs={ + "preprocess_data":prep_cov_mcmc_task["preprocess_data"], + "allelic_seg_indices":prep_cov_mcmc_task["allelic_seg_groups"], + "allelic_seg_scatter_idx":cov_mcmc_shard_range, + "num_draws":50, + "bin_width":bin_width, + } + ) + #get the cluster indices from the preprocess data and generate the burnin indices @prefect.task(nout=4) def _get_ADP_cluster_list(preprocess_data_obj): @@ -506,9 +573,9 @@ def _get_ADP_cluster_list(preprocess_data_obj): cluster_idxs = [i for i in np.arange(num_clusters)] print(cluster_idxs, cluster_list, range_list) return len(cluster_idxs), cluster_idxs, cluster_list, range_list - + num_clusters, cluster_idxs, cluster_list, range_list = _get_ADP_cluster_list(prep_cov_mcmc_task["preprocess_data"]) - + # coverage MCMC burnin cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin( inputs={ @@ -519,7 +586,7 @@ def _get_ADP_cluster_list(preprocess_data_obj): "range":range_list } ) - + # coverage MCMC scatter post-burnin cov_mcmc_scatter_task = hapaseg.Hapaseg_coverage_mcmc( inputs={ @@ -530,7 +597,7 @@ def _get_ADP_cluster_list(preprocess_data_obj): "burnin_files":[cov_mcmc_burnin_task["burnin_data"]] * num_clusters # this is to account for a wolf input len bug } ) - + # collect coverage MCMC cov_mcmc_gather_task = hapaseg.Hapaseg_collect_coverage_mcmc( inputs = { @@ -539,6 +606,7 @@ def _get_ADP_cluster_list(preprocess_data_obj): "bin_width":bin_width } ) + # coverage DP cov_dp_task = hapaseg.Hapaseg_coverage_dp( inputs = { @@ -550,27 +618,26 @@ def _get_ADP_cluster_list(preprocess_data_obj): "bin_width":bin_width } ) - + #get the adp draw number from the preprocess data object @prefect.task def _get_ADP_draw_num(preprocess_data_obj): return int(np.load(preprocess_data_obj)["adp_cluster"]) adp_draw_num = _get_ADP_draw_num(prep_cov_mcmc_task["preprocess_data"]) - - # generate acdp dataframe + # generate acdp dataframe gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df( inputs = { "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same - "allelic_clusters_object":collect_adp_task["full_dp_results"], + "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"], "cdp_filepaths":[cov_dp_task["cov_dp_object"]], "allelic_draw_index":adp_draw_num, "ref_file_path":localization_task["ref_fasta"], "bin_width":bin_width } ) - + # run acdp acdp_task = hapaseg.Hapaseg_run_acdp( inputs = { @@ -582,18 +649,20 @@ def _get_ADP_draw_num(preprocess_data_obj): ) #cleanup by deleting bam disks. we make seperate tasks for the bams - if not persistant_dry_run and t_bam is not None and t_bai is not None: + if not persistent_dry_run and tumor_bam is not None and tumor_bai is not None: delete_tbams_task = DeleteDisk( inputs = { "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]], "upstream" : m1_task["mutect1_cs"] if callstats_file is None else tumor_cov_gather_task["coverage"] + } ) - if not persistant_dry_run and n_bam is not None and n_bai is not None: + if not persistent_dry_run and normal_bam is not None and normal_bai is not None: delete_nbams_task = DeleteDisk( inputs = { "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]], "upstream" : m1_task["mutect1_cs"] + } ) #also delete the cached files disk delete_file_disk_task = DeleteDisk(