diff --git a/.dockerignore b/.dockerignore
index a160cc0..50af6bb 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,2 +1,5 @@
 **/*.bam
 **/*.call_stats.txt
+**/*.fw?
+**/*.bigWig
+**/*.npz
diff --git a/21_genome.py b/21_genome.py
index 2f6d76b..c364ac2 100644
--- a/21_genome.py
+++ b/21_genome.py
@@ -58,3 +58,26 @@
       normal_bai = "gs://fc-secure-e2772064-386d-4911-b242-d6ade82bf172/360c5959-3827-4b24-92e3-d57dbc5de2f6/gdc_api_file_download/15788922-9cf8-4c83-8040-47fa60b7d374/call-download_file/98e061cd-0586-4e56-85fb-c6cc6688dbff_wgs_gdc_realn.bai",
       target_list = 200
     )
+
+# Richter's test (hg19)
+import wolf
+from wolF import workflow
+
+import dalmatian
+wm = dalmatian.WorkspaceManager("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK")
+
+wic = wolf.fc.WorkspaceInputConnector("broad-firecloud-ibmwatson/Getz_Wu_Richters_WGS_UK")
+Pj = wic.get_pairs_as_joint_samples()
+
+with wolf.Workflow(workflow = workflow.workflow, namespace = "HapASeg_Richters") as w:
+    for pair, p in Pj.loc[Pj["sample_type_T"] == "Richter"].iterrows():
+        w.run(
+          RUN_NAME = pair,
+          tumor_bam = p["output_bam_T"],
+          tumor_bai = p["output_bam_index_T"],
+          normal_bam = p["output_bam_N"],
+          normal_bai = p["output_bam_index_N"],
+          target_list = 2000,
+          ref_genome_build = "hg19"
+        )
+        break
diff --git a/40_het_selection.py b/40_het_selection.py
new file mode 100644
index 0000000..852de78
--- /dev/null
+++ b/40_het_selection.py
@@ -0,0 +1,85 @@
+import colorama
+import copy
+import itertools
+import matplotlib.pyplot as plt
+import matplotlib as mpl
+import ncls
+import numpy as np
+import numpy_groupies as npg
+import pandas as pd
+import scipy.stats as s
+import scipy.sparse as sp
+import scipy.special as ss
+import sortedcontainers as sc
+
+plt.figure(1); plt.clf()
+plt.figure(2); plt.clf()
+plt.figure(30); plt.clf()
+cut20_dens = {}
+cut20_lod = {}
+cut80_dens = {}
+cut80_lod = {}
+leg = []
+for depth in [15, 20, 30, 60, 80, 200]:
+    # simulate good hets
+    cov = s.poisson.rvs(depth, size = 10000)
+    A = s.binom.rvs(cov, 0.5)
+    B = cov - A
+
+    # simulate bad hets
+    bad_cov = s.poisson.rvs(depth, size = 10000)
+    bad_frac = np.ones_like(bad_cov).astype(float)
+    for i in range(len(bad_frac)):
+        bad_frac[i] = np.random.choice([0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9])
+    A_bad = s.binom.rvs(bad_cov, bad_frac)
+    B_bad = bad_cov - A_bad
+
+    # old criterion: beta density between 0.6 and 0.4
+    betafrac = np.diff(s.beta.cdf([0.4, 0.6], A[:, None] + 1, B[:, None] + 1))
+    betafrac_bad = np.diff(s.beta.cdf([0.4, 0.6], A_bad[:, None] + 1, B_bad[:, None] + 1))
+
+    # new criterion: log-odds ratio
+    betalod = s.beta.logsf(0.5, A + 1, B + 1) - s.beta.logcdf(0.5, A + 1, B + 1)
+    betalod_bad = s.beta.logsf(0.5, A_bad + 1, B_bad + 1) - s.beta.logcdf(0.5, A_bad + 1, B_bad + 1)
+
+    # ROC curves
+    dens_cdf = np.zeros([1000, 2])
+    for i, cut in enumerate(np.linspace(0, 1, 1000)):
+        dens_cdf[i, 0] = (betafrac >= cut).mean()
+        dens_cdf[i, 1] = (betafrac_bad >= cut).mean()
+
+    lod_cdf = np.zeros([1000, 2])
+    for i, cut in enumerate(np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)):
+        lod_cdf[i, 0] = (np.abs(betalod) <= cut).mean()
+        lod_cdf[i, 1] = (np.abs(betalod_bad) <= cut).mean()
+
+    plt.figure(30)
+    st = plt.step(dens_cdf[:, 1], dens_cdf[:, 0])
+    color = st[0].get_color()
+    plt.step(lod_cdf[:, 1], lod_cdf[:, 0], color = color, linestyle = ":")
+
+    leg.append(st)
+
+    cut20_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 1] <= 0.2)[0]]
+    cut80_dens[depth] = np.linspace(0, 1, 1000)[np.flatnonzero(dens_cdf[:, 0] <= 0.8)[0]]
+    cut20_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[np.flatnonzero(lod_cdf[:, 1] >= 0.2)[0]]
+    cut80_lod_idx = np.flatnonzero(lod_cdf[:, 0] >= 0.8)[0]
+    cut80_lod[depth] = np.linspace(0, np.abs(np.r_[betalod_bad, betalod]).max(), 1000)[cut80_lod_idx]
+
+    plt.scatter(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], marker = 'x', color = color)
+    plt.text(lod_cdf[cut80_lod_idx, 1], lod_cdf[cut80_lod_idx, 0], "{0:.2f}".format(cut80_lod[depth]), color = color)
+
+    plt.figure(1)
+    sc = plt.scatter(cov, betafrac, alpha = 0.1, s = 10)
+    plt.scatter(depth, np.diff(s.beta.cdf([0.4, 0.6], depth/2 + 1, depth/2 + 1)), color = color, marker = "x")
+
+    cov_range = np.r_[cov.min():cov.max()]
+    cov_cum = np.nan*np.ones_like(cov_range)
+    for i, c in enumerate(cov_range):
+        cov_cum[i] = betafrac[cov >= c].mean()
+
+    plt.figure(2)
+    plt.scatter(cov_range, cov_cum)
+
+plt.figure(3)
+plt.legend([x[0] for x in leg], ["15x", "20x", "30x", "60x", "80x", "200x"])
diff --git a/71_coverage_covariates.py b/71_coverage_covariates.py
index 6c2bb1b..6012afa 100644
--- a/71_coverage_covariates.py
+++ b/71_coverage_covariates.py
@@ -1,11 +1,13 @@
 import liftover
+import numpy as np
 import pandas as pd
 import pyfaidx
+import pyBigWig
 import tqdm
-from capy import mut
+from capy import mut, seq
 
 #
-# replication timing
+# replication timing {{{
 
 F = pd.read_csv("/mnt/j/proj/cnv/20201018_hapseg2/covars/GSE137764_H1_GaussiansGSE137764_mooth_scaled_autosome.mat", sep = "\t", header = None).T.rename(columns = { 0 : "chr", 1 : "start", 2 : "end" })
 F.iloc[:, 3:] = F.loc[:, 3:].astype(float)
@@ -13,7 +15,7 @@
 F["chr"] = mut.convert_chr(F["chr"])
 F.to_pickle("covars/GSE137764_H1.hg38.pickle")
 
-# liftover to hg19
+# liftover to hg19 {{{
 F["chr_start_lift"] = 0
 F["chr_end_lift"] = 0
 F["start_lift"] = 0
@@ -63,8 +65,15 @@
 (F["start_strand_lift"].notin(["+", "?"])) | \
 (F["start_lift"] > F["end_lift"])
 
+# }}}
+
+# }}}
+
 #
-# GC content
+# GC content {{{
+
+## precompute GC content {{{
+# note: this is obsolete; GC content is now computed on the fly
 
 B = pd.read_csv("/mnt/j/proj/cnv/20210326_coverage_collector/targets.bed", sep = "\t", header = None, names = ["chr", "start", "end"])
 B["chr"] = mut.convert_chr(B["chr"])
@@ -78,3 +87,385 @@
 
 B.to_pickle("covars/GC.pickle")
 
+# }}}
+
+# Terry Speed GC content estimator {{{
+
+import hapaseg.run_coverage_MCMC
+
+# load coverage
+
+args = lambda : None
+args.coverage_csv = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/coverage_cat.bed" 
+args.allelic_clusters_object  = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/allelic_DP_SNP_clusts_and_phase_assignments.npz" 
+args.SNPs_pickle  = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/all_SNPs.pickle"
+args.segmentations_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/segmentations.pickle"
+args.repl_pickle = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/GSE137764_H1.hg19_liftover.pickle"
+args.faire_pickle  = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/FAIRE_GM12878.hg19.pickle"
+args.ref_fasta = "/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--12-14-09_040rmzi_1kaanny_0w3oyu5xxnfwe/jobs/0/inputs/Homo_sapiens_assembly19.fasta"
+args.bin_width = 2000
+
+cov_mcmc_runner = hapaseg.run_coverage_MCMC.CoverageMCMCRunner(
+  args.coverage_csv,
+  args.allelic_clusters_object,
+  args.SNPs_pickle,
+  args.segmentations_pickle,
+  f_repl=args.repl_pickle,
+  f_faire=args.faire_pickle,
+ # ref_fasta = "/mnt/j/db/hg38/ref/hg38.analysisSet.fa", # ALCH
+  ref_fasta = args.ref_fasta, #"/mnt/j/db/hg19/ref/hs37d5.fa", # Richter's
+  bin_width = args.bin_width
+)
+C = cov_mcmc_runner.full_cov_df
+
+# bin intervals by GC content
+C["GC_bin"] = np.round(C["C_GC"]*1000).astype(int)
+C["num_frags_corr"] = C["covcorr"]/C["C_frag_len"].mean()
+
+N_gc = C.groupby("GC_bin").size()
+F_gc = C.groupby("GC_bin")["num_frags_corr"].sum()
+
+plt.figure(1); plt.clf()
+plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1)
+
+cov_df = pd.read_pickle("/mnt/nfs/HapASeg_Richters/CH1001LN-CH1001GL/Hapaseg_prepare_coverage_mcmc__2022-05-16--15-35-16_040rmzi_pid3cty_0w3oyu5xxnfwe/jobs/0/workspace/cov_df.pickle")
+cov_df = cov_df.merge(C[["start_g", "C_GC"]], left_on = "start_g", right_on = "start_g")
+
+cov_df["GC_bin"] = np.round(cov_df["C_GC"]*1000).astype(int)
+cov_df["num_frags_corr"] = cov_df["covcorr"]/cov_df["C_frag_len"].mean()
+
+N_gc = cov_df.groupby("GC_bin").size()
+F_gc = cov_df.groupby("GC_bin")["num_frags_corr"].sum()
+
+cov_df = cov_df.merge((F_gc/N_gc).rename("C_GC_f"), left_on = cov_df["GC_bin"], right_index = True)
+
+import loess
+_, y_l, _ = loess_1d.loess_1d(np.r_[N_gc.index], np.r_[F_gc/N_gc])
+
+plt.figure(2); plt.clf()
+plt.scatter(N_gc.index, F_gc/N_gc, marker = '.', s = 1)
+#plt.plot(N_gc.index, y_l)
+r = np.linspace(0, 1000, 1000)
+v = np.polyfit(np.r_[N_gc.index]/1000, F_gc/N_gc, 2)
+plt.plot(r, v[::-1]@(r**np.c_[0:3]))
+plt.ylim([0, 500])
+
+from capy import plots
+
+plt.figure(3); plt.clf()
+plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11)
+plots.pixplot(v[::-1]@(cov_df["C_GC"].values**np.c_[0:3]), cov_df["num_frags_corr"], alpha = 0.11)
+
+gc_g = []
+N_gc_g = []
+F_gc_g = []
+plt.figure(4); plt.clf()
+for _, cidx in cov_df.groupby("allelic_cluster").indices.items():
+    N_gc = cov_df.iloc[cidx].groupby("GC_bin").size()
+    F_gc = cov_df.iloc[cidx].groupby("GC_bin")["num_frags_corr"].sum()
+    lplt = plt.scatter(N_gc.index, (F_gc/N_gc)/F_gc.sum(), marker = '.', s = 1)
+
+    v = np.polyfit(N_gc.index, F_gc/N_gc, 2)
+    rng = np.linspace(0, 1000, 200)
+    plt.plot(rng, v[::-1]@(rng**np.c_[0:3]), color = lplt.get_edgecolor())
+
+    gc_g.extend(N_gc.index)
+    N_gc_g.extend(N_gc)
+    F_gc_g.extend(F_gc)
+
+N_gc_g = np.r_[N_gc_g]
+F_gc_g = np.r_[F_gc_g]
+gc_g = np.r_[gc_g]
+
+v = np.polyfit(gc_g, F_gc_g/N_gc_g, 2)
+plt.plot(r, v[::-1]@(r**np.c_[0:3]))
+_, y_l, _ = loess_1d.loess_1d(gc_g, F_gc_g/N_gc_g, xnew = r, degree = 2)
+plt.plot(r, y_l)
+
+plt.figure(3); plt.clf()
+_, y_l, _ = loess_1d.loess_1d(gc_g/1000, F_gc_g/N_gc_g, xnew = cov_df["C_GC"], degree = 2)
+plots.pixplot(cov_df["C_GC_f"], cov_df["num_frags_corr"], alpha = 0.11)
+
+## simulate quadratic relationship
+seg_sim = np.r_[np.ones([500, 1]), 1.5*np.ones([500, 1])].T
+gc_sim = np.random.rand(1000)*0.6 + 0.2
+rng = np.linspace(0, 1, 100)
+x = stats.poisson.rvs(np.exp(-30*(gc_sim - 0.5)**2 + 5*seg_sim))[:, None]
+C = np.c_[gc_sim**2, gc_sim]
+
+import hapaseg.model_optimizers
+PR = hapaseg.model_optimizers.PoissonRegression
+
+Pi = np.r_[np.tile([1, 0], [500, 1]), np.tile([0, 1], [500, 1])]
+pois_regr = PR(x, C, Pi)
+pois_regr.fit()
+pois_regr2 = PR(x, C[:, [1]], Pi)
+pois_regr2.fit()
+plt.figure(2); plt.clf()
+plt.scatter(x, np.exp(Pi@pois_regr.mu + C@pois_regr.beta), marker = '.', s = 1)
+plt.scatter(x, np.exp(Pi@pois_regr2.mu + C[:, [1]]@pois_regr2.beta), marker = '.', s = 1)
+
+
+# }}}
+
+# }}}
+
+#
+# DNAse HS/FAIRE {{{
+
+## DNAse {{{
+
+bw = pyBigWig.open("covars/wgEncodeUwDnaseGm12878RawRep1.bigWig")
+
+# WGS (2kb chunks)
+clen = seq.get_chrlens()
+C = []
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]]):
+    bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
+    tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "DNAse" : 0 })
+    for j, (st, en) in enumerate(tqdm.tqdm(bins)):
+        tmp.loc[j, "DNAse"] = np.nanmean(np.r_[bw.values(chrname, st, en)])
+    C.append(tmp)
+
+# preliminary results not so great; stick with FAIRE for now
+
+# TODO: liftover to hg38
+
+# WES
+
+# }}}
+
+## FAIRE {{{
+
+## convert bigWig to FWB
+
+# for some reason pyBigWig can't process this file
+# bw = pyBigWig.open("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigwig")
+
+# use bigWig2FWB instead
+# git clone git@github.com:getzlab/bigWig2FWB.git
+
+# figure out range of file
+# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/bwtest
+# ./getmax
+# -> max = 5478
+# set scale factor to 11
+# bigWig2FWB/bigWig2FWB covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.bigWig covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal
+
+## WGS
+from capy import fwb, mut
+
+F = fwb.FWB("covars/wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb");
+
+clen = seq.get_chrlens()
+C = []
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]):
+    bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
+    tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1], "FAIRE" : 0 })
+    for j, (st, en) in enumerate(tqdm.tqdm(bins)):
+        tmp.loc[j, "FAIRE"] = F.get(chrname, np.r_[st:en] + 1).mean()
+    C.append(tmp)
+
+FAIRE = pd.concat(C, ignore_index = True)
+FAIRE["chr"] = mut.convert_chr(FAIRE["chr"])
+FAIRE.to_pickle("covars/FAIRE_GM12878.hg19.pickle")
+
+# smoothed version
+FAIRE_smooth = FAIRE.copy()
+FAIRE_smooth["FAIRE"] = np.convolve(FAIRE["FAIRE"], np.ones(5), mode = "same")/5
+FAIRE_smooth.to_pickle("covars/FAIRE_GM12878.smooth5.hg19.pickle")
+
+#
+# re-process all FAIRE files using samtools
+import wolf, itertools, glob, prefect
+
+## make interval list
+clen = seq.get_chrlens()
+for i, chrname in enumerate(["chr" + str(x) for x in list(range(1, 23)) + ["X"]]):
+    bins = np.r_[0:clen[i]:2000, clen[i]]; bins = np.c_[bins[:-1], bins[1:]]
+    tmp = pd.DataFrame({ "chr" : chrname, "start" : bins[:, 0], "end" : bins[:, 1] })
+    tmp.to_csv(f"FAIRE/intervals/{chrname}.bed", sep = "\t", header = None, index = False)
+
+## define coverage workflow
+
+class markdups(wolf.Task):
+    inputs = { "bamin" }
+    script = "samtools markdup ${bamin} $(basename ${bamin}).dedup.bam && samtools index *dedup.bam"
+    outputs = { "bam" : "*.bam", "bai" : "*.bai" }
+    docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5"
+
+intervals = glob.glob("/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE/intervals/*.bed")
+
+def BedCovFlow(bams, intervals):
+    # mark duplicates
+    mark_dups = []
+    for b in bams:
+        mark_dups.append(markdups(
+          inputs = { "bamin" : b },
+          overrides = { "bamin" : "string" },
+          use_scratch_disk = True,
+          scratch_disk_size = 10
+        ))
+
+    # run bedcov on all BAMs (gather)
+    @prefect.task(nout = 2)
+    def bl(md):
+        return [m["bam"] for m in md], [m["bai"] for m in md]
+    bam_list, bai_list = bl(mark_dups)
+
+    BedCov = wolf.Task(
+      name = "BedCov",
+      inputs = { "intervals" : intervals, "bams" : [bam_list], "bais" : [bai_list] },
+      script = """
+      samtools bedcov -Q1 ${intervals} $(cat ${bams}) > coverage.bed
+      """,
+      outputs = { "coverage" : "coverage.bed" },
+      docker = "gcr.io/broad-getzlab-workflows/base_image:v0.0.5"
+    )
+#    for b in bam_list:
+#        wolf.DeleteDisk(b, BedCov["coverage"])
+
+    # gather BedCovs
+    BedCovGather = wolf.Task(
+      name = "BedCovGather",
+      inputs = { "beds" : [BedCov["coverage"]] },
+      script = """
+      cat $(cat ${beds}) | sort -k1,1V -k2,2n | \
+        awk -F'\t' 'BEGIN { OFS = FS } { tot = 0; for(i = 4; i <= NF; i++) { tot += $i }; print $0, tot }' > concat.bed
+      """,
+      outputs = { "concat" : "concat.bed" },
+    )
+
+## run workflow 
+
+base_url = "http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeOpenChromFaire/"
+
+BAMs = ["wgEncodeOpenChromFaireA549AlnRep1.bam", # {{{
+"wgEncodeOpenChromFaireA549AlnRep2.bam",
+"wgEncodeOpenChromFaireAstrocyAlnRep1.bam",
+"wgEncodeOpenChromFaireAstrocyAlnRep2.bam",
+"wgEncodeOpenChromFaireColonocAlnRep1.bam",
+"wgEncodeOpenChromFaireColonocAlnRep2.bam",
+"wgEncodeOpenChromFaireEndometriumocAlnRep1.bam",
+"wgEncodeOpenChromFaireEndometriumocAlnRep2.bam",
+"wgEncodeOpenChromFaireFrontalcortexocAlnRep1.bam",
+"wgEncodeOpenChromFaireFrontalcortexocAlnRep2.bam",
+"wgEncodeOpenChromFaireGlioblaAlnRep1.bam",
+"wgEncodeOpenChromFaireGlioblaAlnRep2.bam",
+"wgEncodeOpenChromFaireGlioblaAlnRep3.bam",
+"wgEncodeOpenChromFaireGm12878AlnRep1.bam",
+"wgEncodeOpenChromFaireGm12878AlnRep2.bam",
+"wgEncodeOpenChromFaireGm12878AlnRep3.bam",
+"wgEncodeOpenChromFaireGm12891AlnRep1.bam",
+"wgEncodeOpenChromFaireGm12891AlnRep2.bam",
+"wgEncodeOpenChromFaireGm12892AlnRep1.bam",
+"wgEncodeOpenChromFaireGm12892AlnRep2.bam",
+"wgEncodeOpenChromFaireGm18507AlnRep1.bam",
+"wgEncodeOpenChromFaireGm18507AlnRep2.bam",
+"wgEncodeOpenChromFaireGm19239AlnRep1.bam",
+"wgEncodeOpenChromFaireGm19239AlnRep2.bam",
+"wgEncodeOpenChromFaireH1hescAlnRep1.bam",
+"wgEncodeOpenChromFaireH1hescAlnRep2.bam",
+"wgEncodeOpenChromFaireHelas3AlnRep1.bam",
+"wgEncodeOpenChromFaireHelas3AlnRep2.bam",
+"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep1.bam",
+"wgEncodeOpenChromFaireHelas3Ifna4hAlnRep2.bam",
+"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep1.bam",
+"wgEncodeOpenChromFaireHelas3Ifng4hAlnRep2.bam",
+"wgEncodeOpenChromFaireHepg2AlnRep1.bam",
+"wgEncodeOpenChromFaireHepg2AlnRep2.bam",
+"wgEncodeOpenChromFaireHepg2AlnRep3.bam",
+"wgEncodeOpenChromFaireHtr8AlnRep1.bam",
+"wgEncodeOpenChromFaireHtr8AlnRep2.bam",
+"wgEncodeOpenChromFaireHuvecAlnRep1.bam",
+"wgEncodeOpenChromFaireHuvecAlnRep2.bam",
+"wgEncodeOpenChromFaireK562AlnRep1.bam",
+"wgEncodeOpenChromFaireK562AlnRep2.bam",
+"wgEncodeOpenChromFaireK562NabutAlnRep1.bam",
+"wgEncodeOpenChromFaireK562NabutAlnRep2.bam",
+"wgEncodeOpenChromFaireK562OhureaAlnRep1.bam",
+"wgEncodeOpenChromFaireK562OhureaAlnRep2.bam",
+"wgEncodeOpenChromFaireKidneyocAlnRep1.bam",
+"wgEncodeOpenChromFaireKidneyocAlnRep2.bam",
+"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep1.bam",
+"wgEncodeOpenChromFaireMcf7Est10nm30mAlnRep2.bam",
+"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep1.bam",
+"wgEncodeOpenChromFaireMcf7HypoxlacAlnRep2.bam",
+"wgEncodeOpenChromFaireMcf7VehAlnRep1.bam",
+"wgEncodeOpenChromFaireMcf7VehAlnRep2.bam",
+"wgEncodeOpenChromFaireMedulloAlnRep1.bam",
+"wgEncodeOpenChromFaireMedulloAlnRep2.bam",
+"wgEncodeOpenChromFaireMrta2041AlnRep1.bam",
+"wgEncodeOpenChromFaireMrta2041AlnRep2.bam",
+"wgEncodeOpenChromFaireMrtg4016AlnRep1.bam",
+"wgEncodeOpenChromFaireMrtg4016AlnRep2.bam",
+"wgEncodeOpenChromFaireMrtttc549AlnRep1.bam",
+"wgEncodeOpenChromFaireMrtttc549AlnRep2.bam",
+"wgEncodeOpenChromFaireNhaAlnRep1.bam",
+"wgEncodeOpenChromFaireNhaAlnRep2.bam",
+"wgEncodeOpenChromFaireNhbeAlnRep1.bam",
+"wgEncodeOpenChromFaireNhbeAlnRep2.bam",
+"wgEncodeOpenChromFaireNhekAlnRep1.bam",
+"wgEncodeOpenChromFaireNhekAlnRep2.bam",
+"wgEncodeOpenChromFairePancreasocAlnRep1.bam",
+"wgEncodeOpenChromFairePancreasocAlnRep2.bam",
+"wgEncodeOpenChromFairePanisletsAlnRep1.bam",
+"wgEncodeOpenChromFaireRcc7860AlnRep1.bam",
+"wgEncodeOpenChromFaireRcc7860AlnRep2.bam",
+"wgEncodeOpenChromFaireSmallintestineocAlnRep1.bam",
+"wgEncodeOpenChromFaireSmallintestineocAlnRep2.bam",
+"wgEncodeOpenChromFaireUrotsaAlnRep1.bam",
+"wgEncodeOpenChromFaireUrotsaAlnRep2.bam",
+"wgEncodeOpenChromFaireUrotsaUt189AlnRep1.bam",
+"wgEncodeOpenChromFaireUrotsaUt189AlnRep2.bam"] # }}}
+
+B = pd.Series(BAMs).str.extract("(?P<bam>.*Faire(?P<cell_line>.*)AlnRep(?P<rep>\d+)\.bam)")
+
+with wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov") as w:
+    for cell_line, b in B.groupby("cell_line"):
+        w.run(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals)
+
+## parse in coverages; make covariate table
+from capy import mut
+
+w = wolf.Workflow(workflow = BedCovFlow, namespace = "FAIRE_cov")
+for cell_line, b in B.groupby("cell_line"):
+    w.load_results(RUN_NAME = cell_line, bams = base_url + b["bam"], intervals = intervals)
+
+T = w.tasks.loc[(slice(None), "BedCovGather"), ["results"]].droplevel(1)
+T["covpath"] = T["results"].apply(lambda x : x["concat"])
+
+for i, (cell_line, cov) in enumerate(T.iterrows()):
+    X = pd.read_csv(cov["covpath"], sep = "\t", header = None)
+    X = X.rename(columns = { len(X.columns) - 1 : cell_line })
+    # get common lines
+    if i == 0:
+        C = X.iloc[:, np.r_[0:3, -1]].rename(columns = { 0 : "chr", 1 : "start", 2 : "end" })
+    else:
+        C = pd.concat([C, X.iloc[:, -1]], axis = 1)
+
+C["chr"] = mut.convert_chr(C["chr"])
+
+C.to_pickle("covars/FAIRE/coverage.dedup.raw.pickle")
+
+# rebin to 10k
+C["index_r"] = C.index//5
+C10k = C.groupby(["chr", "index_r"]).agg({
+   "start" : min, "end" : max,
+   **{ k : sum for k in C.columns[3:] }
+}).droplevel(1).reset_index().drop(columns = "index_r")
+
+C10k.to_pickle("covars/FAIRE/coverage.dedup.raw.10kb.pickle")
+
+# 100k?
+C["index_r"] = C.index//50
+C100k = C.groupby(["chr", "index_r"]).agg({
+   "start" : min, "end" : max,
+   **{ k : sum for k in C.columns[3:] }
+}).droplevel(1).reset_index().drop(columns = "index_r")
+
+C100k.to_pickle("covars/FAIRE/coverage.dedup.raw.100kb.pickle")
+
+# }}}
+
+# }}}
diff --git a/85_simFC.py b/85_simFC.py
new file mode 100644
index 0000000..b9ebcf9
--- /dev/null
+++ b/85_simFC.py
@@ -0,0 +1,66 @@
+import wolf
+
+mutect = wolf.ImportTask("github.com:getzlab/MuTect1_TOOL.git", "M1")
+
+def workflow(
+  bam, bai, vcf,
+  refFasta = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa",
+  refFastaIdx = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.fa.fai",
+  refFastaDict = "gs://getzlab-workflows-reference_files-oa/hg38/gdc/GRCh38.d1.vd1.dict"
+):
+    localize = wolf.LocalizeToDisk(
+      files = {
+        "bam" : bam,
+        "bai" : bai,
+        "vcf" : vcf,
+        "refFasta" : refFasta,
+        "refFastaIdx" : refFastaIdx,
+        "refFastaDict" : refFastaDict
+      }
+    )
+
+    split_vcf = wolf.Task(
+      name = "split_vcf",
+      inputs = { "vcf" : localize["vcf"] },
+      script = """
+      grep '^#' ${vcf} > header
+      sed '/^#/d' ${vcf} | split -l 10000 -d -a 3 --filter='cat header /dev/stdin > $FILE' - VCF_chunk
+      """,
+      outputs = { "shards" : "VCF_chunk*" }
+    )
+
+    m1_scatter = mutect.mutect1(
+      inputs = {
+        "pairName" : "platinum",
+        "caseName" : "platinum",
+        "t_bam" : localize["bam"],
+        "t_bai" : localize["bai"],
+        "force_calling" : True,
+        "intervals" : split_vcf["shards"],
+        "fracContam" : 0,
+        "refFasta" : localize["refFasta"],
+        "refFastaIdx" : localize["refFastaIdx"],
+        "refFastaDict" :  localize["refFastaDict"]
+      }
+    )
+
+    m1_gather = wolf.Task(
+      name = "m1_gather",
+      inputs = { "callstats_array" : [m1_scatter["mutect1_cs"]] },
+      script = """
+      head -n2 $(head -n1 ${callstats_array}) > header
+      while read -r i; do
+        sed '1,2d' $i
+      done < ${callstats_array} | sort -k1,1V -k2,2n > cs_sorted
+      cat header cs_sorted > cs_concat.tsv
+      """,
+      outputs = { "cs_gather" : "cs_concat.tsv" }
+    )
+
+with wolf.Workflow(workflow = workflow, namespace = "HS_sim") as w:
+    w.run(
+      RUN_NAME = "NA12878_WGS_platinum_hg38",
+      bam = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam",
+      bai = "gs://jh-xfer/NA12878_bwamem_illumina_platinum_bed.bam.bai",
+      vcf = "gs://jh-xfer/NA12878.vcf"
+    )
diff --git a/Dockerfile b/Dockerfile
index 9234955..249c37e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -4,8 +4,10 @@ WORKDIR /build
 
 # install dependencies
 RUN pip install sortedcontainers
+ARG cache_invalidate=xxx
 RUN git clone https://github.com/getzlab/CApy.git && pip install ./CApy
 RUN pip install dask distributed
+RUN pip install distinctipy
 
 # install hapaseg
 COPY setup.py .
diff --git a/covars/getmax.c b/covars/getmax.c
new file mode 100644
index 0000000..5bcc865
--- /dev/null
+++ b/covars/getmax.c
@@ -0,0 +1,15 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <byteswap.h>
+
+int main() {
+   FILE* x = fopen("wgEncodeOpenChromFaireGm12878BaseOverlapSignal.fwb", "r");
+   uint16_t max = 0;
+   uint16_t buf;
+   while(fread(&buf, 2, 1, x)) {
+      buf = __bswap_16(buf);
+      if(buf > max) { max = buf; printf("%d\n", max); }
+   }
+   return 0;
+}
diff --git a/hapaseg/NB_coverage_MCMC.py b/hapaseg/NB_coverage_MCMC.py
index 6275f39..dd77611 100644
--- a/hapaseg/NB_coverage_MCMC.py
+++ b/hapaseg/NB_coverage_MCMC.py
@@ -1,12 +1,14 @@
 import numpy as np
 import scipy.special as ss
 import sortedcontainers as sc
+import sys
 from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB
 import warnings
 from statsmodels.tools.sm_exceptions import ConvergenceWarning, HessianInversionWarning
 import matplotlib as mpl
 import matplotlib.pyplot as plt
 from scipy.signal import find_peaks
+import scipy.sparse as sp
 from .model_optimizers import PoissonRegression
 
 # turn off warnings for statsmodels fitting
@@ -48,8 +50,11 @@ def __init__(self, r, C, mu_0, beta_0, bin_width=1):
         self.segment_lens = sc.SortedDict([(0, len(self.r))])
         
         # keep cache of previously computed breakpoints for fast splitting
-        # these breakpoints keys are in the form (st, en, breakpoint)
-        self.breakpoint_cache = {}
+        sz = tuple(np.r_[1, 1]*(len(r) + 1))
+        self.cache_LL_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_LL = []
+        self.cache_mu_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_mu = []
+        self.cache_lepsi_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_lepsi = []
+        self.cache_hess_ptr = sp.dok_matrix(sz, dtype = np.int64); self.cache_hess = []
 
         self.phase_history = []
         self.F = sc.SortedList()
@@ -79,8 +84,17 @@ def get_seg_ind(self, seg):
     def get_join_seg_ind(self, seg_l, seg_r):
         return seg_l, seg_r + self.segment_lens[seg_r]
 
+    # get overall likelihood for all segments
     def get_ll(self):
-        return self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr, True)
+        bdy = np.r_[list(self.segments), len(self.r)]; bdy = np.c_[bdy[:-1], bdy[1:]]
+        ll = 0
+        for st, en in bdy:
+            # lookup in cache
+            if (ptr := self.cache_LL_ptr[st, en]) != 0:
+                ll += self.cache_LL[ptr]
+            else:
+                ll += self.ll_cluster([st, en], self.mu_i_arr[st:en], self.lepsi_i_arr[st:en], True)
+        return ll
     
     # read in the merged cluster assignments from burnin scatter jobs and 
     # fill in data structures for cluster mcmc accordingly
@@ -137,12 +151,28 @@ def stats_init(self):
 
     # statsmodels NB BFGS optimizer is more stable than NR so we will use it until migration to LNP
     def stats_optimizer(self, ind, ret_hess=False):
+        # cache hit; look up values
+        if self.cache_mu_ptr[ind[0], ind[1]] != 0:
+            #breakpoint()
+            mu = self.cache_mu[self.cache_mu_ptr[ind[0], ind[1]]]
+            lepsi = self.cache_lepsi[self.cache_lepsi_ptr[ind[0], ind[1]]]
+            if ret_hess:
+                return mu, lepsi, self.cache_hess[self.cache_hess_ptr[ind[0], ind[1]]]
+            else:
+                return mu, lepsi
+
+        # cache miss; compute values
         endog = self.r[ind[0]:ind[1]].flatten()
         exog = np.ones(self.r[ind[0]:ind[1]].shape[0])
         exposure = np.ones(self.r[ind[0]:ind[1]].shape[0]) * self.bin_exposure
         sNB = statsNB(endog, exog, exposure=exposure, offset=(self.C[ind[0]:ind[1]] @ self.beta).flatten() + self.mu)
         res = sNB.fit(disp=0)
 
+        # save to cache
+        self.cache_mu.append(res.params[0]); self.cache_mu_ptr[ind[0], ind[1]] = len(self.cache_mu) - 1
+        self.cache_lepsi.append(-np.log(res.params[1])); self.cache_lepsi_ptr[ind[0], ind[1]] = len(self.cache_lepsi) - 1
+        self.cache_hess.append(sNB.hessian(res.params)); self.cache_hess_ptr[ind[0], ind[1]] = len(self.cache_hess) - 1
+
         if ret_hess:
             return res.params[0], -np.log(res.params[1]), sNB.hessian(res.params)
         else:
@@ -165,16 +195,16 @@ def refit_beta(self):
             self.lepsi_i_arr[row[0]:row[1]] = lepsi_i
 
     # method for calculating the overall log likelihood of an allelic cluster given a hypothetical mu_i and lepsi arrays
-    def ll_cluster(self, mu_i_arr, lepsi_i_arr, take_sum=True):
-        mu_i_arr = mu_i_arr.flatten()
-        epsi_i_arr = np.exp(lepsi_i_arr).flatten()
+    def ll_cluster(self, ind, mu_i, lepsi_i, take_sum=True):
+        epsi_i = np.exp(lepsi_i)
         exposure= np.log(self.bin_exposure)
-        bc = (self.C @ self.beta).flatten() + exposure
-        exp = np.exp(self.mu + bc + mu_i_arr).flatten()
+        bc = (self.C[ind[0]:ind[1]] @ self.beta).flatten() + exposure
+        exp = np.exp(self.mu + bc + mu_i).flatten()
+        r_subset = self.r[ind[0]:ind[1]]
 
-        lls = (ss.gammaln(self.r + epsi_i_arr) - ss.gammaln(self.r + 1) - ss.gammaln(epsi_i_arr) +
-               (self.r * (self.mu + bc + mu_i_arr - np.log(epsi_i_arr + exp))) +
-               (epsi_i_arr * np.log(epsi_i_arr / (epsi_i_arr + exp))))
+        lls = (ss.gammaln(r_subset + epsi_i) - ss.gammaln(r_subset + 1) - ss.gammaln(epsi_i) +
+               (r_subset * (self.mu + bc + mu_i - np.log(epsi_i + exp))) +
+               (epsi_i * np.log(epsi_i / (epsi_i + exp))))
         if not take_sum:
             return lls
         return lls.sum()
@@ -276,6 +306,8 @@ def _calculate_splits(self, ind, split_indices):
         Hs = []
         for ix in split_indices:
             if ix < 0:
+                # what do we do here WRT ll_cluster indices? FIXME
+                breakpoint()
                 # no split proposal
                 ll_join = self.ll_cluster(self.mu_i_arr, self.lepsi_i_arr)
                 lls.append(ll_join)
@@ -290,15 +322,24 @@ def _calculate_splits(self, ind, split_indices):
                 lepsis.append((lepsi_l, lepsi_r))
                 Hs.append((H_l, H_r))
 
-                tmp_mui = self.mu_i_arr.copy()
-                tmp_mui[ind[0]:ix] = mu_l
-                tmp_mui[ix: ind[1]] = mu_r
-                tmp_lepsi = self.lepsi_i_arr.copy()
-                tmp_lepsi[ind[0]:ix] = lepsi_l
-                tmp_lepsi[ix: ind[1]] = lepsi_r
+                # lookup likelihoods in cache
+                # left:
+                if (ptr := self.cache_LL_ptr[ind[0], ix]) != 0:
+                    #breakpoint()
+                    ll_l = self.cache_LL[ptr]
+                else: 
+                    ll_l = self.ll_cluster([ind[0], ix], mu_l, lepsi_l)
+                    self.cache_LL.append(ll_l); self.cache_LL_ptr[ind[0], ix] = len(self.cache_LL) - 1
+
+                # right:
+                if (ptr := self.cache_LL_ptr[ix, ind[1]]) != 0:
+                    #breakpoint()
+                    ll_r = self.cache_LL[ptr]
+                else:
+                    ll_r = self.ll_cluster([ix, ind[1]], mu_r, lepsi_r)
+                    self.cache_LL.append(ll_r); self.cache_LL_ptr[ix, ind[1]] = len(self.cache_LL) - 1
 
-                ll = self.ll_cluster(tmp_mui, tmp_lepsi)
-                lls.append(ll)
+                lls.append(ll_l + ll_r)
 
         return lls, mus, lepsis, Hs
 
@@ -412,7 +453,7 @@ def _detailed_sampling(self, ind, lls, split_indices, mus, lepsis, Hs):
     def _lls_to_MLs(self, lls, Hs):
         MLs = np.zeros(len(lls))
         for i, (ll, Hs) in enumerate(zip(lls, Hs)):
-            laplacian = self._get_log_ML_split(Hs[0], Hs[1])
+            laplacian = self._get_log_ML_gaussint_split(Hs[0], Hs[1])
             # the split results in a nan make it impossible to split there
             if np.isnan(laplacian):
                 laplacian = -1e50
@@ -444,25 +485,28 @@ def _get_split_liks(self, ind, debug=False):
 
         return split_indices, MLs, mus, lepsis
 
-    # computes ML component from hessian approximation for a single segment
-    def _get_log_ML_approx_join(self, Hess):
+    # computes Gaussian integral for ML Laplace approximation for a single segment
+    def _get_log_ML_gaussint_join(self, Hess):
         return np.log(2 * np.pi) - (np.log(np.linalg.det(-Hess))) / 2
 
-    # computes ML component from hessian approximation for two split segments
-    def _get_log_ML_split(self, H1, H2):
-        return np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2
+    # computes Gaussian integral for ML Laplace approximation for two split segments
+    def _get_log_ML_gaussint_split(self, H1, H2):
+        return 2*np.log(2 * np.pi) - (np.log(np.linalg.det(-H1) * np.linalg.det(-H2))) / 2
 
     # computes the log ML of joining two segments
     def _log_ML_join(self, ind, ret_opt_params=False):
         mu_share, lepsi_share, H_share = self.stats_optimizer(ind, True)
-        tmp_mui = self.mu_i_arr.copy()
-        tmp_mui[ind[0]:ind[1]] = mu_share
-        tmp_lepsi = self.lepsi_i_arr.copy()
-        tmp_lepsi[ind[0]:ind[1]] = lepsi_share
-        ll_join = self.ll_cluster(tmp_mui, tmp_lepsi)
-        if ret_opt_params:
-            return mu_share, lepsi_share, self._get_log_ML_join(H_share) + ll_join
-        return mu_share, lepsi_share, self._get_log_ML_approx_join(H_share) + ll_join
+
+        # lookup cache
+        if (ptr := self.cache_LL_ptr[ind[0], ind[1]]) != 0:
+            #breakpoint()
+            ll_join = self.cache_LL[ptr]
+        else:
+            ll_join = self.ll_cluster(ind, mu_share, lepsi_share)
+
+            # add to cache
+            self.cache_LL.append(ll_join); self.cache_LL_ptr[ind[0], ind[1]] = len(self.cache_LL) - 1
+        return mu_share, lepsi_share, self._get_log_ML_gaussint_join(H_share) + ll_join
 
     """
     Split segment method. This method chooses a segment at random
@@ -493,7 +537,7 @@ def split(self, debug):
 
         max_ML = max(log_MLs)
         k_probs = np.exp(log_MLs - max_ML) / np.exp(log_MLs - max_ML).sum()
-        
+
         if np.isnan(k_probs).any():
             print("skipping split iteration due to nan. log MLs: ", log_MLs, flush=True)
             return 0
@@ -545,7 +589,7 @@ def join(self, debug):
         ind = self.get_join_seg_ind(seg_l, seg_r)
 
         lls_split, _, _, Hs = self._calculate_splits(ind, [seg_r])
-        log_split_ML = lls_split[0] + self._get_log_ML_split(Hs[0][0], Hs[0][1])
+        log_split_ML = lls_split[0] + self._get_log_ML_gaussint_split(Hs[0][0], Hs[0][1])
         mu_share, lepsi_share, log_join_ML = self._log_ML_join(ind)
 
         log_MLs = np.r_[log_split_ML, log_join_ML]
@@ -738,13 +782,12 @@ def prepare_results(self):
     """
 class NB_MCMC_SingleCluster:
 
-    def __init__(self, n_iter, r, C, mu, beta, cluster_num, bin_width=1):
+    def __init__(self, n_iter, r, C, mu, beta, bin_width=1):
         self.n_iter = n_iter
         self.r = r
         self.C = C
         self.beta = beta
         self.mu = mu
-        self.cluster_num = cluster_num
         self.bin_width = bin_width
         # for now assume that the Pi vector assigns each bin to exactly one cluster
         
@@ -759,6 +802,7 @@ def __init__(self, n_iter, r, C, mu, beta, cluster_num, bin_width=1):
         self.mu_i_samples = []
         self.lepsi_i_samples = []
         self.F_samples = []
+        self.ll_samples = []
 
         self.ll_cluster = 0
         self.ll_iter = []
@@ -784,11 +828,12 @@ def save_sample(self):
         self.mu_i_samples.append(self.cluster.mu_i_arr.copy())
         self.lepsi_i_samples.append(self.cluster.lepsi_i_arr.copy())
         self.F_samples.append(self.cluster.F.copy())
+        self.ll_samples.append(self.ll_cluster)
 
     def run(self,
             debug=False,
             stop_after_burnin=False):
-        print("starting MCMC coverage segmentation for cluster {}...".format(self.cluster_num), flush=True)
+        print("Starting MCMC coverage segmentation ...", flush=True, file=sys.stderr)
 
         past_it = 0
         n_it = 0
diff --git a/hapaseg/__main__.py b/hapaseg/__main__.py
index 32b19d4..583db50 100644
--- a/hapaseg/__main__.py
+++ b/hapaseg/__main__.py
@@ -9,6 +9,7 @@
 import scipy.stats as s
 import scipy.special as ss
 import sortedcontainers as sc
+import traceback
 
 from capy import mut
 
@@ -20,7 +21,7 @@
 from . import utils as hs_utils
 
 from .NB_coverage_MCMC import NB_MCMC_SingleCluster
-from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_clusters, aggregate_burnin_files 
+from .run_coverage_MCMC import CoverageMCMCRunner, aggregate_adp_segments, aggregate_burnin_files
 from .coverage_DP import Coverage_DP
 from .a_cov_DP import generate_acdp_df, AllelicCoverage_DP
 
@@ -112,13 +113,9 @@ def parse_args():
                             required=True)
     ## DP
     dp = subparsers.add_parser("dp", help="Run DP clustering on allelic imbalance segments")
-    dp.add_argument("--seg_dataframe", required=True)
-    dp.add_argument("--n_dp_iter", default=10)
-    dp.add_argument("--seg_samp_idx", default=0)
-    dp.add_argument("--ref_fasta",
-                    required=True)  # TODO: only useful for chrpos->gpos; will be removed when this is passed from load
-    dp.add_argument("--cytoband_file",
-                    required=True)  # TODO: only useful for chrpos->gpos; will be removed when this is passed from load
+    dp.add_argument("--seg_dataframe", required = True)
+    dp.add_argument("--ref_fasta", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load
+    dp.add_argument("--cytoband_file", required = True) # TODO: only useful for chrpos->gpos; will be removed when this is passed from load
   
     ## coverage MCMC
     coverage_mcmc = subparsers.add_parser("coverage_mcmc",
@@ -128,6 +125,7 @@ def parse_args():
     coverage_mcmc.add_argument("--allelic_clusters_object",
                                help="npy file containing allelic dp segs-to-clusters results")
     coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs")
+    coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     coverage_mcmc.add_argument("--covariate_dir",
                                help="path to covariate directory with covariates all in pickled files")
     coverage_mcmc.add_argument("--num_draws", type=int,
@@ -148,24 +146,28 @@ def parse_args():
     preprocess_coverage_mcmc.add_argument("--allelic_clusters_object",
                                           help="npy file containing allelic dp segs-to-clusters results", required=True)
     preprocess_coverage_mcmc.add_argument("--SNPs_pickle", help="pickled dataframe containing SNPs", required=True)
+    preprocess_coverage_mcmc.add_argument("--segmentations_pickle", help="pickled sorteddict containing allelic imbalance segment boundaries", required=True)
     preprocess_coverage_mcmc.add_argument("--repl_pickle", help="pickled dataframe containing replication timing data", required=True)
+    preprocess_coverage_mcmc.add_argument("--faire_pickle", help="pickled dataframe containing FAIRE data", required=True)
     preprocess_coverage_mcmc.add_argument("--gc_pickle", help="pickled dataframe containing precomputed gc content. This is not required but will speed up runtime if passed", default=None)
     preprocess_coverage_mcmc.add_argument("--allelic_sample", type=int,
                                           help="index of sample clustering from allelic DP to use as seed for segmentation. Will use most likely clustering by default",
                                           default=None)
+    preprocess_coverage_mcmc.add_argument("--ref_fasta", required = True)
+    preprocess_coverage_mcmc.add_argument("--bin_width", help = "Coverage bin width (for WGS only)", default = 1, type = int)
 
     ## running coverage mcmc on single cluster for scatter task
     coverage_mcmc_shard = subparsers.add_parser("coverage_mcmc_shard",
                                                 help="run coverage mcmc on single ADP cluster")
-    coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data',
+    coverage_mcmc_shard.add_argument("--preprocess_data", help='path to numpy object containing preprocessed data: covariate matrix (C), global beta, ADP cluster mu\'s, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r)',
                                      required=True)
+    coverage_mcmc_shard.add_argument("--allelic_seg_indices", help='path to pickled pandas dataframe containing coverage bin indices for each alleic segment',
+                                     required=True)
+    coverage_mcmc_shard.add_argument("--allelic_seg_idx", help='which allelic segment to perform coverage segmentation on.',
+                                     required=True, type=int)
     coverage_mcmc_shard.add_argument("--num_draws", type=int,
                                help="number of draws to take from coverage segmentation MCMC", default=50)
-    coverage_mcmc_shard.add_argument("--cluster_num", type=int,
-                               help="cluster index for this worker to run on. If unspecified method will simulate "
-                                    "all clusters on the same machine", default=None)
     coverage_mcmc_shard.add_argument("--bin_width", type=int, default=1, help="size of uniform bins if using. Otherwise 1.")
-    coverage_mcmc_shard.add_argument("--range", type=str, help="range of coverage bins within the cluster to burnin. should be in start-end form. Note that this will cause num draws to be overridden to 1")
     coverage_mcmc_shard.add_argument("--burnin_files", type=str, help="txt file containing burnt in segment assignments")
 
     ## collect coverage MCMC shards
@@ -208,6 +210,7 @@ def parse_args():
     ac_dp.add_argument("--num_samples", type=int, help="number of samples to take")
     ac_dp.add_argument("--cytoband_file", help="path to cytoband txt file")
     ac_dp.add_argument("--warmstart", type=bool, default=True, help="run clustering with warmstart")
+
     args = parser.parse_args()
 
     # validate arguments
@@ -324,6 +327,13 @@ def main():
         with open(output_dir + "/amcmc_results.pickle", "wb") as f:
             pickle.dump(H.run(), f)
 
+        try:
+            H.visualize()
+            plt.savefig(output_dir + "/figures/MLE_segmentation.png", dpi = 300)
+        except Exception:
+            print("Error plotting segments; see stack trace for details:")
+            print(traceback.format_exc())
+
     elif args.command == "concat":
         #
         # load scatter intervals
@@ -436,79 +446,49 @@ def main():
         A = A_DP(args.seg_dataframe, ref_fasta=args.ref_fasta)
 
         # run DP
-        # TODO: when we have better type checking, drop the int coersion here
-        # N_seg_samps = A.n_samp - 1 if int(args.n_seg_samps) == 0 else int(args.n_seg_samps)
-        # TODO: if we decide to drop support for chained sampling altogether, remove N_seg_samps logic altogether
-        snps_to_clusters, snps_to_phases, likelihoods = A.run(
-            seg_sample_idx=int(args.seg_samp_idx),
-            # N_seg_samps = N_seg_samps,
-            N_clust_samps=int(args.n_dp_iter)
-        )
+        snps_to_clusters, snps_to_phases, likelihoods = A.run()
 
         # save DP results
+        # SNP assignment/phasing samples, likelihoods of each sample
         np.savez(output_dir + "/allelic_DP_SNP_clusts_and_phase_assignments.npz",
                  snps_to_clusters=snps_to_clusters,
                  snps_to_phases=snps_to_phases,
                  likelihoods=likelihoods
                  )
 
+        # segmentation breakpoints for each sample
+        with open(output_dir + "/segmentations.pickle", "wb") as f:
+            pickle.dump(A.DP_run.segment_trace, f)
+
+        # full SNP dataframe
         A.SNPs.to_pickle(output_dir + "/all_SNPs.pickle")
 
         #
         # plot DP results
 
-        # 1. phased SNP visualization
-        f = plt.figure(figsize=[17.56, 5.67])
-        hs_utils.plot_chrbdy(args.cytoband_file)
-        A.visualize_SNPs(snps_to_phases, color=True, f=f)
-        A.visualize_clusts(snps_to_clusters, f=f, thick=True, nocolor=True)
-        plt.ylabel("Haplotypic imbalance")
-        plt.title("SNP phasing/segmentation")
-        plt.savefig(output_dir + "/figures/SNPs.png", dpi=300)
-        plt.close()
+        # 0. likelihood trace
+        A.DP_run.plot_likelihood_trace()
+        plt.savefig(output_dir + "/figures/likelihood_trace.png", dpi = 300)
 
-        # 2. pre-clustering segments
-        f = plt.figure(figsize=[17.56, 5.67])
+        # 1. SNPs + segments
+        f = plt.figure(figsize = [17.56, 5.67])
         hs_utils.plot_chrbdy(args.cytoband_file)
-        A.visualize_SNPs(snps_to_phases, color=False, f=f)
-        A.visualize_segs(snps_to_clusters, f=f)
+        A.DP_run.visualize_segs(f = f, show_snps = True)
         plt.ylabel("Haplotypic imbalance")
-        plt.title("Allelic segmentation, pre-DP clustering")
-        plt.savefig(output_dir + "/figures/allelic_imbalance_preDP.png", dpi=300)
+        plt.title("SNPs + allelic segmentation (MAP)")
+        plt.savefig(output_dir + "/figures/SNPs.png", dpi = 300)
         plt.close()
 
-        # 3. post-clustering segments
-        f = plt.figure(figsize=[17.56, 5.67])
+        # 2. segments alone
+        f = plt.figure(figsize = [17.56, 5.67])
         hs_utils.plot_chrbdy(args.cytoband_file)
-        A.visualize_SNPs(snps_to_phases, color=False, f=f)
-        A.visualize_clusts(snps_to_clusters, f=f, thick=True)
+        A.DP_run.visualize_segs(f = f, show_snps = False)
         plt.ylabel("Haplotypic imbalance")
-        plt.title("Allelic segmentation, post-DP clustering")
-        plt.savefig(output_dir + "/figures/allelic_imbalance_postDP.png", dpi=300)
+        plt.title("Allelic segmentation (posterior)")
+        plt.savefig(output_dir + "/figures/segs_only.png", dpi = 300)
         plt.close()
-    
-    #collect adp run data
-    elif args.command == "collect_adp":
-        with open(args.dp_results, 'r') as f:
-	        dp_results = f.readlines()
-        accum_clusts = []
-        accum_phases = []
-        accum_liks = []
-        
-        for dp_shard in dp_results:
-            obj = np.load(dp_shard.rstrip('\n'))
-            accum_clusts.append(obj['snps_to_clusters'])
-            accum_phases.append(obj['snps_to_phases'])
-            accum_liks.append(obj['likelihoods'])
-        all_clusts = np.vstack(accum_clusts)
-        all_phases = np.vstack(accum_phases)
-        all_liks = np.vstack(accum_liks)
-        # save
-        np.savez(os.path.join(output_dir, "full_dp_results"), snps_to_clusters=all_clusts, snps_to_phases=all_phases, likelihoods=all_liks)
-
 
     ## running coverage mcmc on all clusters
-
     elif args.command == "coverage_mcmc":
         cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv,
                                              args.allelic_clusters_object,
@@ -529,70 +509,105 @@ def main():
 
     ## preprocess ADP data to run scattered coverage mcmc jobs on each ADP cluster
     elif args.command == "coverage_mcmc_preprocess":
+        ## perform initial Poisson regression
         cov_mcmc_runner = CoverageMCMCRunner(args.coverage_csv,
                                              args.allelic_clusters_object,
                                              args.SNPs_pickle,
+                                             args.segmentations_pickle,
+                                             args.ref_fasta,
                                              f_repl=args.repl_pickle,
+                                             f_faire=args.faire_pickle,
                                              f_GC=args.gc_pickle,
-                                             allelic_sample=args.allelic_sample)
-        Pi, r, C, all_mu, global_beta, cov_df, adp_cluster = cov_mcmc_runner.prepare_single_cluster()
+                                             allelic_sample=args.allelic_sample,
+                                             bin_width=args.bin_width)
+        Pi, r, C, all_mu, global_beta, cov_df, adp_cluster, pois_hess = cov_mcmc_runner.prepare_single_cluster()
+
+        ## create chunks for both burnin and scatter
+        cov_df = cov_df.sort_values("start_g", ignore_index = True)
+
+        # indices of coverage bins 
+        seg_g = cov_df.groupby("seg_idx") # NOTE: seg_idx may not be contiguous if any allelic segments were dropped 
+        seg_g_idx = pd.Series(seg_g.indices).to_frame(name = "indices")
+        seg_g_idx["allelic_cluster"] = seg_g["allelic_cluster"].first()
+        seg_g_idx["n_cov_bins"] = seg_g.size()
+
+        ## save
+        # regression matrices
         np.savez(os.path.join(output_dir, 'preprocess_data'), Pi=Pi, r=r, C=C, all_mu=all_mu,
-                 global_beta=global_beta, adp_cluster=adp_cluster)
+                 global_beta=global_beta, adp_cluster=adp_cluster, pois_hess=pois_hess)
+        # coverage dataframe mapped 
         cov_df.to_pickle(os.path.join(output_dir, 'cov_df.pickle'))
+        # allelic segment indices into coverage dataframe
+        seg_g_idx.to_pickle(os.path.join(output_dir, 'allelic_seg_groups.pickle'))
 
     ## run scattered coverage mcmc job using preprocessed data
     elif args.command == "coverage_mcmc_shard":
         # load preprocessed data
         preprocess_data = np.load(args.preprocess_data)
-        # check to make sure that the cluster index is within the range
-        Pi = preprocess_data['Pi']
-        if args.cluster_num > Pi.shape[1] - 1:
-            raise ValueError("Received cluster number {}, which is out of range".format(args.cluster_num))
-        
+
         # extract preprocessed data from this cluster
-        mu = preprocess_data["all_mu"][args.cluster_num]
+        Pi = preprocess_data['Pi']
+        mu = preprocess_data["all_mu"]#[args.cluster_num]
         beta = preprocess_data["global_beta"]
         c_assignments = np.argmax(Pi, axis=1)
-        cluster_mask = (c_assignments == args.cluster_num)
-        r = preprocess_data['r'][cluster_mask]
-        C = preprocess_data['C'][cluster_mask]
-        
-        # if we get a range argument well be doing burnin on a subset of the coverage bins
-        if args.range is not None:
-            #parse range from string
-            range_lst = args.range.split('-')
-            st,en = int(range_lst[0]), int(range_lst[1]) 
-            if st > en or st < 0 or en > len(r):
-                raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r)))
-            
-            #trim data to our desired range
-            r = r[st:en]
-            C = C[st:en]
-            num_draws = 1
-            
-            # if we're just burning in a subset use different save strings
-            model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range)
-            data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range)
-            figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range)
-            
-        else:
-            #if not in burnin use the specified number of draws
-            num_draws = args.num_draws
-            
-            
-            model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num)
-            data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num)
-            figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num)
-        
-        # run on the specified cluster
-        cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width)
+        #cluster_mask = (c_assignments == args.cluster_num)
+        r = preprocess_data['r']#[cluster_mask]
+        C = preprocess_data['C']#[cluster_mask]
+
+        # load and (weakly) verify allelic segment indices
+        seg_g_idx = pd.read_pickle(args.allelic_seg_indices)
+        if len(np.hstack(seg_g_idx["indices"])) != C.shape[0]:
+            raise ValueError("Size mismatch between allelic segment assignments and coverage bin data!")
+
+        # subset to a single allelic segment
+        if args.allelic_seg_idx > len(seg_g_idx) - 1:
+            raise ValueError("Allelic segment index out of bounds!")
+
+        seg_indices = seg_g_idx.iloc[args.allelic_seg_idx]
+
+        mu = mu[seg_indices["allelic_cluster"]]
+        C = C[seg_indices["indices"], :]
+        r = r[seg_indices["indices"], :]
         
-        # if we're using burnin results load them now
-        if args.burnin_files is not None:
-            with open(args.burnin_files, 'r') as f:
-                file_list = f.read().splitlines()
-            assignments_arr = aggregate_burnin_files(file_list, args.cluster_num)
-            cov_mcmc.init_burnin(assignments_arr)
+        # run cov MCMC
+        cov_mcmc = NB_MCMC_SingleCluster(args.num_draws, r, C, mu, beta, args.bin_width)
+
+#        # if we get a range argument well be doing burnin on a subset of the coverage bins
+#        if args.range is not None:
+#            #parse range from string
+#            range_lst = args.range.split('-')
+#            st,en = int(range_lst[0]), int(range_lst[1]) 
+#            if st > en or st < 0 or en > len(r):
+#                raise ValueError("invalid range! got range {} for cluster {} with size {}".format(args.range, args.cluster_num, len(r)))
+#            
+#            #trim data to our desired range
+#            r = r[st:en]
+#            C = C[st:en]
+#            num_draws = 1
+#            
+#            # if we're just burning in a subset use different save strings
+#            model_save_str = 'cov_mcmc_model_cluster_{}_{}.pickle'.format(args.cluster_num, args.range)
+#            data_save_str = 'cov_mcmc_data_cluster_{}_{}'.format(args.cluster_num, args.range)
+#            figure_save_str = 'cov_mcmc_cluster_{}_{}_visual'.format(args.cluster_num, args.range)
+#            
+#        else:
+#            #if not in burnin use the specified number of draws
+#            num_draws = args.num_draws
+#            
+#            
+#            model_save_str = 'cov_mcmc_model_cluster_{}.pickle'.format(args.cluster_num)
+#            data_save_str = 'cov_mcmc_data_cluster_{}'.format(args.cluster_num)
+#            figure_save_str = 'cov_mcmc_cluster_{}_visual'.format(args.cluster_num)
+#        
+#        # run on the specified cluster
+#        cov_mcmc = NB_MCMC_SingleCluster(num_draws, r, C, mu, beta, args.cluster_num, args.bin_width)
+#        
+#        # if we're using burnin results load them now
+#        if args.burnin_files is not None:
+#            with open(args.burnin_files, 'r') as f:
+#                file_list = f.read().splitlines()
+#            assignments_arr = aggregate_burnin_files(file_list, args.cluster_num)
+#            cov_mcmc.init_burnin(assignments_arr)
 
         cov_mcmc.run()
 
@@ -600,15 +615,15 @@ def main():
         segment_samples, global_beta, mu_i_samples = cov_mcmc.prepare_results()
         
         # save samples
-        with open(os.path.join(output_dir, model_save_str), 'wb') as f:
+        with open(os.path.join(output_dir, f"cov_mcmc_model_seg_{args.allelic_seg_idx}.pickle"), 'wb') as f:
             pickle.dump(cov_mcmc, f)
 
-        np.savez(os.path.join(output_dir, data_save_str),
+        np.savez(os.path.join(output_dir, f"cov_mcmc_data_seg_{args.allelic_seg_idx}.npz"),
                  seg_samples=segment_samples, beta=global_beta, mu_i_samples=mu_i_samples)
 
-        # save visualization
-        cov_mcmc.visualize_cluster_samples(
-            os.path.join(output_dir, figure_save_str))
+#        # save visualization
+#        cov_mcmc.visualize_cluster_samples(
+#            os.path.join(output_dir, f"cov_mcmc_seg_{args.allelic_seg_idx}_visual.png"))
 
     elif args.command == "collect_cov_mcmc":
         if args.coverage_dir:
diff --git a/hapaseg/a_cov_DP.py b/hapaseg/a_cov_DP.py
index 1db9b17..975908b 100644
--- a/hapaseg/a_cov_DP.py
+++ b/hapaseg/a_cov_DP.py
@@ -63,7 +63,7 @@ def generate_acdp_df(SNP_path, # path to SNP df
         print('concatenating dp run ', draw_num)
         a_cov_seg_df = dp_run.cov_df.copy()
 
-        covar_cols = sorted([c for c in a_cov_seg_df.columns if "C_" in c])
+        covar_cols = sorted(a_cov_seg_df.columns[a_cov_seg_df.columns.str.contains("^C_.*_z$")])
         # add minor and major allele counts for each bin to the cov_seg_df here to allow for beta draws on the fly for each segment
         a_cov_seg_df['min_count'] = 0
         a_cov_seg_df['maj_count'] = 0
diff --git a/hapaseg/allelic_DP.py b/hapaseg/allelic_DP.py
index 9bf2d2e..0ae9eb5 100644
--- a/hapaseg/allelic_DP.py
+++ b/hapaseg/allelic_DP.py
@@ -1,5 +1,6 @@
 import colorama
 import copy
+import distinctipy
 import itertools
 import matplotlib.pyplot as plt
 import matplotlib as mpl
@@ -17,320 +18,82 @@
 class A_DP:
     def __init__(self, allelic_segs_pickle, ref_fasta = None):
         # dataframe of allelic imbalance segmentation samples for each chromosome arm
-        self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(0)
-        self.allelic_segs = self.allelic_segs.loc[self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)) > 0]
+        self.allelic_segs = pd.read_pickle(allelic_segs_pickle).dropna(axis = 0)
+        # if some chromsome arms couldn't find the MLE, just use current state of chain
+        none_idx = self.allelic_segs["results"].apply(lambda x : x.breakpoints_MLE is None)
+        for i in none_idx[none_idx].index:
+            self.allelic_segs.iloc[i]["results"].breakpoints_MLE = self.allelic_segs.iloc[i]["results"].breakpoints
+
+        # load SNPs
+        self.SNPs = []
+        clust_offset = 0
+        for _, H in self.allelic_segs.iterrows():
+            S = copy.deepcopy(H["results"].P)
+            S["A_alt"] = 0
+            S.loc[S["aidx"], "A_alt"] = S.loc[S["aidx"], "ALT_COUNT"]
+            S["A_ref"] = 0
+            S.loc[S["aidx"], "A_ref"] = S.loc[S["aidx"], "REF_COUNT"]
+            S["B_alt"] = 0
+            S.loc[~S["aidx"], "B_alt"] = S.loc[~S["aidx"], "ALT_COUNT"]
+            S["B_ref"] = 0
+            S.loc[~S["aidx"], "B_ref"] = S.loc[~S["aidx"], "REF_COUNT"]
+
+            S = S.rename(columns = { "MIN_COUNT" : "min", "MAJ_COUNT" : "maj" })
+            S = S.loc[:, ["chr", "pos", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"]]
+
+            # set initial cluster assignments based on segmentation
+            S["clust"] = -1
+            bpl = np.array(H["results"].breakpoints_MLE); bpl = np.c_[bpl[0:-1], bpl[1:]]
+            for i, (st, en) in enumerate(bpl):
+                S.iloc[st:en, S.columns.get_loc("clust")] = i + clust_offset
+            clust_offset += i
+
+            # bug in segmentation omits final SNP?
+            S = S.iloc[:-1]
+            assert (S["clust"] != -1).all()
+
+            self.SNPs.append(S)
+
+        self.SNPs = pd.concat(self.SNPs, ignore_index = True)
 
-        # number of total segmentation samples
-        self.n_samp = self.allelic_segs["results"].apply(lambda x : len(x.breakpoint_list)).min()
+        # convert chr-relative positions to absolute genomic coordinates
         self.ref_fasta = ref_fasta
+        self.SNPs["pos_gp"] = seq.chrpos2gpos(self.SNPs["chr"], self.SNPs["pos"], ref = self.ref_fasta)
 
-        # DP run objects for each segmentation sample
-        self.DP_runs = None
-
-        # dataframe of SNPs
-        self.SNPs = None
+        # initial phasing orientation
+        self.SNPs["flipped"] = False
 
-        # number of segmentation samples used for DP run
-        self.N_seg_samps = None
-        # number of DP samples per segmentation sample
-        self.N_clust_samps = None
+        self.N_clust_samps = 100
 
         # assignment of SNPs to DP clusters for each MCMC sample
         self.snps_to_clusters = None
         # phase correction of SNPs for each MCMC sample
         self.snps_to_phases = None
+        # likelihoods of each clustering
+        self.likelihoods = None
 
-    def load_seg_samp(self, samp_idx):
-        if samp_idx > self.n_samp:
-            raise ValueError(f"Only {self.n_samp} MCMC samples were taken!")
-
-        all_segs = []
-        all_SNPs = []
-
-        maj_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MAJ_COUNT")
-        min_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("MIN_COUNT")
-
-        alt_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("ALT_COUNT")
-        ref_idx = self.allelic_segs["results"].iloc[0].P.columns.get_loc("REF_COUNT")
-
-        chunk_offset = 0
-        for _, H in self.allelic_segs.dropna(subset = ["results"]).iterrows():
-            r = copy.deepcopy(H["results"])
-
-            # set phasing orientation back to original
-            for st, en in r.F.intervals():
-                # code excised from flip_hap
-                x = r.P.iloc[st:en, maj_idx].copy()
-                r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx]
-                r.P.iloc[st:en, min_idx] = x
-
-            # save SNPs for this chunk
-            if self.SNPs is None:
-                all_SNPs.append(pd.DataFrame({
-                  "maj" : r.P["MAJ_COUNT"],
-                  "min" : r.P["MIN_COUNT"],
-                  # TODO: gpos should be computed earlier, so that that we don't need to pass ref_fasta here
-                  "gpos" : seq.chrpos2gpos(r.P.loc[0, "chr"], r.P["pos"], ref = self.ref_fasta),
-                  "allele" : r.P["allele_A"]
-                }))
-
-            # draw breakpoint, phasing, and SNP inclusion sample from segmentation MCMC trace
-            bp_samp, pi_samp, inc_samp = (r.breakpoint_list[samp_idx], r.phase_interval_list[samp_idx] if r.phase_correct else None, r.include[samp_idx])
-            # flip everything according to sample
-            if r.phase_correct:
-                for st, en in pi_samp.intervals():
-                    x = r.P.iloc[st:en, maj_idx].copy()
-                    r.P.iloc[st:en, maj_idx] = r.P.iloc[st:en, min_idx]
-                    r.P.iloc[st:en, min_idx] = x
-
-            bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]]
-
-            # get major/minor sums for each segment
-            # also get {alt, ref} x {aidx, bidx}
-            for st, en in bpl:
-                all_segs.append([
-                  st + chunk_offset, en + chunk_offset,                        # SNP index for seg
-                  r.P.loc[st, "chr"], r.P.loc[st, "pos"], r.P.loc[en, "pos"],  # chromosomal position of seg
-                  r._Piloc(st, en, min_idx, inc_samp).sum(),                   # min/maj counts
-                  r._Piloc(st, en, maj_idx, inc_samp).sum(),
-
-                  r._Piloc(st, en, alt_idx, inc_samp & r.P["aidx"]).sum(),     # allele A alt/ref
-                  r._Piloc(st, en, ref_idx, inc_samp & r.P["aidx"]).sum(),
-                  r._Piloc(st, en, alt_idx, inc_samp & ~r.P["aidx"]).sum(),    # allele B alt/ref
-                  r._Piloc(st, en, ref_idx, inc_samp & ~r.P["aidx"]).sum()
-                ])
-
-            chunk_offset += len(r.P)
-
-        # convert samples into dataframe
-        S = pd.DataFrame(all_segs, columns = ["SNP_st", "SNP_en", "chr", "start", "end", "min", "maj", "A_alt", "A_ref", "B_alt", "B_ref"])
-
-        # convert chr-relative positions to absolute genomic coordinates
-        S["start_gp"] = seq.chrpos2gpos(S["chr"], S["start"], ref = self.ref_fasta)
-        S["end_gp"] = seq.chrpos2gpos(S["chr"], S["end"], ref = self.ref_fasta)
-
-        # initial cluster assignments
-        S["clust"] = -1 # initially, all segments are unassigned
-        S.iloc[0, S.columns.get_loc("clust")] = 1 # first segment is assigned to cluster 1
-
-        # initial phasing orientation
-        S["flipped"] = False
-
-        if self.SNPs is None:
-            self.SNPs = pd.concat(all_SNPs, ignore_index = True)
-            CI = s.beta.ppf([0.05, 0.5, 0.95], self.SNPs["min"].values[:, None] + 1, self.SNPs["maj"].values[:, None] + 1)
-            self.SNPs[["f_CI_lo", "f", "f_CI_hi"]] = CI
-
-        return S, self.SNPs
-
-    # map trace of segment cluster assignments to the SNPs within
-    @staticmethod
-    def map_seg_clust_assignments_to_SNPs(segs_to_clusters, S):
-        st_col = S.columns.get_loc("SNP_st")
-        en_col = S.columns.get_loc("SNP_en")
-        snps_to_clusters = np.zeros((segs_to_clusters.shape[0], S.iloc[-1, en_col] + 1), dtype = int)
-        for i, seg_assign in enumerate(segs_to_clusters):
-            for j, seg in enumerate(seg_assign):
-                snps_to_clusters[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = seg
-
-        return snps_to_clusters
-
-    @staticmethod
-    def map_seg_phases_to_SNPs(phase, S):
-        st_col = S.columns.get_loc("SNP_st")
-        en_col = S.columns.get_loc("SNP_en")
-        snps_to_phase = np.zeros((phase.shape[0], S.iloc[-1, en_col] + 1), dtype = int)
-        for i, phase_orient in enumerate(phase):
-            for j, ph in enumerate(phase_orient):
-                snps_to_phase[i, S.iloc[j, st_col]:S.iloc[j, en_col]] = ph
-
-        return snps_to_phase
-
-    def run(self, N_seg_samps = 50, N_clust_samps = 5, seg_sample_idx = None):
-        self.N_seg_samps = N_seg_samps if seg_sample_idx is None else 1
-        self.N_clust_samps = N_clust_samps
-
-        seg_sample_idx = np.random.choice(self.n_samp - 1, self.N_seg_samps, replace = False) if seg_sample_idx is None else [seg_sample_idx]
-        S, SNPs = self.load_seg_samp(seg_sample_idx[0])
-        N_SNPs = len(SNPs)
-        
-        self.snps_to_clusters = -1*np.ones((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = np.int16)
-        self.snps_to_phases = np.zeros((self.N_clust_samps*self.N_seg_samps, N_SNPs), dtype = bool)
-        self.DP_likelihoods = np.zeros((self.N_clust_samps*self.N_seg_samps, 2))
-
-        self.DP_runs = [None]*self.N_seg_samps
-
-        clust_prior = sc.SortedDict()
-        clust_count_prior = sc.SortedDict()
-        n_iter_clust_exist = sc.SortedDict()
-        cur_samp_iter = 0
-
-        for n_it in range(self.N_seg_samps):
-            if n_it > 0:
-                S, SNPs = self.load_seg_samp(seg_sample_idx[n_it])
-
-            # run clustering
-            self.DP_runs[n_it] = DPinstance(S, clust_prior = clust_prior, clust_count_prior = clust_count_prior)
-            segs_to_clusters, segs_to_phases = self.DP_runs[n_it].run(n_iter = self.N_clust_samps)
-
-            # compute likelihoods for each clustering
-            self.DP_likelihoods[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.DP_runs[n_it].compute_overall_lik()
-
-            # assign clusters to individual SNPs, to use as segment assignment prior for next DP iteration
-            self.snps_to_clusters[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_clust_assignments_to_SNPs(segs_to_clusters, S)
-
-            # assign phase orientations to individual SNPs
-            self.snps_to_phases[self.N_clust_samps*n_it:self.N_clust_samps*(n_it + 1), :] = self.map_seg_phases_to_SNPs(segs_to_phases, S)
-
-            # compute prior on cluster locations/counts
-            max_clust_idx = segs_to_clusters.max()
-            for seg_assignments, seg_phases in zip(segs_to_clusters, segs_to_phases):
-                # reset phases
-                S2 = S.copy()
-                S2.loc[S2["flipped"], ["min", "maj"]] = S2.loc[S2["flipped"], ["min", "maj"]].values[:, ::-1]
-
-                # match phases to current sample
-                S2.loc[seg_phases, ["min", "maj"]] = S2.loc[seg_phases, ["min", "maj"]].values[:, ::-1]
-
-                # minor/major counts for each cluster in this iteration
-                S_a = npg.aggregate(seg_assignments, S2["min"], size = max_clust_idx + 1)
-                S_b = npg.aggregate(seg_assignments, S2["maj"], size = max_clust_idx + 1)
-                c = np.c_[S_a, S_b]
-
-                # total numer of SNPs for each cluster in this iteration
-                #N_c = npg.aggregate(seg_assignments, S2["SNP_en"] - S2["SNP_st"], size = max_clust_idx + 1)
-                N_c = npg.aggregate(seg_assignments, 1, size = max_clust_idx + 1)
-
-                # iteratively update priors
-                next_clust_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), c[c.sum(1) > 0]))
-                next_clust_count_prior = sc.SortedDict(zip(np.flatnonzero(c.sum(1) > 0), N_c[N_c > 0]))
-
-                for cl in np.unique(seg_assignments):
-                    if cl in n_iter_clust_exist:
-                        n_iter_clust_exist[cl] += 1
-                    else:
-                        n_iter_clust_exist[cl] = 1
-                cur_samp_iter += 1
-
-                for k, v in next_clust_prior.items():
-                    nccp = next_clust_count_prior[k]
-                    if k in clust_prior:
-                        clust_prior[k] += (v - clust_prior[k])/n_iter_clust_exist[k]
-                        clust_count_prior[k] += (nccp - clust_count_prior[k])/cur_samp_iter
-                    else:
-                        clust_prior[k] = v
-                        clust_count_prior[k] = nccp/cur_samp_iter
-                # for clusters that don't exist in this iteration, average counts with zero
-                for k, v in clust_prior.items():
-                    if k != -1 and k not in next_clust_prior:
-                        clust_count_prior[k] -= clust_count_prior[k]/cur_samp_iter
-
-
-            # remove improbable clusters from prior
-            for kk in [k for k, v in clust_count_prior.items() if v < 1]:
-                del clust_prior[kk]
-                del clust_count_prior[kk]
-
-            # remove garbage cluster from priors
-            #del clust_prior[0]
-            #del clust_count_prior[0]
-
-        return self.snps_to_clusters, self.snps_to_phases, self.DP_likelihoods
-
-    def visualize_segs(self, snps_to_clusters = None, f = None, n_vis_samp = None):
-        f = plt.figure(figsize = [17.56, 5.67]) if f is None else f
-
-        snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters
-
-        # plot all samples from DP
-        if n_vis_samp is None:
-            run_idx = np.r_[0:self.N_seg_samps]
-            N_seg_samps = self.N_seg_samps
-
-        # only plot up to n_vis_samp _segmentation samples_ from DP
-        # (all DP samples for a given segmentation sample will be plotted)
-        else:
-            run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False)
-            N_seg_samps = n_vis_samp
-
-        for d in [self.DP_runs[x] for x in run_idx]:
-            d.visualize_adjacent_segs(f = f.number, n_samp = N_seg_samps*self.N_clust_samps)
-
-    def visualize_clusts(self, snps_to_clusters = None, f = None, thick = False, nocolor = False, n_vis_samp = None):
-        f = plt.figure(figsize = [17.56, 5.67]) if f is None else f
-
-        snps_to_clusters = snps_to_clusters if snps_to_clusters is not None else self.snps_to_clusters
-
-        # plot all samples from DP
-        if n_vis_samp is None:
-            run_idx = np.r_[0:self.N_seg_samps]
-            N_seg_samps = self.N_seg_samps
-
-        # only plot up to n_vis_samp _segmentation samples_ from DP
-        # (all DP samples for a given segmentation sample will be plotted)
-        else:
-            run_idx = np.random.choice(self.N_seg_samps, n_vis_samp, replace = False)
-            N_seg_samps = n_vis_samp
-
-        for d in [self.DP_runs[x] for x in run_idx]:
-            d.visualize_clusts(f = f.number, n_samp = N_seg_samps*self.N_clust_samps, thick = thick, nocolor = nocolor)
-
-    def visualize_SNPs(self, snps_to_phases = None, color = True, f = None):
-        snps_to_phases = snps_to_phases if snps_to_phases is not None else self.snps_to_phases
-        ph_prob = snps_to_phases.mean(0)
-
-        if color:
-            rb = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]]
-        else:
-            rb = np.full([2, 3], 0)
-
-        logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M)))
-
-        def scerrorbar(idx, rev = False, alpha = 1, show_CI = True):
-            if rev:
-                f = 1 - self.SNPs.loc[idx, "f"]
-                eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_hi"]
-                eb_top = self.SNPs.loc[idx, "f_CI_lo"] - self.SNPs.loc[idx, "f"]
-            else:
-                f = self.SNPs.loc[idx, "f"]
-                eb_bot = self.SNPs.loc[idx, "f"] - self.SNPs.loc[idx, "f_CI_lo"]
-                eb_top = self.SNPs.loc[idx, "f_CI_hi"] - self.SNPs.loc[idx, "f"]
-
-            if show_CI:
-                plt.errorbar(
-                  x = self.SNPs.loc[idx, "gpos"],
-                  y = f,
-                  yerr = np.c_[
-                    eb_bot,
-                    eb_top
-                  ].T,
-                  fmt = 'none', ecolor = np.c_[rb[self.SNPs.loc[idx, "allele"]], (alpha if isinstance(alpha, np.ndarray) else alpha*np.ones(idx.sum()))**2]
-                )
-
-            plt.scatter(
-              self.SNPs.loc[idx, "gpos"],
-              f,
-              color = rb[self.SNPs.loc[idx, "allele"]],
-              marker = '.',
-              s = 1,
-              alpha = alpha if show_CI else alpha
-            )
-
-        default_alpha = logistic(A = 0.4, K = 0.01, B = 0.00001, M = 120000, x = len(self.SNPs))
+    def run(self):
+        self.DP_run = DPinstance(
+          self.SNPs,
+          dp_count_scale_factor = self.SNPs["clust"].value_counts().mean()
+        )
+        self.snps_to_clusters, self.snps_to_phases, self.likelihoods = self.DP_run.run(n_samps = self.N_clust_samps)
 
-        f = plt.figure(figsize = [17.56, 5.67]) if f is None else f
-        scerrorbar(ph_prob == 0, alpha = default_alpha, show_CI = color)
-        scerrorbar(ph_prob == 1, rev = True, alpha = default_alpha, show_CI = color)
-        idx = (ph_prob > 0) & (ph_prob < 1)
-        scerrorbar(idx, alpha = (1 - ph_prob[idx])*default_alpha, show_CI = color)
-        scerrorbar(idx, rev = True, alpha = ph_prob[idx]*default_alpha, show_CI = color)
+        return self.snps_to_clusters, self.snps_to_phases, self.likelihoods
 
 class DPinstance:
-    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), n_iter = 50, alpha = 0.1):
+    def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.SortedDict(), alpha = 1, dp_count_scale_factor = 1):
         self.S = S
         self.clust_prior = clust_prior.copy()
         self.clust_count_prior = clust_count_prior.copy()
         self.alpha = alpha
+        self.dp_count_scale_factor = dp_count_scale_factor
+
+        self.mm_mat = self.S.loc[:, ["min", "maj"]].values.reshape(-1, order = "F") # numpy for speed
+        self.ref_mat = self.S.loc[:, ["A_ref", "B_ref"]].values.reshape(-1, order = "F")
+        self.alt_mat = self.S.loc[:, ["A_alt", "B_alt"]].values.reshape(-1, order = "F")
+
+        self.betahyp = self.S.loc[:, ["min", "maj"]].sum(1).mean()/2
 
         #
         # define column indices
@@ -348,41 +111,63 @@ def __init__(self, S, clust_prior = sc.SortedDict(), clust_count_prior = sc.Sort
 
         # store likelihoods for each cluster in the prior (from previous iterations)
         self.clust_prior[-1] = np.r_[0, 0]
-        self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1, v[1] + 1) for k, v in self.clust_prior.items()})
+        self.clust_prior_liks = sc.SortedDict({ k : ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_prior.items()})
         self.clust_prior_mat = np.r_[self.clust_prior.values()]
 
         self.clust_count_prior[-1] = self.alpha # DP alpha factor, i.e. relative probability of opening new cluster
-        self.clust_count_prior[0] = self.alpha # relative probability of sending a cluster to the garbage
 
+    def _Siat_ph(self, ridx, min = True):
+        # min, flip => maj
+        # ~min, ~flip => maj
+        # min, ~flip => min
+        # ~min, flip => min
+        col = self.min_col if self.S.iat[ridx, self.flip_col] ^ min else self.maj_col
+        return self.S.iat[ridx, col]
+
+    def _Ssum_ph(self, seg_idx, min = True):
+        #flip = self.flip_mat[seg_idx]
+        flip = self.S.iloc[seg_idx, self.flip_col]
+        flip_n = ~flip
+        if min:
+            return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum()
+        else:
+            return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum()
+
+    def _Scumsum_ph(self, seg_idx, min = True):
+        flip = self.S.iloc[seg_idx, self.flip_col]
+        flip_n = ~flip
+        if min:
+            si = np.argsort(np.r_[seg_idx[flip_n], seg_idx[flip]])
+            return self.mm_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]][si].cumsum()
+        else:
+            si = np.argsort(np.r_[seg_idx[flip], seg_idx[flip_n]])
+            return self.mm_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]][si].cumsum()
 
-    def rephase(self, seg_idx, force = False):
-        if not force:
-            A_a = self.S.iloc[seg_idx, self.aalt_col].sum() + 1
-            A_b = self.S.iloc[seg_idx, self.aref_col].sum() + 1
-            B_a = self.S.iloc[seg_idx, self.balt_col].sum() + 1
-            B_b = self.S.iloc[seg_idx, self.bref_col].sum() + 1
+    def compute_rephase_prob(self, seg_idx):
+        # TODO: compute logcdf/logsf directly
+        flip = self.S.iloc[seg_idx, self.flip_col]
+        flip_n = ~flip
 
-            # use normal approximation to beta if conditions are right
-            if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20:
-                m_x = A_a/(A_a + A_b)
-                s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1))
-                m_y = B_a/(B_a + B_b)
-                s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1))
+        A_a = self.alt_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp
+        A_b = self.ref_mat[np.r_[seg_idx[flip_n], seg_idx[flip] + len(self.S)]].sum() + 1 + self.betahyp
+        B_a = self.alt_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp
+        B_b = self.ref_mat[np.r_[seg_idx[flip], seg_idx[flip_n] + len(self.S)]].sum() + 1 + self.betahyp
 
-                do_rephase = np.random.rand() < s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y))
+        # use normal approximation to beta if conditions are right
+        if A_a > 20 and A_b > 20 and B_a > 20 and B_b > 20:
+            m_x = A_a/(A_a + A_b)
+            s_x = A_a*A_b/((A_a + A_b)**2*(A_a + A_b + 1))
+            m_y = B_a/(B_a + B_b)
+            s_y = B_a*B_b/((B_a + B_b)**2*(B_a + B_b + 1))
 
-            # Monte Carlo simulate difference of betas
-            else:
-                x = s.beta.rvs(A_a, A_b, size = 1000)
-                y = s.beta.rvs(B_a, B_b, size = 1000)
+            return s.norm.cdf(0, m_y - m_x, np.sqrt(s_x + s_y))
 
-                do_rephase = np.random.rand() < (x > y).mean()
+        # Monte Carlo simulate difference of betas
+        else:
+            x = s.beta.rvs(A_a, A_b, size = 1000)
+            y = s.beta.rvs(B_a, B_b, size = 1000)
 
-        if force or do_rephase:
-            self.S.iloc[seg_idx, [self.min_col, self.maj_col]] = self.S.iloc[seg_idx, [self.min_col, self.maj_col]].values[:, ::-1]
-            self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]] = self.S.iloc[seg_idx, [self.aalt_col, self.balt_col]].values[:, ::-1]
-            self.S.iloc[seg_idx, [self.aref_col, self.bref_col]] = self.S.iloc[seg_idx, [self.aref_col, self.bref_col]].values[:, ::-1]
-            self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
+            return (x > y).mean()
 
     def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_b, D_a, D_b):
 #            if st == en:
@@ -392,33 +177,84 @@ def SJliks(self, targ_clust, upstream_clust, downstream_clust, J_a, J_b, U_a, U_
 #                J_a = S.iloc[st:(en + 1), min_col].sum()
 #                J_b = S.iloc[st:(en + 1), maj_col].sum()
         SU_a = SU_b = SD_a = SD_b = 0
-        # if target segments are being moved to the garbage, it is equivalent to making them their own segment, and joining the upstream and downstream segments
-        if targ_clust == 0:
-            SU_a = J_a
-            SU_b = J_b
-            J_a = 0
-            J_b = 0
-
-        if targ_clust != - 1 and (targ_clust == upstream_clust or targ_clust == 0):
+
+        if targ_clust != -1 and targ_clust == upstream_clust:
             J_a += U_a
             J_b += U_b
         else:
             SU_a += U_a
             SU_b += U_b
-        if targ_clust != - 1 and (targ_clust == downstream_clust or targ_clust == 0):
+        if targ_clust != -1 and targ_clust == downstream_clust:
             J_a += D_a
             J_b += D_b
         else:
             SD_a += D_a
             SD_b += D_b
 
-        return ss.betaln(SU_a + 1, SU_b + 1) + ss.betaln(J_a + 1, J_b + 1) + ss.betaln(SD_a + 1, SD_b + 1)
+        return (ss.betaln(SU_a + 1 + self.betahyp, SU_b + 1 + self.betahyp) if SU_a > 0 or SU_b > 0 else 0) + \
+          ss.betaln(J_a + 1 + self.betahyp, J_b + 1 + self.betahyp) + \
+          (ss.betaln(SD_a + 1 + self.betahyp, SD_b + 1 + self.betahyp) if SD_a > 0 or SD_b > 0 else 0)
+
+    def compute_adj_prob(self, break_idx):
+        if break_idx > 1:
+            U_A, U_B = self.seg_sums[self.breakpoints[break_idx - 1]]
+            U_cl = self.clusts[self.breakpoints[break_idx - 1]]
+        else:
+            U_A = U_B = 0
+            U_cl = -1
+        if break_idx + 2 < len(self.breakpoints):
+            D_A, D_B = self.seg_sums[self.breakpoints[break_idx + 1]]
+            D_cl = self.clusts[self.breakpoints[break_idx + 1]]
+        else:
+            D_A = D_B = 0
+            D_cl = -1
+
+        S_A, S_B = self.seg_sums[self.breakpoints[break_idx]]
+
+        ## compute all four possible segmentations relative to neighbor, in
+        ## both phasing orientations
+        MLs = np.c_[
+          # UTD             T  U  D
+          # -^_ or -_- (U != T & T != D) (00)
+          np.r_[self.SJliks(1, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(1, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+          # -__ (U != T & T == D) (01)
+          np.r_[self.SJliks(0, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(0, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+          # --_ (U == T & T != D) (10)
+          np.r_[self.SJliks(1, 1, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(1, 1, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+          # --- (U == T & T == D) (11)
+          np.r_[self.SJliks(0, 0, 0, S_A, S_B, U_A, U_B, D_A, D_B),
+                self.SJliks(0, 0, 0, S_B, S_A, U_A, U_B, D_A, D_B)],
+        ]
+
+        ## match probs to cluster choices (will match MLs matrix in main calculation)
+        probs = np.full([len(self.clust_sums), 2], MLs[0, 0])
+        if U_cl == D_cl and U_cl != -1 and D_cl != -1:
+            probs[self.clust_sums.index(U_cl), :] = MLs[:, 3]
+            probs[self.clust_sums.index(D_cl), :] = MLs[:, 3]
+        else:
+            if U_cl != -1:
+                probs[self.clust_sums.index(U_cl), :] = MLs[:, 2]
+            if D_cl != -1:
+                probs[self.clust_sums.index(D_cl), :] = MLs[:, 1]
+
+        return probs
 
     def compute_adj_liks(self, seg_idx, cur_clust):
+        # idea to simplify this code:
+        # - strip out logic for working with noncontiguous seg_idx's
+        # - compute all four possibile segmentations:
+        #   ABC, AAB, ABB, AAA
+        # - associate those segmentations with each cluster choice, in order
+        #   to return `adj_BC` with same size as `MLs`
+
         adj_AB = 0
-        adj_BC = np.zeros(len(self.clust_sums))
+        adj_BC = np.zeros([len(self.clust_sums), 2])
 
         # start/end coordinates of consecutive runs of segments being moved
+        # NOTE: ordpairs represents closed intervals!
         ordpairs = np.c_[
           [np.r_[list(x)][[0, -1]] for x in more_itertools.consecutive_groups(
             np.sort(seg_idx))
@@ -431,48 +267,32 @@ def compute_adj_liks(self, seg_idx, cur_clust):
         for o, (st, en) in enumerate(ordpairs):
             # maj/min counts of contiguous upstream segments belonging to the same cluster
             if st - 1 > 0:
-                # skip over adjacent segments that are in the garbage;
-                # we only care about adjacent segments actually assigned to clusters
                 j = 1
-                while st - j > 0 and self.clusts[st - j] == 0:
-                    j += 1
 
                 U_cl = self.clusts[st - j]
                 adj_clusters[o, 0] = U_cl
 
                 while st - j > 0 and self.clusts[st - j] != -1 and \
-                  (self.clusts[st - j] == U_cl or self.clusts[st - j] == 0):
-                    # again, skip over segments in the garbage
-                    if self.clusts[st - j] != 0:
-                        UD_counts[o, 0] += self.S.iloc[st - j, self.min_col]
-                        UD_counts[o, 1] += self.S.iloc[st - j, self.maj_col]
+                  self.clusts[st - j] == U_cl:
+                    UD_counts[o, 0] += self._Siat_ph(st - j, min = True)
+                    UD_counts[o, 1] += self._Siat_ph(st - j, min = False)
 
                     j += 1
 
             # maj/min counts of contiguous downstream segments belonging to the same cluster
             if en + 1 < len(self.S):
                 j = 1
-                while en + j < len(self.S) - 1 and self.clusts[en + j] == 0:
-                    j += 1
 
                 D_cl = self.clusts[en + j]
                 adj_clusters[o, 1] = D_cl
 
                 while en + j < len(self.S) - 1 and self.clusts[en + j] != -1 and \
-                  (self.clusts[en + j] == D_cl or self.clusts[en + j] == 0):
-                    if self.clusts[en + j] != 0:
-                        UD_counts[o, 2] += self.S.iloc[en + j, self.min_col]
-                        UD_counts[o, 3] += self.S.iloc[en + j, self.maj_col]
+                  self.clusts[en + j] == D_cl:
+                    UD_counts[o, 2] += self._Siat_ph(en + j, min = True)
+                    UD_counts[o, 3] += self._Siat_ph(en + j, min = False)
 
                     j += 1
 
-        # if we are looking at the segments at the very start or very end, set
-        # upstream/downstream cluster indices to garbage
-        if ordpairs[0, 0] == 0:
-            adj_clusters[0, 0] = 0
-        if ordpairs[-1, 1] == len(self.S) - 1:
-            adj_clusters[-1, 1] = 0
-
         # if there are any segments being moved adjacent to already existing clusters, get local split/join likelihoods
         adj_idx = ~(adj_clusters == -1).all(1)
 
@@ -494,8 +314,8 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                 # min/maj counts of the segment(s) being moved
                 st = ordpairs[j, 0]
                 en = ordpairs[j, 1]
-                S_a = self.S.iloc[:, self.min_col].values[st:(en + 1)].sum()
-                S_b = self.S.iloc[:, self.maj_col].values[st:(en + 1)].sum()
+                S_a = self._Ssum_ph(np.r_[st:(en + 1)], min = True) # en + 1 because ordpairs is a closed interval
+                S_b = self._Ssum_ph(np.r_[st:(en + 1)], min = False) 
 
                 # adjacency likelihood of this segment remaining where it is
 #                adj_AB += self.SJliks(
@@ -511,10 +331,10 @@ def compute_adj_liks(self, seg_idx, cur_clust):
 #                )
 
                 # adjacency likelihood of this segment joining each possible cluster:
-                # 1. those it is actually adjacent to (+ new cluster, garbage)
-                for cl in {-1, 0, cl_u, cl_d}:
+                # 1. those it is actually adjacent to (+ new cluster)
+                for cl in {-1, cl_u, cl_d}:
                     idx = self.clust_sums.index(cl)
-                    adj_BC[idx] += self.SJliks(
+                    adj_BC[idx, 0] += self.SJliks(
                       targ_clust = cl, 
                       upstream_clust = cl_u, 
                       downstream_clust = cl_d, 
@@ -525,14 +345,22 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                       D_a = D_a,
                       D_b = D_b
                     )
-                    # we cannot send a segment to the garbage adjacent to any unassigned segment
-                    if cl == 0 and (cl_u == -1 or cl_d == -1):
-                        adj_BC[idx] = -np.inf
+                    adj_BC[idx, 1] += self.SJliks(
+                      targ_clust = cl, 
+                      upstream_clust = cl_u, 
+                      downstream_clust = cl_d, 
+                      J_a = S_b, 
+                      J_b = S_a,
+                      U_a = U_a,
+                      U_b = U_b,
+                      D_a = D_a,
+                      D_b = D_b
+                    )
 
                 # 2. clusters it is not adjacent to (use default split value)
-                for cl in self.clust_sums.keys() - ({-1, 0} | set(adj_clusters[adj_idx].ravel())):
+                for cl in self.clust_sums.keys() - ({-1} | set(adj_clusters[adj_idx].ravel())):
                     idx = self.clust_sums.index(cl)
-                    adj_BC[idx] += self.SJliks(
+                    adj_BC[idx, 0] += self.SJliks(
                       targ_clust = -1, 
                       upstream_clust = -1, 
                       downstream_clust = -1, 
@@ -543,253 +371,372 @@ def compute_adj_liks(self, seg_idx, cur_clust):
                       D_a = D_a,
                       D_b = D_b
                     )
-        else:
-            # we cannot send a segment to the garbage adjacent to any unassigned segment
-            adj_BC[self.clust_sums.index(0)] = -np.inf
+                    adj_BC[idx, 1] += self.SJliks(
+                      targ_clust = -1, 
+                      upstream_clust = -1, 
+                      downstream_clust = -1, 
+                      J_a = S_b, 
+                      J_b = S_a,
+                      U_a = U_a,
+                      U_b = U_b,
+                      D_a = D_a,
+                      D_b = D_b
+                    )
 
         return adj_AB, adj_BC
 
-    def compute_overall_lik(self, segs_to_clusters = None, phase_orientations = None):
-        if segs_to_clusters is None:
-            _, segs_to_clusters = self.get_unique_clust_idxs()
-        else:
-            _, segs_to_clusters = self.get_unique_clust_idxs(segs_to_clusters)
-        if phase_orientations is None:
-            phase_orientations = np.r_[self.phase_orientations]
+    def compute_cluster_splitpoints(self, seg_idx):
+        spl = []
 
-        max_clust_idx = segs_to_clusters.max() + 1
+        # left bias
+        end = len(seg_idx)
+        i = 0
+        while True:
+            seg_idx_sp = seg_idx[0:end]
+            if len(seg_idx_sp) < 2:
+                break
 
-        liks = np.full([segs_to_clusters.shape[0], 2], np.nan)
+            min_cs = self._Scumsum_ph(seg_idx_sp, min = True)
+            min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs
+            maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
+            maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
 
-        for i, (cl_samp, ph_samp) in enumerate(zip(segs_to_clusters, phase_orientations)):
-            # reset phases
-            # TODO: when we switch to faster phasing correction model that doesn't involve modifying self.S, this won't be necessary
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(ph_samp != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
+            split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
+            # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
+            # NOTE: instead of argmax, probabilistically choose? will this make a difference?
 
-            ## overall clustering likelihood
-            A = npg.aggregate(cl_samp, S_ph["min"], size = max_clust_idx)
-            B = npg.aggregate(cl_samp, S_ph["maj"], size = max_clust_idx)
+            end = split_lik.argmax()
+            spl.append(end)
 
-# for when self.S is not modified
-#            A = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "maj"], size = max_clust_idx) + \
-#              npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "min"], size = max_clust_idx)
-#
-#            B = npg.aggregate(cl_samp[ph_samp], self.S.loc[ph_samp, "min"], size = max_clust_idx) + \
-#              npg.aggregate(cl_samp[~ph_samp], self.S.loc[~ph_samp, "maj"], size = max_clust_idx)
+            if end <= 1 or end == len(split_lik) - 1:
+                break
 
-            clust_lik = ss.betaln(A + 1, B + 1).sum()
+            i += 1
 
-            ## segmentation likelihood
+        # right bias
+        start = 0
+        i = 0
+        while True:
+            seg_idx_sp = seg_idx[start:]
+            if len(seg_idx_sp) < 2:
+                break
 
-            # get segment boundaries
-            bdy = np.flatnonzero(np.r_[1, np.diff(cl_samp) != 0, 1])
-            bdy = np.c_[bdy[:-1], bdy[1:]]
+            min_cs = self._Scumsum_ph(seg_idx_sp, min = True)
+            min_csr = self._Ssum_ph(seg_idx_sp, min = True) - min_cs
+            maj_cs = self._Scumsum_ph(seg_idx_sp, min = False)
+            maj_csr = self._Ssum_ph(seg_idx_sp, min = False) - maj_cs
 
-            # sum log-likelihoods of each segment
-            seg_lik = 0
-            for st, en in bdy:
-                A, B = S_ph.iloc[st:en, [self.min_col, self.maj_col]].sum()
+            split_lik = ss.betaln(min_cs[:-1] + 1 + self.betahyp, maj_cs[:-1] + 1 + self.betahyp) + ss.betaln(min_csr[1:] + 1 + self.betahyp, maj_csr[1:] + 1 + self.betahyp)
+            # split_lprob = split_lik - split_lik.max() - np.log(np.exp(split_lik - split_lik.max()).sum())
 
-# for when self.S is not modified
-#               A = self.S["min"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
-#                   self.S["maj"].iloc[st:en].loc[ph_samp[st:en]].sum()
-#               B = self.S["maj"].iloc[st:en].loc[~ph_samp[st:en]].sum() + \
-#                   self.S["min"].iloc[st:en].loc[ph_samp[st:en]].sum()
+            start += split_lik.argmax() + 1
+            spl.append(start - 1)
 
-                seg_lik += ss.betaln(A + 1, B + 1)
+            if start > len(seg_idx) - 1 or split_lik.argmax() == 0:
+                break
 
-            liks[i, :] = np.r_[clust_lik, seg_lik]
+            i += 1
 
-        return liks
+        bdy = np.unique(np.r_[0, spl, len(seg_idx)])
+        bdy = np.c_[bdy[:-1], bdy[1:]]
 
-    def run(self, n_iter = 50):
-        #
-        # assign segments to likeliest prior component {{{
-
-        if len(self.clust_prior) > 1:
-            for seg_idx in range(len(self.S)):
-                seg_idx = np.r_[seg_idx] 
-
-                # compute probability that segment belongs to each cluster prior element
-                S_a = self.S.iloc[seg_idx[0], self.min_col]
-                S_b = self.S.iloc[seg_idx[0], self.maj_col]
-                P_a = self.clust_prior_mat[1:, 0]
-                P_b = self.clust_prior_mat[1:, 1]
-
-                # prior likelihood ratios for both phase orientations
-                P_l = np.c_[
-                  ss.betaln(S_a + P_a + 1, S_b + P_b + 1) - (ss.betaln(S_a + 1, S_b + 1) + ss.betaln(P_a + 1, P_b + 1)),
-                  ss.betaln(S_b + P_a + 1, S_a + P_b + 1) - (ss.betaln(S_b + 1, S_a + 1) + ss.betaln(P_a + 1, P_b + 1)),
-                ]
+        return bdy
 
-                # get count prior
-                ccp = np.c_[[v for k, v in self.clust_count_prior.items() if k != -1 and k != 0]]
+    def add_breakpoint(self, start, mid, end, clust_idx):
+        """
+        Add breakpoint at mid belonging to clust_idx, between start and end
+        """
+        self.breakpoints.add(mid)
+        self.clust_members_bps[clust_idx].add(mid)
+        
+        A = self._Ssum_ph(np.r_[mid:end], min = True)
+        B = self._Ssum_ph(np.r_[mid:end], min = False)
 
-                # posterior numerator
-                num = P_l + np.log(ccp)
-                num -= num.max()
+        self.seg_sums[mid] = np.r_[A, B]
+        self.seg_sums[start] -= self.seg_sums[mid]
 
-                # probabilistically choose a cluster
-                probs = np.exp(num)/np.exp(num).sum()
-                idx = np.tile(np.r_[self.clust_prior.keys()][1:], [2, 1]).T*[1, -1]
-                choice = np.random.choice(
-                  idx.ravel(),
-                  p = probs.ravel()
-                )
+        self.seg_liks[mid] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+        A = self._Ssum_ph(np.r_[start:mid], min = True)
+        B = self._Ssum_ph(np.r_[start:mid], min = False)
+        self.seg_liks[start] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
 
-                # rephase
-                if choice < 0:
-                    self.rephase(seg_idx, force = True)
-                    choice = -choice
+        self.seg_phase_probs[start] = self.compute_rephase_prob(np.r_[start:mid])
+        self.seg_phase_probs[mid] = self.compute_rephase_prob(np.r_[mid:end])
 
-                self.S.iloc[seg_idx, self.clust_col] = choice
+    def compute_overall_lik_simple(self):
+        ## overall clustering likelihood
+        # p({a_i, b_i} | {c_k}, {phase_i})
+        clust_lik = np.r_[[ss.betaln(v[0] + 1 + self.betahyp, v[1] + 1 + self.betahyp) for k, v in self.clust_sums.items() if k >= 0]].sum()
+
+        ## overall phasing likelihood
+        # p({phase_i} | {a_i, b_i})
+        phase_probs = np.r_[self.seg_phase_probs.values()]
+        phase_lik = np.log1p(phase_probs).sum() if not np.isnan(phase_probs).any() else np.nan
+
+        ## Dirichlet count prior (Dirichlet-categorical marginal likelihood)
+        # p({c_k})
+        dirvec = np.r_[self.clust_counts.values()].astype(float)/self.dp_count_scale_factor
+        k = len(dirvec)
+        count_prior = k*np.log(self.alpha) + ss.gammaln(dirvec).sum() + ss.gammaln(self.alpha) - ss.gammaln(dirvec.sum() + self.alpha)
 
-        # }}}
+        ## segmentation likelihood
+        # p({a_i, b_i} | {s}, {phase_i})
+        # TODO: memoize
+        seg_lik = np.r_[self.seg_liks.values()].sum()
 
+        # p({c_k}, {s}, {phase_i} | {a_i, b_i})
+        return np.r_[clust_lik, phase_lik, count_prior, seg_lik]
+
+    def run(self, n_iter = 0, n_samps = 0):
         #
         # initialize cluster tracking hash tables
-        self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore"))
-        # for the first round of clustering, this is { 1 : 1 }
+        self.clust_counts = sc.SortedDict(self.S["clust"].value_counts().drop(-1, errors = "ignore"))
+        # for the first round of clustering, this is { 0 : 1, 1 : 1, ..., N - 1 : 1 }
+
+        Sgc = self.S.groupby(["clust", "flipped"])[["min", "maj"]].sum()
+        if (Sgc.droplevel(0).index == True).any():
+            Sgc.loc[(slice(None), True), ["min", "maj"]] = Sgc.loc[(slice(None), True), ["maj", "min"]].values
         self.clust_sums = sc.SortedDict({
-          **{ k : np.r_[v["min"], v["maj"]] for k, v in self.S.groupby("clust")[["min", "maj"]].sum().to_dict(orient = "index").items() },
-          **{-1 : np.r_[0, 0], 0 : np.r_[0, 0]}
+          **{ k : np.r_[v["min"], v["maj"]] for k, v in Sgc.groupby(level = "clust").sum().to_dict(orient = "index").items() },
+          **{-1 : np.r_[0, 0]}
         })
-        # for the first round, this is { -1/0 : np.r_[0, 0], 1 : np.r_[S[0, "min"], S[0, "maj"]] }
-        self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 and k != 0 })
-        # for the first round, this is { 1 : {0} }
-        unassigned_segs = sc.SortedList(self.S.index[self.S["clust"] == -1])
+        # for the first round, this is { -1 : np.r_[0, 0], 0 : np.r_[S[0, "min"], S[0, "maj"]], 1 : S[1, "min"], S[1, "maj"], ..., N : S[N - 1, "min"], S[N - 1, "maj"] }
+
+        self.clust_members = sc.SortedDict({ k : set(v) for k, v in self.S.groupby("clust").groups.items() if k != -1 })
+        # for the first round, this is { 0 : {0}, 1 : {1}, ..., N - 1 : {N - 1} }
 
         # store this as numpy for speed
         self.clusts = self.S["clust"].values
 
         max_clust_idx = np.max(self.clust_members.keys() | self.clust_prior.keys() if self.clust_prior is not None else {})
 
+        #
+        # breakpoint tracking
+
+        # segmentation breakpoints
+        self.breakpoints = sc.SortedSet(np.flatnonzero(np.diff(self.S["clust"]) != 0) + 1) | {0, len(self.S)}
+
+        # min/maj counts in each segment
+        self.seg_sums = sc.SortedDict()
+        bpl = np.r_[self.breakpoints]
+        for st, en in np.c_[bpl[:-1], bpl[1:]]:
+            mn = self._Ssum_ph(np.r_[st:en], min = True)
+            mj = self._Ssum_ph(np.r_[st:en], min = False)
+            self.seg_sums[st] = np.r_[mn, mj]
+
+        # likelihoods for each segment
+        self.seg_liks = sc.SortedDict()
+        for k, (a, b) in self.seg_sums.items():
+            self.seg_liks[k] = ss.betaln(a + 1 + self.betahyp, b + 1 + self.betahyp)
+
+        # breakpoints for each cluster
+        self.clust_members_bps = sc.SortedDict({
+          k : sc.SortedSet(v) for k, v in \
+            self.S.loc[self.breakpoints[:-1], ["clust"]].groupby("clust").groups.items()
+        })
+
+        # misphase probabilities for each segment
+        self.seg_phase_probs = sc.SortedDict({ k : np.nan for k in self.breakpoints[:-1] })
+
         # containers for saving the MCMC trace
-        self.segs_to_clusters = []
+        self.snps_to_clusters = []
         self.phase_orientations = []
+        self.segment_trace = []
+        self.likelihood_trace = []
 
+        # likelihood trace for checking burnin status
+        self.lik_trace = []
         burned_in = False
-        all_assigned = False
-        seg_touch_idx = np.zeros(len(self.S), dtype = np.uint16)
-
-#        # containers for saving debugging information (overall likelihoods/cluster assignments pre-burnin)
-#        self.lik_tmp = []
-#        self.vc_tmp = []
+        self.burnin_iteration = -1
+        touch90 = False
+        likelihood_ready = False
 
         n_it = 0
         n_it_last = 0
-        while len(self.segs_to_clusters) < n_iter:
+
+        brk = 0
+
+        while True:
             if not n_it % 1000:
-                print(self.S["clust"].value_counts().drop([-1, 0], errors = "ignore").value_counts().sort_index())
-                print("n unassigned: {}".format((self.S["clust"] == -1).sum()))
-                print("n garbage: {}".format((self.S["clust"] == 0).sum()))
+                if len(self.clust_counts) > 20:
+                    print(pd.Series(self.clust_counts.values()).value_counts().sort_index())
+                else:
+                    print("\n".join([str(self.clust_counts[k]) + ": " + str(x/(x + y)) for k, (x, y) in self.clust_sums.items() if k != -1]))
+                if likelihood_ready:
+                    print("[{}] Likelihood: {}".format("*" if burned_in else " ", self.lik_trace[-1].sum()))
+                if burned_in:
+                    print("{}/{} MCMC samples collected".format(len(self.snps_to_clusters), n_samps))
+
+            # stop after a raw number of iterations
+            if n_iter > 0 and n_it > n_iter:
+                return
 
-            # we are burned in (n_seg/n_clust) iterations after all segments have been touched
+            # stop after a number of samples have been taken
+            if n_samps > 0 and len(self.snps_to_clusters) > n_samps:
+                break
+
+            # poll every 100 iterations for various statuses
             if not n_it % 100:
-                if not all_assigned and (((seg_touch_idx > 0) | (self.clusts == 0)).all() or \
-                  # if there is only one cluster, then consider every segment to have been touched
-                  # otherwise, waiting for every segment to actually be touched will take forever
-                  len(unassigned_segs) == 0 and len(self.clust_counts) == 1):
-                    all_assigned = True
-                    n_it_last = n_it
-                if not burned_in and all_assigned and \
-                  n_it - n_it_last > len(self.S)/len(self.clust_counts):
-                    burned_in = True
-            
-#                self.lik_tmp.append(self.compute_overall_lik())
-#                self.vc_tmp.append(self.S["clust"].value_counts())
+                # have >95% of segments been touched?
+                if (1 - (1 - 1/len(self.breakpoints))**n_it) > 0.95:
+                    touch90 = True
+
+                # start computing likelihoods
+                if touch90:
+                    lik = self.compute_overall_lik_simple()
+                    # phasing likelihood will be NaN until we've touched every singlesegment
+                    if not np.isnan(lik).any():
+                        self.lik_trace.append(lik)
+                        likelihood_ready = True
+
+                # check if likelihood has stabilized enough to consider us "burned in"
+                if likelihood_ready and not burned_in and len(self.lik_trace) > 500:
+                    lt = np.vstack(self.lik_trace).sum(1)
+                    if (np.convolve(np.diff(lt), np.ones(500)/500, mode = "same") < 0).sum() > 2:
+                        print("BURNED IN")
+                        burned_in = True
+                        self.burnin_iteration = len(self.lik_trace)
+                        n_it_last = n_it
 
             #
-            # pick either a segment or a cluster at random (50:50 prob.)
-            move_clust = False
-
-            # pick a segment at random
-            if np.random.rand() < 0.5:
-            #if np.random.rand() < 1:
-                # bias picking unassigned segments if >90% of segments have been assigned
-                if len(unassigned_segs) > 0 and len(unassigned_segs)/len(self.S) < 0.1 and np.random.rand() < 0.5:
-                    seg_idx = sc.SortedSet({np.random.choice(unassigned_segs)})
-                else:
-                    seg_idx = sc.SortedSet({np.random.choice(len(self.S))})
-
-                cur_clust = int(self.clusts[seg_idx])
-
-                # expand segment to include all adjacent segments in the same cluster,
-                # if it has already been assigned to a cluster
-                if cur_clust > 0 and np.random.rand() < 0.5:
-                    si = seg_idx[0]
-
-                    j = 1
-                    while si - j > 0 and \
-                      (self.clusts[si - j] == cur_clust or self.clusts[si - j] == 0):
-                        if self.clusts[si - j] != 0:
-                            seg_idx.add(si - j)
-                        j += 1
-                    j = 1
-                    while si + j < len(self.S) and \
-                      (self.clusts[si + j] == cur_clust or self.clusts[si + j] == 0):
-                        if self.clusts[si + j] != 0:
-                            seg_idx.add(si + j)
-                        j += 1
-
-                    # if we've expanded to include a large fraction (>10%) of segments 
-                    # in this cluster, cluster indexing might become inconsistent.
-                    # skip this iteration
-                    if len(seg_idx) >= 0.1*self.clust_counts[cur_clust]:
-                        n_it += 1
-                        continue
-
-                seg_idx = np.r_[list(seg_idx)]
-
-                n_move = len(seg_idx)
-
-                # if segment was already assigned to a cluster, unassign it
-                if cur_clust > 0:
-                    self.clust_counts[cur_clust] -= n_move
-                    if self.clust_counts[cur_clust] == 0:
-                        del self.clust_counts[cur_clust]
-                        del self.clust_sums[cur_clust]
-                        del self.clust_members[cur_clust]
-                    else:
-                        self.clust_sums[cur_clust] -= np.r_[self.S.iloc[seg_idx, self.min_col].sum(), self.S.iloc[seg_idx, self.maj_col].sum()]
-                        self.clust_members[cur_clust] -= set(seg_idx)
-
-                    unassigned_segs.update(seg_idx)
-                    self.clusts[seg_idx] = -1
-
-            # pick a cluster at random
+            # pick  a segment to move
+
+# diagnostic code to compute overall likelihood before move
+#            compute_lik = False
+#            lik_before = np.nan
+#            if touch90 and np.random.rand() < 0.1:
+#                compute_lik = True
+#                lik_before = self.compute_overall_lik_simple()
+
+            # >90% of segments have been moved; we are iterating over segments sequentially
+            if touch90:
+                break_idx = sc.SortedSet({brk % (len(self.breakpoints) - 1)})
+                brk += 1
+            # we are picking segments at random
             else:
-                # it only makes sense to try joining two clusters if there are at least two of them!
-                if len(self.clust_counts) < 2:
+                break_idx = sc.SortedSet({np.random.choice(len(self.breakpoints) - 1)})
+
+            # get all SNPs within this segment
+            seg_st = self.breakpoints[break_idx[0]]
+            seg_en = self.breakpoints[break_idx[0] + 1]
+            seg_idx = np.r_[seg_st:seg_en]
+
+            cur_clust = int(self.clusts[seg_idx[0]])
+
+            # propose breaking this segment
+            if np.random.rand() < 0.1:
+                # can't split segments of length 1
+                if len(seg_idx) == 1:
                     n_it += 1
                     continue
 
-                cl_idx = np.random.choice(self.clust_counts.keys())
-                seg_idx = np.r_[list(self.clust_members[cl_idx])]
-                n_move = len(seg_idx)
-                cur_clust = -1 # only applicable for individual segments, so we set to -1 here
-                               # (this is so that subsequent references to clust_sums[cur_clust]
-                               # will return (0, 0))
-
-                # unassign all segments within this cluster
-                # (it will either be joined with a new cluster, or remade again into its own cluster)
-                del self.clust_counts[cl_idx]
-                del self.clust_sums[cl_idx]
-                del self.clust_members[cl_idx]
-                unassigned_segs.update(seg_idx)
-                self.clusts[seg_idx] = -1
+                # TODO: memoize cumsums?
+                min_cs = self._Scumsum_ph(seg_idx, min = True)
+                min_csr = self.seg_sums[seg_idx[0]][0] - min_cs
+                maj_cs = self._Scumsum_ph(seg_idx, min = False)
+                maj_csr = self.seg_sums[seg_idx[0]][1] - maj_cs
+
+                split_lik = ss.betaln(min_cs + 1 + self.betahyp, maj_cs + 1 + self.betahyp) + ss.betaln(min_csr + 1 + self.betahyp, maj_csr + 1 + self.betahyp)
+                split_lik[-1] = ss.betaln(min_cs[-1] + 1 + self.betahyp, maj_cs[-1] + 1 + self.betahyp)
+                split_lik -= split_lik.max()
+                split_point = np.random.choice(np.r_[0:len(seg_idx)], p = np.exp(split_lik)/np.exp(split_lik).sum())
+                seg_idx = seg_idx[:(split_point + 1)]
+
+                # add breakpoint (can be erased subsequently if segment rejoins original cluster)
+                new_bp = seg_idx[-1] + 1
+                if len(seg_idx) < seg_en - seg_st: # don't add breakpoint if we're not splitting segment
+                    self.add_breakpoint(start = seg_idx[0], mid = new_bp, end = seg_en, clust_idx = cur_clust)
+
+            # propose splitting out a contiguous interval of segments within the current cluster {{{
+            split_clust = False
+            if False and touch90 and np.random.rand() < 0.1:
+                # TODO: if we use cur_clust, this will be biased towards larger clusters. is this desireable?
+                clust_snps = np.sort(np.r_[list(self.clust_members[cur_clust])])
+
+                # can't split clusters of length 1
+                if len(clust_snps) == 1:
+                    n_it += 1
+                    continue
+
+                split_bdy = self.compute_cluster_splitpoints(clust_snps)
+
+                A_tot, B_tot = self.clust_sums[cur_clust]
+
+                lik0 = ss.betaln(A_tot + 1 + self.betahyp, B_tot + 1 + self.betahyp)
+
+                liks = np.zeros(len(split_bdy) + 1)
+                liks[-1] = lik0 # don't split at all
+
+                # likelihood ratios for splitting each region into a new cluster
+                for i, (st, en) in enumerate(split_bdy):
+                    A = self._Ssum_ph(clust_snps[st:en], min = True)
+                    B = self._Ssum_ph(clust_snps[st:en], min = False)
+
+                    liks[i] = ss.betaln(A_tot - A + 1 + self.betahyp, B_tot - B + 1 + self.betahyp) + ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+
+                # pick a region to split
+                split_idx = np.random.choice(
+                  len(split_bdy) + 1,
+                  p = np.exp(liks - liks.max())/np.exp(liks - liks.max()).sum()
+                )
+
+                # don't split at all
+                if split_idx == len(split_bdy):
+                    n_it += 1
+                    continue
+
+                # seg_idx == SNPs to propose to split off
+                seg_idx = clust_snps[slice(*split_bdy[split_idx])]
+
+                split_clust = True
+
+                # add breakpoints
+                for si in [seg_idx[0], seg_idx[-1]]:
+                    if si not in self.breakpoints:
+                        seg_st_idx = self.breakpoints.bisect_left(si) - 1
+                        seg_st = self.breakpoints[seg_st_idx]
+                        seg_en_idx = self.breakpoints.bisect_left(si)
+                        seg_en = self.breakpoints[seg_en_idx]
 
-                move_clust = True
+                        self.add_breakpoint(start = seg_st, mid = si, end = seg_en, clust_idx = cur_clust)
+
+                # get all breakpoints within this cluster/interval
+                left_idx = self.clust_members_bps[cur_clust].bisect_left(seg_idx[0])
+                right_idx = self.clust_members_bps[cur_clust].bisect_right(seg_idx[-1])
+                break_idx = sc.SortedSet([self.breakpoints.index(x) for x in self.clust_members_bps[cur_clust][left_idx:right_idx]])
+
+            # }}}
+
+            n_move = len(seg_idx)
+
+            # if segment was already assigned to a cluster, unassign it
+            if cur_clust >= 0:
+                self.clust_counts[cur_clust] -= n_move
+                if self.clust_counts[cur_clust] == 0:
+                    del self.clust_counts[cur_clust]
+                    del self.clust_sums[cur_clust]
+                    del self.clust_members[cur_clust]
+                    del self.clust_members_bps[cur_clust]
+                else:
+                    self.clust_sums[cur_clust] -= np.r_[self._Ssum_ph(seg_idx, min = True), self._Ssum_ph(seg_idx, min = False)]
+                    self.clust_members[cur_clust] -= set(seg_idx)
+                    for b in break_idx:
+                        self.clust_members_bps[cur_clust].remove(self.breakpoints[b])
 
-            if not all_assigned:
-                seg_touch_idx[seg_idx] += 1
+                self.clusts[seg_idx] = -1
 
             #
             # perform phase correction on segment/cluster
             # flip min/maj with probability that alleles are oriented the "wrong" way
-            self.rephase(seg_idx)
+#            if not np.isnan(self.seg_phase_probs[seg_idx[0]]):
+#                rfp = self.compute_rephase_prob(seg_idx)
+#                rfp_mem = self.seg_phase_probs[seg_idx[0]]
+#                if np.abs(rfp - rfp_mem) > 0.05:
+#                    print(rfp_mem, rfp)
+#                    breakpoint()
+            if np.isnan(self.seg_phase_probs[seg_idx[0]]):
+                self.seg_phase_probs[seg_idx[0]] = self.compute_rephase_prob(seg_idx)
+            rephase_prob = self.seg_phase_probs[seg_idx[0]]
 
             #
             # choose to join a cluster or make a new one
@@ -800,49 +747,29 @@ def run(self, n_iter = 50):
             # C is all possible clusters to move to
             A_a = self.clust_sums[cur_clust][0] if cur_clust in self.clust_sums else 0
             A_b = self.clust_sums[cur_clust][1] if cur_clust in self.clust_sums else 0
-            B_a = self.S.iloc[seg_idx, self.min_col].sum() # TODO: slow if seg_idx contains many SNPs
-            B_b = self.S.iloc[seg_idx, self.maj_col].sum()
-            C_ab = np.r_[self.clust_sums.values()] # first terms: (-1) = make new cluster, (0) = garbage cluster
+            B_a = self._Ssum_ph(seg_idx, min = True)
+            B_b = self._Ssum_ph(seg_idx, min = False)
+            C_ab = np.r_[self.clust_sums.values()] # first terms: -1 = make new cluster
             #C_ab = np.r_[[v for k, v in clust_sums.items() if k != cur_clust or cur_clust == -1]] # if we don't want to explicitly propose letting B rejoin cur_clust
 
-            #
-            # adjacent segment likelihoods
-
-            adj_AB = 0
-            adj_BC = np.zeros(len(self.clust_sums))
-
-            if not move_clust or (all_assigned and move_clust and np.random.rand() < 0.01):
-                adj_AB, adj_BC = self.compute_adj_liks(seg_idx, cur_clust)
-            else:
-                adj_BC[self.clust_sums.index(0)] = -np.inf
-
             # A+B,C -> A,B+C
 
             # A+B is likelihood of current cluster B is part of
             #AB = ss.betaln(A_a + B_a + 1, A_b + B_b + 1)
             # C is likelihood of target cluster pre-join
-            C = ss.betaln(C_ab[:, 0] + 1, C_ab[:, 1] + 1)
+            C = ss.betaln(C_ab[:, 0] + 1 + self.betahyp, C_ab[:, 1] + 1 + self.betahyp)
+            C[0] = 0 # don't count prior twice when opening a new cluster
             # A is likelihood cluster B is part of, minus B
             #A = ss.betaln(A_a + 1, A_b + 1)
-            # B+C is likelihood of target cluster post-join
-            BC = ss.betaln(C_ab[:, 0] + B_a + 1, C_ab[:, 1] + B_b + 1)
-
-            #     L(join)           L(split)
-            #MLs = A + BC + adj_BC - (AB + C + adj_AB)
-            # TODO: remove extraneous calculations (e.g. adj_AB, AB, A);
-            #       likelihood simplifies to this in the prior:
-            MLs = adj_BC + BC - C
+            # B+C is likelihood of target cluster post-join, with both phase orientations
+            BC = ss.betaln(C_ab[:, [0]] + np.c_[B_a, B_b] + 1 + self.betahyp, C_ab[:, [1]] + np.c_[B_b, B_a] + 1 + self.betahyp)
 
-            # if we are moving multiple contiguous segments assigned to the same
-            # cluster, do not allow them to create a new cluster. this helps keep
-            # cluster indices consistent
-            if n_move > 1 and not move_clust:
-                MLs[self.clust_sums.index(-1)] = -np.inf
+            MLs = BC - C[:, None]
 
             #
             # priors
 
-            # prior on previous cluster fractions
+            ## prior on previous cluster fractions {{{
 
             prior_diff = []
             prior_com = []
@@ -867,234 +794,334 @@ def run(self, n_iter = 50):
                 # [-1 (totally new cluster), <prior_diff>, <prior_com + prior_null>]
                 prior_idx = np.r_[
                   np.r_[[self.clust_prior.index(x) for x in prior_diff]],
-                  np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null | {0})]]
+                  np.r_[[self.clust_prior.index(x) if x in self.clust_prior else 0 for x in (prior_com | prior_null)]]
                 ]
 
+                # prior marginal likelihoods for both phase orientations
                 prior_MLs = ss.betaln( # prior clusters + segment
-                  np.r_[self.clust_prior_mat[prior_idx, 0]] + B_a + 1,
-                  np.r_[self.clust_prior_mat[prior_idx, 1]] + B_b + 1
+                  np.c_[self.clust_prior_mat[prior_idx, 0]] + np.c_[B_a, B_b] + 1,
+                  np.c_[self.clust_prior_mat[prior_idx, 1]] + np.c_[B_b, B_a] + 1
                 ) \
-                - (ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]) # prior clusters, segment
+                - np.c_[ss.betaln(B_a + 1, B_b + 1) + np.r_[np.r_[self.clust_prior_liks.values()][prior_idx]]] # prior clusters, segment
 
                 clust_prior_p = np.maximum(np.exp(prior_MLs - prior_MLs.max())/np.exp(prior_MLs - prior_MLs.max()).sum(), 1e-300)
 
                 # expand MLs to account for multiple new clusters
-                MLs = np.r_[np.full(len(prior_diff), MLs[0]), MLs[1:]]
+                MLs = np.r_[np.full([len(prior_diff), 2], MLs[0]), MLs[1:, :]]
+
+            # }}}
                 
-            # DP prior based on clusters sizes
-            # DP alpha factor is split proportionally between prior_diff and -1 (brand new cluster)
-            ccp = np.r_[[self.clust_count_prior[x] for x in prior_diff]]
-            count_prior = np.r_[self.clust_count_prior[-1]*ccp/ccp.sum(), self.clust_count_prior[0], self.clust_counts.values()]
-            count_prior /= count_prior.sum()
-
-            # choose to join a cluster or make a new one (choice_idx = 0)
-            num = MLs + np.log(count_prior) + np.log(clust_prior_p)
-            choice_p = np.exp(num - num.max())/np.exp(num - num.max()).sum()
+            ## DP prior based on clusters sizes
+            n_c = np.c_[self.clust_counts.values()]/self.dp_count_scale_factor
+            M = n_move/self.dp_count_scale_factor
+            N = n_c.sum() + M
+            log_count_prior = np.full([len(self.clust_sums), 1], np.nan)
+            log_count_prior[1:] = ss.gammaln(M + n_c) + ss.gammaln(N + self.alpha - M) \
+              - (ss.gammaln(n_c) + ss.gammaln(N + self.alpha))
+            # probability of opening a new cluster
+            log_count_prior[0] = ss.gammaln(M) + np.log(self.alpha) + ss.gammaln(N + self.alpha - M) - ss.gammaln(N + self.alpha)
+
+            # p(phase|X)
+            log_phase_prob = np.log(np.maximum(1e-300, np.r_[1 - rephase_prob, rephase_prob]))
+
+            #
+            # adjacent segment likelihood
+
+            log_adj_lik = self.compute_adj_prob(break_idx[0])
+ 
+            # p(X|clust,phase)p(X|seg,phase)p(clust)p(phase)
+            num = (MLs               # p({a_i, b_i}_{i\in B} | {a_i, b_i}_{i\in clust}, phase_{i\in B})
+                  + log_adj_lik      # p({a_i, b_i}_{i\in B} | U, D, phase_{i\in B})
+                  + log_count_prior  # p(clust) (DP prior on clust counts)
+                  + log_phase_prob)  # p(phase)
+
+            num -= num.max() # avoid underflow in sum-exp
+
+            # p(clust,phase|X)
+            choice_p = np.exp(num - np.log(np.exp(num).sum()))
+
+            # row major indexing: choice_idx//2 = cluster index, choice_idx & 1 = rephase true
             choice_idx = np.random.choice(
-              np.r_[0:len(MLs)],
-              p = choice_p
+              np.r_[0:np.prod(choice_p.shape)],
+              p = choice_p.ravel()
             )
             # -1 = brand new, -2, -3, ... = -(prior clust index) - 2
-            # 0 = garbage
-            choice = np.r_[-np.r_[prior_diff] - 2, 0, self.clust_counts.keys()][choice_idx]
+            choice = np.r_[-np.r_[prior_diff] - 2, self.clust_counts.keys()][choice_idx//2]
+
+            # save rephasing status
+            if choice_idx & 1:
+                self.S.iloc[seg_idx, self.flip_col] = ~self.S.iloc[seg_idx, self.flip_col]
+                for b in break_idx:
+                    st = self.breakpoints[b]
+                    en = self.breakpoints[b + 1]
+                    self.seg_sums[st] = self.seg_sums[st][::-1]
 
             # create new cluster
             if choice < 0:
                 # if we are moving an entire cluster, give it the same index it used to have
                 # otherwise, cluster indices will be inconsistent
-                if move_clust:
-                    new_clust_idx = cl_idx
-                elif choice == -1: # totally new cluster
+                if cur_clust not in self.clust_counts:
+                    new_clust_idx = cur_clust
+                else: # totally new cluster
                     max_clust_idx += 1
                     new_clust_idx = max_clust_idx
-                else: # match index of cluster in prior
-                    new_clust_idx = -choice - 2
 
                 self.clust_counts[new_clust_idx] = n_move
                 self.S.iloc[seg_idx, self.clust_col] = new_clust_idx
                 self.clusts[seg_idx] = new_clust_idx
 
-                self.clust_sums[new_clust_idx] = np.r_[B_a, B_b]
+                self.clust_sums[new_clust_idx] = np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a]
                 self.clust_members[new_clust_idx] = set(seg_idx)
 
-            # send to garbage
-            elif choice == 0:
-                self.S.iloc[seg_idx, self.clust_col] = 0
-                self.clusts[seg_idx] = 0
-
             # join existing cluster
             else:
-                # if we are combining two clusters, take the index of the bigger one
-                # this helps to keep cluster indices consistent
-                if move_clust and self.clust_counts[choice] < n_move:
-                    self.clust_counts[cl_idx] = self.clust_counts[choice]
-                    self.clust_sums[cl_idx] = self.clust_sums[choice]
-                    self.clust_members[cl_idx] = self.clust_members[choice]
-                    self.S.iloc[np.flatnonzero(self.S["clust"] == choice), self.clust_col] = cl_idx
-                    del self.clust_counts[choice]
-                    del self.clust_sums[choice]
-                    del self.clust_members[choice]
-                    choice = cl_idx
-
                 self.clust_counts[choice] += n_move 
-                self.clust_sums[choice] += np.r_[B_a, B_b]
+                self.clust_sums[choice] += np.r_[B_a, B_b] if not choice_idx & 1 else np.r_[B_b, B_a]
                 self.S.iloc[seg_idx, self.clust_col] = choice
                 self.clusts[seg_idx] = choice
 
                 self.clust_members[choice].update(set(seg_idx))
 
-            for si in seg_idx:
-                unassigned_segs.discard(si)
+            # if segment was rephased, update saved phasing probabilities
+            if choice_idx & 1:
+                for bp_idx in break_idx:
+                    st = self.breakpoints[bp_idx]
+                    en = self.breakpoints[bp_idx + 1]
+                    self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en])
+
+            # update breakpoints
+
+            # B->A
+            #    .   .     .   break_idx + 1
+            # A B A B A C B A
+            #  +   +     +     break_idx
+            #*           *     update_idx
+
+            break_idx_bi = break_idx | { x + 1 for x in break_idx }
+            snp_idx_bi = sc.SortedSet([self.breakpoints[b] for b in break_idx_bi])
+            snp_idx = sc.SortedSet([self.breakpoints[b] for b in break_idx])
+            update_idx = sc.SortedSet()
+            for snp in snp_idx_bi:
+                if snp < len(self.S) and self.clusts[snp - 1] == self.clusts[snp]:
+                    snp_idx.discard(snp) # discard rather than remvoe because this could be in snp_idx + 1
+                    self.breakpoints.remove(snp)
+                    self.seg_sums.pop(snp)
+                    self.seg_liks.pop(snp)
+                    self.seg_phase_probs.pop(snp)
+                    self.clust_members_bps[self.clusts[snp]].discard(snp) # discard rather than remove since this breakpoint could be in break_idx + 1, which would belong to another cluster
+                    update_idx.add(self.breakpoints.bisect_left(snp) - 1)
+                    snp_idx.add(self.breakpoints[self.breakpoints.bisect_left(snp) - 1])
+#            if len(update_idx):
+#                usnp = self.breakpoints[self.breakpoints.bisect_left(seg_idx[0]) - 1]
+#                print(f"{usnp}: {self.clusts[usnp]}")
+#                print(f"{snp_idx[0]}: {self.clusts[snp_idx[0]]} <")
+#                print(f"{snp_idx[1]}: {self.clusts[snp_idx[1]]} <")
+#                dsnp = self.breakpoints[self.breakpoints.bisect_right(seg_idx[0])]
+#                print(f"{dsnp}: {self.clusts[dsnp]}")
+#                print(f"Update: {self.breakpoints[update_idx[0]]}")
+            for bp_idx in update_idx:
+                st = self.breakpoints[bp_idx]
+                en = self.breakpoints[bp_idx + 1]
+                A = self._Ssum_ph(np.r_[st:en], min = True)
+                B = self._Ssum_ph(np.r_[st:en], min = False)
+                self.seg_sums[st] = np.r_[A, B]
+                self.seg_liks[st] = ss.betaln(A + 1 + self.betahyp, B + 1 + self.betahyp)
+                self.seg_phase_probs[st] = self.compute_rephase_prob(np.r_[st:en])
 
-            # track global state of cluster assignments
-            # on average, each segment will have been reassigned every n_seg/(n_clust/2) iterations
-            if burned_in and n_it - n_it_last > len(self.S)/(len(self.clust_counts)*2):
-                self.segs_to_clusters.append(self.S["clust"].copy())
+            if choice < 0:
+                self.clust_members_bps[new_clust_idx] = snp_idx
+            else:
+                self.clust_members_bps[choice] |= snp_idx
+
+# diagnostic code to check if breakpoint list is properly updated
+#            if touch90:
+#                x = sc.SortedSet()
+#                for y in self.clust_members_bps.values():
+#                    x |= y
+#                if len(x) != len(self.breakpoints) - 1:
+#                    breakpoint()
+
+# diagnostic code to compute overall likelihood delta for iteration
+#            if compute_lik:
+#                lik_after = self.compute_overall_lik_simple()
+#                lik_delta = lik_after.sum() - lik_before.sum()
+#                ML_choice = num.ravel()[choice_idx]
+#                if not np.isnan(lik_delta) and (lik_delta != 0 or ML_choice != 0):
+#                    print("lik: {}; MLs: {}".format(lik_delta, ML_choice))
+##                if lik_delta < 0 and ML_choice == 0:
+##                    breakpoint()
+
+            # save a sample from the MCMC when >95% of segments have been touched since the last iteration
+            if burned_in and (1 - (1 - 1/len(self.breakpoints))**(n_it - n_it_last)) > 0.95:
+                self.snps_to_clusters.append(self.S["clust"].copy())
                 self.phase_orientations.append(self.S["flipped"].copy())
+                self.segment_trace.append({ snp : self.S.iloc[snp, self.clust_col] for snp in self.breakpoints[:-1]})
+                self.likelihood_trace.append(self.compute_overall_lik_simple().sum())
                 n_it_last = n_it
 
             n_it += 1
 
-        return np.r_[self.segs_to_clusters], np.r_[self.phase_orientations]
-
-    #_colors = mpl.cm.get_cmap("tab10").colors
-    _colors = ((np.c_[1:7] & np.r_[4, 2, 1]) > 0).astype(int)
-#   _colors = np.r_[np.c_[87, 182, 55],
-#   np.c_[253, 245, 81],
-#   np.c_[238, 109, 45],
-#   np.c_[204, 43, 30],
-#   np.c_[221, 50, 132],
-#   np.c_[0, 23, 204],
-#   np.c_[75, 172, 227]]/255
+        return np.r_[self.snps_to_clusters], np.r_[self.phase_orientations], np.r_[self.likelihood_trace]
 
-    def get_unique_clust_idxs(self, segs_to_clusters = None):
-        if segs_to_clusters is None:
-            segs_to_clusters = np.r_[self.segs_to_clusters]
-        s2cu, s2cu_j = np.unique(segs_to_clusters, return_inverse = True)
-        return s2cu, s2cu_j.reshape(segs_to_clusters.shape)
+    def get_unique_clust_idxs(self, snps_to_clusters = None):
+        if snps_to_clusters is None:
+            snps_to_clusters = np.r_[self.snps_to_clusters]
+        s2cu, s2cu_j = np.unique(snps_to_clusters, return_inverse = True)
+        return s2cu, s2cu_j.reshape(snps_to_clusters.shape)
 
     def get_colors(self):
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
-        seg_terr = self.S["end_gp"] - self.S["start_gp"]
-        tot_terr = np.zeros(len(s2cu))
-        for r in s2cu_j:
-           tot_terr += npg.aggregate(r, seg_terr, size = len(tot_terr))
-
-        si = np.argsort(tot_terr)[::-1]
-        terr_cs = np.cumsum(tot_terr[si])/tot_terr.sum()
-
-        return [mpl.cm.get_cmap("gist_rainbow")(x) for x in np.linspace(0, 1, (terr_cs < 0.99).sum())]
-
-    def visualize_segs(self):
-        plt.figure()
+        T = pd.DataFrame(np.c_[np.r_[self.breakpoints[:-2]], np.r_[self.breakpoints[1:-1]]], columns = ["snp_st", "snp_end"])
+        T["gp_st"] = self.S.loc[T["snp_st"], "pos_gp"].values
+        T["gp_end"] = self.S.loc[T["snp_end"], "pos_gp"].values
+        T["terr"] = T["gp_end"] - T["gp_st"]
+        T["clust"] = self.S.loc[T["snp_st"], "clust"].values
+
+        clust_terr = T.groupby("clust")["terr"].sum().sort_values(ascending = False)
+        si = clust_terr.index.argsort()
+
+        # color any cluster larger than 10Mb (~0.003 of total genomic territory)
+        base_colors = np.array([
+          [0.368417, 0.506779, 0.709798],
+          [0.880722, 0.611041, 0.142051],
+          [0.560181, 0.691569, 0.194885],
+          [0.922526, 0.385626, 0.209179],
+          [0.528488, 0.470624, 0.701351],
+          [0.772079, 0.431554, 0.102387],
+          [0.363898, 0.618501, 0.782349],
+          [1, 0.75, 0],
+          [0.647624, 0.37816, 0.614037],
+          [0.571589, 0.586483, 0.],
+          [0.915, 0.3325, 0.2125],
+          [0.400822, 0.522007, 0.85],
+          [0.972829, 0.621644, 0.073362],
+          [0.736783, 0.358, 0.503027],
+          [0.280264, 0.715, 0.429209]
+        ])
+        extra_colors = np.array(
+          distinctipy.distinctipy.get_colors(
+            (clust_terr/clust_terr.sum() >= 0.003).sum() - base_colors.shape[0],
+            exclude_colors = [list(x) for x in np.r_[np.c_[0, 0, 0], np.c_[1, 1, 1], np.c_[0.5, 0.5, 0.5], np.c_[1, 0, 1], base_colors]],
+            rng = 1234
+          )
+        )
+
+        return np.r_[base_colors, extra_colors if extra_colors.size > 0 else np.empty([0, 3])][si]
+
+    def visualize_segs(self, f = None, use_clust = False, show_snps = False, chrom = None):
+        f = plt.figure(figsize = [16, 4]) if f is None else f
         ax = plt.gca()
-        ax.set_xlim([0, self.S["end_gp"].max()])
+        if chrom is None:
+            ax.set_xlim([0, self.S["pos_gp"].max()])
+        else:
+            ax.set_xlim([*self.S.loc[self.S["chr"] == chrom, "pos_gp"].iloc[[0, -1]]])
         ax.set_ylim([0, 1])
 
         colors = self.get_colors()
         s2cu, s2cu_j = self.get_unique_clust_idxs()
 
-        n_samp = len(self.segs_to_clusters)
-
-        for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
-            # rephase segments according to phase orientation sample
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(s2ph != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
+        n_samp = len(self.snps_to_clusters)
 
-            for i, r in enumerate(S_ph.itertuples()):
-                ## don't show garbage clusters
-                #if s2cu[s2c[i]] == 0:
-                #    continue
+        selff = copy.deepcopy(self)
 
-                ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], r.min + 1, r.maj + 1)
-                ax.add_patch(mpl.patches.Rectangle((r.start_gp, ci_lo), r.end_gp - r.start_gp, ci_hi - ci_lo, facecolor = colors[s2c[i] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000))
+        if show_snps:
+            # set SNP alpha based on number of SNPs
+            logistic = lambda A, K, B, M, x : A + (K - A)/(1 + np.exp(-B*(x - M)))
+            default_alpha = logistic(A = 0.4, K = 0.025, B = 0.00001, M = 120000, x = len(self.S) if chrom is None else (self.S["chr"] == chrom).sum())
 
-    def visualize_adjacent_segs(self, f = None, n_samp = None):
-        plt.figure(num = f, figsize = [17.56, 5.67])
-        ax = plt.gca()
-        ax.set_xlim([0, self.S["end_gp"].max()])
-        ax.set_ylim([0, 1])
+            ph_prob = np.r_[self.phase_orientations].mean(0)
 
-        colors = self.get_colors()
-        s2cu, s2cu_j = self.get_unique_clust_idxs()
+            cu = np.searchsorted(s2cu, self.S["clust"])
 
-        n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp
+            # only plot unambiguous SNPs once
+            uidx = ph_prob == 0
+            ax.scatter(
+              self.S.loc[uidx, "pos_gp"],
+              self.S.loc[uidx, "min"]/self.S.loc[uidx, ["min", "maj"]].sum(1),
+              color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1
+            )
+            uidx = ph_prob == 1
+            ax.scatter(
+              self.S.loc[uidx, "pos_gp"],
+              self.S.loc[uidx, "maj"]/self.S.loc[uidx, ["min", "maj"]].sum(1),
+              color = colors[cu[uidx] % len(colors)], marker = '.', alpha = default_alpha, s = 1
+            )
 
-        for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
-            # rephase segments according to phase orientation sample
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(s2ph != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
-
-            bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1])
-            bdy = np.c_[bdy[:-1], bdy[1:]]
-
-#            s2c_nz = s2c.copy()
-#            zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0)
-#            for z in zidx:
-#                s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]]
-#            bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1])
-#            bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]]
-
-            for st, en in bdy:
-                ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], S_ph.iloc[st:en, self.min_col].sum() + 1, S_ph.iloc[st:en, self.maj_col].sum() + 1)
-                ax.add_patch(mpl.patches.Rectangle((S_ph.iloc[st]["start_gp"], ci_lo), S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"], np.maximum(0, ci_hi - ci_lo), facecolor = colors[s2c[st] % len(colors)], fill = True, alpha = 1/n_samp, zorder = 1000))
-
-    def visualize_clusts(self, f = None, n_samp = None, thick = False, nocolor = False):
-        plt.figure(num = f, figsize = [17.56, 5.67])
-        ax = plt.gca()
-        ax.set_xlim([0, self.S["end_gp"].max()])
-        ax.set_ylim([0, 1])
+            # plot ambiguous SNPs with opacity weighted by phase probability
+            nuidx = (ph_prob != 0) & (ph_prob != 1)
+            ax.scatter(
+              selff.S.loc[nuidx, "pos_gp"],
+              self.S.loc[nuidx, "min"]/self.S.loc[nuidx, ["min", "maj"]].sum(1),
+              color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*(1 - ph_prob[nuidx]), s = 1
+            )
+            ax.scatter(
+              selff.S.loc[nuidx, "pos_gp"],
+              self.S.loc[nuidx, "maj"]/self.S.loc[nuidx, ["min", "maj"]].sum(1),
+              color = colors[cu[nuidx] % len(colors)], marker = '.', alpha = default_alpha*ph_prob[nuidx], s = 1
+            )
 
-        colors = self.get_colors()
-        s2cu, s2cu_j = self.get_unique_clust_idxs()
+        for seg2c, s2ph in zip(self.segment_trace, self.phase_orientations):
+            # only show maximum likelihood if we're overlaying SNPs
+            if show_snps:
+                mlidx = np.r_[self.likelihood_trace].argmax()
+                seg2c, s2ph = self.segment_trace[mlidx], self.phase_orientations[mlidx]
 
-        n_samp = len(self.segs_to_clusters) if n_samp is None else n_samp
+            # get uniqued clust indices for each segment start
+            seg_cu = np.searchsorted(s2cu, np.r_[list(seg2c.values())])
 
-        for s2c, s2ph in zip(s2cu_j, self.phase_orientations):
             # rephase segments according to phase orientation sample
-            S_ph = self.S.copy()
-            flip_idx = np.flatnonzero(s2ph != S_ph["flipped"])
-            S_ph.iloc[flip_idx, [self.min_col, self.maj_col]] = S_ph.iloc[flip_idx, [self.maj_col, self.min_col]]
-
-            # get overall cluster sums
-            clust_min = npg.aggregate(s2c, S_ph["min"])
-            clust_maj = npg.aggregate(s2c, S_ph["maj"])
-            CIs = s.beta.ppf([0.05, 0.5, 0.95], clust_min[:, None] + 1, clust_maj[:, None] + 1)
-
-            # get boundaries of contiguous segments
-            bdy = np.flatnonzero(np.r_[1, np.diff(s2c) != 0, 1])
-            bdy = np.c_[bdy[:-1], bdy[1:]]
-
-#            s2c_nz = s2c.copy()
-#            zidx = np.flatnonzero(s2c[bdy[:, 0]] == 0)
-#            for z in zidx:
-#                s2c_nz[bdy[z, 0]:bdy[z, 1]] = s2c_nz[bdy[z - 1, 0]]
-#            bdy_nz = np.flatnonzero(np.r_[1, np.diff(s2c_nz) != 0, 1])
-#            bdy_nz = np.c_[bdy_nz[:-1], bdy_nz[1:]]
-
-            for st, en in bdy:
-                if thick:
-                    b = CIs[s2c[st], 1] - 0.01
-                    t = CIs[s2c[st], 1] + 0.01
-                else:
-                    color = colors[s2c[st] % len(colors)]
-                    b = CIs[s2c[st], 0]
-                    t = CIs[s2c[st], 2]
+            selff.S["flipped"] = s2ph
 
-                if nocolor:
-                    color = [0, 1, 0]
-                else:
-                    color = colors[s2c[st] % len(colors)]
+            seg_bdy = np.r_[list(seg2c.keys()), len(selff.S)]
+            seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
 
+            for i, (st, en) in enumerate(seg_bdy):
+                if use_clust:
+                    ci_lo, med, ci_hi = s.beta.ppf(
+                      [0.05, 0.5, 0.95],
+                      selff.clust_sums[seg2c[st]][0] + 1 + self.betahyp,
+                      selff.clust_sums[seg2c[st]][1] + 1 + self.betahyp,
+                    )
+                else:
+                    ci_lo, med, ci_hi = s.beta.ppf(
+                      [0.05, 0.5, 0.95],
+                      selff._Ssum_ph(np.r_[st:en], min = True) + 1 + self.betahyp,
+                      selff._Ssum_ph(np.r_[st:en], min = False) + 1 + self.betahyp,
+                    )
                 ax.add_patch(mpl.patches.Rectangle(
-                  xy = (S_ph.iloc[st]["start_gp"], b),
-                  width = S_ph.iloc[en - 1]["end_gp"] - S_ph.iloc[st]["start_gp"],
-                  height = t - b,
-                  facecolor = color,
-                  fill = True,
-                  alpha = 1/n_samp,
-                  zorder = 1000)
+                  (selff.S.iloc[st]["pos_gp"], ci_lo),
+                  selff.S.iloc[en - 1]["pos_gp"] - selff.S.iloc[st]["pos_gp"],
+                  np.maximum(0, ci_hi - ci_lo),
+                  facecolor = colors[seg_cu[i] % len(colors)],
+                  edgecolor = 'k' if show_snps else None, linewidth = 0.5 if show_snps else None,
+                  fill = True, alpha = 1 if show_snps else 1/n_samp, zorder = 1000
+                ))
+                ax.scatter(
+                  (selff.S.iloc[en - 1]["pos_gp"] + selff.S.iloc[st]["pos_gp"])/2,
+                  med,
+                  color = colors[seg_cu[i] % len(colors)],
+                  marker = '.', s = 1, alpha = 1 if show_snps else 1/n_samp
                 )
+
+            if show_snps:
+                break
+
+    def visualize_clusts(self, **kwargs):
+        self.visualize_segs(use_clust = True, **kwargs)
+
+    def plot_likelihood_trace(self):
+        lt = np.vstack(self.lik_trace)
+        lt = lt[np.isnan(lt).sum(1) == 0, :]
+
+        lt = lt[self.burnin_iteration:, :]
+
+        plt.figure(); plt.clf()
+        plt.scatter(np.r_[0:len(lt)], lt[:, 0] - lt[:, 0].max())
+        #plt.scatter(np.r_[0:len(lt)], lt[:, 1] - lt[:, 1].max())
+        plt.scatter(np.r_[0:len(lt)], lt[:, 2] - lt[:, 2].max())
+        plt.scatter(np.r_[0:len(lt)], lt[:, 3] - lt[:, 3].max())
+        plt.scatter(np.r_[0:len(lt)], lt.sum(1) - lt.sum(1).max(), marker = '+', color = 'k')
+        plt.legend(["Clust", "DP", "Seg", "Total"])
+        plt.xlabel(r"Post-burnin iteration ($\times 100$)")
+        plt.ylabel(r"$\Delta$ likelihood")
diff --git a/hapaseg/allelic_MCMC.py b/hapaseg/allelic_MCMC.py
index d38d67e..32e4313 100644
--- a/hapaseg/allelic_MCMC.py
+++ b/hapaseg/allelic_MCMC.py
@@ -24,8 +24,6 @@ def __init__(self, P,
       quit_after_burnin = False,
       n_iter = 100000,
       ref_bias = 1.0,
-      misphase_prior = 0.001,
-      phase_correct = False
     ):
         #
         # dataframe stuff
@@ -59,22 +57,11 @@ def __init__(self, P,
 
         self.quit_after_burnin = quit_after_burnin
 
-        self.misphase_prior = misphase_prior
-
-        # whether to perform phasing correction iterations
-        self.phase_correct = phase_correct
-
-        # how many post-burnin samples to use to infer phase switches
-        self.n_phase_correct_samples = 40
-
         #
         # chain state
         self.iter = 1
         self.burned_in = False
 
-        # whether phase correction has been performed
-        self.phase_correction_ready = False
-
         #
         # breakpoint storage
 
@@ -89,17 +76,8 @@ def __init__(self, P,
         # list of all breakpoints at nth iteration
         self.breakpoint_list = []
 
-        #
-        # misphase interval storage
-
-        # candidate intervals that were misphased
-        self.B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int)
-
-        # current state of interval assignments (relative to B_ct)
-        self.F = MySortedList()
-
-        # state of interval assignments at nth iteration
-        self.phase_interval_list = []
+        # MLE breakpoint
+        self.breakpoints_MLE = None
 
         #
         # cumsum arrays for each segment
@@ -110,19 +88,18 @@ def __init__(self, P,
         self.cs_MAJ = sc.SortedDict()
         self.cs_MIN = sc.SortedDict()
 
-        # probability of picking a breakpoint
-        self.split_prob = sc.SortedDict()
-
         #
         # marginal likelihoods
 
+        self.betahyp = (self.P["REF_COUNT"] + self.P["ALT_COUNT"]).mean()/4
+
         # log marginal likelihoods for each segment
         # initialize with each SNP comprising its own segment.
         self.seg_marg_liks = sc.SortedDict(zip(
           range(0, len(self.P)),
           ss.betaln(
-            self.P.iloc[0:len(self.P), self.min_idx] + 1,
-            self.P.iloc[0:len(self.P), self.maj_idx] + 1
+            self.P.iloc[0:len(self.P), self.min_idx] + 1 + self.betahyp,
+            self.P.iloc[0:len(self.P), self.maj_idx] + 1 + self.betahyp
           )
         ))
 
@@ -141,25 +118,14 @@ def _Piloc(self, st, en, col_idx, incl_idx = None):
 
     def run(self):
         while self.iter < self.n_iter:
-            # perform a split, combine, phase correct, or prune operation
-            op = np.random.choice(4)
+            # perform a split or combine
+            op = np.random.choice(2)
             if op == 0:
                 if self.combine(np.random.choice(self.breakpoints[:-1]), force = False) == -1:
                     continue
             elif op == 1:
                 if self.split(b_idx = np.random.choice(len(self.breakpoints))) == -1:
                     continue
-            elif op == 2:
-                if self.phase_correct and self.phase_correction_ready:
-                    self.rephase()
-                else:
-                    continue
-            elif op == 3:
-                continue
-                if np.random.rand() < 0.01:
-                    self.prune()
-                else:
-                    continue
 
             # if we're only running up to burnin, bail
             if self.quit_after_burnin and self.burned_in:
@@ -172,27 +138,15 @@ def run(self):
                 ) + colorama.Fore.RESET)
                 return self
 
-            # correct phases after some post-burnin iterations
-            if not self.phase_correction_ready and self.phase_correct and \
-              self.burned_in and len(self.breakpoint_list) >= 2*self.n_phase_correct_samples:
-                self.correct_phases()
-                self.phase_correction_ready = True
-
-                # breakpoint/prune lists are liable to change after phase correction, so clear them 
-                self.breakpoint_list = []
-                self.include = []
-
-            # save set of breakpoints, phase intervals, and prune states if burned in 
-            if self.burned_in and not self.iter % 100:
-                self.breakpoint_list.append(self.breakpoints.copy())
-                self.include.append(self.P["include"].copy())
-                if self.phase_correction_ready:
-                    self.phase_interval_list.append(self.F.copy())
+            # save MLE breakpoint if we've burned in
+            if self.burned_in:
+                if self.marg_lik[self.iter] > self.marg_lik[self.iter - 1]:
+                    self.breakpoints_MLE = self.breakpoints.copy()
 
             # print status
             if not self.iter % 100:
                 if self.burned_in:
-                    color = colorama.Fore.MAGENTA if not self.phase_correction_ready else colorama.Fore.RESET
+                    color = colorama.Fore.RESET
                 else:
                     color = colorama.Fore.YELLOW
                 print("{color}[{st},{en}]\t{n}/{tot}\tn_bp = {n_bp}\tlik = {lik}".format(
@@ -205,11 +159,23 @@ def run(self):
                   color = color
                 ))
 
-            # check if we've burned in
+            # check if we've burned in -- chain is oscillating around some
+            # optimium (and thus mean differences between marginal likelihoods might
+            # be slightly negative)
             # TODO: use a faster method of computing rolling average
-            if not self.burned_in and self.iter > 500:
-                if np.diff(self.marg_lik[(self.iter - 500):self.iter]).mean() < 0:
+            if not self.burned_in and self.iter > 1000:
+                if np.diff(self.marg_lik[(self.iter - 1000):self.iter]).mean() < 0:
                     self.burned_in = True
+                # contingency if we've unambiguously converged on an optimum and chain has not moved at all
+                # exit early to save time
+                if (np.diff(self.marg_lik[(self.iter - 1000):self.iter]) == 0).all():
+                    self.breakpoints_MLE = self.breakpoints.copy()
+                    print(colorama.Fore.GREEN + "Chain has unambiguously converged on an optimum; stopping early in {n} iterations. n_bp = {n_bp}, lik = {lik}".format(
+                      n = self.iter,
+                      n_bp = len(self.breakpoints),
+                      lik = self.marg_lik[self.iter]
+                    ) + colorama.Fore.RESET)
+                    return self
 
             self.iter += 1 
 
@@ -245,8 +211,8 @@ def combine(self, st = None, b_idx = None, force = True):
         ML_split = self.seg_marg_liks[st] + self.seg_marg_liks[mid]
 
         ML_join = ss.betaln(
-          self._Piloc(st, en, self.min_idx).sum() + 1,
-          self._Piloc(st, en, self.maj_idx).sum() + 1
+          self._Piloc(st, en, self.min_idx).sum() + 1 + self.betahyp,
+          self._Piloc(st, en, self.maj_idx).sum() + 1 + self.betahyp
         )
 
         # proposal dist. ratio
@@ -276,299 +242,6 @@ def combine(self, st = None, b_idx = None, force = True):
 
             return mid
 
-    def flip_hap(self, st, en):
-        """
-        Flips the SNPs from st to en
-        """
-
-        x = self.P.iloc[st:en, self.maj_idx].copy()
-        self.P.iloc[st:en, self.maj_idx] = self.P.iloc[st:en, self.min_idx]
-        self.P.iloc[st:en, self.min_idx] = x
-
-    def prob_misphase(self, bdy1, bdy2):
-        """
-        Compute probability of misphase
-        """
-        # TODO: change invocation to st, mid, en -- we don't need to correct
-        #       phasing of noncontiguous segments
-
-        # prior on misphasing probability
-        p_mis = self.misphase_prior if np.isnan(self.P.loc[bdy1[1] - 1, "misphase_prob"]) else self.P.loc[bdy1[1] - 1, "misphase_prob"]
-        if p_mis == 0:
-            return -np.inf, 0
-
-        # haps = x/y, segs = 1/2, beta params. = A/B
-
-        # seg 1
-        rng_idx = (self.P.index >= bdy1[0]) & (self.P.index < bdy1[1])
-
-        idx = rng_idx & self.P["aidx"] & self.P["include"]
-        x1_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        x1_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        idx = rng_idx & ~self.P["aidx"] & self.P["include"]
-        y1_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        y1_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        # seg 2
-        rng_idx = (self.P.index >= bdy2[0]) & (self.P.index < bdy2[1])
-
-        idx = rng_idx & self.P["aidx"] & self.P["include"]
-        x2_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        x2_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        idx = rng_idx & ~self.P["aidx"] & self.P["include"]
-        y2_A = self.P.loc[idx, "ALT_COUNT"].sum()
-        y2_B = self.P.loc[idx, "REF_COUNT"].sum()
-
-        lik_mis   = ss.betaln(x1_A + y1_B + y2_A + x2_B + 1, y1_A + x1_B + x2_A + y2_B + 1)
-        lik_nomis = ss.betaln(x1_A + y1_B + x2_A + y2_B + 1, y1_A + x1_B + y2_A + x2_B + 1)
-
-        # logsumexp
-        m = np.maximum(lik_mis, lik_nomis)
-        denom = m + np.log(np.exp(lik_mis - m)*p_mis + np.exp(lik_nomis - m)*(1 - p_mis))
-
-        return lik_mis + np.log(p_mis) - denom, lik_nomis + np.log(1 - p_mis) - denom
-
-    def correct_phases(self):
-        """
-        Compute potentially misphased intervals, given some segmentation samples
-        """
-        if not self.burned_in or len(self.breakpoint_list) == 0:
-            raise RuntimeError("Breakpoint sample list must be populated (chain must be burned in)")
-
-        #A_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int)
-        #B_ct = sp.dok_matrix((len(self.P), len(self.P)), dtype = np.int)
-
-        for bp_idx in np.random.choice(len(self.breakpoint_list), self.n_phase_correct_samples, replace = False):
-            bpl = np.array(self.breakpoint_list[bp_idx]); bpl = np.c_[bpl[:-1], bpl[1:]]
-
-            p_mis = np.full(len(bpl) - 1, np.nan)
-            p_A = np.full(len(bpl) - 1, np.nan)
-            p_B = np.full(len(bpl) - 1, np.nan)
-
-            V = np.full([len(bpl) - 1, 2], np.nan)
-            B = np.zeros([len(bpl) - 1, 2], dtype = np.uint8)
-
-            for i, (st, mid, _, en) in enumerate(np.c_[bpl[:-1], bpl[1:]]):
-                p_mis, p_nomis = self.prob_misphase([st, mid], [mid, en])
-
-                # TODO: memoize partial sums
-
-                # prob. that left segment is on hap. A
-                p_A1 = s.beta.logsf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1)
-                # prob. that right segment is on hap. A
-                p_A2 = s.beta.logsf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1)
-
-                # prob. that left segment is on hap. B
-                p_B1 = s.beta.logcdf(0.5, self._Piloc(st, mid, self.min_idx).sum() + 1, self._Piloc(st, mid, self.maj_idx).sum() + 1)
-                # prob. that right segment is on hap. B
-                p_B2 = s.beta.logcdf(0.5, self._Piloc(mid, en, self.min_idx).sum() + 1, self._Piloc(mid, en, self.maj_idx).sum() + 1)
-
-                if i == 0:
-                    V[i, :] = [p_A1, p_B1]
-                    continue
-
-                p_AB = p_mis + p_A1 + p_B2
-                p_BA = p_mis + p_B1 + p_A2
-                p_AA = p_nomis + p_A1 + p_A2
-                p_BB = p_nomis + p_B1 + p_B2
-
-                V[i, 0] = np.max(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]])
-                V[i, 1] = np.max(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]])
-
-                B[i, 0] = np.argmax(np.r_[p_AA + V[i - 1, 0], p_BA + V[i - 1, 1]])
-                B[i, 1] = np.argmax(np.r_[p_AB + V[i - 1, 0], p_BB + V[i - 1, 1]])
-
-            # backtrace
-            BT = np.full(len(B), -1, dtype = np.uint8)
-            ix = np.argmax(V[-1])
-            BT[-1] = ix
-            for i, b in reversed(list(enumerate(B[:-1]))):
-                ix = b[ix]
-                BT[i] = ix
-
-            # join contiguous segments assigned to hap. B
-            d = np.diff(BT, append = 0, prepend = 0)
-            ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1]
-            b_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]]
-
-#            # join contiguous segments assigned to hap. A
-#            d = np.diff(1 - BT, append = 0, prepend = 0)
-#            ctg_idx = np.c_[np.flatnonzero(d == 1), np.flatnonzero(d == -1) - 1]
-#            a_segs_j = np.c_[bpl[ctg_idx[:, 0], 0], bpl[ctg_idx[:, 1], 1]]
-
-            # plot
-            #for x in np.flatnonzero(BT):
-            #    plt.plot(self.P.loc[bpl[x], "pos"], np.r_[j + 1, j + 1]*0.01)
-
-            # record
-            for x in b_segs_j:
-                self.B_ct[x[0], x[1]] += 1
-#            for x in a_segs_j:
-#                A_ct[x[0], x[1]] += 1
-
-#        # plot
-#        for k, v in B_ct.items():
-#            for _ in range(0, v):
-#                plt.plot(self.P.iloc[np.r_[k], self.P.columns.get_loc("pos")], 0.2*np.random.rand()*np.r_[1, 1])
-
-    # MCMC iteration that corrects a phase
-    def rephase(self): # TODO: add parameters to force an interval?
-        # TODO: prerequisite checks; has correct_phases() been run?
-        choice = list(self.B_ct.keys())
-        probs = np.r_[list(self.B_ct.values())]
-
-        #
-        # propose an interval to flip from B->A
-        st, en = choice[np.random.choice(np.r_[0:len(choice)], p = probs/probs.sum())]
-
-        #
-        # check if this overlaps any other regions that were already flipped B->A.
-
-        # any previously flipped regions contained within will be left alone
-
-        # return range of flipped region array that [st, en) overlaps
-        # TODO: rename this; f_o is a terrible name
-        def f_o(st = st, en = en):
-            st_idx = self.F.bisect_left(st + 1); st_idx -= st_idx % 2
-            en_idx = self.F.bisect_right(en - 1); en_idx += en_idx % 2
-            return slice(st_idx, en_idx)
-
-        overlaps = np.array(self.F[f_o()]).reshape(-1, 2)
-        o_S = sc.SortedSet({st, en})
-        for o in overlaps:
-            o_S.add(o[0])
-            o_S.add(o[1])
-
-        # somewhere we ought to assert that the length of self.F is even
-
-        # get list of regions to flip
-        flip_candidates = np.r_[o_S] # all possible regions to flip
-        flip_idx = np.zeros(len(flip_candidates) - 1, dtype = np.bool) # index of regions that haven't been flipped yet
-        A_flag = True # whether st:en consists entirely of regions that were flipped to A
-        for i, (st_seg, en_seg) in enumerate(np.c_[flip_candidates[:-1], flip_candidates[1:]]):
-            # this region was not already flipped B->A
-            if not self.F[f_o(st_seg, en_seg)]:
-                flip_idx[i] = True
-                A_flag = False
-
-        flips = np.c_[flip_candidates[:-1], flip_candidates[1:]][flip_idx, :]
-
-        #
-        # get full range of CNV breakpoints this region spans
-        st_reg = self.breakpoints.bisect_left(o_S[0])
-        en_reg = self.breakpoints.bisect_right(o_S[-1])
-        breakpoints0 = sc.SortedSet(self.breakpoints[(st_reg - 1):(en_reg + 1)])
-
-        #
-        # get initial marginal likelihood of this configuration
-        ML_orig = 0
-        for b in breakpoints0[:-1]:
-            ML_orig += self.seg_marg_liks[b]
-
-        #
-        # perform flips; update breakpoint list accordingly
-        for st_seg, en_seg in flips:
-            # if flip boundary corresponds to an extant breakpoint, remove it
-            # (we will propose joining these segments after flip)
-            if st_seg in breakpoints0:
-                breakpoints0 -= {st_seg}
-            # otherwise, add the flip boundary as a new breakpoint
-            # (we will propose introducing a new segment after flip)
-            else:
-                breakpoints0.add(st_seg)
-            if en_seg in breakpoints0:
-                breakpoints0 -= {en_seg}
-            else:
-                breakpoints0.add(en_seg)
-
-            self.flip_hap(st_seg, en_seg)
-
-        #
-        # if st:en is entirely assigned to A, try to flip it back to B (i.e. it was a false flip)
-        if A_flag:
-            if flip_candidates[0] in breakpoints0:
-                breakpoints0 -= {flip_candidates[0]}
-            else:
-                breakpoints0.add(flip_candidates[0])
-            if en_reg in breakpoints0:
-                breakpoints0 -= {flip_candidates[-1]}
-            else:
-                breakpoints0.add(flip_candidates[-1])
-
-            self.flip_hap(flip_candidates[0], flip_candidates[-1])
-
-        #
-        # get marginal likelihood post-flip and breakpoint adjustment
-        bps = np.r_[breakpoints0]
-        ML = 0
-        for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]:
-            ML += ss.betaln(
-              self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1,
-              self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1
-            )
-
-        #
-        # probabilistically accept new configuration
-        if np.log(np.random.rand()) < np.minimum(0, ML - ML_orig):
-            #
-            # update F array
-
-            # we could have either flipped a region from B->A ...
-            if not A_flag:
-                for st_seg, en_seg in flips:
-                    self.F.update([st_seg, en_seg])
-
-            # ... or reverted a flip
-            else:
-                for p in self.F[f_o(flip_candidates[0], flip_candidates[-1])]:
-                    self.F.remove(p)
-
-            #
-            # combine contiguous intervals in F array
-            # TODO
-
-            #
-            # update breakpoint list and seg. marg. liks
-            bps_to_del = list(self.breakpoints.islice(
-              self.breakpoints.bisect_left(breakpoints0[0]),
-              self.breakpoints.bisect_right(breakpoints0[-1])
-            ))
-            for x in bps_to_del:
-                self.breakpoints.remove(x)
-            self.breakpoints.update(breakpoints0)
-
-            #
-            # update seg. marg. liks
-            # TODO: recomputing each sum (even if in the future we use memoization)
-            #       is wasteful. intelligently pick which seg_marg_liks keys to update.
-            for x in bps_to_del[:-1]:
-                self.seg_marg_liks.__delitem__(x)
-            for st_bp, en_bp in np.c_[bps[:-1], bps[1:]]:
-                self.seg_marg_liks[st_bp] = ss.betaln(
-                  self._Piloc(st_bp, en_bp, self.min_idx).sum() + 1,
-                  self._Piloc(st_bp, en_bp, self.maj_idx).sum() + 1
-                )
-
-            self.marg_lik[self.iter] = self.marg_lik[self.iter - 1] - ML_orig + ML
-
-        #
-        # revert
-        else:
-            # flip each region back
-            for st_seg, en_seg in flips:
-                self.flip_hap(st_seg, en_seg)
-            if A_flag:
-                self.flip_hap(flip_candidates[0], flip_candidates[-1])
-
-            self.marg_lik[self.iter] = self.marg_lik[self.iter - 1]
-
-    def compute_all_cumsums(self):
-        bpl = np.array(self.breakpoints); bpl = np.c_[bpl[0:-1], bpl[1:]]
-        for st, en in bpl:
-            self.cs_MAJ[st], self.cs_MIN[st], self.split_prob[st] = self.compute_cumsum(st, en)
-
     def compute_cumsum(self, st, en):
         # major
         cs_MAJ = np.zeros(en - st, dtype = np.int)
@@ -582,7 +255,7 @@ def compute_cumsum(self, st, en):
             cs_MIN[i - st] = cs_MIN[i - st - 1] + (self.P.iat[i, self.min_idx] if self.P.iat[i, self.P.columns.get_loc("include")] else 0)
 
         # marginal likelihoods
-        ml = ss.betaln(cs_MAJ + 1, cs_MIN + 1) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1, cs_MIN[-1] - cs_MIN + 1)
+        ml = ss.betaln(cs_MAJ + 1 + self.betahyp, cs_MIN + 1 + self.betahyp) + ss.betaln(cs_MAJ[-1] - cs_MAJ + 1 + self.betahyp, cs_MIN[-1] - cs_MIN + 1 + self.betahyp)
 
         # prior
         # TODO: allow user to specify
@@ -631,12 +304,12 @@ def split(self, st = None, b_idx = None):
 
         # M-H acceptance
         seg_lik_1 = ss.betaln(
-          self._Piloc(st, mid, self.min_idx).sum() + 1,
-          self._Piloc(st, mid, self.maj_idx).sum() + 1
+          self._Piloc(st, mid, self.min_idx).sum() + 1 + self.betahyp,
+          self._Piloc(st, mid, self.maj_idx).sum() + 1 + self.betahyp
         )
         seg_lik_2 = ss.betaln(
-          self._Piloc(mid, en, self.min_idx).sum() + 1,
-          self._Piloc(mid, en, self.maj_idx).sum() + 1
+          self._Piloc(mid, en, self.min_idx).sum() + 1 + self.betahyp,
+          self._Piloc(mid, en, self.maj_idx).sum() + 1 + self.betahyp
         )
 
         ML_split = seg_lik_1 + seg_lik_2
@@ -691,21 +364,21 @@ def prune(self):
             # q_i = seg(A - A_i, B - B_i) + garbage(A_i, B_i) + (1 - include prior_i)
             #       - (seg(A, B) + (include prior_i))
             r_exc = ss.betaln(
-              A_inc_s - I["MIN_COUNT"] + 1,
-              B_inc_s - I["MAJ_COUNT"] + 1
-            ) + ss.betaln(I["MIN_COUNT"] + 1, I["MAJ_COUNT"] + 1) \
+              A_inc_s - I["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s - I["MAJ_COUNT"] + 1 + self.betahyp
+            ) + ss.betaln(I["MIN_COUNT"] + 1 + self.betahyp, I["MAJ_COUNT"] + 1 + self.betahyp) \
               + np.log(1 - I["include_prior"]) \
-              - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + np.log(I["include_prior"]))
+              - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + np.log(I["include_prior"]))
 
             # 2. probability to include SNPs (that were previously excluded)
             # q_i = seg(A + A_i, B + B_i) + (include prior_i)
             #       - (seg(A, B) + garbage(A_i, B_i) + (1 - include prior_i))
             r_inc = ss.betaln(
-              A_inc_s + E["MIN_COUNT"] + 1,
-              B_inc_s + E["MAJ_COUNT"] + 1
+              A_inc_s + E["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s + E["MAJ_COUNT"] + 1 + self.betahyp
             ) + np.log(E["include_prior"]) \
-              - (ss.betaln(A_inc_s + 1, B_inc_s + 1) + \
-                ss.betaln(E["MIN_COUNT"] + 1, E["MAJ_COUNT"] + 1) + \
+              - (ss.betaln(A_inc_s + 1 + self.betahyp, B_inc_s + 1 + self.betahyp) + \
+                ss.betaln(E["MIN_COUNT"] + 1 + self.betahyp, E["MAJ_COUNT"] + 1 + self.betahyp) + \
                 np.log(1 - E["include_prior"]))
 
             r_cat = pd.concat([r_inc, r_exc]).sort_index()
@@ -739,18 +412,18 @@ def prune(self):
 
             # regardless, code for computing q_star is the same
             r_exc_star = ss.betaln(
-              A_inc_s_star - I_star["MIN_COUNT"] + 1,
-              B_inc_s_star - I_star["MAJ_COUNT"] + 1
-            ) + ss.betaln(I_star["MIN_COUNT"] + 1, I_star["MAJ_COUNT"] + 1) \
+              A_inc_s_star - I_star["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s_star - I_star["MAJ_COUNT"] + 1 + self.betahyp
+            ) + ss.betaln(I_star["MIN_COUNT"] + 1 + self.betahyp, I_star["MAJ_COUNT"] + 1 + self.betahyp) \
               + np.log(1 - I_star["include_prior"]) \
-              - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + np.log(I_star["include_prior"]))
+              - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + np.log(I_star["include_prior"]))
 
             r_inc_star = ss.betaln(
-              A_inc_s_star + E_star["MIN_COUNT"] + 1,
-              B_inc_s_star + E_star["MAJ_COUNT"] + 1
+              A_inc_s_star + E_star["MIN_COUNT"] + 1 + self.betahyp,
+              B_inc_s_star + E_star["MAJ_COUNT"] + 1 + self.betahyp
             ) + np.log(E_star["include_prior"]) \
-              - (ss.betaln(A_inc_s_star + 1, B_inc_s_star + 1) + \
-                ss.betaln(E_star["MIN_COUNT"] + 1, E_star["MAJ_COUNT"] + 1) + \
+              - (ss.betaln(A_inc_s_star + 1 + self.betahyp, B_inc_s_star + 1 + self.betahyp) + \
+                ss.betaln(E_star["MIN_COUNT"] + 1 + self.betahyp, E_star["MAJ_COUNT"] + 1 + self.betahyp) + \
                 np.log(1 - E_star["include_prior"]))
 
             r_cat_star = pd.concat([r_inc_star, r_exc_star]).sort_index()
@@ -771,8 +444,8 @@ def prune(self):
 
                 self.marg_lik[self.iter] -= self.seg_marg_liks[st]
                 self.seg_marg_liks[st] = ss.betaln(
-                  T.loc[T["include"], "MIN_COUNT"].sum() + 1,
-                  T.loc[T["include"], "MAJ_COUNT"].sum() + 1,
+                  T.loc[T["include"], "MIN_COUNT"].sum() + 1 + self.betahyp,
+                  T.loc[T["include"], "MAJ_COUNT"].sum() + 1 + self.betahyp,
                 )
                 self.marg_lik[self.iter] += self.seg_marg_liks[st]
 
@@ -780,8 +453,8 @@ def prune(self):
                 # effectively their own segments)
                 self.marg_lik[self.iter] += (1 if ~self.P.at[choice_idx, "include"] else -1)* \
                   ss.betaln(
-                    self.P.at[choice_idx, "MIN_COUNT"] + 1,
-                    self.P.at[choice_idx, "MAJ_COUNT"] + 1
+                    self.P.at[choice_idx, "MIN_COUNT"] + 1 + self.betahyp,
+                    self.P.at[choice_idx, "MAJ_COUNT"] + 1 + self.betahyp
                   )
 
                 # TODO: update segment partial sums (when we actually use these)
@@ -812,20 +485,16 @@ def incr_bp_counter(self, st, en, mid = None):
             self.breakpoint_counter[(mid + 1):en] += np.r_[0, 1]
 
     def visualize(self, show_CIs = False):
-        Ph = self.P.copy()
-        CI = s.beta.ppf([0.05, 0.5, 0.95], Ph["MIN_COUNT"][:, None] + 1, Ph["MAJ_COUNT"][:, None] + 1)
-        Ph[["CI_lo_hap", "median_hap", "CI_hi_hap"]] = CI
-
-        plt.figure(); plt.clf()
+        plt.figure(figsize = [16, 4]); plt.clf()
         ax = plt.gca()
 
         # SNPs
-        ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4)
+        ax.scatter(self.P["pos"], self.P["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)], alpha = 0.5, s = 4, marker = '.')
         if show_CIs:
-            ax.errorbar(Ph["pos"], y = Ph["median_hap"], yerr = np.c_[Ph["median_hap"] - Ph["CI_lo_hap"], Ph["CI_hi_hap"] - Ph["median_hap"]].T, fmt = 'none', alpha = 0.5, color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)])
+            ax.errorbar(self.P["pos"], y = self.P["median_hap"], yerr = np.c_[self.P["median_hap"] - self.P["CI_lo_hap"], self.P["CI_hi_hap"] - self.P["median_hap"]].T, fmt = 'none', alpha = 0.1, ecolor = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][self.P["aidx"].astype(np.int)])
 
         # mask excluded SNPs
-        ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values)
+        # ax.scatter(Ph["pos"], Ph["median_hap"], color = 'k', alpha = 1 - pd.concat(self.include, axis = 1).mean(1).values)
 
         # breakpoints 
 #        bp_prob = self.breakpoint_counter[:, 0]/self.breakpoint_counter[:, 1]
@@ -840,47 +509,22 @@ def visualize(self, show_CIs = False):
 #        ax2.set_xlim(ax.get_xlim());
 #        ax2.set_xlabel("Breakpoint number in current MCMC iteration")
 
-        # beta CI's weighted by breakpoints
-        # flip current rephases back to baseline
-        for st, en in self.F.intervals():
-            # code excised from flip_hap
-            x = Ph.iloc[st:en, self.maj_idx].copy()
-            Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx]
-            Ph.iloc[st:en, self.min_idx] = x
-
-        pos_col = Ph.columns.get_loc("pos")
-        for bp_samp, pi_samp, inc_samp in itertools.zip_longest(self.breakpoint_list, self.phase_interval_list, self.include):
-            # flip everything according to sample
-            # if we did not perform phase correction, pi_samp will be none (hence
-            # the use of zip_longest above)
-            if pi_samp is not None:
-                for st, en in pi_samp.intervals():
-                    # TODO: can replace with flip_hap()?
-                    x = Ph.iloc[st:en, self.maj_idx].copy()
-                    Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx]
-                    Ph.iloc[st:en, self.min_idx] = x
-
-            # SNPs TODO: plot only those that flipped, in a diff. color?
-            #ax.scatter(Ph["pos"], Ph["median_hap"], color = np.r_[np.c_[1, 0, 0], np.c_[0, 0, 1]][Ph["aidx"].astype(np.int)], alpha = 0.5, s = 4)
-
-            bpl = np.array(bp_samp); bpl = np.c_[bpl[0:-1], bpl[1:]]
-            for st, en in bpl:
-                Phi = Ph.iloc[st:en]; Phi = Phi.loc[inc_samp]
-                ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], Phi.iloc[:, self.min_idx].sum() + 1, Phi.iloc[:, self.maj_idx].sum() + 1)
-                ax.add_patch(mpl.patches.Rectangle((Ph.iloc[st, pos_col], ci_lo), Ph.iloc[en, pos_col] - Ph.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'k', alpha = 1/len(self.breakpoint_list), zorder = 1000))
-
-            # flip everything back
-            if pi_samp is not None:
-                for st, en in pi_samp.intervals():
-                    # TODO: can replace with flip_hap()?
-                    x = Ph.iloc[st:en, self.maj_idx].copy()
-                    Ph.iloc[st:en, self.maj_idx] = Ph.iloc[st:en, self.min_idx]
-                    Ph.iloc[st:en, self.min_idx] = x
+        bpl = self.breakpoints if self.breakpoints_MLE is None else self.breakpoints_MLE
+        bpl = np.array(bpl); bpl = np.c_[bpl[0:-1], bpl[1:]]
+
+        pos_col = self.P.columns.get_loc("pos")
+        for st, en in bpl:
+            ci_lo, med, ci_hi = s.beta.ppf([0.05, 0.5, 0.95], self.P.iloc[st:en, self.maj_idx].sum() + 1, self.P.iloc[st:en, self.min_idx].sum() + 1)
+            ax.add_patch(mpl.patches.Rectangle((self.P.iloc[st, pos_col], ci_lo), self.P.iloc[en, pos_col] - self.P.iloc[st, pos_col], ci_hi - ci_lo, fill = True, facecolor = 'lime', alpha = 0.4, zorder = 1000))
 
         # 50:50 line
         ax.axhline(0.5, color = 'k', linestyle = ":")
 
         ax.set_xticks(np.linspace(*plt.xlim(), 20));
-        ax.set_xticklabels(Ph["pos"].searchsorted(np.linspace(*plt.xlim(), 20)));
+        ax.set_xticklabels(self.P["pos"].searchsorted(np.linspace(*plt.xlim(), 20)));
         ax.set_xlabel("SNP index")
         ax.set_ylim([0, 1])
+
+        ax.set_title(f"{self.P.iloc[0]['chr']}:{self.P.iloc[0]['pos']}-{self.P.iloc[-1]['pos']}")
+
+        plt.tight_layout()
diff --git a/hapaseg/coverage_DP.py b/hapaseg/coverage_DP.py
index 07a85e8..17687f2 100644
--- a/hapaseg/coverage_DP.py
+++ b/hapaseg/coverage_DP.py
@@ -116,7 +116,7 @@ def __init__(self, cov_df, beta, bin_exposure, prior_run=None, count_prior_sum=N
         self.seg_id_col = self.cov_df.columns.get_loc('segment_ID')
         self.beta = beta
         self.bin_exposure=bin_exposure
-        self.covar_cols = sorted([c for c in self.cov_df.columns if "C_" in c])
+        self.covar_cols = sorted(self.cov_df.columns[self.cov_df.columns.str.contains("^C_.*_z$")])
         
         self.num_segments = self.cov_df.iloc[:, self.seg_id_col].max() + 1
         self.segment_r_list = [None] * self.num_segments
diff --git a/hapaseg/model_optimizers.py b/hapaseg/model_optimizers.py
index 43d54d4..3284123 100644
--- a/hapaseg/model_optimizers.py
+++ b/hapaseg/model_optimizers.py
@@ -2,30 +2,42 @@
 
 
 class PoissonRegression:
-    def __init__(self, r, C, Pi):
+    def __init__(self, r, C, Pi,
+      log_exposure = 0, log_offset = 0, intercept = True,
+      mumu = 0, musig2 = 10, betamu = None, betasiginv = None):
         self.r = r
         self.C = C
         self.Pi = Pi
+        self.log_exposure = log_exposure
+        self.log_offset = log_offset
+        self.intercept = intercept
 
-        self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1]))
-        self.beta = np.ones([C.shape[1], 1])
-        self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu)
+        self.mu = np.log(r.mean() * np.ones([Pi.shape[1], 1])) - self.log_exposure
+        self.beta = np.zeros([C.shape[1], 1])
+        self.f = 1
+        self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
+
+        # prior parameters
+        self.mumu = mumu
+        self.musig2 = musig2
+        self.betamu = np.zeros_like(self.beta) if betamu is None else betamu
+        self.betasiginv = 1/np.sqrt(10)*np.eye(len(self.beta)) if betasiginv is None else betasiginv
 
     # mu gradient
     def gradmu(self):
-        return self.Pi.T @ (self.r - self.e_s)
+        return self.Pi.T @ (self.r - self.e_s) - (self.mu - self.mumu)/self.musig2
 
     # mu Hessian
     def hessmu(self):
-        return (-self.Pi.T * self.e_s.T)  @ self.Pi
+        return (-self.Pi.T * self.e_s.T)  @ self.Pi - 1/self.musig2
 
     # beta gradient
     def gradbeta(self):
-        return self.C.T @ (self.r - self.e_s)
+        return self.C.T @ (self.r - self.e_s) - self.betasiginv@(self.beta - self.betamu)
 
     # beta Hessian
     def hessbeta(self):
-        return (-self.C.T * self.e_s.T) @ self.C
+        return (-self.C.T * self.e_s.T) @ self.C - self.betasiginv
 
     # mu,beta Hessian
     def hessmubeta(self):
@@ -33,23 +45,63 @@ def hessmubeta(self):
 
     def NR_poisson(self):
         for i in range(100):
-            self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu)
-            gmu = self.gradmu()
+            self.e_s = np.exp(self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
             gbeta = self.gradbeta()
-            grad = np.r_[gmu, gbeta]
+            if self.intercept:
+                gmu = self.gradmu()
+                grad = np.r_[gmu, gbeta]
+            else:
+                grad = gbeta
 
-            hmu = self.hessmu()
             hbeta = self.hessbeta()
-            hmubeta = self.hessmubeta()
-            H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+            if self.intercept:
+                hmubeta = self.hessmubeta()
+                hmu = self.hessmu()
+                H = np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+            else:
+                H = hbeta
 
             delta = np.linalg.inv(H) @ grad
-            self.mu -= delta[0:len(self.mu)]
-            self.beta -= delta[len(self.mu):]
+            if self.intercept:
+                self.mu -= delta[0:len(self.mu)]
+                self.beta -= delta[len(self.mu):]
+            else:
+                self.beta -= delta
 
             if np.linalg.norm(grad) < 1e-5:
                 break
 
     def fit(self):
         self.NR_poisson()
-        return self.mu, self.beta
+        if self.intercept:
+            return self.mu, self.beta
+        else:
+            return self.beta
+
+    def hess(self):
+        hbeta = self.hessbeta()
+        if self.intercept:
+            hmu = self.hessmu()
+            hmubeta = self.hessmubeta()
+            return np.r_[np.c_[hmu, hmubeta.T], np.c_[hmubeta, hbeta]]
+        else:
+            return hbeta
+
+    # scale factor
+    def gradf(self):
+        return (self.C@self.beta).T@(self.r - self.e_s)
+
+    def hessf(self):
+        CB = self.C@self.beta
+        return -(CB*self.e_s).T@CB
+
+    def NR_f(self):
+        for i in range(100):
+            self.e_s = np.exp(self.f*self.C @ self.beta + self.Pi @ self.mu + self.log_exposure + self.log_offset)
+            gf = self.gradf()
+            hf = self.hessf()
+
+            self.f -= gf/hf
+
+            if np.linalg.norm(gf) < 1e-5:
+                break
diff --git a/hapaseg/run_coverage_MCMC.py b/hapaseg/run_coverage_MCMC.py
index 5370708..e880c13 100644
--- a/hapaseg/run_coverage_MCMC.py
+++ b/hapaseg/run_coverage_MCMC.py
@@ -1,9 +1,12 @@
 import numpy as np
 import pandas as pd
+import pickle
 import glob
 import re
 import os
 import scipy.special as ss
+import sys
+import tqdm
 from capy import mut, seq
 import scipy.stats as stats
 from statsmodels.discrete.discrete_model import NegativeBinomial as statsNB
@@ -16,19 +19,28 @@ def __init__(self,
                  coverage_csv,
                  f_allelic_clusters,
                  f_SNPs,
+                 f_segs,
+                 ref_fasta,
                  f_repl,
+                 f_faire,
                  f_GC=None,
                  num_draws=50,
                  cluster_num=None,
-                 allelic_sample=None
+                 allelic_sample=None,
+                 bin_width=1,
                  ):
 
         self.num_draws = num_draws
         self.cluster_num = cluster_num
         self.f_repl = f_repl
+        self.f_faire = f_faire
         self.f_GC = f_GC
+        self.ref_fasta = ref_fasta
+        self.bin_width = bin_width
 
         self.allelic_clusters = np.load(f_allelic_clusters)
+        with open(f_segs, "rb") as f:
+            self.segmentations = pickle.load(f)
         # coverage input is expected to be a df file with columns: ["chr", "start", "end", "covcorr", "covraw"]
         self.full_cov_df = self.load_coverage(coverage_csv)
         self.load_covariates()
@@ -37,7 +49,7 @@ def __init__(self,
         if allelic_sample is not None:
             self.allelic_sample = allelic_sample
         else:
-            self.allelic_sample = self.select_ADP_cluster()
+            self.allelic_sample = np.argmax(self.allelic_clusters["likelihoods"])
 
         self.model = None
 
@@ -56,165 +68,202 @@ def run_all_clusters(self):
     # Do preprocessing for running on each ADP cluster individually
     def prepare_single_cluster(self):
         Pi, r, C, filtered_cov_df = self.assign_clusters()
-        pois_regr = PoissonRegression(r, C, Pi)
+        pois_regr = PoissonRegression(r, C, Pi, log_exposure = np.log(self.bin_width))
         all_mu, global_beta = pois_regr.fit()
+        pois_hess = pois_regr.hess()
 
         # save these results to a numpy object
-        return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample
-
-    # method for calculating DP prior likelihood of an ADP cluster    
-    @staticmethod    
-    def dp_prior(cluster_counts_arr, alpha):
-        N = cluster_counts_arr.sum()
-        m = len(cluster_counts_arr)
-        return m * np.log(alpha) + ss.gammaln(cluster_counts_arr).sum() + ss.gammaln(alpha) - ss.gammaln(N+alpha)
-    
-    # method for selecting ADP clustering based on likelihoods
-    def select_ADP_cluster(self):
-        ADP_draws = self.allelic_clusters["snps_to_clusters"]
-        tmp_snps = self.SNPs.copy()
-        lls = []
-        for ADP_draw in ADP_draws:
-            tmp_snps['cluster_assignment'] = ADP_draw
-            count_arr = tmp_snps.groupby(by='cluster_assignment').agg({"maj":sum, "min":sum}).values
-            count_arr += 1
-            beta_ll = ss.betaln(count_arr[:, 0], count_arr[:, 1]).sum()
-            cluster_counts = tmp_snps['cluster_assignment'].value_counts().values
-            dp_ll = self.dp_prior(cluster_counts, 0.5)
-            lls.append(beta_ll + dp_ll)
-        lls = np.array(lls)
-        lls_max = np.max(lls)
-        choice_p = np.exp(lls - lls_max) / np.exp(lls - lls_max).sum()
-        return np.random.choice(len(ADP_draws), p=choice_p)
-
-
-    @staticmethod
-    def load_coverage(coverage_csv):
-        Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_reads"], low_memory=False)
+        return Pi, r, C, all_mu, global_beta, filtered_cov_df, self.allelic_sample, pois_hess
+
+    def load_coverage(self, coverage_csv):
+        Cov = pd.read_csv(coverage_csv, sep="\t", names=["chr", "start", "end", "covcorr", "mean_frag_len", "std_frag_len", "num_frags", "tot_reads", "fail_reads"], low_memory=False)
         Cov.loc[Cov['chr'] == 'chrM', 'chr'] = 'chrMT' #change mitocondrial contigs to follow mut conventions
         Cov["chr"] = mut.convert_chr(Cov["chr"])
         Cov = Cov.loc[Cov["chr"] != 0]
         Cov=Cov.reset_index(drop=True)
-        Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"])
-        Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"])
+        Cov["start_g"] = seq.chrpos2gpos(Cov["chr"], Cov["start"], ref = self.ref_fasta)
+        Cov["end_g"] = seq.chrpos2gpos(Cov["chr"], Cov["end"], ref = self.ref_fasta)
         
         return Cov
 
     def load_SNPs(self, f_snps):
         SNPs = pd.read_pickle(f_snps)
-        SNPs["chr"], SNPs["pos"] = seq.gpos2chrpos(SNPs["gpos"])
-
-        SNPs["tidx"] = mut.map_mutations_to_targets(SNPs, self.full_cov_df, inplace=False)
+        mut.map_mutations_to_targets(SNPs, self.full_cov_df)
         return SNPs
 
     def generate_GC(self):
         #grab fasta object from seq to avoid rebuilding
+        seq.set_reference(self.ref_fasta)
         F = seq._fa.ref_fa_obj
         self.full_cov_df['C_GC'] = np.nan
         
         #this indexing assumes 0-indexed start and end cols
-        for (i, chrm, start, end) in self.full_cov_df[['chr', 'start','end']].itertuples():
+        for (i, chrm, start, end) in tqdm.tqdm(self.full_cov_df[['chr', 'start','end']].itertuples(), total = len(self.full_cov_df)):
             self.full_cov_df.iat[i, -1] = F[chrm-1][start:end+1].gc
         
 
     def load_covariates(self):
-        #check if we are doing wgs, in which case we will have uniform 200 bp bins
-        wgs = True if self.f_GC is not None or len(self.full_cov_df) > 100000 else False
-        
-        #we only need bin size if doing exomes
-        if not wgs:
-            self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1)
+        zt = lambda x : (x - np.nanmean(x))/np.nanstd(x)
+
+        ## Target size
+
+        # we only need bin size if doing exomes but we can check by looking at the bin lengths
+        self.full_cov_df["C_log_len"] = np.log(self.full_cov_df["end"] - self.full_cov_df["start"] + 1)
+        self.full_cov_df["C_log_len_z"] = zt(self.full_cov_df["C_log_len"])
             
-            #this is a safety in case we are doing wgs but have few bins
-            if (np.diff(self.full_cov_df["C_log_len"]) == 0).all():
-                #remove the len col since it will ruin beta fitting
-                self.full_cov_df = self.full_cov_df.drop(['C_log_len'], axis=1)
+        # in case we are doing wgs these will all be the same and we must remove
+        if (np.diff(self.full_cov_df["C_log_len"]) == 0).all():
+            #remove the len col since it will ruin beta fitting
+            self.full_cov_df = self.full_cov_df.drop(['C_log_len', 'C_log_len_z'], axis=1)
+
+        ## Fragment length
+
+        # some bins have zero mean fragment length; these bins are bad and should be removed
+        self.full_cov_df = self.full_cov_df.loc[(self.full_cov_df.mean_frag_len > 0) & (self.full_cov_df.std_frag_len > 0)].reset_index(drop = True)
+
+        self.full_cov_df = self.full_cov_df.rename(columns = { "mean_frag_len" : "C_frag_len" })
+        self.full_cov_df["C_frag_len_z"] = zt(np.log(self.full_cov_df["C_frag_len"]))
+
+        # generate on 5x and 11x scales
+        swv = np.lib.stride_tricks.sliding_window_view
+        fl = self.full_cov_df["C_frag_len"].values; fl[np.isnan(fl)] = 0
+        wt = self.full_cov_df["num_frags"].values
+        for scale in [5, 11]:
+            fl_sw = swv(np.pad(fl, scale//2), scale)
+            wt_sw = swv(np.pad(wt, scale//2), scale)
+            conv = np.einsum('ij,ij->i', wt_sw, fl_sw)
+
+            self.full_cov_df[f"C_frag_len_{scale}x"] = conv/wt_sw.sum(1)
+            self.full_cov_df[f"C_frag_len_{scale}x_z"] = zt(np.log(self.full_cov_df[f"C_frag_len_{scale}x"]))
+
+        ### track-based covariates
+        # use midpoint of coverage bins to map to intervals
+        self.full_cov_df["midpoint"] = ((self.full_cov_df["end"] + self.full_cov_df["start"])/2).astype(int)
+
+        ## Replication timing
+
         # load repl timing
         F = pd.read_pickle(self.f_repl)
         # map targets to RT intervals
-        tidx = mut.map_mutations_to_targets(self.full_cov_df.rename(columns={"start": "pos"}), F, inplace=False)
+        tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
         self.full_cov_df['C_RT'] = np.nan
         self.full_cov_df.iloc[tidx.index, -1] = F.iloc[tidx, 3:].mean(1).values
 
         # z-transform
-        self.full_cov_df["C_RT_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(
-            np.log(self.full_cov_df["C_RT"] + 1e-20))
+        self.full_cov_df["C_RT_z"] = zt(np.log(self.full_cov_df["C_RT"]))
+
+        ## GC content
 
         # load GC content if we have it precomputed, otherwise generate it
-        if wgs and self.f_GC is not None and os.path.exists(self.f_GC):
-            print("Using precomputed GC content")
+        if self.f_GC is not None and os.path.exists(self.f_GC):
+            print("Using precomputed GC content", file = sys.stderr)
             B = pd.read_pickle(self.f_GC)
             
             self.full_cov_df = self.full_cov_df.merge(B.rename(columns={"gc": "C_GC"}), left_on=["chr", "start", "end"],
                                                   right_on=["chr", "start", "end"], how="left")
         else:
-            print("Computing GC content")
+            print("Computing GC content", file = sys.stderr)
             self.generate_GC()
-        
-        self.full_cov_df["C_GC_z"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(
-            np.log(self.full_cov_df["C_GC"] + 1e-20))
-        
-        #set zero coverage bins to nan
-        self.full_cov_df.loc[(self.full_cov_df.mean_frag_len == 0) | (self.full_cov_df.std_frag_len == 0), ['mean_frag_len', 'std_frag_len']] = (np.nan, np.nan)
-        
-        # add fragment based covars
-        self.full_cov_df["C_frag_len"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["mean_frag_len"] + 1e-20))
-        self.full_cov_df["C_frag_std"] = (lambda x: (x - np.nanmean(x)) / np.nanstd(x))(np.log(self.full_cov_df["std_frag_len"] + 1e-20))
 
-        # drop non z-cetered cols
-        self.full_cov_df = self.full_cov_df.drop(['C_GC', 'C_RT'], axis=1)
-        
+        # GC content follows a roughly quadratic relationship with coverage
+        self.full_cov_df["C_GC2"] = self.full_cov_df["C_GC"]**2
+
+        ## FAIRE
+
+        if self.f_faire is not None:
+            F = pd.read_pickle(self.f_faire)
+
+            # map targets to FAIRE intervals
+            tidx = mut.map_mutations_to_targets(self.full_cov_df, F, inplace=False, poscol = "midpoint")
+            F = F.loc[tidx].set_index(tidx.index).iloc[:, 3:].rename(columns = lambda x : "C_" + x)
+            self.full_cov_df = pd.concat([self.full_cov_df, F], axis = 1)
+
+            # z-transform
+            self.full_cov_df = pd.concat([
+              self.full_cov_df,
+              self.full_cov_df.loc[:, F.columns].apply(lambda x : zt(np.log(x + 1))).rename(columns = lambda x : x + "_z")
+            ], axis = 1)
 
     # use SNP cluster assignments from the given draw assign coverage bins to clusters
     # clusters with snps from different clusters are probabliztically assigned
     # method returns coverage df with only bins that overlap snps
     def assign_clusters(self):
-        # generate unique clust assignments
+        ## generate unique clust assignments
         clust_choice = self.allelic_clusters["snps_to_clusters"][self.allelic_sample]
         clust_u, clust_uj = np.unique(clust_choice, return_inverse=True)
         clust_uj = clust_uj.reshape(clust_choice.shape)
+        self.SNPs["clust_choice"] = clust_uj
 
-        # assign coverage intervals to clusters
-        Cov_clust_probs = np.zeros([len(self.full_cov_df), clust_uj.max()+1])
-
-        # first compute assignment probabilities based on the SNPs within each bin
-        for targ, snp_idx in self.SNPs.groupby("tidx").indices.items():
-            targ_clust_hist = np.bincount(clust_uj[snp_idx].ravel(), minlength=clust_uj.max()+1)
+        ## assign coverage intervals to allelic clusters and segments 
+        # get allelic segment boundaries
+        seg_bdy = np.r_[0, list(self.segmentations[self.allelic_sample].keys()), len(self.SNPs)]
+        seg_bdy = np.c_[seg_bdy[:-1], seg_bdy[1:]]
+        self.SNPs["seg_idx"] = 0
+        for i, (st, en) in enumerate(seg_bdy):
+            self.SNPs.iloc[st:en, self.SNPs.columns.get_loc("seg_idx")] = i
+        seg_max = self.SNPs["seg_idx"].max() + 1
 
-            Cov_clust_probs[int(targ), :] = targ_clust_hist / targ_clust_hist.sum()
+        # assignment probabilities of each coverage interval -> allelic segment
+        Cov_clust_probs = np.zeros([len(self.full_cov_df), seg_max])
 
-        # subset intervals containing SNPs
+        # first compute assignment probabilities based on the SNPs within each bin
+        # segments just get assigned to the maximum probability
+        self.full_cov_df["seg_idx"] = -1
+        self.full_cov_df["allelic_cluster"] = -1
+        print("Mapping SNPs to targets ...", file = sys.stderr)
+        for targ, D in tqdm.tqdm(self.SNPs.groupby("targ_idx")[["clust_choice", "seg_idx"]]):
+            if targ == -1: # SNP does not overlap a coverage bin
+                continue
+            clust_idx = D["clust_choice"].values
+            seg_idx = D["seg_idx"].values
+            if len(seg_idx) == 1:
+                Cov_clust_probs[targ, seg_idx] = 1.0
+                self.full_cov_df.at[targ, "seg_idx"] = seg_idx[0]
+                self.full_cov_df.at[targ, "allelic_cluster"] = clust_idx[0]
+            else: 
+                targ_clust_hist = np.bincount(seg_idx, minlength = seg_max) 
+                Cov_clust_probs[targ, :] = targ_clust_hist / targ_clust_hist.sum()
+                self.full_cov_df.at[targ, "seg_idx"] = np.bincount(seg_idx).argmax()
+                self.full_cov_df.at[targ, "allelic_cluster"] = np.bincount(clust_idx).argmax()
+
+        ## subset to targets containing SNPs
         overlap_idx = Cov_clust_probs.sum(1) > 0
+#        # add targets within a 2 targ radius
+#        overlap_idx = np.flatnonzero(Cov_clust_probs.sum(1) > 0)[:, None]
+#        overlap_idx = overlap_idx + np.c_[-2:3].T
+#        overlap_idx = np.sort(np.unique((overlap_idx + np.c_[-2:3].T).ravel()))
         Cov_clust_probs_overlap = Cov_clust_probs[overlap_idx, :]
 
         # zero out improbable assignments and re-normalilze
         Cov_clust_probs_overlap[Cov_clust_probs_overlap < 0.05] = 0
         Cov_clust_probs_overlap /= Cov_clust_probs_overlap.sum(1)[:, None]
-        # prune garbage clusters
+        # prune empty clusters
         prune_idx = Cov_clust_probs_overlap.sum(0) > 0
         Cov_clust_probs_overlap = Cov_clust_probs_overlap[:, prune_idx]
         num_pruned_clusters = Cov_clust_probs_overlap.shape[1]
-        # subsetting to only targets that overlap SNPs
+
+        ## subsetting to only targets that overlap SNPs
         Cov_overlap = self.full_cov_df.loc[overlap_idx, :]
 
-        # probabilistically assign each ambiguous coverage bin to a cluster
+        ## probabilistically assign each ambiguous coverage bin to a cluster
         # for now we will take maximum instead
         amb_mask = np.max(Cov_clust_probs_overlap, 1) != 1
         amb_assgn_probs = Cov_clust_probs_overlap[amb_mask, :]
-        #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters],
-        #                                       p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
-        new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
-        new_onehot = np.zeros((new_assgn.size, num_pruned_clusters))
-        new_onehot[np.arange(new_assgn.size), new_assgn] = 1
-
-        # update with assigned values
-        Cov_clust_probs_overlap[amb_mask, :] = new_onehot
-
-        #downsampling for wgs
-        if len(Cov_clust_probs_overlap) > 20000:
-            downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2
-            Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask]
-            Cov_overlap = Cov_overlap.iloc[downsample_mask]
+        if amb_mask.sum() > 0:
+            #new_assgn = np.array([np.random.choice(np.r_[:num_pruned_clusters],
+            #                                       p=amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
+            new_assgn = np.array([np.argmax(amb_assgn_probs[i]) for i in range(len(amb_assgn_probs))])
+            new_onehot = np.zeros((new_assgn.size, num_pruned_clusters))
+            new_onehot[np.arange(new_assgn.size), new_assgn] = 1
+    
+            # update with assigned values
+            Cov_clust_probs_overlap[amb_mask, :] = new_onehot
+
+        ## downsampling for wgs
+#        if len(Cov_clust_probs_overlap) > 20000:
+#            downsample_mask = np.random.rand(Cov_clust_probs_overlap.shape[0]) < 0.2
+#            Cov_clust_probs_overlap = Cov_clust_probs_overlap[downsample_mask]
+#            Cov_overlap = Cov_overlap.iloc[downsample_mask]
     
         # remove clusters with fewer than 4 assigned coverage bins (remove these coverage bins as well)
         bad_clusters = Cov_clust_probs_overlap.sum(0) < 4
@@ -223,14 +272,18 @@ def assign_clusters(self):
 
         Cov_overlap = Cov_overlap.loc[~bad_bins, :]
         Pi = filtered.copy()
-       
-        r = np.c_[Cov_overlap["covcorr"]]
-        
-        covar_columns = sorted([c for c in Cov_overlap.columns if 'C_' in c])
-        # making covariate matrix
+
+        ## making regressor vector/covariate matrix
+
+        # scale regressor to reflect fragment counts
+        Cov_overlap["fragcorr"] = np.round(Cov_overlap["covcorr"]/Cov_overlap["C_frag_len"].mean())
+        r = np.c_[Cov_overlap["fragcorr"]]
+
+        # make covariate matrix; use all z-transformed covariates + non-scaled GC content+GC^2
+        covar_columns = sorted(Cov_overlap.columns[Cov_overlap.columns.str.contains("^C_.*_z$") | Cov_overlap.columns.str.contains("^C_GC")])
         C = np.c_[Cov_overlap[covar_columns]]
 
-        # dropping Nans
+        ## dropping Nans
         naidx = np.isnan(C).any(axis=1)
         # drop zero coverage bins as well (this is to account for a bug in coverage collector) TODO: remove need for this
         naidx = np.logical_or(naidx, (r==0).flatten())
@@ -240,14 +293,22 @@ def assign_clusters(self):
 
         Cov_overlap = Cov_overlap.iloc[~naidx]
         
-        #removing outliers
+        ## removing coverage outliers
         outlier_mask = find_outliers(r)
         r = r[~outlier_mask]
         C = C[~outlier_mask]
         Pi = Pi[~outlier_mask]
         Cov_overlap = Cov_overlap.iloc[~outlier_mask]
-        
-        Cov_overlap['allelic_cluster'] = np.argmax(Pi, axis=1)
+
+        # some clusters may have been eliminated by this point; prune them from Pi
+        Pi = Pi[:, Pi.sum(0) > 0]
+ 
+        ## remove covariate outliers (+- 6 sigma)
+        covar_outlier_idx = (Cov_overlap.loc[:, set(covar_columns) - {"C_GCtr_z"}].abs() < 6).all(axis = 1)
+        Cov_overlap = Cov_overlap.loc[covar_outlier_idx]
+        Pi = Pi[covar_outlier_idx, :]
+        r = r[covar_outlier_idx]
+        C = C[covar_outlier_idx, :]
 
         return Pi, r, C, Cov_overlap
 
@@ -293,8 +354,11 @@ def nat_sort(lst):
         return sorted(lst, key=alphanum_key)
 
 
-# function for collecting coverage mcmc results from each ADP cluster
-def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1):
+# function for collecting coverage mcmc results from each ADP segment
+def aggregate_adp_segments(allelic_seg_groups_pickle, coverage_dir=None, f_file_list=None, cov_df_pickle=None, bin_width=1):
+    S = pd.read_pickle(allelic_seg_groups_pickle)
+    S = S.rename_axis(index = "allelic_seg_idx").reset_index()
+
     if coverage_dir is None and f_file_list is None:
         raise ValueError("need to pass in either coverage_dir or file_list txt file!")
     if coverage_dir is not None and f_file_list is not None:
@@ -302,7 +366,7 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None,
 
     # get results files from the directory provided or from the file list provided
     if coverage_dir is not None:
-        cluster_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_cluster_*')))
+        adp_seg_files = nat_sort(glob.glob(os.path.join(coverage_dir, 'cov_mcmc_data_*')))
         cov_df = pd.read_pickle(os.path.join(coverage_dir, 'cov_df.pickle'))
         
     else:
@@ -316,47 +380,67 @@ def aggregate_clusters(coverage_dir=None, f_file_list=None, cov_df_pickle=None,
                 to_add = l.rstrip('\n')
                 if to_add != "nan":
                     read_files.append(to_add)
-        cluster_files = nat_sort(read_files)
+        adp_seg_files = nat_sort(read_files)
         cov_df = pd.read_pickle(cov_df_pickle)
-    
-    clust_assignments = cov_df['allelic_cluster'].values
-    
+
+    # make sure that number of results shards is consistent with shard indices
+    if len(adp_seg_files) != len(S):
+        raise ValueError("Number of ADP seg files does not match scatter shards!")
+
+    # load in covMCMC segment boundaries and mu's for each ADP segment
     seg_results = []
     mu_i_results = []
     
-    # load data from each cluster
-    for data_path in cluster_files:
-        cluster_data = np.load(data_path)
-        seg_results.append(cluster_data['seg_samples'])
-        mu_i_results.append(cluster_data['mu_i_samples'])
-    
+    for f in adp_seg_files:
+        seg_data = np.load(f)
+        seg_results.append(seg_data['seg_samples'])
+        mu_i_results.append(seg_data['mu_i_samples'])
+
+    S["seg_results"] = seg_results
+    S["mu_i_results"] = mu_i_results
+
     num_draws = seg_results[0].shape[1]
-    num_clusters = len(seg_results)
 
-    # now we use these data to fill an overall coverage segmentation array
+    # create overall segmentation array
     coverage_segmentation = np.zeros((len(cov_df), num_draws))
     mu_i_values = np.zeros((len(cov_df), num_draws))
 
+    # loop over each cov MCMC draw
+    # TODO: only use maximum likelihood draw; CDP should be able to resegment
     for d in range(num_draws):
-        global_counter = 0
-        for c in range(num_clusters):
-            cluster_mask = (clust_assignments == c)
-            coverage_segmentation[cluster_mask, d] = seg_results[c][:,d] + global_counter
-            mu_i_values[cluster_mask, d] = mu_i_results[c][:, d]
-            global_counter += len(np.unique(seg_results[c][:,d]))
-    
-    # generate data to re-compute global beta
+        n_tot_segs = 0
+        # loop over ADP segments
+        for _, s in S.iterrows():
+            seg_idxs = s["seg_results"][:, d]
+            coverage_segmentation[s["indices"], d] = seg_idxs + n_tot_segs
+            n_tot_segs += seg_idxs[-1] + 1
+
+            mu_i_values[s["indices"], d] = s["mu_i_results"][:, d]
+
+    # TEMP HACK: for now, only take iteration with fewest number of segments
+    sidx = coverage_segmentation[-1, :].argmin()
+    coverage_segmentation = coverage_segmentation[:, [sidx]]
+    mu_i_values = mu_i_values[:, [sidx]]
+
+    # remove short segments (<200Kb)
+    # TODO: remove segments not well-modeled by covariates
+    cov_df["cov_seg_idx"] = coverage_segmentation.astype(int)
+    long_seg_idx = cov_df.groupby("cov_seg_idx").apply(lambda x : (x.iloc[-1]["end"] - x.iloc[0]["start"]) > 2e5).rename("seg_OK")
+    cov_df = cov_df.merge(long_seg_idx, left_on = "cov_seg_idx", right_index = True)
+
+    coverage_segmentation = coverage_segmentation[cov_df["seg_OK"], :]
+    mu_i_values = mu_i_values[cov_df["seg_OK"], :]
+    cov_df = cov_df.loc[cov_df["seg_OK"]]
+
+    # recompute global beta
     r = np.c_[cov_df["covcorr"]]
     # we'll use the mu_is from the last segmentation sample
-    mu_is = mu_i_values[:,-1]
-    # compute new edogenous targets by subtracking out the mu_i values of the segments
-    # along with the bin exposure
-    endog = np.exp(np.log(r).flatten() - np.log(bin_width) - mu_is).reshape(-1,1)
+    mu_is = mu_i_values[:, [-1]]
     # generate covars
-    covar_columns = sorted([c for c in cov_df.columns if 'C_' in c])
+    covar_columns = sorted(cov_df.columns[cov_df.columns.str.contains("^C_.*_z$")])
     C = np.c_[cov_df[covar_columns]]
     # do regression
-    pois_regr = PoissonRegression(endog, C, np.ones(endog.shape))
+    pois_regr = PoissonRegression(r, C, np.ones(r.shape), np.log(bin_width) + mu_is)
     mu_refit, beta_refit = pois_regr.fit()
     
     return coverage_segmentation, beta_refit
diff --git a/hapaseg/utils.py b/hapaseg/utils.py
index e9dc502..5b78213 100644
--- a/hapaseg/utils.py
+++ b/hapaseg/utils.py
@@ -5,7 +5,13 @@
 _chrmap = dict(zip(["chr" + str(x) for x in list(range(1, 23)) + ["X", "Y"]], range(1, 25)))
 
 def parse_cytoband(cytoband):
-    cband = pd.read_csv(cytoband, sep = "\t")
+    # some cytoband files have a header, some don't; we need to check
+    has_header = False
+    with open(cytoband, "r") as f:
+        if f.readline().startswith("chr\t"):
+            has_header = True
+
+    cband = pd.read_csv(cytoband, sep = "\t", names = ["chr", "start", "end", "band", "stain"] if not has_header else None)
     cband["chr"] = cband["chr"].apply(lambda x : _chrmap[x])
 
     chrs = cband["chr"].unique()
@@ -40,11 +46,14 @@ def plot_chrbdy(cytoband_file):
     chrbdy = parse_cytoband(cytoband_file)
 
     # plot chromosome boundaries
+    yl_0 = plt.ylim()[0]
+    yl_1 = plt.ylim()[1]
     chr_ends = chrbdy.loc[1::2, "end"].cumsum()
     for end in chr_ends[:-1]:
         plt.axvline(end, color = 'k')
     for st, en in np.c_[chr_ends[:-1:2], chr_ends[1::2]]:
-        plt.fill_between([st, en], 0, 1, color = [0.9, 0.9, 0.9], zorder = 0)
+        plt.fill_between([st, en], yl_0, yl_1, color = [0.9, 0.9, 0.9], zorder = 0)
+    plt.ylim([yl_0, yl_1])
 
     # plot centromere locations
     for cent in (np.c_[chrbdy.loc[1::2, "start"], chrbdy.loc[::2, "end"]] + np.c_[np.r_[0, chr_ends[:-1]]]).ravel():
diff --git a/setup.py b/setup.py
index 670d4c6..b4501b5 100644
--- a/setup.py
+++ b/setup.py
@@ -35,7 +35,7 @@
     #long_description = long_description,
     #long_description_content_type = 'text/markdown',
     install_requires = [
-        'pandas>=0.24.1',
+        'pandas>=1.4.1',
         'numpy>=1.18.0',
         'more-itertools>=8.10.0',
         'numpy_groupies>=0.9.14',
diff --git a/wolF/tasks.py b/wolF/tasks.py
index 194bf94..8688199 100644
--- a/wolF/tasks.py
+++ b/wolF/tasks.py
@@ -53,7 +53,7 @@ class Hapaseg_burnin(wolf.Task):
     output_patterns = {
       "burnin_MCMC" : "amcmc_results.pickle"
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
 
 class Hapaseg_concat(wolf.Task):
     inputs = {
@@ -68,7 +68,7 @@ class Hapaseg_concat(wolf.Task):
       "arms" : "AMCMC-arm*.pickle",
       "ref_bias" : ("ref_bias.txt", wolf.read_file)
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
 
 class Hapaseg_amcmc(wolf.Task):
     inputs = {
@@ -82,10 +82,10 @@ class Hapaseg_amcmc(wolf.Task):
             --n_iter ${n_iter}
     """
     output_patterns = {
-      "arm_level_MCMC" : "amcmc_results.pickle"
+      "arm_level_MCMC" : "amcmc_results.pickle",
+      "segmentation_plot" : "figures/MLE_segmentation.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
-
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:all_SNPs_v617"
 
 class Hapaseg_concat_arms(wolf.Task):
     inputs = {
@@ -103,76 +103,68 @@ class Hapaseg_concat_arms(wolf.Task):
     }
     docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
 
-
 class Hapaseg_allelic_DP(wolf.Task):
     inputs = {
       "seg_dataframe" : None,
-      "n_dp_iter" : 10,
-      "seg_samp_idx" : 0,
       "ref_fasta" : None,
       "cytoband_file" : None
     }
     script = """
     export CAPY_REF_FA=${ref_fasta}
     hapaseg dp --seg_dataframe ${seg_dataframe} \
-            --n_dp_iter ${n_dp_iter} \
-            --seg_samp_idx ${seg_samp_idx} \
             --ref_fasta ${ref_fasta} \
             --cytoband_file ${cytoband_file}
     """
     output_patterns = {
       "cluster_and_phase_assignments" : "allelic_DP_SNP_clusts_and_phase_assignments.npz",
       "all_SNPs" : "all_SNPs.pickle",
+      "segmentation_breakpoints" : "segmentations.pickle",
+      "likelihood_trace_plot" : "figures/likelihood_trace.png",
       "SNP_plot" : "figures/SNPs.png",
-      "seg_plot" : "figures/allelic_imbalance_preDP.png",
-      "clust_plot" : "figures/allelic_imbalance_postDP.png",
-    }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
-    resources = { "mem" : "5G" }
-class Hapaseg_collect_adp(wolf.Task):
-    inputs = {
-        "dp_results":None
-    }
-    
-    script = """
-    hapaseg collect_adp --dp_results ${dp_results}
-    """
-    output_patterns = {
-        "full_dp_results":"full_dp_results.npz"
+      "seg_plot" : "figures/segs_only.png",
     }
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
-    resources = { "mem" : "5G" }
-
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v813"
+    resources = { "mem" : "8G" }
 
 class Hapaseg_prepare_coverage_mcmc(wolf.Task):
     inputs = {
         "coverage_csv": None,
         "allelic_clusters_object": None,
         "SNPs_pickle": None,
+        "segmentations_pickle": None,
         "repl_pickle": None,
+        "faire_pickle": "/mnt/j/proj/cnv/20201018_hapseg2/covars/FAIRE_GM12878.hg19.pickle", # TODO: make remote
         "gc_pickle":"",
         "allelic_sample":"",
-        "ref_file_path": None
+        "ref_fasta": None,
+        "bin_width" : 1 # only for whole genomes; for exomes, target lengths are passed as a covariate via the coverage CSV
     }
-    script = """
-    export CAPY_REF_FA=${ref_file_path}
-    hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \
-    --allelic_clusters_object ${allelic_clusters_object} \
-    --SNPs_pickle ${SNPs_pickle} \
-    --repl_pickle ${repl_pickle}"""
+    def script(self):
+        script = """
+        hapaseg coverage_mcmc_preprocess --coverage_csv ${coverage_csv} \
+        --ref_fasta ${ref_fasta} \
+        --allelic_clusters_object ${allelic_clusters_object} \
+        --SNPs_pickle ${SNPs_pickle} \
+        --segmentations_pickle ${segmentations_pickle} \
+        --repl_pickle ${repl_pickle} \
+        --faire_pickle ${faire_pickle} \
+        --bin_width ${bin_width}
+        """
     
-    def prolog(self):
         if self.conf["inputs"]["gc_pickle"] != "":
-            self.conf["script"][-1] += " --gc_pickle ${gc_pickle}"
+            script += " --gc_pickle ${gc_pickle} "
         if self.conf["inputs"]["allelic_sample"] != "":
-            self.conf["script"][-1] += " --allelic_sample ${allelic_sample}"
+            script += " --allelic_sample ${allelic_sample}"
+
+        return script
 
     output_patterns = {
         "preprocess_data": "preprocess_data.npz",
-        "cov_df_pickle": "cov_df.pickle"
+        "cov_df_pickle": "cov_df.pickle",
+        "allelic_seg_groups": "allelic_seg_groups.pickle"
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v858"
     resources = { "mem" : "15G" }
 
 
@@ -205,16 +197,18 @@ def prolog(self):
 
 class Hapaseg_coverage_mcmc(wolf.Task):
     inputs = {
-        "preprocess_data": None,
+        "preprocess_data": None,      # npz of covariate matrix (C), global beta, ADP cluster mu's, covbin ADP cluster assignments (all_mu), covbin raw coverage values (r)
+        "allelic_seg_indices": None,  # dataframe containing indicies into C/r/all_mu for each allelic segment
+        "allelic_seg_scatter_idx": None,      # allelic segment to operate on (for scatter)
         "num_draws": 50,
-        "cluster_num": None,
         "bin_width":None,
         "burnin_files":""
     }
     script = """
     hapaseg coverage_mcmc_shard --preprocess_data ${preprocess_data} \
+    --allelic_seg_indices ${allelic_seg_indices} \
+    --allelic_seg_idx ${allelic_seg_scatter_idx} \
     --num_draws ${num_draws} \
-    --cluster_num ${cluster_num} \
     --bin_width ${bin_width}"""
      
     def prolog(self):
@@ -222,13 +216,13 @@ def prolog(self):
             self.conf["script"][-1] += " --burnin_files ${burnin_files}"
     
     output_patterns = {
-        "cov_segmentation_model": 'cov_mcmc_model_cluster_*.pickle',
-        "cov_segmentation_data": 'cov_mcmc_data_cluster_*.npz',
-        "cov_seg_figure": 'cov_mcmc_cluster_*_visual.png'
+        "cov_segmentation_model": 'cov_mcmc_model_*.pickle',
+        "cov_segmentation_data": 'cov_mcmc_data_*.npz',
+        "cov_seg_figure": 'cov_mcmc_*_visual.png'
     }
 
-    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_v623"
-    resources = {"mem" : "5G"}
+    docker = "gcr.io/broad-getzlab-workflows/hapaseg:coverage_mcmc_integration_v856"
+    resources = {"mem" : "10G"}
 
 class Hapaseg_collect_coverage_mcmc(wolf.Task):
     inputs = {
diff --git a/wolF/workflow.py b/wolF/workflow.py
index 340636a..6ac1aa1 100644
--- a/wolF/workflow.py
+++ b/wolF/workflow.py
@@ -31,19 +31,21 @@
 
 # for Hapaseg itself
 hapaseg = wolf.ImportTask(
-  task_path = "../", # TODO: make remote
+  task_path = ".", # TODO: make remote
   task_name = "hapaseg"
 )
 
 # for coverage collection
 split_intervals = wolf.ImportTask(
   task_path = "git@github.com:getzlab/split_intervals_TOOL.git",
-  task_name = "split_intervals"
+  task_name = "split_intervals",
+  commit = "dc102d8"
 )
 
 cov_collect = wolf.ImportTask(
   task_path = "git@github.com:getzlab/covcollect.git",
-  task_name = "covcollect"
+  task_name = "covcollect",
+  branch = "tot_reads"
 )
 
 ####
@@ -112,10 +114,10 @@ def workflow(
 
   num_cov_seg_samples=5,
 
-  persistant_dry_run = False
+  persistent_dry_run = False
 ):
-    # alert for persistant dry run
-    if persistant_dry_run:
+    # alert for persistent dry run
+    if persistent_dry_run:
         #TODO push this message to canine
         print("WARNING: Skipping file localization in dry run!")
     
@@ -160,7 +162,7 @@ def workflow(
             "t_bai" : tumor_bai,
           },
         token=localization_token,
-        persistent_disk_dry_run = persistant_dry_run
+        persistent_disk_dry_run = persistent_dry_run
         )
         collect_tumor_coverage = True
     elif tumor_coverage_bed is not None:
@@ -176,7 +178,7 @@ def workflow(
             "n_bai" : normal_bai
           },
         token=localization_token,
-        persistent_disk_dry_run = persistant_dry_run
+        persistent_disk_dry_run = persistent_dry_run
         )
         collect_normal_coverage = True
     elif normal_coverage_bed is not None:
@@ -192,8 +194,13 @@ def workflow(
     # collect or load coverage
     # tumor
     if collect_tumor_coverage:
-        primary_contigs = ['chr{}'.format(i) for i in range(1,23)]
-        primary_contigs.extend(['chrX','chrY','chrM'])
+        # FIXME: hack to account for "chr" in hg38 but not in hg19
+        if ref_genome_build == "hg38":
+            primary_contigs = ['chr{}'.format(i) for i in range(1,23)]
+            primary_contigs.extend(['chrX','chrY','chrM'])
+        else:
+            primary_contigs = [str(x) for x in range(1, 23)] + ["X", "Y", "M"]
+
         # create scatter intervals
         split_intervals_task = split_intervals.split_intervals(
           bam = tumor_bam_localization_task["t_bam"],
@@ -204,18 +211,19 @@ def workflow(
 
         # shim task to transform split_intervals files into subset parameters for covcollect task
         @prefect.task
-        def interval_gather(interval_files):
+        def interval_gather(interval_files, primary_contigs):
             ints = []
             for f in interval_files:
                 ints.append(pd.read_csv(f, sep = "\t", header = None, names = ["chr", "start", "end"]))
             #filter non-primary contigs
-            primary_contigs = ['chr{}'.format(i) for i in range(1,23)]
-            primary_contigs.extend(['chrX','chrY','chrM'])
-            full_bed = pd.concat(ints).sort_values(["chr", "start", "end"])
+            full_bed = pd.concat(ints).sort_values(["chr", "start", "end"]).astype({ "chr" : str })
             filtered_bed = full_bed.loc[full_bed.chr.isin(primary_contigs)]
             return filtered_bed
 
-        subset_intervals = interval_gather(split_intervals_task["interval_files"])
+        subset_intervals = interval_gather(
+          split_intervals_task["interval_files"],
+          primary_contigs
+        )
 
         # dispatch coverage scatter
         tumor_cov_collect_task = cov_collect.Covcollect(
@@ -248,11 +256,21 @@ def interval_gather(interval_files):
           ref_fasta = localization_task["ref_fasta"],
           ref_fasta_idx = localization_task["ref_fasta_idx"],
           ref_fasta_dict = localization_task["ref_fasta_dict"],
-          dens_cutoff = 0.58 # TODO: set dynamically
+          use_pod_genotyper = True
         )
 
     # otherwise, run M1 and get it from the BAM
     elif callstats_file is None and tumor_bam is not None and normal_bam is not None:
+        # split het sites file uniformly
+#        split_het_sites = wolf.Task(
+#          name = "split_het_sites",
+#          inputs = { "snp_list" : localization_task["common_snp_list"] },
+#          script = """
+#          sed '/^@/d' ${snp_list} | split -l 10000 -d -a 4 - snp_list_chunk
+#          """,
+#          outputs = { "snp_list_shards" : "snp_list_chunk*" }
+#        )
+
         m1_task = mutect1.mutect1(inputs=dict(
           pairName = "het_coverage",
           caseName = "tumor",
@@ -269,7 +287,12 @@ def interval_gather(interval_files):
           refFastaIdx = localization_task["ref_fasta_idx"],
           refFastaDict = localization_task["ref_fasta_dict"],
 
-          intervals = split_intervals_task["interval_files"]
+          intervals = split_intervals_task["interval_files"],
+          #intervals = split_het_sites["snp_list_shards"],
+
+          exclude_chimeric = True#,
+
+          #force_calling = True,
         ))
 
         hp_scatter = het_pulldown.get_het_coverage_from_callstats(
@@ -278,7 +301,7 @@ def interval_gather(interval_files):
           ref_fasta = localization_task["ref_fasta"],
           ref_fasta_idx = localization_task["ref_fasta_idx"],
           ref_fasta_dict = localization_task["ref_fasta_dict"],
-          dens_cutoff = 0.58 # TODO: set dynamically
+          use_pod_genotyper = True
         )
 
         # gather het pulldown
@@ -347,6 +370,13 @@ def order_indices(bcf_path, bcf_idx_path, localization_task):
 
         F = F.join(F2)
 
+        # prepend "chr" to F's index if it's missing
+        idx = ~F.index.str.contains("^chr")
+        if idx.any():
+            new_index = F.index.values
+            new_index[idx] = "chr" + F.index[idx]
+            F = F.set_index(new_index)
+
         # reference panel BCFs
         R = pd.DataFrame({ "path" : localization_task } ).reset_index()
         F = F.join(R.join(R.loc[R["index"].str.contains("^chr.*_bcf$"), "index"].str.extract(r"(?P<chr>chr[^_]+)"), how = "right").set_index("chr").drop(columns = ["index"]).rename(columns = { "path" : "ref_bcf" }), how = "inner")
@@ -365,8 +395,10 @@ def order_indices(bcf_path, bcf_idx_path, localization_task):
         vcf_idx_in = F["bcf_idx_path"],
         vcf_ref = F["ref_bcf"],
         vcf_ref_idx = F["ref_bcf_idx"],
-        output_file_prefix = "foo"
-      )
+        output_file_prefix = "foo",
+        num_threads = 4,
+      ),
+      resources = { "cpus-per-task" : 4 }
     )
 
     # TODO: run whatshap
@@ -429,55 +461,90 @@ def get_chunks(scatter_chunks):
      }
     )
     
-    hapaseg_arm_concat_task = hapaseg.Hapaseg_concat_arms(
-        inputs={
-        "arm_results":[hapaseg_arm_AMCMC_task["arm_level_MCMC"]],
-        "ref_fasta":localization_task["ref_fasta"] #pickle load will import capy
-        }
-    )
+#    hapaseg_arm_concat_task = hapaseg.Hapaseg_concat_arms(
+#        inputs={
+#        "arm_results":[hapaseg_arm_AMCMC_task["arm_level_MCMC"]],
+#        "ref_fasta":localization_task["ref_fasta"] #pickle load will import capy
+#        }
+#    )
+#    @prefect.task
+#    def get_arm_samples_range(arm_concat_object):
+#        obj = np.load(arm_concat_object)
+#        n_samples_range = list(range(int(obj["n_samps"])))
+#        return n_samples_range
+#    
+#    n_samps_range = get_arm_samples_range(hapaseg_arm_concat_task["num_samples_obj"])
+
+    # concat arm level results
     @prefect.task
-    def get_arm_samples_range(arm_concat_object):
-        obj = np.load(arm_concat_object)
-        n_samples_range = list(range(int(obj["n_samps"])))
-        return n_samples_range
-    
-    n_samps_range = get_arm_samples_range(hapaseg_arm_concat_task["num_samples_obj"])
-    
+    def concat_arm_level_results(arm_results):
+        A = []
+        for arm_file in arm_results:
+            with open(arm_file, "rb") as f:
+                H = pickle.load(f)
+                A.append(pd.Series({ "chr" : H.P["chr"].iloc[0], "start" : H.P["pos"].iloc[0], "end" : H.P["pos"].iloc[-1], "results" : H }))
+
+        # get into order
+        A = pd.concat(A, axis = 1).T.sort_values(["chr", "start", "end"]).reset_index(drop = True)
+
+        # save
+        _, tmpfile = tempfile.mkstemp(  )
+        A.to_pickle(tmpfile)
+
+        return tmpfile
+
+    arm_concat = concat_arm_level_results(hapaseg_arm_AMCMC_task["arm_level_MCMC"])
+
     ## run DP
 
-    # scatter DP
     hapaseg_allelic_DP_task = hapaseg.Hapaseg_allelic_DP(
      inputs = {
-       "seg_dataframe" : hapaseg_arm_concat_task["arm_cat_results_pickle"],
-       "n_dp_iter" : 10,   # TODO: allow to be specified?
-       "seg_samp_idx" : n_samps_range,
-       "cytoband_file" : localization_task["cytoband_file"], # TODO: allow to be specified
+       "seg_dataframe" : arm_concat,
+       #"seg_dataframe" : hapaseg_arm_concat_task["arm_cat_results_pickle"],
+       "cytoband_file" : localization_task["cytoband_file"],
        "ref_fasta" : localization_task["ref_fasta"],
        "ref_fasta_idx" : localization_task["ref_fasta_idx"],  # not used; just supplied for symlink
        "ref_fasta_dict" : localization_task["ref_fasta_dict"] # not used; just supplied for symlink
      }
     )
-    
-    ##collect DP results
-    collect_adp_task = hapaseg.Hapaseg_collect_adp(
-        inputs={"dp_results":[hapaseg_allelic_DP_task["cluster_and_phase_assignments"]]
-               }
-    )
-    
-    ### coverage tasks ####
+
+    #
+    # coverage tasks
+    #
 
     # prepare coverage MCMC
     prep_cov_mcmc_task = hapaseg.Hapaseg_prepare_coverage_mcmc(
     inputs={
         "coverage_csv":tumor_cov_gather_task["coverage"], #each scatter result is the same
-        "allelic_clusters_object":collect_adp_task["full_dp_results"],
-        "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same
+        "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"],
+        "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'],
+        "segmentations_pickle":hapaseg_allelic_DP_task['segmentation_breakpoints'],
         "repl_pickle":ref_config["repl_file"],
         "gc_pickle":ref_config["gc_file"],
-        "ref_file_path":localization_task["ref_fasta"]
+        "ref_fasta":localization_task["ref_fasta"],
+        "bin_width":bin_width if wgs else 1
         }
     )
-    
+
+    # shim task to get number of allelic segments
+    #   (coverage MCMC will be scattered over each allelic segment)
+    @prefect.task
+    def get_N_seg_groups(S):
+        return list(range(len(pd.read_pickle(S))))
+
+    cov_mcmc_shard_range = get_N_seg_groups(prep_cov_mcmc_task["allelic_seg_groups"])
+
+    # coverage MCMC burnin(?) <- do we still need to burnin separately?
+    cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc(
+        inputs={
+            "preprocess_data":prep_cov_mcmc_task["preprocess_data"],
+            "allelic_seg_indices":prep_cov_mcmc_task["allelic_seg_groups"],
+            "allelic_seg_scatter_idx":cov_mcmc_shard_range,
+            "num_draws":50,
+            "bin_width":bin_width,
+        }
+    )
+ 
     #get the cluster indices from the preprocess data and generate the burnin indices
     @prefect.task(nout=4)
     def _get_ADP_cluster_list(preprocess_data_obj):
@@ -506,9 +573,9 @@ def _get_ADP_cluster_list(preprocess_data_obj):
         cluster_idxs = [i for i in np.arange(num_clusters)]
         print(cluster_idxs, cluster_list, range_list) 
         return len(cluster_idxs), cluster_idxs, cluster_list, range_list
-    
+
     num_clusters, cluster_idxs, cluster_list, range_list = _get_ADP_cluster_list(prep_cov_mcmc_task["preprocess_data"])
-    
+
     # coverage MCMC burnin
     cov_mcmc_burnin_task = hapaseg.Hapaseg_coverage_mcmc_burnin(
         inputs={
@@ -519,7 +586,7 @@ def _get_ADP_cluster_list(preprocess_data_obj):
             "range":range_list
         }
     )
-    
+
     # coverage MCMC scatter post-burnin
     cov_mcmc_scatter_task = hapaseg.Hapaseg_coverage_mcmc(
         inputs={
@@ -530,7 +597,7 @@ def _get_ADP_cluster_list(preprocess_data_obj):
             "burnin_files":[cov_mcmc_burnin_task["burnin_data"]] * num_clusters # this is to account for a wolf input len bug
         }
     )
-    
+
     # collect coverage MCMC
     cov_mcmc_gather_task = hapaseg.Hapaseg_collect_coverage_mcmc(
     inputs = {
@@ -539,6 +606,7 @@ def _get_ADP_cluster_list(preprocess_data_obj):
         "bin_width":bin_width
         }
     )
+
     # coverage DP
     cov_dp_task = hapaseg.Hapaseg_coverage_dp(
     inputs = {
@@ -550,27 +618,26 @@ def _get_ADP_cluster_list(preprocess_data_obj):
         "bin_width":bin_width
         }
     )
-    
+
     #get the adp draw number from the preprocess data object
     @prefect.task
     def _get_ADP_draw_num(preprocess_data_obj):
         return int(np.load(preprocess_data_obj)["adp_cluster"])
     
     adp_draw_num = _get_ADP_draw_num(prep_cov_mcmc_task["preprocess_data"])
-    
-    # generate acdp dataframe
 
+    # generate acdp dataframe 
     gen_acdp_task = hapaseg.Hapaseg_acdp_generate_df(
     inputs = {
         "SNPs_pickle":hapaseg_allelic_DP_task['all_SNPs'][0], #each scatter result is the same
-        "allelic_clusters_object":collect_adp_task["full_dp_results"],
+        "allelic_clusters_object":hapaseg_allelic_DP_task["cluster_and_phase_assignments"],
         "cdp_filepaths":[cov_dp_task["cov_dp_object"]],
         "allelic_draw_index":adp_draw_num,
         "ref_file_path":localization_task["ref_fasta"],
         "bin_width":bin_width
         }
     )
-    
+
     # run acdp
     acdp_task = hapaseg.Hapaseg_run_acdp(
     inputs = {
@@ -582,18 +649,20 @@ def _get_ADP_draw_num(preprocess_data_obj):
     )
 
     #cleanup by deleting bam disks. we make seperate tasks for the bams
-    if not persistant_dry_run and t_bam is not None and t_bai is not None:
+    if not persistent_dry_run and tumor_bam is not None and tumor_bai is not None:
         delete_tbams_task = DeleteDisk(
           inputs = {
             "disk" : [tumor_bam_localization_task["t_bam"], tumor_bam_localization_task["t_bai"]],
             "upstream" : m1_task["mutect1_cs"] if callstats_file is None else tumor_cov_gather_task["coverage"] 
+          }
      )
      
-    if not persistant_dry_run and n_bam is not None and n_bai is not None:
+    if not persistent_dry_run and normal_bam is not None and normal_bai is not None:
         delete_nbams_task = DeleteDisk(
           inputs = {
             "disk" : [normal_bam_localization_task["n_bam"], normal_bam_localization_task["n_bai"]],
             "upstream" : m1_task["mutect1_cs"]
+          }
     )
     #also delete the cached files disk
     delete_file_disk_task = DeleteDisk(