From 030b3e08eb3d44be3765b1147cf6ed2e8e6196c2 Mon Sep 17 00:00:00 2001 From: Ania Szymanska Date: Thu, 27 May 2021 17:23:54 +0200 Subject: [PATCH 1/3] bug fixes --- common_tads.py | 3 ++- moc.py | 3 ++- plot_venn.py | 3 ++- tools/measure_of_concordance.py | 11 +++++++---- tools/str2bool.py | 12 ++++++++++++ 5 files changed, 25 insertions(+), 7 deletions(-) create mode 100644 tools/str2bool.py diff --git a/common_tads.py b/common_tads.py index 247d1c3..9afe8c5 100755 --- a/common_tads.py +++ b/common_tads.py @@ -15,6 +15,7 @@ from tools.common_domains import find_common_domains, common_domains_multiple_sets, save_domains_matrix from os import path import glob +from tools.str2bool import str2bool def main(): @@ -28,7 +29,7 @@ def main(): parser.add_argument("-o", "--output", help="Directory to save output file. Output is saved only when analysing multiple sets of TADs " "(When --bedfile_1 is a directory. If None save in input directory.", default=None) - parser.add_argument("-r", "--report", help="If True print output matrix to stdout. Default=True", default=True, type=bool) + parser.add_argument("-r", "--report", help="If True print output matrix to stdout. Default=True", default=True, type=str2bool) parser.add_argument("-s", "--shift", default=0, type=int, help="Accepted shift of two domain boundaries positions in base pair.") args = parser.parse_args() diff --git a/moc.py b/moc.py index 5c47d35..af04acb 100755 --- a/moc.py +++ b/moc.py @@ -9,6 +9,7 @@ save_moc_matrix, add_row_and_columns_id from os import path import glob +from tools.str2bool import str2bool def main(): @@ -23,7 +24,7 @@ def main(): parser.add_argument("-o", "--output", help="Directory to save output file. Output is saved only when analysing multiple sets of TADs " "(When --bedfile_1 is a directory. If None save in input directory.", default=None) - parser.add_argument("-r", "--report", help="If True return MoC to stdout. Default=True", default=True) + parser.add_argument("-r", "--report", help="If True return MoC to stdout. Default=True", default=True, type=str2bool) args = parser.parse_args() if args.output: diff --git a/plot_venn.py b/plot_venn.py index a4d67c9..59deb49 100755 --- a/plot_venn.py +++ b/plot_venn.py @@ -8,6 +8,7 @@ import glob from tools.measure_of_concordance import read_domains_from_bedfile from tools.common_domains import plot_venn_diagram_of_3_sets, plot_venn_diagram_of_2_sets +from tools.str2bool import str2bool def main(): @@ -16,7 +17,7 @@ def main(): help='List of files (two or three) with different domains sets.', required=True) parser.add_argument("-o", "--output", help="Directory or filename to save a plot in.", default=None) - parser.add_argument("-s", "--show", help="If True show the plot.", default=True, type=bool) + parser.add_argument("-s", "--show", help="If True show the plot.", default=True, type=str2bool) args = parser.parse_args() if args.output: diff --git a/tools/measure_of_concordance.py b/tools/measure_of_concordance.py index 6cbd96f..255083f 100644 --- a/tools/measure_of_concordance.py +++ b/tools/measure_of_concordance.py @@ -23,7 +23,7 @@ def read_domains_from_bedfile(bedfile): def calculate_moc(set1, set2): n1 = len(set1) # number fo domains in set 1 n2 = len(set2) # number of domains in set 2 - if set1 == set2 == 1: + if n1 == n2 == 1: return 1 moc = 0 for i in range(n1): @@ -31,8 +31,8 @@ def calculate_moc(set1, set2): for j in range(n2): if set1[i][0] == set2[j][0]: # check if same chromosome q_j = set2[j][2] - set2[j][1] # domain length - overlap = len(check_overlap(start1=set1[i][1], end1=set1[i][2], start2=set2[j][1], - end2=set2[j][2])) # calculate overlap in bp + overlap = check_overlap(start1=set1[i][1], end1=set1[i][2], start2=set2[j][1], + end2=set2[j][2]) # calculate overlap in bp if overlap > 0: moc += (overlap ** 2 / (p_i * q_j)) return (1 / ((n1 * n2) ** (1 / 2) - 1)) * (moc - 1) @@ -40,7 +40,10 @@ def calculate_moc(set1, set2): def check_overlap(start1, end1, start2, end2): """Return the overlap between two domains""" - return range(max(start1, start2), min(end1, end2 + 1)) + overlap = min(end1, end2) - max(start1, start2) + if overlap < 0: + overlap = 0 + return overlap def moc_for_multiple_sets(set_list): diff --git a/tools/str2bool.py b/tools/str2bool.py new file mode 100644 index 0000000..db57006 --- /dev/null +++ b/tools/str2bool.py @@ -0,0 +1,12 @@ +import argparse + +def str2bool(v): + if isinstance(v, bool): + return v + v = v.lower() + if v in ('true', 't', '1'): + return True + elif v in ('false', 'f', '0'): + return False + else: + raise argparse.ArgumentTypeError() From 350f2c50f8797a2140b77ee49f31ed3c8b0d8626 Mon Sep 17 00:00:00 2001 From: Ania Szymanska Date: Sun, 6 Jun 2021 16:20:49 +0200 Subject: [PATCH 2/3] set number of common domains at diagonal (instead of 0) --- tools/common_domains.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/common_domains.py b/tools/common_domains.py index 46095bb..236e36e 100644 --- a/tools/common_domains.py +++ b/tools/common_domains.py @@ -56,10 +56,11 @@ def common_domains_multiple_sets(domains_sets, shift): """Return a matrix with common domains for different sets.""" matr = np.zeros((len(domains_sets), len(domains_sets)), dtype=int) for i in range(len(domains_sets)): - for j in range(len(domains_sets)): + for j in range(i, len(domains_sets)): if i == j: - continue - matr[i][j] = len(find_common_domains(set1=domains_sets[i], set2=domains_sets[j], shift=shift)) + matr[i][j] = len(domains_sets[i]) + else: + matr[i][j] = matr[j][i] = len(find_common_domains(set1=domains_sets[i], set2=domains_sets[j], shift=shift)) return matr From b2451d87e1fc67192616ea532d7dddd6d6268f56 Mon Sep 17 00:00:00 2001 From: Ania Szymanska Date: Sun, 6 Jun 2021 16:56:23 +0200 Subject: [PATCH 3/3] fix check shifted domains - faster and ranged changed from [) to [] --- tools/common_domains.py | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/tools/common_domains.py b/tools/common_domains.py index 236e36e..a511f95 100644 --- a/tools/common_domains.py +++ b/tools/common_domains.py @@ -14,12 +14,8 @@ def find_common_domains(set1, set2, shift): common_domains = [] for i in range(len(set1)): for j in range(len(set2)): - if shift == 0: - if check_domains(domain1=set1[i], domain2=set2[j]): - common_domains.append(set1[i]) - else: - if check_shifted_domains(domain1=set1[i], domain2=set2[j], shift=shift): - common_domains.append(set1[i]) + if check_shifted_domains(domain1=set1[i], domain2=set2[j], shift=shift): + common_domains.append(set1[i]) return common_domains @@ -27,25 +23,12 @@ def check_shifted_domains(domain1, domain2, shift): """Check if two domains positions are identical (with accepted shift)""" if domain1[0] != domain2[0]: return False - if domain2[1] - shift < 0: - start = 0 - else: - start = domain2[1] - shift - if domain1[1] in range(start, domain2[1] + shift): - if domain1[2] in range(domain2[2] - shift, domain2[2] + shift): + if domain2[1] - shift <= domain1[1] <= domain2[1] + shift: + if domain2[2] - shift <= domain1[2] <= domain2[2] + shift: return True return False -def check_domains(domain1, domain2): - """Check if two domains have exact same boundaries""" - if domain2[0] != domain1[0]: - return False - if domain1[1] == domain2[1] and domain1[2] == domain2[2]: - return True - return False - - def save_domains_matrix(tad_matrix, outfile): """Format nicely and save conserved domains matrix with sets names""" tad_matrix.to_csv(outfile, sep=",", header=True, index=True)