From e9853730f9d11380fe0443bbbc05f4b150acf8c6 Mon Sep 17 00:00:00 2001 From: xonq Date: Thu, 22 May 2025 21:54:21 +0000 Subject: [PATCH 01/12] init update --- scripts/README.md | 18 ++++ scripts/extract_nextclades.py | 167 ++++++++++++++++++++++++++++++++++ 2 files changed, 185 insertions(+) create mode 100644 scripts/extract_nextclades.py diff --git a/scripts/README.md b/scripts/README.md index 3ebc437..a3ba4ba 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -146,4 +146,22 @@ MiniWDL is a required package #### usage ```bash $ python update_taxon_tables_io.py -r -i +``` + +### extract_nextclades.py + +`extract_nextclades.py` extracts the nucleotide mutations - optionally including the amino acid mutations - associated with clade metadata, and produces a Nextclade compatible `clades.tsv` for building a Nextclade dataset. This script enables swift creation of a genotyping Nextclade module. Clade metadata that is not monophyletic within the reference phylogeny is omitted. + +#### requirements +- theiaphylo (`python3 -m pip install theiaphylo`) + +#### usage +```bash +$ python extract_nextclades.py \ + -t \ + -m \ + -c ... \ + -tc \ + -nt \ + -aa [AA_MUTS.json] ``` \ No newline at end of file diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py new file mode 100644 index 0000000..ef9379c --- /dev/null +++ b/scripts/extract_nextclades.py @@ -0,0 +1,167 @@ +#! /usr/bin/env python3 + +import re +import sys +import json +import logging +import argparse +import pandas as pd +from ete3 import Tree +from collections import defaultdict +from theiaphylo.lib.StdPath import Path +from theiaphylo.lib.TheiaPhylo import * + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) +logger = logging.getLogger(__name__) + + +def id_clade_mrca(tree, metadf, clade_col, clade): + # extract df for clade + clade_df = metadf[metadf[clade_col] == clade] + + # get tips from clade + clade_tips = sorted(set(clade_df.index)) + if len(clade_tips) == 1: + # clade is a single tip, extract the node as the leaf name + return clade_tips[0] + # get MRCA + mrca = tree.lowest_common_ancestor(clade_tips) + mrca_tips = sorted(x.name for x in mrca.iter_tips()) + + # extract clades associated with tips + mrca_df = metadf.loc[mrca_tips] + + # identify clades descended from the MRCA + mrca_clades = sorted(set(x for x in mrca_df[clade_col] if not pd.isna(x))) + + # clade is not monophyletic + if len(mrca_clades) > 1: + conflicts = sorted(set(mrca_clades) - {clade}) + logger.warning( + f"{clade} is not monophyletic; conflicts with {conflicts} - Skipping." + ) + return None + # extract node from monophyletic clade + else: + # convert to ete3 object + ete_tree = Tree( + tree.get_newick(with_node_names=True, escape_name=False), format=8 + ) + # get the node name from the ete3 tree + mrca_node = ( + ete_tree.get_common_ancestor(mrca_tips) + .name.replace("'", "") + .replace('"', "") + ) + # clade2node[clade] = + return mrca_node + + +def write_clade_muts(clade2muts, out_file): + with open(out_file, "w") as f: + f.write("clade\tgene\tsite\talt\n") + for clade, mut_dict in clade2muts.items(): + for mut in mut_dict["nt"]: + mut_components = re.search(r"\D+(\d+)(\D+)", mut) + if mut_components: + site = mut_components.group(1) + alt = mut_components.group(2) + f.write(f"{clade}\tnuc\t{site}\t{alt}\n") + for prot, muts in mut_dict["aa"].items(): + for mut in muts: + mut_components = re.search(r"\D+(\d+)(\D+)", mut) + if mut_components: + site = mut_components.group(1) + alt = mut_components.group(2) + f.write(f"{clade}\t{prot}\t{site}\t{alt}\n") + + +def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()): + + # remove metadata entries that are not in the tree + metadf = metadf[metadf.index.isin(tree.get_tip_names())] + + clade2muts = defaultdict(lambda: {"nt": [], "aa": {}}) + for clade_col in clade_cols: + # get the clades + clades = sorted( + set([x for x in metadf[clade_col] if not pd.isna(x) and x not in excluded]) + ) + for clade in clades: + mrca_node = id_clade_mrca(tree, metadf, clade_col, clade) + if mrca_node: + logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}") + clade2muts[clade]["nt"] = nt_muts["nodes"][mrca_node]["muts"] + if aa_muts: + for prot, muts in aa_muts["nodes"][mrca_node]["aa_muts"].items(): + clade2muts[clade]["aa"][prot] = muts + return clade2muts + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Extract mutations from monophyletic clades for Nextclade genotyping" + ) + parser.add_argument( + "-t", "--tree", required=True, help="Path to the Augur-refined newick" + ) + parser.add_argument("-m", "--metadata", required=True, help="Path to metadata TSV") + parser.add_argument( + "-cc", "--clade_cols", nargs="*", required=True, help="Clade columns to extract" + ) + parser.add_argument( + "-tc", "--tip_col", required=True, help="Column in metadata to use as tip label" + ) + parser.add_argument("-nt", "--nt_muts", required=True, help="Path to nt_muts JSON") + parser.add_argument("-aa", "--aa_muts", help="Path to aa_muts JSON") + parser.add_argument( + "-e", "---exclude", nargs="*", help="Clades to exclude from analysis" + ) + parser.add_argument( + "-r", "--root", nargs="*", help="Root tip(s) / node for rooting" + ) + parser.add_argument( + "-o", "--output", help="Output file name. DEFAULT: 'clades.tsv'" + ) + args = parser.parse_args() + + # load the tree + tree = import_tree(Path(args.tree), outgroup=args.root) + + # load the metadata + metadata_file = Path(args.metadata) + if metadata_file.endswith(".csv"): + metadf = pd.read_csv(Path(metadata_file), index_col=args.tip_col) + else: + metadf = pd.read_csv(Path(metadata_file), sep="\t", index_col=args.tip_col) + + # load the mutation files + with open(Path(args.nt_muts), "r") as f: + nt_muts = json.load(f) + # load the aa_muts file if provided + if args.aa_muts: + with open(Path(args.aa_muts), "r") as f: + aa_muts = json.load(f) + else: + aa_muts = None + + if args.exclude: + exclusion_clades = set(args.exclude) + else: + exclusion_clades = set() + + clade2muts = main( + tree, metadf, args.clade_cols, nt_muts, aa_muts, excluded=exclusion_clades + ) + + # write the output + if args.output: + output_file = Path(args.output) + else: + output_file = Path("clades.tsv") + write_clade_muts(clade2muts, output_file) + sys.exit(0) From 2e2296b0bf417f8eba3a2bc9d75e6a3f6bdfca51 Mon Sep 17 00:00:00 2001 From: xonq Date: Fri, 23 May 2025 15:22:07 +0000 Subject: [PATCH 02/12] new package name --- scripts/extract_nextclades.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index ef9379c..e98b07c 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -9,7 +9,7 @@ from ete3 import Tree from collections import defaultdict from theiaphylo.lib.StdPath import Path -from theiaphylo.lib.TheiaPhylo import * +from theiaphylo.theiaPhylo import * logging.basicConfig( level=logging.INFO, From ab248416f660619ccdce5223b2f92d4ce5d9d058 Mon Sep 17 00:00:00 2001 From: xonq Date: Sat, 31 May 2025 00:10:57 +0000 Subject: [PATCH 03/12] correct library call --- scripts/extract_nextclades.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index e98b07c..df509e4 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -9,7 +9,7 @@ from ete3 import Tree from collections import defaultdict from theiaphylo.lib.StdPath import Path -from theiaphylo.theiaPhylo import * +from theiaphylo.theiaphylo import * logging.basicConfig( level=logging.INFO, From f774a40e5d03dc1dfe680f90ed7b19bf15f7210d Mon Sep 17 00:00:00 2001 From: xonq Date: Tue, 10 Jun 2025 15:30:15 +0000 Subject: [PATCH 04/12] allow for incomplete taxa --- scripts/extract_nextclades.py | 36 ++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index df509e4..9bfc9cd 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) -def id_clade_mrca(tree, metadf, clade_col, clade): +def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): # extract df for clade clade_df = metadf[metadf[clade_col] == clade] @@ -33,7 +33,11 @@ def id_clade_mrca(tree, metadf, clade_col, clade): mrca_tips = sorted(x.name for x in mrca.iter_tips()) # extract clades associated with tips - mrca_df = metadf.loc[mrca_tips] + if noncomprehensive: + mrca_df_tips = sorted(set(metadf.index).intersection(set(mrca_tips))) + mrca_df = metadf.loc[mrca_df_tips] + else: + mrca_df = metadf.loc[mrca_tips] # identify clades descended from the MRCA mrca_clades = sorted(set(x for x in mrca_df[clade_col] if not pd.isna(x))) @@ -80,7 +84,15 @@ def write_clade_muts(clade2muts, out_file): f.write(f"{clade}\t{prot}\t{site}\t{alt}\n") -def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()): +def main( + tree, + metadf, + clade_cols, + nt_muts, + aa_muts=None, + excluded=set(), + noncomprehensive=False, +): # remove metadata entries that are not in the tree metadf = metadf[metadf.index.isin(tree.get_tip_names())] @@ -92,7 +104,9 @@ def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()): set([x for x in metadf[clade_col] if not pd.isna(x) and x not in excluded]) ) for clade in clades: - mrca_node = id_clade_mrca(tree, metadf, clade_col, clade) + mrca_node = id_clade_mrca( + tree, metadf, clade_col, clade, noncomprehensive=noncomprehensive + ) if mrca_node: logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}") clade2muts[clade]["nt"] = nt_muts["nodes"][mrca_node]["muts"] @@ -124,6 +138,12 @@ def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()): parser.add_argument( "-r", "--root", nargs="*", help="Root tip(s) / node for rooting" ) + parser.add_argument( + "-n", + "--noncomprehensive", + action="store_true", + help="Accept missing metadata for tips", + ) parser.add_argument( "-o", "--output", help="Output file name. DEFAULT: 'clades.tsv'" ) @@ -155,7 +175,13 @@ def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()): exclusion_clades = set() clade2muts = main( - tree, metadf, args.clade_cols, nt_muts, aa_muts, excluded=exclusion_clades + tree, + metadf, + args.clade_cols, + nt_muts, + aa_muts, + excluded=exclusion_clades, + noncomprehensive=args.noncomprehensive, ) # write the output From d83cb4b35e5c58302c820b3d3b339338cda706ea Mon Sep 17 00:00:00 2001 From: xonq Date: Tue, 10 Jun 2025 17:31:03 +0000 Subject: [PATCH 05/12] formatting --- scripts/extract_nextclades.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index 9bfc9cd..52d2785 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -20,6 +20,8 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): + """Identify the most recent common ancestor (MRCA) of a clade if it is monophyletic. + Otherwise report a warning and return None.""" # extract df for clade clade_df = metadf[metadf[clade_col] == clade] @@ -66,6 +68,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): def write_clade_muts(clade2muts, out_file): + """Extract and write the mutations for each clade to a TSV file.""" with open(out_file, "w") as f: f.write("clade\tgene\tsite\talt\n") for clade, mut_dict in clade2muts.items(): @@ -93,7 +96,7 @@ def main( excluded=set(), noncomprehensive=False, ): - + """Main function to extract mutations from clades.""" # remove metadata entries that are not in the tree metadf = metadf[metadf.index.isin(tree.get_tip_names())] From f3636b198b7fda21d2ce94668e8cf99260386672 Mon Sep 17 00:00:00 2001 From: xonq Date: Tue, 17 Jun 2025 16:55:00 +0000 Subject: [PATCH 06/12] increase logging --- scripts/extract_nextclades.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index 52d2785..68cff62 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -29,6 +29,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): clade_tips = sorted(set(clade_df.index)) if len(clade_tips) == 1: # clade is a single tip, extract the node as the leaf name + logger.info(f"{clade} mutations will be derived from one tip: {clade_tips[0]}") return clade_tips[0] # get MRCA mrca = tree.lowest_common_ancestor(clade_tips) From 23ecab390b932d828d70c2f4f9373bcfdbda7c9c Mon Sep 17 00:00:00 2001 From: xonq Date: Fri, 20 Jun 2025 22:44:27 +0000 Subject: [PATCH 07/12] enable identification of uniquely defining mutations --- scripts/extract_nextclades.py | 98 +++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 10 deletions(-) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index 68cff62..f72b074 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -19,7 +19,9 @@ logger = logging.getLogger(__name__) -def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): +def id_clade_mrca( + tree, metadf, clade_col, clade, noncomprehensive=False, skip_singletons=True +): """Identify the most recent common ancestor (MRCA) of a clade if it is monophyletic. Otherwise report a warning and return None.""" # extract df for clade @@ -28,9 +30,17 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): # get tips from clade clade_tips = sorted(set(clade_df.index)) if len(clade_tips) == 1: - # clade is a single tip, extract the node as the leaf name - logger.info(f"{clade} mutations will be derived from one tip: {clade_tips[0]}") - return clade_tips[0] + if skip_singletons: + logger.warning( + f"{clade} is a singleton clade with one tip: {clade_tips[0]} - Skipping." + ) + return None, None + else: + # clade is a single tip, extract the node as the leaf name + logger.info( + f"{clade} mutations will be derived from one tip: {clade_tips[0]}" + ) + return clade_tips[0], clade_tips[0] # get MRCA mrca = tree.lowest_common_ancestor(clade_tips) mrca_tips = sorted(x.name for x in mrca.iter_tips()) @@ -51,7 +61,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): logger.warning( f"{clade} is not monophyletic; conflicts with {conflicts} - Skipping." ) - return None + return None, None # extract node from monophyletic clade else: # convert to ete3 object @@ -65,7 +75,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False): .replace('"', "") ) # clade2node[clade] = - return mrca_node + return mrca_node, mrca_tips def write_clade_muts(clade2muts, out_file): @@ -88,19 +98,54 @@ def write_clade_muts(clade2muts, out_file): f.write(f"{clade}\t{prot}\t{site}\t{alt}\n") +def check_nt_uniqueness(muts_set, in_node, nt2muts): + """Check if the mutations are unique enough to call. Can not compile mutations up the tree, so non-comprehensive. See export JSON function""" + for node, muts_d in nt2muts["nodes"].items(): + if node != in_node: + # if all the mutations are present in another node, return False + if not muts_set.difference(set(muts_d["muts"])): + return False + return True + + +def compile_export_json(tree_json): + """Compile clades to export nodes""" + node2clades = {} + for child in tree_json["children"]: + if "children" in child: + node2clades = {**node2clades, **compile_export_json(child)} + if "node_attrs" in child: + if "clade_membership" in child["node_attrs"]: + node2clades[child["name"]] = child["node_attrs"]["clade_membership"][ + "value" + ] + return node2clades + + def main( tree, metadf, clade_cols, nt_muts, aa_muts=None, + export_json=None, excluded=set(), noncomprehensive=False, + skip_singletons=True, ): """Main function to extract mutations from clades.""" # remove metadata entries that are not in the tree metadf = metadf[metadf.index.isin(tree.get_tip_names())] + clades2nodes = defaultdict(set) + # populate a dictionary of clades to reference + if export_json: + node2clades = compile_export_json(export_json["tree"]) + tips = set(x.name for x in tree.iter_tips()) + for node, clade in node2clades.items(): + if node in tips: + clades2nodes[clade].add(node) + clade2muts = defaultdict(lambda: {"nt": [], "aa": {}}) for clade_col in clade_cols: # get the clades @@ -108,15 +153,33 @@ def main( set([x for x in metadf[clade_col] if not pd.isna(x) and x not in excluded]) ) for clade in clades: - mrca_node = id_clade_mrca( - tree, metadf, clade_col, clade, noncomprehensive=noncomprehensive + mrca_node, mrca_tips = id_clade_mrca( + tree, + metadf, + clade_col, + clade, + noncomprehensive=noncomprehensive, + skip_singletons=skip_singletons, ) if mrca_node: logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}") - clade2muts[clade]["nt"] = nt_muts["nodes"][mrca_node]["muts"] + clade_nt_muts = nt_muts["nodes"][mrca_node]["muts"] + # check for unique mutations + if clades2nodes: + if clades2nodes[clade].difference(set(mrca_tips)): + logger.warning( + f"{clade} does not have distinguishing mutations - Skipping." + ) + continue + # if check_nt_uniqueness(set(clade_nt_muts), mrca_node, nt_muts): + clade2muts[clade]["nt"] = clade_nt_muts if aa_muts: for prot, muts in aa_muts["nodes"][mrca_node]["aa_muts"].items(): clade2muts[clade]["aa"][prot] = muts + # else: + # logger.warning( + # f"{clade} mutations are not unique" + # ) return clade2muts @@ -137,8 +200,9 @@ def main( parser.add_argument("-nt", "--nt_muts", required=True, help="Path to nt_muts JSON") parser.add_argument("-aa", "--aa_muts", help="Path to aa_muts JSON") parser.add_argument( - "-e", "---exclude", nargs="*", help="Clades to exclude from analysis" + "-e", "--export_json", help="AUGUR export JSON to cross-reference clades" ) + parser.add_argument("---exclude", nargs="*", help="Clades to exclude from analysis") parser.add_argument( "-r", "--root", nargs="*", help="Root tip(s) / node for rooting" ) @@ -148,6 +212,12 @@ def main( action="store_true", help="Accept missing metadata for tips", ) + parser.add_argument( + "-s", + "--skip_singletons", + action="store_true", + help="Skip singletons (clades with one tip)", + ) parser.add_argument( "-o", "--output", help="Output file name. DEFAULT: 'clades.tsv'" ) @@ -173,6 +243,12 @@ def main( else: aa_muts = None + if args.export_json: + with open(Path(args.export_json), "r") as f: + export_json = json.load(f) + else: + export_json = None + if args.exclude: exclusion_clades = set(args.exclude) else: @@ -184,8 +260,10 @@ def main( args.clade_cols, nt_muts, aa_muts, + export_json, excluded=exclusion_clades, noncomprehensive=args.noncomprehensive, + skip_singletons=args.skip_singletons, ) # write the output From 2da87745a5b191738182ea789044c4d4bf202dd0 Mon Sep 17 00:00:00 2001 From: xonq Date: Fri, 11 Jul 2025 22:12:50 +0000 Subject: [PATCH 08/12] automatically detect non-unique clade-defining mutations and fetch new ones --- scripts/README.md | 5 +- scripts/extract_nextclades.py | 116 +++++++++++++++++++--------------- 2 files changed, 68 insertions(+), 53 deletions(-) diff --git a/scripts/README.md b/scripts/README.md index a3ba4ba..bec6391 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -163,5 +163,6 @@ $ python extract_nextclades.py \ -c ... \ -tc \ -nt \ - -aa [AA_MUTS.json] -``` \ No newline at end of file + -aa [AA_MUTS.json] \ + -n +``` diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index f72b074..fc7d1a0 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -9,7 +9,7 @@ from ete3 import Tree from collections import defaultdict from theiaphylo.lib.StdPath import Path -from theiaphylo.theiaphylo import * +from theiaphylo.phyloutils import * logging.basicConfig( level=logging.INFO, @@ -19,6 +19,27 @@ logger = logging.getLogger(__name__) +def compile_tip2mutations(tree, tip, nt_muts, aa_muts=None): + """Compile mutations for a given tip from the nt_muts and aa_muts JSON.""" + tip_muts = {"nt": set(), "aa": defaultdict(set)} + par_node = tree.get_node_matching_name(tip) + # iteratively accumulate all mutations from the tip to the root + while par_node is not None: + node_name = par_node.name + tip_muts["nt"].update(set(nt_muts["nodes"][node_name]["muts"])) + par_node = par_node.parent + + # iteratively accumulate all amino acid mutations from the tip to the root + if aa_muts: + par_node = tree.get_node_matching_name(tip) + while par_node is not None: + node_name = par_node.name + for prot, muts in aa_muts["nodes"][node_name]["aa_muts"].items(): + tip_muts["aa"][prot].update(set(muts)) + par_node = par_node.parent + return tip_muts + + def id_clade_mrca( tree, metadf, clade_col, clade, noncomprehensive=False, skip_singletons=True ): @@ -37,9 +58,9 @@ def id_clade_mrca( return None, None else: # clade is a single tip, extract the node as the leaf name - logger.info( - f"{clade} mutations will be derived from one tip: {clade_tips[0]}" - ) + # logger.info( + # f"{clade} mutations will be derived from one tip: {clade_tips[0]}" + # ) return clade_tips[0], clade_tips[0] # get MRCA mrca = tree.lowest_common_ancestor(clade_tips) @@ -98,28 +119,35 @@ def write_clade_muts(clade2muts, out_file): f.write(f"{clade}\t{prot}\t{site}\t{alt}\n") -def check_nt_uniqueness(muts_set, in_node, nt2muts): +def check_nt_uniqueness(muts_set, tips2muts, mrca_tips, tree, mrca_node, nt_muts): """Check if the mutations are unique enough to call. Can not compile mutations up the tree, so non-comprehensive. See export JSON function""" - for node, muts_d in nt2muts["nodes"].items(): - if node != in_node: - # if all the mutations are present in another node, return False - if not muts_set.difference(set(muts_d["muts"])): - return False - return True + failing_tips = [] + for tip in set(tips2muts.keys()).difference(set(mrca_tips)): + tip_muts = tips2muts[tip]["nt"] + if not muts_set.difference(tip_muts): + failing_tips.append(tip) + # if there are failing tips, traverse the tree to get unique mutations + if failing_tips: + parent = tree.get_node_matching_name(mrca_node) + while failing_tips: + parent = parent.parent + # traversed to the root and couldn't identify unique mutations + if parent is None: + return False, failing_tips + node_name = parent.name + node_muts = nt_muts["nodes"][node_name]["muts"] + # compile unique mutations and remove if found + todel_tips = [] + for tip in failing_tips: + diff_muts = set(node_muts).difference(tips2muts[tip]["nt"]) + if diff_muts: + muts_set.update(diff_muts) + todel_tips.append(tip) + for tip in todel_tips: + failing_tips.remove(tip) -def compile_export_json(tree_json): - """Compile clades to export nodes""" - node2clades = {} - for child in tree_json["children"]: - if "children" in child: - node2clades = {**node2clades, **compile_export_json(child)} - if "node_attrs" in child: - if "clade_membership" in child["node_attrs"]: - node2clades[child["name"]] = child["node_attrs"]["clade_membership"][ - "value" - ] - return node2clades + return sorted(muts_set), failing_tips def main( @@ -128,7 +156,6 @@ def main( clade_cols, nt_muts, aa_muts=None, - export_json=None, excluded=set(), noncomprehensive=False, skip_singletons=True, @@ -137,14 +164,10 @@ def main( # remove metadata entries that are not in the tree metadf = metadf[metadf.index.isin(tree.get_tip_names())] - clades2nodes = defaultdict(set) - # populate a dictionary of clades to reference - if export_json: - node2clades = compile_export_json(export_json["tree"]) - tips = set(x.name for x in tree.iter_tips()) - for node, clade in node2clades.items(): - if node in tips: - clades2nodes[clade].add(node) + # compile cumulative mutation data for each tip + tips2muts = defaultdict(lambda: {"nt": [], "aa": {}}) + for tip in tree.get_tip_names(): + tips2muts[tip] = compile_tip2mutations(tree, tip, nt_muts, aa_muts=aa_muts) clade2muts = defaultdict(lambda: {"nt": [], "aa": {}}) for clade_col in clade_cols: @@ -163,15 +186,16 @@ def main( ) if mrca_node: logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}") - clade_nt_muts = nt_muts["nodes"][mrca_node]["muts"] - # check for unique mutations - if clades2nodes: - if clades2nodes[clade].difference(set(mrca_tips)): - logger.warning( - f"{clade} does not have distinguishing mutations - Skipping." - ) - continue - # if check_nt_uniqueness(set(clade_nt_muts), mrca_node, nt_muts): + clade_nt_muts_set = set(nt_muts["nodes"][mrca_node]["muts"]) + + # check if the mutations are unique enough to call + clade_nt_muts, conflict_tips = check_nt_uniqueness( + clade_nt_muts_set, tips2muts, mrca_tips, tree, mrca_node, nt_muts + ) + if not clade_nt_muts: + logger.warning(f"{clade} mutations are not unique - Skipping.") + continue + clade2muts[clade]["nt"] = clade_nt_muts if aa_muts: for prot, muts in aa_muts["nodes"][mrca_node]["aa_muts"].items(): @@ -199,9 +223,6 @@ def main( ) parser.add_argument("-nt", "--nt_muts", required=True, help="Path to nt_muts JSON") parser.add_argument("-aa", "--aa_muts", help="Path to aa_muts JSON") - parser.add_argument( - "-e", "--export_json", help="AUGUR export JSON to cross-reference clades" - ) parser.add_argument("---exclude", nargs="*", help="Clades to exclude from analysis") parser.add_argument( "-r", "--root", nargs="*", help="Root tip(s) / node for rooting" @@ -243,12 +264,6 @@ def main( else: aa_muts = None - if args.export_json: - with open(Path(args.export_json), "r") as f: - export_json = json.load(f) - else: - export_json = None - if args.exclude: exclusion_clades = set(args.exclude) else: @@ -260,7 +275,6 @@ def main( args.clade_cols, nt_muts, aa_muts, - export_json, excluded=exclusion_clades, noncomprehensive=args.noncomprehensive, skip_singletons=args.skip_singletons, From 4c5a1126aa20d4a767fa709842242c156014daa7 Mon Sep 17 00:00:00 2001 From: xonq Date: Fri, 11 Jul 2025 22:15:59 +0000 Subject: [PATCH 09/12] we can do that now --- scripts/extract_nextclades.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py index fc7d1a0..6d37e4a 100644 --- a/scripts/extract_nextclades.py +++ b/scripts/extract_nextclades.py @@ -120,7 +120,7 @@ def write_clade_muts(clade2muts, out_file): def check_nt_uniqueness(muts_set, tips2muts, mrca_tips, tree, mrca_node, nt_muts): - """Check if the mutations are unique enough to call. Can not compile mutations up the tree, so non-comprehensive. See export JSON function""" + """Check if the mutations are unique enough to call.""" failing_tips = [] for tip in set(tips2muts.keys()).difference(set(mrca_tips)): tip_muts = tips2muts[tip]["nt"] From 826d726ca20746571972a024432afcdd86cb131e Mon Sep 17 00:00:00 2001 From: xonq Date: Thu, 9 Oct 2025 21:23:55 +0000 Subject: [PATCH 10/12] init docker image --- scripts/extract_nextclades/Dockerfile | 12 ++++++++++++ .../{ => extract_nextclades}/extract_nextclades.py | 0 2 files changed, 12 insertions(+) create mode 100644 scripts/extract_nextclades/Dockerfile rename scripts/{ => extract_nextclades}/extract_nextclades.py (100%) diff --git a/scripts/extract_nextclades/Dockerfile b/scripts/extract_nextclades/Dockerfile new file mode 100644 index 0000000..34a1a60 --- /dev/null +++ b/scripts/extract_nextclades/Dockerfile @@ -0,0 +1,12 @@ +FROM us-docker.pkg.dev/general-theiagen/theiagen/theiaphylo:0.1.8 + +COPY extract_nextclades.py /app/ + +RUN chmod +x /app/* +ENV PATH="/app:${PATH}" + +# Set environment variables for Google Cloud SDK +ENV CLOUDSDK_CONFIG=/gcp/config +ENV PYTHONPATH=/app + +CMD ["extract_nextclades.py"] \ No newline at end of file diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades/extract_nextclades.py similarity index 100% rename from scripts/extract_nextclades.py rename to scripts/extract_nextclades/extract_nextclades.py From 70cc02655f58fd8701b80b50f93323a5811ab5b8 Mon Sep 17 00:00:00 2001 From: xonq Date: Fri, 10 Oct 2025 15:54:22 +0000 Subject: [PATCH 11/12] add pandas --- scripts/extract_nextclades/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/extract_nextclades/Dockerfile b/scripts/extract_nextclades/Dockerfile index 34a1a60..2ae3168 100644 --- a/scripts/extract_nextclades/Dockerfile +++ b/scripts/extract_nextclades/Dockerfile @@ -2,6 +2,8 @@ FROM us-docker.pkg.dev/general-theiagen/theiagen/theiaphylo:0.1.8 COPY extract_nextclades.py /app/ +RUN python3 -m pip install pandas --break-system-packages + RUN chmod +x /app/* ENV PATH="/app:${PATH}" From a20aa0689f96b53a13d315070d15b0203d4ede81 Mon Sep 17 00:00:00 2001 From: xonq Date: Wed, 15 Oct 2025 23:51:15 +0000 Subject: [PATCH 12/12] raise error if no clades are detected --- scripts/extract_nextclades/extract_nextclades.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/extract_nextclades/extract_nextclades.py b/scripts/extract_nextclades/extract_nextclades.py index 6d37e4a..4d4e3da 100644 --- a/scripts/extract_nextclades/extract_nextclades.py +++ b/scripts/extract_nextclades/extract_nextclades.py @@ -101,6 +101,8 @@ def id_clade_mrca( def write_clade_muts(clade2muts, out_file): """Extract and write the mutations for each clade to a TSV file.""" + if len(clade2muts.keys()) < 2: + raise AttributeError('No clade-defining mutations could be extracted') with open(out_file, "w") as f: f.write("clade\tgene\tsite\talt\n") for clade, mut_dict in clade2muts.items():