From e9853730f9d11380fe0443bbbc05f4b150acf8c6 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Thu, 22 May 2025 21:54:21 +0000
Subject: [PATCH 01/12] init update

---
 scripts/README.md             |  18 ++++
 scripts/extract_nextclades.py | 167 ++++++++++++++++++++++++++++++++++
 2 files changed, 185 insertions(+)
 create mode 100644 scripts/extract_nextclades.py

diff --git a/scripts/README.md b/scripts/README.md
index 3ebc437..a3ba4ba 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -146,4 +146,22 @@ MiniWDL is a required package
 #### usage
 ```bash
 $ python update_taxon_tables_io.py -r <local_PHB_repo> -i <task_broad_terra_tools.wdl>
+```
+
+### extract_nextclades.py
+
+`extract_nextclades.py` extracts the nucleotide mutations - optionally including the amino acid mutations - associated with clade metadata, and produces a Nextclade compatible `clades.tsv` for building a Nextclade dataset. This script enables swift creation of a genotyping Nextclade module. Clade metadata that is not monophyletic within the reference phylogeny is omitted. 
+
+#### requirements
+- theiaphylo (`python3 -m pip install theiaphylo`)
+
+#### usage
+```bash
+$ python extract_nextclades.py \
+  -t <AUGUR_REFINED_NWK> \
+  -m <METADATA.tsv> \
+  -c <CLADE_COL_HEADER1> <CLADE_COL_HEADER2> ... \
+  -tc <TIP_LABEL_COL> \
+  -nt <NT_MUTS.json> \
+  -aa [AA_MUTS.json]
 ```
\ No newline at end of file
diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
new file mode 100644
index 0000000..ef9379c
--- /dev/null
+++ b/scripts/extract_nextclades.py
@@ -0,0 +1,167 @@
+#! /usr/bin/env python3
+
+import re
+import sys
+import json
+import logging
+import argparse
+import pandas as pd
+from ete3 import Tree
+from collections import defaultdict
+from theiaphylo.lib.StdPath import Path
+from theiaphylo.lib.TheiaPhylo import *
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+
+
+def id_clade_mrca(tree, metadf, clade_col, clade):
+    # extract df for clade
+    clade_df = metadf[metadf[clade_col] == clade]
+
+    # get tips from clade
+    clade_tips = sorted(set(clade_df.index))
+    if len(clade_tips) == 1:
+        # clade is a single tip, extract the node as the leaf name
+        return clade_tips[0]
+    # get MRCA
+    mrca = tree.lowest_common_ancestor(clade_tips)
+    mrca_tips = sorted(x.name for x in mrca.iter_tips())
+
+    # extract clades associated with tips
+    mrca_df = metadf.loc[mrca_tips]
+
+    # identify clades descended from the MRCA
+    mrca_clades = sorted(set(x for x in mrca_df[clade_col] if not pd.isna(x)))
+
+    # clade is not monophyletic
+    if len(mrca_clades) > 1:
+        conflicts = sorted(set(mrca_clades) - {clade})
+        logger.warning(
+            f"{clade} is not monophyletic; conflicts with {conflicts} - Skipping."
+        )
+        return None
+    # extract node from monophyletic clade
+    else:
+        # convert to ete3 object
+        ete_tree = Tree(
+            tree.get_newick(with_node_names=True, escape_name=False), format=8
+        )
+        # get the node name from the ete3 tree
+        mrca_node = (
+            ete_tree.get_common_ancestor(mrca_tips)
+            .name.replace("'", "")
+            .replace('"', "")
+        )
+    #        clade2node[clade] =
+    return mrca_node
+
+
+def write_clade_muts(clade2muts, out_file):
+    with open(out_file, "w") as f:
+        f.write("clade\tgene\tsite\talt\n")
+        for clade, mut_dict in clade2muts.items():
+            for mut in mut_dict["nt"]:
+                mut_components = re.search(r"\D+(\d+)(\D+)", mut)
+                if mut_components:
+                    site = mut_components.group(1)
+                    alt = mut_components.group(2)
+                    f.write(f"{clade}\tnuc\t{site}\t{alt}\n")
+            for prot, muts in mut_dict["aa"].items():
+                for mut in muts:
+                    mut_components = re.search(r"\D+(\d+)(\D+)", mut)
+                    if mut_components:
+                        site = mut_components.group(1)
+                        alt = mut_components.group(2)
+                        f.write(f"{clade}\t{prot}\t{site}\t{alt}\n")
+
+
+def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()):
+
+    # remove metadata entries that are not in the tree
+    metadf = metadf[metadf.index.isin(tree.get_tip_names())]
+
+    clade2muts = defaultdict(lambda: {"nt": [], "aa": {}})
+    for clade_col in clade_cols:
+        # get the clades
+        clades = sorted(
+            set([x for x in metadf[clade_col] if not pd.isna(x) and x not in excluded])
+        )
+        for clade in clades:
+            mrca_node = id_clade_mrca(tree, metadf, clade_col, clade)
+            if mrca_node:
+                logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}")
+                clade2muts[clade]["nt"] = nt_muts["nodes"][mrca_node]["muts"]
+                if aa_muts:
+                    for prot, muts in aa_muts["nodes"][mrca_node]["aa_muts"].items():
+                        clade2muts[clade]["aa"][prot] = muts
+    return clade2muts
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Extract mutations from monophyletic clades for Nextclade genotyping"
+    )
+    parser.add_argument(
+        "-t", "--tree", required=True, help="Path to the Augur-refined newick"
+    )
+    parser.add_argument("-m", "--metadata", required=True, help="Path to metadata TSV")
+    parser.add_argument(
+        "-cc", "--clade_cols", nargs="*", required=True, help="Clade columns to extract"
+    )
+    parser.add_argument(
+        "-tc", "--tip_col", required=True, help="Column in metadata to use as tip label"
+    )
+    parser.add_argument("-nt", "--nt_muts", required=True, help="Path to nt_muts JSON")
+    parser.add_argument("-aa", "--aa_muts", help="Path to aa_muts JSON")
+    parser.add_argument(
+        "-e", "---exclude", nargs="*", help="Clades to exclude from analysis"
+    )
+    parser.add_argument(
+        "-r", "--root", nargs="*", help="Root tip(s) / node for rooting"
+    )
+    parser.add_argument(
+        "-o", "--output", help="Output file name. DEFAULT: 'clades.tsv'"
+    )
+    args = parser.parse_args()
+
+    # load the tree
+    tree = import_tree(Path(args.tree), outgroup=args.root)
+
+    # load the metadata
+    metadata_file = Path(args.metadata)
+    if metadata_file.endswith(".csv"):
+        metadf = pd.read_csv(Path(metadata_file), index_col=args.tip_col)
+    else:
+        metadf = pd.read_csv(Path(metadata_file), sep="\t", index_col=args.tip_col)
+
+    # load the mutation files
+    with open(Path(args.nt_muts), "r") as f:
+        nt_muts = json.load(f)
+    # load the aa_muts file if provided
+    if args.aa_muts:
+        with open(Path(args.aa_muts), "r") as f:
+            aa_muts = json.load(f)
+    else:
+        aa_muts = None
+
+    if args.exclude:
+        exclusion_clades = set(args.exclude)
+    else:
+        exclusion_clades = set()
+
+    clade2muts = main(
+        tree, metadf, args.clade_cols, nt_muts, aa_muts, excluded=exclusion_clades
+    )
+
+    # write the output
+    if args.output:
+        output_file = Path(args.output)
+    else:
+        output_file = Path("clades.tsv")
+    write_clade_muts(clade2muts, output_file)
+    sys.exit(0)

From 2e2296b0bf417f8eba3a2bc9d75e6a3f6bdfca51 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Fri, 23 May 2025 15:22:07 +0000
Subject: [PATCH 02/12] new package name

---
 scripts/extract_nextclades.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index ef9379c..e98b07c 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -9,7 +9,7 @@
 from ete3 import Tree
 from collections import defaultdict
 from theiaphylo.lib.StdPath import Path
-from theiaphylo.lib.TheiaPhylo import *
+from theiaphylo.theiaPhylo import *
 
 logging.basicConfig(
     level=logging.INFO,

From ab248416f660619ccdce5223b2f92d4ce5d9d058 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Sat, 31 May 2025 00:10:57 +0000
Subject: [PATCH 03/12] correct library call

---
 scripts/extract_nextclades.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index e98b07c..df509e4 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -9,7 +9,7 @@
 from ete3 import Tree
 from collections import defaultdict
 from theiaphylo.lib.StdPath import Path
-from theiaphylo.theiaPhylo import *
+from theiaphylo.theiaphylo import *
 
 logging.basicConfig(
     level=logging.INFO,

From f774a40e5d03dc1dfe680f90ed7b19bf15f7210d Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Tue, 10 Jun 2025 15:30:15 +0000
Subject: [PATCH 04/12] allow for incomplete taxa

---
 scripts/extract_nextclades.py | 36 ++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index df509e4..9bfc9cd 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -19,7 +19,7 @@
 logger = logging.getLogger(__name__)
 
 
-def id_clade_mrca(tree, metadf, clade_col, clade):
+def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
     # extract df for clade
     clade_df = metadf[metadf[clade_col] == clade]
 
@@ -33,7 +33,11 @@ def id_clade_mrca(tree, metadf, clade_col, clade):
     mrca_tips = sorted(x.name for x in mrca.iter_tips())
 
     # extract clades associated with tips
-    mrca_df = metadf.loc[mrca_tips]
+    if noncomprehensive:
+        mrca_df_tips = sorted(set(metadf.index).intersection(set(mrca_tips)))
+        mrca_df = metadf.loc[mrca_df_tips]
+    else:
+        mrca_df = metadf.loc[mrca_tips]
 
     # identify clades descended from the MRCA
     mrca_clades = sorted(set(x for x in mrca_df[clade_col] if not pd.isna(x)))
@@ -80,7 +84,15 @@ def write_clade_muts(clade2muts, out_file):
                         f.write(f"{clade}\t{prot}\t{site}\t{alt}\n")
 
 
-def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()):
+def main(
+    tree,
+    metadf,
+    clade_cols,
+    nt_muts,
+    aa_muts=None,
+    excluded=set(),
+    noncomprehensive=False,
+):
 
     # remove metadata entries that are not in the tree
     metadf = metadf[metadf.index.isin(tree.get_tip_names())]
@@ -92,7 +104,9 @@ def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()):
             set([x for x in metadf[clade_col] if not pd.isna(x) and x not in excluded])
         )
         for clade in clades:
-            mrca_node = id_clade_mrca(tree, metadf, clade_col, clade)
+            mrca_node = id_clade_mrca(
+                tree, metadf, clade_col, clade, noncomprehensive=noncomprehensive
+            )
             if mrca_node:
                 logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}")
                 clade2muts[clade]["nt"] = nt_muts["nodes"][mrca_node]["muts"]
@@ -124,6 +138,12 @@ def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()):
     parser.add_argument(
         "-r", "--root", nargs="*", help="Root tip(s) / node for rooting"
     )
+    parser.add_argument(
+        "-n",
+        "--noncomprehensive",
+        action="store_true",
+        help="Accept missing metadata for tips",
+    )
     parser.add_argument(
         "-o", "--output", help="Output file name. DEFAULT: 'clades.tsv'"
     )
@@ -155,7 +175,13 @@ def main(tree, metadf, clade_cols, nt_muts, aa_muts=None, excluded=set()):
         exclusion_clades = set()
 
     clade2muts = main(
-        tree, metadf, args.clade_cols, nt_muts, aa_muts, excluded=exclusion_clades
+        tree,
+        metadf,
+        args.clade_cols,
+        nt_muts,
+        aa_muts,
+        excluded=exclusion_clades,
+        noncomprehensive=args.noncomprehensive,
     )
 
     # write the output

From d83cb4b35e5c58302c820b3d3b339338cda706ea Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Tue, 10 Jun 2025 17:31:03 +0000
Subject: [PATCH 05/12] formatting

---
 scripts/extract_nextclades.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index 9bfc9cd..52d2785 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -20,6 +20,8 @@
 
 
 def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
+    """Identify the most recent common ancestor (MRCA) of a clade if it is monophyletic.
+    Otherwise report a warning and return None."""
     # extract df for clade
     clade_df = metadf[metadf[clade_col] == clade]
 
@@ -66,6 +68,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
 
 
 def write_clade_muts(clade2muts, out_file):
+    """Extract and write the mutations for each clade to a TSV file."""
     with open(out_file, "w") as f:
         f.write("clade\tgene\tsite\talt\n")
         for clade, mut_dict in clade2muts.items():
@@ -93,7 +96,7 @@ def main(
     excluded=set(),
     noncomprehensive=False,
 ):
-
+    """Main function to extract mutations from clades."""
     # remove metadata entries that are not in the tree
     metadf = metadf[metadf.index.isin(tree.get_tip_names())]
 

From f3636b198b7fda21d2ce94668e8cf99260386672 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Tue, 17 Jun 2025 16:55:00 +0000
Subject: [PATCH 06/12] increase logging

---
 scripts/extract_nextclades.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index 52d2785..68cff62 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -29,6 +29,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
     clade_tips = sorted(set(clade_df.index))
     if len(clade_tips) == 1:
         # clade is a single tip, extract the node as the leaf name
+        logger.info(f"{clade} mutations will be derived from one tip: {clade_tips[0]}")
         return clade_tips[0]
     # get MRCA
     mrca = tree.lowest_common_ancestor(clade_tips)

From 23ecab390b932d828d70c2f4f9373bcfdbda7c9c Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Fri, 20 Jun 2025 22:44:27 +0000
Subject: [PATCH 07/12] enable identification of uniquely defining mutations

---
 scripts/extract_nextclades.py | 98 +++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 10 deletions(-)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index 68cff62..f72b074 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -19,7 +19,9 @@
 logger = logging.getLogger(__name__)
 
 
-def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
+def id_clade_mrca(
+    tree, metadf, clade_col, clade, noncomprehensive=False, skip_singletons=True
+):
     """Identify the most recent common ancestor (MRCA) of a clade if it is monophyletic.
     Otherwise report a warning and return None."""
     # extract df for clade
@@ -28,9 +30,17 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
     # get tips from clade
     clade_tips = sorted(set(clade_df.index))
     if len(clade_tips) == 1:
-        # clade is a single tip, extract the node as the leaf name
-        logger.info(f"{clade} mutations will be derived from one tip: {clade_tips[0]}")
-        return clade_tips[0]
+        if skip_singletons:
+            logger.warning(
+                f"{clade} is a singleton clade with one tip: {clade_tips[0]} - Skipping."
+            )
+            return None, None
+        else:
+            # clade is a single tip, extract the node as the leaf name
+            logger.info(
+                f"{clade} mutations will be derived from one tip: {clade_tips[0]}"
+            )
+            return clade_tips[0], clade_tips[0]
     # get MRCA
     mrca = tree.lowest_common_ancestor(clade_tips)
     mrca_tips = sorted(x.name for x in mrca.iter_tips())
@@ -51,7 +61,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
         logger.warning(
             f"{clade} is not monophyletic; conflicts with {conflicts} - Skipping."
         )
-        return None
+        return None, None
     # extract node from monophyletic clade
     else:
         # convert to ete3 object
@@ -65,7 +75,7 @@ def id_clade_mrca(tree, metadf, clade_col, clade, noncomprehensive=False):
             .replace('"', "")
         )
     #        clade2node[clade] =
-    return mrca_node
+    return mrca_node, mrca_tips
 
 
 def write_clade_muts(clade2muts, out_file):
@@ -88,19 +98,54 @@ def write_clade_muts(clade2muts, out_file):
                         f.write(f"{clade}\t{prot}\t{site}\t{alt}\n")
 
 
+def check_nt_uniqueness(muts_set, in_node, nt2muts):
+    """Check if the mutations are unique enough to call. Can not compile mutations up the tree, so non-comprehensive. See export JSON function"""
+    for node, muts_d in nt2muts["nodes"].items():
+        if node != in_node:
+            # if all the mutations are present in another node, return False
+            if not muts_set.difference(set(muts_d["muts"])):
+                return False
+    return True
+
+
+def compile_export_json(tree_json):
+    """Compile clades to export nodes"""
+    node2clades = {}
+    for child in tree_json["children"]:
+        if "children" in child:
+            node2clades = {**node2clades, **compile_export_json(child)}
+        if "node_attrs" in child:
+            if "clade_membership" in child["node_attrs"]:
+                node2clades[child["name"]] = child["node_attrs"]["clade_membership"][
+                    "value"
+                ]
+    return node2clades
+
+
 def main(
     tree,
     metadf,
     clade_cols,
     nt_muts,
     aa_muts=None,
+    export_json=None,
     excluded=set(),
     noncomprehensive=False,
+    skip_singletons=True,
 ):
     """Main function to extract mutations from clades."""
     # remove metadata entries that are not in the tree
     metadf = metadf[metadf.index.isin(tree.get_tip_names())]
 
+    clades2nodes = defaultdict(set)
+    # populate a dictionary of clades to reference
+    if export_json:
+        node2clades = compile_export_json(export_json["tree"])
+        tips = set(x.name for x in tree.iter_tips())
+        for node, clade in node2clades.items():
+            if node in tips:
+                clades2nodes[clade].add(node)
+
     clade2muts = defaultdict(lambda: {"nt": [], "aa": {}})
     for clade_col in clade_cols:
         # get the clades
@@ -108,15 +153,33 @@ def main(
             set([x for x in metadf[clade_col] if not pd.isna(x) and x not in excluded])
         )
         for clade in clades:
-            mrca_node = id_clade_mrca(
-                tree, metadf, clade_col, clade, noncomprehensive=noncomprehensive
+            mrca_node, mrca_tips = id_clade_mrca(
+                tree,
+                metadf,
+                clade_col,
+                clade,
+                noncomprehensive=noncomprehensive,
+                skip_singletons=skip_singletons,
             )
             if mrca_node:
                 logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}")
-                clade2muts[clade]["nt"] = nt_muts["nodes"][mrca_node]["muts"]
+                clade_nt_muts = nt_muts["nodes"][mrca_node]["muts"]
+                # check for unique mutations
+                if clades2nodes:
+                    if clades2nodes[clade].difference(set(mrca_tips)):
+                        logger.warning(
+                            f"{clade} does not have distinguishing mutations - Skipping."
+                        )
+                        continue
+                #                if check_nt_uniqueness(set(clade_nt_muts), mrca_node, nt_muts):
+                clade2muts[clade]["nt"] = clade_nt_muts
                 if aa_muts:
                     for prot, muts in aa_muts["nodes"][mrca_node]["aa_muts"].items():
                         clade2muts[clade]["aa"][prot] = muts
+    #               else:
+    #                  logger.warning(
+    #                     f"{clade} mutations are not unique"
+    #                )
     return clade2muts
 
 
@@ -137,8 +200,9 @@ def main(
     parser.add_argument("-nt", "--nt_muts", required=True, help="Path to nt_muts JSON")
     parser.add_argument("-aa", "--aa_muts", help="Path to aa_muts JSON")
     parser.add_argument(
-        "-e", "---exclude", nargs="*", help="Clades to exclude from analysis"
+        "-e", "--export_json", help="AUGUR export JSON to cross-reference clades"
     )
+    parser.add_argument("---exclude", nargs="*", help="Clades to exclude from analysis")
     parser.add_argument(
         "-r", "--root", nargs="*", help="Root tip(s) / node for rooting"
     )
@@ -148,6 +212,12 @@ def main(
         action="store_true",
         help="Accept missing metadata for tips",
     )
+    parser.add_argument(
+        "-s",
+        "--skip_singletons",
+        action="store_true",
+        help="Skip singletons (clades with one tip)",
+    )
     parser.add_argument(
         "-o", "--output", help="Output file name. DEFAULT: 'clades.tsv'"
     )
@@ -173,6 +243,12 @@ def main(
     else:
         aa_muts = None
 
+    if args.export_json:
+        with open(Path(args.export_json), "r") as f:
+            export_json = json.load(f)
+    else:
+        export_json = None
+
     if args.exclude:
         exclusion_clades = set(args.exclude)
     else:
@@ -184,8 +260,10 @@ def main(
         args.clade_cols,
         nt_muts,
         aa_muts,
+        export_json,
         excluded=exclusion_clades,
         noncomprehensive=args.noncomprehensive,
+        skip_singletons=args.skip_singletons,
     )
 
     # write the output

From 2da87745a5b191738182ea789044c4d4bf202dd0 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Fri, 11 Jul 2025 22:12:50 +0000
Subject: [PATCH 08/12] automatically detect non-unique clade-defining
 mutations and fetch new ones

---
 scripts/README.md             |   5 +-
 scripts/extract_nextclades.py | 116 +++++++++++++++++++---------------
 2 files changed, 68 insertions(+), 53 deletions(-)

diff --git a/scripts/README.md b/scripts/README.md
index a3ba4ba..bec6391 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -163,5 +163,6 @@ $ python extract_nextclades.py \
   -c <CLADE_COL_HEADER1> <CLADE_COL_HEADER2> ... \
   -tc <TIP_LABEL_COL> \
   -nt <NT_MUTS.json> \
-  -aa [AA_MUTS.json]
-```
\ No newline at end of file
+  -aa [AA_MUTS.json] \
+  -n
+```
diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index f72b074..fc7d1a0 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -9,7 +9,7 @@
 from ete3 import Tree
 from collections import defaultdict
 from theiaphylo.lib.StdPath import Path
-from theiaphylo.theiaphylo import *
+from theiaphylo.phyloutils import *
 
 logging.basicConfig(
     level=logging.INFO,
@@ -19,6 +19,27 @@
 logger = logging.getLogger(__name__)
 
 
+def compile_tip2mutations(tree, tip, nt_muts, aa_muts=None):
+    """Compile mutations for a given tip from the nt_muts and aa_muts JSON."""
+    tip_muts = {"nt": set(), "aa": defaultdict(set)}
+    par_node = tree.get_node_matching_name(tip)
+    # iteratively accumulate all mutations from the tip to the root
+    while par_node is not None:
+        node_name = par_node.name
+        tip_muts["nt"].update(set(nt_muts["nodes"][node_name]["muts"]))
+        par_node = par_node.parent
+
+    # iteratively accumulate all amino acid mutations from the tip to the root
+    if aa_muts:
+        par_node = tree.get_node_matching_name(tip)
+        while par_node is not None:
+            node_name = par_node.name
+            for prot, muts in aa_muts["nodes"][node_name]["aa_muts"].items():
+                tip_muts["aa"][prot].update(set(muts))
+            par_node = par_node.parent
+    return tip_muts
+
+
 def id_clade_mrca(
     tree, metadf, clade_col, clade, noncomprehensive=False, skip_singletons=True
 ):
@@ -37,9 +58,9 @@ def id_clade_mrca(
             return None, None
         else:
             # clade is a single tip, extract the node as the leaf name
-            logger.info(
-                f"{clade} mutations will be derived from one tip: {clade_tips[0]}"
-            )
+            #           logger.info(
+            #                f"{clade} mutations will be derived from one tip: {clade_tips[0]}"
+            #          )
             return clade_tips[0], clade_tips[0]
     # get MRCA
     mrca = tree.lowest_common_ancestor(clade_tips)
@@ -98,28 +119,35 @@ def write_clade_muts(clade2muts, out_file):
                         f.write(f"{clade}\t{prot}\t{site}\t{alt}\n")
 
 
-def check_nt_uniqueness(muts_set, in_node, nt2muts):
+def check_nt_uniqueness(muts_set, tips2muts, mrca_tips, tree, mrca_node, nt_muts):
     """Check if the mutations are unique enough to call. Can not compile mutations up the tree, so non-comprehensive. See export JSON function"""
-    for node, muts_d in nt2muts["nodes"].items():
-        if node != in_node:
-            # if all the mutations are present in another node, return False
-            if not muts_set.difference(set(muts_d["muts"])):
-                return False
-    return True
+    failing_tips = []
+    for tip in set(tips2muts.keys()).difference(set(mrca_tips)):
+        tip_muts = tips2muts[tip]["nt"]
+        if not muts_set.difference(tip_muts):
+            failing_tips.append(tip)
 
+    # if there are failing tips, traverse the tree to get unique mutations
+    if failing_tips:
+        parent = tree.get_node_matching_name(mrca_node)
+        while failing_tips:
+            parent = parent.parent
+            # traversed to the root and couldn't identify unique mutations
+            if parent is None:
+                return False, failing_tips
+            node_name = parent.name
+            node_muts = nt_muts["nodes"][node_name]["muts"]
+            # compile unique mutations and remove if found
+            todel_tips = []
+            for tip in failing_tips:
+                diff_muts = set(node_muts).difference(tips2muts[tip]["nt"])
+                if diff_muts:
+                    muts_set.update(diff_muts)
+                    todel_tips.append(tip)
+            for tip in todel_tips:
+                failing_tips.remove(tip)
 
-def compile_export_json(tree_json):
-    """Compile clades to export nodes"""
-    node2clades = {}
-    for child in tree_json["children"]:
-        if "children" in child:
-            node2clades = {**node2clades, **compile_export_json(child)}
-        if "node_attrs" in child:
-            if "clade_membership" in child["node_attrs"]:
-                node2clades[child["name"]] = child["node_attrs"]["clade_membership"][
-                    "value"
-                ]
-    return node2clades
+    return sorted(muts_set), failing_tips
 
 
 def main(
@@ -128,7 +156,6 @@ def main(
     clade_cols,
     nt_muts,
     aa_muts=None,
-    export_json=None,
     excluded=set(),
     noncomprehensive=False,
     skip_singletons=True,
@@ -137,14 +164,10 @@ def main(
     # remove metadata entries that are not in the tree
     metadf = metadf[metadf.index.isin(tree.get_tip_names())]
 
-    clades2nodes = defaultdict(set)
-    # populate a dictionary of clades to reference
-    if export_json:
-        node2clades = compile_export_json(export_json["tree"])
-        tips = set(x.name for x in tree.iter_tips())
-        for node, clade in node2clades.items():
-            if node in tips:
-                clades2nodes[clade].add(node)
+    # compile cumulative mutation data for each tip
+    tips2muts = defaultdict(lambda: {"nt": [], "aa": {}})
+    for tip in tree.get_tip_names():
+        tips2muts[tip] = compile_tip2mutations(tree, tip, nt_muts, aa_muts=aa_muts)
 
     clade2muts = defaultdict(lambda: {"nt": [], "aa": {}})
     for clade_col in clade_cols:
@@ -163,15 +186,16 @@ def main(
             )
             if mrca_node:
                 logger.info(f"{clade_col}: {clade}\tMRCA: {mrca_node}")
-                clade_nt_muts = nt_muts["nodes"][mrca_node]["muts"]
-                # check for unique mutations
-                if clades2nodes:
-                    if clades2nodes[clade].difference(set(mrca_tips)):
-                        logger.warning(
-                            f"{clade} does not have distinguishing mutations - Skipping."
-                        )
-                        continue
-                #                if check_nt_uniqueness(set(clade_nt_muts), mrca_node, nt_muts):
+                clade_nt_muts_set = set(nt_muts["nodes"][mrca_node]["muts"])
+
+                # check if the mutations are unique enough to call
+                clade_nt_muts, conflict_tips = check_nt_uniqueness(
+                    clade_nt_muts_set, tips2muts, mrca_tips, tree, mrca_node, nt_muts
+                )
+                if not clade_nt_muts:
+                    logger.warning(f"{clade} mutations are not unique - Skipping.")
+                    continue
+
                 clade2muts[clade]["nt"] = clade_nt_muts
                 if aa_muts:
                     for prot, muts in aa_muts["nodes"][mrca_node]["aa_muts"].items():
@@ -199,9 +223,6 @@ def main(
     )
     parser.add_argument("-nt", "--nt_muts", required=True, help="Path to nt_muts JSON")
     parser.add_argument("-aa", "--aa_muts", help="Path to aa_muts JSON")
-    parser.add_argument(
-        "-e", "--export_json", help="AUGUR export JSON to cross-reference clades"
-    )
     parser.add_argument("---exclude", nargs="*", help="Clades to exclude from analysis")
     parser.add_argument(
         "-r", "--root", nargs="*", help="Root tip(s) / node for rooting"
@@ -243,12 +264,6 @@ def main(
     else:
         aa_muts = None
 
-    if args.export_json:
-        with open(Path(args.export_json), "r") as f:
-            export_json = json.load(f)
-    else:
-        export_json = None
-
     if args.exclude:
         exclusion_clades = set(args.exclude)
     else:
@@ -260,7 +275,6 @@ def main(
         args.clade_cols,
         nt_muts,
         aa_muts,
-        export_json,
         excluded=exclusion_clades,
         noncomprehensive=args.noncomprehensive,
         skip_singletons=args.skip_singletons,

From 4c5a1126aa20d4a767fa709842242c156014daa7 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Fri, 11 Jul 2025 22:15:59 +0000
Subject: [PATCH 09/12] we can do that now

---
 scripts/extract_nextclades.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades.py
index fc7d1a0..6d37e4a 100644
--- a/scripts/extract_nextclades.py
+++ b/scripts/extract_nextclades.py
@@ -120,7 +120,7 @@ def write_clade_muts(clade2muts, out_file):
 
 
 def check_nt_uniqueness(muts_set, tips2muts, mrca_tips, tree, mrca_node, nt_muts):
-    """Check if the mutations are unique enough to call. Can not compile mutations up the tree, so non-comprehensive. See export JSON function"""
+    """Check if the mutations are unique enough to call."""
     failing_tips = []
     for tip in set(tips2muts.keys()).difference(set(mrca_tips)):
         tip_muts = tips2muts[tip]["nt"]

From 826d726ca20746571972a024432afcdd86cb131e Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Thu, 9 Oct 2025 21:23:55 +0000
Subject: [PATCH 10/12] init docker image

---
 scripts/extract_nextclades/Dockerfile                | 12 ++++++++++++
 .../{ => extract_nextclades}/extract_nextclades.py   |  0
 2 files changed, 12 insertions(+)
 create mode 100644 scripts/extract_nextclades/Dockerfile
 rename scripts/{ => extract_nextclades}/extract_nextclades.py (100%)

diff --git a/scripts/extract_nextclades/Dockerfile b/scripts/extract_nextclades/Dockerfile
new file mode 100644
index 0000000..34a1a60
--- /dev/null
+++ b/scripts/extract_nextclades/Dockerfile
@@ -0,0 +1,12 @@
+FROM us-docker.pkg.dev/general-theiagen/theiagen/theiaphylo:0.1.8 
+
+COPY extract_nextclades.py /app/
+
+RUN chmod +x /app/*
+ENV PATH="/app:${PATH}"
+
+# Set environment variables for Google Cloud SDK
+ENV CLOUDSDK_CONFIG=/gcp/config
+ENV PYTHONPATH=/app
+
+CMD ["extract_nextclades.py"]
\ No newline at end of file
diff --git a/scripts/extract_nextclades.py b/scripts/extract_nextclades/extract_nextclades.py
similarity index 100%
rename from scripts/extract_nextclades.py
rename to scripts/extract_nextclades/extract_nextclades.py

From 70cc02655f58fd8701b80b50f93323a5811ab5b8 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Fri, 10 Oct 2025 15:54:22 +0000
Subject: [PATCH 11/12] add pandas

---
 scripts/extract_nextclades/Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/extract_nextclades/Dockerfile b/scripts/extract_nextclades/Dockerfile
index 34a1a60..2ae3168 100644
--- a/scripts/extract_nextclades/Dockerfile
+++ b/scripts/extract_nextclades/Dockerfile
@@ -2,6 +2,8 @@ FROM us-docker.pkg.dev/general-theiagen/theiagen/theiaphylo:0.1.8
 
 COPY extract_nextclades.py /app/
 
+RUN python3 -m pip install pandas --break-system-packages
+
 RUN chmod +x /app/*
 ENV PATH="/app:${PATH}"
 

From a20aa0689f96b53a13d315070d15b0203d4ede81 Mon Sep 17 00:00:00 2001
From: xonq <konkelzach@tuta.io>
Date: Wed, 15 Oct 2025 23:51:15 +0000
Subject: [PATCH 12/12] raise error if no clades are detected

---
 scripts/extract_nextclades/extract_nextclades.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/extract_nextclades/extract_nextclades.py b/scripts/extract_nextclades/extract_nextclades.py
index 6d37e4a..4d4e3da 100644
--- a/scripts/extract_nextclades/extract_nextclades.py
+++ b/scripts/extract_nextclades/extract_nextclades.py
@@ -101,6 +101,8 @@ def id_clade_mrca(
 
 def write_clade_muts(clade2muts, out_file):
     """Extract and write the mutations for each clade to a TSV file."""
+    if len(clade2muts.keys()) < 2:
+        raise AttributeError('No clade-defining mutations could be extracted')
     with open(out_file, "w") as f:
         f.write("clade\tgene\tsite\talt\n")
         for clade, mut_dict in clade2muts.items():