From 01fd59b071cacb4274df3201db7c2724fa7c4266 Mon Sep 17 00:00:00 2001 From: Jamie Lentin Date: Tue, 4 Nov 2025 11:34:50 +0000 Subject: [PATCH 1/2] CSV_base_table_creator: Increase recursion limit Reading large trees in DendroPy results in a huge call stack. Increase the recursion limit now to prime ourselves for fully dated trees. --- .../taxon_mapping_and_popularity/CSV_base_table_creator.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py index 48f9e10..cf64174 100755 --- a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py +++ b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py @@ -105,6 +105,10 @@ sql_subs_string = "" # ? for sqlite, %s for mysql +# DendroPy performs lots of recursion when reading large trees, this is expected +# https://github.com/jeetsukumaran/DendroPy/issues/52 +sys.setrecursionlimit(3000) + def is_unnamed_OTT(OTTid): """ From 240bbcf6cd345078ca204d6b3dc1e24b37e595d1 Mon Sep 17 00:00:00 2001 From: Jamie Lentin Date: Tue, 4 Nov 2025 11:37:05 +0000 Subject: [PATCH 2/2] CSV_base_table_creator: Open files one at a time We don't write to output files in parallel, so we don't need to hold them open all at once. Break up the `with` block so we only open the relevant file when writing. 
--- .../CSV_base_table_creator.py | 94 ++++++++++--------- 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py index cf64174..8827e05 100755 --- a/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py +++ b/oz_tree_build/taxon_mapping_and_popularity/CSV_base_table_creator.py @@ -654,56 +654,58 @@ def output_simplified_tree(tree, taxonomy_file, outdir, version, seed, save_sql= logging.info(" ✔ ladderized") logging.info(" > writing tree, dates, and csv to files") - with open(os.path.join(outdir, f"ordered_tree_{version}.nwk"), "w+") as condensed_newick, open( - os.path.join(outdir, f"ordered_tree_{version}.poly"), "w+" - ) as condensed_poly, open(os.path.join(outdir, f"ordered_dates_{version}.js"), "w+") as json_dates, open( - os.path.join(outdir, f"ordered_leaves_{version}.csv"), "w+", encoding="utf-8" - ) as leaves, open(os.path.join(outdir, f"ordered_nodes_{version}.csv"), "w+", encoding="utf-8") as nodes: + with open(os.path.join(outdir, f"ordered_tree_{version}.nwk"), "w+") as condensed_newick: tree.seed_node.write_brief_newick(condensed_newick) + with open(os.path.join(outdir, f"ordered_tree_{version}.poly"), "w+") as condensed_poly: tree.seed_node.write_brief_newick(condensed_poly, "{}") + with open(os.path.join(outdir, f"ordered_dates_{version}.js"), "w+") as json_dates: tree.write_preorder_ages(json_dates, format="json") - # these are the extra columns output to the leaf csv file - leaf_extras = OrderedDict() - leaf_extras["ott"] = ["ott"] - leaf_extras["wikidata"] = ["wd", "Q"] - leaf_extras["wikipedia_lang_flag"] = ["wd", "wikipedia_lang_flag"] - leaf_extras["iucn"] = ["iucn"] - leaf_extras["eol"] = ["eol"] - leaf_extras["raw_popularity"] = ["wd", "raw_popularity"] - leaf_extras["popularity"] = ["popularity"] - leaf_extras["popularity_rank"] = ["popularity_rank"] - leaf_extras["price"] = None 
- leaf_extras["ncbi"] = ["sources", "ncbi", "id"] - leaf_extras["ifung"] = ["sources", "ifung", "id"] - leaf_extras["worms"] = ["sources", "worms", "id"] - leaf_extras["irmng"] = ["sources", "irmng", "id"] - leaf_extras["gbif"] = ["sources", "gbif", "id"] - leaf_extras["ipni"] = ["ipni"] - - # these are the extra columns output to the node csv file - node_extras = OrderedDict() - node_extras["ott"] = ["ott"] - node_extras["wikidata"] = ["wd", "Q"] - node_extras["wikipedia_lang_flag"] = ["wd", "wikipedia_lang_flag"] - node_extras["eol"] = ["eol"] - # We avoid using 'rank' as it is a reserved word in mysql - node_extras["rnk"] = ["rank"] - node_extras["raw_popularity"] = ["wd", "raw_popularity"] - node_extras["popularity"] = ["popularity"] - node_extras["ncbi"] = ["sources", "ncbi", "id"] - node_extras["ifung"] = ["sources", "ifung", "id"] - node_extras["worms"] = ["sources", "worms", "id"] - node_extras["irmng"] = ["sources", "irmng", "id"] - node_extras["gbif"] = ["sources", "gbif", "id"] - node_extras["ipni"] = ["ipni"] - node_extras["vern_synth"] = None - for representative_image_type in ["rep", "rtr", "rpd"]: - for i in [str(x + 1) for x in range(8)]: - node_extras[representative_image_type + i] = None - - for iucn_type in ["NE", "DD", "LC", "NT", "VU", "EN", "CR", "EW", "EX"]: - node_extras["iucn" + iucn_type] = None + # these are the extra columns output to the leaf csv file + leaf_extras = OrderedDict() + leaf_extras["ott"] = ["ott"] + leaf_extras["wikidata"] = ["wd", "Q"] + leaf_extras["wikipedia_lang_flag"] = ["wd", "wikipedia_lang_flag"] + leaf_extras["iucn"] = ["iucn"] + leaf_extras["eol"] = ["eol"] + leaf_extras["raw_popularity"] = ["wd", "raw_popularity"] + leaf_extras["popularity"] = ["popularity"] + leaf_extras["popularity_rank"] = ["popularity_rank"] + leaf_extras["price"] = None + leaf_extras["ncbi"] = ["sources", "ncbi", "id"] + leaf_extras["ifung"] = ["sources", "ifung", "id"] + leaf_extras["worms"] = ["sources", "worms", "id"] + 
leaf_extras["irmng"] = ["sources", "irmng", "id"] + leaf_extras["gbif"] = ["sources", "gbif", "id"] + leaf_extras["ipni"] = ["ipni"] + + # these are the extra columns output to the node csv file + node_extras = OrderedDict() + node_extras["ott"] = ["ott"] + node_extras["wikidata"] = ["wd", "Q"] + node_extras["wikipedia_lang_flag"] = ["wd", "wikipedia_lang_flag"] + node_extras["eol"] = ["eol"] + # We avoid using 'rank' as it is a reserved word in mysql + node_extras["rnk"] = ["rank"] + node_extras["raw_popularity"] = ["wd", "raw_popularity"] + node_extras["popularity"] = ["popularity"] + node_extras["ncbi"] = ["sources", "ncbi", "id"] + node_extras["ifung"] = ["sources", "ifung", "id"] + node_extras["worms"] = ["sources", "worms", "id"] + node_extras["irmng"] = ["sources", "irmng", "id"] + node_extras["gbif"] = ["sources", "gbif", "id"] + node_extras["ipni"] = ["ipni"] + node_extras["vern_synth"] = None + for representative_image_type in ["rep", "rtr", "rpd"]: + for i in [str(x + 1) for x in range(8)]: + node_extras[representative_image_type + i] = None + for iucn_type in ["NE", "DD", "LC", "NT", "VU", "EN", "CR", "EW", "EX"]: + node_extras["iucn" + iucn_type] = None + + with ( + open(os.path.join(outdir, f"ordered_leaves_{version}.csv"), "w+", encoding="utf-8") as leaves, + open(os.path.join(outdir, f"ordered_nodes_{version}.csv"), "w+", encoding="utf-8") as nodes, + ): tree.write_preorder_to_csv(leaves, leaf_extras, nodes, node_extras, -version) logging.info(f" ✔ written into {outdir}/ordered_..._{version}...")