From bc9ea475e28d4ff7c9c884eb9e742a09be1d64fe Mon Sep 17 00:00:00 2001 From: Cecilia Sensalari <57489957+Cecilia-Sensalari@users.noreply.github.com> Date: Wed, 23 Mar 2022 10:34:39 +0100 Subject: [PATCH] Check Newick tree structural integrity (#35) * Checkpoint for Newick tree structural integrity - All internal nodes in the input (Newick) tree must have exactly two children nodes - If they have one child node or if they have three (or more), then the checkpoint exits and prompts the user to adjust the tree in the configuration file - To help find where the problem is, the structural error in the input tree is shown to the user both as a string and as an ASCII drawing of the subtree that contained the incorrect internal node --- ksrates/fc_configfile.py | 1 + ksrates/fc_manipulate_trees.py | 67 +++++++++++++++++++++++++++++++++- ksrates/setup_correction.py | 1 + 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/ksrates/fc_configfile.py b/ksrates/fc_configfile.py index 671bec5..053c302 100644 --- a/ksrates/fc_configfile.py +++ b/ksrates/fc_configfile.py @@ -116,6 +116,7 @@ def get_species(self): def get_newick_tree(self): """ Gets the config file field of the Newick tree. + Checks and exits if the species' names in the Newick tree contain illegal characters (underscore or spaces). 
def _log_offending_subtrees(nodes):
    """
    :param nodes: list of internal nodes that failed the integrity check

    Logs every offending subtree as a Newick string followed by its ASCII drawing.
    """
    # enumerate() yields the 1-based label directly; looking the node up again
    # with list.index() would cost a linear search for every reported subtree.
    for position, node in enumerate(nodes, start=1):
        logging.error(f'Subtree {position}: {node.write(format=9).rstrip(";")}{node}\n')


def check_integrity_newick_tree(tree):
    r"""
    :param tree: the original tree object

    Checks if there are structural errors in the newick_tree input. Exits (status 1) if there are errors;
    returns None otherwise. All offending subtrees of both kinds are reported before exiting.

    - Case 1: The presence of extra unnecessary pairs of parentheses generates internal nodes with only one child node,
      instead of two children nodes; this will cause problems during the parsing of the tree to obtain the species trios.
      Therefore, the code exits and prompts the user to remove such unnecessary parentheses.

      Example: the input Newick tree contains a subtree whose outermost pair of parentheses has to be removed.
      String visualization of the subtree: (((elaeis,oryza),asparagus))
      ASCII visualization of the subtree - note the extra node at the base of this subtree:
                     /-elaeis
                  /-|
            -- /-|   \-oryza
                 |
                  \-asparagus

    - Case 2: In presence of unresolved phylogeny (i.e. three or more children nodes branching off from an internal node)
      there will be problems in downstream analysis due to ambiguous outgroup relationships.
      Therefore, the code exits and prompts the user to rearrange the node(s).

      Example: the input Newick tree contains a subtree where the basal node has three children nodes.
      String visualization of the subtree: (elaeis,oryza,maize)
      ASCII visualization of the subtree:
               /-elaeis
              |
            --|--oryza
              |
               \-maize
    """
    # For each internal node, check integrity (must have exactly two children)
    logging.info("Checking structural integrity of input Newick tree...")
    internal_nodes_with_one_child, internal_nodes_with_three_children = [], []
    for node in tree.traverse():
        if node.is_leaf():
            continue
        number_of_children_nodes = len(node.get_children())
        if number_of_children_nodes == 1:
            internal_nodes_with_one_child.append(node)
        elif number_of_children_nodes > 2:
            internal_nodes_with_three_children.append(node)

    if internal_nodes_with_one_child:
        logging.error('The tree structure provided in "newick_tree" configuration file field has one or more incomplete internal nodes:')
        logging.error("likely there are unnecessary pairs of parentheses that generate internal nodes with only one child node instead of two children nodes")
        logging.error("Please adjust the input tree in the configuration file as suggested below and rerun the analysis")
        logging.error("Such syntax error can be solved by removing the unnecessary outermost pair of parentheses in the following subtree(s):\n")
        _log_offending_subtrees(internal_nodes_with_one_child)

    if internal_nodes_with_three_children:
        logging.error('The tree structure provided in "newick_tree" configuration file field contains unresolved phylogenetic relationships')
        logging.error("Please adjust the tree so that each internal node has exactly two children nodes")
        logging.error("Such structural issue has been encountered at the base of the following subtree(s):\n")
        _log_offending_subtrees(internal_nodes_with_three_children)

    # Exit only after both categories have been reported, so the user can fix everything in one pass
    if internal_nodes_with_one_child or internal_nodes_with_three_children:
        sys.exit(1)