From 621191fe7daa7727ef03d9d6ac64a25ea3e22033 Mon Sep 17 00:00:00 2001 From: Cecilia Sensalari <57489957+Cecilia-Sensalari@users.noreply.github.com> Date: Wed, 23 Mar 2022 13:09:07 +0100 Subject: [PATCH] Improve checkpoint for divergence colors provided in configuration file (#36) - Checks if "divergence_colors" list in the configuration file is empty and if it contains at least as many colors as the number of internal nodes in the lineage that leads to the focal species (backbone of the tree plotted in the PDF or as ASCII). - Warns and exits if colors in configuration file field "divergence_colors" are found to be incompatible with matplotlib, the package that is going to use them later on to generate the Ks plot. E.g. misspelled color names. --- ksrates/fc_configfile.py | 42 ++++++++++++++++++++++++++----------- ksrates/setup_correction.py | 17 ++++++++++++--- 2 files changed, 44 insertions(+), 15 deletions(-) diff --git a/ksrates/fc_configfile.py b/ksrates/fc_configfile.py index 053c302..80ff9ec 100644 --- a/ksrates/fc_configfile.py +++ b/ksrates/fc_configfile.py @@ -2,6 +2,7 @@ import os from ete3 import Tree import ksrates.fc_check_input as fcCheck +from matplotlib.colors import is_color_like import logging import sys from ast import literal_eval @@ -106,10 +107,10 @@ def get_species(self): """ species = self.config.get("SPECIES", "focal_species") if species == "": - logging.error("Parameter focal_species in configuration file is empty, please fill in") + logging.error('Field "focal_species" in configuration file is empty, please fill in') sys.exit(1) - elif len(species.split()) != 1: - logging.error(f"Parameter focal_species [{species}] should be a short name and must not contain any spaces, please change accordingly") + elif len(species.split()) != 1 or "_" in species: + logging.error(f'Field "focal_species" [{species}] should be a short name and must not contain any spaces or underscores, please change accordingly') sys.exit(1) return species @@ -123,13 +124,13 
@@ def get_newick_tree(self): tree_string = self.config.get("SPECIES", "newick_tree") if not (tree_string.endswith(';')): tree_string += ";" - if tree_string == "();": - logging.error("Parameter newick_tree in configuration file is empty, please fill in") + if tree_string == "();" or tree_string == ";": + logging.error('Field "newick_tree" in configuration file is empty, please fill in') sys.exit(1) try: tree = Tree(tree_string) except Exception: - logging.error("Unrecognized format for parameter newick_tree in configuration file (for example, parentheses do not match)") + logging.error('Unrecognized format for field "newick_tree" in configuration file (for example, parentheses do not match)') sys.exit(1) # Check if species' informal names contain illegal characters (underscore or spaces) @@ -157,9 +158,9 @@ def check_complete_latin_names_dict(self, dictionary): missing_species = list(set.difference(set(all_leaves), set(dictionary.keys()))) if len(missing_species) != 0: if len(missing_species) == 1: - logging.error(f"The following species is missing from the [latin_names] configuration file field:") + logging.error(f'The following species is missing from the "latin_names" configuration file field:') else: - logging.error(f"The following species are missing from the [latin_names] configuration file field:") + logging.error(f'The following species are missing from the "latin_names" configuration file field:') for missing_name in missing_species: logging.error(f" - {missing_name}") @@ -177,7 +178,7 @@ def get_latin_names(self): if latin_names != "": latin_names_dict = self._get_clean_dict_stringent(latin_names, "latin_names") else: - logging.error("Configuration file field [latin_names] is empty, please fill in and restart the analysis.") + logging.error('Configuration file field "latin_names" is empty, please fill in and restart the analysis.') logging.error("Exiting.") sys.exit(1) # Check if latin_names contains all the species present in the Newick tree; if not, 
exits @@ -458,10 +459,27 @@ def get_color_list(self): is assigned to the second internal node encountered along this path, and so on. There must be at least as many colors as the number of divergence nodes. + Checks if there are colors whose name is not recognized by matplotlib, e.g. misspelled. + :return colors: list of colors """ color_list_string = self.config.get("PARAMETERS", "divergence_colors") colors = [c.strip() for c in color_list_string.split(',')] + if len(colors) == 1 and colors[0] == "": + logging.error('Field "divergence_colors" in configuration file is empty, please fill in') + logging.error("Exiting.") + sys.exit(1) + + # Check if color names are recognized by matplotlib + faulty_color_names = [] + for color in colors: + if not is_color_like(color): + faulty_color_names.append(color) + if len(faulty_color_names) != 0: + logging.error('Field "divergence_colors" in configuration file contains color names not recognized by Matplotlib, please adjust the following:') + for color in faulty_color_names: + logging.error(f"- {color}") + sys.exit(1) return colors @@ -689,12 +707,12 @@ def get_max_mixture_model_components(self): logging.warning(f'Unrecognized field in expert configuration file [max_mixture_model_components = {max_comp}]. Please choose a positive integer >= 2. 
Default choice will be applied [5]') max_comp = 5 elif max_comp == 1: - logging.warning(f"Parameter [max_mixture_model_components] has been changed from {max_comp} to the minimum required, 2.") + logging.warning(f'Field "max_mixture_model_components" has been changed from {max_comp} to the minimum required, 2') max_comp = 2 # exponential + buffer elif max_comp <= 3: - logging.warning(f"A low number of mixture model components [max_mixture_model_components = {max_comp}] can produce poor fitting.") + logging.warning(f"A low number of mixture model components [max_mixture_model_components = {max_comp}] can produce poor fitting") elif max_comp >= 7: - logging.warning(f"A high number of mixture model components [max_mixture_model_components = {max_comp}] increases overfitting risk.") + logging.warning(f"A high number of mixture model components [max_mixture_model_components = {max_comp}] increases overfitting risk") except Exception: logging.warning(f'Missing field in expert configuration file [max_mixture_model_components]. Please choose a positive integer. 
Default choice will be applied [5]') max_comp = 5 diff --git a/ksrates/setup_correction.py b/ksrates/setup_correction.py index 0455638..3651ae5 100644 --- a/ksrates/setup_correction.py +++ b/ksrates/setup_correction.py @@ -22,6 +22,7 @@ def setup_correction(config_file, nextflow_flag): fcTree.check_integrity_newick_tree(original_tree) tree = fcTree.reorder_tree_leaves(original_tree, species_of_interest) # focal species is the top leaf latin_names = config.get_latin_names() + divergence_colors = config.get_color_list() paranome = config.get_paranome() colinearity = config.get_colinearity() @@ -104,14 +105,24 @@ def setup_correction(config_file, nextflow_flag): logging.error(f"Please add at least one outgroup species or change the focal species.") sys.exit(1) + # Obtaining the numeric labels for internal nodes relevant in the species analysis + fcTree.labeling_internal_nodes(species_of_interest_node) + # If the amount of colors provided for the divergence lines in the config file + # is insufficient for the number of divergence nodes in the tree, exit + num_required_colors = sp_history[-2].name + if len(divergence_colors) < num_required_colors: + logging.error("") + logging.error(f'Configuration file field "divergence_colors" is missing {num_required_colors - len(divergence_colors)} color(s) ' + + f"out of {num_required_colors} required for the analysis on focal species [{species_of_interest}]") + logging.error("Please add the missing color(s) and rerun the analysis") + logging.error("Exiting.") + sys.exit(1) + trios_array = [] # list of trios outfile_drawing_path = os.path.join("rate_adjustment", f"{species_of_interest}", f"tree_{species_of_interest}.txt") with open(outfile_drawing_path, "w+") as outfile_drawing: outfile_drawing.write(f"Focal species: {species_of_interest}\n\n") - - # Obtaining the numeric labels for internal nodes relevant in the species analysis - fcTree.labeling_internal_nodes(species_of_interest_node) node = 0 while node < len(sp_history)-2: