From fa4fc4b435cbf87ced08c9d561f91047dcff2a62 Mon Sep 17 00:00:00 2001 From: Guillaume GAUTREAU Date: Thu, 14 Dec 2023 11:17:14 +0100 Subject: [PATCH 01/51] Update quickOutputs.md --- docs/user/QuickUsage/quickOutputs.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/user/QuickUsage/quickOutputs.md b/docs/user/QuickUsage/quickOutputs.md index 3180d731..7a20be61 100644 --- a/docs/user/QuickUsage/quickOutputs.md +++ b/docs/user/QuickUsage/quickOutputs.md @@ -4,7 +4,7 @@ The complete workflow subcommand `all` automatically generates some files and fi Here, we are going to describe several of these key outputs that are commonly used in pangenomic studies as these files illustrate the pangenome from your taxonomic group of interest in different ways. ### Statistics and metrics on the pangenome -#### Organisms statistics +#### Statistics about genomes [//]: # (TODO change this subsection after merge of split_write) @@ -36,12 +36,12 @@ More information about this file can be found [here](../PangenomeAnalyses/pangen This file is a .tsv file, with a single parameter written as a comment at the beginning of the file, which indicates the proportion of genomes in which a gene family must be present more than once to be considered 'duplicated' (and not a single copy marker). This file lists the gene families, their duplication ratio, their mean presence in the pangenome and whether it is considered a 'single copy marker' or not, -which is particularly useful when calculating the completeness recorded in the [organisms statistics file](#organisms-statitics) described previously. +which is particularly useful when calculating the completeness recorded in the [genomes statistics file](#statistics-about-genomes) described previously. ### Figures #### U-shaped plot: gene families frequency distribution in pangenome -A U-shaped plot is a figure presenting the number of families (y-axis) per number of organisms (x-axis). +A U-shaped plot is a figure presenting the number of families (y-axis) per number of genomes (x-axis). It is a _.html_ file that can be opened with any browser and with which you can interact, zoom, move around, mouseover to see numbers in more detail, and you can save what you are seeing as a .png image file. @@ -52,19 +52,19 @@ Look at [here](../PangenomeAnalyses/pangenomeAnalyses.md#u-shape-plot) to change #### Tile plot: detect pangenome structure and outlier -A tile plot is a heatmap representing the gene families (y-axis) in the organisms (x-axis) making up your pangenome. -The tiles on the graph will be colored if the gene family is present in an organism and uncolored if absent. +A tile plot is a heatmap representing the gene families (y-axis) in the genomes (x-axis) making up your pangenome. +The tiles on the graph will be colored if the gene family is present in a genome and uncolored if absent. The gene families are ordered by partition, and the genomes are ordered by a hierarchical clustering based on their shared gene families (basically two genomes that are close together in terms of gene family composition will be close together on the figure). This plot is quite helpful to observe potential structures in your pangenome, and can also help you to identify eventual outliers. You can interact with it, and mousing over a tile in the plot will indicate to you which is the gene identifier(s), -the gene family and the organism that corresponds to the tile. +the gene family and the genome that corresponds to the tile. 
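If you want to regenerate or tune this figure outside the complete workflow, the `draw` subcommand can rebuild it from the pangenome file. This is a minimal sketch (the output directory name is illustrative):

```bash
# Redraw the tile plot from an existing pangenome file
# (sketch; '--nocloud' leaves out the cloud families to keep the figure light)
ppanggolin draw -p pangenome.h5 --tile_plot --nocloud -o figures
```
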
![tile_plot](../../_static/tutorial/tile_plot.png) [//]: # (TODO Explain the bar on the right side) -With the 'workflow' subcommand, if you have more than 500 organisms, only the 'shell' and the 'persistent' partitions will be drawn, leaving out the 'cloud' as the figure tends to be too heavy for a browser to open it otherwise. Look at [here](../PangenomeAnalyses/pangenomeAnalyses.md#tile-plot) to add the cloud if you need. +With the 'workflow' subcommand, if you have more than 500 genomes, only the 'shell' and the 'persistent' partitions will be drawn, leaving out the 'cloud' as the figure tends to be too heavy for a browser to open it otherwise. Look at [here](../PangenomeAnalyses/pangenomeAnalyses.md#tile-plot) to add the cloud if you need. ```{note} If you want the 'cloud' gene families even if a lot of data can be hard to open with a browser sometimes, From 5288445110806b72527742ca2dcb84fa7e368071 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 14:27:54 +0100 Subject: [PATCH 02/51] review RGP doc --- docs/user/RGP/rgpOutputs.md | 45 ++++++++++++++++++++++++---------- docs/user/RGP/rgpPrediction.md | 30 +++++++++++++---------- 2 files changed, 49 insertions(+), 26 deletions(-) diff --git a/docs/user/RGP/rgpOutputs.md b/docs/user/RGP/rgpOutputs.md index 4568368f..2aa255d6 100644 --- a/docs/user/RGP/rgpOutputs.md +++ b/docs/user/RGP/rgpOutputs.md @@ -2,10 +2,12 @@ ### RGP -The `plastic_regions.tsv` is a tsv file that lists all of the detected Regions of Genome Plasticity. This requires to have run the RGP detection analysis by either using the `panrgp` command or the `rgp` command. +The `regions_of_genomic_plasticity.tsv` is a tsv file that lists all the detected Regions of Genome Plasticity. This requires to have run the RGP detection analysis by either using the `panrgp` command or the `rgp` command. It can be written with the following command: -`ppanggolin write_pangenome -p pangenome.h5 --regions` +```bash +ppanggolin write_pangenome -p pangenome.h5 --regions -o rgp_outputs +``` The file has the following format : @@ -21,22 +23,27 @@ The file has the following format : ### Spots -The `spots.tsv` is a tsv file with two column. It links the spots of `summarize_spots.tsv` with the RGPs of `plastic_regions.tsv`. +The `spots.tsv` is a tsv file that links the spots in `summarize_spots.tsv` with the RGPs in `regions_of_genomic_plasticity.tsv`. -It is written with the following command: -`ppanggolin write_pangenome -p pangenome.h5 --spots` +It can be created with the following command: +```bash +ppanggolin write_pangenome -p pangenome.h5 --spots -o rgp_outputs +``` |Column|Description| |------|------------| |spot_id| The spot identifier (found in the 'spot' column of `summarize_spots.tsv`).| -|rgp_id| The RGP identifier (found in 'region' column of `plastic_regions.tsv`).| +|rgp_id| The RGP identifier (found in 'region' column of `regions_of_genomic_plasticity.tsv`).| ### Summarize spots The `summarize_spots.tsv` file is a tsv file that will associate each spot with multiple metrics that can indicate the dynamic of the spot. -It is written with the following command: -`ppanggolin write_pangenome -p pangenome.h5 --spots` +It can be created with the following command: + +```bash +ppanggolin write_pangenome -p pangenome.h5 --spots -o rgp_outputs +``` |Column|Description| |-------|------------| @@ -51,8 +58,11 @@ It is written with the following command: ### Borders -Each spot has at least one set of gene families bordering them. 
To write the list of gene families bordering a spot, you need to use the following option: -`ppanggolin write_pangenome -p pangenome.h5 --borders` +Each spot has at least one set of gene families bordering them. To write the list of gene families bordering spots, you can use the `--borders` option as follow: + +```bash +ppanggolin write_pangenome -p pangenome.h5 --borders -o rgp_outputs +``` It will write a .tsv file with 4 columns: @@ -66,17 +76,26 @@ It will write a .tsv file with 4 columns: As there can be some variation in the borders, some spots will have multiple borders and as such multiple lines in this file. The sum of the number for each spot_id should be exactly the number of RGPs in the spot. +The flag `--borders` also creates a file call `border_protein_genes.fasta` that are the protein sequences of the gene family found in borders. + +In addition, the `--borders` option also generates a file named `border_protein_genes.fasta`, containing protein sequences corresponding to the gene families of the spot borders. + ## Draw spots The `draw` command can draw specific spots of interest, whose ID are provided, or all the spots if you wish. It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself. The command can be used as such: -`ppanggolin draw -p pangenome.h5 --spots all` will draw an interactive `.html` figure and a `gexf` graph file for all the spots. +```bash +ppanggolin draw -p pangenome.h5 --spots all +``` +This command draws an interactive `.html` figure and a `.gexf` graph file for all the spots. -If you are interested in only a single spot, you can use its identifier to draw it, as such: +If you are interested in only a single spot, you can use its identifier to draw it. For example for the `spot_34`: -`ppanggolin draw -p pangenome.h5 --spots spot_34` for spot_34, for example. +```bash +ppanggolin draw -p pangenome.h5 --spots spot_34 +``` The interactive figures that are drawn look like this: diff --git a/docs/user/RGP/rgpPrediction.md b/docs/user/RGP/rgpPrediction.md index 17abb499..cfb812b0 100644 --- a/docs/user/RGP/rgpPrediction.md +++ b/docs/user/RGP/rgpPrediction.md @@ -1,28 +1,28 @@ ## Purpose -Regions of Genome Plasticity (RGPs) are clusters of genes made of shell and cloud genomes in the pangenome graph. Most of them arise from Horizontal gene transfer (HGT) and correspond to Genomic Islands (GIs). RGP from different genomes can be grouped in spots of insertion based on their conserved flanking persistent genes, rather than their gene content, to find out which are located in the same locations in the genome. The panRGP methods and its subcommands and subsequent output files are made to detect describe as thoroughly as possible those Regions of Genome Plasticity accross all genomes of the pangenome. +Regions of Genome Plasticity (RGPs) are clusters of genes made of shell and cloud genomes in the pangenome graph. Most of them arise from Horizontal gene transfer (HGT) and correspond to Genomic Islands (GIs). RGPs from different genomes can be grouped in spots of insertion based on their conserved flanking persistent genes, rather than their gene content, to find out which are located in the same locations in the genome. The panRGP methods and its subcommands and subsequent output files are made to detect describe as thoroughly as possible those Regions of Genome Plasticity across all genomes of the pangenome. 
Those methods were supported by the [panRGP publication](https://doi.org/10.1093/bioinformatics/btaa792) which can be read to have their methodological descriptions and justifications. ## PanRGP -This command works exactly like [workflow](./pangenomeAnalyses.md#workflow). The difference is that it will run additional analyses to characterize Regions of Genome Plasticity (RGP). +This command works exactly like [workflow](../PangenomeAnalyses/pangenomeAnalyses.md#workflow). The difference is that it will run additional analyses to characterize Regions of Genome Plasticity (RGP). You can use the `panrgp` with annotation (gff3 or gbff) files with `--anno` option, as such: -``` +```bash ppanggolin panrgp --anno organism.gbff.list ``` For fasta files, you need to use the alternative `--fasta` option, as such: -``` +```bash ppanggolin panrgp --fasta organism.fasta.list ``` -Just like [workflow](./pangenomeAnalyses.md#workflow), this command will deal with the [annotation](./pangenomeAnalyses.md#annotation), [clustering](./pangenomeAnalyses.md#clustering), [graph](./pangenomeAnalyses.md#graph) and [partition](./pangenomeAnalyses.md#partition) commands by itself. -Then, the RGP detection is ran using [rgp](./rgpAnalyses.md#rgp-detection) after the pangenome partitionning. Once all RGP have been computed, those found in similar genomic contexts in the genomes are gathered into spots of insertion used [spot](./rgpAnalyses.md#spot-prediction). +Just like [workflow](../PangenomeAnalyses/pangenomeAnalyses.md#workflow), this command will deal with the [annotation](../PangenomeAnalyses/pangenomeAnalyses.md#annotation), [clustering](../PangenomeAnalyses/pangenomeAnalyses.md#compute-pangenome-gene-families), [graph](../PangenomeAnalyses/pangenomeAnalyses.md#graph) and [partition](../PangenomeAnalyses/pangenomeAnalyses.md#partition) commands by itself. +Then, the RGP detection is ran using [rgp](#rgp-detection) after the pangenome partitionning. Once all RGP have been computed, those found in similar genomic contexts in the genomes are gathered into spots of insertion using [spot](#spot-prediction). -If you want to tune the rgp detection, you can use the `rgp` command after the `workflow` command. If you wish to tune the spot detection, you can use the `spot` command after the `rgp` command. +If you want to tune the rgp detection, you can use the `rgp` command after the `workflow` command. If you wish to tune the spot detection, you can use the `spot` command after the `rgp` command. Additionally, you have the option to utilize a configuration file to customize each detection within the `panrgp` command. More detail about RGP detection and the spot of insertion prediction can be found in the [panRGP publication](https://doi.org/10.1093/bioinformatics/btaa792) @@ -33,7 +33,9 @@ This subcommand's options are about tuning parameters for the analysis. You can do it as such: -`ppanggolin rgp -p pangenome.h5` +```bash +ppanggolin rgp -p pangenome.h5 +``` This will predict RGP and store results in the HDF5 file. @@ -43,15 +45,17 @@ Users looking to change those 3 parameters should consider reading the Materials The two other options are more straightforward. The `--min_length` will indicate the minimal size in base pair that a RGP should be to be predicted. The `--dup_margin` is a filter used to identify persistent gene families to consider as multigenic. Gene families that have more than one gene in more than `--dup_margin` genomes will be classified as multigenic, and as such considered as "variable" genes. 
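As a sketch of how these two options can be combined on the command line (the values below are purely illustrative, not recommended defaults):

```bash
# Predict RGPs, keeping only regions of at least 5 kb and using a custom duplication margin
# (illustrative values; --min_length is in base pairs, --dup_margin is a proportion of genomes)
ppanggolin rgp -p pangenome.h5 --min_length 5000 --dup_margin 0.1
```
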
-After this command is ran, a single output file that will list all of the predictions can be written, the [plastic_regions.tsv](./rgpAnalyses.md#rgp) file. +After this command is executed, a single output file that will list all of the predictions can be written, the [regions_of_genomic_plasticity.tsv](./rgpOutputs.md#rgp) file. ## Spot prediction -To study RGP that are found in the same area in different genomes, we gather them into 'spots of insertion'. Those spots are groups of RGP that do not necessarily have the same gene content but have similar bordering _persistent_ genes. We run those analyses to study the dynamic of gene turnover of large regions in bacterial genomes. In this way, spots of the same pangenome can be compared and their dynamic can be established by comparing their different metrics together. Those metrics are described in the [RGP and spot output section](rgpAnalyses.md#Spots). +To study RGP that are found in the same area in different genomes, we gather them into 'spots of insertion'. These spots are groups of RGP that do not necessarily have the same gene content but have similar bordering _persistent_ genes. We run those analyses to study the dynamic of gene turnover of large regions in bacterial genomes. In this way, spots of the same pangenome can be compared and their dynamic can be established by comparing their different metrics together. Detailed descriptions of these metrics can be found in the [RGP and spot output section](./rgpOutputs.md#rgp-outputs). Spots can be computed once RGP have been predicted. You can do that using: -`ppanggolin spot -p pangenome.h5` +```bash +ppanggolin spot -p pangenome.h5 +``` This command has 3 options that can change its results: @@ -64,6 +68,6 @@ The two other options are related to the 'spot graph' used to predict spots of i - `--spot_graph` writes the spot graph once predictions are computed - `--graph_formats` defines the format in which the spot graph is written. Can be either gexf or graphml. (default: gexf) -You can the use the dedicated subcommand [draw](./rgpAnalyses.md#draw-spots) to draw interactive figures for any given spot with the python library [bokeh](http://docs.bokeh.org/en/latest/). Those figures can can be visualized and modified directly in the browser. This plot is described [here](./rgpAnalyses.md#draw-spots) +You can the use the dedicated subcommand [draw](./rgpOutputs.md#draw-spots) to draw interactive figures for any given spot with the python library [bokeh](http://docs.bokeh.org/en/latest/). Those figures can can be visualized and modified directly in the browser. This plot is described [here](./rgpOutputs.md#draw-spots) -Multiple files can then be written describing the predicted spots and their linked RGP, such as a [file linking RGPs with their spots](./rgpAnalyses.md#Spots) and a [file showing multiple metrics for each spot](./rgpAnalyses.md#summarize-spots). +Multiple files can then be written describing the predicted spots and their linked RGP, such as a [file linking RGPs with their spots](./rgpOutputs.md#spots) and a [file showing multiple metrics for each spot](./rgpOutputs.md#summarize-spots). 
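To tie the options above together, here is a minimal sketch that predicts the spots and also writes the spot graph (the graphml choice is only an example):

```bash
# Predict spots of insertion and write the spot graph, here in graphml format
ppanggolin spot -p pangenome.h5 --spot_graph --graph_formats graphml
```
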
From af1eb06f35c77ed47d677c7e77e2b5d6d7dd1d65 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 14:46:29 +0100 Subject: [PATCH 03/51] fix draw spot commands example in doc and typo in draw spot error --- docs/user/RGP/rgpOutputs.md | 8 ++++---- ppanggolin/figures/drawing.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/user/RGP/rgpOutputs.md b/docs/user/RGP/rgpOutputs.md index 2aa255d6..47ce5520 100644 --- a/docs/user/RGP/rgpOutputs.md +++ b/docs/user/RGP/rgpOutputs.md @@ -73,7 +73,7 @@ It will write a .tsv file with 4 columns: |border1| Comma-separated list of gene families of the 1st border.| |border2| Comma-separated list of gene families of the 2nd border.| -As there can be some variation in the borders, some spots will have multiple borders and as such multiple lines in this file. +Since there can be some variation in the borders, some spots will have multiple borders and thus multiple lines in this file. The sum of the number for each spot_id should be exactly the number of RGPs in the spot. The flag `--borders` also creates a file call `border_protein_genes.fasta` that are the protein sequences of the gene family found in borders. @@ -82,19 +82,19 @@ In addition, the `--borders` option also generates a file named `border_protein_ ## Draw spots -The `draw` command can draw specific spots of interest, whose ID are provided, or all the spots if you wish. +The `draw` command with the option `--draw_spots` can draw specific spots of interest, whose ID are provided, or all the spots if you wish. It will also write a gexf file, which corresponds to the gene families and their organization within the spots. It is basically a subgraph of the pangenome, consisting of the spot itself. The command can be used as such: ```bash -ppanggolin draw -p pangenome.h5 --spots all +ppanggolin draw -p pangenome.h5 --draw_spots --spots all ``` This command draws an interactive `.html` figure and a `.gexf` graph file for all the spots. If you are interested in only a single spot, you can use its identifier to draw it. For example for the `spot_34`: ```bash -ppanggolin draw -p pangenome.h5 --spots spot_34 +ppanggolin draw -p pangenome.h5 --draw_spots --spots spot_34 ``` The interactive figures that are drawn look like this: diff --git a/ppanggolin/figures/drawing.py b/ppanggolin/figures/drawing.py index 2e09674f..9607e42b 100644 --- a/ppanggolin/figures/drawing.py +++ b/ppanggolin/figures/drawing.py @@ -27,7 +27,7 @@ def check_spot_args(args: argparse.Namespace): """ default_arg_spots = 'all' if not args.draw_spots and args.spots != default_arg_spots: - raise argparse.ArgumentError(None, "The --spots argument cannot be used when --draw-spots is not specified.") + raise argparse.ArgumentError(None, "The --spots argument cannot be used when --draw_spots is not specified.") def launch(args: argparse.Namespace): From 853df52c2b36792d9053335ab0c00098ac0fef0a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 15:02:45 +0100 Subject: [PATCH 04/51] Review of partition doc --- .../PangenomeAnalyses/pangenomePartition.md | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/user/PangenomeAnalyses/pangenomePartition.md b/docs/user/PangenomeAnalyses/pangenomePartition.md index 46012255..9791bda8 100644 --- a/docs/user/PangenomeAnalyses/pangenomePartition.md +++ b/docs/user/PangenomeAnalyses/pangenomePartition.md @@ -1,12 +1,18 @@ This step will assign gene families either to the 'persistent', 'shell', or 'cloud' partitions. 
-The 'persistent' partition includes gene families with genes commonly found throughout the species +- **'Persistent' Partition** + +The 'persistent' partition includes gene families with genes commonly found throughout the species. It corresponds to essential genes, genes required for important metabolic pathways and genes that define the metabolic and biosynthetic capabilities of the taxonomic group. +- **'Shell' Partition** + The 'shell' partition includes gene families with genes found in some individuals. These genes, frequently acquired via horizontal gene transfers, typically encode functions related to environmental adaptation, pathogenicity, virulence, or the production of secondary metabolites +- **'Cloud' Partition** + The 'cloud' partition includes gene families with rare genes found in one, or very few, individuals. Most of the genes were associated with phage-related genes. They probably all were acquired through horizontal gene transfers. @@ -14,20 +20,22 @@ Antibiotic resistance genes were often found to be belonging to the cloud genome It can be realized through the following subcommand : -`ppanggolin partition -p pangenome.h5` +```bash +ppanggolin partition -p pangenome.h5 +``` The command also has quite a few options. Most of them are not self-explanatory. If you want to know what they do, you should read the PPanGGOLiN paper (you can read it [here](https://journals.plos.org/ploscompbiol/article?rev=2&id=10.1371/journal.pcbi.1007732)) where the statistical methods used are thoroughly described. -The one parameter that might be of importance is the '-K', or '--nb_of_partitions' parameter. +The one parameter that might be of importance is the `-K`, or `--nb_of_partitions` parameter. This will define the number of classes (`K`) used to partition the pangenome. If you anticipate well-defined subpopulations within your pangenome and know their exact number, this approach can be particularly useful. For metagenome-assembled genomes (MAGs), which are often incomplete, it is typically advised to set a fixed value of K=3. If the number of subpopulations is unknown, it will be automatically determined using the ICL criterion. The idea is that the most present partition will be 'persistent', the least present will be 'cloud', and all the others will be 'shell'. The number of partitions corresponding to the shell will be the number of expected subpopulations in your pangenome. -(for instance, if you expect 5 subpopulations, you could use -K 7). +(for instance, if you expect 5 subpopulations, you could use `-K 7`). In most cases, you should let the statistical criterion used by PPanGGOLiN find the optimal number of partitions for you. -All the results will be added to the given 'pangenome.h5' input file. +All the results will be added to the given `pangenome.h5` input file. 
From d585651156658aad027cd5fdbbe5e4eca561ba0b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 15:07:43 +0100 Subject: [PATCH 05/51] add doc for info command --- docs/user/PangenomeAnalyses/pangenomeInfo.md | 46 ++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 docs/user/PangenomeAnalyses/pangenomeInfo.md diff --git a/docs/user/PangenomeAnalyses/pangenomeInfo.md b/docs/user/PangenomeAnalyses/pangenomeInfo.md new file mode 100644 index 00000000..587f99b0 --- /dev/null +++ b/docs/user/PangenomeAnalyses/pangenomeInfo.md @@ -0,0 +1,46 @@ + + + +The `info` command in PPanGGOLiN enables users to acquire comprehensive insights into the contents and construction process of a pangenome file. + +Different types of information can be displayed using various parameters, such as `--status`, `--parameters`, `--content`, and `--metadata`. When no flag is specified, all available outputs are displayed. + +```bash +ppanggolin info -p pangenome.h5 +``` + +The `info` command generates information in YAML format, displaying the results directly in the standard output without writing to any files. + +#### Overview of `info --content` Output + +The `--content` option with the `info` command exhibits general statistical data about your pangenome: + +1. **General Metrics**: + - Presents a comprehensive count of genes, genomes, gene families, and edges within the pangenome graph. + +2. **Partitioned Metrics**: + - Provides detailed information for each partition, including gene counts and presence thresholds for persistent, shell, and cloud families among genomes. + +3. **Number of Partitions**: + - Indicates the total count of partitions in the pangenome. + +4. **Genome Fluidity**: + - If computed through the 'metrics' command, displays genome fluidity values across all partitions and within each partition. + +5. **Regions of Genomic Plasticity (RGPs)**: + - Exhibits counts of Regions of Genomic Plasticity (RGPs) and spots of insertion if predicted using commands such as 'all', 'panrgp', 'rgp', or 'spot'. + +6. **Modules**: + - Shows counts of modules and associated gene families if predicted using commands like 'module', 'panmodule', or 'all'. Additionally, provides partition composition percentages for these modules. + +#### Overview of `info --parameters` Output + +This option displays the PPanGGOLiN parameters used at each analysis step. The output can be utilized as a configuration file for other PPanGGOLiN commands to replicate the same parameters. Refer [here](../practicalInformation.md#configuration-file) for more details on the configuration file . + +#### Overview of `info --status` Output + +Using this option, users can check what analysis have been conducted to obtain the pangenome file. + +#### Overview of `info --metadata` Output + +When metadata has been added to the pangenome elements, this option showcases which elements possess metadata and their respective sources. Find more details on metadata [here](../metadata.md). 
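As a usage sketch, and assuming the flags described above can be combined, you can restrict the output to the sections you are interested in:

```bash
# Print only the content summary and the parameters used to build this pangenome
ppanggolin info -p pangenome.h5 --content --parameters
```
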
From af1c254c463e705edf96f51ab3fa97d2dd955862 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 15:52:52 +0100 Subject: [PATCH 06/51] rm old doc files and clean index.md --- docs/index.md | 5 +- docs/tutorial/AnalyseRGP.md | 17 -- .../Basic-usage-and-practical-information.md | 217 ------------------ docs/tutorial/inputData.md | 42 ---- docs/tutorial/prepEnv.md | 64 ------ docs/tutorial/workflows.md | 105 --------- docs/user/Flat/dupplication.md | 8 - docs/user/Flat/fam2gen.md | 7 - docs/user/Flat/genomes_fasta.md | 11 - docs/user/Flat/genomes_metadata.md | 56 ----- docs/user/Flat/gff.md | 48 ---- docs/user/Flat/metrics.md | 35 --- docs/user/Flat/partition.md | 5 - docs/user/Flat/presAbs.md | 16 -- docs/user/Flat/proksee.md | 31 --- docs/user/Flat/tables.md | 35 --- docs/user/Outputs.md | 110 --------- 17 files changed, 2 insertions(+), 810 deletions(-) delete mode 100644 docs/tutorial/AnalyseRGP.md delete mode 100644 docs/tutorial/Basic-usage-and-practical-information.md delete mode 100644 docs/tutorial/inputData.md delete mode 100644 docs/tutorial/prepEnv.md delete mode 100644 docs/tutorial/workflows.md delete mode 100644 docs/user/Flat/dupplication.md delete mode 100644 docs/user/Flat/fam2gen.md delete mode 100644 docs/user/Flat/genomes_fasta.md delete mode 100644 docs/user/Flat/genomes_metadata.md delete mode 100644 docs/user/Flat/gff.md delete mode 100644 docs/user/Flat/metrics.md delete mode 100644 docs/user/Flat/partition.md delete mode 100644 docs/user/Flat/presAbs.md delete mode 100644 docs/user/Flat/proksee.md delete mode 100644 docs/user/Flat/tables.md delete mode 100644 docs/user/Outputs.md diff --git a/docs/index.md b/docs/index.md index 166850b1..84d563f7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,7 +19,7 @@ ```{image} _static/logo.png :alt: ppangolin logo :align: center -:heigth: 350 +:height: 350 :width: 437 ``` @@ -67,7 +67,6 @@ Those RGPs can be further divided in conserved modules by panModule ([Bazin et a :caption: 'User Guide:' :maxdepth: 2 -user/introduction user/install user/QuickUsage/quickAnalyses user/practicalInformation @@ -93,7 +92,7 @@ dev/workflows dev/buildDoc ``` -# Indices and tables +# API Reference [//]: # (- {ref}`ppanggolin package`) - {ref}`genindex` diff --git a/docs/tutorial/AnalyseRGP.md b/docs/tutorial/AnalyseRGP.md deleted file mode 100644 index 5e085392..00000000 --- a/docs/tutorial/AnalyseRGP.md +++ /dev/null @@ -1,17 +0,0 @@ -# Region of genomic plasticity prediction -## Predict the region of genomic plasticity from the partitioned pangenome graph - -"Regions of genome plasticity (**RGPs**) are clusters of genes located in highly variable genomic regions. Most of them arise from HGT and correspond to genomic islands (GIs)." [Bazin et al. 2020](https://doi.org/10.1093/bioinformatics/btaa792) - -We are going to start from the partitioned pangenome graph build in the [previous step](#build-and-partition-a-pangenome-graph). - -You can predict the RGP with PPanGGOLiN with this command: - -``` -ppanggolin rgp -p B_janonicum_results/pangenome.h5 -``` - -As explain before, PPanGGOLiN will predict and store the RGPs in the pangenome file. 
Follow the - -## Analyse RGPs predicted by PPanGGOLiN -### Get the list of RGPs predicted by PPanGGOLiN \ No newline at end of file diff --git a/docs/tutorial/Basic-usage-and-practical-information.md b/docs/tutorial/Basic-usage-and-practical-information.md deleted file mode 100644 index 08c41332..00000000 --- a/docs/tutorial/Basic-usage-and-practical-information.md +++ /dev/null @@ -1,217 +0,0 @@ -(basic)= -# Basic usage and pratical information - -## The 'workflow' subcommand - -We tried to make PPanGGOLiN relatively easy to use by making this **'workflow'** subcommand. -It runs a pangenome analysis whose exact steps will depend on the input files you provide it with. -In the end, you will end up with some files and figures that describe the pangenome of your taxonomic group of interest in different ways. - -The minimal subcommand is as follows : - -``` -ppanggolin workflow --fasta ORGANISMS_FASTA_LIST -``` - -It uses parameters that we found to be generally the best when working with species pangenomes. - -The file ORGANISMS_FASTA_LIST is a tsv-separated file with the following organisation : - -1. The first column contains a unique organism name -2. The second column the path to the associated FASTA file -3. Circular contig identifiers are indicated in the following columns -4. Each line represents an organism - - -An [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.fasta.list) with 50 *Chlamydia trachomatis* genomes can be found in the testingDataset/ directory. - -You can also give PPanGGOLiN your own annotations using .gff or .gbff/.gbk files instead of .fasta files as long as they include the genomic dna sequences, such as the ones provided by prokka using the following command : - -``` -ppanggolin workflow --anno ORGANISMS_ANNOTATION_LIST -``` - -Another [example](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/organisms.gbff.list) of such a file can be found in the testingDataset/ directory. - - -```{note} -Look at the **annotate** command documentation for more information [here](#annotation) -``` - -In addition, you can provide your own gene families. -PPanGGOLiN will use it to build and partition the pangenome graph. -You can do that through the command line : - -``` -ppanggolin workflow --fasta ORGANISMS_FASTA_LIST --anno ORGANISMS_ANNOTATION_LIST --clusters MY_CLUSTERS_FILE -``` - -An example of what MY_CLUSTERS_FILE should look like is provided [here](https://github.com/labgem/PPanGGOLiN/blob/master/testingDataset/clusters.tsv) - -Whether you use fasta or annotations, the workflow command options are the same. - -| name | alias | default | type / choices | description | -|---------------------|-------|---------------------------------|--------------------|-------------------------------------------------------------------------------------------------------------------------------| -| --output | -o | ppanggolin_output_DATE_HOUR_PID | Path | Output directory to save the pangenome and all the output files | -| --basename | | pangenome | string | basename for the pangenome file | -| --rarefaction | | False | bool | Use to compute the rarefaction curves (WARNING: can be time consuming) | -| --cpu | -c | 1 | integer | Number of available cpus | -| --translation_table | | 11 | integer | Translation table (genetic code) to use | -| --kingdom | | bacteria | {bacteria,archaea} | Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation | -| --mode | | 1 | {0,1,2,3} | the cluster mode of MMseqs2. 
0: Setcover, 1: single linkage (or connected component), 2: CD-HIT-like, 3: CD-HIT-like (lowmem) | -| --coverage | | 0.8 | 0<=float<=1 | Minimal coverage of the alignment for two proteins to be in the same cluster | -| --identity | | 0.8 | 0<=float<=1 | Minimal identity percent for two proteins to be in the same cluster | -| --nb_of_partitions | -K | -1 | integer | Number of partitions to use. Must be at least 2. If under 2, it will be detected automatically | -| --no_defrag | | False | bool | DO NOT Realign gene families to link fragments with their non-fragmented gene family | -| --no_flat_files | | False | bool | Generate only the HDF5 pangenome file | -| --tmpdir | | TMPDIR | Path | directory for storing temporary files | - -(panrgp)= -## The 'panrgp' subcommand - -This command works exactly like 'workflow'. The difference is that it will run more analysis related to [Regions of Genome Plasticity](#RGP-section). -You can use the panrgp command as follow: - -```bash -ppanggolin panrgp --fasta ORGANISMS_FASTA_LIST -``` - -The rgp analysis is launched after the pangenome partitionning and use the default parameters. -If you want to tune the rgp detection, you can use the `rgp` command after the `workflow` command. - - -More detail about RGP detection [here](#RGP-section) and in the [panRGP publication](https://doi.org/10.1093/bioinformatics/btaa792) - -(panmodule)= -## The 'panmodule' subcommand - -Again, it works like 'workflow' but you can detect the conserved modules in your pangenome, you can use the **panModule** workflow, as such: - -```bash -ppanggolin panmodule --fasta ORGANISMS_FASTA_LIST -``` - -The module prediction is launched after the pangenome partitionning with the default parameters. -If you want to tune the module detection, you can use the `module` command after the `workflow`. - - -Further details can be found in the [conserved module analysis documentation](#module-section) and in the [panModule publication](https://doi.org/10.1101/2021.12.06.471380) - -## Run all PPanGGOLiN analysis - -Finally it's also possible to run all analysis with one command wrapper `all`. -With this workflow, the pangenome will be built and partionned and RGP, spots and module will be predicted. -You can run all the analysis as such: - -```bash -ppanggolin all --fasta ORGANISMS_FASTA_LIST -``` - -## Configuration file - -Advanced users can provide a configuration file containing any or all parameters to PPanGGolin commands. -This feature is particularly useful for workflow commands such as `workflow`, `all`, `panrgp`, and `panmodule`, as it allows for the specification of all parameters for each subcommand launched in a workflow. -Additionally, a configuration file can be used to reuse a specific set of parameters across multiple pangenomes. - -To provide a configuration file to a PPanGGolin command, use the `--config` parameter. - -```{note} -Any command line arguments provided along with a configuration file will override the corresponding arguments specified in the configuration file. -When an argument is not specified in either the command line or the configuration file, the default value is used. -``` - -The configuration file is a JSON file that contains two sections common to all commands: `input_parameters` and `general_parameters`. -In addition, there is a section for each subcommand that contains its specific parameters. 
- -You can generate a configuration file template with default values by using the `ppanggolin utils` command as follows: - -``` -ppanggolin utils --default_config CMD -``` - -For example, to generate a configuration file for the panrgp command with default values, use the command -``` -ppanggolin utils --default_config panrgp -``` - - This command will create the following configuration file: - -```yaml -input_parameters: - # A tab-separated file listing the organism names, and the fasta filepath of its - # genomic sequence(s) (the fastas can be compressed with gzip). One line per organism. - # fasta: - # A tab-separated file listing the organism names, and the gff/gbff filepath of - # its annotations (the files can be compressed with gzip). One line - # per organism. If this is provided, those annotations will be used. - # anno: - -general_parameters: - # Output directory - output: ppanggolin_output_DATE2023-04-14_HOUR10.09.27_PID14968 - # basename for the output file - basename: pangenome - # directory for storing temporary files - tmpdir: /tmp - # Indicate verbose level (0 for warning and errors only, 1 for info, 2 for debug) - # Choices: 0, 1, 2 - verbose: 1 - # log output file - log: stdout - # disables the progress bars - disable_prog_bar: False - # Force writing in output directory and in pangenome output file. - force: False - -annotate: - # Use to not remove genes overlapping with RNA features. - allow_overlap: False - # Use to avoid annotating RNA features. - norna: False - # Kingdom to which the prokaryota belongs to, to know which models to use for rRNA annotation. - # Choices: bacteria, archaea - kingdom: bacteria - # Translation table (genetic code) to use. - translation_table: 11 - # In the context of provided annotation, use this option to read pseudogenes. (Default behavior is to ignore them) - use_pseudo: False - # Allow to force the prodigal procedure. If nothing given, PPanGGOLiN will decide in function of contig length - # Choices: single, meta - prodigal_procedure: False - # Number of available cpus - cpu: 1 -``` - -## Required computing resources - -Most of PPanGGOLiN's commands should be run with as many CPUs as you can give them by using the --cpu option as PPanGGOLiN's speed increases relatively well with the number of CPUs. -While the 'smallest' pangenomes (up to a few hundred genomes) can be easily analyzed on a normal desktop computer, -the biggest ones will require a good amount of RAM. -For example, 40 strains of *E. coli* were analyzed in 3 minutes using 1.2Go of RAM using 16 threads. -1000 strains were analyzed in 45 minutes with 14 Go of RAM using 16 threads, and as of writing those lines, -20 656 genomes was the biggest pangenome we did, and it required about a day and 120 Go of RAM. -The following graphic can give you an idea of the time it takes for a pangenome analysis given the number of genomes in input: - -```{image} ../_static/runtimes.png -:align: center -``` - -## Usage and basic options - -As most programs in bioinformatics, you can always specify some utility options. - -You can specify the number of CPUs to use (which is recommended ! The default is to use just one) using the option `--cpu`. - -You can specify the output directory (if not provided, one will be generated) using the option `--output`. 
- -If you work in a strange environment that has no, or little available disk space in the '/tmp' (or your system equivalent, what is stored in TMPDIR) directory, you can specify a new temporary directory using `--tmp` - -If you want to redo an analysis from scratch and store it in a directory that already exists, you will have to use the `--force` option. -Be wary, however, that the data in that directory will be overwritten if named identically as any output file written by ppanggolin. - -PPanGGOLiN is deliberately very verbose, to help users understand each stage of the analysis. -If you want, verbosity can be reduced in several ways. -First, you can specify the verbosity level with the `--verbose` option. -With `0` will show only warning and erros, `1` will add the information (default value), and if you encounter any problem you can use the debug level with value `2`. -Then you can also remove the progress bar with the option `--disable_prog_bar` -Finaly, you can also save PPanGGOLiN logs in a file by specified its path with the option `--log`. \ No newline at end of file diff --git a/docs/tutorial/inputData.md b/docs/tutorial/inputData.md deleted file mode 100644 index a8a2d35c..00000000 --- a/docs/tutorial/inputData.md +++ /dev/null @@ -1,42 +0,0 @@ -# How to prepare your data for PPanGGOLiN - -To build and partition a pangenome, PPanGGOLiN need a set of either DNA sequences or provided genome annotations. In order to help you to start with PPanGGOLiN, you can follow this step to download some genomes from _Bradyrhizobium japonicum_. These genomes will be our baseline all along the tutorial. If you already have your genome you can directly go to [the input file creation](#create-your-list-of-genomes-file) - -## Get B._japonicum_ genomics data -### Genomes from GTDB taxonomy - -To obtain the genomes of B. _japonicum_ from the [GTDB database](https://gtdb.ecogenomic.org/), you must use the name of the species in GTDB. - -``` -genome_updater.sh -d "refseq,genbank" -f "genomic.gbff.gz" -o "B_japonicum_genomes" -M "gtdb" -T "s__Bradyrhizobium japonicum" -``` - -### Genomes from NCBI taxonomy - -To obtain the genomes of B. _japonicum_ from the [NCBI](https://www.ncbi.nlm.nih.gov/), you must use its taxonomic ID. - -``` -genome_updater.sh -d "refseq,genbank" -f "genomic.gbff.gz" -o "B_japonicum_genomes" -M "ncbi" -T "375" -``` - -## Create your list of genomes file - -PPanGGOLiN use the list of genomes as input for some command, such as the workflow. -The file is a tsv-separated file with the following organisation : - -1. The first column contains a unique organism name -2. The second column the path to the associated annotated file -3. Each line represents an organism - -```{note} -It's also possible to use fasta file as input. -Look at the documentation. -``` - -If you are using the annotated genomes (*GBFF*, *GFF*, *GBK*), you can generate your file with the following command - -``` -for file in $(ls B_japonicum_genomes/*/files/*.gz);do genome=$(echo $file | cut -d'/' -f4 | cut -d'_' -f1-3); echo -e "$genome\t$file"; done > organism_gbff.list -``` - -**You're now ready to build the pangenome !!!** \ No newline at end of file diff --git a/docs/tutorial/prepEnv.md b/docs/tutorial/prepEnv.md deleted file mode 100644 index fbacb470..00000000 --- a/docs/tutorial/prepEnv.md +++ /dev/null @@ -1,64 +0,0 @@ -# How to prepare your working environment - -In order to work with PPanGGOLiN and to follow the tutorial we recommend to follow the next steps. 
- -## Create a conda environment - -The first step consist of creating a conda environment to install PPanGGOLiN and its dependencies. -You can look [here](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) to know how install conda on your system. - -You can create tour environment as such: -```shell -conda create --name ppanggo -``` - -More information on how to create a conda environment [here](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#activating-an-environment) - - -## Install PPanGGOLiN - -To install PPanGGOLiN in your conda environment you need to add 3 channels as follows: - -``` -conda config --add channels defaults ; -conda config --add channels bioconda ; -conda config --add channels conda-forge ; -``` - -Then you can just run: - -``` -conda install -c bioconda ppanggolin -``` - -This command will automatically install PPanGGOLiN dependencies. -If you want more information or other methods to install PPanGGOLiN look at the documentation [here](../user/Installation.md#Installation) - -## Install tutorial dependencies - -As part of the tutorial we will also install some other software and packages. - -### Download genomes - -To download our genomes, we are going to use [genome_updater](https://github.com/pirovc/genome_updater). -Other solution exist such as [ncbi genome downloading scripts](https://github.com/kblin/ncbi-genome-download). Feel free to use the best and easiest way for you. - -To install genome updater you can use the following command: `conda install -c bioconda genome_updater` - -To install ncbi genome downloader you can use the following command: `conda install -c bioconda ncbi-genome-download` - -### Graph visualisation - -To visualise the pangenome graph you can use [Gephi](https://gephi.org/). To install it you must download the archive [here](https://gephi.org/users/download/). Then you can follow the next command: - -```shell -tar -xvzf path/to/gephi/archive/gephi-X.XX.X-linux-x64.tar.gz -cd gephi-X.XX.X -chmod 755 bin/gephi -./bin/gephi -``` - -```{tip} -Gephi is also available on flathub and snapcraft. -``` -[//]: # (Other depencies can be added here) \ No newline at end of file diff --git a/docs/tutorial/workflows.md b/docs/tutorial/workflows.md deleted file mode 100644 index 017e9d08..00000000 --- a/docs/tutorial/workflows.md +++ /dev/null @@ -1,105 +0,0 @@ -# Build and partition a pangenome graph - -## How to build and partition a pangenome graph in one command line with PPanGGOLiN - -Now that you have your list of genome as described [here](./inputData#create-your-list-of-genomes-file) you can build the pangenome of _B.japonicum_. - -We tried to make PPanGGOLiN relatively easy to use by making this **'workflow'** subcommand. -It runs a pangenome analysis whose exact steps will depend on the input files you provide it with. -In the end, you will end up with some files and figures that describe the pangenome of your taxonomic group of interest in different ways. - -The minimal subcommand is as follows : - -``` -ppanggolin workflow --anno organism_gbff.list -o B_janonicum_results -``` - -Congratulation, you build (maybe your first) pangenome graph and partitioned it in 3 different partition: **persistent**, **shell** and **cloud** (look at ([Gautreau et al. 2020](https://doi.org/10.1371/journal.pcbi.1007732)) for more information about partition) - -The results of the workflow is saved in the **pangenome.h5** file, which is in the HDF-5 file format. 
-When you run an analysis using this file as input, the results of that analysis will be added to the file to supplement the data that are already stored in it. -The idea behind this is that you can store and manipulate your pangenome with PPanGGOLiN by using this file only. It will keep all the information about what was done, all the parameters used, and all the pangenome's content. - - -```{tip} -Many option are available to tune your analysis. Take a look here. -``` - -## How to analyse the pangenome graph with PPanGGOLiN - -The workflow subcommand generate automatically some outputs, we are going to describe some of them that are useful and classic in pangenome analyses - -### Statistics and metrics on the pangenome -#### Organisms statitics - - -PPanGGOLiN can generate a tab-separated file describing the content of each of the genome used for building the pangenome. -It might be useful when working with fragmented data such as *MAGs* or if you suspect some of your genomes to be chimeric, -or to not belong to your taxonomic group (as those genomes will be outliers regarding to the numbers in this file). -The first lines starting with a '#' are indicators of parameters used when generating the numbers describing each organisms, and should not be read if loading this into a spreadsheet. They will be skipped automatically if you load this file with R. - -This file is made of 15 columns described in the documentation here. - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write -p pangenome.h5 --stats` - -```{note} -This command will also generate the 'mean_persistent_duplication.tsv' file. -``` - -### U-shaped plot: gene families frequency distribution in pangenome - -A U-shaped plot is a figure presenting the number of families (y-axis) per number of organisms (x-axis). -It is a .html file that can be opened with any browser and with which you can interact, zoom, move around, -mouseover to see numbers in more detail, and you can save what you are seeing as a .png image file. - -![U-shaped plot _B.japonicum_](../_static/tutorial/U-shape.gif) - -A dotted grey bar on the graph representing the **soft core threshold** which is the lower limit for which families are present in the majority of genomes. By default this value is 95% (so families are in more than 95 genomes). - -You can change this value as such: - -``` -ppanggolin draw -p pangenome.h5 -o . --ucurve --soft_core 0.8 -f -``` - -### Tile plot: detect pangenome structure and outlier -A tile plot is a heatmap representing the gene families (y axis) in the organisms (x axis) making up your pangenome. -The tiles on the graph will be colored if the gene family is present in an organism and uncolored if absent. -The gene families are ordered by partition, and the genomes are ordered by a hierarchical clustering based on their shared gene families (basically two genomes that are close together in terms of gene family composition will be close together on the figure). - -This plot is quite helpful to observe potential structures in your pangenome, and can also help you to identify eventual outliers. -You can interact with it, and mousing over a tile in the plot will indicate to you which is the gene identifier(s), -the gene family and the organism that corresponds to the tile. 
- -![tile_plot](../_static/tutorial/tile_plot.png) - -[//]: # (Explain the bar on the right side) - -If you do not want the 'cloud' gene families as it is a lot of data and can be hard to open with a browser sometimes, -you can use the following option: - -`ppanggolin draw -p pangenome.h5 --tile_plot --nocloud` - -```{note} -If you build your pangenome using the 'workflow' subcommand and you have more than 500 organisms, only the 'shell' and the 'persistent' partitions will be drawn, leaving out the 'cloud' as the figure tends to be too heavy for a browser to open it otherwise. -``` - -### Rarefaction curve: indicator of the taxonomic group diversity - -The rarefaction curve represents the evolution of the number of gene families for each partition as you add more genomes to the pangenome. -It has been used a lot in the literature as an indicator of the diversity that you are missing with your dataset on your taxonomic group. -The idea is that if at some point when you keep adding genomes to your pangenome you do not add any more gene families, -you might have access to your entire taxonomic group's diversity. -On the contrary if you are still adding a lot of genes you may be still missing a lot of gene families. - -There are 8 partitions represented. For each of the partitions there are multiple representations of the observed data. -You can find the observed: *means*, *medians*, *1st* and *3rd quartiles* of the number of gene families per number of genome used. -And you can find the *fitting* of the data by the **Heaps' law**, which is usually used to represent this evolution of the diversity in terms of gene families in each of the partitions. - -It can be generated using the 'rarefaction' subcommand, which is dedicated to drawing this graph, as such : - -`ppanggolin rarefaction -p pangenome.h5` - -A lot of options can be used with this subcommand to tune your rarefaction curves, look at the documentation here. \ No newline at end of file diff --git a/docs/user/Flat/dupplication.md b/docs/user/Flat/dupplication.md deleted file mode 100644 index e7bbd479..00000000 --- a/docs/user/Flat/dupplication.md +++ /dev/null @@ -1,8 +0,0 @@ -This file is a .tsv file, with a single parameter written as a comment at the beginning of the file, which indicates the proportion of genomes in which a gene family must be present more than once to be considered 'duplicated' (and not single copy marker). -This file lists the gene families, their duplication ratio, their mean presence in the pangenome and whether it is considered a 'single copy marker' or not, which is particularly useful when calculating the completeness recorded in the [organisms statistics](https://github.com/labgem/PPanGGOLiN/wiki/Outputs#organisms-statistics) file described previously. - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write_pangenome -p pangenome.h5 --stats` - -This command will also generate the 'organisms_statistics.tsv' file. diff --git a/docs/user/Flat/fam2gen.md b/docs/user/Flat/fam2gen.md deleted file mode 100644 index 8b3bc861..00000000 --- a/docs/user/Flat/fam2gen.md +++ /dev/null @@ -1,7 +0,0 @@ - -You can write a list containing the gene family assigned to every single gene of your pangenome, in a file format extactly like the one provided by [MMseqs2](https://github.com/soedinglab/MMseqs2) through its subcommand 'createtsv'. -It is basically a three-column file listing the gene family name in the first column, and the gene names in the second. 
A third column is either empty, or has an "F" in it. In that case it indicates that the gene is potentially a gene fragment and not complete. This will be indicated only if the [defragmentation](https://github.com/labgem/PPanGGOLiN/wiki/PPanGGOLiN---step-by-step-pangenome-analysis#defragmentation) pipeline is used. - -You can obtain it as such : - -`ppanggolin write_pangenome -p pangenome.h5 --families_tsv` \ No newline at end of file diff --git a/docs/user/Flat/genomes_fasta.md b/docs/user/Flat/genomes_fasta.md deleted file mode 100644 index acdae0d7..00000000 --- a/docs/user/Flat/genomes_fasta.md +++ /dev/null @@ -1,11 +0,0 @@ - - -PPanGGOLiN allows the incorporation of fasta sequences into GFF files and proksee JSON map files. This integration with Proksee provides access to various tools that rely on DNA sequences, including the construction of GC% and GC skew profiles, and conducting blast searches for example. - - -Since PPanGGOLiN does not retain genomic sequences, it is necessary to provide the original genomic files used to construct the pangenome through either the `--anno` or `--fasta` argument. These arguments mirror those used in workflow commands (`workflow`, `all`, `panrgp`, `panmodule`) and the `annotate` command. - -- `--anno`: This option requires a tab-separated file containing organism names and the corresponding GFF/GBFF filepaths of their annotations. If `--anno` is utilized, GFF files should include fasta sequences. - -- `--fasta`: Use this option with a tab-separated file that lists organism names alongside the filepaths of their genomic sequences in fasta format. - diff --git a/docs/user/Flat/genomes_metadata.md b/docs/user/Flat/genomes_metadata.md deleted file mode 100644 index 3d74ee8f..00000000 --- a/docs/user/Flat/genomes_metadata.md +++ /dev/null @@ -1,56 +0,0 @@ - - -You can inject metadata, previously added with the `metadata` command, into genome outputs using the `--add_metadata` parameter. When users add metadata, they specify the source of this metadata. These metadata sources can be selectively included using the `--metadata_sources` parameter. By default, all sources are added when the `--add_metadata` flag is specified. - -#### Metadata in GFF Files - -Metadata is integrated into the attributes column of the GFF file. The patterns for adding metadata are as follows: - -- In CDS lines, metadata associated with genes follow this pattern: `gene__=`. Gene family metadata follows a similar pattern: `gene__=`. -- In the contig lines of type `region` describing the contig, genome metadata is added with the pattern: `genome__=`, and contig metadata is added with: `contig__=`. -- In RGP lines, metadata is added using the pattern: `rpg__=`. - -For example, if we associate metadata is associated with the gene family DYB08_RS16060 with the source `pfam`: - -```tsv -families accession type description -DYB08_RS16060 PF18894 domain This entry represents a probable metallopeptidase domain found in a variety of phage and bacterial proteomes. 
-``` - -This metadata file can be added to the pangenome with the metadata command: - -```bash -ppanggolin metadata -p pangenome.h5 --source pfam --metadata family_pfam_annotation.tsv --assign families -``` - -When writing GFF output with the `--add_metadata` flag: - -```bash -ppanggolin write_genomes -p pangenome.h5 --proksee -o proksee_out --gff --add_metadata -``` - -A gene belonging to this family would have the following attribute in its GFF line: `family_pfam_accession=PF18894;family_pfam_description=This entry represents a probable metallopeptidase domain found in a variety of phage and bacterial proteomes.;family_pfam_type=domain`. - -```gff -NC_010404.1 external CDS 77317 77958 . - 0 ID=ABAYE_RS00475;Parent=gene-ABAYE_RS00475;product=putative metallopeptidase;family=DYB08_RS16060;partition=persistent;rgp=NC_010404.1_RGP_0;family_pfam_accession=PF18894;family_pfam_description=This entry represents a probable metallopeptidase domain found in a variety of phage and bacterial proteomes.;family_pfam_type=domain -``` - -#### Metadata in Proksee Visualization - -Metadata can be seamlessly incorporated into Proksee JSON MAP files. These metadata details become accessible by simply hovering the mouse over the features. - -For instance, with the metadata previously added to the DYB08_RS16060 gene family, the Proksee visualization would resemble the example below: - -```{image} ../_static/proksee_metadata_example.png -:align: center -``` - - -#### Metadata in Table output - -Metadata is seamlessly incorporated into table output with the addition of extra columns. These columns follow the GFF attribute naming: - -- gene metadata: `gene__` -- family metadata: `gene__` - - diff --git a/docs/user/Flat/gff.md b/docs/user/Flat/gff.md deleted file mode 100644 index 03d72d5d..00000000 --- a/docs/user/Flat/gff.md +++ /dev/null @@ -1,48 +0,0 @@ - -The `--gff` argument generates GFF files, each containing pangenome annotations for individual genomes within the pangenome. The GFF file format is a widely recognized standard in bioinformatics and can seamlessly integrate into downstream analysis tools. - -To generate GFF files from a pangenome HDF5 file, you can use the following command: - -```bash -ppanggolin write_genomes -p pangenome.h5 --gff -o output -``` - -This command will create a gff directory within the output directory, with one GFF file per genome. - -Pangenome annotations within the GFF are recorded in the attribute column of the file. - -For CDS features, pangenome annotations are recorded in the attribute column of the file: - -CDS features have the following attributes: - -- **family:** ID of the gene family to which the gene belongs. -- **partition:** The partition of the gene family, categorized as persistent, shell, or cloud. -- **module:** If the gene family belongs to a module, the module ID is specified with the key 'module.' -- **rgp:** If the gene is part of a Region of Genomic Plasticity (RGP), the RGP name is specified with the key 'rgp.' - -For Regions of Genomic Plasticity (RGPs), RGPs are specified under the feature type 'region.' - -RGPs have the following attributes: - -- The attribute 'spot' designates the spot ID where the RGP is inserted. When the RGP has no spot, the term 'No_spot' is used. -- The 'Note' attribute specifies that this feature is an RGP. 
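Since these annotations sit in standard GFF attribute fields, they can be pulled out with ordinary command-line tools. The sketch below simply lists the RGP features of one generated file; the path `output/gff/my_genome.gff` is a placeholder, and the source and feature type used in the filter can be checked against the example lines shown just after.

```bash
# Keep only the RGP features written by PPanGGOLiN (source "ppanggolin", type "region")
# and print their contig, coordinates and attributes
awk -F'\t' '$2 == "ppanggolin" && $3 == "region"' output/gff/my_genome.gff | cut -f 1,4,5,9
```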
- - -Here is an example showcasing the initial lines of the GFF file for the Acinetobacter baumannii AYE genome: - -```gff -##gff-version 3 -##sequence-region NC_010401.1 1 5644 -##sequence-region NC_010402.1 1 9661 -##sequence-region NC_010403.1 1 2726 -##sequence-region NC_010404.1 1 94413 -##sequence-region NC_010410.1 1 3936291 -NC_010401.1 . region 1 5644 . + . ID=NC_010401.1;Is_circular=true -NC_010401.1 ppanggolin region 629 5591 . . . Name=NC_010401.1_RGP_0;spot=No_spot;Note=Region of Genomic Plasticity (RGP) -NC_010401.1 external gene 629 1579 . + . ID=gene-ABAYE_RS00005 -NC_010401.1 external CDS 629 1579 . + 0 ID=ABAYE_RS00005;Parent=gene-ABAYE_RS00005;product=replication initiation protein;family=ABAYE_RS00005;partition=cloud;rgp=NC_010401.1_RGP_0 -NC_010401.1 external gene 1576 1863 . + . ID=gene-ABAYE_RS00010 -NC_010401.1 external CDS 1576 1863 . + 0 ID=ABAYE_RS00010;Parent=gene-ABAYE_RS00010;product=hypothetical protein;family=ABAYE_RS00010;partition=cloud;rgp=NC_010401.1_RGP_0 -NC_010401.1 external gene 2054 2572 . - . ID=gene-ABAYE_RS00015 -NC_010401.1 external CDS 2054 2572 . - 0 ID=ABAYE_RS00015;Parent=gene-ABAYE_RS00015;product=tetratricopeptide repeat protein;family=HTZ92_RS18670;partition=shell;rgp=NC_010401.1_RGP_0 -``` \ No newline at end of file diff --git a/docs/user/Flat/metrics.md b/docs/user/Flat/metrics.md deleted file mode 100644 index 0c8c8047..00000000 --- a/docs/user/Flat/metrics.md +++ /dev/null @@ -1,35 +0,0 @@ -After computing a pangenome, it's interesting to get some metrics about it. -The `metrics` subcommand allow running and compute some analysis and metrics. - -All the metrics computed here will be saved in your pangenome file and -will be easily readable with the `info` subcommand - -### Genomic fluidity -The genomic fluidity is described as *a robust metric to categorize the -gene-level similarity among groups of sequenced isolates.* -[more information here](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-12-32) - -We add the possibility to get genomic fluidity for all the pangenome or -for specific partition. The genomic fluidity is computable like that : -``` -ppanggolin metrics -p pangenome --genome_fluidity -... -Genomes fluidity : all=0.026, shell=0.477, cloud=0.045, accessory=0.554 -``` -*all* correspond to all the family in the pangenome (core and accessory) - -### Module information -It could be necessary to get more information about the modules. -Here we provide information about families, and we separate modules in -function of the partition. You can get this supplementary information -as such : -```bash -ppanggolin metrics -p pangenome.h5 --info_modules -``` - -``` -Modules : 3 -Families in Modules : 22 (min : 5, max : 9, sd : 2.08, mean : 7.33) - Sheel specific : 36.36 (sd : 4.62, mean : 2.67) - Cloud specific : 63.64 (sd : 4.51, mean : 4.67) -``` diff --git a/docs/user/Flat/partition.md b/docs/user/Flat/partition.md deleted file mode 100644 index 3011a63b..00000000 --- a/docs/user/Flat/partition.md +++ /dev/null @@ -1,5 +0,0 @@ -Those files will be stored in the 'partitions' directory and will be named after the partition that they represent (like persistent.txt for the persistent partition). In each of those file there will be a list of gene family identifiers that correspond to the gene families belonging to that partition, one family per line, should you need it for your pipelines or during your analysis. 
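As an illustration of such downstream use, here is a hedged sketch that counts the families of each partition and reuses the persistent list to filter a table of your own. Only `persistent.txt` and the 'partitions' directory come from the description above (the files themselves are produced by the command just below); `my_table.tsv` is a hypothetical file.

```bash
# Number of gene families per partition (one identifier per line in each file)
wc -l partitions/*.txt

# Keep only the rows of a hypothetical family-level table whose first column
# is a persistent gene family
grep -wFf partitions/persistent.txt my_table.tsv > my_table.persistent.tsv
```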
- -You can generate those files as such : - -` ppanggolin write_pangenome -p pangenome.h5 --partitions` \ No newline at end of file diff --git a/docs/user/Flat/presAbs.md b/docs/user/Flat/presAbs.md deleted file mode 100644 index 0fa40800..00000000 --- a/docs/user/Flat/presAbs.md +++ /dev/null @@ -1,16 +0,0 @@ -### gene presence absence - -This file is basically a presence absence matrix. The columns are the genomes used to build the pangenome, the lines are the gene families. The identifier of the gene family is the gene identifier chosen as a representative. - There is a 1 if the gene family is present in a genome, and 0 otherwise. It follows the exact same format than the 'gene_presence_absence.Rtab' file that you get from the pangenome analysis software [roary](https://sanger-pathogens.github.io/Roary/) - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write_pangenome -p pangenome.h5 --Rtab` - -### matrix - -This file is a .csv file following a format alike the gene_presence_absence.csv file generated by [roary](https://sanger-pathogens.github.io/Roary/), and works with [scoary](https://github.com/AdmiralenOla/Scoary) if you want to do pangenome-wide association studies. - -It can be generated using the 'write' subcommand as such : - -`ppanggolin write_pangenome -p pangenome.h5 --csv` diff --git a/docs/user/Flat/proksee.md b/docs/user/Flat/proksee.md deleted file mode 100644 index f839bc4e..00000000 --- a/docs/user/Flat/proksee.md +++ /dev/null @@ -1,31 +0,0 @@ -The `--proksee` argument generates JSON map files containing pangenome annotations, which can be visualized using Proksee at [https://proksee.ca/](https://proksee.ca/). - -To generate JSON map files, you can use the following command: - -```bash -ppanggolin write_genomes -p pangenome.h5 --proksee -o output -``` - -This command will create a proksee directory within the output directory, with one JSON file per genome. - - -To load a JSON map file on Proksee, follow these steps: -1. Navigate to the "Map JSON" tab. -2. Upload your file using the browse button. -3. Click the "Create Map" button to generate the visualization. - -A genome visualized by Proksee with PPanGGOLiN annotation appears as depicted below: - - -```{image} ../_static/proksee_exemple_A_baumannii_AYE.png -:align: center -``` - -*Image: Genome visualized by Proksee with PPanGGOLiN annotation.* - - -The visualization consists of three tracks: -- **Genes:** Color-coded by their gene family partition. -- **RGP (Region of Genomic Plasticity):** Spot associated to the RGPs are specified in the annotation of the object. -- **Module:** Displaying modules within the genome. The completion of the module is specified in the annotation of the object. - diff --git a/docs/user/Flat/tables.md b/docs/user/Flat/tables.md deleted file mode 100644 index 83a2e90a..00000000 --- a/docs/user/Flat/tables.md +++ /dev/null @@ -1,35 +0,0 @@ - - -The `--table` option generates a TSV file for each genome, providing pangenome annotations for the genes. These files are stored within a directory named 'tables'. 
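Once written, each per-genome table is a plain TSV that can be previewed directly from the shell; in the sketch below, the output directory and the genome name `GENOME1` are placeholders to adapt to your own run.

```bash
# Pretty-print the first lines of one genome's annotation table
head -n 5 write_genomes_output/tables/GENOME1.tsv | column -t -s $'\t'
```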
- - -The table below outlines the columns found in these generated files: - -| Column | Description | -|----------------------|----------------------------------------------------------------------------| -| gene | Unique identifier for the gene | -| contig | Contig on which the gene is located | -| start | Start position of the gene | -| stop | Stop position of the gene | -| strand | Strand on which the gene is on | -| family | Id of the gene's associated family within the pangenome | -| nb_copy_in_org | Number of family copies present in the organism; 1 indicates no close paralogs | -| partition | Partition to which the gene family belongs in the pangenome | -| persistent_neighbors | Number of neighbors classified as 'persistent' in the pangenome graph | -| shell_neighbors | Number of neighbors classified as 'shell' in the pangenome graph | -| cloud_neighbors | Number of neighbors classified as 'cloud' in the pangenome graph | -| RGP | Name of the Region of Genomic Plasticity (RGP) if the gene is within an RGP (present only if RGPs have been predicted) | -| spot | Spot ID in which the RGP is inserted (present only if RGPs and spot have been predicted) | -| module | Module ID of the gene family (present if modules have been predicted) | - - -```{note} -Columns such as RGP, spot, and module are included only when these elements have been predicted in the pangenome. -``` - -Those files can be generated as such : - -``` -ppanggolin write_genomes -p pangenome.h5 --table -o write_genomes_output -``` - diff --git a/docs/user/Outputs.md b/docs/user/Outputs.md deleted file mode 100644 index 63c5dacb..00000000 --- a/docs/user/Outputs.md +++ /dev/null @@ -1,110 +0,0 @@ -(output)= -# PPanGGOLiN outputs - -PPanGGOLiN provides multiple outputs to describe a pangenome. In this section the different outputs will be described. - -In most cases it will provide with a HDF-5 file named "pangenome.h5". This file stores all the information about your pangenome and the analysis that were run. If given to ppanggolin through most of the subcommands, it will read information from it. This is practical as you can regenerate figures or output files, or rerun parts of the analysis without redoing everything. - -In this section, each part will describe a possible output of PPanGGOLiN, and will be commented with the command line that generates it using the HDF5 file, which is assumed to be called 'pangenome.h5'. - -When using the same subcommand (like 'write_pangenome' or 'draw' that can help you generate multiple file each), you can provide multiple options to write all of the file formats that you desire at once. - -## PPanGGOLiN figures outputs - -### U-shaped plot -```{include} Figures/Uplot.md -``` - -### tile plot -```{include} Figures/tilePlot.md -``` - -### Spot plots -```{include} Figures/spots.md -``` - -### Rarefaction -```{include} Figures/rarefaction.md -``` - -## Write flat outputs describing the pangenome - -Writes 'flat' files that describe the pangenome and its elements with the command `write_pangenome`. - -### Organisms statistics -```{include} Flat/orgStat.md -``` - -### pangenomeGraph files -The pangenome's graph can be given through multiple data formats, in order to manipulate it with other softwares. 
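For example, the graph formats described in the following subsections can be requested in a single call; this is only a sketch, and the exact flag names should be checked against `ppanggolin write_pangenome --help`.

```bash
# Write the pangenome graph as gexf, light gexf and json in one go
ppanggolin write_pangenome -p pangenome.h5 --gexf --light_gexf --json -o graph_outputs
```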
- -#### gexf and light gexf -```{include} graphOut/GEXF.md -``` - -#### json -```{include} graphOut/JSON.md -``` - -```{include} Flat/presAbs.md -``` - -### mean persistent duplication -```{include} Flat/dupplication.md -``` - -### Gene families and genes -```{include} Flat/fam2gen.md -``` - -### Genomic Island -```{include} Flat/RGP.md -``` - -### Modules -```{include} Flat/module.md -``` - -### Partitions -```{include} Flat/partition.md -``` - -## Write genome files with pangenome annotations -The `write_genomes` command creates 'flat' files representing genomes with their pangenome annotations. - -To generate output exclusively for particular genomes, users can utilize the `--organisms` argument. This argument accepts a list of organism names, either directly entered in the command line (comma-separated) or referenced from a file where each line contains a single organism name. - - -### Genes table with pangenome annotations -```{include} Flat/tables.md -``` -### GFF file -```{include} Flat/gff.md -``` -### JSON Map for Proksee visualisation -```{include} Flat/proksee.md -``` -### Adding Fasta Sequences into GFF and proksee JSON map Files - -```{include} Flat/genomes_fasta.md -``` - -### Incorporating Metadata into Tables, GFF, and Proksee Files -```{include} Flat/genomes_metadata.md -``` - -## Fasta -```{include} sequence/fasta.md -``` - -## MSA -```{include} sequence/MSA.md -``` - -## Info -```{include} Flat/info.md -``` - -## Metrics -```{include} Flat/metrics.md -``` \ No newline at end of file From 6c3d95b37dfc329df6859333d578f44327aea8f8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 17:10:47 +0100 Subject: [PATCH 07/51] add contribution workflow --- docs/dev/contribute.md | 85 ++++++++++++++++++++++++++++++++++++++++++ docs/dev/git.md | 6 --- 2 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 docs/dev/contribute.md delete mode 100644 docs/dev/git.md diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md new file mode 100644 index 00000000..c11990d0 --- /dev/null +++ b/docs/dev/contribute.md @@ -0,0 +1,85 @@ +# How to Contribute to the PPanGGOLiN Project + +We warmly welcome contributions from the community! Whether you're interested in suggesting new features, fixing typos in the documentation, or making minor changes, your input is highly appreciated. + +## Starting with an Issue + +If you have ideas for new features or improvements, initiating a discussion in an issue is a great way to collaborate with the development team. This allows us to evaluate and discuss your suggestions together. + +For minor changes like fixing typos or making small edits, feel free to create a new Pull Request (PR) directly with your proposed changes. + +## Setting Up the Development Environment + +1. **Fork the Repository:** Start by forking the repository to your GitHub account. + +2. **Clone the Forked Repository:** Clone your forked repository to your local machine. + +2. **Get a environnement:** Create an environnement with all PPanGGOLiN prerequis installed. For that you can follow instalation instruction [here](../user/install.md#installing-from-source-code-github). + +3. **Branch from 'dev':** Begin your changes from the 'dev' branch, where we incorporate alterations for the upcoming release. + +4. **Install in Editable Mode:** To enable seamless code editing and testing of new functionality, install PPanGGOLiN in editable mode using the following command: + + ```bash + pip install -e . 
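    # optional sanity check, assuming the editable install succeeded and the
    # command-line entry point is on your PATH; the reported version should
    # match the source tree you just cloned
    ppanggolin --version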
+ ``` + + This allows you to modify the code and experiment with new features directly. + + ```{note} + Note: Currently, we are not utilizing any auto formatters (like autopep8 or black). Kindly refrain from using them, as it could introduce extensive changes across the project, making code review challenging for us. + ``` + +## Making Your Changes + +We encourage consistency in code formatting; when adding new code, try to follow the existing code structure as closely as possible. Functions should include descriptive docstrings explaining their purpose and detailing the parameters. Ensure that argument types are specified in the function definitions. + +## Update Documentation + +It's essential to update the documentation to reflect your changes. Provide clear descriptions and, if necessary, examples of commands and their respective outputs. + +## Tests + +### Continuous Integration (CI) Workflow + +We've set up a CI workflow in the Actions tab, which executes a series of PPanGGOLiN commands to validate their functionality. If you've introduced a new feature, consider adding a command line to the CI YAML file to test it and ensure its seamless integration. + +### Unit Tests + +While not mandatory for all PPanGGOLiN code, incorporating tests for your additions can be advantageous. The test suite is located in the 'tests' directory at the root of the project. Execute pytest at the project root: + +## Update documentation + +Update the documentation with your change. Describe as clearly as possible and provide example command and output if necessary. + + +## Tests + +### CI workflow + +We have a CI workflow in the actions tab that runs a lists of ppanggolin commands to ensure they still work. If you added a new feature you can add a command line in the CI yaml file to test it. + + +### Unit Tests + +While we don't require unit tests for all PPanGGOLiN code, it's beneficial to include some tests for any code additions you make. +Tests are stored in the tests repository at the root of the project. + + + +## Creating a Pull Request + +Once you've made your changes: + +1. **Create a Pull Request:** Submit a pull request from your forked repository to the 'dev' branch on GitHub. + +2. **Describe Your Changes:** Clearly describe the modifications you've made and link any associated issue(s) in the PR description. + +3. **Collaborative Review:** Our team will review your changes, offer feedback, and engage in discussions until we collectively agree on the implementation. + +We greatly appreciate your contributions and look forward to collaborating with you! + + + + + diff --git a/docs/dev/git.md b/docs/dev/git.md deleted file mode 100644 index 5d4c5d77..00000000 --- a/docs/dev/git.md +++ /dev/null @@ -1,6 +0,0 @@ -# How to contribute to the project -```{warning} -This part of the documentation is in progress. 
-``` -## How to use git to version the code -## How to use our GitHub repository From 8820dc99ae636f57887e69636ab9b9f3fb7c8376 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 17:35:37 +0100 Subject: [PATCH 08/51] add some emojis to the contribution pipeline :tada: --- docs/dev/contribute.md | 54 ++++++++++++------------------------------ docs/index.md | 6 +---- 2 files changed, 16 insertions(+), 44 deletions(-) diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md index c11990d0..510e31f4 100644 --- a/docs/dev/contribute.md +++ b/docs/dev/contribute.md @@ -1,30 +1,30 @@ -# How to Contribute to the PPanGGOLiN Project +# How to Contribute ✨ -We warmly welcome contributions from the community! Whether you're interested in suggesting new features, fixing typos in the documentation, or making minor changes, your input is highly appreciated. +We warmly welcome contributions from the community! Whether you're interested in suggesting new features, fixing typos in the documentation, or making minor changes, your input is highly appreciated. 🌟 ## Starting with an Issue -If you have ideas for new features or improvements, initiating a discussion in an issue is a great way to collaborate with the development team. This allows us to evaluate and discuss your suggestions together. +If you have ideas for new features or improvements, initiating a discussion in an issue is a great way to collaborate with the development team. This allows us to evaluate and discuss your suggestions together. 💡 -For minor changes like fixing typos or making small edits, feel free to create a new Pull Request (PR) directly with your proposed changes. +For minor changes like fixing typos or making small edits, feel free to create a new Pull Request (PR) directly with your proposed changes. ## Setting Up the Development Environment -1. **Fork the Repository:** Start by forking the repository to your GitHub account. +1. **Fork the Repository:** Start by forking the repository to your GitHub account. 🍴 2. **Clone the Forked Repository:** Clone your forked repository to your local machine. -2. **Get a environnement:** Create an environnement with all PPanGGOLiN prerequis installed. For that you can follow instalation instruction [here](../user/install.md#installing-from-source-code-github). +3. **Get an Environment:** Create an environment with all PPanGGOLiN prerequisites installed. For that, you can follow installation instructions [here](../user/install.md#installing-from-source-code-github). -3. **Branch from 'dev':** Begin your changes from the 'dev' branch, where we incorporate alterations for the upcoming release. +4. **Branch from 'dev':** Begin your changes from the 'dev' branch, where we incorporate alterations for the upcoming release. -4. **Install in Editable Mode:** To enable seamless code editing and testing of new functionality, install PPanGGOLiN in editable mode using the following command: +5. **Install in Editable Mode:** To enable seamless code editing and testing of new functionality, install PPanGGOLiN in editable mode using the following command: ```bash pip install -e . ``` - This allows you to modify the code and experiment with new features directly. + This allows you to modify the code and experiment with new features directly. ```{note} Note: Currently, we are not utilizing any auto formatters (like autopep8 or black). Kindly refrain from using them, as it could introduce extensive changes across the project, making code review challenging for us. 
@@ -32,7 +32,7 @@ For minor changes like fixing typos or making small edits, feel free to create a ## Making Your Changes -We encourage consistency in code formatting; when adding new code, try to follow the existing code structure as closely as possible. Functions should include descriptive docstrings explaining their purpose and detailing the parameters. Ensure that argument types are specified in the function definitions. +We encourage consistency in code formatting; when adding new code, try to follow the existing code structure as closely as possible. Functions should include descriptive docstrings explaining their purpose and detailing the parameters. Ensure that argument types are specified in the function definitions. ## Update Documentation @@ -46,40 +46,16 @@ We've set up a CI workflow in the Actions tab, which executes a series of PPanGG ### Unit Tests -While not mandatory for all PPanGGOLiN code, incorporating tests for your additions can be advantageous. The test suite is located in the 'tests' directory at the root of the project. Execute pytest at the project root: - -## Update documentation - -Update the documentation with your change. Describe as clearly as possible and provide example command and output if necessary. - - -## Tests - -### CI workflow - -We have a CI workflow in the actions tab that runs a lists of ppanggolin commands to ensure they still work. If you added a new feature you can add a command line in the CI yaml file to test it. - - -### Unit Tests - -While we don't require unit tests for all PPanGGOLiN code, it's beneficial to include some tests for any code additions you make. -Tests are stored in the tests repository at the root of the project. - - +While not mandatory for all PPanGGOLiN code, incorporating tests for your additions can be advantageous. The test suite is located in the 'tests' directory at the root of the project. Execute pytest at the project root: ## Creating a Pull Request Once you've made your changes: -1. **Create a Pull Request:** Submit a pull request from your forked repository to the 'dev' branch on GitHub. - -2. **Describe Your Changes:** Clearly describe the modifications you've made and link any associated issue(s) in the PR description. - -3. **Collaborative Review:** Our team will review your changes, offer feedback, and engage in discussions until we collectively agree on the implementation. - -We greatly appreciate your contributions and look forward to collaborating with you! - - +1. **Create a Pull Request:** Submit a pull request from your forked repository to the 'dev' branch on GitHub. 🚀 +2. **Describe Your Changes:** Clearly describe the modifications you've made and link any associated issue(s) in the PR description. 📝 +3. **Collaborative Review:** Our team will review your changes, offer feedback, and engage in discussions until we collectively agree on the implementation. 🤝 +We greatly appreciate your contributions and look forward to collaborating with you! 
🙌 diff --git a/docs/index.md b/docs/index.md index 84d563f7..163cc7a4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,7 +19,6 @@ ```{image} _static/logo.png :alt: ppangolin logo :align: center -:height: 350 :width: 437 ``` @@ -85,10 +84,7 @@ user/metadata :caption: 'Developper Guide:' :maxdepth: 1 -dev/devRules -dev/git -dev/unitTest -dev/workflows +dev/contribute dev/buildDoc ``` From 863ca0c8795c43f51e19ed9ceb0e1c0fdb9fce80 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 17:38:03 +0100 Subject: [PATCH 09/51] clean dev doc --- docs/dev/devRules.md | 4 ---- docs/dev/unitTest.md | 4 ---- docs/dev/workflows.md | 4 ---- 3 files changed, 12 deletions(-) delete mode 100644 docs/dev/devRules.md delete mode 100644 docs/dev/unitTest.md delete mode 100644 docs/dev/workflows.md diff --git a/docs/dev/devRules.md b/docs/dev/devRules.md deleted file mode 100644 index 20de4f53..00000000 --- a/docs/dev/devRules.md +++ /dev/null @@ -1,4 +0,0 @@ -# Development rules -```{warning} -This part of the documentation is in progress. -``` \ No newline at end of file diff --git a/docs/dev/unitTest.md b/docs/dev/unitTest.md deleted file mode 100644 index 3a59f526..00000000 --- a/docs/dev/unitTest.md +++ /dev/null @@ -1,4 +0,0 @@ -# Unitary test -```{warning} -This part of the documentation is in progress. -``` \ No newline at end of file diff --git a/docs/dev/workflows.md b/docs/dev/workflows.md deleted file mode 100644 index b4ea9df3..00000000 --- a/docs/dev/workflows.md +++ /dev/null @@ -1,4 +0,0 @@ -# GitHub workflows -```{warning} -This part of the documentation is in progress. -``` \ No newline at end of file From 72d78425e80ed8f58621b5f57f3ff6f0657d3a16 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 14 Dec 2023 17:47:45 +0100 Subject: [PATCH 10/51] fix typo --- docs/dev/contribute.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md index 510e31f4..055ec017 100644 --- a/docs/dev/contribute.md +++ b/docs/dev/contribute.md @@ -46,7 +46,7 @@ We've set up a CI workflow in the Actions tab, which executes a series of PPanGG ### Unit Tests -While not mandatory for all PPanGGOLiN code, incorporating tests for your additions can be advantageous. The test suite is located in the 'tests' directory at the root of the project. Execute pytest at the project root: +While not mandatory for all PPanGGOLiN code, incorporating unit tests for your additions can be advantageous. The test suite is located in the 'tests' directory at the root of the project. ## Creating a Pull Request From 2c5a63e4de451bef7f7c5e29e639377b84fe49b3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 09:49:13 +0100 Subject: [PATCH 11/51] update contribute --- docs/dev/contribute.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md index 055ec017..6bf509a0 100644 --- a/docs/dev/contribute.md +++ b/docs/dev/contribute.md @@ -16,7 +16,7 @@ For minor changes like fixing typos or making small edits, feel free to create a 3. **Get an Environment:** Create an environment with all PPanGGOLiN prerequisites installed. For that, you can follow installation instructions [here](../user/install.md#installing-from-source-code-github). -4. **Branch from 'dev':** Begin your changes from the 'dev' branch, where we incorporate alterations for the upcoming release. +4. **Branch from 'dev':** Begin your changes from the 'dev' branch, where we incorporate changes for the upcoming release. 5. 
**Install in Editable Mode:** To enable seamless code editing and testing of new functionality, install PPanGGOLiN in editable mode using the following command: From b5c673752b7f900da6ab1e13b5e279e23b02b6c4 Mon Sep 17 00:00:00 2001 From: acalteau Date: Fri, 15 Dec 2023 11:04:18 +0100 Subject: [PATCH 12/51] Update pangenomeMetric.md minor revision --- docs/user/PangenomeAnalyses/pangenomeMetric.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user/PangenomeAnalyses/pangenomeMetric.md b/docs/user/PangenomeAnalyses/pangenomeMetric.md index 17efbfa1..7c99607a 100644 --- a/docs/user/PangenomeAnalyses/pangenomeMetric.md +++ b/docs/user/PangenomeAnalyses/pangenomeMetric.md @@ -13,8 +13,8 @@ The genomic fluidity is described as *a robust metric to categorize the gene-level similarity among groups of sequenced isolates.* [more information here](https://bmcgenomics.biomedcentral.com/articles/10.1186/1471-2164-12-32) -We add the possibility to get genomic fluidity for all the pangenome or -for specific partition. The genomic fluidity is computable like that : +We have added the possibility to get genomic fluidity for the whole pangenome or +for a specific partition. Genomic fluidity is computable as follows: ```bash ppanggolin metrics -p pangenome --genome_fluidity @@ -32,6 +32,6 @@ Genomes_fluidity: ```{note} -At the moment, only the fluidity is computed by the `metrics` command. But we migth add other metrics in the future. If you have some idea of metric describing the pangenome, please open an issue ! +Currently, the `metrics` command only computes fluidity. However, additional metrics may be added in the future. If you have any ideas for metrics that describe the pangenome, please open an issue! ``` From 2a12605a22c3c85e91061bd5d266a02c6e126df8 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 16:14:32 +0100 Subject: [PATCH 13/51] Add mermaid diagram of the workflow commands --- docs/conf.py | 6 +++ docs/user/Modules/modulePrediction.md | 44 +++++++++++++++ .../PangenomeAnalyses/pangenomeWorkflow.md | 47 +++++++++++++++- docs/user/QuickUsage/quickWorkflow.md | 53 +++++++++++++++++++ docs/user/RGP/rgpPrediction.md | 48 +++++++++++++++++ pyproject.toml | 4 +- 6 files changed, 199 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index fc9816d3..7ec252d2 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -34,8 +34,14 @@ "sphinx.ext.autosectionlabel", "sphinx.ext.autodoc", 'sphinx_search.extension', + 'sphinxcontrib.mermaid' ] + +source_suffix = { + '.md': 'markdown' +} + # Prefix document path to section labels, to use: # `path/to/file:heading` instead of just `heading` autosectionlabel_prefix_document = True diff --git a/docs/user/Modules/modulePrediction.md b/docs/user/Modules/modulePrediction.md index 1bc22dea..ad6c6c6e 100644 --- a/docs/user/Modules/modulePrediction.md +++ b/docs/user/Modules/modulePrediction.md @@ -2,6 +2,50 @@ PPanGGOLiN can predict and work with conserved modules, which are groups of genes that are part of the variable genome, and often found together across the genomes of the pangenome. These conserved modules may also be potential functional modules. 
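If a partitioned pangenome file is already at hand, module prediction can also be launched on its own rather than through the full workflow; a minimal sketch, assuming the default parameters are acceptable:

```bash
# Predict conserved modules on an existing, already partitioned pangenome
ppanggolin module -p pangenome.h5
```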
+ + +```{mermaid} + +--- +title: "Workflow Overview: Steps launched by the panmodule command" +align: center +--- + +%%{init: {'theme':'default'}}%% + + +graph LR + + i[input genomes] --> a + + m:::panmodule + + subgraph Pangenome creation + a:::workflow + c:::workflow + g:::workflow + p:::workflow + a("annotate") --> c + c(cluster) --> g(graph) + g(graph) --> p(partition) + end + + subgraph Functional module + p --> m(module) + end + + + p --> f[pangenome.f5] + m --> f[pangenome.f5] + + + classDef panrgp fill:#4066d4 + classDef panmodule fill:#d44066 + classDef workflow fill:#d4ae40 + + +``` + Further details can be found in the [panModule preprint](https://doi.org/10.1101/2021.12.06.471380) ## The panModule workflow diff --git a/docs/user/PangenomeAnalyses/pangenomeWorkflow.md b/docs/user/PangenomeAnalyses/pangenomeWorkflow.md index 14ef0c59..29b99e57 100644 --- a/docs/user/PangenomeAnalyses/pangenomeWorkflow.md +++ b/docs/user/PangenomeAnalyses/pangenomeWorkflow.md @@ -1,7 +1,48 @@ PPanGGOLiN was created with the idea to make it both easy to use for beginners, and fully customizable for experts. Ease of use has been achieved by incorporating a workflow command that allows the construction and partitioning of a pangenome using genomic data. The command has only one mandatory option, and predefined parameters adapted to pangenomes at the scale of a bacterial species. -This command launches the [annotation](./pangenomeAnalyses.md#annotation), [clustering](./pangenomeAnalyses.md#clustering), [graph](./pangenomeAnalyses.md#graph) and [partition](./pangenomeAnalyses.md#partition) commands described below. +This command launches the [annotation](./pangenomeAnalyses.md#annotation), [clustering](./pangenomeCluster.md#cluster-genes-into-gene-families), [graph](./pangenomeAnalyses.md#graph) and [partition](./pangenomeAnalyses.md#partition) commands described below. + +
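In practice, a default run boils down to a single call; in this sketch, `GENOMES.fasta.list` stands for the tab-separated file of genome names and fasta paths described further down.

```bash
# Build and partition a pangenome with default parameters
ppanggolin workflow --fasta GENOMES.fasta.list -o workflow_output
```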
+
+ +```{mermaid} + +--- +title: "Workflow Overview: Steps launched by the workflow command" +align: center +--- + +%%{init: {'theme':'default'}}%% + + +graph LR + + i[input genomes] --> a + + + subgraph Pangenome creation + a:::workflow + c:::workflow + g:::workflow + p:::workflow + a("annotate") --> c + c(cluster) --> g(graph) + g(graph) --> p(partition) + end + + p --> f[pangenome.f5] + + + classDef panrgp fill:#4066d4 + classDef panmodule fill:#d44066 + classDef workflow fill:#d4ae40 + + +``` + +
+
To use this command, you need to provide a tab-separated list of either annotation files (gff3 or gbff) or fasta files. The expected format is detailed [in the annotation section](./pangenomeAnalyses.md#annotation) @@ -30,3 +71,7 @@ If you are unfamiliar with the output available in PPanGGOLiN, we recommend that In the workflow CLI, it is not possible to tune all the options available in all the steps. For a fully optimized analysis, you can either launch the subcommands one by one as described below, or you can use the configuration file as described [here](../practicalInformation.md#configuration-file) ``` + + + + diff --git a/docs/user/QuickUsage/quickWorkflow.md b/docs/user/QuickUsage/quickWorkflow.md index a7daf7a0..44c8a226 100644 --- a/docs/user/QuickUsage/quickWorkflow.md +++ b/docs/user/QuickUsage/quickWorkflow.md @@ -6,6 +6,59 @@ In the end, you will have a partitioned pangenome graph with predicted **RGP, sp [//]: # (### PPanGGOLiN: Pangenome analyses from list of annotated files) + + +```{mermaid} + +--- +title: "Workflow Overview: Steps launched by the all command" +align: center +--- + +%%{init: {'theme':'default'}}%% + + +graph LR + + i[input genomes] --> a + + + r:::panrgp + s:::panrgp + m:::panmodule + + subgraph Pangenome creation + a:::workflow + c:::workflow + g:::workflow + p:::workflow + a("annotate") --> c + c(cluster) --> g(graph) + g(graph) --> p(partition) + end + + subgraph Functional module + p --> m(module) + end + + subgraph Region of Genomic Plasticity + + p --> r(rgp) + r --> s(spot) + end + + p --> f[pangenome.f5] + s --> f[pangenome.f5] + m --> f[pangenome.f5] + + + classDef panrgp fill:#4066d4 + classDef panmodule fill:#d44066 + classDef workflow fill:#d4ae40 + + +``` + The minimal subcommand only need your own annotations files (using `.gff` or `.gbff`/`.gbk` files) as long as they include the genomic dna sequences, such as the ones provided by Prokka or Bakta. diff --git a/docs/user/RGP/rgpPrediction.md b/docs/user/RGP/rgpPrediction.md index cfb812b0..747756c2 100644 --- a/docs/user/RGP/rgpPrediction.md +++ b/docs/user/RGP/rgpPrediction.md @@ -9,6 +9,54 @@ Those methods were supported by the [panRGP publication](https://doi.org/10.1093 This command works exactly like [workflow](../PangenomeAnalyses/pangenomeAnalyses.md#workflow). The difference is that it will run additional analyses to characterize Regions of Genome Plasticity (RGP). 
+ + +```{mermaid} + +--- +title: "Workflow Overview: Steps launched by the panrgp command" +align: center +--- + +%%{init: {'theme':'default'}}%% + + +graph LR + + i[input genomes] --> a + + + r:::panrgp + s:::panrgp + + subgraph Pangenome creation + a:::workflow + c:::workflow + g:::workflow + p:::workflow + a("annotate") --> c + c(cluster) --> g(graph) + g(graph) --> p(partition) + end + + subgraph Region of Genomic Plasticity + + p --> r(rgp) + r --> s(spot) + end + + p --> f[pangenome.f5] + s --> f[pangenome.f5] + + + classDef panrgp fill:#4066d4 + classDef panmodule fill:#d44066 + classDef workflow fill:#d4ae40 + + +``` + + You can use the `panrgp` with annotation (gff3 or gbff) files with `--anno` option, as such: ```bash ppanggolin panrgp --anno organism.gbff.list diff --git a/pyproject.toml b/pyproject.toml index 530c6052..3871fd98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ doc = [ "sphinx_rtd_theme==1.2.2", "readthedocs-sphinx-search==0.3.1", "sphinx-autobuild==2021.3.14", - "myst-parser==1.0.0", + "myst-parser==2", "docutils==0.18.1" ] test = [ @@ -67,4 +67,4 @@ packages = ["ppanggolin"] #mypkg = ["*.txt", "*.rst"] [tool.setuptools.dynamic] -version = {file = "VERSION"} \ No newline at end of file +version = {file = "VERSION"} From ba95b7345e5422c7393e0bb969fbfe4fbb6d082b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 16:20:17 +0100 Subject: [PATCH 14/51] add mermaid in sphinx requirements --- docs/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index e8024c8d..31ed17e0 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -3,4 +3,5 @@ sphinx_rtd_theme==1.2.2 readthedocs-sphinx-search==0.3.1 sphinx-autobuild==2021.3.14 myst-parser==1.0.0 -docutils==0.18.1 \ No newline at end of file +docutils==0.18.1 +sphinxcontrib.mermaid \ No newline at end of file From 442f6f00a33a007670261331d55a647845302308 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 16:28:59 +0100 Subject: [PATCH 15/51] adjust mermaid wf --- docs/user/PangenomeAnalyses/pangenomeWorkflow.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/user/PangenomeAnalyses/pangenomeWorkflow.md b/docs/user/PangenomeAnalyses/pangenomeWorkflow.md index 29b99e57..dfd55307 100644 --- a/docs/user/PangenomeAnalyses/pangenomeWorkflow.md +++ b/docs/user/PangenomeAnalyses/pangenomeWorkflow.md @@ -3,8 +3,7 @@ Ease of use has been achieved by incorporating a workflow command that allows th The command has only one mandatory option, and predefined parameters adapted to pangenomes at the scale of a bacterial species. This command launches the [annotation](./pangenomeAnalyses.md#annotation), [clustering](./pangenomeCluster.md#cluster-genes-into-gene-families), [graph](./pangenomeAnalyses.md#graph) and [partition](./pangenomeAnalyses.md#partition) commands described below. -
-
+ ```{mermaid} @@ -41,8 +40,6 @@ graph LR ``` -
-
To use this command, you need to provide a tab-separated list of either annotation files (gff3 or gbff) or fasta files. The expected format is detailed [in the annotation section](./pangenomeAnalyses.md#annotation) From 8f5d13cd80bb1212507c139411fb551a4b5b0169 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 17:06:00 +0100 Subject: [PATCH 16/51] fix mermaid version --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index 31ed17e0..87dbffaa 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,4 +4,4 @@ readthedocs-sphinx-search==0.3.1 sphinx-autobuild==2021.3.14 myst-parser==1.0.0 docutils==0.18.1 -sphinxcontrib.mermaid \ No newline at end of file +sphinxcontrib.mermaid==0.9.2 \ No newline at end of file From f0f76a4550c5e22a2e63afca42a75b8854270da5 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 17:06:13 +0100 Subject: [PATCH 17/51] Make RGP green --- docs/user/QuickUsage/quickWorkflow.md | 2 +- docs/user/RGP/rgpPrediction.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/user/QuickUsage/quickWorkflow.md b/docs/user/QuickUsage/quickWorkflow.md index 44c8a226..e35a5153 100644 --- a/docs/user/QuickUsage/quickWorkflow.md +++ b/docs/user/QuickUsage/quickWorkflow.md @@ -52,7 +52,7 @@ graph LR m --> f[pangenome.f5] - classDef panrgp fill:#4066d4 + classDef panrgp fill:#84d191 classDef panmodule fill:#d44066 classDef workflow fill:#d4ae40 diff --git a/docs/user/RGP/rgpPrediction.md b/docs/user/RGP/rgpPrediction.md index 747756c2..d42e49d9 100644 --- a/docs/user/RGP/rgpPrediction.md +++ b/docs/user/RGP/rgpPrediction.md @@ -49,7 +49,7 @@ graph LR s --> f[pangenome.f5] - classDef panrgp fill:#4066d4 + classDef panrgp fill:#84d191 classDef panmodule fill:#d44066 classDef workflow fill:#d4ae40 From 1a69ce6a9760b0699251558acf6970316e8a4dbc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Fri, 15 Dec 2023 17:31:19 +0100 Subject: [PATCH 18/51] edit metadata doc --- docs/user/metadata.md | 64 +++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/docs/user/metadata.md b/docs/user/metadata.md index 9c0033b7..bbf9f90c 100644 --- a/docs/user/metadata.md +++ b/docs/user/metadata.md @@ -1,46 +1,50 @@ -# Add metadata to pangenome elements +# Adding Metadata to Pangenome Elements -It is possible to add metadata link to pangenome elements using PPanGGOLiN. -Metadata can be associated with: genes, genomes, families, RGPs, spots and modules from a simple TSV file. -To add metadata in your pangenome you can launch the command is as follows: +The `metadata` command allows the addition of metadata linked to various pangenome elements. Metadata can be associated with genes, genomes, families, RGPs, spots, and modules using a simple TSV file. -`ppanggolin metadata -p PANGENOME --metadata METADATA.TSV --source SOURCE --assign ASSIGN` +To add metadata to your pangenome, execute the command as shown below: -- `--source` arguments corresponds to the origin of the metadata and will be used as the storage key in the pangenome. -- `--assign` Choose to which pangenome elements who want to add metadata in the following list {families,genomes,genes,RGPs,spots,modules} +```bash +ppanggolin metadata -p PANGENOME --metadata METADATA.TSV --source SOURCE --assign ASSIGN +``` -## Metadata format +- The `--source` argument corresponds to the metadata's origin and will serve as the storage key in the pangenome. 
+- `--assign` allows you to specify the pangenome elements to which you want to add metadata from the following list: {families, genomes, genes, RGPs, spots, modules}. -PPanGGOLiN allows to use a highly flexible metadata file. Only one column name is mandatory, and it is identical to the -assignment argument chosen by the user. -For example the TSV file to assign metadata to gene families to functional annotation could be as follows: +The associated metadata can then be exported in various output files of PPanGGOLiN such as GFF, PROKSEE JSON Map and Table output for genomes (see [here](./writeGenomes.md#incorporating-metadata-into-tables-gff-and-proksee-files) for more details) and in the gexf graph file of the pangenome as well as in the graph resulting in the RGP clustering. -| families | Accesion | Function | Description | -|----------|----------|----------|-------------| -| GF_1 | Acc_1 | Fn_1 | Desc_1 | -| GF_2 | Acc_2 | Fn_2 | Desc_2 | -| GF_2 | Acc_3 | Fn_3 | Desc_3 | -| ... | ... | ... | ... | -| GF_n | Acc_n | Fn_n | Desc_n | -*Note: As you can see in the above table, one element (here GF_2) can be associated with more than one metadata.* +The metadata linked to pangenome elements can be exported to various output file formats within PPanGGOLiN, including GFF, PROKSEE JSON Map, and Table outputs of the `write_genomes` command (see [here](./writeGenomes.md#incorporating-metadata-into-tables-gff-and-proksee-files) for more details). Additionally, the metadata can also be included in the gexf graph file representing the pangenome and in the RGP clustering graph. -### Command specifiq option details +## Metadata Format + +PPanGGOLiN offers a highly flexible metadata file format. Only one column name is mandatory, and it aligns with the assignment argument chosen by the user (ie. families, RGPS...). + +For instance, the TSV file used to assign metadata to gene families for functional annotation might resemble the following: + +| families | Accession | Function | Description | +|----------|-----------|----------|-------------| +| GF_1 | Acc_1 | Fn_1 | Desc_1 | +| GF_2 | Acc_2 | Fn_2 | Desc_2 | +| GF_2 | Acc_3 | Fn_3 | Desc_3 | +| ... | ... | ... | ... | +| GF_n | Acc_n | Fn_n | Desc_n | + +```{note} +As you can see in the above table, one element (here GF_2) can be associated with with multiple metadata entries. +``` + +### Command Specific Option Details #### `--metadata` PPanGGOLiN enables to give one TSV at a time to add metadata. -#### `--source` -The source is the key use to access to metadata in pangenome. -So if the name of the source already exist in the pangenome it can be overwritten only with `--force` option. -This system allow to have multiple metadata source that can be read and use in PPanGGOLiN. +#### `--source` +The source serves as the key for accessing metadata within the pangenome. If the source name already exists in the pangenome, it can be overwritten using the `--force` option. This system facilitates the utilization of multiple metadata sources, accessible and usable within PPanGGOLiN. In the context of annotation, the source typically represents the name of the annotation database used during the annotation process. -#### `--assign` -PPanGGOLiN allows to add metadata to all pangenome elements: families,genomes,genes,RGPs,spots,modules. -But the user can only give one metadata file at a time as he can provide only source and so one type of pangenome element. 
+#### `--assign` +PPanGGOLiN enables the addition of metadata to various pangenome elements, including families, genomes, genes, RGPs, spots, and modules. However, the user can provide only one metadata file at a time, thereby specifying a single source and one type of pangenome element. #### `--omit` -You can use this option to skip the error provide by an unfind ID in the pangenome. -This could be useful if you are using a general TSV with element not in the pangenome, but must be used with carefully. - +This option allows you to bypass errors resulting from an unfound ID in the pangenome. It can be beneficial when utilizing a general TSV with elements not present in the pangenome. This argument should be used carefully. From 5a0a76b20e20ce53c8c4a0d256974d92bbf7b51d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 09:36:03 +0100 Subject: [PATCH 19/51] fix pangenome file exension in diagrams --- docs/user/Modules/modulePrediction.md | 4 ++-- docs/user/QuickUsage/quickWorkflow.md | 6 +++--- docs/user/RGP/rgpPrediction.md | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/user/Modules/modulePrediction.md b/docs/user/Modules/modulePrediction.md index ad6c6c6e..6625a9fb 100644 --- a/docs/user/Modules/modulePrediction.md +++ b/docs/user/Modules/modulePrediction.md @@ -35,8 +35,8 @@ graph LR end - p --> f[pangenome.f5] - m --> f[pangenome.f5] + p --> f[pangenome.h5] + m --> f classDef panrgp fill:#4066d4 diff --git a/docs/user/QuickUsage/quickWorkflow.md b/docs/user/QuickUsage/quickWorkflow.md index e35a5153..26ce7111 100644 --- a/docs/user/QuickUsage/quickWorkflow.md +++ b/docs/user/QuickUsage/quickWorkflow.md @@ -47,9 +47,9 @@ graph LR r --> s(spot) end - p --> f[pangenome.f5] - s --> f[pangenome.f5] - m --> f[pangenome.f5] + p --> f[pangenome.h5] + s --> f + m --> f classDef panrgp fill:#84d191 diff --git a/docs/user/RGP/rgpPrediction.md b/docs/user/RGP/rgpPrediction.md index d42e49d9..90c6623e 100644 --- a/docs/user/RGP/rgpPrediction.md +++ b/docs/user/RGP/rgpPrediction.md @@ -45,8 +45,8 @@ graph LR r --> s(spot) end - p --> f[pangenome.f5] - s --> f[pangenome.f5] + p --> f[pangenome.h5] + s --> f[pangenome.h5] classDef panrgp fill:#84d191 From a03828725f81d7aa333ea60a814539067108d215 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 12:57:09 +0100 Subject: [PATCH 20/51] update doc requirements --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3871fd98..67b67f1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,8 @@ doc = [ "readthedocs-sphinx-search==0.3.1", "sphinx-autobuild==2021.3.14", "myst-parser==2", - "docutils==0.18.1" + "docutils==0.18.1", + "sphinxcontrib.mermaid==0.9.2" ] test = [ "pytest>=7.0.0" From 40b760f5254d321a0afccb1ac22e76401f3660ff Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 15:14:55 +0100 Subject: [PATCH 21/51] fix pangenome file exension in diagrams --- docs/user/PangenomeAnalyses/pangenomeWorkflow.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user/PangenomeAnalyses/pangenomeWorkflow.md b/docs/user/PangenomeAnalyses/pangenomeWorkflow.md index dfd55307..552f50d2 100644 --- a/docs/user/PangenomeAnalyses/pangenomeWorkflow.md +++ b/docs/user/PangenomeAnalyses/pangenomeWorkflow.md @@ -30,7 +30,7 @@ graph LR g(graph) --> p(partition) end - p --> f[pangenome.f5] + p --> f[pangenome.h5] classDef panrgp fill:#4066d4 From 578599a64039d95870a9e7ad0042b00e3ace0743 Mon Sep 17 00:00:00 
2001 From: JeanMainguy Date: Mon, 18 Dec 2023 17:23:26 +0100 Subject: [PATCH 22/51] attempt to fix api ref in redthedoc --- .readthedocs.yaml | 6 +++++- docs/index.md | 4 +++- docs/requirements.txt | 7 ------- 3 files changed, 8 insertions(+), 9 deletions(-) delete mode 100644 docs/requirements.txt diff --git a/.readthedocs.yaml b/.readthedocs.yaml index ff7f01ab..c317c145 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,8 +5,12 @@ # Required version: 2 python: + version: 3.8 install: - - requirements: docs/requirements.txt + - method: pip + path: . + extra_requirements: + - docs # Set the OS, Python version and other tools you might need build: diff --git a/docs/index.md b/docs/index.md index 163cc7a4..de7135a5 100644 --- a/docs/index.md +++ b/docs/index.md @@ -86,9 +86,11 @@ user/metadata dev/contribute dev/buildDoc +api/modules ``` -# API Reference + +# Indices and tables [//]: # (- {ref}`ppanggolin package`) - {ref}`genindex` diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 87dbffaa..00000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -sphinx==6.2.1 -sphinx_rtd_theme==1.2.2 -readthedocs-sphinx-search==0.3.1 -sphinx-autobuild==2021.3.14 -myst-parser==1.0.0 -docutils==0.18.1 -sphinxcontrib.mermaid==0.9.2 \ No newline at end of file From 354d8197788aa9bda535540d671270361e1d29c7 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 17:25:42 +0100 Subject: [PATCH 23/51] fix python version --- .readthedocs.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index c317c145..aa426ef1 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -5,7 +5,6 @@ # Required version: 2 python: - version: 3.8 install: - method: pip path: . From efcd3e4e6d5abe139660a5a5781d296b027c5826 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 17:33:26 +0100 Subject: [PATCH 24/51] attempt to install doc dep with pip doc --- .readthedocs.yaml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index aa426ef1..6b3085f5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,9 +7,7 @@ version: 2 python: install: - method: pip - path: . 
- extra_requirements: - - docs + path: .[doc] # Set the OS, Python version and other tools you might need build: From 598d33e98fd0a8cada37ff2036ab79d15c530981 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 17:45:09 +0100 Subject: [PATCH 25/51] add requirements of all python depencies in the toml --- pyproject.toml | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 67b67f1b..da928ce3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,18 @@ requires = [ "setuptools", "setuptools-scm", - "cython" + "cython", + tqdm>=4.64, + tables>=3.7, + pyrodigal>=3.0.1, + networkx>=3.0, + scipy>=1.10.0, + plotly>=4.14.3, + gmpy2>=2.1.2, + pandas>=2.0, + colorlover>=0.3, + numpy>=1.24, + bokeh>=2.4.2,<3, ] build-backend = "setuptools.build_meta" py_modules=["ppanggolin"] From 9ec3df0d8c893c235bd1d098838801bc980b7f85 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 17:55:59 +0100 Subject: [PATCH 26/51] add all python dependencies in toml --- pyproject.toml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index da928ce3..0592ea46 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,17 +3,17 @@ requires = [ "setuptools", "setuptools-scm", "cython", - tqdm>=4.64, - tables>=3.7, - pyrodigal>=3.0.1, - networkx>=3.0, - scipy>=1.10.0, - plotly>=4.14.3, - gmpy2>=2.1.2, - pandas>=2.0, - colorlover>=0.3, - numpy>=1.24, - bokeh>=2.4.2,<3, + "tqdm>=4.64", + "tables>=3.7", + "pyrodigal>=3.0.1", + "networkx>=3.0", + "scipy>=1.10.0", + "plotly>=4.14.3", + "gmpy2>=2.1.2", + "pandas>=2.0", + "colorlover>=0.3", + "numpy>=1.24", + "bokeh>=2.4.2,<3" ] build-backend = "setuptools.build_meta" py_modules=["ppanggolin"] From efe18b5da2e203ce5b1deb69afa1c6575b8d0fc6 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 18:26:00 +0100 Subject: [PATCH 27/51] try install in readthedoc with conda --- .readthedocs.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6b3085f5..6b22a90e 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -14,11 +14,12 @@ build: os: ubuntu-22.04 tools: python: "3.8" - # You can also specify other tool versions: - # nodejs: "19" - # rust: "1.64" - # golang: "1.19" + python: "mambaforge-22.9" +conda: + environment: ppanggolin_env.yaml + + # Build documentation in the "docs/" directory with Sphinx sphinx: configuration: docs/conf.py From 6584f9e5fc2daa43a2a8cda77450e15f0054cb78 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 18:32:26 +0100 Subject: [PATCH 28/51] try without the build lines --- .readthedocs.yaml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 6b22a90e..cbb0f0c5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,17 +7,19 @@ version: 2 python: install: - method: pip - path: .[doc] + path: . 
+ extra_requirements: + - doc # Set the OS, Python version and other tools you might need -build: - os: ubuntu-22.04 - tools: - python: "3.8" - python: "mambaforge-22.9" +# build: +# os: ubuntu-22.04 +# tools: +# python: "3.8" +# python: "mambaforge-22.9" -conda: - environment: ppanggolin_env.yaml +# conda: +# environment: ppanggolin_env.yaml # Build documentation in the "docs/" directory with Sphinx From a6d15dfaab646c5edfe9551e73d0b4a5a0f69ebf Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 18:36:09 +0100 Subject: [PATCH 29/51] update .readthedocs.yaml --- .readthedocs.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index cbb0f0c5..a0eee30a 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,9 +12,9 @@ python: - doc # Set the OS, Python version and other tools you might need -# build: -# os: ubuntu-22.04 -# tools: +build: + os: ubuntu-22.04 + tools: # python: "3.8" # python: "mambaforge-22.9" From 2782c16f8e4c891a1533d49fc74f50d34edbd23d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 18:37:09 +0100 Subject: [PATCH 30/51] update .readthedocs.yaml --- .readthedocs.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a0eee30a..e6938f57 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -15,7 +15,7 @@ python: build: os: ubuntu-22.04 tools: -# python: "3.8" + python: "3.8" # python: "mambaforge-22.9" # conda: From 7c6a178f621fc44a8fbc401e9dd7d06257be15c1 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 19:13:48 +0100 Subject: [PATCH 31/51] update .readthedocs.yaml --- .readthedocs.yaml | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index e6938f57..a5e8b2ca 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,18 +4,20 @@ # Required version: 2 -python: - install: - - method: pip - path: . - extra_requirements: - - doc +# python: +# install: +# - method: pip +# path: . +# extra_requirements: +# - doc # Set the OS, Python version and other tools you might need build: os: ubuntu-22.04 tools: - python: "3.8" + python: "3.8" + commands: + - pip install .[doc] # python: "mambaforge-22.9" # conda: From cf2598669487521d5b65cbbfe17da4248c7f75e3 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 19:21:21 +0100 Subject: [PATCH 32/51] update .readthedocs.yaml --- .readthedocs.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a5e8b2ca..12fcc0ac 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -4,12 +4,12 @@ # Required version: 2 -# python: -# install: -# - method: pip -# path: . -# extra_requirements: -# - doc +python: + install: + - method: pip + path: . 
+ extra_requirements: + - doc # Set the OS, Python version and other tools you might need build: From d0cf4dfac555bc74e5fd6d4097e1ef9c39e0a154 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Mon, 18 Dec 2023 19:32:21 +0100 Subject: [PATCH 33/51] update .readthedocs.yaml --- .readthedocs.yaml | 2 -- pyproject.toml | 13 ++++++++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 12fcc0ac..a69299f1 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -16,8 +16,6 @@ build: os: ubuntu-22.04 tools: python: "3.8" - commands: - - pip install .[doc] # python: "mambaforge-22.9" # conda: diff --git a/pyproject.toml b/pyproject.toml index 0592ea46..b0f6aa15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,18 @@ doc = [ "sphinx-autobuild==2021.3.14", "myst-parser==2", "docutils==0.18.1", - "sphinxcontrib.mermaid==0.9.2" + "sphinxcontrib.mermaid==0.9.2", + "tqdm>=4.64", + "tables>=3.7", + "pyrodigal>=3.0.1", + "networkx>=3.0", + "scipy>=1.10.0", + "plotly>=4.14.3", + "gmpy2>=2.1.2", + "pandas>=2.0", + "colorlover>=0.3", + "numpy>=1.24", + "bokeh>=2.4.2,<3" ] test = [ "pytest>=7.0.0" From e167acec37774879214334389a7b5e968476cd50 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 09:24:49 +0100 Subject: [PATCH 34/51] add modules md entry point of api ref --- docs/api/modules.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 docs/api/modules.md diff --git a/docs/api/modules.md b/docs/api/modules.md new file mode 100644 index 00000000..5ba103b1 --- /dev/null +++ b/docs/api/modules.md @@ -0,0 +1,18 @@ +# ppanggolin + + + + +```{toctree} +:maxdepth: 2 + +ppanggolin +``` + + +# Indices and tables + +- {ref}`genindex` +- {ref}`modindex` +- {ref}`search` + From 42fc5e3cad6b59c7094c76bbe582fdda60463654 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 09:36:40 +0100 Subject: [PATCH 35/51] add API ref in toctree --- docs/api/indice_and_table.md | 7 +++++++ docs/api/modules.md | 14 ++------------ 2 files changed, 9 insertions(+), 12 deletions(-) create mode 100644 docs/api/indice_and_table.md diff --git a/docs/api/indice_and_table.md b/docs/api/indice_and_table.md new file mode 100644 index 00000000..d8eafa38 --- /dev/null +++ b/docs/api/indice_and_table.md @@ -0,0 +1,7 @@ +# Indices and tables + +- {ref}`genindex` +- {ref}`modindex` +- {ref}`search` + + diff --git a/docs/api/modules.md b/docs/api/modules.md index 5ba103b1..6e569604 100644 --- a/docs/api/modules.md +++ b/docs/api/modules.md @@ -1,18 +1,8 @@ -# ppanggolin - - - +# API Reference ```{toctree} :maxdepth: 2 - ppanggolin +indice_and_table ``` - -# Indices and tables - -- {ref}`genindex` -- {ref}`modindex` -- {ref}`search` - From 3227875e7b8709f8f2b4342a370c714d83eed6e2 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 10:05:12 +0100 Subject: [PATCH 36/51] add new cat in toml to improve install in readthedoc --- .readthedocs.yaml | 1 + pyproject.toml | 21 ++++++--------------- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index a69299f1..92e946c5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -10,6 +10,7 @@ python: path: . 
extra_requirements: - doc + - python_deps # Set the OS, Python version and other tools you might need build: diff --git a/pyproject.toml b/pyproject.toml index b0f6aa15..3bfbbe77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,18 +2,7 @@ requires = [ "setuptools", "setuptools-scm", - "cython", - "tqdm>=4.64", - "tables>=3.7", - "pyrodigal>=3.0.1", - "networkx>=3.0", - "scipy>=1.10.0", - "plotly>=4.14.3", - "gmpy2>=2.1.2", - "pandas>=2.0", - "colorlover>=0.3", - "numpy>=1.24", - "bokeh>=2.4.2,<3" + "cython" ] build-backend = "setuptools.build_meta" py_modules=["ppanggolin"] @@ -57,6 +46,11 @@ doc = [ "myst-parser==2", "docutils==0.18.1", "sphinxcontrib.mermaid==0.9.2", +] +test = [ + "pytest>=7.0.0" +] +python_deps = [ "tqdm>=4.64", "tables>=3.7", "pyrodigal>=3.0.1", @@ -69,9 +63,6 @@ doc = [ "numpy>=1.24", "bokeh>=2.4.2,<3" ] -test = [ - "pytest>=7.0.0" -] # [project.urls] Homepage = "https://labgem.genoscope.cns.fr/2023/04/27/ppanggolin/" From 7bfc3b2248e77a49096d6441a542ad79d8c09748 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 10:46:44 +0100 Subject: [PATCH 37/51] clean readthedocs --- .readthedocs.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 92e946c5..11afb0d3 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -17,10 +17,6 @@ build: os: ubuntu-22.04 tools: python: "3.8" -# python: "mambaforge-22.9" - -# conda: -# environment: ppanggolin_env.yaml # Build documentation in the "docs/" directory with Sphinx From 3264b124c441523000c925b7cc9a5da0a9e3a460 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 10:55:51 +0100 Subject: [PATCH 38/51] update check doc gh action --- .github/workflows/check_doc.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/check_doc.yml b/.github/workflows/check_doc.yml index 165fbca9..f2613ea2 100644 --- a/.github/workflows/check_doc.yml +++ b/.github/workflows/check_doc.yml @@ -11,7 +11,12 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: '3.8' + - run: pip install .[doc,python_deps] + # Standard drop-in approach that should work for most people. - uses: ammaraskar/sphinx-action@master with: From 12c79432cfb1fe192be80837f971d629adbe709d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 11:00:15 +0100 Subject: [PATCH 39/51] update check doc gh action --- .github/workflows/check_doc.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/check_doc.yml b/.github/workflows/check_doc.yml index f2613ea2..2b009f7f 100644 --- a/.github/workflows/check_doc.yml +++ b/.github/workflows/check_doc.yml @@ -4,6 +4,7 @@ on: push: paths: - 'docs/**' + - '.readthedocs.yaml' jobs: build: From cc85c051e7a1166e666236d48416246f297cbd82 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 11:01:21 +0100 Subject: [PATCH 40/51] fix makefile --- docs/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf1..ed880990 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -5,7 +5,7 @@ # from the environment for the first two. SPHINXOPTS ?= SPHINXBUILD ?= sphinx-build -SOURCEDIR = source +SOURCEDIR = . BUILDDIR = build # Put it first so that "make" without argument is like "make help". 
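The preceding patches declare the `doc` and `python_deps` extras in `pyproject.toml` and use them in the Read the Docs configuration and the `check_doc` workflow. The CI documentation build can be reproduced locally with the same commands the workflow uses; the following is a minimal sketch assuming the `doc` and `python_deps` extras declared above:

```bash
# Reproduce the documentation build run by the check_doc workflow (sketch).
# Assumes the 'doc' and 'python_deps' extras declared in pyproject.toml.
pip install .[doc,python_deps]
cd docs/
sphinx-build -b html . build/
```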
From d4251c0bf42943c3122f58c1d0295184554d9947 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 11:25:07 +0100 Subject: [PATCH 41/51] update check doc action --- .github/workflows/check_doc.yml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/check_doc.yml b/.github/workflows/check_doc.yml index 2b009f7f..9ca53ff5 100644 --- a/.github/workflows/check_doc.yml +++ b/.github/workflows/check_doc.yml @@ -5,7 +5,8 @@ on: paths: - 'docs/**' - '.readthedocs.yaml' - + - '.github/workflows/check_doc.yml' + jobs: build: @@ -14,15 +15,16 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 + name: install ppanggolin with python deps and doc deps with: python-version: '3.8' - run: pip install .[doc,python_deps] - # Standard drop-in approach that should work for most people. - - uses: ammaraskar/sphinx-action@master - with: - docs-folder: "docs/" - build-command: "sphinx-build -b html . build/" + - name: Complete workflow + shell: bash -l {0} + run: | + cd docs/ + sphinx-build -b html . build/ # Great extra actions to compose with: # Create an artifact of the html output. - uses: actions/upload-artifact@v1 From 7b2e7bab27f155856a106d10635c09af38520d6a Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 11:28:04 +0100 Subject: [PATCH 42/51] fix step name in gh action --- .github/workflows/check_doc.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/check_doc.yml b/.github/workflows/check_doc.yml index 9ca53ff5..5e8b43f0 100644 --- a/.github/workflows/check_doc.yml +++ b/.github/workflows/check_doc.yml @@ -6,7 +6,7 @@ on: - 'docs/**' - '.readthedocs.yaml' - '.github/workflows/check_doc.yml' - + jobs: build: @@ -15,10 +15,10 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 - name: install ppanggolin with python deps and doc deps with: python-version: '3.8' - - run: pip install .[doc,python_deps] + - name: install ppanggolin with python deps and doc deps + run: pip install .[doc,python_deps] - name: Complete workflow shell: bash -l {0} From 1feaacf2478348197738621601393bd8dd17d6dc Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 11:55:05 +0100 Subject: [PATCH 43/51] update api doc --- docs/api/modules.md | 8 -------- docs/api/ppanggolin.formats.md | 13 +++++++++++-- docs/index.md | 12 +----------- 3 files changed, 12 insertions(+), 21 deletions(-) delete mode 100644 docs/api/modules.md diff --git a/docs/api/modules.md b/docs/api/modules.md deleted file mode 100644 index 6e569604..00000000 --- a/docs/api/modules.md +++ /dev/null @@ -1,8 +0,0 @@ -# API Reference - -```{toctree} -:maxdepth: 2 -ppanggolin -indice_and_table -``` - diff --git a/docs/api/ppanggolin.formats.md b/docs/api/ppanggolin.formats.md index 5f9ec7ae..a13be3db 100644 --- a/docs/api/ppanggolin.formats.md +++ b/docs/api/ppanggolin.formats.md @@ -29,10 +29,19 @@ :show-inheritance: ``` -## ppanggolin.formats.writeFlat module +## ppanggolin.formats.writeFlatGenomes module ```{eval-rst} -.. automodule:: ppanggolin.formats.writeFlat +.. automodule:: ppanggolin.formats.writeFlatGenomes + :members: + :undoc-members: + :show-inheritance: +``` + +## ppanggolin.formats.writeFlatPangenome module + +```{eval-rst} +.. 
automodule:: ppanggolin.formats.writeFlatPangenome :members: :undoc-members: :show-inheritance: diff --git a/docs/index.md b/docs/index.md index de7135a5..2ffd38c2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -86,15 +86,5 @@ user/metadata dev/contribute dev/buildDoc -api/modules +api/api_ref ``` - - -# Indices and tables -[//]: # (- {ref}`ppanggolin package`) - -- {ref}`genindex` - -- {ref}`modindex` - -- {ref}`search` From d896a14bd8690ab38bbdaaed2846ebcc92bab196 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 11:55:50 +0100 Subject: [PATCH 44/51] add entry point for the api doc --- docs/api/api_ref.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/api/api_ref.md diff --git a/docs/api/api_ref.md b/docs/api/api_ref.md new file mode 100644 index 00000000..6e569604 --- /dev/null +++ b/docs/api/api_ref.md @@ -0,0 +1,8 @@ +# API Reference + +```{toctree} +:maxdepth: 2 +ppanggolin +indice_and_table +``` + From c527a838b28d325b4899fdd57862db9128332aa4 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 15:33:49 +0100 Subject: [PATCH 45/51] update build DOC doc --- docs/dev/buildDoc.md | 285 ++++++++++++++++--------------------------- 1 file changed, 105 insertions(+), 180 deletions(-) diff --git a/docs/dev/buildDoc.md b/docs/dev/buildDoc.md index cdc3a3d9..7f379b35 100644 --- a/docs/dev/buildDoc.md +++ b/docs/dev/buildDoc.md @@ -1,250 +1,175 @@ -# Build the documentation -This partdescribe the guidelines to build the documentation of PPanGGOLiN. +# Building the Documentation -```{warning} -When you will merge or pull request your branch on master, a bot from readthedoc will see it and update the doc online. -Be sure that your doc is clean and without error. -``` +This section provides guidelines for building the PPanGGOLiN documentation locally. + +## Setting Up the Environment -## Install required packages +Before proceeding, ensure that you have installed PPanGGOLiN from the source code. For detailed instructions, refer to [this section](../user/install.md#installing-from-source-code-github). -Required packages are listed below : -```text -sphinx==6.2.1 -sphinx_rtd_theme==1.2.2 -readthedocs-sphinx-search==0.3.1 -sphinx-autobuild==2021.3.14 -myst-parser==1.0.0 +The necessary packages to build the documentation are listed in the 'requirements.txt' file located in the `doc/` folder. + +```bash +pip install -r docs/requirements.txt ``` -To build the doc you need to use an environnement with ppanggolin installed. -To make think easier [pyproject.toml file](../../pyproject.toml) contain the same list of requirement -and can install everything automatically with pip. + +Alternatively, the same list of requirements is available in the [pyproject.toml file](../../pyproject.toml), which allows for automatic installation using `pip`. + ```shell -# PPanGGOLiN=/path/to/ppanggolin/ -pip install $PPanGGOLiN[doc] # You can add -e to install in editable mode +# Replace '/path/to/ppanggolin/' with your actual path +pip install /path/to/ppanggolin/[doc] +``` + +## Building Documentation with Sphinx +### Build and produce an html + +Building the documentation is as simple as : + + +```bash +# Replace '/path/to/ppanggolin/' with your actual path +cd /path/to/ppanggolin/docs/ +sphinx-build -b html . 
build/
+```
+You can also use the Makefile as follows:
+
+
+```bash
+# Replace '/path/to/ppanggolin/' with your actual path
+cd /path/to/ppanggolin/docs/
+make html
+```
 
-## Build documentation with sphinx
-You can look at your modification in live by using **sphinx-autobuild** (installed previously).
+
+### Build with autobuild
+
+You can visualize your modifications in real-time using **sphinx-autobuild**, a tool previously installed.
 
 ```shell
 cd $PPanGGOLiN/docs
 sphinx-autobuild . build/
-#copy server adresse, for me (as example) http://127.0.0.1:8000
-#paste the adresse in your browser
+# Copy the server address, for example: http://127.0.0.1:8000
+# Paste the address in your browser
 ```
 
 ```{note}
-The package [readthedocs-sphinx-search](https://readthedocs-sphinx-search.readthedocs.io/en/latest/) "enable search as you type for docs hosted on Read the Docs". It's only work on ReadTheDocs web site `[INFO] Docs are not being served on Read the Docs, readthedocs-sphinx-search will not work.`, don't try to make it work.
+The package [readthedocs-sphinx-search](https://readthedocs-sphinx-search.readthedocs.io/en/latest/) enables "search as you type" functionality for docs hosted on Read the Docs. Please note that it only functions on the ReadTheDocs website. `[INFO] Docs are not being served on Read the Docs, readthedocs-sphinx-search will not work.`
 ```
 
-### Modify existing documentation
-In this part we will speak about how to change the already existing documentation files.
-To add files for command, package, ... See [Adding section](#heading-adding)
+### Editing or Adding Documentation
+
 
-To modify the existing user or developper documentation, you simply need to go to the file where you want to make a change and modify it.
+To modify existing documentation:
 
-The API documentation is automatically update when you modify the docstring in the code.
-It's also working when you add function, method, class, ect, in an already existing package,
-but not if you add new package (new file in the ppanggolin), for this look at [Update API documentation](#add-api-doc).
+1. **Navigate to the Document**: Go to the file you wish to edit and make necessary changes.
 
 (heading-adding)=
-### Adding to existing documentation
-#### Adding user documentation file
-User documentation should contain files relative to new command, example and information about PPanGGOLiN.
-To ensure efficency, file name should correspond to the main topic.
-A file should not be long, prefer to split in multiple files.
+To add a new page:
 
-When the file is created, you can add it to the index in the *toctree UserGuide* by adding a line `user/filename`
-without the file extension (.md) in the **index file**.
+1. **Create Markdown File**: Place the new markdown file in the relevant folder within the 'docs' directory ('user' for user documentation or 'dev' for developer documentation).
+2. **Update Table of Contents (TOC)**: Add a reference to the newly added file in the 'index.md' file at the root of the docs folder under the 'user' or 'dev' TOC tree.
 
-#### New guidelines for development
-All new guidelines that seems interesting are welcomed.
-If you think that the guidelines could not be added to an existing file, you can create a new one.
-Use an explicit name for your file and add it to the *toctree DevelopperGuide*
 
 (add-api-doc)=
 #### Update API documentation
 
-The API documentation is build automatically.
+The API reference documentation is regenerated automatically each time the documentation is built by Sphinx.
+In case you added a new file in the ppanggolin code base + To update the API documentation and keep the automatic update when a new package, module, submodules is added follow the next lines: ```shell sphinx-apidoc -o api $PPanGGOLiN/ppanggolin -f ``` -```{warning} +```{note} *sphinx-apidoc* will generate ReStructeredText files. You need to convert them in markdown. For this follow the guides [here](#rst2md) ``` -### Creating a new documentation from scratch -#### Quickstart with sphinx -```{warning} -This must be discuss with repository administrators. -``` -To create the documentation from scratch, rename the existing documentation (or use another name for the new one) -and follow the next steps. + +### Creating a New Documentation from Scratch + +This section documents how the current documentation has been created. + +#### Quickstart with Sphinx + +To start the documentation process from scratch, follow these steps to either rename the existing documentation or provide a new name for the upcoming documentation. ```shell DOCS=path/to/PPanGGOLiN/docs sphinx-quickstart $DOCS -#Welcome to the Sphinx 6.2.1 quickstart utility. -# -#Please enter values for the following settings (just press Enter to -#accept a default value, if one is given in brackets). -# -#Selected root path: docs_scratch -# -#You have two options for placing the build directory for Sphinx output. -#Either, you use a directory "_build" within the root path, or you separate -#"source" and "build" directories within the root path. -#> Separate source and build directories (y/n) [n]: y -# -#The project name will occur in several places in the built documentation. -#> Project name: PPanGGOLiN -#> Author name(s): Jérôme Arnoux -#> Project release []: 1.2.174 -# -#If the documents are to be written in a language other than English, -#you can select a language here by its language code. Sphinx will then -#translate text that it generates into that language. -# -#For a list of supported codes, see -#https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-language. -#> Project language [en]: -# -#Creating file /your/path/PPanGGOLiN/docs_scratch/source/conf.py. -#Creating file /your/path/Projects/PPanGGOLiN/docs_scratch/source/index.rst. -#Creating file /your/path/Projects/PPanGGOLiN/docs_scratch/Makefile. -#Creating file /your/path/Projects/PPanGGOLiN/docs_scratch/make.bat. -# -#Finished: An initial directory structure has been created. -# -#You should now populate your master file /home/jarnoux/Projects/PPanGGOLiN/docs_scratch/source/index.rst and create other documentation -#source files. Use the Makefile to build the docs, like so: -# make builder -#where "builder" is one of the supported builders, e.g. html, latex or linkcheck. ``` -Now you have a documentation folder ready to use. -#### Configuration file -In the *source* directory you should find a `conf.py` file. Replace the code inside by the following. -```python -# Configuration file for the Sphinx documentation builder. -# -# This file only contains a selection of the most common options. For a full -# list see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -# -- Path setup -------------------------------------------------------------- +Upon executing the command, you will be prompted with a series of settings in order to setup the new documentation folder. -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. 
If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -from pathlib import Path +We used so far the default settings as follow: -# -- Project information ----------------------------------------------------- +- Separate source and build directories (y/n) [n]: **n** -project = 'PPanGGOLiN' -copyright = 'LABGeM' -author = 'Jérôme Arnoux' +- Project name: **PPanGGOLiN** +- Author name(s): **Your name** +- Project release []: **The current version of PPanGGOLiN** -# The full version, including alpha/beta/rc tags -release = open(Path(__file__).resolve().parents[2]/"VERSION").read().rstrip() # Get release number in the VERSION file -# -- General configuration --------------------------------------------------- - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "myst_parser", - # "sphinxcontrib.jquery", - "sphinx.ext.duration", - "sphinx.ext.autosectionlabel", - "sphinx.ext.autodoc", - 'sphinx_search.extension', -] +#### Configuration file -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +Locate the conf.py file within the docs directory. You can modify this file similarly to the adjustments made in the current `conf.py` file. -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] +(rst2md)= +#### ReStructeredText to markdown +reStructuredText (rst) is the default plaintext markup language used by both Docutils and Sphinx. +Despite being more comprehensive, it's considered slightly older and less user-friendly compared to Markdown. -# -- Options for HTML output ------------------------------------------------- +We have decided to use Markdown (md) instead of reStructuredText for our documentation +We will use [MyST](https://mystmd.org/guide) to translate RST files to Markdown while preserving all features provided by reStructuredText. -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'sphinx_rtd_theme' +For this we will need to install the package `rst-to-myst`. -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] -``` -(rst2md)= -#### ReStructeredText to markdown -reStructuredText (rst) is the default plaintext markup language used by both Docutils and Sphinx. -More complete but a little bit older than Markdown, which is easier to use too. -We are going to change rst for Markdown (md). -To translate rst and keep all the features, we will use [MyST](https://mystmd.org/guide). +```shell +pip install rst-to-myst -For this case we will need to install a new package `rst-to-myst`. 
-```{note} We advice to use another environment, because as far as we know today, this package is not compatible with our sphinx version -``` +rst2myst convert index.rst -```shell -pip install rst-to-myst[sphinx] -# Go to your environment with rst2myst -rst2myst convert source/index.rst -# Go back to your environment with ppanggolin -rm source/index.rst -``` -#### README in index.md -It's possible to add the **README** file in the index to don't have to rewrite it in the doc. -Simply add the following line in `index.md` -```markdown - ```{include} ../../README.md - :relative-images: % To - ``` -% Without tabulation +# remove rst file(s) +rm index.rst ``` #### User documentation -The user documentation is completely handwritten. Moreover, we advise respecting the following guidelines: +Here are some general guidelines to write user documentation: -1. One file per topic/command with an explicit text on the feature -2. One file for the installation guidelines -3. One file on how to report issue or enhancement -4. Don't ref to any function in the ppanggolin code. This is reserved for developper documentation +1. **Topic/Command Separation**: Create individual files for each topic or command, offering explicit explanations of the feature's functionality. + - **Enhance with Examples**: Include example code snippets and output figures or initial lines of output files wherever applicable. -#### Developper documentation -The developper documentation is handwritten too. We advise respecting the following guidelines: -1. Spoke about the PEP rules -2. Give guidelines on how to use git and GitHub for version control -3. Explain how to write unit test and modify GitHub workflows -4. Write how to enhance the documentation -5. Select some function, class or command that are central in the code and provide a more complete description of them. +2. **Clarity and Precision**: Strive for utmost clarity by defining acronyms and jargon used within the documentation. #### API documentation -To build the API documentation and use the docstring in code, you can use the command `sphinx-apidoc` as follows: -```shell + +To generate the API documentation using the docstrings in your code, follow these steps: + +1. Using `sphinx-apidoc`: + +Generate the API documentation files with the `sphinx-apidoc` command: + +```bash +# Generate API doc files sphinx-apidoc -o api $PPanGGOLiN/ppanggolin -# Go to your environment with rst2myst +``` + +This command creates an 'api' folder containing the skeleton of the API reference pages. + +2. Translating to MyST with `rst-to-myst`: + +Translate these generated files into MyST markdown using `rst-to-myst`: + +```bash +# Translate them into MyST rst2myst convert api/*.rst -# Go back to your environment with sphinx + +# Remove remaining RST files rm api/*.rst ``` -You have now documentation for PPanGGOLiN api. To ref api in your doc you can paste **\{ref\}\`package ppanggolin\`** ```{tip} With the "sphinx.ext.autosectionlabel", you will certainly get multiple warning for duplicate label. From f791622be0219d0fe03abbe27d7e1996b4777637 Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 15:53:57 +0100 Subject: [PATCH 46/51] edit msa doc file --- docs/user/MSA.md | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/docs/user/MSA.md b/docs/user/MSA.md index ee6d0ee5..c62337f5 100644 --- a/docs/user/MSA.md +++ b/docs/user/MSA.md @@ -1,29 +1,43 @@ # Multiple Sequence Alignment -This command is available from 1.1.103 and on. 
-It is used to call [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to compute MSA of any partition of the pangenome. Using multiple cpus is recommended as it is quite demanding in computational resources. +The commande msa compute multiple sequence alignement of any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple cpus is recommended as multiple alignment can be quite demanding in computational resources. + +This command can be used as follow: + +```bash +ppanggolin msa -p pangenome.h5 +``` By default it will write the strict 'core' (genes that are present in absolutely all genomes) and remove any duplicated genes. Beware however that, if you have many genomes (over 1000), the core will likely be either very small or even empty if you have fragmented genomes. It will write one MSA for each family. You can then provide the directory where the MSA are written to [IQ-TREE](https://github.com/Cibiv/IQ-TREE) for example, to do phylogenetic analysis. -### partitions +### Modify the partition with `--partition` You can change the partition which is written, by using the --partition option. -`ppanggolin msa -p pangenome.h5 --partition persistent` for example will compute MSA for all the persistent gene families. -Supported partitions are core, persistent, shell, cloud, softcore, accessory. If you wish to have additional filters, you can raise an issue with your demand, or write a PR directly, most possibilites should be quite straightforward to add. +for example will compute MSA for all the persistent gene families. + +```bash +ppanggolin msa -p pangenome.h5 --partition persistent +``` + +Supported partitions are `core`, `persistent`, `shell`, `cloud`, `softcore`, `accessory`. If you wish to have additional filters, you can raise an issue in the [issue tracker](https://github.com/labgem/PPanGGOLiN/issues) with your demand, or write a PR directly (see [here](../dev/contribute.md) for instruction on how to contribute), most possibilites should be quite straightforward to add. ### source You can specify whether to use dna or protein sequences for the MSA by using --source. It uses protein sequences by default. -`ppanggolin msa -p pangenome.h5 --source dna` +```bash +ppanggolin msa -p pangenome.h5 --source dna +``` ### phylo It is also possible to write a single whole genome MSA file, which many phylogenetic softwares accept as input, by using the --phylo option as such: -`ppanggolin msa -p pangenome.h5 --phylo` +```bash +ppanggolin msa -p pangenome.h5 --phylo +``` This will contatenate all of the family MSA into a single MSA, with one sequence for each genome. \ No newline at end of file From eaf58ab66d0423c80f61c81505899d779a14becd Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 15:55:08 +0100 Subject: [PATCH 47/51] edit msa doc file again --- docs/user/MSA.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/user/MSA.md b/docs/user/MSA.md index c62337f5..04ea268b 100644 --- a/docs/user/MSA.md +++ b/docs/user/MSA.md @@ -1,6 +1,6 @@ # Multiple Sequence Alignment -The commande msa compute multiple sequence alignement of any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple cpus is recommended as multiple alignment can be quite demanding in computational resources. 
+The `msa` command computes multiple sequence alignments for any partition of the pangenome. The command uses [mafft](https://mafft.cbrc.jp/alignment/software/) with default options to perform the alignment. Using multiple CPUs with the `--cpu` argument is recommended, as multiple alignment can be quite demanding in computational resources.
 
 This command can be used as follow:
 
@@ -22,19 +22,19 @@ for example will compute MSA for all the persistent gene families.
 ppanggolin msa -p pangenome.h5 --partition persistent
 ```
 
-Supported partitions are `core`, `persistent`, `shell`, `cloud`, `softcore`, `accessory`. If you wish to have additional filters, you can raise an issue in the [issue tracker](https://github.com/labgem/PPanGGOLiN/issues) with your demand, or write a PR directly (see [here](../dev/contribute.md) for instruction on how to contribute), most possibilites should be quite straightforward to add.
+Supported partitions are `core`, `persistent`, `shell`, `cloud`, `softcore`, `accessory`. If you need specific filters, you can submit a request in the [issue tracker](https://github.com/labgem/PPanGGOLiN/issues) with your requirements. You can also directly implement the new filter and submit a Pull Request (instructions for contribution can be found [here](../dev/contribute.md)). Most filters should be quite straightforward to add.
 
-### source
+### Choose to align dna or protein sequences with `--source`
 
-You can specify whether to use dna or protein sequences for the MSA by using --source. It uses protein sequences by default.
+You can specify whether to use `dna` or `protein` sequences for the MSA by using `--source`. It uses protein sequences by default.
 
 ```bash
 ppanggolin msa -p pangenome.h5 --source dna
 ```
 
-### phylo
+### Write a single whole MSA file with `--phylo`
 
-It is also possible to write a single whole genome MSA file, which many phylogenetic softwares accept as input, by using the --phylo option as such:
+It is also possible to write a single whole-genome MSA file, which many phylogenetic tools accept as input, by using the `--phylo` option as such:
 
 ```bash
 ppanggolin msa -p pangenome.h5 --phylo
 ```
From de37a897d1971aed7c504302c19a7a567796290f Mon Sep 17 00:00:00 2001
From: JeanMainguy
Date: Tue, 19 Dec 2023 17:34:59 +0100
Subject: [PATCH 48/51] fix double gz extension

---
 ppanggolin/formats/writeFlatPangenome.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py
index d53ad8eb..6d6b0451 100644
--- a/ppanggolin/formats/writeFlatPangenome.py
+++ b/ppanggolin/formats/writeFlatPangenome.py
@@ -344,7 +344,7 @@ def write_gexf(output: Path, light: bool = True, compress: bool = False):
     txt += "light gexf file for the pangenome graph..." if light else "gexf file for the pangenome graph..."
logging.getLogger("PPanGGOLiN").info(txt) - outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf{'.gz' if compress else ''}" + outname = output / f"pangenomeGraph{'_light' if light else ''}.gexf" with write_compressed_or_not(outname, compress) as gexf: graph_type = 'ligth gexf' if light else 'gexf' logging.getLogger("PPanGGOLiN").debug(f"Writing the {graph_type} header...") From d43c2d3ecfa6f6b2dafd7848b53943ad890a968d Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Tue, 19 Dec 2023 18:22:31 +0100 Subject: [PATCH 49/51] mv call of metadata_sources out of loops to be faster --- ppanggolin/formats/writeFlatPangenome.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ppanggolin/formats/writeFlatPangenome.py b/ppanggolin/formats/writeFlatPangenome.py index 6d6b0451..6bba6539 100644 --- a/ppanggolin/formats/writeFlatPangenome.py +++ b/ppanggolin/formats/writeFlatPangenome.py @@ -238,6 +238,8 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): 'cloud': 'a="0" b="255" g="222" r="121"'} if not light: index = pan.get_org_index() + + pan_metadata_sources = pan.metadata_sources("families") for fam in pan.gene_families: name = Counter() @@ -276,7 +278,7 @@ def write_gexf_nodes(gexf: TextIO, light: bool = True, soft_core: False = 0.95): gexf.write(f' \n') shift = 14 source_fields = {m.source: m.fields for f in pan.gene_families if len(list(f.metadata)) > 0 for m in f.metadata} - for source_metadata_families in pan.metadata_sources("families"): + for source_metadata_families in pan_metadata_sources: to_concat = defaultdict(list) for m in fam.metadata: if m.source == source_metadata_families: @@ -307,7 +309,7 @@ def write_gexf_edges(gexf: TextIO, light: bool = True): edgeids = 0 index = pan.get_org_index() shift = 14 - + metadata_count = len(pan.metadata_sources("families")) for edge in pan.edges: gexf.write(f' \n') @@ -316,8 +318,7 @@ def write_gexf_edges(gexf: TextIO, light: bool = True): gexf.write(f' \n') if not light: for org, genes_pairs in edge.get_organisms_dict().items(): - gexf.write( - f' \n') + gexf.write(f' \n') gexf.write(' \n') gexf.write(' \n') edgeids += 1 From c90dc812d9e06486eb5af0c4ce3f6c047ec1ca52 Mon Sep 17 00:00:00 2001 From: Jean Mainguy Date: Wed, 20 Dec 2023 16:44:00 +0100 Subject: [PATCH 50/51] Update VERSION to 2.0.0-alpha1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 897c4fa2..5d8955fb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.194 +2.0.0-alpha1 From ff80fb172fee953b94d66dae23547db678acb54b Mon Sep 17 00:00:00 2001 From: JeanMainguy Date: Thu, 21 Dec 2023 16:48:56 +0100 Subject: [PATCH 51/51] add version check between ppanggolin file and the installed ppanggolin --- ppanggolin/formats/readBinaries.py | 7 ++++++ ppanggolin/pangenome.py | 12 +++++++--- ppanggolin/utils.py | 38 ++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/ppanggolin/formats/readBinaries.py b/ppanggolin/formats/readBinaries.py index 1a6d0a8a..cf1724af 100644 --- a/ppanggolin/formats/readBinaries.py +++ b/ppanggolin/formats/readBinaries.py @@ -105,6 +105,13 @@ def get_status(pangenome: Pangenome, pangenome_file: Path): pangenome.status["geneFamilySequences"] = "inFile" if status_group._v_attrs.NeighborsGraph: pangenome.status["neighborsGraph"] = "inFile" + + if hasattr(status_group._v_attrs, "version"): + pangenome.status["ppanggolin_version"] = str(status_group._v_attrs.version) + else: + 
logging.getLogger("PPanGGOLiN").error(f'The provided pangenome file {pangenome_file} does not have a version stored in its status.' + ' This issue may indicate that the file is corrupted.') + pangenome.status["ppanggolin_version"] = None if status_group._v_attrs.Partitioned: pangenome.status["partitioned"] = "inFile" diff --git a/ppanggolin/pangenome.py b/ppanggolin/pangenome.py index 6f073bf8..e7cc0460 100644 --- a/ppanggolin/pangenome.py +++ b/ppanggolin/pangenome.py @@ -66,19 +66,25 @@ def __init__(self): } self.parameters = {} - def add_file(self, pangenome_file: Path): - """Links an HDF5 file to the pangenome. If needed elements will be loaded from this file, + def add_file(self, pangenome_file: Path, check_version:bool=True): + """ + Links an HDF5 file to the pangenome. If needed elements will be loaded from this file, and anything that is computed will be saved to this file when :func:`ppanggolin.formats.writeBinaries.writePangenome` is called. :param pangenome_file: A string representing filepath to hdf5 pangenome file to be either used or created - + :param check_version: Check ppanggolin version of the pangenome file to be compatible with the current version of ppaggolin being used. :raises AssertionError: If the `pangenome_file` is not an instance of the Path class """ assert isinstance(pangenome_file, Path), "pangenome file should be a Path object type" from ppanggolin.formats.readBinaries import get_status + from ppanggolin.utils import check_version_compatibility # importing on call instead of importing on top to avoid cross-reference problems. + get_status(self, pangenome_file) + + check_version_compatibility(self.status["ppanggolin_version"]) + self.file = pangenome_file.absolute().as_posix() """ Gene Methods""" diff --git a/ppanggolin/utils.py b/ppanggolin/utils.py index ee256391..59c16bed 100755 --- a/ppanggolin/utils.py +++ b/ppanggolin/utils.py @@ -1065,3 +1065,41 @@ def flatten(dictionary, parent_key=''): flatten(nested_dict) return flat_dict + +def get_major_version(version: str) -> int: + """ + Extracts the major version number from a version string. + + :param version: A string representing the version number. + :return: The major version extracted from the input version string. + :raises ValueError: If the input version does not have the expected format. + """ + try: + major_version = int(version.split('.')[0]) + except ValueError: + raise ValueError(f"Version {version} does not have the expected format.") + + return major_version + + +def check_version_compatibility(file_version: str) -> None: + """ + Checks the compatibility of the provided pangenome file version with the current PPanGGOLiN version. + + :param file_version: A string representing the version of the pangenome file. + """ + # Get the current PPanGGOLiN version + current_version = distribution('ppanggolin').version + + current_version_major = get_major_version(current_version) + file_major_version = get_major_version(file_version) + + # Check for compatibility issues + if file_major_version != current_version_major: + logging.getLogger("PPanGGOLiN").error('Your pangenome file has been created with a different major version ' + 'of PPanGGOLiN than the one installed in the system. This mismatch may lead to compatibility issues.') + + if file_major_version < 2 and current_version_major >= 2: + raise ValueError(f'The provided pangenome file was created by PPanGGOLiN version {file_version}, which is ' + f'incompatible with the current PPanGGOLiN version {current_version}.') +