Merge pull request #205 from labgem/TranslateGene

Translate gene
labgem · Jun 10, 2024 · 0efb670 · 0efb670
2 parents 85ecdb9 + f30ab34
commit 0efb670
Show file tree

Hide file tree

Showing 12 changed files with 797 additions and 557 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -89,17 +89,18 @@ jobs:
         ppanggolin rarefaction --output stepbystep -f -p stepbystep/pangenome.h5 --depth 5 --min 1 --max 50 -ms 10 -fd -ck 30 -K 3 --soft_core 0.9 -se $RANDOM
         ppanggolin draw -p stepbystep/pangenome.h5 --tile_plot --nocloud --soft_core 0.92 --ucurve --output stepbystep -f
         ppanggolin rgp -p stepbystep/pangenome.h5 --persistent_penalty 2 --variable_gain 1 --min_score 3 --dup_margin 0.05
-        ppanggolin spot -p stepbystep/pangenome.h5 --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1
+        ppanggolin spot -p stepbystep/pangenome.h5 --output stepbystep --spot_graph --overlapping_match 2 --set_size 3 --exact_match_size 1 -f
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots -o stepbystep -f
         ppanggolin module -p stepbystep/pangenome.h5 --transitive 4 --size 3 --jaccard 0.86 --dup_margin 0.05
         ppanggolin write_pangenome -p stepbystep/pangenome.h5 --output stepbystep -f --soft_core 0.9 --dup_margin 0.06  --gexf --light_gexf --csv --Rtab --stats --partitions --compress --json --spots --regions --borders --families_tsv --cpu 1 
         ppanggolin write_genomes  -p stepbystep/pangenome.h5 --output stepbystep -f --fasta genomes.fasta.list --gff --proksee --table
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families all --gene_families shell --regions all --fasta genomes.fasta.list
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp 
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families rgp --gene_families rgp --compress 
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families softcore --gene_families softcore 
         ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families module_0
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --prot_families core
-        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --genes core --proteins cloud
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --gene_families module_0 --genes module_0 --compress
+        ppanggolin fasta -p stepbystep/pangenome.h5 --output stepbystep -f --proteins cloud --cpu $NUM_CPUS --keep_tmp --compress
 
         ppanggolin draw -p stepbystep/pangenome.h5 --draw_spots --spots all -o stepbystep -f
         ppanggolin metrics -p stepbystep/pangenome.h5 --genome_fluidity --no_print_info --recompute_metrics --log metrics.log
@@ -179,6 +180,8 @@ jobs:
         head genomes.gbff.list | sed 's/^/input_genome_/g' > genomes.gbff.head.list
         ppanggolin projection --pangenome stepbystep/pangenome.h5  -o projection_from_list_of_gbff --anno genomes.gbff.head.list --gff --proksee --cpu $NUM_CPUS
 
+        head genomes.fasta.list | sed 's/^/input_genome_/g' > genomes.fasta.head.list
+        ppanggolin projection --pangenome myannopang/pangenome.h5  -o projection_from_list_of_fasta --fasta genomes.fasta.head.list --gff --proksee --cpu $NUM_CPUS
 
         ppanggolin projection --pangenome mybasicpangenome/pangenome.h5  -o projection_from_single_fasta \
                               --genome_name chlam_A --fasta FASTA/GCF_002776845.1_ASM277684v1_genomic.fna.gz \

diff --git a/docs/user/writeFasta.md b/docs/user/writeFasta.md
@@ -18,7 +18,10 @@ When using the `softcore` filter, the `--soft_core` option can be used to modify
 
 ## Genes
 
-This option can be used to write the nucleotide CDS sequences. It can be used as such, to write all of the genes of the pangenome for example:
+### Nucleotide sequences
+
+With the `--genes partition` option PPanGGOLiN will write the nucleotide CDS sequences for the given partition.
+It can be used as such, to write all the genes of the pangenome for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes all
@@ -30,34 +33,72 @@ Or to write only the persistent genes:
 ppanggolin fasta -p pangenome.h5 --output MY_GENES --genes persistent
 ```
 
+### Protein sequences
+
+With the `--proteins partition` option PPanGGOLiN will write the protein sequences of the genes for the given partition. 
+It can be used as such, to write the protein sequences of all the genes of the pangenome for example:
 
-## Protein families
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins all
+```
 
-This option can be used to write the protein sequences of the representative sequences for each family. It can be used as such for all families:
+Or to write only the protein sequences of the cloud genes:
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_GENES --proteins cloud
+```
+
+To translate the gene sequences, PPanGGOLiN uses the [MMSeqs2](https://github.com/soedinglab/MMseqs2) `translatenucs` command. 
+So for this option you can specify multiple threads with `--cpu`.
+You can also specify the translation table to use with `--translate_table`.
+The temporary directory used to store the [MMSeqs2](https://github.com/soedinglab/MMseqs2) database and other files can be specified with `--tmpdir`. Temporary files will be deleted at the end of the execution. To keep them, you can use the `--keep_tmp` option.
+
+## Gene families
+
+### Protein sequences
+
+With the `--prot_families partition` option PPanGGOLiN will write the protein sequences of the representative gene for each family for the given partition. 
+It can be used as such for all families:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families all
 ```
 
-or for all of the shell families for example:
+Or for all the shell families for example:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_PROT --prot_families shell
 ```
 
+### Nucleotide sequences
 
-## Gene families
-
-This option can be used to write the gene sequences of the representative sequences for each family. It can be used as such:
+With the `--gene_families partition` option PPanGGOLiN will write the nucleotide sequences of the representative gene for each family for the given partition. 
+It can be used as such for all families:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families all
 ```
 
-or for the cloud families for example:
+Or for the core families for example:
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families core
+```
+
+
+## Modules
+All the previous commands accept a module as partition.
+
+So you can write the protein sequences for the families in module_X as such:  
+
+```bash
+ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --prot_families module_X
+```
+
+Or the nucleotide sequence of all genes in module_X:
 
 ```bash
-ppanggolin fasta -p pangenome.h5 --output MY_GENES_FAMILIES --gene_families cloud
+ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --genes module_X
 ```
 
 ## Regions
@@ -73,4 +114,4 @@ It can be used as such:
 
 ```bash
 ppanggolin fasta -p pangenome.h5 --output MY_REGIONS --regions all --fasta genomes.fasta.list
-```
+```