Skip to content

Commit

Permalink
add parameter to clean Name,product,dbxref,product attributes. Append… (
Browse files Browse the repository at this point in the history
#438)

* add parameter to clean Name,product,dbxref,product attributes. Append and do not replace attribute by default. Avoid pathway when it is - because endup in the DBxref attribute + update doc + adapt test with clean_name parameter
  • Loading branch information
Juke34 authored Mar 7, 2024
1 parent 85b9f35 commit 39e841d
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 24 deletions.
95 changes: 74 additions & 21 deletions bin/agat_sp_manage_functional_annotation.pl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
my $opt_reffile;
my $opt_output;
my $opt_BlastFile;
my $opt_CleanNameAttribute; # Boolean (--clean_name): remove the existing Name attribute value before adding new ones
my $opt_CleanProductAttribute; # Boolean (--clean_product): remove the existing product attribute value before adding new ones
my $opt_CleanOntology_termAttribute; # Boolean (--clean_ontology): remove the existing Ontology_term attribute value before adding new ones
my $opt_CleanDbxrefAttribute; # Boolean (--clean_dbxref): remove the existing Dbxref attribute value before adding new ones
my $opt_InterproFile;
my $opt_name = undef;
my $opt_nameU;
Expand Down Expand Up @@ -77,6 +81,10 @@
GetOptions(
'f|ref|reffile|gff|gff3=s' => \$opt_reffile,
'b|blast=s' => \$opt_BlastFile,
'clean_name!' => \$opt_CleanNameAttribute,
'clean_product!' => \$opt_CleanProductAttribute,
'clean_dbxref!' => \$opt_CleanDbxrefAttribute,
'clean_ontology!' => \$opt_CleanOntology_termAttribute,
'd|db=s' => \$opt_dataBase,
'be|blast_evalue=f' => \$opt_blastEvalue,
'pe=i' => \$opt_pe,
Expand Down Expand Up @@ -298,20 +306,16 @@
foreach my $id_level1 (keys %{$hash_omniscient ->{'level1'}{$primary_tag_level1}}) {
my $feature_level1 = $hash_omniscient->{'level1'}{$primary_tag_level1}{$id_level1};

# Clean NAME attribute
if ($feature_level1->has_tag('Name')) {
$feature_level1->remove_tag('Name');
}

#Manage Name if option setting
#Manage Name
clean_attribute($feature_level1, "Name"); # Clean NAME attribute
if ( $opt_BlastFile ) {

if (exists ($geneNameBlast{$id_level1})) {
my @list_names = @{$geneNameBlast{$id_level1}};
create_or_replace_tag($feature_level1, 'Name', \@list_names);
create_or_append_tag($feature_level1, 'Name', \@list_names);
$nbNamedGene++;

# Keep track of ducplicated gene names <= Find another way
# Keep track of duplicated gene names <= Find another way
foreach my $name (@list_names){

if (exists ($geneNameGiven{$name})) {
Expand All @@ -335,16 +339,12 @@

my $level2_ID = lc($feature_level2->_tag_value('ID'));

# Clean NAME attribute
if ($feature_level2->has_tag('Name')) {
$feature_level2->remove_tag('Name');
}

# Manage Name if option set
# Manage Name
clean_attribute($feature_level2, "Name"); # Clean NAME attribute
if ($opt_BlastFile) {
# add gene Name
if (exists ($mRNANameBlast{$level2_ID})) {
create_or_replace_tag($feature_level2, 'Name', $mRNANameBlast{$level2_ID});
create_or_append_tag($feature_level2, 'Name', $mRNANameBlast{$level2_ID});
add_attribute_to_cds($hash_omniscient, $level2_ID, 'Name', $mRNANameBlast{$level2_ID});
}

Expand All @@ -370,13 +370,14 @@
my $productData = printProductFunct($level2_ID);

#add product attribute
clean_attribute($feature_level2, "product"); # Clean product attribute
if ($productData ne "") {
add_attribute_to_cds($hash_omniscient, $level2_ID, 'product', $productData);
if ($feature_level2->has_tag('pseudo')) {
create_or_replace_tag($feature_level2, 'Note', "product:$productData");
}
else {
create_or_replace_tag($feature_level2, 'product', $productData);
create_or_append_tag($feature_level2, 'product', $productData);
}
}
else {
Expand All @@ -385,7 +386,7 @@
create_or_replace_tag($feature_level2, 'Note', "product:hypothetical protein");
}
else {
create_or_replace_tag($feature_level2, 'product', "hypothetical protein");
create_or_append_tag($feature_level2, 'product', "hypothetical protein");
}
} #Case where the protein is not known
}
Expand Down Expand Up @@ -651,12 +652,39 @@
####
##

# Drop an existing attribute from a feature, but only when the option flag
# paired with that attribute is set.
#
# Arguments:
#   $feature - feature object; only has_tag() and remove_tag() are called on it
#   $tag     - attribute name; Name, product, Dbxref and Ontology_term are
#              handled, any other value leaves the feature untouched
sub clean_attribute {
    my ($feature, $tag) = @_;

    # Each cleanable attribute is paired with the option flag that enables it.
    my @cleaning_rules = (
        [ 'Name',          $opt_CleanNameAttribute ],
        [ 'product',       $opt_CleanProductAttribute ],
        [ 'Dbxref',        $opt_CleanDbxrefAttribute ],
        [ 'Ontology_term', $opt_CleanOntology_termAttribute ],
    );

    foreach my $rule (@cleaning_rules) {
        my ($attribute, $is_enabled) = @{$rule};
        next unless $is_enabled;           # cleaning not requested for this attribute
        next unless $tag eq $attribute;    # rule targets another attribute
        $feature->remove_tag($attribute) if $feature->has_tag($attribute);
    }
    return;
}

sub add_attribute_to_cds {
my ($hash_omniscient, $level2_ID, $tag, $value) = @_;

if($opt_populate_cds){
if ( exists_keys ($hash_omniscient, ('level3', 'cds', lc($level2_ID)) ) ) {
foreach my $feature_level3 ( @{$hash_omniscient->{'level3'}{'cds'}{lc($level2_ID)}}) {
clean_attribute($feature_level3, $tag);
$feature_level3->add_tag_value($tag, $value);
}
}
Expand Down Expand Up @@ -730,6 +758,7 @@ sub addFunctions {
my $data_list;

if (lc($function_type) eq "go") {
clean_attribute($feature, "Ontology_term"); # Clean Ontology_term attribute
foreach my $data (@{$functionData{$function_type}{$ID}}) {
$feature->add_tag_value('Ontology_term', $data);
$data_list .= "$data,";
Expand All @@ -738,6 +767,7 @@ sub addFunctions {
}
}
else {
clean_attribute($feature, "Dbxref"); # Clean Dbxref attribute
foreach my $data (@{$functionData{$function_type}{$ID}}) {
$feature->add_tag_value('Dbxref', $data);
$data_list .= "$data,";
Expand Down Expand Up @@ -1042,7 +1072,7 @@ sub parse_interpro_tsv {
my @tuple = split(/:/, $pathway_tuple); #cut at character :
my $db_name = $tuple[0];
print "pathway info: ".$pathway_tuple."\n" if ($opt_verbose);

next if ($pathway_tuple eq "-"); # avoid empty pathway tuple
if (! grep( /^\Q$pathway_tuple\E$/, @{$functionData{$db_name}{$mRNAID}} ) ) { # to avoid duplicate
$TotalTerm{$db_name}++;
push ( @{$functionData{$db_name}{$mRNAID}} , $pathway_tuple );
Expand Down Expand Up @@ -1145,12 +1175,35 @@ =head1 OPTIONS
=item B<-b> or B<--blast>
String - Input blast ( outfmt 6 = tabular ) file that will be used to complement the features
read from the first file (specified with --ref).
String - Input blast ( outfmt 6 = tabular ) file, usually made by blasting the proteins resulting from the GFF/GTF file provided as input
against a trusted protein database (e.g. Swissprot/Uniprot). The file makes a bridge between the feature ID from the GFF/GTF and the
best protein ID matched in the database used. Thanks to that link, the Name and product (sometimes called description) information
will be extracted from the database fasta file and added to the GFF file. You must provide the same database via --db as the one used
to create this blast output file.
=item B<--clean_name>
Boolean - When activated, if the Name attribute already exists, it will be cleaned (removed). Otherwise the Name retrieved by --blast + --db options
will be appended. Default False (Name attribute not cleaned).
=item B<--clean_product>
Boolean - When activated, if the product attribute already exists, it will be cleaned (removed). Otherwise the product retrieved by --blast + --db options
will be appended. Default False (product attribute not cleaned).
=item B<--clean_dbxref>
Boolean - When activated, if the Dbxref attribute already exists, it will be cleaned (removed). Otherwise the Dbxref retrieved by the --interpro option
will be appended. Default False (Dbxref attribute not cleaned).
=item B<--clean_ontology>
Boolean - When activated, if the Ontology_term attribute already exists, it will be cleaned (removed). Otherwise the Ontology_term retrieved by the --interpro option
will be appended. Default False (Ontology_term attribute not cleaned).
=item B<-d> or B<--db>
String - The fasta file that has been used as DB for the blast. Gene names and products/descriptions will be fished from this file.
String - The fasta file that has been used as DB for the blast. Gene names and products (sometimes called descriptions) will be fished from this file.
=item B<--be> or B<--blast_evalue>
Expand Down
25 changes: 23 additions & 2 deletions docs/tools/agat_sp_manage_functional_annotation.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,29 @@ agat_sp_manage_functional_annotation.pl --help

- **-b** or **--blast**

String - Input blast ( outfmt 6 = tabular ) file that will be used to complement the features
read from the first file (specified with --ref).
String - Input blast ( outfmt 6 = tabular ) file, usually made by blasting the proteins resulting from the GFF/GTF file provided as input
against a trusted protein database (e.g. Swissprot/Uniprot). The file makes a bridge between the feature ID from the GFF/GTF and the
best protein ID matched in the database used. Thanks to that link, the Name and product (sometimes called description) information will be extracted from the database fasta file and added to the GFF file. You must provide the same database via --db as the one used to create
this blast output file.

- **--clean_name**

Boolean - When activated, if the Name attribute already exists, it will be cleaned (removed). Otherwise the Name retrieved by --blast + --db options
will be appended. Default False (Name attribute not cleaned).

- **--clean_product**

Boolean - When activated, if the product attribute already exists, it will be cleaned (removed). Otherwise the product retrieved by --blast + --db options
will be appended. Default False (product attribute not cleaned).

- **--clean_dbxref**

Boolean - When activated, if the Dbxref attribute already exists, it will be cleaned (removed). Otherwise the Dbxref retrieved by the --interpro option
will be appended. Default False (Dbxref attribute not cleaned).

- **--clean_ontology**

Boolean - When activated, if the Ontology_term attribute already exists, it will be cleaned (removed). Otherwise the Ontology_term retrieved by the --interpro option will be appended. Default False (Ontology_term attribute not cleaned).

- **-d** or **--db**

Expand Down
2 changes: 1 addition & 1 deletion t/scripts_output.t
Original file line number Diff line number Diff line change
Expand Up @@ -586,7 +586,7 @@ unlink $outtmp;

$script = $script_prefix."bin/agat_sp_manage_functional_annotation.pl";
$result = "$output_folder/agat_sp_manage_functional_annotation_1.gff";
system(" $script --gff $input_folder/agat_sp_manage_functional_annotation/02413F.gff --db $input_folder/agat_sp_manage_functional_annotation/uniprot_sprot_test.fasta -b $input_folder/agat_sp_manage_functional_annotation/02413F_blast.out -i $input_folder/agat_sp_manage_functional_annotation/02413F_interpro.tsv -o $outtmp 2>&1 1>/dev/null");
system(" $script --gff $input_folder/agat_sp_manage_functional_annotation/02413F.gff --db $input_folder/agat_sp_manage_functional_annotation/uniprot_sprot_test.fasta -b $input_folder/agat_sp_manage_functional_annotation/02413F_blast.out -i $input_folder/agat_sp_manage_functional_annotation/02413F_interpro.tsv --clean_name -o $outtmp 2>&1 1>/dev/null");
#run test
ok( system( "diff $result $outtmp/02413F.gff" ) == 0, "output $script");
rmtree $outtmp;
Expand Down

0 comments on commit 39e841d

Please sign in to comment.