Skip to content

Commit

Permalink
Fix gtf2gff converter to remove cds in special cases - add tests for …
Browse files Browse the repository at this point in the history
…gtf to gff conversion and for gff to gff to check stop codon when starting an exon or split over 2 exons (#125)
  • Loading branch information
Juke34 authored Apr 30, 2021
1 parent 86e3f90 commit cf9e435
Show file tree
Hide file tree
Showing 72 changed files with 4,125 additions and 134 deletions.
69 changes: 66 additions & 3 deletions bin/agat_convert_sp_gff2gtf.pl
Original file line number Diff line number Diff line change
Expand Up @@ -328,24 +328,75 @@ sub convert_feature_type{
# get gff3 feature (ID)
my $level2_ID = lc($feature_level2->_tag_value('ID'));


foreach my $tag_l3_lc (keys %{$hash_omniscient->{'level3'}}){ # primary_tag_key_level3 = cds or exon or start_codon or utr etc...

#shortened CDS to remove stop codon if any
if ( $tag_l3_lc =~ "cds"){
if (exists_keys($hash_omniscient, ('level3', 'stop_codon', $level2_ID) ) ){
my @list_cds = sort {$a->start <=> $b->start} @{$hash_omniscient->{'level3'}{'cds'}{$level2_ID}};
my $first_cds = $list_cds[0];
my $first_cds_size = $first_cds->end - $first_cds->start + 1;
my $last_cds = $list_cds[ $#list_cds ];
my $last_cds_size = $last_cds->end - $last_cds->start + 1;
my $strand = $first_cds->strand;
my $stop_codon = $hash_omniscient->{'level3'}{'stop_codon'}{$level2_ID}->[0];

my @list_stop = sort {$a->start <=> $b->start} @{$hash_omniscient->{'level3'}{'stop_codon'}{$level2_ID}};

if ( ($strand eq "+" ) or ($strand eq "1" ) ) {
my $stop_codon = $list_stop[$#list_stop];
my $stop_codon_size = ($stop_codon->end - $stop_codon->start + 1);

if (check_features_overlap($stop_codon, $last_cds) ){
$last_cds->end($stop_codon->start-1);
if( $stop_codon_size == 3){
if($last_cds_size > 3){
$last_cds->end($stop_codon->start-1);
}
else{remove_cds($hash_omniscient, $level2_ID, $last_cds);}
}
elsif( $stop_codon_size < 3){
if($last_cds_size > $stop_codon_size){
$last_cds->end($stop_codon->start);
}
else{remove_cds($hash_omniscient, $level2_ID, $last_cds);}
# there is another stop_codon
if($#list_stop > 0){
$stop_codon = $list_stop[$#list_stop-1];
$last_cds = $list_cds[ $#list_cds - 1];
$last_cds->end($stop_codon->start - 1);
}
}
else{
print "Warning: stop codon longer than 3 nucleotides for $level2_ID\n";
}
}
}
else{ # minus strand
my $stop_codon = $list_stop[0];
my $stop_codon_size = ($stop_codon->end - $stop_codon->start + 1);

if (check_features_overlap($stop_codon, $first_cds) ){
$first_cds->start($stop_codon->end+1);
if( $stop_codon_size == 3){
if($first_cds_size > 3){
$first_cds->start($stop_codon->end+1);
}
else{remove_cds($hash_omniscient, $level2_ID, $first_cds);}
}
elsif( $stop_codon_size < 3){
if($first_cds_size > $stop_codon_size){
$last_cds->start($stop_codon->end);
}
else{remove_cds($hash_omniscient, $level2_ID, $first_cds);}
# there is another stop_codon
if($#list_stop > 0){
$stop_codon = $list_stop[1];
$first_cds = $list_cds[1];
$first_cds->start($stop_codon->end+1);
}
}
else{
print "Warning: stop codon longer than 3 nucleotides for $level2_ID\n";
}
}
}
}
Expand Down Expand Up @@ -402,6 +453,18 @@ sub convert_feature_type{
}
}

sub remove_cds{
my ($hash_omniscient, $id_l2, $cds_feature)=@_;

my @new_cds_list=();
foreach my $feature ( @{$hash_omniscient->{'level3'}{'cds'}{$id_l2}} ) {
if( lc($feature->_tag_value('ID')) eq lc($cds_feature->_tag_value('ID'))){
next;
}
push @new_cds_list, $feature;
}
@{$hash_omniscient->{'level3'}{'cds'}{$id_l2}} = @new_cds_list;
}

# filter feature type to remove not expected ones
sub print_omniscient_filter{
Expand Down
15 changes: 12 additions & 3 deletions lib/AGAT/OmniscientI.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1680,6 +1680,7 @@ sub _check_l2_linked_to_l3{
sub _check_cds{
my ($debug, $log, $hash_omniscient, $mRNAGeneLink, $miscCount, $uniqID, $uniqIDtoType, $verbose)=@_;
my $resume_case=undef;
my $resume_case2=undef;

if( exists_keys($hash_omniscient,('level3', 'cds')) ){
if( exists_keys($hash_omniscient,('level3', 'stop_codon')) ){
Expand Down Expand Up @@ -1730,6 +1731,7 @@ sub _check_cds{
my $uID = _create_ID($miscCount, $uniqID, $uniqIDtoType, 'cds', $new_cds->_tag_value('ID'), PREFIX_NEW_ID); #method will push the uID
create_or_replace_tag($new_cds, 'ID', $uID); # remove parent ID because, none.
push (@{$hash_omniscient->{"level3"}{'cds'}{$id_l2}}, $new_cds);
$resume_case2++;
}
}
else{
Expand All @@ -1748,7 +1750,7 @@ sub _check_cds{
my $uID = _create_ID($miscCount, $uniqID, $uniqIDtoType, 'cds', $new_cds->_tag_value('ID'), PREFIX_NEW_ID); #method will push the uID
create_or_replace_tag($new_cds, 'ID', $uID); # remove parent ID because, none.
push (@{$hash_omniscient->{"level3"}{'cds'}{$id_l2}}, $new_cds);
$stop_start_exon=1;
$stop_start_exon=1;$resume_case2++;
}
}
}
Expand All @@ -1762,6 +1764,7 @@ sub _check_cds{
$resume_case++;
}
}
# strand is minus
else{
my $cds = $list_cds[0];
my $stop = $list_stop[0];
Expand All @@ -1783,6 +1786,7 @@ sub _check_cds{
my $uID = _create_ID($miscCount, $uniqID, $uniqIDtoType, 'cds', $new_cds->_tag_value('ID'), PREFIX_NEW_ID); #method will push the uID
create_or_replace_tag($new_cds, 'ID', $uID); # remove parent ID because, none.
push (@{$hash_omniscient->{"level3"}{'cds'}{$id_l2}}, $new_cds);
$resume_case2++;
}
}
else{
Expand All @@ -1801,7 +1805,7 @@ sub _check_cds{
my $uID = _create_ID($miscCount, $uniqID, $uniqIDtoType, 'cds', $new_cds->_tag_value('ID'), PREFIX_NEW_ID); #method will push the uID
create_or_replace_tag($new_cds, 'ID', $uID); # remove parent ID because, none.
push (@{$hash_omniscient->{"level3"}{'cds'}{$id_l2}}, $new_cds);
$stop_start_exon=1;
$stop_start_exon=1;$resume_case2++;
}
}
}
Expand All @@ -1822,7 +1826,12 @@ sub _check_cds{
if( $resume_case ){
dual_print($log, "$resume_case CDS extended to include the stop_codon\n", $verbose);
}
else{ dual_print($log, "No problem found\n", $verbose); }
if($resume_case2){
dual_print($log, "$resume_case2 CDS created to include the stop_codon that was on next exon\n", $verbose);
}
if(!$resume_case and !$resume_case2){
dual_print($log, "No problem found\n", $verbose);
}
}

# @Purpose: Check L3 features. If exon are missing we create them. We go through all features of level3 and check them by type, if two should be merged, we do it (CDS 1-50 and 51-100, must be CDS 1-100).
Expand Down
17 changes: 15 additions & 2 deletions t/gff_syntax.t
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use strict;
use warnings;
use Test::More tests => 35;
use Test::More tests => 37;

=head1 DESCRIPTION
Expand Down Expand Up @@ -54,7 +54,7 @@ foreach my $file (sort { (($a =~ /^(\d+)/)[0] || 0) <=> (($b =~ /^(\d+)/)[0] ||

#run test
ok( system("diff $pathtmp t/gff_syntax/$correct_output") == 0, "parse $file");

unlink $pathtmp;
#--------------- rerun on the first output #---------------

# peculiar case
Expand All @@ -75,6 +75,19 @@ foreach my $file (sort { (($a =~ /^(\d+)/)[0] || 0) <=> (($b =~ /^(\d+)/)[0] ||
}
closedir($dh);

# ---------------------------- special tests ----------------------------
my $result = "t/gff_syntax/stop_start_an_exon_correct_output.gff";
system(" $script --gff t/gff_syntax/stop_start_an_exon.gtf -o $pathtmp 2>&1 1>/dev/null");
#run test
ok( system("diff $result $pathtmp") == 0, "output $script");
unlink $pathtmp;

$result = "t/gff_syntax/stop_split_over_two_exons_correct_output.gff";
system(" $script --gff t/gff_syntax/stop_split_over_two_exons.gtf -o $pathtmp 2>&1 1>/dev/null");
#run test
ok( system("diff $result $pathtmp") == 0, "output $script");
unlink $pathtmp;



#unlink $pathtmp2;
8 changes: 5 additions & 3 deletions t/gff_syntax/README
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
This is an explanations of the different test files used to check the GFF3 parser.
Launch the tester.sh script to check the parser and all these files.


00: Correct gff3 => Must stay as it is.
01: 4 Exon duplicated + 4 exon missing
Expand Down Expand Up @@ -38,7 +36,7 @@ Launch the tester.sh script to check the parser and all these files.
33: Missing level1 features. The main problem is that the Parent ID and ID of the feature2 is the same. L1 feature is built from L2 feature, the parser has to check modify the Parent ID.
34: L1 and L2 features of a record have the same ID. The parser will modify the ID of the l2 feature and consequently the Parent ID of the L3 features.
35: One supernumary UTR to remove
36: No l1 feature and some L2 features do not have parent information. AGAT should check the l3 features to verify which parent is expected.
36: No l1 feature and some L2 features do not have parent information. AGAT should check the l3 features to verify which parent is expected.

/!\ If only level3 features are defined, and no locus tag present (see test 26), the tool cannot deal with it. I will create by default one umbrella level1, or if you on attribute as uniq locus ID, It will create a l1 for each feature => If only exon or only CDS features so the result will be fine, but if there are two different features that has to be linked together (two CDS or a CDS and a signal peptide as in the test case 26) , the tool will not perform properly.

Expand All @@ -50,3 +48,7 @@ The philosophy of the parser is to
Definitions:
feature = 1 line
record = bench of features linked together to describe a genomic element (i.e gene feature + mRNA feature + exon feature + CDS feature + UTR feature)

# ------------ extra check ------------
stop_start_an_exon => when converting into GFF the stop codon has to be included within the CDS. A new CDS chunck must be created.
stop_split_over_two_exons => when converting into GFF the stop codon has to be included within the CDS. one CDS has to be extended and a new CDS chunck must be created.
9 changes: 9 additions & 0 deletions t/gff_syntax/stop_split_over_two_exons.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
chr6 HAVANA transcript 143060901 143061606 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA exon 143060901 143061066 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA CDS 143060926 143061065 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA start_codon 143060926 143060928 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA exon 143061214 143061606 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA stop_codon 143061066 143061066 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA stop_codon 143061214 143061215 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA UTR 143060901 143060925 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA UTR 143061214 143061606 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
12 changes: 12 additions & 0 deletions t/gff_syntax/stop_split_over_two_exons_correct_output.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
##gff-version 3
chr6 HAVANA gene 143060901 143061606 . + . ID=ENSG00000146416.19;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA transcript 143060901 143061606 . + . ID=ENST00000367596.5;Parent=ENSG00000146416.19;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA exon 143060901 143061066 . + . ID=exon-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA exon 143061214 143061606 . + . ID=exon-2;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA CDS 143060926 143061066 . + 0 ID=cds-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA CDS 143061214 143061215 . + 1 ID=nbis-cds-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA start_codon 143060926 143060928 . + 0 ID=start_codon-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA stop_codon 143061066 143061066 . + 0 ID=stop_codon-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA stop_codon 143061214 143061215 . + 0 ID=stop_codon-2;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA UTR 143060901 143060925 . + . ID=utr-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA UTR 143061216 143061606 . + . ID=utr-2;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
8 changes: 8 additions & 0 deletions t/gff_syntax/stop_start_an_exon.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
chr6 HAVANA transcript 143060901 143061606 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA exon 143060901 143061066 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA CDS 143060926 143061066 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA start_codon 143060926 143060928 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA exon 143061214 143061606 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA stop_codon 143061214 143061216 . + 0 gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA UTR 143060901 143060925 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
chr6 HAVANA UTR 143061214 143061606 . + . gene_id "ENSG00000146416.19"; transcript_id "ENST00000367596.5"; gene_type "protein_coding"; gene_name "AIG1";
11 changes: 11 additions & 0 deletions t/gff_syntax/stop_start_an_exon_correct_output.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
##gff-version 3
chr6 HAVANA gene 143060901 143061606 . + . ID=ENSG00000146416.19;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA transcript 143060901 143061606 . + . ID=ENST00000367596.5;Parent=ENSG00000146416.19;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA exon 143060901 143061066 . + . ID=exon-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA exon 143061214 143061606 . + . ID=exon-2;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA CDS 143060926 143061066 . + 0 ID=cds-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA CDS 143061214 143061216 . + 0 ID=nbis-cds-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA start_codon 143060926 143060928 . + 0 ID=start_codon-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA stop_codon 143061214 143061216 . + 0 ID=stop_codon-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA UTR 143060901 143060925 . + . ID=utr-1;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
chr6 HAVANA UTR 143061217 143061606 . + . ID=utr-2;Parent=ENST00000367596.5;gene_id=ENSG00000146416.19;gene_name=AIG1;gene_type=protein_coding;transcript_id=ENST00000367596.5
73 changes: 0 additions & 73 deletions t/gff_syntax/tester.sh

This file was deleted.

Loading

0 comments on commit cf9e435

Please sign in to comment.