Skip to content

Commit

Permalink
Merge pull request #112 from NBISweden/110
Browse files Browse the repository at this point in the history
fix #110 (intron missing 1bp in calculation) + improvement of extract_sequences.pl + fix #111 (import missing for plot in statistics.pm)
  • Loading branch information
Juke34 authored Apr 13, 2021
2 parents 76f0794 + 54fefa4 commit bf48b0d
Show file tree
Hide file tree
Showing 11 changed files with 6,417 additions and 232 deletions.
433 changes: 268 additions & 165 deletions bin/agat_sp_extract_sequences.pl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/AGAT/OmniscientI.pm
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,7 @@ sub slurp_gff3_file_JD {
if(! $no_check or grep( /remove_orphan_l1$/, @$no_check_skip ) ) {
#check level1 has subfeature else we remove it
dual_print ($log, file_text_line({ string => "Check$check_cpt: remove orphan l1", char => "-", prefix => "\n" }), $verbose );
dual_print ($log, "We remove only those not supposed to be orphan\n");
dual_print ($log, "We remove only those not supposed to be orphan\n", $verbose );
_remove_orphan_l1(\%omniscient, \%miscCount, \%uniqID, \%uniqIDtoType, \%mRNAGeneLink, $verbose, $log, $debug); #or fix if level2 is missing (refseq case)
dual_print ($log, file_text_line({ string => " done in ".(time() - $previous_time)." seconds", char => "-" }), $verbose );
$check_cpt++; $previous_time = time();
Expand Down
6 changes: 4 additions & 2 deletions lib/AGAT/OmniscientStat.pm
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use Bio::SeqIO;
use AGAT::OmniscientTool;
use AGAT::OmniscientJson;
use AGAT::Utilities;
use AGAT::PlotR;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw( print_omniscient_statistics );
Expand Down Expand Up @@ -241,6 +242,7 @@ sub get_omniscient_statistics {
foreach my $tag_l2 ( sort keys %{$hash_omniscient->{'level2'} }){
print "tag_l2 $tag_l2\n" if $verbose;
my ($info_l2, $extra_l2) = get_omniscient_statistics_from_l2($hash_omniscient, $tag_l2, $verbose);

my $info_l2_sentence = get_info_sentences($info_l2, $extra_l2);
my $info_l2_distri = get_distributions($info_l2, $extra_l2);

Expand Down Expand Up @@ -385,7 +387,7 @@ sub get_omniscient_statistics_from_l2{
$counterL2_match++;

if($counterL2_match > 0 and $counterL2_match <= $indexLastL2){
my $intronSize= $sortedList[$counterL2_match]->start - $sortedList[$counterL2_match-1]->end;
my $intronSize = $sortedList[$counterL2_match]->start - $sortedList[$counterL2_match-1]->end - 1;

#compute feature size
$all_info{$tag_l2}{'level2'}{'intron'}{'size_feat'}+=$intronSize;
Expand Down Expand Up @@ -433,7 +435,7 @@ sub get_omniscient_statistics_from_l2{
# from the second intron to the last (from index 1 to last index of the table sortedList)
# We go inside this loop only if we have more than 1 feature.
if($counterL3 > 0 and $counterL3 <= $indexLast){
my $intronSize = $sortedList[$counterL3]->start - $sortedList[$counterL3-1]->end;
my $intronSize = $sortedList[$counterL3]->start - $sortedList[$counterL3-1]->end - 1;

#compute feature size
$all_info{$tag_l2}{'level3'}{$tag_l3}{'intron'}{'size_feat'}+=$intronSize;
Expand Down
6 changes: 4 additions & 2 deletions share/features_spread.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
{ "_comment": "Here are described features that can be split over different locations",
{ "_comment": "Here are described features that may span over different locations",
"cds":"1",
"three_prime_utr":"1",
"five_prime_utr":"1",
"start_codon":"1",
"stop_codon":"1",
"three_prime_utr":"1",
"utr":"1",
"3utr":"1",
"5utr":"1"
Expand Down
25 changes: 23 additions & 2 deletions t/scripts_output.t
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use strict;
use warnings;
use File::Path;

use Test::More tests => 51;
use Test::More tests => 54;

=head1 DESCRIPTION
Expand Down Expand Up @@ -171,7 +171,28 @@ $script = $script_prefix."bin/agat_sp_extract_sequences.pl";
$result = "$output_folder/agat_sp_extract_sequences_1.fa";
system(" $script --gff $output_folder/1.gff --fasta $output_folder/1.fa -o $outtmp 1>/dev/null");
#run test
ok( system("diff $result $outtmp") == 0, "output $script");
ok( system("diff $result $outtmp") == 0, "output $script test1");
unlink $outtmp;

$script = $script_prefix."bin/agat_sp_extract_sequences.pl";
$result = "$output_folder/agat_sp_extract_sequences_split.fa";
system(" $script --gff $output_folder/1.gff --fasta $output_folder/1.fa --split -o $outtmp 1>/dev/null");
#run test
ok( system("diff $result $outtmp") == 0, "output $script test2");
unlink $outtmp;

$script = $script_prefix."bin/agat_sp_extract_sequences.pl";
$result = "$output_folder/agat_sp_extract_sequences_merge.fa";
system(" $script --gff $output_folder/1.gff --fasta $output_folder/1.fa -t exon --merge -o $outtmp 1>/dev/null");
#run test
ok( system("diff $result $outtmp") == 0, "output $script test3");
unlink $outtmp;

$script = $script_prefix."bin/agat_sp_extract_sequences.pl";
$result = "$output_folder/agat_sp_extract_sequences_full.fa";
system(" $script --gff $output_folder/1.gff --fasta $output_folder/1.fa --full -o $outtmp 1>/dev/null");
#run test
ok( system("diff $result $outtmp") == 0, "output $script test4");
unlink $outtmp;

# --------check agat_sp_filter_by_locus_distance.pl-------------
Expand Down
2,553 changes: 2,553 additions & 0 deletions t/scripts_output/agat_sp_extract_sequences_full.fa

Large diffs are not rendered by default.

1,902 changes: 1,902 additions & 0 deletions t/scripts_output/agat_sp_extract_sequences_merge.fa

Large diffs are not rendered by default.

1,602 changes: 1,602 additions & 0 deletions t/scripts_output/agat_sp_extract_sequences_split.fa

Large diffs are not rendered by default.

24 changes: 12 additions & 12 deletions t/scripts_output/agat_sp_functional_statistics_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,9 @@ Total five_prime_utr length 147
Total start_codon length 6
Total stop_codon length 6
Total three_prime_utr length 85
Total intron length per cds 666
Total intron length per exon 873
Total intron length per five_prime_utr 207
Total intron length per cds 659
Total intron length per exon 865
Total intron length per five_prime_utr 206
mean gene length 1886
mean mrna length 1886
mean cds length 1338
Expand All @@ -54,9 +54,9 @@ mean three_prime_utr length 42
mean cds piece length 297
mean five_prime_utr piece length 73
mean three_prime_utr piece length 42
mean intron in cds length 95
mean intron in exon length 109
mean intron in five_prime_utr length 207
mean intron in cds length 94
mean intron in exon length 108
mean intron in five_prime_utr length 206
Longest gene 2717
Longest mrna 2717
Longest cds 1992
Expand All @@ -68,9 +68,9 @@ Longest three_prime_utr 82
Longest cds piece 1652
Longest five_prime_utr piece 86
Longest three_prime_utr piece 82
Longest intron into cds part 143
Longest intron into exon part 207
Longest intron into five_prime_utr part 207
Longest intron into cds part 142
Longest intron into exon part 206
Longest intron into five_prime_utr part 206
Shortest gene 1056
Shortest mrna 1056
Shortest cds 684
Expand All @@ -82,9 +82,9 @@ Shortest three_prime_utr 3
Shortest cds piece 29
Shortest five_prime_utr piece 61
Shortest three_prime_utr piece 3
Shortest intron into cds part 56
Shortest intron into exon part 56
Shortest intron into five_prime_utr part 207
Shortest intron into cds part 55
Shortest intron into exon part 55
Shortest intron into five_prime_utr part 206

--------------------------------------------------------------------------------

Expand Down
32 changes: 16 additions & 16 deletions t/scripts_output/agat_sp_manage_introns_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ Total cds length 69687
Total exon length 108170
Total five_prime_utr length 11773
Total three_prime_utr length 25596
Total intron length per cds 78013
Total intron length per exon 90070
Total intron length per five_prime_utr 11254
Total intron length per three_prime_utr 803
Total intron length per cds 77787
Total intron length per exon 89816
Total intron length per five_prime_utr 11235
Total intron length per three_prime_utr 794
mean gene length 3054
mean mrna length 3045
mean cds length 1088
Expand All @@ -75,10 +75,10 @@ mean three_prime_utr length 419
mean cds piece length 240
mean five_prime_utr piece length 149
mean three_prime_utr piece length 365
mean intron in cds length 345
mean intron in exon length 354
mean intron in five_prime_utr length 592
mean intron in three_prime_utr length 89
mean intron in cds length 344
mean intron in exon length 353
mean intron in five_prime_utr length 591
mean intron in three_prime_utr length 88
Longest gene 8045
Longest mrna 8045
Longest cds 2937
Expand All @@ -88,10 +88,10 @@ Longest three_prime_utr 2780
Longest cds piece 2433
Longest five_prime_utr piece 2027
Longest three_prime_utr piece 2780
Longest intron into cds part 3535
Longest intron into exon part 4351
Longest intron into five_prime_utr part 4351
Longest intron into three_prime_utr part 128
Longest intron into cds part 3534
Longest intron into exon part 4350
Longest intron into five_prime_utr part 4350
Longest intron into three_prime_utr part 127
Shortest gene 346
Shortest mrna 346
Shortest cds 90
Expand All @@ -101,10 +101,10 @@ Shortest three_prime_utr 61
Shortest cds piece 2
Shortest five_prime_utr piece 1
Shortest three_prime_utr piece 16
Shortest intron into cds part 70
Shortest intron into exon part 40
Shortest intron into five_prime_utr part 77
Shortest intron into three_prime_utr part 40
Shortest intron into cds part 69
Shortest intron into exon part 39
Shortest intron into five_prime_utr part 76
Shortest intron into three_prime_utr part 39

--------------------------------------------------------------------------------

Expand Down
64 changes: 32 additions & 32 deletions t/scripts_output/agat_sp_statistics_1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,10 @@ Total cds length 69687
Total exon length 108170
Total five_prime_utr length 11773
Total three_prime_utr length 25596
Total intron length per cds 78013
Total intron length per exon 90070
Total intron length per five_prime_utr 11254
Total intron length per three_prime_utr 803
Total intron length per cds 77787
Total intron length per exon 89816
Total intron length per five_prime_utr 11235
Total intron length per three_prime_utr 794
mean gene length 3054
mean mrna length 3045
mean cds length 1088
Expand All @@ -73,10 +73,10 @@ mean three_prime_utr length 419
mean cds piece length 240
mean five_prime_utr piece length 149
mean three_prime_utr piece length 365
mean intron in cds length 345
mean intron in exon length 354
mean intron in five_prime_utr length 592
mean intron in three_prime_utr length 89
mean intron in cds length 344
mean intron in exon length 353
mean intron in five_prime_utr length 591
mean intron in three_prime_utr length 88
Longest gene 8045
Longest mrna 8045
Longest cds 2937
Expand All @@ -86,10 +86,10 @@ Longest three_prime_utr 2780
Longest cds piece 2433
Longest five_prime_utr piece 2027
Longest three_prime_utr piece 2780
Longest intron into cds part 3535
Longest intron into exon part 4351
Longest intron into five_prime_utr part 4351
Longest intron into three_prime_utr part 128
Longest intron into cds part 3534
Longest intron into exon part 4350
Longest intron into five_prime_utr part 4350
Longest intron into three_prime_utr part 127
Shortest gene 346
Shortest mrna 346
Shortest cds 90
Expand All @@ -99,10 +99,10 @@ Shortest three_prime_utr 61
Shortest cds piece 2
Shortest five_prime_utr piece 1
Shortest three_prime_utr piece 16
Shortest intron into cds part 70
Shortest intron into exon part 40
Shortest intron into five_prime_utr part 77
Shortest intron into three_prime_utr part 40
Shortest intron into cds part 69
Shortest intron into exon part 39
Shortest intron into five_prime_utr part 76
Shortest intron into three_prime_utr part 39

Re-compute mrna without isoforms asked. We remove shortest isoforms if any

Expand Down Expand Up @@ -142,10 +142,10 @@ Total cds length 53754
Total exon length 83174
Total five_prime_utr length 10367
Total three_prime_utr length 17939
Total intron length per cds 60924
Total intron length per exon 72857
Total intron length per five_prime_utr 11130
Total intron length per three_prime_utr 803
Total intron length per cds 60743
Total intron length per exon 72649
Total intron length per five_prime_utr 11112
Total intron length per three_prime_utr 794
mean gene length 3054
mean mrna length 2996
mean cds length 1054
Expand All @@ -155,10 +155,10 @@ mean three_prime_utr length 373
mean cds piece length 231
mean five_prime_utr piece length 159
mean three_prime_utr piece length 314
mean intron in cds length 336
mean intron in exon length 350
mean intron in five_prime_utr length 618
mean intron in three_prime_utr length 89
mean intron in cds length 335
mean intron in exon length 349
mean intron in five_prime_utr length 617
mean intron in three_prime_utr length 88
Longest gene 8045
Longest mrna 7833
Longest cds 2937
Expand All @@ -168,10 +168,10 @@ Longest three_prime_utr 2780
Longest cds piece 2433
Longest five_prime_utr piece 2027
Longest three_prime_utr piece 2780
Longest intron into cds part 3535
Longest intron into exon part 4351
Longest intron into five_prime_utr part 4351
Longest intron into three_prime_utr part 128
Longest intron into cds part 3534
Longest intron into exon part 4350
Longest intron into five_prime_utr part 4350
Longest intron into three_prime_utr part 127
Shortest gene 346
Shortest mrna 346
Shortest cds 90
Expand All @@ -181,10 +181,10 @@ Shortest three_prime_utr 61
Shortest cds piece 2
Shortest five_prime_utr piece 1
Shortest three_prime_utr piece 16
Shortest intron into cds part 70
Shortest intron into exon part 40
Shortest intron into five_prime_utr part 77
Shortest intron into three_prime_utr part 40
Shortest intron into cds part 69
Shortest intron into exon part 39
Shortest intron into five_prime_utr part 76
Shortest intron into three_prime_utr part 39

--------------------------------------------------------------------------------

Expand Down

0 comments on commit bf48b0d

Please sign in to comment.