-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathget_sample_sheet.pl
executable file
·1071 lines (906 loc) · 35.9 KB
/
get_sample_sheet.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/opt/software/perl/bin/perl
#To work at TIGEM
#!/usr/bin/perl
#To work at CINECA
use lib '/cineca/prod/compilers/perl/5.x/none/lib/perl5';
####PLATFORM_SPECIFIC_SETTINGS_TERMINATED
#VarGenius - Variant Discovery and Annotation Tool
#Copyright (C) <2017> <Francesco Musacchia>
#This program is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#This program is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#You should have received a copy of the GNU General Public License
#along with this program. If not, see <http://www.gnu.org/licenses/>.
# Package VarGenius - Variant Discovery and Annotation Tool
# Author: Francesco Musacchia (2017)
#
#This software is a pipeline to discover SNPs and Variants through clinical
#sequencing data and to update our internal databases
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;#Used to write an usage
use FindBin;#To search libraries
use lib $FindBin::Bin;#To search libraries
#Using a library to manage files
use LIB::files_management qw( join_files_with_zcat );
use LIB::programs_management qw( initialize_folders );
my $program_name = "get_sample_sheet.pl";
my $program_version = "1.0";
#Input variables
my $target_bed = "";
my $intern_fold = "";
my $perchrom = "";
my $reference = "";
my $convscores = "";
my $user_id = "";
my $research_group_id = "";
my $add_to_analysisname = "";
my $change_analysisname = "";
my $infreq = "";
my $set_kinship = "";
my $output = "";
my $head_file = "";
my $dir = "";
my $mode = "";
my $mult_lanes = "";
my $fileList = "";
my $r1_suff = "";
my $r2_suff = "";
my $lanes_list = "";
my $foldersFile = "folders.txt";
my $ss_head_file = "CONFIGURATION/ss_head_file.txt ";
#Standard values for parameters
my $motor_target = "motor7_targeted.bed";
my $exome_target = "clinical_exome_cod.bed";
my $rna_target = "rna.bed";
my $base_perchrom = 0;
my $base_reference = "hg19";
my $base_convscores = 0;#Convert scores
#Users and research group ids
my $base_user_id = 142;#Motor 142 UDP 133
my $base_research_group_id = 14;
my $base_add_to_analysisname = "";#"";#
my $base_change_analysisname = "";#If this is not '-' the analysis name will be identical for all
#The analysis must be kept in count for the frequencies computation
my $base_infreq = 0;
#Folder that is inside each sample folder before to arrive to fastq files
my $base_intern_fold = "";#"/Files/";
#Reads 1 and 2 suffix
my $base_r1_suff = '_R1_';
my $base_r2_suff = '_R2_';
#Indicator of lanes
my @lanes = ("_L001_","_L002_","_L003_","_L004_","_L005_","_L006_","_L007_","_L008_","_L009_");
####################################MAIN
#Start getting the input parameters
parse_command_line_args();
#Get the working folder and vargenius
my ($working_folder,$program_folder) = initialize_folders($foldersFile);
my $start = 1;
#Check mandatory values
#if ( $head_file eq ''){
#die "Header file is needed. Please use --head_file\n";
#}
$head_file = $program_folder."/".$ss_head_file;
if ( $dir eq ''){
die "Directory where to search samples is needed. Please use -d-ir\n";
}
if ( $mode eq ''){
die "mode is needed [exome|targeted|rna]. Please use --mode\n";
}
#Set variables whether they are inputed or not
unless ( $intern_fold ne "" ) {$intern_fold = $base_intern_fold; }
unless ( $perchrom ne "" ) {$perchrom = $base_perchrom;}
unless ( $reference ne "" ) {$reference = $base_reference;}
unless ( $convscores ne "" ) {$convscores = $base_convscores; }
unless ( $user_id ne "" ) {$user_id = $base_user_id; }
unless ( $research_group_id ne "" ) {$research_group_id = $base_research_group_id; }
unless ( $r1_suff ne "" ) {$r1_suff = $base_r1_suff; }
unless ( $r2_suff ne "" ) {$r2_suff = $base_r2_suff; }
unless ( $lanes_list eq "" ) {@lanes = split(",",$lanes_list); }
my $ext_to_search = "gz";
my $exts_to_search = "fq.gz|fastq.gz|fastq|fq;";#('.fq.gz','.fastq.gz','.fastq','fq');
#Groupname
unless ( $add_to_analysisname ne "" ) {$add_to_analysisname = $base_add_to_analysisname; }
unless ( $change_analysisname ne "" ) {$change_analysisname = $base_change_analysisname; }
#The analysis must be kept in count for the frequencies computation
unless ( $infreq ne "" ) {$infreq = $base_infreq;}
#Print what will be used (if the output goes somewhere else...)
if ($output ne ''){
if ( $output eq ''){
print "You did not use the file path.. ( --output)\n";
}
if ( $mult_lanes eq ''){
print "You did not use the mult_lanes parameters...\n";
}
if ( $fileList eq ''){
print "You did not use a file with the list of sample names [-fileList]...\n";
}
print "head_file: $head_file\n";
print "dir: $dir\n";
print "mode: $mode\n";
print "mult_lanes: $mult_lanes\n";
print "fileList: $fileList\n";
print "Internal folder: $intern_fold\n";
print "perchrom : $perchrom\n";
print "reference: $reference\n";
print "convscores: $convscores\n";
print "user_id: $user_id\n";
print "research_group_id: $research_group_id\n";
print "add_to_analysisname: $add_to_analysisname\n";
print "change_analysisname: $change_analysisname\n";
print "infreq: $infreq\n";
print "output: $output\n";
print "Read1 suffix: $r1_suff\n";
print "Read2 suffix: $r2_suff\n";
print "lanes list: $lanes_list\n";
}
if ( $start){
#write_sample_sheet_both_selected();
write_sample_sheet();
}
=head2 get_gender_hash
Title : get_gender_hash
Usage : get_gender_hash( - dirname => 'the folder to check',
);
Function:
If the user gives in input a file with: sample_name\tgender
the gender is picked from the file and inserted into an hash.
Then this function fills the list of sample names
Returns : imputed hash and array are filled
=cut
sub get_gender_hash {
my ($gender_hash) = shift;
my ($list) = shift;
my $fileList = shift;
my $retval = 0;
open(LIST,"<$fileList") or die "Cannot open $fileList..\n";
while ( my $row = <LIST>){
chomp($row);
my @fields = split("\t",$row);
if (defined $fields[1]){
$retval = 1;
$$gender_hash->{$fields[0]} = $fields[1];
}
push(@$list,$fields[0]);
}
close(LIST);
return $retval;
}
=head2 get_sample_info_hash
Title : get_sample_info_hash
Usage : get_sample_info_hash( - dirname => 'the folder to check',
);
Function:
This files contains fields with samples info:
sample_name\tgender\tdob\tpob\tkinship\taffected\n
infos are picked from the file and inserted into an hash.
Then this function fills the list of sample names
Returns : imputed hash and array are filled
=cut
sub get_sample_info_hash {
my ($sample_info_hash) = shift;
my ($list) = shift;
my $fileList = shift;
my $retval = 0;
open(LIST,"<$fileList") or die "Cannot open $fileList..\n";
while ( my $row = <LIST>){
chomp($row);
my @fields = split("\t",$row);
if (defined $fields[1]){
$retval = 1;
#Gender
$$sample_info_hash->{$fields[0]}->{'gender'} = $fields[1];
$$sample_info_hash->{$fields[0]}->{'dob'} = $fields[2];
$$sample_info_hash->{$fields[0]}->{'pob'} = $fields[3];
$$sample_info_hash->{$fields[0]}->{'affected'} = $fields[4];
}
push(@$list,$fields[0]);
}
close(LIST);
return $retval;
}
=head2 write_sample_sheet_both_selected
Title : write_sample_sheet_both_selected
Usage : write_sample_sheet_both_selected( - dirname => 'the folder to check',
);
dir: source directory
mode: motor or exome or exome_gr
mult_lanes: can be 'lanes' or something other
Use exome_gr if you know there are multiple samples inside a folder
set the intern_fold variable if needed
Function: Writes a sample sheet by using the folder of the fastq files.
MotorV7: motor7_haloplex.bed
MotorV8: MotorPlex_v8_TuttiGeni_HPnormale_Covered.bed
Returns : nothing
=cut
sub write_sample_sheet_both_selected{
my $sep = "\t";
my $no_val = "-";
#my $intern_fold = "";#"/Files/";
#Put the names from the file in an array
my @list = ();
my @samples_used = ();
my $gender_hash;
my $gender_taken = 0;
if (defined $fileList and $fileList ne 'none' and $fileList ne ''){
if ( get_gender_hash(\$gender_hash,\@list,$fileList) == 1){
$gender_taken = 1;
}
else{
@list = list_to_array($fileList,'NO_NEW_LINE');
}
}
#Opens the dir with all the samples
opendir DIR, $dir or die "ERROR : cannot open dir $dir";
my @samples_dirs= readdir DIR;#All the files are in this list
my $final_info = "";
if ($mode eq "targeted"){
$perchrom = 0;
$final_info = "targeted".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
#$target_bed = "MotorPlex_v8_TuttiGeni_HPnormale_Covered.bed";
unless ( $target_bed ne "" ) {$target_bed = $motor_target };
}elsif ($mode eq "exome"){#Exome single sample
#$perchrom = 1;
$final_info = "exome".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
unless ( $target_bed ne "" ) {$target_bed = $exome_target;}
}elsif ($mode eq "exome_gr"){#Exomes
$final_info = "exome".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
unless ( $target_bed ne "" ) {$target_bed = $exome_target;}
}elsif ($mode eq "rna"){#RNA
$final_info = "rna".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
unless ( $target_bed ne "" ) {$target_bed = $rna_target;}
}else{
die "Need a mode to insert the parameters! [exome, motor, exome_gr, rna]\n";
}
#Start to write the file
if ($output ne ''){
open (OUT,">$output") or die "ERROR: Cannot open $output. The program will exit..\n";
}
#print "There are ".scalar(@samples_dirs)." elements in $dir\n";#DEBUGCODE
my $couple;
#Get and print header
open (HEAD,"<$head_file") or die "ERROR: Cannot open $head_file. The program will exit..\n";
my $header = <HEAD>;
close(HEAD);
#Substitute commas in the header with the separator used here
$header =~ s/,/$sep/g;
if ($output ne ''){
print OUT $header;
}else{
print $header;
}
#A control starts on each sample among all the samples
my $numFile= 1;
my $fastq1 = "";
my $fastq2 = "";
#Flag used to check if the sample is to take or not
my $get_sel_only = 0;
my $analyze_sample = 1;
#Go through the different samples directories
foreach my $sample_dir (@samples_dirs){
#print "Considering sample $sample_dir \n";#DEBUGCODE
my $analysis_name = $sample_dir;
##########CONSIDER SAMPLELIST
#If the file list is defined set the analyze_sample flag to NO
if (defined $fileList and $fileList ne 'none' and $fileList ne ''){
$get_sel_only = 1;
if ( not grep {/\b$sample_dir\b/} @list ){
$analyze_sample = 0;
#print "The sample $sample_dir will not be analyzed..\n";#DEBUGCODE
}else{
#print "The sample $sample_dir will be analyzed..\n";#DEBUGCODE
}
}
#Get all the samples unless the user wants to get only those selected and this sample is ok
unless ( $get_sel_only == 1 and $analyze_sample == 0){
#If the folder name does not start with a dot and is a directory
if ( ($sample_dir !~ /^\..*/) and (-d $dir.$sample_dir)){
#print "$sample_dir will be analyzed.\n";#DEBUGCODE
my @lanes_used = ();
my $kinship = "-";
my $gender = "-";
############SET KINSIP
#Kinship is associated when the last letter is given
#More than one proband are specified with the number after the _P
#The groupname is picked from the proband sample name
if ( $set_kinship){
if ($sample_dir =~ /_P\d*$/ ) {
$kinship = "P";
#pick the analysis name from the sample name removing the kinship
$analysis_name = $sample_dir;
$analysis_name =~ s/_P\d*$//;
}elsif ($sample_dir =~ /_M$/) {
$kinship = "M";
$gender ="F";
#pick the analysis name from the sample name removing the kinship
$analysis_name = $sample_dir;
$analysis_name =~ s/_M$//;
}elsif ($sample_dir =~ /_F$/) {
$kinship = "F";
$gender ="M";
#pick the analysis name from the sample name removing the kinship
$analysis_name = $sample_dir;
$analysis_name =~ s/_F$//;
}else{
$kinship = "P";
$gender = "-";
#pick the analysis name from the sample name
$analysis_name = $sample_dir;
}
}
##############SET GENDER
#If the gender has been put into
if ( defined $gender_hash->{$sample_dir}){
$gender = $gender_hash->{$sample_dir};
}
############ADD SUFFIX TO GROUPNAME
#If the group name is not specified than it is the same as the sample dir
if ( ($mode eq 'targeted') or ($mode eq 'exome') or ($mode eq 'rna')){
#If change_analysisname is set, then it is used. Otherwise add_to is verified
if ($change_analysisname ne "" ){
$analysis_name = $change_analysisname;
}else{
#If I already changed the groupname in set_kinship
if ($set_kinship){
$analysis_name = $analysis_name.$add_to_analysisname;
}else{
$analysis_name = $sample_dir.$add_to_analysisname;
}
}
}
#print "Entering $sample_dir...\n";
my $dir1 = $dir."/".$sample_dir;
#print $dir1."\t";
$dir1 .= "/$intern_fold/";
opendir DIR1, $dir1 or die "ERROR: cannot open dir $dir1";
#my @files= readdir DIR1;#All the files are in this list
my @files = grep { /\.($exts_to_search)$/ } readdir DIR1;
die "ERROR: Cannot find files with extensions $exts_to_search in $dir1\n".
"Maybe there is an internal folder? (intern_fold: $intern_fold)" unless @files > 0;
#A control starts on each file of the folder
############MULTIPLE LANES
############
if ( $mult_lanes ){
my $ext_found = 0;
foreach my $file (@files){
#print "Using $file..\n";#DEBUGCODE
$ext_found = 1;
foreach my $lane (@lanes){
#print "Checking if $lane is contained...\n";
if ( $file =~ /$lane/ ){
#print "$file contains $lane...\n";
#Check if is the read 1 or two
if ( $file =~ /$r1_suff/ ){
$couple->{$lane}->{1} = $file;
push(@lanes_used,$lane);
}else{
$couple->{$lane}->{2} = $file;
}
}
}
}
#Error not found fq
die "ERROR: Cannot find fastq files ($ext_to_search) in $dir1. Exiting\n"
unless ($ext_found);
#Print both the files for each lane used
#print "for folder $sample_dir used ".scalar(@lanes_used)."\n";
foreach my $lane (@lanes_used){
#heritability,gender,diseaseID,kinship are NOVAL
if ($output ne ''){
print OUT $no_val.$sep.$gender.$sep.$no_val.$sep.$kinship.$sep.$user_id.$sep.$research_group_id.$sep;
print OUT $dir1.$sep;
}else{
print $no_val.$sep.$gender.$sep.$no_val.$sep.$kinship.$sep.$user_id.$sep.$research_group_id.$sep;
print $dir1.$sep;
}
#Add read 1 file name
if (defined $couple->{$lane}->{1} ) {
if ($output ne ''){
print OUT $couple->{$lane}->{1}.$sep;
}else{
print $couple->{$lane}->{1}.$sep;
}
}
#Add read 2 file name
if (defined $couple->{$lane}->{2} ) {
if ($output ne ''){
print OUT $couple->{$lane}->{2}.$sep;
}else{
print $couple->{$lane}->{2}.$sep;
}
}
#The name of the read file must contain the lane number and
#must be preceeded by an underscore an should contain nothing after
#the number
my $lane_num = $lane;
#if present, remove the last '_'
if ($lane_num =~ /\_$/){chop($lane_num);}
#if not present, add an '_' at the start
if ($lane_num !~ /^_/){$lane_num = "_".$lane_num;}
#If theere is a single read file just use the sample name for the readfname
if ( scalar(@lanes_used) == 1){
$lane_num = "";
}
#ReadFName is obtained by adding the lane number to the sample name
#If the user specified an output file
if ($output ne ''){
print OUT $sample_dir.$lane_num.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}#else she/he didn't specify the out file
else{
print $sample_dir.$lane_num.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}
$numFile++;
}
}
############NO MULTIPLE LANES
############
else{
my $fq1_found = 0;
my $fq2_found = 0;
#Check if there are more than two files in the folder. Could mean the user used lanes
if ( scalar(@files) > 2 ){
print "WARNING: there are multiple files in $dir1. If multiple lanes fastq are present use -ml\n"
}
#Expects only two files inside the folder with _R1_ and _R2_
foreach my $file (@files){
#print "Using $file..\n";#DEBUGCODE
#Search the files with _R1_ and R2
if ( $file =~ /$r1_suff/ ){
$fastq1 = $file;
$fq1_found = 1;
}elsif ( $file =~ /$r2_suff/ ){
$fastq2 = $file;
$fq2_found = 1;
}
}
#Error not found fq1
die "ERROR: Cannot find fastq with extension $r1_suff. Please use --r1_suff to change how to search.\n"
unless ($fq1_found);
#Error not found fq1
die "ERROR: Cannot find fastq with extension $r2_suff. Please use --r2_suff to change how to search.\n"
unless ($fq2_found);
#print "for folder $sample_dir used ".scalar(@lanes_used)."\n";
if ($output ne ''){
print OUT $no_val.$sep.$gender.$sep.$no_val.$sep.$kinship.$sep.$user_id.$sep.$research_group_id.$sep;
print OUT $dir1.$sep;
print OUT $fastq1.$sep.$fastq2.$sep;
print OUT $sample_dir.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}else{
print $no_val.$sep.$gender.$sep.$no_val.$sep.$kinship.$sep.$user_id.$sep.$research_group_id.$sep;
print $dir1.$sep;
print $fastq1.$sep.$fastq2.$sep;
print $sample_dir.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}
$numFile++;
}
push(@samples_used,$sample_dir);
}
}
#Reset the flag to 1 so that the next sample is to analyze
#unless it is set again to 0
$analyze_sample = 1
}
if (scalar(@samples_used) > 0){
print "Used ".scalar(@samples_used)." samples \n";
}else{
print "WARNING: no sample has been found in $dir \n";
}
close(OUT);
}
=head2 write_sample_sheet_both_selected
Title : write_sample_sheet_both_selected
Usage : write_sample_sheet_both_selected( - dirname => 'the folder to check',
);
Using a file in input with information about the samples
generates a text file with the data that will be imported into the db
dir: source directory
mode: motor or exome or exome_gr
mult_lanes: can be 'lanes' or something other
Use exome_gr if you know there are multiple samples inside a folder
set the intern_fold variable if needed
Function:
Returns : nothing
=cut
sub write_sample_sheet{
my $sep = "\t";
my $no_val = "-";
#my $intern_fold = "";#"/Files/";
#Put the names from the file in an array
my @list = ();
my @samples_used = ();
my $sample_info_hash;
my $gender_taken = 0;
if (defined $fileList and $fileList ne 'none' and $fileList ne ''){
if ( get_sample_info_hash(\$sample_info_hash,\@list,$fileList) == 1){
$gender_taken = 1;
}
else{
@list = list_to_array($fileList,'NO_NEW_LINE');
}
}
#Opens the dir with all the samples
opendir DIR, $dir or die "ERROR : cannot open dir $dir";
my @samples_dirs= readdir DIR;#All the files are in this list
my $final_info = "";
if ($mode eq "targeted"){
$perchrom = 0;
$final_info = "targeted".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
#$target_bed = "MotorPlex_v8_TuttiGeni_HPnormale_Covered.bed";
unless ( $target_bed ne "" ) {$target_bed = $motor_target };
}elsif ($mode eq "exome"){#Exome single sample
#$perchrom = 1;
$final_info = "exome".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
unless ( $target_bed ne "" ) {$target_bed = $exome_target;}
}elsif ($mode eq "exome_gr"){#Exomes
$final_info = "exome".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
unless ( $target_bed ne "" ) {$target_bed = $exome_target;}
}elsif ($mode eq "rna"){#RNA
$final_info = "rna".$sep.$perchrom.$sep.$convscores.$sep.$infreq;
unless ( $target_bed ne "" ) {$target_bed = $rna_target;}
}else{
die "Need a mode to insert the parameters! [exome, motor, exome_gr, rna]\n";
}
#Start to write the file
if ($output ne ''){
open (OUT,">$output") or die "ERROR: Cannot open $output. The program will exit..\n";
}
#print "There are ".scalar(@samples_dirs)." elements in $dir\n";#DEBUGCODE
my $couple;
#Get and print header
open (HEAD,"<$head_file") or die "ERROR: Cannot open $head_file. The program will exit..\n";
my $header = <HEAD>;
close(HEAD);
#Substitute commas in the header with the separator used here
$header =~ s/,/$sep/g;
if ($output ne ''){
print OUT $header;
}else{
print $header;
}
#A control starts on each sample among all the samples
my $numFile= 1;
my $fastq1 = "";
my $fastq2 = "";
#Flag used to check if the sample is to take or not
my $get_sel_only = 0;
my $analyze_sample = 1;
#Go through the different samples directories
foreach my $sample_dir (@samples_dirs){
#print "Considering sample $sample_dir \n";#DEBUGCODE
my $analysis_name = $sample_dir;
##########CONSIDER SAMPLELIST
#If the file list is defined set the analyze_sample flag to NO
if (defined $fileList and $fileList ne 'none' and $fileList ne ''){
$get_sel_only = 1;
if ( not grep {/\b$sample_dir\b/} @list ){
$analyze_sample = 0;
#print "The sample $sample_dir will not be analyzed..\n";#DEBUGCODE
}else{
#print "The sample $sample_dir will be analyzed..\n";#DEBUGCODE
}
}
#Get all the samples unless the user wants to get only those selected and this sample is ok
unless ( $get_sel_only == 1 and $analyze_sample == 0){
#If the folder name does not start with a dot and is a directory
if ( ($sample_dir !~ /^\..*/) and (-d $dir.$sample_dir)){
#print "$sample_dir will be analyzed.\n";#DEBUGCODE
my @lanes_used = ();
my $kinship = "-";
my $gender = "-";
my $dob = "-";
my $pob = "-";
my $affected = '-';
############SET KINSIP
#Kinship is associated when the last letter is given
#More than one proband are specified with the number after the _P
#The groupname is picked from the proband sample name
if ( $set_kinship){
if ($sample_dir =~ /_P\d*$/ ) {
$kinship = "P";
#pick the analysis name from the sample name removing the kinship
$analysis_name = $sample_dir;
$analysis_name =~ s/_P\d*$//;
}elsif ($sample_dir =~ /_M$/) {
$kinship = "M";
$gender ="F";
#pick the analysis name from the sample name removing the kinship
$analysis_name = $sample_dir;
$analysis_name =~ s/_M$//;
}elsif ($sample_dir =~ /_F$/) {
$kinship = "F";
$gender ="M";
#pick the analysis name from the sample name removing the kinship
$analysis_name = $sample_dir;
$analysis_name =~ s/_F$//;
}else{
$kinship = "P";
$gender = "-";
#pick the analysis name from the sample name
$analysis_name = $sample_dir;
}
}
##############SET GENDER
#If the gender has been put into
if ( defined $sample_info_hash->{$sample_dir}->{'gender'}){
$gender = $sample_info_hash->{$sample_dir}->{'gender'};
}
#If the dob has been put into
if ( defined $sample_info_hash->{$sample_dir}->{'dob'}){
$dob = $sample_info_hash->{$sample_dir}->{'dob'};
}
#If the pob has been put into
if ( defined $sample_info_hash->{$sample_dir}->{'pob'}){
$pob = $sample_info_hash->{$sample_dir}->{'pob'};
}
#If the affected has been put into
if ( defined $sample_info_hash->{$sample_dir}->{'affected'}){
$affected = $sample_info_hash->{$sample_dir}->{'affected'};
}
############ADD SUFFIX TO GROUPNAME
#If the group name is not specified than it is the same as the sample dir
if ( ($mode eq 'targeted') or ($mode eq 'exome') or ($mode eq 'rna')){
#If change_analysisname is set, then it is used. Otherwise add_to is verified
if ($change_analysisname ne "" ){
$analysis_name = $change_analysisname;
}else{
#If I already changed the groupname in set_kinship
if ($set_kinship){
$analysis_name = $analysis_name.$add_to_analysisname;
}else{
$analysis_name = $sample_dir.$add_to_analysisname;
}
}
}
#print "Entering $sample_dir...\n";
my $dir1 = $dir."/".$sample_dir;
#print $dir1."\t";
$dir1 .= "/$intern_fold/";
opendir DIR1, $dir1 or die "ERROR: cannot open dir $dir1";
#my @files= readdir DIR1;#All the files are in this list
my @files = grep { /\.($exts_to_search)$/ } readdir DIR1;
die "ERROR: Cannot find files with extensions $exts_to_search in $dir1\n".
"Maybe there is an internal folder? (intern_fold: $intern_fold)" unless @files > 0;
#A control starts on each file of the folder
############MULTIPLE LANES
############
if ( $mult_lanes ){
my $ext_found = 0;
foreach my $file (@files){
#print "Using $file..\n";#DEBUGCODE
$ext_found = 1;
foreach my $lane (@lanes){
#print "Checking if $lane is contained...\n";
if ( $file =~ /$lane/ ){
#print "$file contains $lane...\n";
#Check if is the read 1 or two
if ( $file =~ /$r1_suff/ ){
$couple->{$lane}->{1} = $file;
push(@lanes_used,$lane);
}else{
$couple->{$lane}->{2} = $file;
}
}
}
}
#Error not found fq
die "ERROR: Cannot find fastq files ($ext_to_search) in $dir1. Exiting\n"
unless ($ext_found);
#Print both the files for each lane used
#print "for folder $sample_dir used ".scalar(@lanes_used)."\n";
foreach my $lane (@lanes_used){
#gender,kinship
if ($output ne ''){
print OUT $gender.$sep.$kinship.$sep.$dob.$sep.$pob.$sep.$affected.$sep.$user_id.$sep.$research_group_id.$sep;
print OUT $dir1.$sep;
}else{
print $gender.$sep.$kinship.$sep.$dob.$sep.$pob.$sep.$affected.$sep.$user_id.$sep.$research_group_id.$sep;
print $dir1.$sep;
}
#Add read 1 file name
if (defined $couple->{$lane}->{1} ) {
if ($output ne ''){
print OUT $couple->{$lane}->{1}.$sep;
}else{
print $couple->{$lane}->{1}.$sep;
}
}
#Add read 2 file name
if (defined $couple->{$lane}->{2} ) {
if ($output ne ''){
print OUT $couple->{$lane}->{2}.$sep;
}else{
print $couple->{$lane}->{2}.$sep;
}
}
#The name of the read file must contain the lane number and
#must be preceeded by an underscore an should contain nothing after
#the number
my $lane_num = $lane;
#if present, remove the last '_'
if ($lane_num =~ /\_$/){chop($lane_num);}
#if not present, add an '_' at the start
if ($lane_num !~ /^_/){$lane_num = "_".$lane_num;}
#If theere is a single read file just use the sample name for the readfname
if ( scalar(@lanes_used) == 1){
$lane_num = "";
}
#ReadFName is obtained by adding the lane number to the sample name
#If the user specified an output file
if ($output ne ''){
print OUT $sample_dir.$lane_num.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}#else she/he didn't specify the out file
else{
print $sample_dir.$lane_num.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}
$numFile++;
}
}
############NO MULTIPLE LANES
############
else{
my $fq1_found = 0;
my $fq2_found = 0;
#Check if there are more than two files in the folder. Could mean the user used lanes
if ( scalar(@files) > 2 ){
print "WARNING: there are multiple files in $dir1. If multiple lanes fastq are present use -ml\n"
}
#Expects only two files inside the folder with _R1_ and _R2_
foreach my $file (@files){
#print "Using $file..\n";#DEBUGCODE
#Search the files with _R1_ and R2
if ( $file =~ /$r1_suff/ ){
$fastq1 = $file;
$fq1_found = 1;
}elsif ( $file =~ /$r2_suff/ ){
$fastq2 = $file;
$fq2_found = 1;
}
}
#Error not found fq1
die "ERROR: Cannot find fastq with extension $r1_suff. Please use --r1_suff to change how to search.\n"
unless ($fq1_found);
#Error not found fq1
die "ERROR: Cannot find fastq with extension $r2_suff. Please use --r2_suff to change how to search.\n"
unless ($fq2_found);
#print "for folder $sample_dir used ".scalar(@lanes_used)."\n";
if ($output ne ''){
print OUT $gender.$sep.$kinship.$sep.$dob.$sep.$pob.$sep.$affected.$sep.$user_id.$sep.$research_group_id.$sep;
print OUT $dir1.$sep;
print OUT $fastq1.$sep.$fastq2.$sep;
print OUT $sample_dir.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}else{
print $gender.$sep.$kinship.$sep.$dob.$sep.$pob.$sep.$affected.$sep.$user_id.$sep.$research_group_id.$sep;
print $dir1.$sep;
print $fastq1.$sep.$fastq2.$sep;
print $sample_dir.$sep.$sample_dir.$sep.$analysis_name.$sep.$reference.$sep.$target_bed.$sep.$final_info."\n";
}
$numFile++;
}
push(@samples_used,$sample_dir);
}
}
#Reset the flag to 1 so that the next sample is to analyze
#unless it is set again to 0
$analyze_sample = 1
}
if (scalar(@samples_used) > 0){
print "Used ".scalar(@samples_used)." samples \n";
}else{
print "WARNING: no sample has been found in $dir \n";
}
close(OUT);
}
=head2 parse_command_line_args
Title : parse_command_line_args
Usage : parse_command_line_args( );
Function: Parses the arguments specified upon the command line.
Returns : nothing
=cut
sub parse_command_line_args{
my $HELP = 0;# Shows help overview.
my $VERSION = 0;# Shows version number and exit.
my $howToUse = "Use with: \nperl $program_name \n\n".
"\t-h|--head_file: The header file is the file with the header of the sample sheet. It is name ss_head_file.txt and ".
"located into the VarGenius/CONFIGURATION folder. Please use its full path.\n".
"\t-d|--dir: Full path of the directory where the samples folders are located.\n".
"\t-m|--mode: [exome|targeted|rna]\n".
"\t-ml|--mult_lanes: use this parameter if into the folders you find different files for different samples. ".
" The file names must contain _L001_,_L002_,_L003_,_L004_ and so on.\n".
"\t-r1|--r1_suff: set this parameter to set how to find the fastq read 1. Default value is _R1_\n".
"\t-r2|--r2_suff: set this parameter to set how to find the fastq read 2. Default value is _R2_\n".
"\t-fl|--fileList: if you want to use selected samples in the folder for this sample sheet, set this parameter ".
"giving in input a text file with a list of sample names separated by newline.\n".
"This file could also be used to give the gender of selected samples. Please separate gender and sample name with a TAB.
(samplename F). Allowed genders: F and M".
"\t-lanes|--lanes_list: set this parameter to set how to find the lane names. Give it a list of lanes names ".
" (e.g. lane1,lane2,lane3,lane4). Default value is L001,L002,L003,L004,L005,L007,L008\n".
"\t-t|--target_bed: the target BED file name (just the name, not the path)\n".
"\t-if|--intern_fold: if the fastq are contained into a subdirectory of the sample folder specify the subfolder name here.\n".
"\t-pchr|--perchrom: just 0 or 1 if you want to do the analysis per-chromosome or not. Default: 1\n".
"-r|--reference: reference genome identifier[hg19]\n".
"\t-conv|--convscores: if scores must be converted to Phred [0 or 1]\n".
"\t-uid|--user_id: identifier of the user that is running vargenius (a number that is choose by you);\n".
"\t-rgid|--research_group_id: identifier of the research group that is running vargenius (a number that is choose by you);\n".
"\t-agn|--add_to_analysisname: string to add to the analysis name. Default: both the sample name and analysis name are the sample folder name. \n".
"\t-cgn|--change_analysisname: set and assign same analysis name to each file of the sample sheet.\n".
"\t-infr|--infreq: if this analysis should be kept in count for the variant frequencies calculation [0|1]\n".
"\t-o|--output: output sample sheet path and name\n".
"\t-k|--set_kinship: This is a flag just to search for kinship at the end of the name. The name must end with one among: ".
"P:proband,M:mother,F:father,A:affected non proband (brother, syster or cousin of proband),".
"R:any relative inserted for any reason\n\n";
# Parse options
GetOptions(
"help" => \$HELP,
"version" => \$VERSION,
"h|head_file=s" => \$head_file, #It's mandatory
"d|dir=s" => \$dir, #It's mandator
"m|mode=s" => \$mode, #It's mandatory
"ml|mult_lanes" => \$mult_lanes,
"r1|r1_suff=s" => \$r1_suff,
"r2|r2_suff=s" => \$r2_suff,
"lanes|lanes_list=s" => \$lanes_list,
"fl|fileList=s" => \$fileList,