forked from gladstone-institutes/MonkeyPipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbananas_agw.pm
1966 lines (1772 loc) · 155 KB
/
bananas_agw.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/perl
# bananas.pm version 2 - parallelized for rigel
# Authors: Sean Thomas and Alex Williams, 2013
# This program is designed to automate many of the tasks that accompany next generation sequencing tasks.
# MERGER of CHIP/EXO and RNA pipelines completed Dec 15, 2014
# Alex's changed package name so as to not inadvertently interfere with Sean's.
# Thing to note:
# * when jobs are queued up, they are added IN ALPHABETICAL ORDER
# * a job that begins alphabetically earlier therefore CANNOT depend on a later job---example: job "08b" can depend on "08a" but not vice versa.
# * thus, jobs must be numbered in such a way that alphabetization is related to prerequisites.
# To do:
# * All of this assumes that you're running the executable from the same filesystem as the cluster.
# So when we check for binaries, we are checking on the CLIENT (head/login node),
# but what we REALLY need to do is submit a 'check dependencies' cluster job to check everything
# on the cluster machine. -- AlexW, July 18 2016
package bananas_agw;
use strict; use warnings; use Carp; # Carp has "confess"
use Data::Dumper;
use File::Basename;
use Scalar::Util qw(looks_like_number);
use File::Spec::Functions qw(catfile); # File::Spec has "catfile"
use IO::Compress::Gzip qw(gzip $GzipError) ;
use List::Util qw(max);
use Exporter;
use Sys::Hostname; # for the 'hostname()' command used in getUserPriv
use POSIX; # for 'getgroups()' in getUserPriv
### 0.i Export functions
our (@ISA, @EXPORT, @EXPORT_OK);
@ISA = qw(Exporter);
@EXPORT_OK = qw(GLOBAL_VERBOSE GLOBAL_BIN_DIR GLOBAL_DEBUG GLOBAL_DRY_RUN GLOBAL_COLOR FORCE_RERUN OUTPUT_DIR_OVERRIDE GLOBAL_WARNINGS RUN_DIRECTLY_WITHOUT_TORQUE NA_VALUE);
@EXPORT = qw(loadConfig checkConfig buildJobSubmissionScript runJobSubmissionScript noExtensions printInstallInstructions isNA);
(defined($ENV{BINFBINROOT}) and length($ENV{BINFBINROOT}) >= 1) or confess "-----------\n[ERROR]: You need to define the UNIX system-wide environment variable 'BINFBINROOT' (for 'bioinformatics binary root') in your ~/.bashrc file before you can run Monkey.\nHere is an example of something you can add to the ~/.bashrc: export BINFBINROOT=/data/applications/2015_06/bin \n(Make sure to log out and log back in after you set that, or type 'source ~/.bashrc' to reload the bash config file). If you add that and it doesn't work, try checking the value of 'echo \$BINFBINROOT' and make sure that actually prints the proper path.\n------------ ";
our ($GLOBAL_BIN_DIR) = $ENV{BINFBINROOT}; # "/data/bin/2015_06"; # <== note: hard-coded, and NOT THE "monkey" /bin dir with the perl scripts in it! (That's the 'monkeyPoo' directory)
if (not -d $ENV{BINFBINROOT}) {
warn("[WARNING]: Your environment variable 'BINFBINROOT' (for 'bioinformatics binary root') was not set to a valid directory that already exists! You will need to properly set the 'BINFBINROOT' in your ~/.bashrc file. You can check to see what this variable is set to by typing 'echo \$BINFBINROOT' on the command line. It should be a directory with a bunch of programs in it. The current (probably incorrect) one is set to this value: <$GLOBAL_BIN_DIR> (which does not seem to be a valid directory)");
confess "Failure to find a valid directory with binaries in it, which we expect to be specified in the 'BINFBINROOT' environment variable.\nSee text above for details.\nIn other words---your environment variables in your ~/.bashrc are NOT set up correctly.";
}
## These are settings SPECIFIC to our server and queue system, NOT general settings!!!!! #####
my $ADMIN_GROUP_NAME = "admin";
my $NORMAL_GROUP_NAME = "normal";
my %QSETTINGS = ( "dest" => {"$ADMIN_GROUP_NAME" => "-q Bio"
, "$NORMAL_GROUP_NAME" => "-q General" }
, "grouplist" => {"$ADMIN_GROUP_NAME" => "-W group_list=bioqueue"
, "$NORMAL_GROUP_NAME" => "-W group_list=genqueue" }
, "ncpus_for_alignment" => {"$ADMIN_GROUP_NAME" => "6" # Privileged users = 4 threads per job, other users = 1 thread
, "$NORMAL_GROUP_NAME" => "2" }
, "tophat_needs" => {"pbs_mem" => "12gb" }
, "star_aligner_needs" => {"pbs_mem" => "33gb" } # needs a lot of ram! Default is 31 GB limit for just the aligner
, "gem" => {"pbs_mem" => "25gb", "pbs_ncpus"=>1 } # needs a lot of ram! Default is 31 GB limit for just the aligner
, "default_ncpus" => {"$ADMIN_GROUP_NAME" => "1"
, "$NORMAL_GROUP_NAME" => "1" }
, "default_mem" => {"$ADMIN_GROUP_NAME" => "4gb"
, "$NORMAL_GROUP_NAME" => "4gb" }
, "default_walltime" => {"$ADMIN_GROUP_NAME" => "72:00:00"
, "$NORMAL_GROUP_NAME" => "72:00:00" }
);
my $UNIX_GRP_WORK = "1004"; # <== Anybody in this group gets the be 'privileged' for job submission
my $UNIX_GRP_CAVE = "3050"; # <== Anybody in this group gets the be 'privileged' for job submission
my $UNIX_GRP_B2B = "2050"; # <== Anybody in this group gets the be 'privileged' for job submission
my $UNIX_GRP_BIOQUEUE = "35098"; # <== Anybody in this group gets the be 'privileged' for job submission
my @UNIX_BIO_GRP_ID_ARRAY = ($UNIX_GRP_WORK, $UNIX_GRP_CAVE, $UNIX_GRP_B2B, $UNIX_GRP_BIOQUEUE); # HARD CODED right now for rigel specifically. Check /etc/groups for a list by ID.
my $SHOULD_ATTEMPT_TO_RUN_MONOCLE = 0; # Never did really get this working right...
## ABove: these are settings SPECIFIC to our server and queue system, NOT general settings!!!!! #####
our ($GLOBAL_VERBOSE) = 0; # 0 = quiet, 1 = verbose
our ($GLOBAL_DEBUG) = 0; # 0 = normal, 1 = ultra-verbose diagnostic messages
our ($GLOBAL_DEV_MODE) = 0; # 0 = normal, 1 = ultra-verbose diagnostic messages
our ($GLOBAL_DRY_RUN) = 0; # 0 = normal, 1 = dry run (dry = "don't actually do anything")
our ($GLOBAL_COLOR) = 0; # 0 = only black and white messages. 1 = COLOR text to console
our ($FORCE_RERUN) = 0; # 0 = normal, 1 = re-run all jobs, even completed ones
our ($OUTPUT_DIR_OVERRIDE) = undef; # If defined, this means the user has specified their own resultDir on the command line, rather than in the config file.
our ($GLOBAL_WARNINGS) = ""; # Accumulate all the WARNING messages that we might want to tell the user both 1) when they occur and 2) at the end of the job submission
our ($RUN_DIRECTLY_WITHOUT_TORQUE) = 0; # By default, we will run with the torque job scheduler. But if this is specified, we will instead just run ONE job at a time, very slowly. Works even if Torque is not installed
our ($NA_VALUE) = "NA"; # Don't change this unless you ALSO change the 'NA' value that is used to specifiy *intentionally* missing files in the input files.
my $poo_data_subdir = "data"; # subdirectory in "poo" that has certain data files
my $ZLM_LICENSE_HARD_CODED = "/home/sthomas/tools/vmatch-2.2.4-Linux_x86_64-64bit/vmatch.lic"; # <== warning: do not change the variable name: "ZLM_LICENSE" is some ENV variable that is needed by peakMotifs. See 10.motifDiscovery.pl for the exact command
# For the 'gem' peaks
my $GEM_READ_DIST_DEFAULT = "Read_Distribution_default.txt"; # This file should be in the "monkeyPoo" directory
my $GEM_READ_DIST_CHIP_EXO = "Read_Distribution_ChIP-exo.txt"; # This file should be in the "monkeyPoo" directory
my $HOLD_UNTIL_ALL_JOBS_SUBMITTED = 1; # <== Should be 1. This is VERY IMPORTANT for jobs with large numbers of files (--test6 for an example). Set it to 0 to remove Alex's workaround for PBS failing when jobs depend on other already-completed jobs.
my $SIGNAL_TO_START_VARNAME = "SIGNAL_TO_START_ALL_JOBS";
my $TRACKFILE_SUFFIX = ".tracks.txt";
# List recognized "key" names from the config file. Anything in the config file but NOT in here will raise a warning.
my @OK_KEY_NAMES = qw(studyName sampleDir resultDir minMapQ libraryAdapterFile bowtie2Index bwaIndex starIndexDir genomicBins genomicBinsID chromSizesFile geneWindows exonFile transcriptFile symbolXref genomeFasta genomeFastaPerChr genomeFastaDir repeatMask monkeyPoo aligner gtfFile tracksURL tracksRsync bedAnnotForRSEQC species version );
my @OK_BOOLEAN_KEY_NAMES = qw(forceRerun tracksShowEveryRead skipFastQC skipQC doQC skipBrowser browserBins browserWigs skipWindows doWindows skipTagsAndDensity doDensity shouldFilterAdapters doAltAnalyze );
my @OK_ALIGNERS = qw(bowtie bowtie2 tophat tophat2 tophat2_1_1 star hisat hisat2_0_1 bwa);
my $SAFE_JOB_NAME_REGEXP = '^' . "[_A-Za-z0-9]+" . '$'; # Job names must be in this format. Note that hyphens are not allowed here.
my $SAFE_FILENAME_REGEXP = '^' . "[-+~._A-Za-z0-9]+" . '$'; # A filename, NOT a path, so no '/' are allowed. Dots, hyphens, and even the 'plus' sign are OK here.
my $SAFE_SPECIES_REGEXP = $SAFE_FILENAME_REGEXP; # species must either be undefined OR look something like this. Anything that would be OK in a filename (no spaces!) is ok here.
my $VERBOSE_DIAGNOSTIC_OK_COLOR = "cyan on_black";
my $VERBOSE_SKIP_STEP_OK_COLOR = "magenta on_black";
# Currently running job names from qstat: qstat -f| grep 'Job_Name' | perl -pe 's/^\s*Job_Name\s*=\s*//'
my $BANANAS_VERSION = "1.1.2017.01";
my $LOGDIR = "X.logs"; # ALL logs go into here -- note that this location is HARD CODED in each of the scripts in "monkey/bin/*.pl", so DO NOT CHANGE THIS unless you also change it in all those scripts too!
my $globalNumJobsSubmitted = 0; # counter
# =================== "REMEMBER" stores various file relationships (e.g., which BAM files were generated) which are not stored anywhere else ===================
my $REM_ALIGN_BAM = "alignbam"; # where we remember which bam files we aligned
my $REM_EXPERIMENT_CATEGORY = "category"; # remember which experimental categories exist
my $REM_SCRIPTS_CHECKED = "scripts-checked-for-ok-syntax";
my $REM_ALL_JOBS = "alljobs";
my $REM_DEPENDENCIES_STR_COLON_DELIM = "dep-colon-delim";
my $REM_SAMPLE_ORDER_ARR = "sample-ids-in-order"; # Remember the original order that the files were specified in the config file. Important for grouping samples in the 'subread' pairs plot figure. This is an ARRAY (@) of the sample IDs (not filenames)!
my %remember = ($REM_ALIGN_BAM =>{} # new empty hash ref
, $REM_EXPERIMENT_CATEGORY=>{} # new empty hash ref
, $REM_SCRIPTS_CHECKED =>{} # new empty hash ref
, $REM_ALL_JOBS =>{} # new empty hash ref
, $REM_DEPENDENCIES_STR_COLON_DELIM =>{} # new empty hash ref
); # A hash that stores important file relationships that we need to remember. Somewhat ad hoc at the moment.
@{$remember{$REM_SAMPLE_ORDER_ARR}} = (); # New empty array
# ==============================================================================
my $STEP_FILTER = "s02_filter"; # <== These need to appear ALPHABETICALLY,
my $STEP_MAPPING = "s04a_map"; # as that is how jobs are queued up and run.
my $STEP_TAGS = "s05a_tag"; # In other words: jobs can only depend on other ones that come ALPHABETICALLY EARLIER.
my $STEP_TAG_STATS = "s05b_tag_stats";
my $STEP_DENSITY = "s06_density";
my $STEP_PEAKS = "s07a1_peaks";
my $STEP_PEAK_SUMMARY = "s07a2_peaks_summary";
my $STEP_GEM_PEAKS = "s07a3_gemPeaks";
my $STEP_GEMPEAKS_SUMMARY = "s07a4_gemPeaks_summary";
my $STEP_BCP_PEAKS = "s07b1_bcpPeaks";
my $STEP_BCPPEAKS_SUMMARY = "s07b2_bcpPeaks_summary";
my $STEP_NUCLEOATAC = "s07c1_nucleoATAC"; # ATAC-seq
my $STEP_BROWSER_BINS = "s08a_browser_bins"; # Note: Different from "s48_browser_wiggle"
my $STEP_BROWSER_BIN_SUMMARY = "s08b_browser_sum";
my $STEP_WINDOWS = "s09_windows";
my $STEP_MOTIFS = "s10a_motif";
my $STEP_MOTIF_SUMMARY = "s10b_motif_sum";
my $STEP_SUBREAD = "s20a_subread_count"; # RNA
my $STEP_SUBREAD_SUMMARY_FIGS = "s20b_subread_figs"; # RNA
my $STEP_CUFFDIFF = "s22_cuffdiff"; # RNA
my $STEP_CUMMERBUND = "s23_cummerbund"; # RNA
my $STEP_RSEQC_SINGLE = "s47a_rseqc";
my $STEP_RSEQC_TOGETHER = "s46_rseqc_all";
my $STEP_BROWSER_WIGGLE = "s48_browser_wig";
my $STEP_FASTQC_UNMAPPED = "s49a_fastqc_unaligned";
my $STEP_FASTQC_ALIGNED = "s49c_fastqc_aligned";
my $STEP_EDGER_DIFF_EXPR = "s50a_edgeR"; # RNA
my $STEP_MONOCLE = "s51a_monocle"; # RNA... single-cell really, though
my $STEP_SUMMARY_AFTER_EDGER = "s51b_sum";
my $STEP_CLUSTER_AFTER_EDGER = "s51c_clust";
my $STEP_ALT_ANALYZE = "s52_alt_analyze";
# job times:
# tags: 8 hours
# alignment: 99 hours
# filering: 8 hours
# 05.xSummary.pl: 2 hours
# 07a.peaksGem.pl:#PBS -l walltime=8:00:00
# 07a.xSummaryGem.pl:#PBS -l walltime=1:00:00
# 07b.peaksBCP.pl:#PBS -l walltime=4:00:00
# 07b.xSummaryBCP.pl:#PBS -l walltime=1:00:00
# 07c.NucleoAtac.pl:#PBS -l walltime=300:00:00
# 07.peaks.pl:#PBS -l walltime=4:00:00
# 07.xSummary.pl:#PBS -l walltime=1:00:00
# 08.browser.pl:#PBS -l walltime=24:00:00
# 08.xSummary.pl:#PBS -l walltime=1:00:00
# 09.windows.pl:#PBS -l walltime=4:00:00
# 10.motifDiscovery.pl:#PBS -l walltime=4:00:00
# 10.xSummary.pl:#PBS -l walltime=24:00:00
# 20a.count.subread.pl:#PBS -l walltime=72:00:00
# 20b.summary.figures.pl:#PBS -l walltime=8:00:00
# grep: data: Is a directory
# 22.cuffdiff.pl:#PBS -l walltime=300:00:00
# 23a.cummerbund.pl:#PBS -l walltime=8:00:00
# 46.qc.multiple.bam.pl:#PBS -l walltime=72:00:00
# 47.rse.qc.pl:#PBS -l walltime=100:00:00
# 48.browser_agw.pl:#PBS -l walltime=72:00:00
# 49.qc.pl:#PBS -l walltime=2:00:00
# 50a.edgeR.pl:#PBS -l walltime=24:00:00
# 51a.monocle.R:#PBS -l walltime=24:00:00
# 51b.summary.R:#PBS -l walltime=24:00:00
# 51c.cluster.R:#PBS -l walltime=24:00:00
# 52.altanalyze.pl:#PBS -l walltime=24:00:00
#########################################################
# Code for the sub-scripts to (for example) evaluate arguments
sub getMostExtensivePythonPath() {
my $current = (defined($ENV{PYTHONPATH}) ? $ENV{PYTHONPATH} : "");
return join(':'
, ($current
, "/data/applications/2015_06/libpython2.7/lib64/python2.7/site-packages"
, "/data/applications/2015_06/libpython2.7/lib/python2.7/site-packages")
);
}
# envLooksTrue is also used by the sub-scripts in the bin directory--do not delete it!
sub envLooksTrue {
# Works fine for multiple values: e.g. my ($t1, $t2) = envLooksTrue("t_one", "t_two");
# Returns whether $ENV{'the_input_thing'} is a TRUE perl value, and NOT "false" or "undef" (case-insensitive).
# Example correct usage: my $boolean = envLooksTrue("varname"); # Example wrong usage: my $wrongAnswer = envLooksTrue($ENV{"varname"} <- WRONG ) <== WRONG! Do not pass "$ENV" in here!
# Looks like it can also return an array, if you want to check multiple items at once
if (scalar(@_) == 0) { confess "ERR: Can't call envLooksTrue without at least one (string) argument."; }
my @listOfBools = map { defined($_) && $ENV{$_} && (uc($ENV{$_}) ne "FALSE") && (uc($ENV{$_}) ne "UNDEF") } @_;
if (scalar(@_) == 1) { return $listOfBools[0]; } # Return just the FIRST argument. Allows this: my $var = envLooksTrue("something") to work properly. Otherwise that will fail (it will evaluate the return value of the list in a scalar context, thus returning the length of the list (1) instead of the value. This is confusing because Perl is weird!
else { return (@listOfBools); } # Return an ARRARY if multiple arguments are specified. Example: my ($t1, $t2) = envLooksTrue("t_one", "t_two");
}
sub isNA($) { return (defined($_[0]) && (uc($_[0]) eq uc($NA_VALUE))); } # Checks for the literal "NA" text value. This is how we distinguish our own "intentionally not set" values from Perl undef values, which generally indicate a PROGRAMMING ERROR.
sub sampleHasMatchingInput($) { return(defined($_[0]->{'input'}) and !isNA($_[0]->{'input'}) and $_[0]->{'input'} ne ''); }
sub getMatchingInput($) { return (sampleHasMatchingInput($_[0])) ? $_[0]->{'input'} : $NA_VALUE; }
sub catRequiredFile { # same as catfile, but REQUIRES that the file exists, unless it's a dry run.
my $filePath = catfile(@_);
($GLOBAL_DRY_RUN or -e $filePath) or confess "[ERROR] We expected the following file to exist, but it did not: $filePath";
return($filePath);
}
sub mkdirOrDie($) { # This is SURPRISINGLY complicated because of race conditions! 'mkdirOrDie' is also used by the sub-scripts in the bin directory--do not delete it!
my ($dir) = @_;
defined($dir) or confess "[ERROR] What in the world! 'mkdirOrDie' got an undefined value passed to it! This is probably a programming error.";
($dir !~ m/\s/) or ourWarn("[WARNING] Creating a directory with WHITESPACE in the name. The offending name is: $dir");
($dir =~ m/^[-+._~\w\/]+$/) or ourWarn("[WARNING] Creating a directory with NON-BASIC-ALPHANUMERIC (and [-+._~]) characters in it. The offending name is: $dir");
if ($GLOBAL_DRY_RUN) {
print STDERR "Dry run: we are not actually creating the directory named '$dir'.\n";
return 1;
}
if (-d $dir) { return 1; } # ok, it exists already # race condition info: note: file could be created here BETWEEN checking for existence and making the directory!
if (not mkdir($dir) and ($! =~ m/exists/i)) { return 1; } # Actually MAKE the directory here. The failure message was "File exists" -- so this means that some OTHER process must have made it! So it's ok. # Note: mkdir returns 0 (FAILURE!!) if the directory already exists at this point, which is possible if some OTHER process made it
if (-d $dir) { return 1; } # last chance to succeed!
confess qq{[ERROR] Failed in our attempt to make (or find) the directory that we expected to be at the following location: "$dir". The error message was as follows: '$!'};
}
sub pythonHasLib($$) {
my ($pyExe, $pyLibName) = @_;
my $exitCode = system(("$pyExe", "-c", "import $pyLibName"));
return ($exitCode == 0); # 0 == got it!
}
sub dieIfMissingPythonLib($$) {
my ($pyExe, $pyLibName) = @_;
(pythonHasLib($pyExe, $pyLibName)) or confess "[ERROR]: The required python library '$pyLibName' was NOT FOUND on the system! This is for the python executable '$pyExe'---double check to make sure that this is the version of python that you expect to be running! You may have multiple versions of python on your system (2.6, 2.7, 3.x, etc...). Try running 'import ___library_name_here___' on the command line, and seeing if you get the same problem. Also remember that the machine where you SUBMIT a job may not be the same machine that actually RUNS the job, if you are running on a cluster! You may be able to install the missing library using 'pip'.";
}
sub dieIfCannotWriteTo($) { # Argument: a file path. Tells you if you could theoretically write there or not.
my ($path) = @_;
($path =~ /^\//) or ourWarn("[WARNING] You appear to have given a non-absolute path to the 'dieIfCannotWriteTo()' function---with Monkey, everything should be an ABSOLUTE path. Continuing anyway..."); # check to make sure it starts with a '/' (and is therefore probably a full path)
if (-d $path) { -r $path and -w $path or confess "[ERROR] Weird, the directory <$path> was not both readable and writable by the current user! Disaster! Fix this."; }
if (-e $path) { -r $path and -w $path or confess "[ERROR] Weird, the file at <$path> was not both readable and writable by the current user! Catastrophe! Fix this."; }
if (not -e $path) { # Nothing ALREADY exists at this location, so let's see if we could (hypothetically) create something here by looking to see if the parent directory is writable.
my $pathParentDir = dirname($path); # always the parent dir, even if this path is ITSELF a directory
if (not -e $pathParentDir) {
ourWarn("[WARNING] This is weird... the parent directory of <$path> also doesn't exist. This may be ok, though, so we will continue... but it may be a problem, too.\n");
} else { # see if the parent "directory" is actually a directory and is writeable
if (-f $pathParentDir) { confess "[ERROR] The parent directory of <$path> cannot be an (existing) plain file, this is too bizarre."; }
if (-d $pathParentDir) { -r $path and -w $path or confess "Hey, we need to be able to write to the parent directory <$pathParentDir> of path <$path>! But right now we cannot. Fix the permissions!"; }
}
}
return 1;
}
sub fileIsReadableAndNonzero($) { my ($f)=@_; return (($GLOBAL_DRY_RUN) or (defined($f) and -e $f and -r $f and -s $f)); } # -e: exists. -r: is readable. -s: is nonzero
sub dieIfFileAccessFails($;$) {
# Input: a filename. This function will do nothing UNLESS that file is missing or unreadable, in which case it should give you an informative message about the specific inaccessible file.
if ($GLOBAL_DRY_RUN) { return 1; }
my ($filename, $optionalMessage) = @_;
if (!defined($optionalMessage)) { $optionalMessage = "FILE HANDLING FAILED"; }
defined($filename) or confess "[ERROR] $optionalMessage: For whatever reason, a required filename was passed in as 'undefined'. This probably means that you did not specify a required parameter/filename in the monkey config file. See if other messages give any additional indication of what happened!";
if (-d $filename) { return 1; } # ok, it was a directory, so I guess we can just return early without checking for anything else!
(-e $filename) or confess "[ERROR] $optionalMessage: Cannot open the seemingly-nonexistent file '$filename'! Quitting now.";
(-r $filename) or confess "[ERROR] $optionalMessage: this user cannot READ the file '$filename'! Quitting now.";
return 1; # Looks like it was OK
}
sub numLinesInFile { # pass in a FILENAME
my ($filename) = @_;
my $lineCount = 0;
open(FFF,$filename) or die "Cannot open $filename\n"; {
while(<FFF>) { $lineCount++; }
} close(FFF);
return($lineCount);
}
sub bash_system { my @args = ( "bash", "-c", shift ); return(system(@args)); } # Runs system calls through BASH instead of regular SH, allowing for things like subshells.
# systemAndLog is also used by the sub-scripts in the bin directory--do not delete it!i
sub systemAndLog($;$$$) {
# Takes a command and (optionally) a boolean (should log to STDERR?) and a filename (if defined, append to this log file?)
# Returns the exit code of the command, just like plain 'system'
my ($cmd, $shouldLogToStderr, $logToThisFile, $dieIfNonzero) = @_;
if (!defined($shouldLogToStderr)) { $shouldLogToStderr = 0; }
if (!defined($dieIfNonzero)) { $dieIfNonzero = 0; }
my $date = `date`; chomp($date);
my $msg = ($GLOBAL_DRY_RUN ? "DRY RUN of this command: " : "[${date}] Running this command:") . "\n ${cmd}\n";
my $exitCode = ($GLOBAL_DRY_RUN) ? 0 : bash_system($cmd); # Only run the command if this is NOT a dry run.
$msg .= " (Returned exit code $exitCode)\n";
if ($exitCode != 0) { $msg .= "[WARNING]: Exit code was nonzero!\n"; }
if ($shouldLogToStderr) { print STDERR $msg; }
# if (defined($logToThisFile)) {
# open(LOG,">>$logToThisFile") or confess "ERROR: Failed to open log file $logToThisFile. Probably this is a programming error.";
# if ($logToThisFile) { print LOG $msg; }
# close(LOG) or confess "ERROR: Could not close the file $logToThisFile.";
# }
if ($dieIfNonzero && $exitCode != 0) { confess "[ERROR / FATAL SYSTEM CALL FAILURE]! Non-zero exit code:\n$msg"; }
return($exitCode);
}
sub systemAndLogDieNonzero($;$$) { my ($cmd, $toerr, $f) = @_; return(systemAndLog($cmd, $toerr, $f, "die if nonzero exit")); }
sub setIfUndefined($$$) {
my ($config, $varname, $defaultValue) = @_; # Sets a variable in the config only if that value was NOT defined before!
if (not defined($config->{$varname})) { $config->{$varname} = $defaultValue; }
}
# requireEnvOrDie: This function is used by the SUB scripts (the ones in "poo/bin") in order to validate that %ENV variables were passed in correctly.
sub requireEnvOrDie { # Takes any number of string inputs
# Example usage:
# bananas_agw::requireEnvOrDie('input', "secondary_thing", "outfilename"); # If any of those are undefined, 'confess'es with an error. Otherwise returns true.
# This function can accept any number of (string) arguments.
# It checks to make sure they are all DEFINED. If any one of them is NOT, then it exits with a FATAL ERROR.
# Variables ARE allowed to be blank. But they must be *defined*.
# Note: "%ENV" is automatically populated by the 'qsub' queue-handling command. It is not a hash that we create explicitly.
my (@varList) = @_;
#defined(%ENV) or confess "The global 'ENV' hash variable (which should have been passed in by the 'qsub' queue submission program) was *not* defined in script submission! This indicates a fatal error somewhere in the queue submission process. Possibly this job was run by some means OTHER than qsub?";
for my $requiredVarName (@varList) {
defined($ENV{$requiredVarName}) or confess "[ERROR] The required variable '$requiredVarName' was NOT passed into this script by bananas.pm! This is a bug in bananas--fix it! Also check your capitalization--variable names are CASE SENSITIVE.";
}
return 1; # <== If we get here, then all the required variables were defined.
}
sub removeTrailingSlash($) { # Removes (at most) ONE single trailing slash from a path (string)
my ($s) = @_; $s =~ s/\/$//;
return($s);
}
sub getNumExperimentalGroups() { return scalar(keys(%{$remember{$REM_EXPERIMENT_CATEGORY}})); } # number of unique GROUPS
sub getGroupLabelsAndBamStringByGroup($$$) { # Gets the bams **by group** from the "remember" hash.
my ($labelDelim, $betweenGroupDelim, $withinGroupDelim) = @_;
# Example:
# Returns TWO STRINGS, something like:
# labelStr: WT,CTRL,EXP2 (<== delimited by $labelDelim, a comma in this case)
# groupStr: wt1.bam,wt2.bam,wt3.bam ctl1.bam,ctl2.bam exp2.1.bam,exp2.2.bam (<== delimited by spaces BETWEEN groups and commas WITHIN group here)
my $bamsByGroupStr = ""; # <== Will eventually look something like "exp1.bam,exp2.bam ctrl1.bam,ctrl2.bam,ctrl3.bam"
foreach my $category ( keys( %{$remember{$REM_EXPERIMENT_CATEGORY}} ) ) {
my @bamArray = ();
foreach my $replicate (keys(%{$remember{$REM_EXPERIMENT_CATEGORY}{$category}})) {
my $bamPath = $remember{$REM_EXPERIMENT_CATEGORY}{$category}{$replicate}{"bam"};
push(@bamArray, $bamPath);
}
$bamsByGroupStr .= ((length($bamsByGroupStr) == 0) ? '' : $betweenGroupDelim) . join($withinGroupDelim, @bamArray); # <== add a space between categories for every category EXCEPT the very first one!. The FINAL result will look something like "exp1.bam,exp2.bam ctrl1.bam,ctrl2.bam,ctrl3.bam"
}
my $labelStr = join($labelDelim, keys(%{$remember{$REM_EXPERIMENT_CATEGORY}})); # Should look something like: "wildtype<COMMA>control<COMMA>drug<COMMA>whatever". Note that we can't use a literal comma (,), because qsub chews it up. So we use "<COMMA>" instead. The cuffdiff script knows how to deal with this.
return ($labelStr, $bamsByGroupStr); # TWO strings!!
}
#########################################################
sub safeColor($;$) { # one required and one optional argument
## Returns colored text, but only if $SHOULD_USE_COLORS is set.
## Allows you to totally disable colored printing by just changing $SHOULD_USE_COLORS to 0 at the top of this file
# Colorstring is OPTIONAL, and can be something like "red on_blue" or "red" or "magenta on_green"
# Example usage:
# * print STDERR safeColor("This warning message is red on yellow", "red on_yellow");
my ($message, $color) = @_;
if ($GLOBAL_COLOR) { use Term::ANSIColor; }
return (($GLOBAL_COLOR && defined($color)) ? (Term::ANSIColor::colored($message, $color) . Term::ANSIColor::color("reset")) : $message);
}
sub printColorErr($;$) { # prints color to STDERR *UNLESS* it is re-directed to a file, in which case NO COLOR IS PRINTED.
my ($msg, $col) = @_; # Only prints in color if STDERR is to a terminal, NOT if it is redirected to an output file!
if (! -t STDERR) { $col = undef; } # no coloration if this isn't to a terminal
print STDERR safeColor($msg, $col);
}
sub printColorOut($;$) { # prints color to STDOUT *UNLESS* it is re-directed to a file, in which case NO COLOR IS PRINTED.
my ($msg, $col) = @_; # Only prints in color if STDOUT is to a terminal, NOT if it is redirected to an output file!
if (! -t STDOUT) { $col = undef; } # no coloration if this isn't to a terminal
print STDOUT safeColor($msg, $col);
}
# aid with checking configuration file key values
sub addKeyValuePairToConfig($$$$$) {
my ($cfgPointer, $key, $value, $lineNum, $cfgFilename) = @_;
# Check for deprecated variable names.
($key ne 'tracksSCP') or confess "[ERROR] 'tracksSCP' is now a DEPRECATED CONFIG VARIABLE NAME. Replace it with 'tracksRsync', which is superior for various reasons.";
($key ne 'skipBrowser') or confess "[ERROR] 'skipBrowser' is now a DEPRECATED CONFIG VARIABLE NAME. Instead, specify 'browserWigs = FALSE' and 'browserBins = FALSE' in the config file.";
(!exists(${$cfgPointer}->{$key})) or confess "\n[ERROR] DUPLICATE ASSIGNMENT OF '$key' DETECTED (filename: \"$cfgFilename\")\n>>>> The '$key' was defined a SECOND time on line $lineNum. You cannot have a key defined twice in the same file! ";
(grep(/^$key$/, (@OK_KEY_NAMES, @OK_BOOLEAN_KEY_NAMES))) or warn "\n>>>> KEY WARNING: the key '$key' (on line $lineNum in the config file $cfgFilename) was NOT FOUND in the list of recognized keys.";
${$cfgPointer}->{$key} = $value;
}
sub getBamFullPath($$) { # get the final path to a bam file in the "04 mapping" directory only
my ($cfgPointer, $sampName) = @_;
my $file = ${sampName} . "_" . $cfgPointer->{genomeName} . "_" . 'q' . $cfgPointer->{minMapQ} . ".bam"; # something like MUTANT1_hg19_q30.bam or WT2_mm9_q3.bam
return catfile($cfgPointer->{'mappingResultsDir'}, $file);
}
# Same as regular "print" in most regards, except it ONLY actually prints if the $GLOBAL_VERBOSE flag is set. Only accepts one string argument. No arrays!
sub verbosePrint($;$) { # If it is a dry run, then it prepends [DRY RUN] to the statement.
my ($text, $color) = @_;
($GLOBAL_VERBOSE || $GLOBAL_DEBUG) && printColorErr($text, $color); # Prints to STDERR always!
}
sub verboseOkPrint($) { verbosePrint("[OK] $_[0]" , $VERBOSE_DIAGNOSTIC_OK_COLOR); } # Print that something is OK and doesn't really require the user to do anything.
sub verboseSkipPrint($) { verbosePrint("[OK] [SKIP] $_[0]", $VERBOSE_SKIP_STEP_OK_COLOR); } # Print that we are skipping a step. This is OK also.
sub warnPrint($) {
# Prints a warning regardless of the "verbose" settings
my ($text) = @_;
printColorErr($text, "red");
#printColorOut($text, "red");
warn $text;
}
sub debugPrint($;$) {
my ($text, $color) = @_;
($GLOBAL_DEBUG) && printColorErr($text, (defined($color)?$color:"yellow on_red")); # Prints yellow on_red if no color was specified
}
sub isLiteralStringTrue($) { # Returns "TRUE" if input is both defined and the string "TRUE" or just "T", in any capitalization, or 0 otherwise.
my ($inputString) = @_;
return((defined($inputString) && ($inputString =~ m/^(TRUE|T)$/i)) ? "TRUE" : 0);
}
sub isBooleanStringOrMissing($) {
my ($inputString) = @_; # Returns "TRUE" if and only if the input is TRUE/true, FALSE/false, T/F or undefined. Used for checking validity of inputs.
return((!defined($inputString) || ($inputString =~ m/^(TRUE|T|FALSE|F)$/i)) ? "TRUE" : 0);
}
sub getArrayOfAlignedBamFilenames($) { # (uses a global variable: the %remember hash). Returns an ORDERED array of full FILE PATHS in the same order they were in the config file.
# Example usage: my @a = getArrayOfAlignedBamFilenames("paired_end"); print join(", ", @a);
# Returns the filenames IN THE SAME ORDER that they appear in the original config file!
my ($filterFor) = @_; # Tells us if we want ALL files, or just single end / paired end. There are three options: "all" or "paired_end" or "single_end"
($filterFor =~ m/^(all|paired_end|single_end)$/) or confess "[ERROR] The 'filterFor' parameter must be either 'single', 'paired', or 'both'";
my @bamArr = (); # Return array
foreach my $id (@{$remember{$REM_SAMPLE_ORDER_ARR}}) {
my $isPaired = $remember{$REM_ALIGN_BAM}{$id}{'isPaired'};
if (($filterFor eq "all") or ($isPaired and $filterFor eq "paired_end") or (!$isPaired and $filterFor eq "single_end")) {
push(@bamArr, $remember{$REM_ALIGN_BAM}{$id}{'bam'});
}
}
return (@bamArr);
}
sub getHashPointerOfAllBamJobs() { # (uses a global variable: the %remember hash)
# Gets a valid "dependencies" hash that requires that all the BAM jobs have completed
# This is used for any job that requires the BAM files be aligned, but doesn't have any other requirements.
my $hashref = {}; # {} (hash reference)
foreach my $sample (keys(%{$remember{$REM_ALIGN_BAM}})) {
my $prereqJob = $remember{$REM_ALIGN_BAM}{$sample}{"job"}; defined($prereqJob) or confess "[ERROR] Failed to locate the required pre-requisite job variable for sample <$sample>!";
$hashref->{$prereqJob} = $prereqJob; # save this as a prerequisite!
}
return $hashref; # hash reference to all BAM-generating jobs
}
sub noExtensions($) { # Returns the filename but WITHOUT any compression extensions (including the "period" character)
my ($filename) = @_;
if (!defined($filename)) { return undef; }
$filename =~ s/[.](bz2|bzip2|gzip|gz|zip)$//i; # Remove common compression extensions from filename/path.
return($filename);
}
sub validateScriptOrDie($$) { # Validate that a script exists. Intentionally does not check that it is executable (-x), since we might run it like "perl ./script.pl" and thus the script itself need only be READABLE.
my ($scriptPath, $cfgPtr) = @_;
dieIfFileAccessFails($scriptPath, "This function called for the script <$scriptPath>, but there did not appear to a (readable) file actually there! Double check this! ");
if ($scriptPath =~ /.pl$/i and (!$remember{$REM_SCRIPTS_CHECKED}{$scriptPath})) { # Is this a perl script that we have NOT checked to make sure its syntax is OK? If so, check it.
(0 == system(("perl", "-c", $scriptPath))) or confess("[ERROR in syntax checking while setting up qsub jobs]: Looks like perl was not able to parse the syntax of the script <$scriptPath> (in other words, perl -c $scriptPath FAILED to return exit code zero. That script probably has an error that needs to be fixed");
$remember{$REM_SCRIPTS_CHECKED}{$scriptPath} = 1; # GLOBAL variable. Remember that this script was checked---don't bother checking it again
}
if ($scriptPath =~ m/51b.summary.R$/i) { # hard-coded check for this one file to skip
print STDERR "[OK - NOTE] svTools::lint is going to skip checking the '51b.summary.R' file. There is actually a bug in svLint that causes it to report the incorrect failure message: 'in grep(regex, attr(exprs, 'srcref')[[j]]) ... 'Missing ')''";
return;
}
# ================ CHECK R SYNTAX. This is VERY ANNOYING in comparison to perl ================
if ($scriptPath =~ /[.](R|Rscript)$/i and (!$remember{$REM_SCRIPTS_CHECKED}{$scriptPath})) { # Is this an R script that we have NOT checked to make sure its syntax is OK? If so, check it.
# Another thing that should be checked is: capabilities()["X11"] <== if this is FALSE, then no plots will works
my $R_CHECKER_CODE = <<"R_STUFF";
# Below: this is R code in a perl HEREDOC
if (!require("svTools")) {
print("You need to run install.packages('svTools') in R to allow syntax checking.");
quit(save="no", 2); # Exit code 2 will be used to mean 'svTools' is not installed
} else {
lintOut = svTools::lint(file="${scriptPath}"); # <== Note the PERL variable here, which gets expanded to the proper file name!
errFound = (is.data.frame(lintOut) && nrow(lintOut) > 0); # <== This is R code
if (errFound) { print("Problems found!"); print(lintOut); }
else { quit(save="no", 0); } # exit code 0 = everything is OK
}
quit(save="no", 1); # <== exit with code #1 to indicate that improper syntax was detected by svTools
R_STUFF
my $R_CHECKER_TEMPFILE = "tmp.R_checker_code.tmp";
open(RC, ">$R_CHECKER_TEMPFILE") or confess("[ERROR] Couldn't make a temporary R file in this directory! Tried to write (unsuccessfully) to the following file: $R_CHECKER_TEMPFILE");
print RC $R_CHECKER_CODE;
close(RC);
my $rverb = ($GLOBAL_VERBOSE) ? " --quiet --vanilla " : " --quiet --vanilla --slave "; # --slave is for background tasks and means "don't echo R commands" (even less verbose than 'quiet')
my $exitCode = verboseSystem(getBinPath($cfgPtr, "R_EXE") . qq{ $rverb < $R_CHECKER_TEMPFILE});
($exitCode == 1) and confess("[ERROR] The R file '${scriptPath}' had a syntax error as detected by svTools::lint. Source the file yourself to find the error.");
($exitCode == 2) and confess("[ERROR] The R syntax code checker requires that 'svTools' is installed. This probably means you do not have your R_LIBS set up properly. If you are POSITIVE you have it set up properly, you should run install.packages('svTools') (possibly as root) from within R in order to install the useful syntax checker package.");
($exitCode != 0) and confess("[ERROR] The R syntax code checker found a syntax error in the script '${scriptPath}' Exit code was $exitCode.");
unlink($R_CHECKER_TEMPFILE) or warn("[WARNING] Weirdly, we could not delete the R temp file ($R_CHECKER_TEMPFILE) that we made...");
}
# ================ DONE CHECKING R SYNTAX. ================
}
# Runs a system command, and prints it out if we are using "verbose" mode (i.e., monkey.pl --verbose)
# DRY RUN behavior: Does NOT actually run the system call if this is a dry run!
sub verboseSystem($) {
my ($cmd) = @_;
if ($GLOBAL_DRY_RUN) { verbosePrint(">>>>> DRY RUN system call >>>>> $cmd\n", "cyan" ); }
else { verbosePrint(">>>>> System call >>>>> $cmd\n" , "green"); }
my $exitCode = ($GLOBAL_DRY_RUN) ? 0 : system($cmd); # Only run the actual system command if it is NOT a dry run!
verbosePrint(" (Returned exit code $exitCode)\n", ($GLOBAL_DRY_RUN ? "cyan" : "green"));
return($exitCode); # <== Mandatory that we RETURN the system call result!
}
sub ourWarn($) { # Prints a warning to the console, AND ALSO updates the 'GLOBAL_WARNINGS' variables that gets printed at the end.
my ($msg) = @_;
chomp($msg); # remove any newline so that the warning message shows the line number
warn($msg); # Print the warning to the screen when it occurs.
$GLOBAL_WARNINGS .= "$msg\n"; # Add the message to the GLOBAL warnings, plus a newline. We will print the GLOBAL warnings again when the script is done running.
}
sub printGlobalWarnings() {
if (length($GLOBAL_WARNINGS) > 0) {
print STDERR "We encountered the following warning messages:\n";
print STDERR $GLOBAL_WARNINGS . "\n";
}
}
sub fatalConfigErr($$$) { my ($lineNum, $configFilename, $msg) = @_; confess "[ERROR ON LINE $lineNum OF $configFilename]: $msg"; }
sub configWarning($$$) { my ($lineNum, $configFilename, $msg) = @_; ourWarn("[WARNING ON LINE $lineNum OF $configFilename]: $msg"); }
sub isSplicedAligner($) {
my ($alignerName) = @_;
if ($alignerName =~ m/^(tophat|star)/i) { return 1; } # Tophat, STAR = SPLICE-AWARE
if ($alignerName =~ m/^(bowtie|bwa)/i) { return 0; } # Bowtie, BWA = NO SPLICING
confess "[Error] 'isSplicedAligner' was passed an UNRECOGNIZED aligner executable name: '$alignerName'...";
}
sub loadConfig { # <== loads the configuration file information into the "$cfg" hash pointer
my ($file) = @_;
my $cfg; # <== hash (hash pointer?)
my %dupeCheckHash = (); # used to check whether an ID is a dupe
my $lineNum = 0;
$cfg->{filename} = $file; # Just save the config filename in case we need it later!
open(CF,$file) or confess "[ERROR] Cannot open the configuration file \"$file\" It may not exist, or not be readable by this user!";
while (my $line = <CF>) {
chomp($line);
$lineNum++;
$line =~ s/\s+[#].*//; # <== Remove non-full-line comments! Note that comments need a space before them! Example: turns "mydata something # comment here" into "mydata something"
$line =~ s/[ \t\s]+/\t/g; # <== Replace RUNS of spaces/tabs with a single tab. Note that ALL SPACES ARE TURNED INTO TABS here! That means you can't have errant spaces in (say) filenames and whatnot.
$line = trimWhitespace($line); # <== Remove any other leading / trailing whitespace.
next if ($line =~ /^[#]/); # <== Skip any lines that START with a '#' (comment character
next if ($line =~ /^(\s)*$/); # <== Skip any ENTIRELY whitespace lines, in other words, there's nothing left after our modifications above.
my @t = split(/\t/,$line);
if ($line =~ /^(sample|input)\s/i) { #(scalar @t == 5) or (scalar(@t) == 6)) { # should probably just check to see if the line starts with "^sample\t" or "^input\t"
(scalar(@t) == 5 or scalar(@t) == 6) or fatalConfigErr($lineNum, $file, "The line began with 'sample' or 'input', but it did NOT have 5 or 6 elements! All lines beginning with 'sample' or 'input' must have exactly 5 (for single end) or 6 (for paired end) values. The text of the offending line was:\n$line");
my $sampleOrInput = lc($t[0]); # either "sample" or "input" -- the literal text!
my $idName = $t[1]; # like "myDrugTest.1"
my $dataType = lc($t[2]); # rna, chip, or exo. Lower-case it.
my $inputField = $t[3]; # if it's CHIP-SEQ, then the "input" is a corresponding file that matches this sample. Or it can be "NA"
my $firstPairFile = $t[4];
my $secondPairFile = (scalar(@t) >= 6) ? $t[5] : undef; # check to see if it's paired-end...
($dataType =~ m/^(chip|exo|rna|atac|other)$/i) or fatalConfigErr($lineNum, $file, "The sample *type* that you specified was not recognized. You specified \"$dataType\", but the type must be one of these: {'chip', 'exo', 'rna','atac', or other}). Fix it!");
if ((!defined($secondPairFile) or isNA($secondPairFile)) and ($firstPairFile =~ m/[-._](R1|pair1)[-._]/)) {
ourWarn("[WARNING] Although there was no second pair specified for the single-end (?) file named '$firstPairFile', the filename contained a hint (like '.R1' or '.pair1') that the file might actually be paired end. Double check that this file is not missing its mate pair!");
sleep(1); # give the user some time to ponder the warning before it swooshes by
}
if ($firstPairFile =~ m/[-._](R2|pair2)[-._]/) {
ourWarn("[WARNING] The filename '$firstPairFile' had text in it that made it look like a second-of-a-pair paired-end file (like '.R1' or '.pair1'), but it was listed as either single-end of as the first file in a pair. Double check that this file is really NOT a second-mate-pair paired-end file!");
sleep(1); # give the user some time to ponder the warning before it swooshes by
}
($sampleOrInput =~ m/^(sample|input)$/) or fatalConfigErr($lineNum, $file, "The first item must be either the literal text 'sample' or 'input'. Instead it was: <$sampleOrInput>!");
(!exists($dupeCheckHash{uc($idName)})) or fatalConfigErr($lineNum, $file, "The ID name for each sample must be UNIQUE, but the ID \"$idName\" on line $lineNum had already been used on line $dupeCheckHash{$idName}. Fix this!");
$dupeCheckHash{uc($idName)} = $lineNum; # <== Save the line number that this ID occurred on. Used to report any erroneous duplicates. Save the UPPER CASE version of each string to avoid allowing only-case-differences in filenames.
($idName =~ m/${SAFE_FILENAME_REGEXP}/) or fatalConfigErr($lineNum, $file, "The sample ID CANNOT have any special characters in it. The allowable characters are the ones that match this regular expression: $SAFE_FILENAME_REGEXP. The invalid ID was this: \"$idName\"!");
# sample.ID0414.fas
($idName !~ m/^[0-9]/) or fatalConfigErr($lineNum, $file, "The sample name CANNOT *start* with a number (it confuses certain R commands, especially in edgeR), but this one (incorrectly) does: <$idName>. Recommendation: add a letter or underscore to the beginning.");
if ($idName =~ m/^[^.]+[.]id/i) { # looking for "some stuff that is not a dot, then '.id' (case-insensitive). E.g. DRUGX.idWHATEVER or DRUG_A.id.4234.24.24.2
# There is a special form for replicate names to be non-numeric, if they have a name like:
# group_whatever.id_429424.424.23423 . In this case, the magic part is ".id"--anything is OK as an identifier if that appears, even more decimal points!
# There is no additional error checking for an id here; anything (unique) is ok--even multiple '.', except spaces and special characters.
} else {
# This is a NORMAL style of sample description: GROUPNAME.replicate_number
($idName !~ m/[.].*[.]/ ) or fatalConfigErr($lineNum, $file, "The sample ID CANNOT have more than one decimal point in it, but this one does: \"$idName\"!");
($idName =~ m/[.][0-9]+$/) or fatalConfigErr($lineNum, $file, "The name \"$idName\" DID NOT end in a period and then a (replicate) number. Valid names should look like this: \"WT.1\" or \"DRUG.13\" or \"Ctrl.18\". Numbers do *not* necessarily need to be sequential. Note that there is also a special form allowed for arbitrary names, you just have to put the word 'id' after the dot. So for example: DRUG.id_999.88_77 is a valid replicate name: it's group DRUG, with replicate id_999.88_77. Even multiple dots are allowed in this form.\n");
}
my @id_replicate_split = split(/[.]/, $idName, 2); # Split into TWO parts only! CRITICAL to have the 2 here!
$cfg->{details}->{$idName}->{name} = $id_replicate_split[0]; # example: "WILDTYPE" or "CONTROL"
$cfg->{details}->{$idName}->{replicate} = $id_replicate_split[1]; # example: "1" (for first replicate of WILDTYPE)
($cfg->{details}->{$idName}->{name} =~ m/$SAFE_JOB_NAME_REGEXP/) or fatalConfigErr($lineNum, $file, "The sample name <$cfg->{details}->{$idName}->{name}> is NOT acceptable, because it must only consist of standard alphanumeric characters plus the '-' and '_'. No special characters are allowed, as they may result in problematic filenames downstream. Please change this name to something without special characters, spaces, or punctuation");
$cfg->{details}->{$idName}->{type} = $dataType; # Note that '$dataType' is already lower-cased
$cfg->{details}->{$idName}->{needsPeakCall} = ($dataType =~ m/^(---|chip|exo|atac|other)$/i); # peak calling for everything but RNA
$cfg->{details}->{$idName}->{needsGemPeaks} = ($dataType =~ m/^(---|chip|exo|----|-----)$/i); # chip or exo only for some reason
$cfg->{details}->{$idName}->{needsATAC} = ($dataType =~ m/^(---|----|---|atac|-----)$/i); # chip or exo only for some reason
$cfg->{details}->{$idName}->{needsBcpPeaks} = ($dataType =~ m/^(---|chip|---|----|-----)$/i); # chip only for BCP
$cfg->{details}->{$idName}->{needsDE} = ($dataType =~ m/^(rna|----|---|----|-----)$/i); # only RNA gets expression value analysis
$cfg->{details}->{$idName}->{needsExo} = ($dataType =~ m/^(---|----|exo|----|-----)$/i); # only EXO gets exo-seq value analysis
# Do peaks is GLOBAL
$cfg->{globalDoPeaks} = (($cfg->{details}->{$idName}->{needsPeakCall}) || (defined($cfg->{globalDoPeaks}) && $cfg->{globalDoPeaks}));
if (!isNA($inputField) and ($inputField ne '') and ($inputField =~ /\S/)) { # If this field isn't blank or NA or has at least one non-space character (\S)
$cfg->{details}->{$idName}->{input} = $t[3]; # some samples (e.g. for CHIPSEQ) have a corresponding "INPUT" sample name associated with them.
}
$cfg->{$sampleOrInput}->{$idName}->{$firstPairFile} = 1; # indicate that we have obtained a sample with this specific name
push(@{$remember{$REM_SAMPLE_ORDER_ARR}}, $idName); # "remember" is a GLOBAL variable that stores the sample IDs in the same order they were seen.
if (defined($secondPairFile) && !isNA($secondPairFile)) {
$cfg->{$sampleOrInput}->{$idName}->{$secondPairFile} = 1;
} # indicate that we have obtained a paired end file!!!
} elsif ($line =~ m/^([^=]+)=(.*)$/) { # look for something with an EQUALS SIGN between it, e.g. "genome = myGenome.fa"
my $key = trimWhitespace($1);
my $value = trimWhitespace($2); (length($value) > 0) or fatalConfigErr($lineNum, $file, "The value for key '$key' was BLANK or otherwise invalid. Note that every assignment MUST have a value, even if it is a placeholder like 'NA'. The offending line was:\n $line\nPlease double-check the config file");
#verboseOkPrint("Found a key / value $key / $value for species '$cfg->{species}').\n");
# ==================== Handle the species-specific prefixes ================
if ($key =~ m/^\[ # <== leading bracket. Example: [hg19 ]gtfFile = something
\s*([^\]]*)\s* # <== ($1): whatever is inside the brackets and is NOT a space is the SPECIES (e.g., hg19)
\] # <== closing bracket
\s* # <== optional whitespace, which is NOT part of the key
(.*) # <== ($2): everything else is part of the key
/x) {
my $keyIsForThisSpecies = trimWhitespace($1);
(length($keyIsForThisSpecies) > 0) or fatalConfigErr($lineNum, $file, qq{Seems like you were using the species-specific bracket notation for key '$key', but the species name (\"$keyIsForThisSpecies\") was blank (???). Double check this offending line in the config file:\n $line\n});
if (defined($cfg->{species}) and not isNA($cfg->{species})) {
if ($keyIsForThisSpecies eq $cfg->{species}) {
#verboseOkPrint("FOUND a key for species '$keyIsForThisSpecies' '$key' = '$value', since that is the current species (which is '$cfg->{species}').\n");
$key = $2;
} else {
verboseOkPrint("Skipping a key for species '$keyIsForThisSpecies' '$value', since that is not the current species (which is '$cfg->{species}').\n");
next; # Skip it, and do NOT add this key to the config hash!
}
}
}
# ==================== Done handling the species-specific prefixes ================
($key !~ m/\s/) or fatalConfigErr($lineNum, $file, "The key '$key' had WHITESPACE in the key name, which is not valid! The offending line was:\n $line\nPlease double-check the config file");
(length($key) > 0) or fatalConfigErr($lineNum, $file, "The key was BLANK or otherwise invalid. The offending line was:\n $line\nPlease double-check the config file");
addKeyValuePairToConfig(\$cfg, $key, $value, $lineNum, $file);
} elsif (scalar(@t) == 2) { # look for any line with TWO ELEMENTS on it (i.e. tab-delimited key and value)
my $key = $t[0]; my $value = $t[1];
addKeyValuePairToConfig(\$cfg, $key, $value, $lineNum, $file);
} else {
fatalConfigErr($lineNum, $file, "A line in the config file could not be interpreted. Perhaps it is incorrectly formatted, or has a 'weird' character in it somewhere. The offending line was:\n $line\nPlease double-check the config file");
}
}
close(CF);
return($cfg);
}
sub trimWhitespace { my $str=shift; $str =~ s/^\s+|\s+$//g; return($str) }; # trims leading and trailing whitespace
sub definedInConfigOrDie($$) { my ($cfg, $itemName) = @_; defined($cfg->{$itemName}) or confess "ERROR in config: Configuration file does not contain a '$itemName' setting--please specify this setting in the config file! Also, double check that it is properly capitalized!"; return 1; }
sub dirExistsInConfigOrDie($$) {
my ($theCfg, $v) = @_;
definedInConfigOrDie($theCfg, $v) and (-d $theCfg->{$v}) or confess "ERROR in config: The specified '$v' directory (" . $theCfg->{$v} . ") does not exist. Double check that the directory exists and is properly capitalized! ";
return 1;
}
sub fileExistsInConfigOrDie($$) {
my ($theCfg, $v) = @_;
definedInConfigOrDie($theCfg, $v) and (-f $theCfg->{$v}) or confess "ERROR in config: The specified '$v' file (expected to be at \"" . $theCfg->{$v} . "\") does not exist. Double check that the file exists and is properly capitalized! ";
return 1;
}
sub printInstallInstructions() {
my $tempCfg;
setupBin($tempCfg);
print "ok\n";
}
sub addExe($$$$$$) {
# The intent of this function, which is not actually used yet, is to list all the required executables
# ...AND ALSO the command used to install them. The idea is that we could then easily install Monkey's dependencies
# ...on a new machine by just iterating through the '$installCmd' variable here.
my ($cfgPtr, $key, $dir, $exe, $desc, $installCmd) = @_;
(!defined($cfgPtr->{bin}->{$key})) or confess "[ERROR] Attempt to DOUBLE DEFINE the key '$key'.\n";
(defined($cfgPtr)) and $cfgPtr->{bin}->{$key} = catfile($dir, $exe); # Save the path to the executable into this specific $key
# Note: right now we aren't saving the exe, dir, desc(ription) or installation cmd
#addExe($cfg, "bowtie2", $binDir, "bowtie2", "Bowtie 2 non-splicing aligner", "brew tap homebrew/science; brew update; brew install bowtie2");
#addExe(undef, undef, undef, "linuxbrew", "linuxbrew software installation system (similar to apt-get)", "brew tap homebrew/science; brew update;");
}
my %VERIFIED_EXE_HASH = (); # hash of executables that we verified do exist and are runnable
sub getBinPath($$) {
# This verifies that binaries exist, but only when they are ***actually about to be USED***.
# That way, we don't have to have every program installed on every system, if those programs aren't used for anything.
my ($cfg, $exeName) = @_;
exists($cfg->{bin}->{$exeName}) or confess "[ERROR] 'getBinPath' failed for the binary named '$exeName'--this is a programming error and indicates that we haven't added '$exeName' to the 'bin' hash yet in bananas_agw.pm. Fix this!";
if (!defined($VERIFIED_EXE_HASH{$exeName})) {
my $exePath = $cfg->{bin}->{$exeName};
(-e $exePath) or confess "[ERROR in bananas_agw.pm]: While checking all the EXEs in cfg->{bin}, we discovered that the hard-coded path '$exePath' (for executable '$exeName') did not seem to have a valid executable! Check to make sure this file actually exists. If that is the wrong location for this executable, you may have to change the expected location in the code in 'bananas.pm'. Note that the problem may also be with your 'BINFBINROOT' enviroment variable (which should be a directory full of binaries). That directory is currently specified as <$GLOBAL_BIN_DIR>.";
if ($exePath =~ m/[.](pl|sh|py)/) {
# no need to check these for executable-ness, because they will be run by another program (perl, bash, python) anyway
} else {
(-x $exePath) or confess "[ERROR in bananas_agw.pm]: We discovered that the hard-coded path (for executable '$exeName') was present, but NOT EXECUTABLE by the current user! Use 'chmod' to investigate this! ";
}
$VERIFIED_EXE_HASH{$exeName} = 1; # Verified to exist / be an executable file
}
return($cfg->{bin}->{$exeName}); # returns the valid path
}
sub setupBin($) {
my ($cfg) = @_;
(scalar(@_) == 1) or confess "Wrong number of args to setupBin. The only argument must be the '$cfg' pointer to the hash where we store everything.";
# Anything in $cfg->{bin} gets checked to make sure it both EXISTS and also CAN BE EXECUTED by the current user.
# Do NOT put non-executables in $cfg->{bin}, or the automatic checking will fail!
defined($cfg->{monkeyPoo}) or confess "[ERROR in code]: 'monkeyPoo' directory must have already been set up in the 'checkConfig' function BEFORE the binary directories can be set up.";
my $binDir = $GLOBAL_BIN_DIR;
my $MONKEY_SUPPORT_SUBDIR = catfile($cfg->{monkeyPoo}, "support");
$cfg->{bin}->{basedir} = $binDir;
$cfg->{bin}->{convertToGenomeBrowserExe} = catfile($binDir,"convert_SAM_or_BAM_for_Genome_Browser.pl");
$cfg->{bin}->{qsub} = catfile($binDir,"qsub"); # queue submission utility
$cfg->{bin}->{qhold} = catfile($binDir,"qhold"); # queue 'hold' utility
$cfg->{bin}->{qrls} = catfile($binDir,"qrls"); # queue 'release' utility (opposite of qhold)
$cfg->{bin}->{bam2bed} = catfile($binDir,"bam2bed");
$cfg->{bin}->{bigWig} = catfile($binDir,"bedGraphToBigWig");
$cfg->{bin}->{bedmap} = catfile($binDir,"bedmap");
$cfg->{bin}->{bedops} = catfile($binDir,"bedops");
$cfg->{bin}->{bigBed} = catfile($binDir,"bedToBigBed");
$cfg->{bin}->{bedtools} = catfile($binDir,"bedtools");
$cfg->{bin}->{bowtie2} = catfile($binDir,"bowtie2");
$cfg->{bin}->{bwa} = catfile($binDir,"bwa");
$cfg->{bin}->{STAR} = $cfg->{bin}->{star}= catfile($binDir,"STAR"); # <== allow both upper AND lower case for the STAR aligner, which is officially capitalized.
# $cfg->{bin}->{hisat2_0_1} = catfile($binDir,"hisat_2_0_1"); # for requesting a specific version
# $cfg->{bin}->{hisat} = catfile($binDir,"hisat_current");
# $cfg->{bin}->{tophat2_1_1} = catfile($binDir,"tophat_2_1_1"); # for requesting a specific version
$cfg->{bin}->{tophat} = catfile($binDir,"tophat");
$cfg->{bin}->{cuffdiff} = catfile($binDir,"cuffdiff");
$cfg->{bin}->{fastqMcf} = catfile($binDir,"fastq-mcf");
$cfg->{bin}->{fastqc} = catfile($binDir,"fastqc");
$cfg->{bin}->{gem} = catfile($binDir,"gem.jar");
$cfg->{bin}->{bcp} = catfile($binDir,"BCP_HM");
$cfg->{bin}->{match} = catfile($binDir,"match");
$cfg->{bin}->{match2moa} = catfile($binDir,"match2moa");
$cfg->{bin}->{moaOverlaps} = catfile($binDir,"removeMoaOverlaps");
$cfg->{bin}->{samtools} = catfile($binDir,"samtools");
$cfg->{bin}->{sortBed} = catfile($binDir,"sort-bed");
$cfg->{bin}->{subreadFeatureCounts} = catfile($binDir,"featureCounts");
$cfg->{bin}->{featureCountsToPlainMatrix}= catfile($MONKEY_SUPPORT_SUBDIR, "featureCounts_to_plain_matrix.sh");
$cfg->{bin}->{alt_analyze_py} = catfile($binDir,"AltAnalyze.py");
$cfg->{bin}->{peakMotifs} = "/data/applications/rsat/rsat/perl-scripts/peak-motifs"; # must be hardcoded to work... dubious. Also requires Perl library "MIME::Lite"
$cfg->{bin}->{R_EXE} = "/usr/bin/R";
$cfg->{bin}->{RSCRIPT_EXE} = "/usr/bin/Rscript"; # Some sub-scripts (like 51c.cluster.R) need this, you but should not need to run it manually.
$cfg->{bin}->{python2_7_exe} = "/bin/python2.7"; # location of a python executable with version >= 2.7
$cfg->{bin}->{atacPython} = "/home/sthomas/envs/atac/bin/python";
$cfg->{bin}->{nucleoatac} = "/home/sthomas/envs/atac/bin/nucleoatac";
$cfg->{bin}->{step1Motif} = catfile($cfg->{monkeyPoo},"s1_getMotifRegex.pl"); # /home/sthomas/tools/utils/s1_getMotifRegex.pl
$cfg->{bin}->{step2Motif} = catfile($cfg->{monkeyPoo},"s2_findLocations.pl"); # /home/sthomas/tools/utils/s2_findLocations.pl
}
sub dieIfBothOptionsExist($$$) {
my ($cfg, $a, $b) = @_;
(!defined($cfg->{$a}) or !defined($cfg->{$b})) or confess "[ERROR] You have defined BOTH '$a' and '$b' in your config file! These are redundant and/or conflicting. Fix this in your config file!";
}
sub checkConfig {
my ($cfg) = @_;
# ================== Fix up the 'studyName' ================
(defined($cfg->{studyName})) or confess "[ERROR] Configuration file must contain a 'studyName' entry. This is just a basic name for your study, like 'myRnaSeqProject'. No periods / unusual characters are allowed! ";
if ($cfg->{studyName} =~ /-/) {
$cfg->{studyName} =~ s/-/_/g; # Changing all hyphens into UNDERSCORES, because hyphens are confusing to certain elements of the pipeline.
print STDERR qq{[NOTE]: the hyphens in the 'studyName' will be converted into underscores--study name is now \"$cfg->{studyName}\".\n};
}
($cfg->{studyName} =~ m/${SAFE_JOB_NAME_REGEXP}/) or confess "[ERROR] Study name must contain ONLY alphanumeric characters and the underscore. No periods! You can use underscores. The offending name was: \"$cfg->{studyName}\" ";
if ($cfg->{studyName} =~ /^[0-9]/) { $cfg->{studyName} = "X".$cfg->{studyName}; ourWarn(qq{WARNING: the 'studyName' cannot START with a number (it will fail in qsub), so we added an 'X' to the beginning---study name is now \"$cfg->{studyName}\".}); }
# ================== Fix up the 'studyName' ================
dirExistsInConfigOrDie($cfg, 'monkeyPoo');
if ($GLOBAL_DEV_MODE) { # MonkeyPoo directory should be changed to "/path/to/monkey/poo/dev_bin" instead of just "/path.../bin/"
$cfg->{monkeyPoo} = dirname($cfg->{monkeyPoo}) . "/dev_" . basename($cfg->{monkeyPoo});
dirExistsInConfigOrDie($cfg, 'monkeyPoo'); # check it AGAIN if we are in dev mode
}
setupBin($cfg); # set up the 'bin' directory. This must happen AFTER the 'monkeyPoo' directory is defined.
dirExistsInConfigOrDie($cfg, "sampleDir");
($cfg->{sampleDir} =~ /^\//) or confess "[ERROR] 'sampleDir' must be a FULL PATHNAME (so it should start with a '/' -- i.e., /path/to/place) and NOT a relative path! The offending name was: $cfg->{sampleDir} "; # check to make sure it starts with a '/' (and is therefore probably a full path)
($cfg->{sampleDir} !~ /[\s]/) or confess "[ERROR] 'sampleDir' must NOT contain whitespaces. The offending name was: $cfg->{sampleDir} ";
if (defined($OUTPUT_DIR_OVERRIDE)) { $cfg->{resultDir} = $OUTPUT_DIR_OVERRIDE; } # <== the user can specifiy --out=/my/directory/path on the command line to override the path in the study design file.
(defined($cfg->{resultDir})) or confess "[ERROR] Configuration file must contain a 'resultDir' entry!";
chomp($cfg->{resultDir}); # (Sometimes a newline can end up here in odd scenarios)
length($cfg->{resultDir}) > 0 or confess "[ERROR] Result directory was blank somehow. This may be a result of a 'bad inode' situation---if you DEFINITELY set 'resultDir=something' in your config file, try changing to your home directory and then changing to this directory again (this fixes the 'bad inode' situation caused by directory renaming). "; # check to make sure it starts with a '/' (and is therefore probably a full path)
($cfg->{resultDir} =~ /^\//) or confess "[ERROR] Result directory must be a FULL PATHNAME (i.e., /path/to/place), so that means it has to start with a '/'---it can NOT be a relative path! The offending name is in brackets here: [$cfg->{resultDir}] "; # check to make sure it starts with a '/' (and is therefore probably a full path)
($cfg->{resultDir} !~ /[\s]/) or confess "[ERROR] Working directory cannot contain whitespaces. The offending name was: <<<$cfg->{resultDir}>>>. Beware of non-printable characters and newlines! ";
mkdirOrDie($cfg->{resultDir});
$cfg->{writeup} = catfile($cfg->{resultDir}, "00a.writeup.txt");
$cfg->{writeup_cite} = catfile($cfg->{resultDir}, "00b.citations.txt");
# Note: these are all FULL PATHS
$cfg->{filterResultsDir} = catfile($cfg->{resultDir},"02a.filtered_fastq");
$cfg->{fastqcResultsDir} = catfile($cfg->{resultDir},"02b.filtered_fastq_fastqc");
$cfg->{mappingResultsDir} = catfile($cfg->{resultDir},"04a.mapping");
$cfg->{qcAfterMappingDir} = catfile($cfg->{resultDir},"04b.mapped_qc_fastqc");
$cfg->{rseqcQualityDir} = catfile($cfg->{resultDir},"04c.mapped_qc_rseqc");
$cfg->{tagResultsDir} = catfile($cfg->{resultDir},"05.tags");
$cfg->{densityResultsDir} = catfile($cfg->{resultDir},"06.tagDensity");
$cfg->{peakResultsDir} = catfile($cfg->{resultDir},"07.peaks");
$cfg->{gemPeakResultsDir} = catfile($cfg->{resultDir},"07a.gemPeaks");
$cfg->{bcpPeakResultsDir} = catfile($cfg->{resultDir},"07b.bcpPeaks");
$cfg->{atacResultsDir} = catfile($cfg->{resultDir},"07c.nucleoAtac");
$cfg->{browserBinResultsDir} = catfile($cfg->{resultDir},"08.browser_tracks_binned");
$cfg->{browserWigResultsDir} = catfile($cfg->{resultDir},"08.browser_wig_and_bam");
$cfg->{windowResultsDir} = catfile($cfg->{resultDir},"09.window_figures");
$cfg->{motifDiscDir} = catfile($cfg->{resultDir},"10.motif_discovery");
$cfg->{subreadCountsDir} = catfile($cfg->{resultDir},"20.subread_counts");
$cfg->{cuffdiffDir} = catfile($cfg->{resultDir},"22.cuffdiff");
$cfg->{cummerbundDir} = catfile($cfg->{resultDir},"23.cummerbund");
$cfg->{edgeRDir} = catfile($cfg->{resultDir},"50.edgeR_diff_expr");
$cfg->{monocleDir} = catfile($cfg->{resultDir},"51a.monocle");
$cfg->{summaryFigureDir} = catfile($cfg->{resultDir},"51b.summary_figures");
$cfg->{clusterFigDir} = catfile($cfg->{resultDir},"51c.cluster_heatmap");
$cfg->{altAnalyzeDir} = catfile($cfg->{resultDir},"52.alt_analyze");
# Anything in $cfg->{r} gets checked for R syntactic validity, so DO NOT put non-R scripts there!
$cfg->{r}->{pairPlot} = catfile($cfg->{monkeyPoo}, "20c.summary.scripts", "R_pair_plot.R");
$cfg->{r}->{cummerbund} = catfile($cfg->{monkeyPoo}, "23b.cummerbund.R");
$cfg->{r}->{edgeR} = catfile($cfg->{monkeyPoo}, "50b.edgeR.R");
$cfg->{r}->{monocle} = catfile($cfg->{monkeyPoo}, "51a.monocle.R");
$cfg->{r}->{window_figures} = catfile($cfg->{monkeyPoo}, "09.xAnalyze.RScript");
$cfg->{subread}->{countsFile} = "subread.featureCounts.counts.txt"; # <-- literal filename that will be looked for later
$cfg->{edgeR}->{fpkmFilePath} = catfile($cfg->{edgeRDir}, "fpkm_naive.txt"); # note: hard coded in the edgeR script for now
$cfg->{RSEQC}->{rseqc_parent_dir} = "/usr/bin/"; # "/data/applications/rseqc/rseqc-2.6.1/build/scripts-2.7/"; # Location of the RSEQC python files
# Set default values for certain unspecified variables.
setIfUndefined($cfg, 'shouldFilterAdapters', "TRUE"); # Default is to filter adapters. If you don't want this, then specify "shouldFilterAdapters = FALSE".
setIfUndefined($cfg, 'genomeFasta' , $NA_VALUE);
setIfUndefined($cfg, 'tracksShowEveryRead' , "FALSE"); # Default value: do NOT show each read (in other words, generate individual-read BAM files) for the 'browserWigs' part of the Genome Browser (s48_browser_wiggle). For that step, instead, only generate "wiggle" tracks. This is because BAM files are very large. Does not affect the generation of browser 'bin' tracks (step 8, s08a_browser_bins), which is unrelated.
if (defined($cfg->{skipFastQC})) { fatalConfigErr("(No line number)", $cfg->{filename}, "ERROR: Sorry, the 'skipFastQC' option has been deprecated. Instead of specifying skipFastQC=FALSE, change it to doQC=TRUE . Sorry for the inconvenience!") }
dieIfBothOptionsExist($cfg, 'skipQC' ,'doQC');
dieIfBothOptionsExist($cfg, 'skipWindows' ,'doWindows');
dieIfBothOptionsExist($cfg, 'skipTagsAndDensity','doDensity');
# Handle the older style of "skip a thing that isn't specified"
if (defined($cfg->{skipQC})) { $cfg->{doQC} = isLiteralStringTrue($cfg->{skipQC}) ? "F" : "T" } # yes, the literal strings "T" and "F"--not perl values yet!
if (defined($cfg->{skipWindows})) { $cfg->{doWindows} = isLiteralStringTrue($cfg->{skipWindows}) ? "F" : "T" } # also note that it's BACKWARDS: skipQC=T means doQC=F
if (defined($cfg->{skipTagsAndDensity})) { $cfg->{doDensity} = isLiteralStringTrue($cfg->{skipTagsAndDensity}) ? "F" : "T" }
if (!defined($cfg->{doQC})) { $cfg->{doQC} = "TRUE"; } # QC things if we didn't explicitly say not to!
for my $boolOption (@OK_BOOLEAN_KEY_NAMES) { # Only run this AFTER we check for the old-style naming for 'skipQC' and the like, above
isBooleanStringOrMissing($cfg->{$boolOption}) or confess "[ERROR] Configuration file problem: '$boolOption' was defined as \"$cfg->{$boolOption}\", but it must be the literal text TRUE (T), FALSE (F), or omitted entirely. Blank values are NOT ACCEPTABLE!";
$cfg->{$boolOption} = isLiteralStringTrue($cfg->{$boolOption});
}
if (defined($cfg->{species})) {
( $cfg->{species} =~ m/\S/ ) or fatalConfigErr("(No line number)", $cfg->{filename}, "\n\n[ERROR] The 'species' variable is defined, but is BLANK. Unacceptable! Set it to something.\n");
($cfg->{species} =~ m/${SAFE_SPECIES_REGEXP}/) or fatalConfigErr("(No line number)", $cfg->{filename}, "\n\n[ERROR] The 'species' variable contains a SPACE or other illegal character! Unacceptable! Fix this. Right now, it is this: $cfg->{species}\n");
(not $cfg->{species} =~ m/(^__|__$)/) or fatalConfigErr("(No line number)", $cfg->{filename}, "\n\n[ERROR] The 'species' variable needs to be set to something VALID, but it looks like it's still our sample one (specifically, it is this: $cfg->{species}), since it has two underscores at the beginning and/or at the end---set it to the real species identifier without underscores at the beginning/end!\n");
}
(defined($cfg->{minMapQ})) or confess "[ERROR] MISSING CONFIG FILE OPTION: Configuration file must contain a 'minMapQ' entry! This is the minimum map quality (MAPQ) that a read must posses in order to NOT be disqualified after alignment. It should be between 0 (meaning 'keep everything') and 100. A standard value is 30.";
($cfg->{minMapQ} =~ /^[0-9]+$/ and $cfg->{minMapQ} >= 0 and $cfg->{minMapQ} <= 100) or confess "[ERROR] The 'minMapQ' option in the config file must be between 0 and 100, inclusive. The invalid map quality score specified in the file was the following: $cfg->{minMapQ}\n ";
if ($cfg->{'shouldFilterAdapters'}) { fileExistsInConfigOrDie($cfg, 'libraryAdapterFile'); } # If the user wants to filter adapters, make sure the adapter file exists!
else { $cfg->{'libraryAdapterFile'} = "<Since we are not filtering adapters--this file should NOT be specified or used ever. Using this text for anything real is an ERROR.>"; }
isNA($cfg->{'genomeFasta'}) or fileExistsInConfigOrDie($cfg, 'genomeFasta');
if ($cfg->{'forceRerun'}) { print STDERR "[NOTE] Forcing re-run even of completed steps, due to the 'forceRerun=TRUE' line in the config file."; $FORCE_RERUN = 1; } # The user can either specify '-f' on the command line, or set 'forceRerun=TRUE' in the config file.
if (!defined($cfg->{'bedAnnotForRSEQC'})) { ourWarn("Note: the 'bedAnnotForRSEQC' parameter was NOT DEFINED in the config file. We will skip this QC step until you define that file (or manually set it to 'NA')."); }
if ($cfg->{doQC} && (!isNA($cfg->{'bedAnnotForRSEQC'})) && defined($cfg->{'bedAnnotForRSEQC'})) {
($cfg->{'bedAnnotForRSEQC'} =~ /.bed$/i) or confess "[ERROR] The 'bedAnnotForRSEQC' file ('$cfg->{bedAnnotForRSEQC}') did not end with .bed, so it's probably not the right file type! Fix this (or set it to 'NA', or add doQC=FALSE to the config file if you don't care about QC at all)!";
dieIfFileAccessFails($cfg->{'bedAnnotForRSEQC'}, "Configuration file problem: unless you skip the QC step, you MUST have a bed annotation file defined as 'bedAnnotForRSEQC'--this file is necessary for rseQC to function! You either didn't define a variable, or you passed in the invalid file location '$cfg->{bedAnnotForRSEQC}'! Check your config file and verify that a file is in fact at that location.");
}
# ======================== CHECK THE ALIGNER ====================
(exists($cfg->{'aligner'})) or confess "[ERROR] MISSING CONFIG FILE OPTION: The configuration file must specify an ALIGNER (the 'aligner') option, which was not specified!";
$cfg->{'aligner'} = lc($cfg->{'aligner'}); # Convert the aligner name to LOWER CASE!
$cfg->{'aligner'} =~ s/^bowtie$/bowtie2/i; # Plain 'bowtie' always refers to 'bowtie2'. Always refer to it as 'bowtie2' from now on.
$cfg->{'aligner'} =~ s/^tophat2$/tophat/i; # Tophat, on the other hand, is always version 2, but is called 'tophat'. Always call it plain 'tphat' from now on.
(grep(/^$cfg->{aligner}$/, @OK_ALIGNERS)) or confess "[ERROR] The configuration file specified the following UNRECOGNIZED aligner: \"$cfg->{aligner}\". We expect the aligner to be something like 'bowtie' or 'tophat' or 'bwa'.\n";
# ======================== HANDLE THINGS SPECIFIC TO CERTAIN ALIGNER(S) ====================
if ($cfg->{'aligner'} =~ m/tophat/i) {
(defined($cfg->{gtfFile})) or confess "[ERROR] MISSING CONFIG FILE OPTION: If you specify 'tophat' as your aligner, you must **EITHER** (1) specify a valid GTF file with the 'gtfFile' parameter **OR** 2) Specify 'gtfFile=${NA_VALUE}' in the config file to explicitly indicate that there is no GTF file.";
verifyThatTophatSegmentJuncsCanRun($cfg) or confess "[ERROR] Failed to successfully launch the 'segment_juncs' executable.";
}
if ($cfg->{'aligner'} =~ m/^(tophat|star)/i) {
# Aligner can use a GTF file
(isNA($cfg->{gtfFile})) or (dieIfFileAccessFails($cfg->{gtfFile}, "The specified GTF annotation file ('gtfFile' setting in the config file) '$cfg->{gtfFile}' was NOT readable on the filesystem. Tophat and STAR requires one of two things: 1) either this file exist, OR 2) 'gtfFile' is set to the special value of 'gtfFile=${NA_VALUE}' to indicate that there is no such file."));
# Validate GTF if it's present maybe?? validate_gtf.pl can maybe do this.
}
if ($cfg->{'aligner'} =~ m/^(tophat|bowtie)/i) {
definedInConfigOrDie($cfg, "bowtie2Index");
($cfg->{bowtie2Index} !~ m/[.]bt2$/) or confess "[ERROR] Configuration file error---the 'bowtie2Index' variable has a '.bt2' suffix on it, which it is not supposed to! Remove any file suffixes from this variable! Example: CORRECT = /path/to/hg19 . INCORRECT = /path/to/hg19.1.bt2 <== do not do that. ";
dieIfFileAccessFails("$cfg->{bowtie2Index}.1.bt2", "ALIGNER IS MISSING THE REQUIRED 'bowtie2' FORMAT INDEX FILE (those index files that end in '.bt2'). You probably need to either change the specified bowtie2 index location OR run 'bowtie2-build' on your input genome fasta file to build these indexes in the first place.");
}
if ($cfg->{'aligner'} =~ m/^(bwa)/i) { # A bwa index has a TON of files that must exist.
definedInConfigOrDie($cfg, "bwaIndex");
foreach my $suf (".amb", ".ann", ".bwt", ".pac", ".sa") { dieIfFileAccessFails("$cfg->{bwaIndex}${suf}", "MISSING THE REQUIRED '${suf}' FORMAT BWA INDEX FILE! Remember that the bam index is just the PREFIX for the files. So do not include '.bwt' or anything in the bwaIndex variable."); }
}
if ($cfg->{aligner} =~ m/^(star)/i) {
definedInConfigOrDie($cfg, "starIndexDir");
dirExistsInConfigOrDie($cfg, "starIndexDir"); # <== it needs to also be a DIRECTORY. Example of a valid starIndexDir: /data/info/genome/mm9_ensembl_igenome_with_chr/STAR_index_mm9.chr/
foreach my $starfile ("SA", "SAindex", "Genome", "chrLength.txt") {
dieIfFileAccessFails("$cfg->{starIndexDir}/${starfile}", "MISSING THE REQUIRED '${starfile}' FILE that the STAR ALIGNER needs. Make sure this file exists!");
}
}
if ($cfg->{aligner} =~ m/^(hisat)/i) {
confess "[ERROR] HISAT is not supported yet";
}
setIfUndefined($cfg, 'globalDoPeaks' , 0);
setIfUndefined($cfg, 'bowtie2Index' , $NA_VALUE);
setIfUndefined($cfg, 'bwaIndex' , $NA_VALUE);
setIfUndefined($cfg, 'starIndexDir' , $NA_VALUE);
# ======================== CHECK THE INPUT files (for ChIP-seq usually)... if any) ====================
if (defined($cfg->{"input"})) {
foreach my $k (keys(%{$cfg->{"input"}})) {
#next if (isNA($k)); # The "NA" values don't need to be checked--they do not exist!
foreach my $k2 (keys(%{$cfg->{"input"}->{$k}})) {
#next if (isNA($k2)); # <== it's OK if a corresponding input file is NA maybe?
dieIfFileAccessFails(catfile($cfg->{sampleDir}, $k2), "[ERROR] The ChIP-seq 'input' file for key '$k' and '$k2' and <$cfg->{sampleDir}/${k2}> was missing or otherwise UNREADABLE");
}
}
}
(defined($cfg->{'sample'})) or confess "[ERROR] Configuration file does not contain any 'sample' lines. e.g.:\nsample <sampleLabel> <fullPathToSampleFile>. Note that the samples have to start with the LITERAL string 'sample', not the name of the sample!";
while ( my($k,$th2) = each( %{$cfg->{'sample'}} ) ) {
foreach my $sampkey (keys %$th2) {
dieIfFileAccessFails(catfile($cfg->{sampleDir}, $sampkey), "[ERROR] The 'sample' file for key '$k' and '$sampkey' <$cfg->{sampleDir}/$sampkey> was unreadable\n ...maybe your sample directory was set incorrectly? It was set to: \"$cfg->{sampleDir}\" \n... you should double check it");
}
my $th3 = $cfg->{details}->{$k};
if (defined($th3->{input})) {
my $tch = $cfg->{input};
defined($tch->{$th3->{input}}) or confess "[ERROR] The 'Input' line that corresponds to sample '$k' lists a matched input ($th3->{input}) that isn't actually specified in the configuration file.";
}
}
if (isNA($cfg->{genomeFasta})) {
confess "[ERROR] That's weird, we expect you to ALWAYS provide a genome fasta file (like 'hg19.fa' or something), but apparently you did not? Failing in the code here in bananas_agw.pm.";
}
$cfg->{genomeName} = basename($cfg->{genomeFasta});
$cfg->{genomeName} =~ s/[.](fa|fasta)//; # infer the genome name from the (required) input FASTA file. Remove the .fa/.fasta file extensions.
if (not $cfg->{doDensity}) {
$cfg->{globalDoPeaks} and ourWarn("[WARNING]: You are skipping tags and density---but you have data that needs peaks (you have either chip-seq or exome-seq data)! By specifying 'doDensity = FALSE', you skip EVERYTHING that relies on this downstream (browserBins, Sean's motif finding, etc.)");
$cfg->{browserBins} and ourWarn("[WARNING]: You are skipping tags and density---but you have data that needs peaks (you have either chip-seq or exome-seq data)! By specifying 'doDensity = FALSE', you skip EVERYTHING that relies on this downstream (browserBins, Sean's motif finding, etc.)");
$cfg->{doWindows} and ourWarn("[WARNING]: You are skipping tags and density---but you did NOT skip the 'window generation' step. Window generation cannot occur if you skipped tags and density! By specifying 'doDensity = FALSE', you skip EVERYTHING that relies on this downstream (browserBins, Sean's motif finding, etc.)");
$cfg->{globalDoPeaks} = 0; # Can't do peak calling without tags/density...
$cfg->{browserBins} = 0; # Can't make browserBins without tags/density...
$cfg->{doWindows} = 0; # If you skip tags and density, you ALSO have to skip window-generation...
}
if ($cfg->{browserBins} or $cfg->{browserWigs}) {
(defined($cfg->{tracksURL})) or confess "[ERROR] MISSING CONFIG FILE OPTION: browser track URL basename ('tracksURL') needs to be specified in the config file. Or, if you don't want browser tracks, add 'browserBins = FALSE' and 'browserWigs = FALSE' to your config file. You can also specify 'NA' if you don't want a valid URL to be generated.";
(defined($cfg->{tracksRsync})) or confess "[ERROR] MISSING CONFIG FILE OPTION: if you want browser tracks to be synced to a remote server, you need to specify a browser track rsync location basename ('tracksRsync') in the config file. OR you can just specify 'NA' to avoid copying the files entirely.";
($cfg->{tracksURL} =~ m/^https:\/\//) or ourWarn("WARNING: in 'tracksURL' specification: the 'tracksURL' parameter (\"$cfg->{tracksURL}\") should probably start with https:// . Note the 's' in httpS---not just http!");
if ($cfg->{browserBins}) {
dieIfFileAccessFails($cfg->{chromSizesFile}, "For making the 'browserBins' browser tracks, we NEED a valid chromSizesFile file, but the config file does not specify a valid one!");