Make quality trimming and filtering optional (#18)
dfornika authored Mar 26, 2024
1 parent 43ecdf3 commit 0a424e8
Showing 5 changed files with 85 additions and 37 deletions.
36 changes: 27 additions & 9 deletions README.md
@@ -124,6 +124,25 @@ nextflow run BCCDC-PHL/downsample-reads \

...will add the file `test_downsampling_summary.csv` to the outdir.

### Quality Trimming & Filtering

By default, input fastq files are run through [fastp](https://github.com/OpenGene/fastp) with its default settings. This
means that [quality filtering](https://github.com/OpenGene/fastp?tab=readme-ov-file#quality-filter) is applied to remove
poor-quality reads, but [quality trimming](https://github.com/OpenGene/fastp?tab=readme-ov-file#per-read-cutting-by-quality-score)
is not.

To disable quality filtering, use the `--disable_quality_filtering` flag. To enable quality trimming, use the `--enable_quality_trimming`
flag. For example:

```
nextflow run BCCDC-PHL/downsample-reads \
-profile conda \
--cache ~/.conda/envs \
--samplesheet_input samplesheet.csv \
--disable_quality_filtering \
--enable_quality_trimming \
--outdir </path/to/output_dir>
```
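
With both flags set as above, the fastp command issued for each sample's original (pre-downsampling) reads looks roughly like the following. This is a sketch based on the `fastp` process in `modules/downsample_reads.nf` from this commit; the sample and file names are placeholders:

```
fastp \
  -i sampleA_R1.fastq.gz \
  -I sampleA_R2.fastq.gz \
  --cut_tail \
  --disable_quality_filtering \
  -o sampleA_R1.trim.fastq.gz \
  -O sampleA_R2.trim.fastq.gz \
  -j sampleA_original_fastp.json
```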

## Output

@@ -179,10 +198,10 @@ In the output directory for each sample, a provenance file will be written with
nextflow_session_id: ceb7cc4c-644b-47bd-9469-5f3a7658119f
nextflow_run_name: voluminous_jennings
analysis_start_time: 2024-03-19T15:23:43.570174-07:00
- input_filename: NC000962_R1.fastq.gz
- filename: NC000962_R1.fastq.gz
file_type: fastq-input
sha256: 2793587aeb2b87bece4902183c295213a7943ea178c83f8b5432594d4b2e3b84
- input_filename: NC000962_R2.fastq.gz
- filename: NC000962_R2.fastq.gz
file_type: fastq-input
sha256: 336e4c42a60f22738c87eb1291270ab4ddfd918f32fa1fc662421d4f9605ea59
- process_name: fastp
@@ -201,13 +220,12 @@ In the output directory for each sample, a provenance file will be written with
value: 10
- parameter: --genome-size
value: 4.4m
- process_name: fastp
tools:
- tool_name: fastp
tool_version: 0.23.2
parameters:
- parameter: --cut_tail
value: null
- filename: NC000962-downsample-10x_R1.fastq.gz
file_type: fastq-output
sha256: 2fe74753d889d1b6f02832a09b10a1cab51b1fb2e16a2af20577277aded07a83
- filename: NC000962-downsample-10x_R2.fastq.gz
file_type: fastq-output
sha256: b6041ce11ccad3522b3f0ae4117967839ccad78a90e90f106ac399e2e23a8000
```
If multiple coverage levels are specified for a sample, then multiple provenance files will be created (one for each coverage level).
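
For example, requesting both 10x and 30x coverage for one sample would produce downsampled reads and provenance files along these lines (illustrative names, following the `-downsample-<coverage>x` pattern shown above; the exact provenance file names come from the `collect_provenance` process, which is not shown in this diff):

```
NC000962-downsample-10x_R1.fastq.gz
NC000962-downsample-10x_R2.fastq.gz
NC000962-downsample-30x_R1.fastq.gz
NC000962-downsample-30x_R2.fastq.gz
NC000962_10x_provenance.yml
NC000962_30x_provenance.yml
```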
21 changes: 12 additions & 9 deletions main.nf
@@ -2,12 +2,13 @@

nextflow.enable.dsl = 2

include { hash_files } from './modules/hash_files.nf'
include { fastp as fastp_input } from './modules/downsample_reads.nf'
include { downsample } from './modules/downsample_reads.nf'
include { fastp as fastp_output } from './modules/downsample_reads.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'
include { hash_files as hash_fastq_input } from './modules/hash_files.nf'
include { hash_files as hash_fastq_output } from './modules/hash_files.nf'
include { fastp as fastp_input } from './modules/downsample_reads.nf'
include { downsample } from './modules/downsample_reads.nf'
include { fastp as fastp_output } from './modules/downsample_reads.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'

workflow {

@@ -29,14 +30,16 @@ workflow {

main:

hash_files(ch_fastq.map{ it -> [it[0], it[1]] }.combine(Channel.of("fastq-input")))
hash_fastq_input(ch_fastq.join(ch_coverages).map({ it -> [it[0], it[2], it[1]] }).combine(Channel.of("fastq-input")))

ch_fastp_input = ch_fastq.join(ch_coverages.map({ it -> [it[0], it[2]] }))

fastp_input(ch_fastp_input.combine(Channel.of("original")))

downsample(ch_fastq.join(ch_coverages))

hash_fastq_output(downsample.out.reads.map{ it -> [it[0], it[3], it[1]] }.combine(Channel.of("fastq-output")))

fastp_output(downsample.out.reads)

fastp_input.out.csv.concat(fastp_output.out.csv).map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_downsampling_summary.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it[0] })
@@ -50,10 +53,10 @@ workflow {
ch_provenance = ch_sample_ids_with_coverages
ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata)
ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map({ it -> [it[0], it[1], [it[2]]] })
ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(hash_fastq_input.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(fastp_input.out.provenance).map{ it -> [it[0], it[1], it[2] << it[4]] }
ch_provenance = ch_provenance.join(downsample.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(fastp_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(hash_fastq_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }

collect_provenance(ch_provenance.map{ it -> [it[0], it[1], it[2].minus(null)] })
}
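
The provenance channels are now keyed on both sample ID and coverage, hence the repeated `join(..., by: [0, 1])` calls above. A minimal, self-contained sketch of how that composite-key join behaves (hypothetical channel contents):

```
ch_a = Channel.of(['sampleA', '10', 'prov_10'], ['sampleA', '30', 'prov_30'])
ch_b = Channel.of(['sampleA', '10', 'hash_10'], ['sampleA', '30', 'hash_30'])
// Match tuples on elements 0 and 1 (sample_id, coverage);
// the remaining elements of each tuple are concatenated.
ch_a.join(ch_b, by: [0, 1]).view()
// => [sampleA, 10, prov_10, hash_10]
// => [sampleA, 30, prov_30, hash_30]
```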
41 changes: 30 additions & 11 deletions modules/downsample_reads.nf
@@ -10,31 +10,50 @@ process fastp {
output:
tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_fastp.json"), emit: json
tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_downsampling_summary.csv"), emit: csv
tuple val(sample_id), val(target_coverage), path("${sample_id}_${target_coverage_filename}_fastp_provenance.yml"), emit: provenance
tuple val(sample_id), val(target_coverage), path("${sample_id}_original_fastp_provenance.yml"), emit: provenance, optional: true

script:
if (target_coverage == 'original') {
target_coverage_filename = 'original'
} else {
target_coverage_filename = target_coverage + 'x'
}
if (target_coverage == 'original' && params.enable_quality_trimming) {
quality_trimming = '--cut_tail'
} else {
quality_trimming = ''
}
if (target_coverage == 'original' && params.disable_quality_filtering) {
quality_filtering = '--disable_quality_filtering'
} else {
quality_filtering = ''
}
"""
printf -- "- process_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " tools:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " - tool_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
if [[ "${quality_trimming}" != "" || "${quality_filtering}" != "" ]]; then
printf -- " parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
if [[ "${quality_trimming}" != "" ]]; then
printf -- " - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
if [[ "${quality_filtering}" != "" ]]; then
printf -- " - parameter: --disable_quality_filtering\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
fastp \
-t ${task.cpus} \
-i ${reads[0]} \
-I ${reads[1]} \
--cut_tail \
-o ${sample_id}_R1.trim.fastq.gz \
-O ${sample_id}_R2.trim.fastq.gz \
-j ${sample_id}_${target_coverage_filename}_fastp.json
-t ${task.cpus} \
-i ${reads[0]} \
-I ${reads[1]} \
${quality_trimming} \
${quality_filtering} \
-o ${sample_id}_R1.trim.fastq.gz \
-O ${sample_id}_R2.trim.fastq.gz \
-j ${sample_id}_${target_coverage_filename}_fastp.json
echo "target_coverage" >> coverage_field.csv
echo ${target_coverage} >> coverage_field.csv
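With both options enabled, the conditional `printf` blocks above would emit a provenance fragment along these lines (a sketch; the version string depends on the fastp build in the environment):

```
- process_name: fastp
  tools:
    - tool_name: fastp
      tool_version: 0.23.2
      parameters:
        - parameter: --cut_tail
          value: null
        - parameter: --disable_quality_filtering
          value: null
```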
22 changes: 14 additions & 8 deletions modules/hash_files.nf
@@ -3,20 +3,26 @@ process hash_files {
tag { sample_id + " / " + file_type }

input:
tuple val(sample_id), path(files_to_hash), val(file_type)
tuple val(sample_id), val(coverage), path(files_to_hash), val(file_type)

output:
tuple val(sample_id), path("${sample_id}_${file_type}.sha256.csv"), emit: csv
tuple val(sample_id), path("${sample_id}_${file_type}_provenance.yml"), emit: provenance
tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}.sha256.csv"), emit: csv
tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}_provenance.yml"), emit: provenance

script:
if (coverage == "original") {
coverage_filename = "original"
} else {
coverage_filename = coverage + "x"
}

"""
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' >> ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
while IFS=',' read -r hash filename; do
printf -- "- input_filename: \$filename\\n" >> ${sample_id}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml;
done < ${sample_id}_${file_type}.sha256.csv
printf -- "- filename: \$filename\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
done < ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
"""

}
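
For a paired-end input hashed at 30x coverage, the intermediate CSV and the provenance fragment built from it would look roughly like this (hypothetical sample name and truncated hashes):

```
# sampleA_30x_fastq-input.sha256.csv
2793587a...,sampleA_R1.fastq.gz
336e4c42...,sampleA_R2.fastq.gz

# sampleA_30x_fastq-input_provenance.yml
- filename: sampleA_R1.fastq.gz
  file_type: fastq-input
  sha256: 2793587a...
- filename: sampleA_R2.fastq.gz
  file_type: fastq-input
  sha256: 336e4c42...
```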
2 changes: 2 additions & 0 deletions nextflow.config
@@ -17,6 +17,8 @@ params {
coverages = 'NO_FILE'
coverage = 30
genome_size = '5m'
enable_quality_trimming = false
disable_quality_filtering = false
collect_outputs = false
collected_outputs_prefix = 'collected'
}
