Make quality trimming and filtering optional (#18)
dfornika authored Mar 26, 2024
1 parent 43ecdf3 commit 0a424e8
Showing 5 changed files with 85 additions and 37 deletions.
36 changes: 27 additions & 9 deletions README.md
@@ -124,6 +124,25 @@ nextflow run BCCDC-PHL/downsample-reads \

...will add the file `test_downsampling_summary.csv` to the outdir.

### Quality Trimming & Filtering

By default, input fastq files are run through [fastp](https://github.com/OpenGene/fastp) with its default settings. This
means that [quality filtering](https://github.com/OpenGene/fastp?tab=readme-ov-file#quality-filter) is applied to remove
poor-quality reads, but [quality trimming](https://github.com/OpenGene/fastp?tab=readme-ov-file#per-read-cutting-by-quality-score)
is not.

To disable quality filtering, use the `--disable_quality_filtering` flag. To enable quality trimming, use the `--enable_quality_trimming`
flag. For example:

```
nextflow run BCCDC-PHL/downsample-reads \
-profile conda \
--cache ~/.conda/envs \
--samplesheet_input samplesheet.csv \
--disable_quality_filtering \
--enable_quality_trimming \
--outdir </path/to/output_dir>
```
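
With both flags set as above, the fastp command issued for each sample's original (pre-downsampling) reads looks roughly like the following. This is a sketch based on the `fastp` process in `modules/downsample_reads.nf` from this commit; the sample and file names are placeholders:

```
fastp \
  -i sampleA_R1.fastq.gz \
  -I sampleA_R2.fastq.gz \
  --cut_tail \
  --disable_quality_filtering \
  -o sampleA_R1.trim.fastq.gz \
  -O sampleA_R2.trim.fastq.gz \
  -j sampleA_original_fastp.json
```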

## Output

@@ -179,10 +198,10 @@ In the output directory for each sample, a provenance file will be written with
nextflow_session_id: ceb7cc4c-644b-47bd-9469-5f3a7658119f
nextflow_run_name: voluminous_jennings
analysis_start_time: 2024-03-19T15:23:43.570174-07:00
- input_filename: NC000962_R1.fastq.gz
- filename: NC000962_R1.fastq.gz
file_type: fastq-input
sha256: 2793587aeb2b87bece4902183c295213a7943ea178c83f8b5432594d4b2e3b84
- input_filename: NC000962_R2.fastq.gz
- filename: NC000962_R2.fastq.gz
file_type: fastq-input
sha256: 336e4c42a60f22738c87eb1291270ab4ddfd918f32fa1fc662421d4f9605ea59
- process_name: fastp
@@ -201,13 +220,12 @@ In the output directory for each sample, a provenance file will be written with
value: 10
- parameter: --genome-size
value: 4.4m
- process_name: fastp
tools:
- tool_name: fastp
tool_version: 0.23.2
parameters:
- parameter: --cut_tail
value: null
- filename: NC000962-downsample-10x_R1.fastq.gz
file_type: fastq-output
sha256: 2fe74753d889d1b6f02832a09b10a1cab51b1fb2e16a2af20577277aded07a83
- filename: NC000962-downsample-10x_R2.fastq.gz
file_type: fastq-output
sha256: b6041ce11ccad3522b3f0ae4117967839ccad78a90e90f106ac399e2e23a8000
```
If multiple coverage levels are specified for a sample, then multiple provenance files will be created (one for each coverage level).
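
For example, requesting both 10x and 30x coverage for one sample would produce downsampled reads and provenance files along these lines (illustrative names, following the `-downsample-<coverage>x` pattern shown above; the exact provenance file names come from the `collect_provenance` process, which is not shown in this diff):

```
NC000962-downsample-10x_R1.fastq.gz
NC000962-downsample-10x_R2.fastq.gz
NC000962-downsample-30x_R1.fastq.gz
NC000962-downsample-30x_R2.fastq.gz
NC000962_10x_provenance.yml
NC000962_30x_provenance.yml
```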
21 changes: 12 additions & 9 deletions main.nf
@@ -2,12 +2,13 @@

nextflow.enable.dsl = 2

include { hash_files } from './modules/hash_files.nf'
include { fastp as fastp_input } from './modules/downsample_reads.nf'
include { downsample } from './modules/downsample_reads.nf'
include { fastp as fastp_output } from './modules/downsample_reads.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'
include { hash_files as hash_fastq_input } from './modules/hash_files.nf'
include { hash_files as hash_fastq_output } from './modules/hash_files.nf'
include { fastp as fastp_input } from './modules/downsample_reads.nf'
include { downsample } from './modules/downsample_reads.nf'
include { fastp as fastp_output } from './modules/downsample_reads.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'

workflow {

@@ -29,14 +30,16 @@ workflow {

main:

hash_files(ch_fastq.map{ it -> [it[0], it[1]] }.combine(Channel.of("fastq-input")))
hash_fastq_input(ch_fastq.join(ch_coverages).map({ it -> [it[0], it[2], it[1]] }).combine(Channel.of("fastq-input")))

ch_fastp_input = ch_fastq.join(ch_coverages.map({ it -> [it[0], it[2]] }))

fastp_input(ch_fastp_input.combine(Channel.of("original")))

downsample(ch_fastq.join(ch_coverages))

hash_fastq_output(downsample.out.reads.map{ it -> [it[0], it[3], it[1]] }.combine(Channel.of("fastq-output")))

fastp_output(downsample.out.reads)

fastp_input.out.csv.concat(fastp_output.out.csv).map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_downsampling_summary.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it[0] })
@@ -50,10 +53,10 @@ workflow {
ch_provenance = ch_sample_ids_with_coverages
ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata)
ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map({ it -> [it[0], it[1], [it[2]]] })
ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(hash_fastq_input.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(fastp_input.out.provenance).map{ it -> [it[0], it[1], it[2] << it[4]] }
ch_provenance = ch_provenance.join(downsample.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(fastp_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
ch_provenance = ch_provenance.join(hash_fastq_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }

collect_provenance(ch_provenance.map{ it -> [it[0], it[1], it[2].minus(null)] })
}
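
The provenance channels are now keyed on both sample ID and coverage, hence the repeated `join(..., by: [0, 1])` calls above. A minimal, self-contained sketch of how that composite-key join behaves (hypothetical channel contents):

```
ch_a = Channel.of(['sampleA', '10', 'prov_10'], ['sampleA', '30', 'prov_30'])
ch_b = Channel.of(['sampleA', '10', 'hash_10'], ['sampleA', '30', 'hash_30'])
// Match tuples on elements 0 and 1 (sample_id, coverage);
// the remaining elements of each tuple are concatenated.
ch_a.join(ch_b, by: [0, 1]).view()
// => [sampleA, 10, prov_10, hash_10]
// => [sampleA, 30, prov_30, hash_30]
```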
41 changes: 30 additions & 11 deletions modules/downsample_reads.nf
@@ -10,31 +10,50 @@ process fastp {
output:
tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_fastp.json"), emit: json
tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_downsampling_summary.csv"), emit: csv
tuple val(sample_id), val(target_coverage), path("${sample_id}_${target_coverage_filename}_fastp_provenance.yml"), emit: provenance
tuple val(sample_id), val(target_coverage), path("${sample_id}_original_fastp_provenance.yml"), emit: provenance, optional: true

script:
if (target_coverage == 'original') {
target_coverage_filename = 'original'
} else {
target_coverage_filename = target_coverage + 'x'
}
if (target_coverage == 'original' && params.enable_quality_trimming) {
quality_trimming = '--cut_tail'
} else {
quality_trimming = ''
}
if (target_coverage == 'original' && params.disable_quality_filtering) {
quality_filtering = '--disable_quality_filtering'
} else {
quality_filtering = ''
}
"""
printf -- "- process_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " tools:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " - tool_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
if [[ "${quality_trimming}" != "" || "${quality_filtering}" != "" ]]; then
printf -- " parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
if [[ "${quality_trimming}" != "" ]]; then
printf -- " - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
if [[ "${quality_filtering}" != "" ]]; then
printf -- " - parameter: --disable_quality_filtering\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
printf -- " value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
fi
fastp \
-t ${task.cpus} \
-i ${reads[0]} \
-I ${reads[1]} \
--cut_tail \
-o ${sample_id}_R1.trim.fastq.gz \
-O ${sample_id}_R2.trim.fastq.gz \
-j ${sample_id}_${target_coverage_filename}_fastp.json
-t ${task.cpus} \
-i ${reads[0]} \
-I ${reads[1]} \
${quality_trimming} \
${quality_filtering} \
-o ${sample_id}_R1.trim.fastq.gz \
-O ${sample_id}_R2.trim.fastq.gz \
-j ${sample_id}_${target_coverage_filename}_fastp.json
echo "target_coverage" >> coverage_field.csv
echo ${target_coverage} >> coverage_field.csv
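With both options enabled, the conditional `printf` blocks above would emit a provenance fragment along these lines (a sketch; the version string depends on the fastp build in the environment):

```
- process_name: fastp
  tools:
    - tool_name: fastp
      tool_version: 0.23.2
      parameters:
        - parameter: --cut_tail
          value: null
        - parameter: --disable_quality_filtering
          value: null
```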
22 changes: 14 additions & 8 deletions modules/hash_files.nf
@@ -3,20 +3,26 @@ process hash_files {
tag { sample_id + " / " + file_type }

input:
tuple val(sample_id), path(files_to_hash), val(file_type)
tuple val(sample_id), val(coverage), path(files_to_hash), val(file_type)

output:
tuple val(sample_id), path("${sample_id}_${file_type}.sha256.csv"), emit: csv
tuple val(sample_id), path("${sample_id}_${file_type}_provenance.yml"), emit: provenance
tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}.sha256.csv"), emit: csv
tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}_provenance.yml"), emit: provenance

script:
if (coverage == "original") {
coverage_filename = "original"
} else {
coverage_filename = coverage + "x"
}

"""
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' >> ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
while IFS=',' read -r hash filename; do
printf -- "- input_filename: \$filename\\n" >> ${sample_id}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml;
done < ${sample_id}_${file_type}.sha256.csv
printf -- "- filename: \$filename\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
printf -- " file_type: ${file_type}\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
printf -- " sha256: \$hash\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
done < ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
"""

}
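
For a paired-end input hashed at 30x coverage, the intermediate CSV and the provenance fragment built from it would look roughly like this (hypothetical sample name and truncated hashes):

```
# sampleA_30x_fastq-input.sha256.csv
2793587a...,sampleA_R1.fastq.gz
336e4c42...,sampleA_R2.fastq.gz

# sampleA_30x_fastq-input_provenance.yml
- filename: sampleA_R1.fastq.gz
  file_type: fastq-input
  sha256: 2793587a...
- filename: sampleA_R2.fastq.gz
  file_type: fastq-input
  sha256: 336e4c42...
```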
2 changes: 2 additions & 0 deletions nextflow.config
@@ -17,6 +17,8 @@ params {
coverages = 'NO_FILE'
coverage = 30
genome_size = '5m'
enable_quality_trimming = false
disable_quality_filtering = false
collect_outputs = false
collected_outputs_prefix = 'collected'
}
