diff --git a/README.md b/README.md
index 07c74fe..04275bd 100644
--- a/README.md
+++ b/README.md
@@ -124,6 +124,25 @@ nextflow run BCCDC-PHL/downsample-reads \
 
 ...will add the file `test_downsampling_summary.csv` to the outdir.
 
+### Quality Trimming & Filtering
+
+By default, input fastq files will be run through [fastp](https://github.com/OpenGene/fastp) using its default settings. This
+means that [quality filtering](https://github.com/OpenGene/fastp?tab=readme-ov-file#quality-filter) will be applied to remove
+poor-quality reads, but [quality trimming](https://github.com/OpenGene/fastp?tab=readme-ov-file#per-read-cutting-by-quality-score)
+is not applied.
+
+To disable quality filtering, use the `--disable_quality_filtering` flag. To enable quality trimming, use the `--enable_quality_trimming`
+flag. For example:
+
+```
+nextflow run BCCDC-PHL/downsample-reads \
+  -profile conda \
+  --cache ~/.conda/envs \
+  --samplesheet_input samplesheet.csv \
+  --disable_quality_filtering \
+  --enable_quality_trimming \
+  --outdir </path/to/output_dir>
+```
 
 ## Output
 
@@ -179,10 +198,10 @@ In the output directory for each sample, a provenance file will be written with
   nextflow_session_id: ceb7cc4c-644b-47bd-9469-5f3a7658119f
   nextflow_run_name: voluminous_jennings
   analysis_start_time: 2024-03-19T15:23:43.570174-07:00
-- input_filename: NC000962_R1.fastq.gz
+- filename: NC000962_R1.fastq.gz
   file_type: fastq-input
   sha256: 2793587aeb2b87bece4902183c295213a7943ea178c83f8b5432594d4b2e3b84
-- input_filename: NC000962_R2.fastq.gz
+- filename: NC000962_R2.fastq.gz
   file_type: fastq-input
   sha256: 336e4c42a60f22738c87eb1291270ab4ddfd918f32fa1fc662421d4f9605ea59
 - process_name: fastp
@@ -201,13 +220,12 @@ In the output directory for each sample, a provenance file will be written with
           value: 10
         - parameter: --genome-size
           value: 4.4m
-- process_name: fastp
-  tools:
-    - tool_name: fastp
-      tool_version: 0.23.2
-      parameters:
-        - parameter: --cut_tail
-          value: null
+- filename: NC000962-downsample-10x_R1.fastq.gz
+  file_type: fastq-output
+  sha256: 2fe74753d889d1b6f02832a09b10a1cab51b1fb2e16a2af20577277aded07a83
+- filename: NC000962-downsample-10x_R2.fastq.gz
+  file_type: fastq-output
+  sha256: b6041ce11ccad3522b3f0ae4117967839ccad78a90e90f106ac399e2e23a8000
 ```
 
 If multiple coverage levels are specified for a sample, then multiple provenance files will be created (one for each coverage level).
\ No newline at end of file
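Note that the two new flags only take effect on the initial fastp pass over the untrimmed input (the `target_coverage == 'original'` case); the per-coverage fastp runs after downsampling keep fastp defaults. Condensed as a Groovy sketch (illustrative only, mirroring the `modules/downsample_reads.nf` hunk further down; not an extra patch hunk):

```groovy
// Illustrative condensation of the flag handling added in
// modules/downsample_reads.nf below; both flags are ignored unless
// this is the 'original' (pre-downsampling) fastp pass.
quality_trimming  = (target_coverage == 'original' && params.enable_quality_trimming)
                        ? '--cut_tail' : ''
quality_filtering = (target_coverage == 'original' && params.disable_quality_filtering)
                        ? '--disable_quality_filtering' : ''
// An empty string interpolates to nothing in the fastp command line.
```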
diff --git a/main.nf b/main.nf
index e17366a..a700d3c 100644
--- a/main.nf
+++ b/main.nf
@@ -2,12 +2,13 @@
 
 nextflow.enable.dsl = 2
 
-include { hash_files } from './modules/hash_files.nf'
-include { fastp as fastp_input } from './modules/downsample_reads.nf'
-include { downsample } from './modules/downsample_reads.nf'
-include { fastp as fastp_output } from './modules/downsample_reads.nf'
-include { pipeline_provenance } from './modules/provenance.nf'
-include { collect_provenance } from './modules/provenance.nf'
+include { hash_files as hash_fastq_input }  from './modules/hash_files.nf'
+include { hash_files as hash_fastq_output } from './modules/hash_files.nf'
+include { fastp as fastp_input }            from './modules/downsample_reads.nf'
+include { downsample }                      from './modules/downsample_reads.nf'
+include { fastp as fastp_output }           from './modules/downsample_reads.nf'
+include { pipeline_provenance }             from './modules/provenance.nf'
+include { collect_provenance }              from './modules/provenance.nf'
 
 workflow {
 
@@ -29,7 +30,7 @@ workflow {
 
     main:
 
-    hash_files(ch_fastq.map{ it -> [it[0], it[1]] }.combine(Channel.of("fastq-input")))
+    hash_fastq_input(ch_fastq.join(ch_coverages).map({ it -> [it[0], it[2], it[1]] }).combine(Channel.of("fastq-input")))
 
     ch_fastp_input = ch_fastq.join(ch_coverages.map({ it -> [it[0], it[2]] }))
 
@@ -37,6 +38,8 @@ workflow {
 
     downsample(ch_fastq.join(ch_coverages))
 
+    hash_fastq_output(downsample.out.reads.map{ it -> [it[0], it[3], it[1]] }.combine(Channel.of("fastq-output")))
+
     fastp_output(downsample.out.reads)
 
     fastp_input.out.csv.concat(fastp_output.out.csv).map{ it -> it[1] }.collectFile(name: params.collected_outputs_prefix + "_downsampling_summary.csv", storeDir: params.outdir, keepHeader: true, skip: 1, sort: { it -> it[0] })
@@ -50,10 +53,10 @@ workflow {
     ch_provenance = ch_sample_ids_with_coverages
     ch_pipeline_provenance = pipeline_provenance(ch_workflow_metadata)
     ch_provenance = ch_provenance.combine(ch_pipeline_provenance).map({ it -> [it[0], it[1], [it[2]]] })
-    ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1], it[2] << it[3]] }
+    ch_provenance = ch_provenance.join(hash_fastq_input.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
     ch_provenance = ch_provenance.join(fastp_input.out.provenance).map{ it -> [it[0], it[1], it[2] << it[4]] }
     ch_provenance = ch_provenance.join(downsample.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
-    ch_provenance = ch_provenance.join(fastp_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
+    ch_provenance = ch_provenance.join(hash_fastq_output.out.provenance, by: [0, 1]).map{ it -> [it[0], it[1], it[2] << it[3]] }
 
     collect_provenance(ch_provenance.map{ it -> [it[0], it[1], it[2].minus(null)] })
 }
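Several provenance joins above now key on both sample ID and coverage (`by: [0, 1]`), so each coverage level collects its own hashes and provenance records. A self-contained toy example of that join semantics (sample IDs and payloads are made up; runnable with `nextflow run`):

```groovy
nextflow.enable.dsl = 2

workflow {
    // Tuples keyed by [sample_id, coverage]; the string payloads stand in
    // for provenance files. All values here are hypothetical.
    ch_left  = Channel.of(['sampleA', 10, 'left-payload'],  ['sampleA', 30, 'left-payload'])
    ch_right = Channel.of(['sampleA', 10, 'right-payload'], ['sampleA', 30, 'right-payload'])

    // by: [0, 1] matches on the composite (sample_id, coverage) key, so each
    // coverage level pairs only with its own per-coverage outputs, emitting:
    //   [sampleA, 10, left-payload, right-payload]
    //   [sampleA, 30, left-payload, right-payload]
    ch_left.join(ch_right, by: [0, 1]).view()
}
```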
diff --git a/modules/downsample_reads.nf b/modules/downsample_reads.nf
index d5b6cf6..29beeeb 100644
--- a/modules/downsample_reads.nf
+++ b/modules/downsample_reads.nf
@@ -10,7 +10,7 @@ process fastp {
     output:
     tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_fastp.json"), emit: json
     tuple val(sample_id), path("${sample_id}_${target_coverage_filename}_downsampling_summary.csv"), emit: csv
-    tuple val(sample_id), val(target_coverage), path("${sample_id}_${target_coverage_filename}_fastp_provenance.yml"), emit: provenance
+    tuple val(sample_id), val(target_coverage), path("${sample_id}_original_fastp_provenance.yml"), emit: provenance, optional: true
 
     script:
     if (target_coverage == 'original') {
@@ -18,23 +18,42 @@ process fastp {
     } else {
        target_coverage_filename = target_coverage + 'x'
     }
+    if (target_coverage == 'original' && params.enable_quality_trimming) {
+        quality_trimming = '--cut_tail'
+    } else {
+        quality_trimming = ''
+    }
+    if (target_coverage == 'original' && params.disable_quality_filtering) {
+        quality_filtering = '--disable_quality_filtering'
+    } else {
+        quality_filtering = ''
+    }
     """
     printf -- "- process_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
     printf -- "  tools:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
     printf -- "    - tool_name: fastp\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
     printf -- "      tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
-    printf -- "      parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
-    printf -- "        - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
-    printf -- "          value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
+    if [[ "${quality_trimming}" != "" || "${quality_filtering}" != "" ]]; then
+        printf -- "      parameters:\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
+    fi
+    if [[ "${quality_trimming}" != "" ]]; then
+        printf -- "        - parameter: --cut_tail\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
+        printf -- "          value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
+    fi
+    if [[ "${quality_filtering}" != "" ]]; then
+        printf -- "        - parameter: --disable_quality_filtering\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
+        printf -- "          value: null\\n" >> ${sample_id}_${target_coverage_filename}_fastp_provenance.yml
+    fi
 
     fastp \
-      -t ${task.cpus} \
-      -i ${reads[0]} \
-      -I ${reads[1]} \
-      --cut_tail \
-      -o ${sample_id}_R1.trim.fastq.gz \
-      -O ${sample_id}_R2.trim.fastq.gz \
-      -j ${sample_id}_${target_coverage_filename}_fastp.json
+        -t ${task.cpus} \
+        -i ${reads[0]} \
+        -I ${reads[1]} \
+        ${quality_trimming} \
+        ${quality_filtering} \
+        -o ${sample_id}_R1.trim.fastq.gz \
+        -O ${sample_id}_R2.trim.fastq.gz \
+        -j ${sample_id}_${target_coverage_filename}_fastp.json
 
     echo "target_coverage" >> coverage_field.csv
     echo ${target_coverage} >> coverage_field.csv
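The `fastp` process above and the `hash_files` process below derive the same per-coverage filename infix. In isolation, the rule is just the following (illustrative Groovy with made-up example names; not part of the patch):

```groovy
// 'original' passes through unchanged; numeric coverages get an 'x' suffix.
def coverageFilename = { coverage ->
    coverage == 'original' ? 'original' : "${coverage}x"
}

assert coverageFilename('original') == 'original'
assert coverageFilename(10) == '10x'    // e.g. sampleA_10x_fastq-output.sha256.csv
```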
diff --git a/modules/hash_files.nf b/modules/hash_files.nf
index 74642fe..fdbf96d 100644
--- a/modules/hash_files.nf
+++ b/modules/hash_files.nf
@@ -3,20 +3,26 @@ process hash_files {
     tag { sample_id + " / " + file_type }
 
     input:
-    tuple val(sample_id), path(files_to_hash), val(file_type)
+    tuple val(sample_id), val(coverage), path(files_to_hash), val(file_type)
 
     output:
-    tuple val(sample_id), path("${sample_id}_${file_type}.sha256.csv"), emit: csv
-    tuple val(sample_id), path("${sample_id}_${file_type}_provenance.yml"), emit: provenance
+    tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}.sha256.csv"), emit: csv
+    tuple val(sample_id), val(coverage), path("${sample_id}_${coverage_filename}_${file_type}_provenance.yml"), emit: provenance
 
     script:
+    if (coverage == "original") {
+        coverage_filename = "original"
+    } else {
+        coverage_filename = coverage + "x"
+    }
 
     """
-    shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv
+    shasum -a 256 ${files_to_hash} | tr -s ' ' ',' >> ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
     while IFS=',' read -r hash filename; do
-        printf -- "- input_filename: \$filename\\n" >> ${sample_id}_${file_type}_provenance.yml;
-        printf -- "  file_type: ${file_type}\\n" >> ${sample_id}_${file_type}_provenance.yml;
-        printf -- "  sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml;
-    done < ${sample_id}_${file_type}.sha256.csv
+        printf -- "- filename: \$filename\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
+        printf -- "  file_type: ${file_type}\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
+        printf -- "  sha256: \$hash\\n" >> ${sample_id}_${coverage_filename}_${file_type}_provenance.yml;
+    done < ${sample_id}_${coverage_filename}_${file_type}.sha256.csv
     """
 }
diff --git a/nextflow.config b/nextflow.config
index 3e8e173..34e1d28 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -17,6 +17,8 @@ params {
     coverages = 'NO_FILE'
     coverage = 30
     genome_size = '5m'
+    enable_quality_trimming = false
+    disable_quality_filtering = false
     collect_outputs = false
     collected_outputs_prefix = 'collected'
 }
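Both new params default to `false`, so runs that don't opt in keep the previous behaviour. They can also be set in a custom config passed to Nextflow with `-c` rather than as command-line flags (the file name `overrides.config` is hypothetical):

```groovy
// overrides.config (hypothetical): equivalent to passing
// --enable_quality_trimming and --disable_quality_filtering on the CLI.
params {
    enable_quality_trimming   = true
    disable_quality_filtering = true
}
```

Command-line flags still take precedence if both are given, per Nextflow's usual configuration-resolution order.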