From 2898065e1702c00d5b6acb38565f79ac81cd0633 Mon Sep 17 00:00:00 2001 From: Dan Fornika Date: Thu, 20 Jun 2024 10:29:06 -0700 Subject: [PATCH] Add testing pipeline (#35) * Add testing pipeline * Add env files * prepare mob-suite db prior to use * Use full path to mob-suite db * Use different sample for test data * update * Fix provenance * Update README * fix provenance * fix provenance * fix provenance in README --- .github/data/reads_to_simulate.csv | 1 + .github/environments/art.yml | 7 ++ .github/environments/check-outputs.yml | 9 ++ .github/scripts/check_outputs.py | 114 ++++++++++++++++++ .github/scripts/check_outputs.sh | 12 ++ .github/scripts/create_art_environment.sh | 3 + .../create_output_checking_environment.sh | 3 + .github/scripts/create_samplesheet.sh | 11 ++ .github/scripts/download_assemblies.sh | 16 +++ .github/scripts/download_mob-suite_db.sh | 27 +++++ .github/scripts/install_conda.sh | 22 ++++ .github/scripts/install_nextflow.sh | 11 ++ .github/scripts/prepare_artifacts.sh | 13 ++ .github/scripts/run_pipeline.sh | 23 ++++ .github/scripts/run_tests_locally.sh | 18 +++ .github/scripts/simulate_reads.sh | 35 ++++++ .github/workflows/tests.yml | 52 ++++++++ README.md | 6 +- modules/mash_screen.nf | 6 +- modules/mob_recon.nf | 2 +- 20 files changed, 385 insertions(+), 6 deletions(-) create mode 100644 .github/data/reads_to_simulate.csv create mode 100644 .github/environments/art.yml create mode 100644 .github/environments/check-outputs.yml create mode 100755 .github/scripts/check_outputs.py create mode 100755 .github/scripts/check_outputs.sh create mode 100755 .github/scripts/create_art_environment.sh create mode 100755 .github/scripts/create_output_checking_environment.sh create mode 100755 .github/scripts/create_samplesheet.sh create mode 100755 .github/scripts/download_assemblies.sh create mode 100755 .github/scripts/download_mob-suite_db.sh create mode 100755 .github/scripts/install_conda.sh create mode 100755 
.github/scripts/install_nextflow.sh
 create mode 100755 .github/scripts/prepare_artifacts.sh
 create mode 100755 .github/scripts/run_pipeline.sh
 create mode 100755 .github/scripts/run_tests_locally.sh
 create mode 100755 .github/scripts/simulate_reads.sh
 create mode 100644 .github/workflows/tests.yml

diff --git a/.github/data/reads_to_simulate.csv b/.github/data/reads_to_simulate.csv
new file mode 100644
index 0000000..9c15e0f
--- /dev/null
+++ b/.github/data/reads_to_simulate.csv
@@ -0,0 +1 @@
+GCF024700185.1,.github/data/assemblies/GCF024700185.1.fa
diff --git a/.github/environments/art.yml b/.github/environments/art.yml
new file mode 100644
index 0000000..bb6458a
--- /dev/null
+++ b/.github/environments/art.yml
@@ -0,0 +1,7 @@
+name: art
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - art=2016.06.05
diff --git a/.github/environments/check-outputs.yml b/.github/environments/check-outputs.yml
new file mode 100644
index 0000000..414072e
--- /dev/null
+++ b/.github/environments/check-outputs.yml
@@ -0,0 +1,9 @@
+name: check-outputs
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3
+  - jsonschema=4.20.0
+  - pyyaml=6.0.1
diff --git a/.github/scripts/check_outputs.py b/.github/scripts/check_outputs.py
new file mode 100755
index 0000000..71dfea9
--- /dev/null
+++ b/.github/scripts/check_outputs.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+
+import argparse
+import csv
+import glob
+import json
+import os
+import sys
+import urllib.request
+
+from jsonschema import validate
+import yaml
+
+
+def check_provenance_format_valid(provenance_files, schema):
+    """
+    Check that the provenance files are valid according to the schema.
+
+    All files are checked, so every invalid file is reported in one run.
+
+    :param provenance_files: Paths to the provenance YAML files
+    :param schema: Parsed JSON schema to validate each file against
+    :return: True if every file loads and validates, False otherwise
+    :rtype: bool
+    """
+    all_valid = True
+    for provenance_file in provenance_files:
+        try:
+            with open(provenance_file) as f:
+                provenance = yaml.load(f, Loader=yaml.BaseLoader)
+            validate(provenance, schema)
+        except Exception as e:
+            # Report the failure and return False rather than exiting here,
+            # so that main() can still write the results file.
+            print(f"Error validating {provenance_file}: {e}")
+            all_valid = False
+
+    return all_valid
+
+
+def check_expected_files_exist(output_dir, sample_ids):
+    """
+    Check that the expected files exist in the output directory.
+
+    :param output_dir: Path to the output directory
+    :param sample_ids: List of sample IDs
+    :return: True if all expected files exist, False otherwise
+    :rtype: bool
+    """
+    for sample_id in sample_ids:
+        expected_files = [
+            f"{sample_id}/{sample_id}_fastp.csv",
+            f"{sample_id}/{sample_id}_fastp.json",
+            f"{sample_id}/{sample_id}_quast.csv",
+            f"{sample_id}/{sample_id}_abricate_ncbi.tsv",
+            f"{sample_id}/{sample_id}_abricate_plasmidfinder.tsv",
+            f"{sample_id}/{sample_id}_resistance_gene_report.tsv",
+        ]
+
+        for expected_file in expected_files:
+            expected_file_path = os.path.join(output_dir, expected_file)
+            if not os.path.exists(expected_file_path):
+                print(f"Expected file {expected_file_path} not found")
+                return False
+
+    return True
+
+
+def main(args):
+    """
+    Run the output checks and write one PASS/FAIL row per check.
+
+    Exits with status 1, after writing the results file, if any check failed.
+
+    :param args: Parsed arguments with 'pipeline_outdir' and 'output'
+    """
+    output_dir = os.path.dirname(args.output)
+    if output_dir:
+        # dirname is '' for a bare filename, and os.makedirs('') raises
+        os.makedirs(output_dir, exist_ok=True)
+
+    provenance_schema_url = "https://raw.githubusercontent.com/BCCDC-PHL/pipeline-provenance-schema/main/schema/pipeline-provenance.json"
+    provenance_schema_path = ".github/data/pipeline-provenance.json"
+    urllib.request.urlretrieve(provenance_schema_url, provenance_schema_path)
+
+    with open(provenance_schema_path) as f:
+        provenance_schema = json.load(f)
+
+    provenance_files_glob = f"{args.pipeline_outdir}/**/*_provenance.yml"
+    provenance_files = glob.glob(provenance_files_glob, recursive=True)
+
+    # More than one provenance file may match per sample; check each sample once
+    sample_ids = sorted({os.path.basename(provenance_file).split("_")[0] for provenance_file in provenance_files})
+
+    # TODO: Add more tests
+    tests = [
+        {
+            # With no provenance files the other checks pass vacuously
+            "test_name": "provenance_files_found",
+            "test_passed": len(provenance_files) > 0,
+        },
+        {
+            "test_name": "provenance_format_valid",
+            "test_passed": check_provenance_format_valid(provenance_files, provenance_schema),
+        },
+        {
+            "test_name": "all_expected_files_exist",
+            "test_passed": check_expected_files_exist(args.pipeline_outdir, sample_ids),
+        },
+    ]
+
+    output_fields = [
+        "test_name",
+        "test_result"
+    ]
+
+    output_path = args.output
+    with open(output_path, 'w', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=output_fields, extrasaction='ignore')
+        writer.writeheader()
+        for test in tests:
+            test["test_result"] = "PASS" if test["test_passed"] else "FAIL"
+            writer.writerow(test)
+
+    if not all(test["test_passed"] for test in tests):
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Check outputs')
+    parser.add_argument('--pipeline-outdir', type=str, required=True, help='Path to the pipeline output directory')
+    parser.add_argument('-o', '--output', type=str, required=True, help='Path to the output file')
+    args = parser.parse_args()
+    main(args)
diff --git a/.github/scripts/check_outputs.sh b/.github/scripts/check_outputs.sh
new file mode 100755
index 0000000..3f4b54a
--- /dev/null
+++ b/.github/scripts/check_outputs.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+
+set -e -o pipefail
+
+source ${HOME}/.bashrc
+
+eval "$(conda shell.bash hook)"
+
+conda activate check-outputs
+
+
+.github/scripts/check_outputs.py --pipeline-outdir .github/data/test_output -o artifacts/check_outputs_results.csv
diff --git a/.github/scripts/create_art_environment.sh b/.github/scripts/create_art_environment.sh
new file mode 100755
index 0000000..d393421
--- /dev/null
+++ b/.github/scripts/create_art_environment.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+conda env create -f .github/environments/art.yml
diff --git a/.github/scripts/create_output_checking_environment.sh b/.github/scripts/create_output_checking_environment.sh
new file mode 100755
index 0000000..0f9a4a4
--- /dev/null
+++ b/.github/scripts/create_output_checking_environment.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+conda env create
-f .github/environments/check-outputs.yml
diff --git a/.github/scripts/create_samplesheet.sh b/.github/scripts/create_samplesheet.sh
new file mode 100755
index 0000000..363fe98
--- /dev/null
+++ b/.github/scripts/create_samplesheet.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+echo 'ID,R1,R2,ASSEMBLY' > .github/data/samplesheet.csv
+
+for i in "${PWD}"/.github/data/fastq/*_R1.fastq.gz; do
+    # Skip the unexpanded pattern when no reads are present
+    [ -e "$i" ] || continue
+    ID=$(basename "$i" _R1.fastq.gz)
+    R1=$i
+    R2=${PWD}/.github/data/fastq/${ID}_R2.fastq.gz
+    ASSEMBLY=${PWD}/.github/data/assemblies/${ID}.fa
+    echo "$ID,$R1,$R2,$ASSEMBLY" >> .github/data/samplesheet.csv
+done
diff --git a/.github/scripts/download_assemblies.sh b/.github/scripts/download_assemblies.sh
new file mode 100755
index 0000000..b8a15fb
--- /dev/null
+++ b/.github/scripts/download_assemblies.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -eo pipefail
+
+mkdir -p .github/data/assemblies
+
+rm -f .github/data/assemblies/GCF_024700185.1.zip
+rm -f .github/data/assemblies/GCF024700185.1.fa
+rm -f .github/data/assemblies/README.md
+
+# --fail: stop on an HTTP error instead of saving the error body as the zip
+curl --fail -o .github/data/assemblies/GCF_024700185.1.zip "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCF_024700185.1/download?include_annotation_type=GENOME_FASTA&include_annotation_type=SEQUENCE_REPORT&hydrated=FULLY_HYDRATED"
+
+unzip .github/data/assemblies/GCF_024700185.1.zip -d .github/data/assemblies
+
+mv .github/data/assemblies/ncbi_dataset/data/GCF_024700185.1/GCF_024700185.1_ASM2470018v1_genomic.fna .github/data/assemblies/GCF024700185.1.fa
+
+rm -r .github/data/assemblies/ncbi_dataset
+rm -f .github/data/assemblies/README.md
diff --git a/.github/scripts/download_mob-suite_db.sh b/.github/scripts/download_mob-suite_db.sh
new file mode 100755
index 0000000..f040b01
--- /dev/null
+++ b/.github/scripts/download_mob-suite_db.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+source ${HOME}/.bashrc
+
+eval "$(conda shell.bash hook)"
+
+mkdir -p .github/data
+
+rm -rf .github/data/mob-suite-db
+
+pushd .github/data
+
+wget -O data.tar.gz "https://zenodo.org/records/10304948/files/data.tar.gz?download=1"
+
+tar -xzf data.tar.gz
+
+rm data.tar.gz
+
+mv data mob-suite-db
+
+conda activate plasmid-screen-35d122a137231eda3b8a0039d42f24f6
+
+mash sketch -i mob-suite-db/ncbi_plasmid_full_seqs.fas
+
+makeblastdb -in mob-suite-db/ncbi_plasmid_full_seqs.fas -dbtype nucl
+
+popd
diff --git a/.github/scripts/install_conda.sh b/.github/scripts/install_conda.sh
new file mode 100755
index 0000000..fd82b1a
--- /dev/null
+++ b/.github/scripts/install_conda.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set -eo pipefail
+
+artifacts_dir="artifacts"
+
+echo "Install Miniconda .." >> ${artifacts_dir}/test.log
+
+export PATH=/opt/miniconda3/bin:$PATH
+
+wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
+
+/bin/bash ~/miniconda.sh -b -p /opt/miniconda3
+
+rm ~/miniconda.sh
+
+echo ". /opt/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc
+
+conda update -y -n base -c defaults conda
+
+conda install -y -c conda-forge mamba
+
+conda init bash
diff --git a/.github/scripts/install_nextflow.sh b/.github/scripts/install_nextflow.sh
new file mode 100755
index 0000000..ae5cde8
--- /dev/null
+++ b/.github/scripts/install_nextflow.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+set -eo pipefail
+
+artifacts_dir="artifacts"
+
+echo Install Nextflow .. >> ${artifacts_dir}/test.log
+
+wget -qO- https://get.nextflow.io | bash
+
+sudo mv nextflow /usr/local/bin/
diff --git a/.github/scripts/prepare_artifacts.sh b/.github/scripts/prepare_artifacts.sh
new file mode 100755
index 0000000..2ac7069
--- /dev/null
+++ b/.github/scripts/prepare_artifacts.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+artifacts_dir="artifacts"
+
+echo "Prepare artifacts .." >> ${artifacts_dir}/test.log
+
+mkdir -p ${artifacts_dir}/fastq
+
+mv .github/data/fastq/*.fastq.gz ${artifacts_dir}/fastq
+
+mkdir -p ${artifacts_dir}/pipeline_outputs
+
+mv .github/data/test_output/* ${artifacts_dir}/pipeline_outputs
diff --git a/.github/scripts/run_pipeline.sh b/.github/scripts/run_pipeline.sh
new file mode 100755
index 0000000..9d1972f
--- /dev/null
+++ b/.github/scripts/run_pipeline.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+set -eo pipefail
+
+if [ -n "${GITHUB_ACTIONS}" ]; then
+    echo "Running in GitHub Actions Environment"
+    echo "Adjusting nextflow.config"
+    sed -i 's/cpus = 16/cpus = 4/g' nextflow.config
+else
+    echo "Not running in GitHub Actions Environment"
+fi
+
+nextflow run main.nf \
+    -profile conda \
+    --cache ${HOME}/.conda/envs \
+    --samplesheet_input .github/data/samplesheet.csv \
+    --pre_assembled \
+    --mob_db ${PWD}/.github/data/mob-suite-db \
+    --collect_outputs \
+    --collected_outputs_prefix test \
+    --outdir .github/data/test_output \
+    -with-report .github/data/test_output/nextflow_report.html \
+    -with-trace .github/data/test_output/nextflow_trace.tsv
diff --git a/.github/scripts/run_tests_locally.sh b/.github/scripts/run_tests_locally.sh
new file mode 100755
index 0000000..6e916ce
--- /dev/null
+++ b/.github/scripts/run_tests_locally.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+rm -rf .github/data/assemblies/*
+rm -rf .github/data/fastq/*
+rm -rf .github/data/mob-suite-db
+rm -rf .github/data/samplesheet.csv
+rm -rf .github/data/test_output
+
+.github/scripts/download_assemblies.sh
+
+.github/scripts/simulate_reads.sh
+
+.github/scripts/download_mob-suite_db.sh
+
+.github/scripts/create_samplesheet.sh
+
+.github/scripts/run_pipeline.sh
+
diff --git a/.github/scripts/simulate_reads.sh b/.github/scripts/simulate_reads.sh
new file mode 100755
index 0000000..7001cce
--- /dev/null
+++ b/.github/scripts/simulate_reads.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+
+source ${HOME}/.bashrc
+
+eval "$(conda shell.bash hook)"
+
+conda activate art
+
+mkdir -p .github/data/fastq
+
+while IFS=',' read -r sample_id assembly; do
+    art_illumina \
+        --paired \
+        --in ${assembly} \
+        --fcov 12 \
+        --len 150 \
+        --mflen 400 \
+        --sdev 100 \
+        --rndSeed 42 \
+        --qShift 0 \
+        --qShift2 0 \
+        --out .github/data/fastq/${sample_id}_R
+
+    rm -f .github/data/fastq/${sample_id}_R1.aln
+    rm -f .github/data/fastq/${sample_id}_R2.aln
+
+    mv .github/data/fastq/${sample_id}_R1.fq .github/data/fastq/${sample_id}_R1.fastq
+    mv .github/data/fastq/${sample_id}_R2.fq .github/data/fastq/${sample_id}_R2.fastq
+
+    gzip -f .github/data/fastq/${sample_id}_R1.fastq
+    gzip -f .github/data/fastq/${sample_id}_R2.fastq
+
+done < .github/data/reads_to_simulate.csv
+
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 0000000..098427a
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,52 @@
+on:
+  pull_request:
+    branches:
+    - main
+  push:
+    branches:
+    - main
+  workflow_dispatch:
+name: Tests
+jobs:
+  test:
+    strategy:
+      fail-fast: false
+      matrix:
+        nextflow_version: ["21.04.3", "23.10.1"]
+    name: Run tests
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: Create Artifacts Directory
+      run: mkdir artifacts
+    - name: Install Miniconda
+      run: bash .github/scripts/install_conda.sh
+    - name: Install Nextflow
+      env:
+        NXF_VER: ${{ matrix.nextflow_version }}
+      run: bash .github/scripts/install_nextflow.sh
+    - name: Create ART Read-Simulation Environment
+      run: bash .github/scripts/create_art_environment.sh
+    - name: Download Assemblies
+      run: bash .github/scripts/download_assemblies.sh
+    - name: Simulate Reads
+      run: bash .github/scripts/simulate_reads.sh
+    - name: Download mob-suite db
+      run: bash .github/scripts/download_mob-suite_db.sh
+    - name: Create SampleSheet
+      run: bash .github/scripts/create_samplesheet.sh
+    - name: Run Pipeline
+      run: bash .github/scripts/run_pipeline.sh
+    - name: Create Output Checking Environment
+      run: bash .github/scripts/create_output_checking_environment.sh
+    - name: Check Outputs
+      run: bash .github/scripts/check_outputs.sh
+    - name: Prepare Artifacts
+      if: always()
+      run: bash .github/scripts/prepare_artifacts.sh
+    - name: Upload Artifacts
+      uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: artifacts-BCCDC-PHL-plasmid-screen-nextflow-v${{ matrix.nextflow_version }}-${{ github.run_id }}.${{ github.run_attempt }}
+        path: artifacts
diff --git a/README.md b/README.md index 5421e87..bf32ab4 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +[![Tests](https://github.com/BCCDC-PHL/plasmid-screen/actions/workflows/tests.yml/badge.svg)](https://github.com/BCCDC-PHL/plasmid-screen/actions/workflows/tests.yml) + # plasmid-screen ## Usage @@ -157,11 +159,11 @@ Example provenance output: - tool_name: quast tool_version: 5.0.2 - process_name: mash_screen - tools + tools: - tool_name: mash tool_version: 2.3 parameters - - name: threshold + - parameter: threshold value: 0.996 - process_name: mob_recon tools: diff --git a/modules/mash_screen.nf b/modules/mash_screen.nf index 8a30c40..8f2d28b 100644 --- a/modules/mash_screen.nf +++ b/modules/mash_screen.nf @@ -14,11 +14,11 @@ process mash_screen { script: """ printf -- "- process_name: mash_screen\\n" >> ${sample_id}_mash_provenance.yml - printf -- " tools\\n" >> ${sample_id}_mash_provenance.yml + printf -- " tools:\\n" >> ${sample_id}_mash_provenance.yml printf -- " - tool_name: mash\\n" >> ${sample_id}_mash_provenance.yml printf -- " tool_version: \$(mash --version)\\n" >> ${sample_id}_mash_provenance.yml - printf -- " parameters\\n" >> ${sample_id}_mash_provenance.yml - printf -- " - name: threshold\\n" >> ${sample_id}_mash_provenance.yml + printf -- " parameters:\\n" >> ${sample_id}_mash_provenance.yml + printf -- " - parameter: threshold\\n" >> ${sample_id}_mash_provenance.yml printf -- " value: ${params.mashthreshold}\\n" >> ${sample_id}_mash_provenance.yml mash screen -p ${task.cpus} -i
${params.mashthreshold} ${mob_db}/ncbi_plasmid_full_seqs.fas.msh ${reads_r1} ${reads_r2} | \ diff --git a/modules/mob_recon.nf b/modules/mob_recon.nf index 187c728..259fc33 100644 --- a/modules/mob_recon.nf +++ b/modules/mob_recon.nf @@ -20,7 +20,7 @@ process mob_recon { printf -- " tools:\\n" >> ${sample_id}_mob_recon_provenance.yml printf -- " - tool_name: mob_recon\\n" >> ${sample_id}_mob_recon_provenance.yml printf -- " tool_version: \$(mob_recon --version | cut -d ' ' -f 2)\\n" >> ${sample_id}_mob_recon_provenance.yml - printf -- " parameters\\n" >> ${sample_id}_mob_recon_provenance.yml + printf -- " parameters:\\n" >> ${sample_id}_mob_recon_provenance.yml printf -- " - parameter: database_directory\\n" >> ${sample_id}_mob_recon_provenance.yml printf -- " value: \$(realpath ${params.mob_db})\\n" >> ${sample_id}_mob_recon_provenance.yml printf -- " - parameter: filter_db\\n" >> ${sample_id}_mob_recon_provenance.yml