-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathSnakefile
114 lines (92 loc) · 4.93 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
import sys
from subprocess import CalledProcessError

from snakemake.utils import min_version
# Snakemake 7.7.0 introduced the `retries` directive used in fetch_sequences,
# so require at least that version before anything else is evaluated.
min_version("7.7.0")
# Genes used to build the DAG for both nextclade.smk and upload.smk.
# The list is hardcoded here and does _not_ need to be passed to the
# `nextclade run` invocations, because it matches the genes listed in the
# genome annotation of the SARS-CoV-2 Nextclade dataset:
# https://github.com/nextstrain/nextclade_data/blob/244058e7d599a8295d748b12cffdd25cec6d3e7b/data/nextstrain/sars-cov-2/wuhan-hu-1/orfs/genome_annotation.gff3
# - Jover, 21 Feb 2024
GENE_LIST = [
    "E",
    "M",
    "N",
    "ORF1a",
    "ORF1b",
    "ORF3a",
    "ORF6",
    "ORF7a",
    "ORF7b",
    "ORF8",
    "ORF9b",
    "S",
]
#################################################################
####################### general setup ###########################
#################################################################
# Name of the upstream database this run ingests from, read from the
# `database_name` config option. Only "gisaid" and "genbank" are accepted;
# anything else (including a missing option, which defaults to "") aborts
# the workflow with exit status 1 before any rule is defined.
database = config.get("database_name", "")
if database not in ("gisaid", "genbank"):
    print(f"[Fatal] An unknown database \"{database}\" was specified")
    sys.exit(1)

# Notifications are enabled only when both Slack variables are present in
# the environment (presence is checked, not the value).
send_notifications = "SLACK_CHANNELS" in os.environ and "SLACK_TOKEN" in os.environ
#################################################################
################ work out what steps to run #####################
#################################################################
# Outputs requested on every run, regardless of configuration.
all_targets = [
    f"data/{database}/{filename}"
    for filename in ("metadata.tsv", "sequences.fasta", "aligned.fasta")
]

# Add the S3 upload target only if `s3_dst` is provided in config.
if config.get("s3_dst"):
    all_targets.append(f"data/{database}/upload.done")

    # The trigger options are only consulted inside this branch because we
    # only want to trigger builds if the output files were uploaded to S3.
    all_targets.extend(
        f"data/{database}/{done_file}"
        for option, done_file in (
            ("trigger_rebuild", "trigger-rebuild.done"),
            ("trigger_counts", "trigger-counts.done"),
        )
        if config.get(option, False)
    )

# Add the Slack notification target if the Slack env variables are provided
# and `s3_src` is set in config, since some notify scripts do diffs against
# files on S3 from previous runs.
if send_notifications and config.get("s3_src"):
    all_targets.append(f"data/{database}/notify.done")
# Default target rule: requests every path collected in `all_targets`.
rule all:
    input:
        all_targets
#################################################################
###################### rule definitions #########################
#################################################################
# Rule files that are always loaded.
include: "workflow/snakemake_rules/fetch_sequences.smk"
include: "workflow/snakemake_rules/curate.smk"
include: "workflow/snakemake_rules/nextclade.smk"

# Slack notification rules are loaded only when the Slack env variables are
# set and `s3_src` is provided in config.
if send_notifications and config.get("s3_src"):
    include: "workflow/snakemake_rules/slack_notifications.smk"

# Upload rules are loaded only when `s3_dst` is provided in config.
if config.get("s3_dst"):
    include: "workflow/snakemake_rules/upload.smk"

    # The trigger rules are included only together with the upload rules,
    # since the trigger rules depend on the outputs from upload.
    include: "workflow/snakemake_rules/trigger.smk"
################################################################
################################################################
# Environment variables in use by various scripts, mapped to a short note on
# what each one is needed for. Variables sharing a purpose are grouped so the
# note is written once; insertion order is the order they are reported in.
env_variables = {
    **dict.fromkeys(
        ("AWS_DEFAULT_REGION", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY"),
        "Required for S3 access",
    ),
    "GITHUB_RUN_ID": "Included in slack notification message (optional)",
    **dict.fromkeys(
        ("SLACK_TOKEN", "SLACK_CHANNELS"),
        "Required for sending slack notifications",
    ),
    "PAT_GITHUB_DISPATCH": "Required for triggering GitHub actions (e.g. to rebuild nextstrain/ncov)",
    **dict.fromkeys(
        ("GISAID_API_ENDPOINT", "GISAID_USERNAME_AND_PASSWORD"),
        "Required for GISAID API access",
    ),
}
# Start handler: logs the configured S3 source/destination and, for each entry
# in `env_variables`, whether it is set in the environment. When notifications
# are enabled it then runs the vendored notify-on-job-start script.
onstart:
    print("Pipeline starting.")
    print(f"Source s3 bucket: {config.get('s3_src', 'N/A')}, destination: {config.get('s3_dst', 'N/A')}")
    print("Environment variables present:")
    for var, description in env_variables.items():
        # Report presence only, never the value; an empty value counts as "NO".
        present = "YES" if os.environ.get(var, "") else "NO"
        print(f"\t${{{var}}}: {present} ({description})")
    if send_notifications:
        message = "🥗 GISAID ingest" if database == "gisaid" else "🥣 GenBank ingest"
        shell(f"./vendored/notify-on-job-start \"{message}\" nextstrain/ncov-ingest '.'")
# Success handler: prints a completion message, then runs ./bin/clean unless
# the `keep_all_files` config option is truthy.
onsuccess:
    print("✅ This pipeline has successfully finished 🎉")
    keep_files = config.get("keep_all_files", False)
    if not keep_files:
        print("Removing intermediate files (set config option keep_all_files to skip this)")
        shell("./bin/clean")
# Failure handler: prints a failure message, runs the vendored
# notify-on-job-fail script when notifications are enabled, and then runs
# ./bin/clean unless the `keep_all_files` config option is truthy.
onerror:
    print("Pipeline failed.")
    if send_notifications:
        shell("./vendored/notify-on-job-fail Ingest nextstrain/ncov-ingest")
    keep_files = config.get("keep_all_files", False)
    if not keep_files:
        print("Removing intermediate files (set config option keep_all_files to skip this)")
        shell("./bin/clean")