Propagate input annotations to primary.cwlprov files #1678

Draft · wants to merge 19 commits into main · showing changes from 14 commits
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -10,6 +10,9 @@
include tests/loop/*
include tests/tmp1/tmp2/tmp3/.gitkeep
include tests/tmp4/alpha/*
include tests/wf/*
include tests/wf/adv_prov/*
include tests/wf/adv_prov/data/*
include tests/wf/adv_prov/tools/*
include tests/wf/operation/*
include tests/override/*
include tests/reloc/*.cwl
73 changes: 73 additions & 0 deletions cwltool/cwlprov/provenance_profile.py
@@ -244,6 +244,16 @@
        self.prospective_prov(job)
        customised_job = copy_job_order(job, job_order_object)
        self.used_artefacts(customised_job, self.workflow_run_uri)
        # if CWLPROV['prov'].uri in job_order_object:  # maybe move this to another place
        #     metadata = job_order_object[CWLPROV['prov'].uri]  # change uri to CWLPROV['prov'].uri
        #     for item in metadata:
        #         # make a new entity with id
        #         # give it type additionalType value
        #         # add nested annotations
        #         # how much of this can we reuse from _add_nested_annotations?
        #         # how do we identify the correct file to write to? self.workflow_run_uri?
        #         pass

    def record_process_start(
        self, process: Process, job: JobsType, process_run_id: Optional[str] = None
@@ -291,6 +301,31 @@
        self.generate_output_prov(outputs, process_run_id, process_name)
        self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)

    def _add_nested_annotations(
        self, annotation_key: str, annotation_value: Any, e: ProvEntity
    ) -> ProvEntity:
        """Propagate input data annotations to provenance."""
        # Change https:// into http:// first
        schema2_uri = "https://schema.org/"
        if schema2_uri in annotation_key:
            annotation_key = SCHEMA[annotation_key.replace(schema2_uri, "")].uri

        if not isinstance(annotation_value, (MutableSequence, MutableMapping)):
            e.add_attributes({annotation_key: str(annotation_value)})
        elif isinstance(annotation_value, MutableSequence):
            for item_value in annotation_value:
                e = self._add_nested_annotations(annotation_key, item_value, e)
        else:
            nested_id = uuid.uuid4().urn
            nested_entity = self.document.entity(nested_id)
            e.add_attributes({annotation_key: nested_entity.identifier})
            for nested_key in annotation_value:
                nested_value = annotation_value[nested_key]
                nested_entity = self._add_nested_annotations(
                    nested_key, nested_value, nested_entity
                )
        return e

    def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)
@@ -341,6 +376,29 @@
        file_entity.add_attributes({CWLPROV["nameext"]: cast(str, value["nameext"])})
        self.document.specializationOf(file_entity, entity)

        # Identify all schema annotations
        schema_annotations = {
            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
        }

        # Transfer SCHEMA annotations to provenance
        for s in schema_annotations:
            if "additionalType" in s:
                atype = schema_annotations[s]
                if isinstance(atype, str):
                    additional_type = atype.split(sep="/")[-1]  # find better method?
                    file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
                else:
                    for a_entry in cast(List[str], atype):
                        additional_type = a_entry.split(sep="/")[-1]  # find better method?
                        file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
            else:
                file_entity = self._add_nested_annotations(s, schema_annotations[s], file_entity)

        # Transfer format annotations to provenance:
        if "format" in value:
            file_entity.add_attributes({SCHEMA["encodingFormat"]: value["format"]})

        # Check for secondaries
        for sec in cast(MutableSequence[CWLObjectType], value.get("secondaryFiles", [])):
            # TODO: Record these in a specializationOf entity with UUID?
@@ -450,6 +508,21 @@
        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Identify all schema annotations
        schema_annotations = {
            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
        }

        # Transfer SCHEMA annotations to provenance
        for s in schema_annotations:
            if "additionalType" in s:
                additional_type = cast(str, schema_annotations[s]).split(sep="/")[-1]  # find better method?
                coll.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
            elif "hasPart" not in s:
                coll = self._add_nested_annotations(s, schema_annotations[s], coll)

        # Also Save ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
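For review context, here is a minimal standalone sketch of the recursion contract that _add_nested_annotations implements, using the same prov library as cwltool. This is not part of the diff; the "s" and "id" namespaces and all values are illustrative. Scalars become literal attributes, lists are flattened onto the same entity under a repeated key, and mappings spawn a fresh UUID-identified entity that the parent references by identifier.

import uuid
from collections.abc import MutableMapping, MutableSequence
from typing import Any

from prov.model import ProvDocument, ProvEntity

doc = ProvDocument()
doc.add_namespace("s", "http://schema.org/")
doc.add_namespace("id", "urn:uuid:")  # cwltool registers a comparable UUID namespace

def annotate(key: str, value: Any, e: ProvEntity) -> ProvEntity:
    if not isinstance(value, (MutableSequence, MutableMapping)):
        e.add_attributes({key: str(value)})  # scalar: plain literal attribute
    elif isinstance(value, MutableSequence):
        for item in value:  # list: one attribute per item, same key
            e = annotate(key, item, e)
    else:
        nested = doc.entity("id:%s" % uuid.uuid4())  # mapping: nested entity
        e.add_attributes({key: nested.identifier})
        for k, v in value.items():
            nested = annotate(k, v, nested)
    return e

root = doc.entity("id:%s" % uuid.uuid4())
annotate("s:author", [{"s:name": "Jane Doe", "s:identifier": "https://orcid.org/0000-0000-0000-0000"}], root)
print(doc.get_provn())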
43 changes: 43 additions & 0 deletions tests/test_provenance.py
@@ -83,6 +83,49 @@ def test_revsort_workflow(tmp_path: Path) -> None:
    check_provenance(folder)


@needs_docker
def test_revsort_label_annotations(tmp_path: Path) -> None:
    """Affirm that file format annotations in the input object make it into CWLProv."""
    base_path = cwltool(
        tmp_path,
        get_data("tests/wf/revsort.cwl"),
        get_data("tests/wf/revsort-job.json"),
    )
    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
    arcp_root = find_arcp(base_path)
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    mime_having_objects = list(g.subjects(SCHEMA.encodingFormat))
    assert len(mime_having_objects) == 2
    for obj in mime_having_objects:
        assert (
            cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value
            == "https://www.iana.org/assignments/media-types/text/plain"
        )


def test_advanced_prov_annotations(tmp_path: Path) -> None:
    """Pass through of advanced input annotations."""
    base_path = cwltool(
        tmp_path,
        get_data("tests/wf/adv_prov/niaa_wf.cwl"),
        get_data("tests/wf/adv_prov/niaa_wf_job.yml"),
    )
    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
    arcp_root = find_arcp(base_path)
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    mime_having_objects = list(g.subjects(SCHEMA.encodingFormat))
    assert len(mime_having_objects) == 8
    # for obj in mime_having_objects:
    #     assert (
    #         cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value
    #         == "https://www.iana.org/assignments/media-types/text/plain"
    #     )
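Beyond encodingFormat, a check of the nested annotations could be added along the same lines. A sketch (not part of the diff; the file path is illustrative, the http://schema.org/ namespace follows cwltool's provenance constants, and the author predicate assumes the job file carries such an annotation):

from rdflib import Graph, Namespace

SCHEMA = Namespace("http://schema.org/")

g = Graph()
g.parse("primary.cwlprov.nt", format="nt")

# Mapping-valued annotations become UUID-identified entities, so a nested
# author name sits one predicate hop away from the annotated file entity.
for file_entity, author in g.subject_objects(SCHEMA.author):
    for name in g.objects(author, SCHEMA.name):
        print(file_entity, "has author with name", name)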


@needs_docker
def test_revsort_workflow_shortcut(tmp_path: Path) -> None:
    """Confirm that using 'cwl:tool' shortcut still snapshots the CWL files."""
Empty file. (5 new empty files; file names not captured in this view)
186 changes: 186 additions & 0 deletions tests/wf/adv_prov/niaa_wf.cwl
@@ -0,0 +1,186 @@
#!/usr/bin/env cwl-runner

cwlVersion: v1.2
class: Workflow

intent: [ edam:operation_2423 ]  # Prediction operation
doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction."

requirements:
  ScatterFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  SubworkflowFeatureRequirement: {}

inputs:
  sabdab_summary:
    type: File
    format: iana:text/tab-separated-values
    doc: "SAbDab summary metadata about all structures in the database."
  biodl_train_dataset:
    type: File
    format: iana:text/csv
    doc: "BioDL training dataset containing PPI interactions."
  biodl_test_dataset:
    type: File
    doc: "BioDL test dataset with PPI interactions."
  hhblits_db:
    type: Directory
    doc: "Reference database for HHblits."
  hhblits_db_name:
    type: string
    doc: "Name of the HHblits reference database."
  pdb_search_api_query:
    type: File
    format: iana:application/json
    doc: "Structured query for the PDB search API."

outputs:
  model_output:
    type: File
    outputSource: train_epitope_prediction_model/train_log
    doc: "Output of the prediction model."

steps:
  run_pdb_query:
    in:
      pdb_search_query: pdb_search_api_query
    out:
      [ processed_response ]
    run: ./tools/pdb_query.cwl
    doc: |
      Use the PDB search API to run a query on the Protein Data Bank. Returns a .txt file with comma-separated PDB IDs which satisfy the query requirements.
      See https://search.rcsb.org/index.html#search-api for a tutorial.

  download_pdb_files:
    in:
      input_file: run_pdb_query/processed_response
      mmcif_format: { default: True }
      pdb_format: { default: True }
    out:
      [ pdb_files ]
    run: ./tools/pdb_batch_download.cwl

  decompress_pdb_files:
    in:
      pdb_archives: download_pdb_files/pdb_files
    out: [ cifs, pdbs ]
    run: ./tools/decompress.cwl
    doc: "Decompress files using gzip."

  generate_dssp_labels:
    in:
      pdb_files: decompress_pdb_files/pdbs  # change this later
      rsa_cutoff: { default: 0.06 }
    out: [ dssp_output_files ]
    run: ./tools/dssp.cwl
    doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files."

  generate_ppi_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      train_dataset: biodl_train_dataset
      test_dataset: biodl_test_dataset
    out: [ ppi_fasta_files ]
    run: ./tools/ppi_annotations.cwl
    doc: "Extract PPI annotations from BioDL. This step is partly emulated."

  preprocess_sabdab_data:
    doc: "Extract antigen chains from the SAbDab summary file."
    in:
      sabdab_summary: sabdab_summary
    out: [ processed_summary ]
    run: ./tools/process_sabdab.cwl

  generate_epitope_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      sabdab_processed: preprocess_sabdab_data/processed_summary
    out: [ epitope_fasta_dir ]
    run: ./tools/epitope_annotations.cwl
    doc: "Extract epitope annotations from PDB files."

  combine_labels:
    doc: "Combine labels into one file per protein sequence."
    run: ./tools/combine_labels.cwl
    in:
      epitope_directory: generate_epitope_labels/epitope_fasta_dir
      ppi_directory: generate_ppi_labels/ppi_fasta_files
      dssp_directory: generate_dssp_labels/dssp_output_files
    out: [ labels_combined ]

  generate_pc7:
    doc: Calculate PC7 features for each residue in each protein sequence.
    run: ./tools/pc7_inputs.cwl  # to do: adapt tool so it takes a directory of FASTA files as input
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ pc7_features ]

  generate_psp19:
    label: Calculate PSP19 features for each residue in each protein sequence.
    run: ./tools/psp19_inputs.cwl
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ psp19_features ]

  generate_hhm:
    in:
      query_sequences:
        source: generate_ppi_labels/ppi_fasta_files  # type Directory
        valueFrom: $(self.listing)  # here type Directory is converted to a File array
      hhblits_db: hhblits_db
      hhblits_db_name: hhblits_db_name
      hhblits_n_iterations: { default: 1 }
    out: [ hhm_file_array ]
    run:
      class: Workflow  # subworkflow as a workaround, because generate_ppi_labels/ppi_fasta_files is a Directory while run_hhblits takes a File
      inputs:
        query_sequences: File[]
        hhblits_db: Directory
        hhblits_db_name: string
        hhblits_n_iterations: int
      outputs:
        hhm_file_array:
          type: File[]
          outputSource: run_hhblits/hhm
      steps:
        run_hhblits:
          in:
            protein_query_sequence: query_sequences
            database: hhblits_db
            database_name: hhblits_db_name
            n_iterations: hhblits_n_iterations
          out: [ hhm ]
          scatter: protein_query_sequence
          run: ./tools/hhm_inputs_scatter.cwl

  combine_features:
    in:
      input_sequences: generate_ppi_labels/ppi_fasta_files
      pc7_features: generate_pc7/pc7_features
      psp19_features: generate_psp19/psp19_features
      hhm_features: generate_hhm/hhm_file_array  # File array; combine_features.cwl converts it to a directory
    out: [ combined_features ]
    run: ./tools/combine_features.cwl

  train_epitope_prediction_model:  # this step incorporates both training and prediction; not sure if this is the case in the real workflow
    in:  # in the real workflow, the configuration file would be generated as part of the workflow as well
      input_features: combine_features/combined_features
      input_labels: combine_labels/labels_combined
    out: [ train_log ]
    run: ./tools/train_epitope_model.cwl
    doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet."

$namespaces:
  iana: "https://www.iana.org/assignments/media-types/"
  s: "https://schema.org/"
  edam: "http://edamontology.org/"
  cwlprov: "https://w3id.org/cwl/prov#"

$schemas:
  - https://schema.org/version/latest/schemaorg-current-https.rdf
  - https://edamontology.org/EDAM_1.25.owl

s:author:
  - s:name: "Renske de Wit"
    s:identifier: https://orcid.org/0000-0003-0902-0086
  - s:name: "Katharina Waury"
s:license: https://spdx.org/licenses/Apache-2.0
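The matching job file, tests/wf/adv_prov/niaa_wf_job.yml, is not shown in this diff. As an illustration only (not the real job file), an annotated File entry in a job order reaches declare_file() above as a mapping of roughly this shape once cwltool expands the s: prefix to full schema.org URIs:

# Hypothetical job-order entry; every key below other than class, location
# and format is a schema.org annotation that the new code propagates into
# the primary.cwlprov files.
annotated_input = {
    "class": "File",
    "location": "data/sabdab_summary.tsv",
    "format": "https://www.iana.org/assignments/media-types/text/tab-separated-values",
    "https://schema.org/additionalType": ["https://schema.org/Dataset"],
    "https://schema.org/creator": {
        "https://schema.org/name": "Example Author",  # illustrative value
    },
}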