Propagate input annotations to primary.cwlprov files #1678

Draft · wants to merge 19 commits into main · showing changes from 14 commits
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -10,6 +10,9 @@
include tests/loop/*
include tests/tmp1/tmp2/tmp3/.gitkeep
include tests/tmp4/alpha/*
include tests/wf/*
include tests/wf/adv_prov/*
include tests/wf/adv_prov/data/*
include tests/wf/adv_prov/tools/*
include tests/wf/operation/*
include tests/override/*
include tests/reloc/*.cwl
73 changes: 73 additions & 0 deletions cwltool/cwlprov/provenance_profile.py
@@ -244,6 +244,16 @@
        self.prospective_prov(job)
        customised_job = copy_job_order(job, job_order_object)
        self.used_artefacts(customised_job, self.workflow_run_uri)
        # if CWLPROV['prov'].uri in job_order_object:  # maybe move this to another place
        #     metadata = job_order_object[CWLPROV['prov'].uri]  # change uri to CWLPROV['prov'].uri
        #     for item in metadata:
        #         # make a new entity with id
        #         # give it type additionalType value
        #         # add nested annotations
        #         # how much of this can we reuse from _add_nested_annotations?
        #         # how do we identify the correct file to write to? self.workflow_run_uri?
        #         pass

    def record_process_start(
        self, process: Process, job: JobsType, process_run_id: Optional[str] = None
@@ -291,6 +301,31 @@
        self.generate_output_prov(outputs, process_run_id, process_name)
        self.document.wasEndedBy(process_run_id, None, self.workflow_run_uri, when)

    def _add_nested_annotations(
        self, annotation_key: str, annotation_value: Any, e: ProvEntity
    ) -> ProvEntity:
        """Propagate input data annotations to provenance."""
        # Change https:// into http:// first
        schema2_uri = "https://schema.org/"
        if schema2_uri in annotation_key:
            annotation_key = SCHEMA[annotation_key.replace(schema2_uri, "")].uri

        if not isinstance(annotation_value, (MutableSequence, MutableMapping)):
            e.add_attributes({annotation_key: str(annotation_value)})
        elif isinstance(annotation_value, MutableSequence):
            for item_value in annotation_value:
                e = self._add_nested_annotations(annotation_key, item_value, e)
        else:
            nested_id = uuid.uuid4().urn
            nested_entity = self.document.entity(nested_id)
            e.add_attributes({annotation_key: nested_entity.identifier})
            for nested_key in annotation_value:
                nested_value = annotation_value[nested_key]
                nested_entity = self._add_nested_annotations(
                    nested_key, nested_value, nested_entity
                )
        return e

    def declare_file(self, value: CWLObjectType) -> Tuple[ProvEntity, ProvEntity, str]:
        if value["class"] != "File":
            raise ValueError("Must have class:File: %s" % value)
@@ -341,6 +376,29 @@
        file_entity.add_attributes({CWLPROV["nameext"]: cast(str, value["nameext"])})
        self.document.specializationOf(file_entity, entity)

        # Identify all schema annotations
        schema_annotations = {
            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
        }

        # Transfer SCHEMA annotations to provenance
        for s in schema_annotations:
            if "additionalType" in s:
                atype = schema_annotations[s]
                if isinstance(atype, str):
                    additional_type = atype.split(sep="/")[-1]  # find better method?
                    file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
                else:
                    for a_entry in cast(List[str], atype):
                        additional_type = a_entry.split(sep="/")[-1]  # find better method?
                        file_entity.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
            else:
                file_entity = self._add_nested_annotations(s, schema_annotations[s], file_entity)

        # Transfer format annotations to provenance:
        if "format" in value:
            file_entity.add_attributes({SCHEMA["encodingFormat"]: value["format"]})

        # Check for secondaries
        for sec in cast(MutableSequence[CWLObjectType], value.get("secondaryFiles", [])):
            # TODO: Record these in a specializationOf entity with UUID?
@@ -450,6 +508,21 @@
        coll.add_attributes(coll_attribs)
        coll_b.add_attributes(coll_b_attribs)

        # Identify all schema annotations
        schema_annotations = {
            v: value[v] for v in value.keys() if v.startswith("https://schema.org")
        }

        # Transfer SCHEMA annotations to provenance
        for s in schema_annotations:
            if "additionalType" in s:
                additional_type = cast(str, schema_annotations[s]).split(sep="/")[-1]  # find better method?
                coll.add_attributes({PROV_TYPE: SCHEMA[additional_type]})
            elif "hasPart" not in s:
                coll = self._add_nested_annotations(s, schema_annotations[s], coll)

        # Also Save ORE Folder as annotation metadata
        ore_doc = ProvDocument()
        ore_doc.add_namespace(ORE)
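For review context, here is a minimal standalone sketch of the recursion contract that _add_nested_annotations implements, using the same prov library as cwltool. This is not part of the diff; the "s" and "id" namespaces and all values are illustrative. Scalars become literal attributes, lists are flattened onto the same entity under a repeated key, and mappings spawn a fresh UUID-identified entity that the parent references by identifier.

import uuid
from collections.abc import MutableMapping, MutableSequence
from typing import Any

from prov.model import ProvDocument, ProvEntity

doc = ProvDocument()
doc.add_namespace("s", "http://schema.org/")
doc.add_namespace("id", "urn:uuid:")  # cwltool registers a comparable UUID namespace

def annotate(key: str, value: Any, e: ProvEntity) -> ProvEntity:
    if not isinstance(value, (MutableSequence, MutableMapping)):
        e.add_attributes({key: str(value)})  # scalar: plain literal attribute
    elif isinstance(value, MutableSequence):
        for item in value:  # list: one attribute per item, same key
            e = annotate(key, item, e)
    else:
        nested = doc.entity("id:%s" % uuid.uuid4())  # mapping: nested entity
        e.add_attributes({key: nested.identifier})
        for k, v in value.items():
            nested = annotate(k, v, nested)
    return e

root = doc.entity("id:%s" % uuid.uuid4())
annotate("s:author", [{"s:name": "Jane Doe", "s:identifier": "https://orcid.org/0000-0000-0000-0000"}], root)
print(doc.get_provn())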
43 changes: 43 additions & 0 deletions tests/test_provenance.py
@@ -83,6 +83,49 @@ def test_revsort_workflow(tmp_path: Path) -> None:
    check_provenance(folder)


@needs_docker
def test_revsort_label_annotations(tmp_path: Path) -> None:
    """Affirm that file format annotations in the input object make it into CWLProv."""
    base_path = cwltool(
        tmp_path,
        get_data("tests/wf/revsort.cwl"),
        get_data("tests/wf/revsort-job.json"),
    )
    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
    arcp_root = find_arcp(base_path)
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    mime_having_objects = list(g.subjects(SCHEMA.encodingFormat))
    assert len(mime_having_objects) == 2
    for obj in mime_having_objects:
        assert (
            cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value
            == "https://www.iana.org/assignments/media-types/text/plain"
        )


def test_advanced_prov_annotations(tmp_path: Path) -> None:
    """Pass through of advanced input annotations."""
    base_path = cwltool(
        tmp_path,
        get_data("tests/wf/adv_prov/niaa_wf.cwl"),
        get_data("tests/wf/adv_prov/niaa_wf_job.yml"),
    )
    prov_file = base_path / "metadata" / "provenance" / "primary.cwlprov.nt"
    arcp_root = find_arcp(base_path)
    g = Graph()
    with open(prov_file, "rb") as f:
        g.parse(file=f, format="nt", publicID=arcp_root)
    mime_having_objects = list(g.subjects(SCHEMA.encodingFormat))
    assert len(mime_having_objects) == 8
    # for obj in mime_having_objects:
    #     assert (
    #         cast(Literal, list(g.objects(obj, SCHEMA.encodingFormat))[0]).value
    #         == "https://www.iana.org/assignments/media-types/text/plain"
    #     )
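Beyond encodingFormat, a check of the nested annotations could be added along the same lines. A sketch (not part of the diff; the file path is illustrative, the http://schema.org/ namespace follows cwltool's provenance constants, and the author predicate assumes the job file carries such an annotation):

from rdflib import Graph, Namespace

SCHEMA = Namespace("http://schema.org/")

g = Graph()
g.parse("primary.cwlprov.nt", format="nt")

# Mapping-valued annotations become UUID-identified entities, so a nested
# author name sits one predicate hop away from the annotated file entity.
for file_entity, author in g.subject_objects(SCHEMA.author):
    for name in g.objects(author, SCHEMA.name):
        print(file_entity, "has author with name", name)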


@needs_docker
def test_revsort_workflow_shortcut(tmp_path: Path) -> None:
    """Confirm that using 'cwl:tool' shortcut still snapshots the CWL files."""
Empty file. (5 new empty files; file names not captured in this view)
186 changes: 186 additions & 0 deletions tests/wf/adv_prov/niaa_wf.cwl
@@ -0,0 +1,186 @@
#!/usr/bin/env cwl-runner

cwlVersion: v1.2
class: Workflow

intent: [ edam:operation_2423 ]  # Prediction operation
doc: "This mock workflow calculates input features and labels which are used to train a deep learning model for epitope prediction."

requirements:
  ScatterFeatureRequirement: {}
  StepInputExpressionRequirement: {}
  SubworkflowFeatureRequirement: {}

inputs:
  sabdab_summary:
    type: File
    format: iana:text/tab-separated-values
    doc: "SAbDab summary metadata about all structures in the database."
  biodl_train_dataset:
    type: File
    format: iana:text/csv
    doc: "BioDL training dataset containing PPI interactions."
  biodl_test_dataset:
    type: File
    doc: "BioDL test dataset with PPI interactions."
  hhblits_db:
    type: Directory
    doc: "Reference database for HHblits."
  hhblits_db_name:
    type: string
    doc: "Name of the HHblits reference database."
  pdb_search_api_query:
    type: File
    format: iana:application/json
    doc: "Structured query for the PDB search API."

outputs:
  model_output:
    type: File
    outputSource: train_epitope_prediction_model/train_log
    doc: "Output of the prediction model."

steps:
  run_pdb_query:
    in:
      pdb_search_query: pdb_search_api_query
    out:
      [ processed_response ]
    run: ./tools/pdb_query.cwl
    doc: |
      Use the PDB search API to run a query on the Protein Data Bank. Returns a .txt file with comma-separated PDB IDs which satisfy the query requirements.
      See https://search.rcsb.org/index.html#search-api for a tutorial.

  download_pdb_files:
    in:
      input_file: run_pdb_query/processed_response
      mmcif_format: { default: True }
      pdb_format: { default: True }
    out:
      [ pdb_files ]
    run: ./tools/pdb_batch_download.cwl

  decompress_pdb_files:
    in:
      pdb_archives: download_pdb_files/pdb_files
    out: [ cifs, pdbs ]
    run: ./tools/decompress.cwl
    doc: "Decompress files using gzip."

  generate_dssp_labels:
    in:
      pdb_files: decompress_pdb_files/pdbs  # change this later
      rsa_cutoff: { default: 0.06 }
    out: [ dssp_output_files ]
    run: ./tools/dssp.cwl
    doc: "Use DSSP to extract secondary structure and solvent accessibility from PDB files."

  generate_ppi_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      train_dataset: biodl_train_dataset
      test_dataset: biodl_test_dataset
    out: [ ppi_fasta_files ]
    run: ./tools/ppi_annotations.cwl
    doc: "Extract PPI annotations from BioDL. This step is partly emulated."

  preprocess_sabdab_data:
    doc: "Extract antigen chains from the SAbDab summary file."
    in:
      sabdab_summary: sabdab_summary
    out: [ processed_summary ]
    run: ./tools/process_sabdab.cwl

  generate_epitope_labels:
    in:
      mmcif_files: decompress_pdb_files/cifs
      sabdab_processed: preprocess_sabdab_data/processed_summary
    out: [ epitope_fasta_dir ]
    run: ./tools/epitope_annotations.cwl
    doc: "Extract epitope annotations from PDB files."

  combine_labels:
    doc: "Combine labels into one file per protein sequence."
    run: ./tools/combine_labels.cwl
    in:
      epitope_directory: generate_epitope_labels/epitope_fasta_dir
      ppi_directory: generate_ppi_labels/ppi_fasta_files
      dssp_directory: generate_dssp_labels/dssp_output_files
    out: [ labels_combined ]

  generate_pc7:
    doc: Calculate PC7 features for each residue in each protein sequence.
    run: ./tools/pc7_inputs.cwl  # to do: adapt tool so it takes a directory of FASTA files as input
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ pc7_features ]

  generate_psp19:
    label: Calculate PSP19 features for each residue in each protein sequence.
    run: ./tools/psp19_inputs.cwl
    in:
      fasta: generate_ppi_labels/ppi_fasta_files
    out: [ psp19_features ]

  generate_hhm:
    in:
      query_sequences:
        source: generate_ppi_labels/ppi_fasta_files  # type Directory
        valueFrom: $(self.listing)  # here type Directory is converted to a File array
      hhblits_db: hhblits_db
      hhblits_db_name: hhblits_db_name
      hhblits_n_iterations: { default: 1 }
    out: [ hhm_file_array ]
    run:
      class: Workflow  # subworkflow as a workaround, because generate_ppi_labels/ppi_fasta_files is a Directory while run_hhblits takes a File
      inputs:
        query_sequences: File[]
        hhblits_db: Directory
        hhblits_db_name: string
        hhblits_n_iterations: int
      outputs:
        hhm_file_array:
          type: File[]
          outputSource: run_hhblits/hhm
      steps:
        run_hhblits:
          in:
            protein_query_sequence: query_sequences
            database: hhblits_db
            database_name: hhblits_db_name
            n_iterations: hhblits_n_iterations
          out: [ hhm ]
          scatter: protein_query_sequence
          run: ./tools/hhm_inputs_scatter.cwl

  combine_features:
    in:
      input_sequences: generate_ppi_labels/ppi_fasta_files
      pc7_features: generate_pc7/pc7_features
      psp19_features: generate_psp19/psp19_features
      hhm_features: generate_hhm/hhm_file_array  # File array; combine_features.cwl converts it to a directory
    out: [ combined_features ]
    run: ./tools/combine_features.cwl

  train_epitope_prediction_model:  # this step incorporates both training and prediction; not sure if this is the case in the real workflow
    in:  # in the real workflow, the configuration file would be generated as part of the workflow as well
      input_features: combine_features/combined_features
      input_labels: combine_labels/labels_combined
    out: [ train_log ]
    run: ./tools/train_epitope_model.cwl
    doc: "Predict epitope residues using a multi-task learning approach. This step is not real yet."

$namespaces:
  iana: "https://www.iana.org/assignments/media-types/"
  s: "https://schema.org/"
  edam: "http://edamontology.org/"
  cwlprov: "https://w3id.org/cwl/prov#"

$schemas:
  - https://schema.org/version/latest/schemaorg-current-https.rdf
  - https://edamontology.org/EDAM_1.25.owl

s:author:
  - s:name: "Renske de Wit"
    s:identifier: https://orcid.org/0000-0003-0902-0086
  - s:name: "Katharina Waury"
s:license: https://spdx.org/licenses/Apache-2.0
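The matching job file, tests/wf/adv_prov/niaa_wf_job.yml, is not shown in this diff. As an illustration only (not the real job file), an annotated File entry in a job order reaches declare_file() above as a mapping of roughly this shape once cwltool expands the s: prefix to full schema.org URIs:

# Hypothetical job-order entry; every key below other than class, location
# and format is a schema.org annotation that the new code propagates into
# the primary.cwlprov files.
annotated_input = {
    "class": "File",
    "location": "data/sabdab_summary.tsv",
    "format": "https://www.iana.org/assignments/media-types/text/tab-separated-values",
    "https://schema.org/additionalType": ["https://schema.org/Dataset"],
    "https://schema.org/creator": {
        "https://schema.org/name": "Example Author",  # illustrative value
    },
}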