Skip to content

Commit

Permalink
[ENH] Implemented pipeline_version and pipeline_name query fields (
Browse files Browse the repository at this point in the history
…#345)

* Added query parameters for pipeline name and version

* Updated query model

* Updated SPARQL query template

Added nipoppy namespace

* Updated crud.get function

* Updated `query_string` in `create_query` function

* Updated the `default_neurobagel_query`

* Refactored `query_string` in `create_query` function

* Updated test fixtures

* Added tests for `pipeline_version` and `pipeline_name`

* Updated `np` namespace

* Refactored the pipeline information representation in the response

* Updated test fixture

* Fixed the test

* Fixed the pipeline info at the subject level

* Addressed requested changes

* Leftover cleanup

* Addressed second wave of requested changes
  • Loading branch information
rmanaem authored Oct 9, 2024
1 parent 38b58f7 commit a93d476
Show file tree
Hide file tree
Showing 7 changed files with 255 additions and 5 deletions.
64 changes: 64 additions & 0 deletions app/api/crud.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
"dataset_uuid",
"dataset_name",
"dataset_portal_uri",
"pipeline_version",
"pipeline_name",
]


Expand Down Expand Up @@ -100,6 +102,8 @@ async def get(
min_num_phenotypic_sessions: int,
assessment: str,
image_modal: str,
pipeline_name: str,
pipeline_version: str,
) -> list[CohortQueryResponse]:
"""
Sends SPARQL queries to the graph API via httpx POST requests for subject-session or dataset metadata
Expand All @@ -125,6 +129,10 @@ async def get(
Non-imaging assessment completed by subjects.
image_modal : str
Imaging modality of subject scans.
pipeline_name : str
Name of pipeline run on subject scans.
pipeline_version : str
Version of pipeline run on subject scans.
Returns
-------
Expand All @@ -142,6 +150,8 @@ async def get(
min_num_imaging_sessions=min_num_imaging_sessions,
assessment=assessment,
image_modal=image_modal,
pipeline_version=pipeline_version,
pipeline_name=pipeline_name,
)
)

Expand All @@ -158,6 +168,7 @@ async def get(

response_obj = []
dataset_cols = ["dataset_uuid", "dataset_name"]
dataset_available_pipeline_info = {}
if not results_df.empty:
for (dataset_uuid, dataset_name), group in results_df.groupby(
by=dataset_cols
Expand Down Expand Up @@ -189,6 +200,50 @@ async def get(
)
)

pipeline_grouped_data = (
group.groupby(
[
"sub_id",
"session_id",
"session_type",
"pipeline_name",
],
dropna=True,
)
.agg(
{
"pipeline_version": lambda x: list(
x.dropna().unique()
)
}
)
.reset_index()
)

session_completed_pipeline_data = (
pipeline_grouped_data.groupby(
["sub_id", "session_id", "session_type"]
)
.apply(
lambda x: dict(
zip(x["pipeline_name"], x["pipeline_version"])
)
)
.reset_index(name="completed_pipelines")
)

subject_data = pd.merge(
subject_data.reset_index(drop=True),
session_completed_pipeline_data,
on=["sub_id", "session_id", "session_type"],
how="left",
)

# ensure that for sessions missing completed pipeline info, completed_pipelines is still a dict rather than null/nan
subject_data["completed_pipelines"] = subject_data[
"completed_pipelines"
].apply(lambda x: x if isinstance(x, dict) else {})

# TODO: Revisit this as there may be a more elegant solution.
# The following code replaces columns with all NaN values with values of None, to ensure they show up in the final JSON as `null`.
# This is needed as the above .agg() seems to turn NaN into None for object-type columns (which have some non-missing values)
Expand All @@ -204,6 +259,14 @@ async def get(

subject_data = list(subject_data.to_dict("records"))

dataset_available_pipeline_info = (
group.groupby("pipeline_name", dropna=True)[
"pipeline_version"
]
.apply(lambda x: list(x.dropna().unique()))
.to_dict()
)

response_obj.append(
CohortQueryResponse(
dataset_uuid=dataset_uuid,
Expand All @@ -224,6 +287,7 @@ async def get(
group["image_modal"].notna()
].unique()
),
available_pipelines=dataset_available_pipeline_info,
)
)

Expand Down
6 changes: 6 additions & 0 deletions app/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from pydantic import BaseModel, constr, root_validator

CONTROLLED_TERM_REGEX = r"^[a-zA-Z]+[:]\S+$"
VERSION_REGEX = r"^([A-Za-z0-9-]+)\.(\d+)\.([A-Za-z0-9-]+)$"


class QueryModel(BaseModel):
Expand All @@ -22,6 +23,9 @@ class QueryModel(BaseModel):
min_num_phenotypic_sessions: int = Query(default=None, ge=0)
assessment: constr(regex=CONTROLLED_TERM_REGEX) = None
image_modal: constr(regex=CONTROLLED_TERM_REGEX) = None
pipeline_name: constr(regex=CONTROLLED_TERM_REGEX) = None
# TODO: Check back if validating using a regex is too restrictive
pipeline_version: constr(regex=VERSION_REGEX) = None

@root_validator()
def check_maxage_ge_minage(cls, values):
Expand Down Expand Up @@ -67,6 +71,7 @@ class SessionResponse(BaseModel):
assessment: list
image_modal: list
session_file_path: Optional[str]
completed_pipelines: dict


class CohortQueryResponse(BaseModel):
Expand All @@ -81,6 +86,7 @@ class CohortQueryResponse(BaseModel):
num_matching_subjects: int
subject_data: Union[list[SessionResponse], str]
image_modals: list
available_pipelines: dict


class DataElementURI(str, Enum):
Expand Down
2 changes: 2 additions & 0 deletions app/api/routers/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ async def get_query(
query.min_num_phenotypic_sessions,
query.assessment,
query.image_modal,
query.pipeline_name,
query.pipeline_version,
)

return response
48 changes: 44 additions & 4 deletions app/api/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"ncit": "http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl#",
"nidm": "http://purl.org/nidash/nidm#",
"snomed": "http://purl.bioontology.org/ontology/SNOMEDCT/",
"np": "https://github.com/nipoppy/pipeline-catalog/tree/main/processing/",
}

# Store domains in named tuples
Expand All @@ -61,6 +62,8 @@
IS_CONTROL = Domain("subject_group", "nb:isSubjectGroup")
ASSESSMENT = Domain("assessment", "nb:hasAssessment")
IMAGE_MODAL = Domain("image_modal", "nb:hasContrastType")
PIPELINE_NAME = Domain("pipeline_name", "nb:hasPipelineName")
PIPELINE_VERSION = Domain("pipeline_version", "nb:hasPipelineVersion")
PROJECT = Domain("project", "nb:hasSamples")


Expand Down Expand Up @@ -115,6 +118,8 @@ def create_query(
min_num_phenotypic_sessions: Optional[int] = None,
assessment: Optional[str] = None,
image_modal: Optional[str] = None,
pipeline_name: Optional[str] = None,
pipeline_version: Optional[str] = None,
) -> str:
"""
Creates a SPARQL query using a query template and filters it using the input parameters.
Expand All @@ -139,6 +144,10 @@ def create_query(
Non-imaging assessment completed by subjects, by default None.
image_modal : str, optional
Imaging modality of subject scans, by default None.
pipeline_name : str, optional
Name of pipeline run on subject scans, by default None.
pipeline_version : str, optional
Version of pipeline run on subject scans, by default None.
Returns
-------
Expand Down Expand Up @@ -203,13 +212,28 @@ def create_query(
imaging_session_level_filters = ""
if image_modal is not None:
imaging_session_level_filters += (
"\n" + f"FILTER (?{IMAGE_MODAL.var} = {image_modal})."
"\n"
+ f"{create_bound_filter(IMAGE_MODAL.var)} && ?{IMAGE_MODAL.var} = {image_modal})."
)

if pipeline_name is not None:
imaging_session_level_filters += (
"\n"
+ f"{create_bound_filter(PIPELINE_NAME.var)} && (?{PIPELINE_NAME.var} = {pipeline_name})."
)

    # Also applies when a user specified the pipeline version but not the name
if pipeline_version is not None:
imaging_session_level_filters += (
"\n"
            + f'{create_bound_filter(PIPELINE_VERSION.var)} && ?{PIPELINE_VERSION.var} = "{pipeline_version}").'  # Wrap the version in quotes to avoid implicit type conversion in the graph
)

query_string = textwrap.dedent(
f"""
SELECT DISTINCT ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?age ?sex
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions
?session_id ?session_type ?assessment ?image_modal ?session_file_path ?pipeline_version ?pipeline_name
WHERE {{
?dataset_uuid a nb:Dataset;
nb:hasLabel ?dataset_name;
Expand Down Expand Up @@ -244,14 +268,30 @@ def create_query(
{phenotypic_session_level_filters}
}} GROUP BY ?subject
}}
OPTIONAL {{
?session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version.
?pipeline nb:hasPipelineName ?pipeline_name.
}}
{{
SELECT ?subject (count(distinct ?imaging_session) as ?num_matching_imaging_sessions)
WHERE {{
?subject a nb:Subject.
OPTIONAL {{
?subject nb:hasSession ?imaging_session.
?imaging_session a nb:ImagingSession;
nb:hasAcquisition/nb:hasContrastType ?image_modal.
?imaging_session a nb:ImagingSession.
OPTIONAL {{
?imaging_session nb:hasAcquisition ?acquisition.
?acquisition nb:hasContrastType ?image_modal.
}}
OPTIONAL {{
?imaging_session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version;
nb:hasPipelineName ?pipeline_name.
}}
}}
{imaging_session_level_filters}
}} GROUP BY ?subject
Expand Down
12 changes: 11 additions & 1 deletion docs/default_neurobagel_query.rq
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ PREFIX nidm: <http://purl.org/nidash/nidm#>
PREFIX snomed: <http://purl.bioontology.org/ontology/SNOMEDCT/>

SELECT DISTINCT ?dataset_uuid ?dataset_name ?dataset_portal_uri ?sub_id ?age ?sex
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path
?diagnosis ?subject_group ?num_matching_phenotypic_sessions ?num_matching_imaging_sessions ?session_id ?session_type ?assessment ?image_modal ?session_file_path ?pipeline_name ?pipeline_version
WHERE {
?dataset_uuid a nb:Dataset;
nb:hasLabel ?dataset_name;
Expand Down Expand Up @@ -41,6 +41,11 @@ WHERE {

} GROUP BY ?subject
}
OPTIONAL {
?session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version.
?pipeline nb:hasPipelineName ?pipeline_name.
}
{
SELECT ?subject (count(distinct ?imaging_session) as ?num_matching_imaging_sessions)
WHERE {
Expand All @@ -50,6 +55,11 @@ WHERE {
?imaging_session a nb:ImagingSession;
nb:hasAcquisition/nb:hasContrastType ?image_modal.
}
OPTIONAL {
?imaging_session nb:hasCompletedPipeline ?pipeline.
?pipeline nb:hasPipelineVersion ?pipeline_version.
?pipeline nb:hasPipelineName ?pipeline_name.
}

} GROUP BY ?subject
}
Expand Down
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,9 @@ def test_data():
"http://purl.org/nidash/nidm#T1Weighted",
"http://purl.org/nidash/nidm#T2Weighted",
],
"available_pipelines": {
"freesurfer": ["7.3.2", "2.8.2", "8.7.0-rc"]
},
},
{
"dataset_uuid": "http://neurobagel.org/vocab/67890",
Expand All @@ -86,6 +89,10 @@ def test_data():
"http://purl.org/nidash/nidm#FlowWeighted",
"http://purl.org/nidash/nidm#T1Weighted",
],
"available_pipelines": {
"freesurfer": ["7.3.2", "2.1.2"],
"fmriprep": ["23.1.3", "22.1.4", "v2.0.1"],
},
},
]

Expand Down Expand Up @@ -178,6 +185,8 @@ async def _mock_get_with_exception(
min_num_phenotypic_sessions,
assessment,
image_modal,
pipeline_version,
pipeline_name,
):
raise request.param

Expand Down Expand Up @@ -206,6 +215,8 @@ async def _mock_get(
min_num_phenotypic_sessions,
assessment,
image_modal,
pipeline_version,
pipeline_name,
):
return request.param

Expand All @@ -226,6 +237,8 @@ async def _mock_successful_get(
min_num_phenotypic_sessions,
assessment,
image_modal,
pipeline_version,
pipeline_name,
):
return test_data

Expand Down
Loading

0 comments on commit a93d476

Please sign in to comment.