[MODEL] Update data model to include longitudinal pheno data when ses…

…sions exist in TSV (#250) * add + test example inputs with no session ID * Our first attempt at handling longitudinal pheno * Refactored tests for phenotypic sessions Co-authored-by: Alyssa Dai <[email protected]> Co-authored-by: sam-gregz <[email protected]> Co-authored-by: Renee He <[email protected]> Co-authored-by: nagatv11 <[email protected]> * Update neurobagel examples Co-authored-by: Alyssa Dai <[email protected]> Co-authored-by: sam-gregz <[email protected]> Co-authored-by: Renee He <[email protected]> Co-authored-by: nagatv11 <[email protected]> * Fix comments Co-authored-by: Sebastian Urchs <[email protected]> --------- Co-authored-by: Sebastian Urchs <[email protected]> Co-authored-by: sam-gregz <[email protected]> Co-authored-by: Renee He <[email protected]> Co-authored-by: nagatv11 <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Sebastian Urchs <[email protected]>
neurobagel · Dec 12, 2023 · e06a440 · e06a440
1 parent a8369a6
commit e06a440
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 66 deletions.
diff --git a/bagel/cli.py b/bagel/cli.py
@@ -92,54 +92,69 @@ def pheno(
 
     # TODO: needs refactoring once we handle multiple participant IDs
     participants = column_mapping.get("participant")[0]
+    # TODO: handle if no session_ID column exists
+    session_column = column_mapping.get("session")[0]
 
     for participant in pheno_df[participants].unique():
         # TODO: needs refactoring once we handle phenotypic information at the session level
         # for the moment we are not creating any session instances in the phenotypic graph
         # we treat the phenotypic information in the first row of the _sub_pheno dataframe
         # as reflecting the subject level phenotypic information
-        _sub_pheno = pheno_df.query(
-            f"{participants} == '{str(participant)}'"
-        ).iloc[0]
-
-        subject = models.Subject(hasLabel=str(participant))
-        if "sex" in column_mapping.keys():
-            _sex_val = putil.get_transformed_values(
-                column_mapping["sex"], _sub_pheno, data_dictionary
-            )
-            if _sex_val:
-                subject.hasSex = models.Sex(identifier=_sex_val)
+        _sub_pheno = pheno_df.query(f"{participants} == '{str(participant)}'")
 
-        if "diagnosis" in column_mapping.keys():
-            _dx_val = putil.get_transformed_values(
-                column_mapping["diagnosis"], _sub_pheno, data_dictionary
-            )
-            if _dx_val is None:
-                pass
-            elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
-                subject.isSubjectGroup = models.SubjectGroup(
-                    identifier=mappings.NEUROBAGEL["healthy_control"],
-                )
-            else:
-                subject.hasDiagnosis = [models.Diagnosis(identifier=_dx_val)]
+        # TODO ensure we don't have duplicates in the session ID
+        session_names = _sub_pheno[session_column].unique()
 
-        if "age" in column_mapping.keys():
-            subject.hasAge = putil.get_transformed_values(
-                column_mapping["age"], _sub_pheno, data_dictionary
-            )
+        sessions = []
+        for session_name in session_names:
+            session = models.PhenotypicSession(hasLabel=str(session_name))
+            _ses_pheno = _sub_pheno.query(
+                f"{session_column} == '{str(session_name)}'"
+            ).iloc[0]
 
-        if tool_mapping:
-            _assessments = [
-                models.Assessment(identifier=tool)
-                for tool, columns in tool_mapping.items()
-                if putil.are_any_available(
-                    columns, _sub_pheno, data_dictionary
+            if "sex" in column_mapping.keys():
+                _sex_val = putil.get_transformed_values(
+                    column_mapping["sex"], _ses_pheno, data_dictionary
+                )
+                if _sex_val:
+                    session.hasSex = models.Sex(identifier=_sex_val)
+
+            if "diagnosis" in column_mapping.keys():
+                _dx_val = putil.get_transformed_values(
+                    column_mapping["diagnosis"], _ses_pheno, data_dictionary
+                )
+                if _dx_val is None:
+                    pass
+                elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
+                    session.isSubjectGroup = models.SubjectGroup(
+                        identifier=mappings.NEUROBAGEL["healthy_control"],
+                    )
+                else:
+                    session.hasDiagnosis = [
+                        models.Diagnosis(identifier=_dx_val)
+                    ]
+
+            if "age" in column_mapping.keys():
+                session.hasAge = putil.get_transformed_values(
+                    column_mapping["age"], _ses_pheno, data_dictionary
                 )
-            ]
-            if _assessments:
-                # Only set assignments for the subject if at least one has a non-missing item
-                subject.hasAssessment = _assessments
 
+            if tool_mapping:
+                _assessments = [
+                    models.Assessment(identifier=tool)
+                    for tool, columns in tool_mapping.items()
+                    if putil.are_any_available(
+                        columns, _ses_pheno, data_dictionary
+                    )
+                ]
+                if _assessments:
+                    # Only set assessments for the session if at least one has a non-missing item
+                    session.hasAssessment = _assessments
+            sessions.append(session)
+
+        subject = models.Subject(
+            hasLabel=str(participant), hasSession=sessions
+        )
         subject_list.append(subject)
 
     dataset = models.Dataset(
@@ -277,14 +292,14 @@ def bids(
             # TODO: needs refactoring once we also handle phenotypic information at the session level
             session_list.append(
                 # Add back "ses" prefix because pybids stripped it
-                models.Session(
+                models.ImagingSession(
                     hasLabel="ses-" + session_label,
                     hasFilePath=session_path,
                     hasAcquisition=image_list,
                 )
             )
 
-        pheno_subject.hasSession = session_list
+        pheno_subject.hasSession += session_list
 
     merged_dataset = {**context, **pheno_dataset.dict(exclude_none=True)}
 

diff --git a/bagel/models.py b/bagel/models.py
@@ -51,19 +51,26 @@ class Acquisition(Bagel):
 
 class Session(Bagel):
     hasLabel: str
-    hasFilePath: Optional[str] = None
-    hasAcquisition: List[Acquisition]
-    schemaKey: Literal["Session"] = "Session"
 
 
-class Subject(Bagel):
-    hasLabel: str
-    hasSession: Optional[List[Session]] = None
+class PhenotypicSession(Session):
+    """Session holding the phenotypic attributes recorded for a subject."""
+
     hasAge: Optional[float] = None
     hasSex: Optional[Sex] = None
     isSubjectGroup: Optional[SubjectGroup] = None
     hasDiagnosis: Optional[List[Diagnosis]] = None
     hasAssessment: Optional[List[Assessment]] = None
+    # Literal-typed like Subject.schemaKey so only this exact tag validates
+    schemaKey: Literal["PhenotypicSession"] = "PhenotypicSession"
+
+
+class ImagingSession(Session):
+    """Session holding imaging data: a file path and its acquisitions."""
+
+    hasFilePath: Optional[str] = None
+    hasAcquisition: List[Acquisition]
+    # Literal-typed like Subject.schemaKey so only this exact tag validates
+    schemaKey: Literal["ImagingSession"] = "ImagingSession"
+
+
+class Subject(Bagel):
+    hasLabel: str
+    hasSession: List[Union[PhenotypicSession, ImagingSession]]
     schemaKey: Literal["Subject"] = "Subject"
 
 

diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md
@@ -21,6 +21,7 @@ Example inputs to the CLI
 | 14           | Valid, same as example 2                                                        | Valid, based on example 2, but with an extra column annotation without Neurobagel    | Pass               |
 | 15           | Valid, same as example 2                                                        | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations  | Fail               |
 | 16           | Invalid, same as example2.csv, but with a sneaky .tsv file ending               | Valid, same as example2                                                              | fail               |
+| 17           | Same as example 2 TSV, but without session_id column                            | Same as example 2 JSON, but without session_id column                                | Pass               |
 
 `* this is expected to fail until we enable multiple participant_ID handling`.
 

diff --git a/bagel/tests/data/example17.json b/bagel/tests/data/example17.json
@@ -0,0 +1,71 @@
+{
+  "participant_id": {
+    "Description": "A participant ID",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:ParticipantID",
+        "Label": "Unique participant identifier"
+      },
+      "Identifies": "participant"
+    }
+  },
+  "group": {
+    "Description": "Group variable",
+    "Levels": {
+      "PAT": "Patient",
+      "CTRL": "Control subject"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Diagnosis",
+        "Label": "Diagnosis"
+      },
+      "Levels": {
+        "PAT": {
+          "TermURL": "snomed:49049000",
+          "Label": "Parkinson's disease"
+        },
+        "CTRL": {
+          "TermURL": "ncit:C94342",
+          "Label": "Healthy Control"
+        }
+      }
+    }
+  },
+  "sex": {
+    "Description": "Sex variable",
+    "Levels": {
+      "M": "Male",
+      "F": "Female"
+    },
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Sex",
+        "Label": "Sex"
+      },
+      "Levels": {
+        "M": {
+          "TermURL": "snomed:248153007",
+          "Label": "Male"
+        },
+        "F": {
+          "TermURL": "snomed:248152002",
+          "Label": "Female"
+        }
+      }
+    }
+  },
+  "participant_age": {
+    "Description": "Age of the participant",
+    "Annotations": {
+      "IsAbout": {
+        "TermURL": "nb:Age",
+        "Label": "Chronological age"
+      },
+      "Transformation": {
+        "TermURL": "nb:FromISO8601",
+        "Label": "A period of time defined according to the ISO8601 standard"
+      }
+    }
+  }
+}
diff --git a/bagel/tests/data/example17.tsv b/bagel/tests/data/example17.tsv
@@ -0,0 +1,5 @@
+participant_id	group	sex	participant_age
+sub-01	PAT	M	P20Y6M
+sub-01	PAT	M	P20Y8M
+sub-02	CTRL	F	P25Y8M
+sub-02	CTRL	F	P26Y4M
diff --git a/bagel/tests/test_cli_bids.py b/bagel/tests/test_cli_bids.py
@@ -59,9 +59,19 @@ def test_bids_sessions_have_correct_labels(
 
     pheno_bids = load_test_json(default_pheno_bids_output_path)
     for sub in pheno_bids["hasSamples"]:
-        assert ["ses-01", "ses-02"] == [
-            ses["hasLabel"] for ses in sub["hasSession"]
+        assert 4 == len(sub["hasSession"])
+
+        imaging_session = [
+            ses
+            for ses in sub["hasSession"]
+            if ses["schemaKey"] == "ImagingSession"
         ]
+        assert 2 == len(imaging_session)
+
+        # We also need to make sure that we do not have duplicate imaging session labels
+        assert set(["ses-01", "ses-02"]) == set(
+            [ses["hasLabel"] for ses in imaging_session]
+        )
 
 
 def test_bids_data_with_sessions_have_correct_paths(
@@ -89,8 +99,15 @@ def test_bids_data_with_sessions_have_correct_paths(
 
     pheno_bids = load_test_json(default_pheno_bids_output_path)
     for sub in pheno_bids["hasSamples"]:
-        for ses in sub["hasSession"]:
-            assert sub["hasLabel"] in ses["hasFilePath"]
-            assert ses["hasLabel"] in ses["hasFilePath"]
-            assert Path(ses["hasFilePath"]).is_absolute()
-            assert Path(ses["hasFilePath"]).is_dir()
+        for imaging_session in [
+            ses
+            for ses in sub["hasSession"]
+            if ses["schemaKey"] == "ImagingSession"
+        ]:
+            # Only imaging sessions define hasFilePath, so check those alone
+            assert sub["hasLabel"] in imaging_session["hasFilePath"]
+            assert (
+                imaging_session["hasLabel"] in imaging_session["hasFilePath"]
+            )
+            assert Path(imaging_session["hasFilePath"]).is_absolute()
+            assert Path(imaging_session["hasFilePath"]).is_dir()
diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py
@@ -357,13 +357,16 @@ def test_diagnosis_and_control_status_handled(
     pheno = load_test_json(default_pheno_output_path)
 
     assert (
-        pheno["hasSamples"][0]["hasDiagnosis"][0]["identifier"]
+        pheno["hasSamples"][0]["hasSession"][0]["hasDiagnosis"][0][
+            "identifier"
+        ]
         == "snomed:49049000"
     )
-    assert "hasDiagnosis" not in pheno["hasSamples"][1].keys()
-    assert "hasDiagnosis" not in pheno["hasSamples"][2].keys()
+    assert "hasDiagnosis" not in pheno["hasSamples"][1]["hasSession"][0].keys()
+    assert "hasDiagnosis" not in pheno["hasSamples"][2]["hasSession"][0].keys()
     assert (
-        pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "ncit:C94342"
+        pheno["hasSamples"][2]["hasSession"][0]["isSubjectGroup"]["identifier"]
+        == "ncit:C94342"
     )
 
 
@@ -395,13 +398,14 @@ def test_controlled_terms_have_identifiers(
     pheno = load_test_json(default_pheno_output_path)
 
     for sub in pheno["hasSamples"]:
-        if attribute in sub.keys():
-            value = sub.get(attribute)
-            if not isinstance(value, list):
-                value = [value]
-            assert all(
-                ["identifier" in entry for entry in value]
-            ), f"{attribute}: did not have an identifier for subject {sub} and value {value}"
+        for ses in sub["hasSession"]:
+            if attribute in ses.keys():
+                value = ses.get(attribute)
+                if not isinstance(value, list):
+                    value = [value]
+                assert all(
+                    ["identifier" in entry for entry in value]
+                ), f"{attribute}: did not have an identifier for subject {sub} and value {value}"
 
 
 def test_controlled_term_classes_have_uri_type(
@@ -479,7 +483,9 @@ def test_assessment_data_are_parsed_correctly(
 
     pheno = load_test_json(default_pheno_output_path)
 
-    assert assessment == pheno["hasSamples"][subject_idx].get("hasAssessment")
+    assert assessment == pheno["hasSamples"][subject_idx]["hasSession"][0].get(
+        "hasAssessment"
+    )
 
 
 @pytest.mark.parametrize(
@@ -511,7 +517,9 @@ def test_cli_age_is_processed(
 
     pheno = load_test_json(default_pheno_output_path)
 
-    assert expected_age == pheno["hasSamples"][subject]["hasAge"]
+    assert (
+        expected_age == pheno["hasSamples"][subject]["hasSession"][0]["hasAge"]
+    )
 
 
 def test_output_includes_context(

diff --git a/neurobagel_examples b/neurobagel_examples
+232 −73		data-upload/example_synthetic.jsonld
+303 −154		data-upload/pheno-bids-output/example_synthetic_pheno-bids.jsonld