Skip to content

Commit

Permalink
[MODEL] Update data model to include longitudinal pheno data when ses…
Browse files Browse the repository at this point in the history
…sions exist in TSV (#250)

* add + test example inputs with no session ID

* Our first attempt at handling longitudinal pheno

* Refactored tests for phenotypic sessions

Co-authored-by: Alyssa Dai <[email protected]>
Co-authored-by: sam-gregz <[email protected]>
Co-authored-by: Renee He <[email protected]>
Co-authored-by: nagatv11 <[email protected]>

* Update neurobagel examples

Co-authored-by: Alyssa Dai <[email protected]>
Co-authored-by: sam-gregz <[email protected]>
Co-authored-by: Renee He <[email protected]>
Co-authored-by: nagatv11 <[email protected]>

* Fix comments

Co-authored-by: Sebastian Urchs <[email protected]>

---------

Co-authored-by: Sebastian Urchs <[email protected]>
Co-authored-by: sam-gregz <[email protected]>
Co-authored-by: Renee He <[email protected]>
Co-authored-by: nagatv11 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Sebastian Urchs <[email protected]>
  • Loading branch information
7 people authored Dec 12, 2023
1 parent a8369a6 commit e06a440
Show file tree
Hide file tree
Showing 8 changed files with 190 additions and 66 deletions.
93 changes: 54 additions & 39 deletions bagel/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,54 +92,69 @@ def pheno(

# TODO: needs refactoring once we handle multiple participant IDs
participants = column_mapping.get("participant")[0]
# TODO: handle if no session_ID column exists
session_column = column_mapping.get("session")[0]

for participant in pheno_df[participants].unique():
# TODO: needs refactoring once we handle phenotypic information at the session level
# for the moment we are not creating any session instances in the phenotypic graph
# we treat the phenotypic information in the first row of the _sub_pheno dataframe
# as reflecting the subject level phenotypic information
_sub_pheno = pheno_df.query(
f"{participants} == '{str(participant)}'"
).iloc[0]

subject = models.Subject(hasLabel=str(participant))
if "sex" in column_mapping.keys():
_sex_val = putil.get_transformed_values(
column_mapping["sex"], _sub_pheno, data_dictionary
)
if _sex_val:
subject.hasSex = models.Sex(identifier=_sex_val)
_sub_pheno = pheno_df.query(f"{participants} == '{str(participant)}'")

if "diagnosis" in column_mapping.keys():
_dx_val = putil.get_transformed_values(
column_mapping["diagnosis"], _sub_pheno, data_dictionary
)
if _dx_val is None:
pass
elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
subject.isSubjectGroup = models.SubjectGroup(
identifier=mappings.NEUROBAGEL["healthy_control"],
)
else:
subject.hasDiagnosis = [models.Diagnosis(identifier=_dx_val)]
# TODO ensure we don't have duplicates in the session ID
session_names = _sub_pheno[session_column].unique()

if "age" in column_mapping.keys():
subject.hasAge = putil.get_transformed_values(
column_mapping["age"], _sub_pheno, data_dictionary
)
sessions = []
for session_name in session_names:
session = models.PhenotypicSession(hasLabel=str(session_name))
_ses_pheno = _sub_pheno.query(
f"{session_column} == '{str(session_name)}'"
).iloc[0]

if tool_mapping:
_assessments = [
models.Assessment(identifier=tool)
for tool, columns in tool_mapping.items()
if putil.are_any_available(
columns, _sub_pheno, data_dictionary
if "sex" in column_mapping.keys():
_sex_val = putil.get_transformed_values(
column_mapping["sex"], _ses_pheno, data_dictionary
)
if _sex_val:
session.hasSex = models.Sex(identifier=_sex_val)

if "diagnosis" in column_mapping.keys():
_dx_val = putil.get_transformed_values(
column_mapping["diagnosis"], _ses_pheno, data_dictionary
)
if _dx_val is None:
pass
elif _dx_val == mappings.NEUROBAGEL["healthy_control"]:
session.isSubjectGroup = models.SubjectGroup(
identifier=mappings.NEUROBAGEL["healthy_control"],
)
else:
session.hasDiagnosis = [
models.Diagnosis(identifier=_dx_val)
]

if "age" in column_mapping.keys():
session.hasAge = putil.get_transformed_values(
column_mapping["age"], _ses_pheno, data_dictionary
)
]
if _assessments:
# Only set assignments for the subject if at least one has a non-missing item
subject.hasAssessment = _assessments

if tool_mapping:
_assessments = [
models.Assessment(identifier=tool)
for tool, columns in tool_mapping.items()
if putil.are_any_available(
columns, _ses_pheno, data_dictionary
)
]
if _assessments:
# Only set assessments for the session if at least one has a non-missing item
session.hasAssessment = _assessments
sessions.append(session)

subject = models.Subject(
hasLabel=str(participant), hasSession=sessions
)
subject_list.append(subject)

dataset = models.Dataset(
Expand Down Expand Up @@ -277,14 +292,14 @@ def bids(
# TODO: needs refactoring once we also handle phenotypic information at the session level
session_list.append(
# Add back "ses" prefix because pybids stripped it
models.Session(
models.ImagingSession(
hasLabel="ses-" + session_label,
hasFilePath=session_path,
hasAcquisition=image_list,
)
)

pheno_subject.hasSession = session_list
pheno_subject.hasSession += session_list

merged_dataset = {**context, **pheno_dataset.dict(exclude_none=True)}

Expand Down
19 changes: 13 additions & 6 deletions bagel/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,26 @@ class Acquisition(Bagel):

class Session(Bagel):
hasLabel: str
hasFilePath: Optional[str] = None
hasAcquisition: List[Acquisition]
schemaKey: Literal["Session"] = "Session"


class Subject(Bagel):
hasLabel: str
hasSession: Optional[List[Session]] = None
class PhenotypicSession(Session):
hasAge: Optional[float] = None
hasSex: Optional[Sex] = None
isSubjectGroup: Optional[SubjectGroup] = None
hasDiagnosis: Optional[List[Diagnosis]] = None
hasAssessment: Optional[List[Assessment]] = None
schemaKey = "PhenotypicSession"


class ImagingSession(Session):
hasFilePath: Optional[str] = None
hasAcquisition: List[Acquisition]
schemaKey = "ImagingSession"


class Subject(Bagel):
hasLabel: str
hasSession: List[Union[PhenotypicSession, ImagingSession]]
schemaKey: Literal["Subject"] = "Subject"


Expand Down
1 change: 1 addition & 0 deletions bagel/tests/data/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Example inputs to the CLI
| 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass |
| 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail |
| 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail |
| 17 | Same as example 2 TSV, but without session_id column | Same as example 2 JSON, without session_id column | Pass |

`* this is expected to fail until we enable multiple participant_ID handling`.

Expand Down
71 changes: 71 additions & 0 deletions bagel/tests/data/example17.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"participant_id": {
"Description": "A participant ID",
"Annotations": {
"IsAbout": {
"TermURL": "nb:ParticipantID",
"Label": "Unique participant identifier"
},
"Identifies": "participant"
}
},
"group": {
"Description": "Group variable",
"Levels": {
"PAT": "Patient",
"CTRL": "Control subject"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Diagnosis",
"Label": "Diagnosis"
},
"Levels": {
"PAT": {
"TermURL": "snomed:49049000",
"Label": "Parkinson's disease"
},
"CTRL": {
"TermURL": "ncit:C94342",
"Label": "Healthy Control"
}
}
}
},
"sex": {
"Description": "Sex variable",
"Levels": {
"M": "Male",
"F": "Female"
},
"Annotations": {
"IsAbout": {
"TermURL": "nb:Sex",
"Label": "Sex"
},
"Levels": {
"M": {
"TermURL": "snomed:248153007",
"Label": "Male"
},
"F": {
"TermURL": "snomed:248152002",
"Label": "Female"
}
}
}
},
"participant_age": {
"Description": "Age of the participant",
"Annotations": {
"IsAbout": {
"TermURL": "nb:Age",
"Label": "Chronological age"
},
"Transformation": {
"TermURL": "nb:FromISO8601",
"Label": "A period of time defined according to the ISO8601 standard"
}
}
}
}
5 changes: 5 additions & 0 deletions bagel/tests/data/example17.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
participant_id group sex participant_age
sub-01 PAT M P20Y6M
sub-01 PAT M P20Y8M
sub-02 CTRL F P25Y8M
sub-02 CTRL F P26Y4M
31 changes: 24 additions & 7 deletions bagel/tests/test_cli_bids.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,19 @@ def test_bids_sessions_have_correct_labels(

pheno_bids = load_test_json(default_pheno_bids_output_path)
for sub in pheno_bids["hasSamples"]:
assert ["ses-01", "ses-02"] == [
ses["hasLabel"] for ses in sub["hasSession"]
assert 4 == len(sub["hasSession"])

imaging_session = [
ses
for ses in sub["hasSession"]
if ses["schemaKey"] == "ImagingSession"
]
assert 2 == len(imaging_session)

# We also need to make sure that we do not have duplicate imaging session labels
assert set(["ses-01", "ses-02"]) == set(
[ses["hasLabel"] for ses in imaging_session]
)


def test_bids_data_with_sessions_have_correct_paths(
Expand Down Expand Up @@ -89,8 +99,15 @@ def test_bids_data_with_sessions_have_correct_paths(

pheno_bids = load_test_json(default_pheno_bids_output_path)
for sub in pheno_bids["hasSamples"]:
for ses in sub["hasSession"]:
assert sub["hasLabel"] in ses["hasFilePath"]
assert ses["hasLabel"] in ses["hasFilePath"]
assert Path(ses["hasFilePath"]).is_absolute()
assert Path(ses["hasFilePath"]).is_dir()
# Only imaging sessions carry a file path, so phenotypic sessions are
# filtered out before checking paths. The filter value has to be the exact
# schemaKey emitted for imaging sessions ("ImagingSession"); any other
# spelling selects nothing and the assertions below would silently never run.
imaging_sessions = [
    ses for ses in sub["hasSession"] if ses["schemaKey"] == "ImagingSession"
]
for imaging_session in imaging_sessions:
    session_path = imaging_session["hasFilePath"]

    # The session directory must be nested under the subject and named
    # after the session label.
    assert sub["hasLabel"] in session_path
    assert imaging_session["hasLabel"] in session_path

    # The stored path must be an absolute path to an existing directory.
    assert Path(session_path).is_absolute()
    assert Path(session_path).is_dir()
34 changes: 21 additions & 13 deletions bagel/tests/test_cli_pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,13 +357,16 @@ def test_diagnosis_and_control_status_handled(
pheno = load_test_json(default_pheno_output_path)

assert (
pheno["hasSamples"][0]["hasDiagnosis"][0]["identifier"]
pheno["hasSamples"][0]["hasSession"][0]["hasDiagnosis"][0][
"identifier"
]
== "snomed:49049000"
)
assert "hasDiagnosis" not in pheno["hasSamples"][1].keys()
assert "hasDiagnosis" not in pheno["hasSamples"][2].keys()
assert "hasDiagnosis" not in pheno["hasSamples"][1]["hasSession"][0].keys()
assert "hasDiagnosis" not in pheno["hasSamples"][2]["hasSession"][0].keys()
assert (
pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "ncit:C94342"
pheno["hasSamples"][2]["hasSession"][0]["isSubjectGroup"]["identifier"]
== "ncit:C94342"
)


Expand Down Expand Up @@ -395,13 +398,14 @@ def test_controlled_terms_have_identifiers(
pheno = load_test_json(default_pheno_output_path)

for sub in pheno["hasSamples"]:
if attribute in sub.keys():
value = sub.get(attribute)
if not isinstance(value, list):
value = [value]
assert all(
["identifier" in entry for entry in value]
), f"{attribute}: did not have an identifier for subject {sub} and value {value}"
for ses in sub["hasSession"]:
if attribute in ses.keys():
value = ses.get(attribute)
if not isinstance(value, list):
value = [value]
assert all(
["identifier" in entry for entry in value]
), f"{attribute}: did not have an identifier for subject {sub} and value {value}"


def test_controlled_term_classes_have_uri_type(
Expand Down Expand Up @@ -479,7 +483,9 @@ def test_assessment_data_are_parsed_correctly(

pheno = load_test_json(default_pheno_output_path)

assert assessment == pheno["hasSamples"][subject_idx].get("hasAssessment")
assert assessment == pheno["hasSamples"][subject_idx]["hasSession"][0].get(
"hasAssessment"
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -511,7 +517,9 @@ def test_cli_age_is_processed(

pheno = load_test_json(default_pheno_output_path)

assert expected_age == pheno["hasSamples"][subject]["hasAge"]
assert (
expected_age == pheno["hasSamples"][subject]["hasSession"][0]["hasAge"]
)


def test_output_includes_context(
Expand Down

0 comments on commit e06a440

Please sign in to comment.