diff --git a/bagel/cli.py b/bagel/cli.py index 886501e..2a8df8b 100644 --- a/bagel/cli.py +++ b/bagel/cli.py @@ -92,54 +92,69 @@ def pheno( # TODO: needs refactoring once we handle multiple participant IDs participants = column_mapping.get("participant")[0] + # TODO: handle if no session_ID column exists + session_column = column_mapping.get("session")[0] for participant in pheno_df[participants].unique(): # TODO: needs refactoring once we handle phenotypic information at the session level # for the moment we are not creating any session instances in the phenotypic graph # we treat the phenotypic information in the first row of the _sub_pheno dataframe # as reflecting the subject level phenotypic information - _sub_pheno = pheno_df.query( - f"{participants} == '{str(participant)}'" - ).iloc[0] - - subject = models.Subject(hasLabel=str(participant)) - if "sex" in column_mapping.keys(): - _sex_val = putil.get_transformed_values( - column_mapping["sex"], _sub_pheno, data_dictionary - ) - if _sex_val: - subject.hasSex = models.Sex(identifier=_sex_val) + _sub_pheno = pheno_df.query(f"{participants} == '{str(participant)}'") - if "diagnosis" in column_mapping.keys(): - _dx_val = putil.get_transformed_values( - column_mapping["diagnosis"], _sub_pheno, data_dictionary - ) - if _dx_val is None: - pass - elif _dx_val == mappings.NEUROBAGEL["healthy_control"]: - subject.isSubjectGroup = models.SubjectGroup( - identifier=mappings.NEUROBAGEL["healthy_control"], - ) - else: - subject.hasDiagnosis = [models.Diagnosis(identifier=_dx_val)] + # TODO ensure we don't have duplicates in the session ID + session_names = _sub_pheno[session_column].unique() - if "age" in column_mapping.keys(): - subject.hasAge = putil.get_transformed_values( - column_mapping["age"], _sub_pheno, data_dictionary - ) + sessions = [] + for session_name in session_names: + session = models.PhenotypicSession(hasLabel=str(session_name)) + _ses_pheno = _sub_pheno.query( + f"{session_column} == 
'{str(session_name)}'" + ).iloc[0] - if tool_mapping: - _assessments = [ - models.Assessment(identifier=tool) - for tool, columns in tool_mapping.items() - if putil.are_any_available( - columns, _sub_pheno, data_dictionary + if "sex" in column_mapping.keys(): + _sex_val = putil.get_transformed_values( + column_mapping["sex"], _ses_pheno, data_dictionary + ) + if _sex_val: + session.hasSex = models.Sex(identifier=_sex_val) + + if "diagnosis" in column_mapping.keys(): + _dx_val = putil.get_transformed_values( + column_mapping["diagnosis"], _ses_pheno, data_dictionary + ) + if _dx_val is None: + pass + elif _dx_val == mappings.NEUROBAGEL["healthy_control"]: + session.isSubjectGroup = models.SubjectGroup( + identifier=mappings.NEUROBAGEL["healthy_control"], + ) + else: + session.hasDiagnosis = [ + models.Diagnosis(identifier=_dx_val) + ] + + if "age" in column_mapping.keys(): + session.hasAge = putil.get_transformed_values( + column_mapping["age"], _ses_pheno, data_dictionary ) - ] - if _assessments: - # Only set assignments for the subject if at least one has a non-missing item - subject.hasAssessment = _assessments + if tool_mapping: + _assessments = [ + models.Assessment(identifier=tool) + for tool, columns in tool_mapping.items() + if putil.are_any_available( + columns, _ses_pheno, data_dictionary + ) + ] + if _assessments: + # Only set assessments for the session if at least one has a non-missing item + session.hasAssessment = _assessments + sessions.append(session) + + subject = models.Subject( + hasLabel=str(participant), hasSession=sessions + ) subject_list.append(subject) dataset = models.Dataset( @@ -277,14 +292,14 @@ def bids( # TODO: needs refactoring once we also handle phenotypic information at the session level session_list.append( # Add back "ses" prefix because pybids stripped it - models.Session( + models.ImagingSession( hasLabel="ses-" + session_label, hasFilePath=session_path, hasAcquisition=image_list, ) ) - pheno_subject.hasSession = session_list 
+ pheno_subject.hasSession += session_list merged_dataset = {**context, **pheno_dataset.dict(exclude_none=True)} diff --git a/bagel/models.py b/bagel/models.py index c438e3a..ed14a3e 100644 --- a/bagel/models.py +++ b/bagel/models.py @@ -51,19 +51,26 @@ class Acquisition(Bagel): class Session(Bagel): hasLabel: str - hasFilePath: Optional[str] = None - hasAcquisition: List[Acquisition] - schemaKey: Literal["Session"] = "Session" -class Subject(Bagel): - hasLabel: str - hasSession: Optional[List[Session]] = None +class PhenotypicSession(Session): hasAge: Optional[float] = None hasSex: Optional[Sex] = None isSubjectGroup: Optional[SubjectGroup] = None hasDiagnosis: Optional[List[Diagnosis]] = None hasAssessment: Optional[List[Assessment]] = None + schemaKey: Literal["PhenotypicSession"] = "PhenotypicSession" + + +class ImagingSession(Session): + hasFilePath: Optional[str] = None + hasAcquisition: List[Acquisition] + schemaKey: Literal["ImagingSession"] = "ImagingSession" + + +class Subject(Bagel): + hasLabel: str + hasSession: List[Union[PhenotypicSession, ImagingSession]] schemaKey: Literal["Subject"] = "Subject" diff --git a/bagel/tests/data/README.md b/bagel/tests/data/README.md index 2401876..debfdff 100644 --- a/bagel/tests/data/README.md +++ b/bagel/tests/data/README.md @@ -21,6 +21,7 @@ Example inputs to the CLI | 14 | Valid, same as example 2 | Valid, based on example 2, but with an extra column annotation without Neurobagel | Pass | | 15 | Valid, same as example 2 | Invalid, based on example 2, but participant ID column lacks Neurobagel annotations | Fail | | 16 | Invalid, same as example2.csv, but with a sneaky .tsv file ending | Valid, same as example2 | fail | +| 17 | Same as example 2 TSV, but without session_id column | Same as example 2 JSON, without session_id column | pass | `* this is expected to fail until we enable multiple participant_ID handling`. 
diff --git a/bagel/tests/data/example17.json b/bagel/tests/data/example17.json new file mode 100644 index 0000000..69334dc --- /dev/null +++ b/bagel/tests/data/example17.json @@ -0,0 +1,71 @@ +{ + "participant_id": { + "Description": "A participant ID", + "Annotations": { + "IsAbout": { + "TermURL": "nb:ParticipantID", + "Label": "Unique participant identifier" + }, + "Identifies": "participant" + } + }, + "group": { + "Description": "Group variable", + "Levels": { + "PAT": "Patient", + "CTRL": "Control subject" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Diagnosis", + "Label": "Diagnosis" + }, + "Levels": { + "PAT": { + "TermURL": "snomed:49049000", + "Label": "Parkinson's disease" + }, + "CTRL": { + "TermURL": "ncit:C94342", + "Label": "Healthy Control" + } + } + } + }, + "sex": { + "Description": "Sex variable", + "Levels": { + "M": "Male", + "F": "Female" + }, + "Annotations": { + "IsAbout": { + "TermURL": "nb:Sex", + "Label": "Sex" + }, + "Levels": { + "M": { + "TermURL": "snomed:248153007", + "Label": "Male" + }, + "F": { + "TermURL": "snomed:248152002", + "Label": "Female" + } + } + } + }, + "participant_age": { + "Description": "Age of the participant", + "Annotations": { + "IsAbout": { + "TermURL": "nb:Age", + "Label": "Chronological age" + }, + "Transformation": { + "TermURL": "nb:FromISO8601", + "Label": "A period of time defined according to the ISO8601 standard" + } + } + } +} \ No newline at end of file diff --git a/bagel/tests/data/example17.tsv b/bagel/tests/data/example17.tsv new file mode 100644 index 0000000..b8642e1 --- /dev/null +++ b/bagel/tests/data/example17.tsv @@ -0,0 +1,5 @@ +participant_id group sex participant_age +sub-01 PAT M P20Y6M +sub-01 PAT M P20Y8M +sub-02 CTRL F P25Y8M +sub-02 CTRL F P26Y4M diff --git a/bagel/tests/test_cli_bids.py b/bagel/tests/test_cli_bids.py index 1812c0c..ba7c7e0 100644 --- a/bagel/tests/test_cli_bids.py +++ b/bagel/tests/test_cli_bids.py @@ -59,9 +59,19 @@ def 
test_bids_sessions_have_correct_labels( pheno_bids = load_test_json(default_pheno_bids_output_path) for sub in pheno_bids["hasSamples"]: - assert ["ses-01", "ses-02"] == [ - ses["hasLabel"] for ses in sub["hasSession"] + assert 4 == len(sub["hasSession"]) + + imaging_session = [ + ses + for ses in sub["hasSession"] + if ses["schemaKey"] == "ImagingSession" ] + assert 2 == len(imaging_session) + + # We also need to make sure that we do not have duplicate imaging session labels + assert set(["ses-01", "ses-02"]) == set( + [ses["hasLabel"] for ses in imaging_session] + ) def test_bids_data_with_sessions_have_correct_paths( @@ -89,8 +99,15 @@ def test_bids_data_with_sessions_have_correct_paths( pheno_bids = load_test_json(default_pheno_bids_output_path) for sub in pheno_bids["hasSamples"]: - for ses in sub["hasSession"]: - assert sub["hasLabel"] in ses["hasFilePath"] - assert ses["hasLabel"] in ses["hasFilePath"] - assert Path(ses["hasFilePath"]).is_absolute() - assert Path(ses["hasFilePath"]).is_dir() + for imaging_session in [ + ses + for ses in sub["hasSession"] + if ses["schemaKey"] == "ImagingSession" + ]: + + assert sub["hasLabel"] in imaging_session["hasFilePath"] + assert ( + imaging_session["hasLabel"] in imaging_session["hasFilePath"] + ) + assert Path(imaging_session["hasFilePath"]).is_absolute() + assert Path(imaging_session["hasFilePath"]).is_dir() diff --git a/bagel/tests/test_cli_pheno.py b/bagel/tests/test_cli_pheno.py index 21f7c3a..ce9f30d 100644 --- a/bagel/tests/test_cli_pheno.py +++ b/bagel/tests/test_cli_pheno.py @@ -357,13 +357,16 @@ def test_diagnosis_and_control_status_handled( pheno = load_test_json(default_pheno_output_path) assert ( - pheno["hasSamples"][0]["hasDiagnosis"][0]["identifier"] + pheno["hasSamples"][0]["hasSession"][0]["hasDiagnosis"][0][ + "identifier" + ] == "snomed:49049000" ) - assert "hasDiagnosis" not in pheno["hasSamples"][1].keys() - assert "hasDiagnosis" not in pheno["hasSamples"][2].keys() + assert "hasDiagnosis" not 
in pheno["hasSamples"][1]["hasSession"][0].keys() + assert "hasDiagnosis" not in pheno["hasSamples"][2]["hasSession"][0].keys() assert ( - pheno["hasSamples"][2]["isSubjectGroup"]["identifier"] == "ncit:C94342" + pheno["hasSamples"][2]["hasSession"][0]["isSubjectGroup"]["identifier"] + == "ncit:C94342" ) @@ -395,13 +398,14 @@ def test_controlled_terms_have_identifiers( pheno = load_test_json(default_pheno_output_path) for sub in pheno["hasSamples"]: - if attribute in sub.keys(): - value = sub.get(attribute) - if not isinstance(value, list): - value = [value] - assert all( - ["identifier" in entry for entry in value] - ), f"{attribute}: did not have an identifier for subject {sub} and value {value}" + for ses in sub["hasSession"]: + if attribute in ses.keys(): + value = ses.get(attribute) + if not isinstance(value, list): + value = [value] + assert all( + ["identifier" in entry for entry in value] + ), f"{attribute}: did not have an identifier for subject {sub} and value {value}" def test_controlled_term_classes_have_uri_type( @@ -479,7 +483,9 @@ def test_assessment_data_are_parsed_correctly( pheno = load_test_json(default_pheno_output_path) - assert assessment == pheno["hasSamples"][subject_idx].get("hasAssessment") + assert assessment == pheno["hasSamples"][subject_idx]["hasSession"][0].get( + "hasAssessment" + ) @pytest.mark.parametrize( @@ -511,7 +517,9 @@ def test_cli_age_is_processed( pheno = load_test_json(default_pheno_output_path) - assert expected_age == pheno["hasSamples"][subject]["hasAge"] + assert ( + expected_age == pheno["hasSamples"][subject]["hasSession"][0]["hasAge"] + ) def test_output_includes_context( diff --git a/neurobagel_examples b/neurobagel_examples index 94282f1..519b076 160000 --- a/neurobagel_examples +++ b/neurobagel_examples @@ -1 +1 @@ -Subproject commit 94282f166742845b1f2e6181849a167a3fb2401f +Subproject commit 519b076bc991fd4549632986874d52f9ca821acc