Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add normalization filters to aggregate MDS adapter filters #122

Merged
merged 8 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions docs/config_agg_mds.md
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,44 @@ The filters are applied to the text value of the remote field. Furthermore, the
* **strip_email**: remove email addresses from a text field
* **add_icpsr_source_url**: creates an url to the study data on ICPSR
* **add_clinical_trials_source_url**: creates an url to the study on clinicaltrials.gov
* **normalize_value**: normalizes a value by mapping it from one value to another. The mapping object is passed as a filter parameter.
For example:
```json
"subject_cancerStage": {
"path": "cancerStage",
"filters": [
"normalize_value"
],
"filterParams": {
"normalize_value": {
"2": "Stage 2",
"3": "Stage 3"
}
},
"default": ""
},
```
This will map values of '2' to 'Stage 2' and '3' to 'Stage 3'. It can be used to normalize data values from different data sources.

* **normalize_tags**: normalizes tag values by defining a mapping of category names to new name values.
Example:
```json
"tags": {
"path": "tags",
"filters": [
"normalize_tags"
],
"filterParams": {
"normalize_tags": {
"CancerStage": {
"Stage II": "Stage 2",
"Stage III": "Stage 3"
}
}
}
},
```
This will normalize tags in the category 'CancerStage', renaming 'Stage II' to 'Stage 2' and 'Stage III' to 'Stage 3'.

You can add your own filters, and register them by creating a python function with the signature:
```python
Expand Down
18 changes: 9 additions & 9 deletions docs/openapi.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -275,15 +275,15 @@ paths:
- Aggregate
/aggregate/metadata/{name}:
get:
description: "et all metadata records from a commons by name\n\nReturns an array\
\ containing all the metadata entries for a single commons.\nThere are no\
\ limit/offset parameters.\n\nExample:\n\n [\n {\n \"\
gen3_discovery\": {\n \"name\": \"bear\",\n \
\ \"type\": \"study\",\n ...\n },\n \"\
data_dictionaries\": {\n ...\n }\n },\n \
\ {\n \"gen3_discovery\": {\n \"name\": \"\
cat\",\n \"type\": \"study\",\n ...\n \
\ }\n },\n ...\n ]"
description: "get all metadata records from a commons by name\n\nReturns an\
\ array containing all the metadata entries for a single commons.\nThere are\
\ no limit/offset parameters.\n\nExample:\n\n [\n {\n \
\ \"gen3_discovery\": {\n \"name\": \"bear\",\n \
\ \"type\": \"study\",\n ...\n },\n \
\ \"data_dictionaries\": {\n ...\n }\n \
\ },\n {\n \"gen3_discovery\": {\n \"name\"\
: \"cat\",\n \"type\": \"study\",\n ...\n \
\ }\n },\n ...\n ]"
operationId: get_aggregate_metadata_for_commons_aggregate_metadata__name__get
parameters:
- description: Return the results without grouping items by commons.
Expand Down
73 changes: 70 additions & 3 deletions src/mds/agg_mds/adapters.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import collections.abc
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Tuple, Union
from typing import Any, Dict, List, Tuple, Union, Optional
from jsonpath_ng import parse, JSONPathError
import httpx
import xmltodict
Expand Down Expand Up @@ -66,6 +66,63 @@ def aggregate_pdc_file_count(record: list):
return file_count


def normalize_value(value: str, mapping: Optional[Dict[str, str]] = None):
    """
    Translate ``value`` through ``mapping`` when a translation exists.

    The lookup is only attempted when ``value`` is a string and ``mapping``
    is non-empty. In every other case (non-string value, missing or empty
    mapping, or a value that is not a key of the mapping) the input is
    handed back untouched.

    Args:
        value: The raw field value to normalize.
        mapping: Optional dictionary of raw value -> normalized value.

    Returns:
        The mapped value when ``value`` is a string key of ``mapping``,
        otherwise ``value`` itself.
    """
    # Nothing to look up: either no usable mapping or a non-string value.
    if not isinstance(value, str) or not mapping:
        return value

    return mapping.get(value, value)


def normalize_tags(
    tags: List[Dict[str, str]],
    mapping: Optional[Dict[str, Dict[str, str]]] = None,
):
    """
    Rename the 'name' of each tag according to a per-category mapping.

    Args:
        tags: A list of tag dictionaries, each expected to contain 'name' and
            'category' keys. Any other input (e.g. None or a plain string
            produced by a missing/default field) is returned unchanged.
        mapping: A dictionary keyed by category whose values are dictionaries
            mapping old tag names to new tag names, e.g.
            ``{"CancerStage": {"Stage II": "Stage 2"}}``.

    Returns:
        The original ``tags`` object when there is no usable mapping or
        ``tags`` is not a list/tuple; otherwise a new list in which every tag
        having both 'name' and 'category' is a shallow copy with its 'name'
        replaced when the mapping defines a replacement. Entries that are not
        dictionaries, or that lack either key, are passed through as-is.
    """
    if not mapping or not isinstance(mapping, dict):
        return tags

    # Iterating a str would explode it into characters and None would raise,
    # so anything that is not a real sequence of tags is left alone.
    if not isinstance(tags, (list, tuple)):
        return tags

    updated_tags = []
    for tag in tags:
        # A str tag would pass the `in` checks via substring matching and then
        # fail on indexing, so require an actual dictionary.
        if not (isinstance(tag, dict) and "name" in tag and "category" in tag):
            updated_tags.append(tag)
            continue

        name = tag["name"]
        renames = mapping.get(tag["category"])
        # Ignore malformed per-category entries instead of crashing on .get().
        if isinstance(renames, dict):
            name = renames.get(name, name)
        updated_tags.append({**tag, "name": name})

    return updated_tags


class FieldFilters:
filters = {
"strip_html": strip_html,
Expand All @@ -75,13 +132,17 @@ class FieldFilters:
"uppercase": uppercase,
"prepare_cidc_description": prepare_cidc_description,
"aggregate_pdc_file_count": aggregate_pdc_file_count,
"normalize_value": normalize_value,
"normalize_tags": normalize_tags,
}

@classmethod
def execute(cls, name, value):
def execute(cls, name, value, params=None):
if name not in FieldFilters.filters:
logger.warning(f"filter {name} not found: returning original value.")
return value
if params is not None:
return FieldFilters.filters[name](value, params)
return FieldFilters.filters[name](value)


Expand Down Expand Up @@ -220,8 +281,14 @@ def mapFields(item: dict, mappings: dict, global_filters=None, schema=None) -> d
)

filters = value.get("filters", [])
filterParams = value.get("filterParams", {})
for flt in filters:
field_value = FieldFilters.execute(flt, field_value)
if flt in filterParams:
field_value = FieldFilters.execute(
flt, field_value, filterParams[flt]
)
else:
field_value = FieldFilters.execute(flt, field_value)

elif isinstance(value, str) and "path:" in value:
# process as json path
Expand Down
2 changes: 1 addition & 1 deletion src/mds/agg_mds/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ async def get_aggregate_metadata_for_commons(
description="Return the results without grouping items by commons."
),
):
"""et all metadata records from a commons by name
"""get all metadata records from a commons by name

Returns an array containing all the metadata entries for a single commons.
There are no limit/offset parameters.
Expand Down
162 changes: 162 additions & 0 deletions tests/test_agg_mds_adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
get_json_path_value,
strip_email,
strip_html,
normalize_value,
normalize_tags,
add_icpsr_source_url,
FieldFilters,
get_json_path_value,
Expand Down Expand Up @@ -140,6 +142,166 @@ def test_json_path_expression():
]


def test_normalize_value_with_no_mapping():
    """
    Omitting the mapping argument must leave the value untouched.
    """
    original = "test_value"
    assert normalize_value(original) == "test_value"


def test_normalize_value_with_non_matching_mapping():
    """
    A value that is not a key of the mapping is returned as-is.
    """
    lookup = {"other_value": "mapped_value"}
    assert normalize_value("test_value", lookup) == "test_value"


def test_normalize_value_with_matching_mapping():
    """
    A value that is a key of the mapping is replaced by its mapped value.
    """
    lookup = {"test_value": "mapped_value"}
    assert normalize_value("test_value", lookup) == "mapped_value"


def test_normalize_value_with_non_string_input():
    """
    Non-string input is passed through unchanged, even when its string form
    is a key of the mapping.
    """
    lookup = {"12345": "mapped_value"}
    assert normalize_value(12345, lookup) == 12345


def test_normalize_value_with_empty_mapping():
    """
    An empty mapping behaves like no mapping: the value is returned as-is.
    """
    original = "test_value"
    assert normalize_value(original, {}) == "test_value"


def test_normalize_value_with_none_as_value():
    """
    A None value yields None regardless of the mapping contents.
    """
    lookup = {"test_value": "mapped_value"}
    assert normalize_value(None, lookup) is None


def test_normalize_tags_with_no_mapping():
    """
    Without a mapping argument the tags come back unmodified.
    """
    tags = [{"name": "tag1", "category": "cat1"}]
    assert normalize_tags(tags) == tags


def test_normalize_tags_with_empty_mapping():
    """
    An empty mapping leaves the tags unmodified.
    """
    tags = [{"name": "tag1", "category": "cat1"}]
    assert normalize_tags(tags, mapping={}) == tags


def test_normalize_tags_with_matching_mapping():
    """
    A tag whose category and name both appear in the mapping gets the
    mapped name.
    """
    tags = [{"name": "tag1", "category": "cat1"}]
    renamed = normalize_tags(tags, mapping={"cat1": {"tag1": "tag1_updated"}})
    assert renamed == [{"name": "tag1_updated", "category": "cat1"}]


def test_normalize_tags_with_non_matching_category():
    """
    Tags whose category is absent from the mapping are left unchanged.
    """
    tags = [{"name": "tag1", "category": "cat1"}]
    renames = {"cat2": {"tag1": "tag1_updated"}}
    assert normalize_tags(tags, mapping=renames) == tags


def test_normalize_tags_with_non_matching_name():
    """
    Tags whose name has no entry under their category are left unchanged.
    """
    tags = [{"name": "tag1", "category": "cat1"}]
    renames = {"cat1": {"tag2": "tag2_updated"}}
    assert normalize_tags(tags, mapping=renames) == tags


def test_normalize_tags_with_missing_name_key():
    """
    A tag lacking the 'name' key is passed through unchanged.
    """
    tags = [{"category": "cat1"}]
    renames = {"cat1": {"tag1": "tag1_updated"}}
    assert normalize_tags(tags, mapping=renames) == tags


def test_normalize_tags_with_missing_category_key():
    """
    A tag lacking the 'category' key is passed through unchanged.
    """
    tags = [{"name": "tag1"}]
    renames = {"cat1": {"tag1": "tag1_updated"}}
    assert normalize_tags(tags, mapping=renames) == tags


def test_normalize_tags_with_partial_mapping_applied():
    """
    Only the tags covered by the mapping are renamed; the rest keep their
    original names.
    """
    renames = {"cat1": {"tag1": "tag1_updated"}}
    tags = [
        {"name": "tag1", "category": "cat1"},
        {"name": "tag2", "category": "cat2"},
    ]
    assert normalize_tags(tags, mapping=renames) == [
        {"name": "tag1_updated", "category": "cat1"},
        {"name": "tag2", "category": "cat2"},
    ]


def test_normalize_tags_empty_input_tags():
    """
    An empty tag list produces an empty list even when a mapping is given.
    """
    renames = {"cat1": {"tag1": "tag1_updated"}}
    assert normalize_tags([], mapping=renames) == []


def test_add_clinical_trials_source_url():
integer = 1
assert add_clinical_trials_source_url(integer) == 1
Expand Down
Loading