Skip to content

Commit

Permalink
Add full support for SOSO award structure
Browse files Browse the repository at this point in the history
Ref #64
  • Loading branch information
amoeba committed Aug 7, 2022
1 parent 1a06a10 commit 7aa9d94
Show file tree
Hide file tree
Showing 5 changed files with 278 additions and 11 deletions.
148 changes: 137 additions & 11 deletions d1lod/d1lod/processors/eml/eml220_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,7 @@ def process(self):

# dataset/project/award -> schema:award
for award in self.scimeta.findall(".//dataset/project/award"):
funder_name = award.find("./funderName").text
award_number = award.find("./awardNumber").text
title = award.find("./title").text

self.model.append(
RDF.Statement(
dataset_subject,
RDF.Node(RDF.Uri("https://schema.org/award")),
RDF.Node(f"{funder_name} #{award_number} ({title})"),
)
)
self.process_award(dataset_subject, award)

"""
annotations
Expand Down Expand Up @@ -110,5 +100,141 @@ def process_child_annotations_at(self, xpath_text):
)
)

def process_award(self, dataset_subject, award):
    """Process an award

    Adds a schema:award statement linking ``dataset_subject`` to a
    schema:MonetaryGrant node built from ``award``, and a schema:funder
    statement linking that grant to a schema:Organization node.

    Node selection:
      - award node: the awardUrl as a URI when present, otherwise the blank
        node labelled "award"
      - funder node: the first funderIdentifier as a URI when present,
        otherwise the blank node labelled "funder"

    Optional elements (awardUrl, funderIdentifier) that are present but
    empty are treated as absent, and their text is stripped of surrounding
    whitespace before being used as a URI or identifier value.

    NOTE(review): the blank node labels are fixed strings, so several
    awards without an awardUrl in one dataset would presumably end up on
    the same blank node -- confirm whether that is intended.

    Args:
        dataset_subject: node used as the subject of the schema:award
            statement.
        award: the ``<award>`` element; accessed via ``find``, ``findall``
            and ``.text``.

    Raises:
        AttributeError: when a required child (funderName, awardNumber,
            title) is missing, because ``find`` then yields ``None``.

    XML:
    <award>
    <funderName>National Science Foundation</funderName>
    <funderIdentifier>https://doi.org/10.13039/100000001</funderIdentifier>
    <awardNumber>1546024</awardNumber>
    <title>Scientia Arctica: A Knowledge Archive for Discovery and Reproducible Science in the Arctic</title>
    <awardUrl>https://www.nsf.gov/awardsearch/showAward?AWD_ID=1546024</awardUrl>
    </award>
    JSON-LD:
    {
    "@id": "https://www.nsf.gov/awardsearch/showAward?AWD_ID=1604105",
    "@type": "MonetaryGrant",
    "identifier": "1604105",
    "name": "Collaborative Research: Nutritional Landscapes of Arctic Caribou: Observations, Experiments, and Models Provide Process-Level Understanding of Forage Traits and Trajectories",
    "url": "https://www.nsf.gov/awardsearch/showAward?AWD_ID=1604105",
    "funder": {
    "@id": "http://dx.doi.org/10.13039/100000001",
    "@type": "Organization",
    "name": "National Science Foundation",
    "identifier": [
    "http://dx.doi.org/10.13039/100000001",
    "https://ror.org/021nxhr62"
    ]
    }
    }
    """

    rdf_type = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    def clean_text(element):
        # Stripped text of an optional element; None when the element is
        # missing or carries no usable text.
        if element is None or element.text is None:
            return None
        return element.text.strip() or None

    def add(subject, predicate_uri, obj):
        # Every triple in this method has a URI predicate, so build the
        # predicate node in one place.
        self.model.append(
            RDF.Statement(subject, RDF.Node(RDF.Uri(predicate_uri)), obj)
        )

    funder_name = award.find("./funderName").text  # 1:1
    award_number = award.find("./awardNumber").text  # 1:1
    title = award.find("./title").text  # 1:1
    award_url = clean_text(award.find("./awardUrl"))  # 0:1
    funder_identifiers = [  # 0-∞
        value
        for value in map(clean_text, award.findall("./funderIdentifier"))
        if value is not None
    ]

    # Determine whether to use a blank node or not based upon whether the
    # award has an awardUrl or not
    if award_url is not None:
        award_node = RDF.Node(RDF.Uri(award_url))
    else:
        award_node = RDF.Node(blank="award")

    # dataset -> award
    add(dataset_subject, "https://schema.org/award", award_node)

    # @type
    add(award_node, rdf_type, RDF.Node(RDF.Uri("https://schema.org/MonetaryGrant")))

    # title -> name
    add(award_node, "https://schema.org/name", title)

    # awardNumber -> identifier
    add(award_node, "https://schema.org/identifier", award_number)

    # awardUrl (0-1) -> url
    if award_url is not None:
        add(award_node, "https://schema.org/url", award_url)

    # funder node
    # Uses the first funderIdentifier as the URI and puts all values in
    # as 'schema:identifier' triples
    if funder_identifiers:
        funder_node = RDF.Node(RDF.Uri(funder_identifiers[0]))
    else:
        funder_node = RDF.Node(blank="funder")

    # award node -> funder node
    add(award_node, "https://schema.org/funder", funder_node)

    # @type
    add(funder_node, rdf_type, RDF.Node(RDF.Uri("https://schema.org/Organization")))

    # funderName -> name
    add(funder_node, "https://schema.org/name", funder_name)

    # funderIdentifier (0-∞) -> identifier
    for identifier in funder_identifiers:
        add(funder_node, "https://schema.org/identifier", identifier)

def get_dataset_subject(self):
    """Return the dataset subject exactly as the parent class computes it."""
    # Pure delegation: no EML 2.2.0-specific handling is added here.
    return super().get_dataset_subject()
36 changes: 36 additions & 0 deletions d1lod/tests/data/metadata/eml/eml-award-blanknodes.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?xml version="1.0"?>
<eml:eml packageId="doi:10.xxxx/eml.1.1" system="https://doi.org"
xmlns:eml="https://eml.ecoinformatics.org/eml-2.2.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:stmml="http://www.xml-cml.org/schema/stmml-1.1" xsi:schemaLocation="https://eml.ecoinformatics.org/eml-2.2.0 https://eml.ecoinformatics.org/eml-2.2.0/eml.xsd">
<dataset>
<title>title</title>
<creator>
<individualName>
<givenName>A</givenName>
<surName>Person</surName>
</individualName>
</creator>
<contact>
<individualName>
<givenName>A</givenName>
<surName>Person</surName>
</individualName>
</contact>
<project>
<title>A Project</title>
<personnel>
<individualName>
<givenName>A</givenName>
<surName>Person</surName>
</individualName>
<role>principalInvestigator</role>
</personnel>
<award>
<funderName>Some Random Funder</funderName>
<awardNumber>12345</awardNumber>
<title>An award title</title>
</award>
</project>
</dataset>
</eml:eml>
38 changes: 38 additions & 0 deletions d1lod/tests/data/metadata/eml/eml-award-noblanknodes.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
<?xml version="1.0"?>
<eml:eml packageId="doi:10.xxxx/eml.1.1" system="https://doi.org"
xmlns:eml="https://eml.ecoinformatics.org/eml-2.2.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:stmml="http://www.xml-cml.org/schema/stmml-1.1" xsi:schemaLocation="https://eml.ecoinformatics.org/eml-2.2.0 https://eml.ecoinformatics.org/eml-2.2.0/eml.xsd">
<dataset>
<title>title</title>
<creator>
<individualName>
<givenName>A</givenName>
<surName>Person</surName>
</individualName>
</creator>
<contact>
<individualName>
<givenName>A</givenName>
<surName>Person</surName>
</individualName>
</contact>
<project>
<title>A Project</title>
<personnel>
<individualName>
<givenName>A</givenName>
<surName>Person</surName>
</individualName>
<role>principalInvestigator</role>
</personnel>
<award>
<funderName>National Science Foundation</funderName>
<funderIdentifier>https://doi.org/10.13039/00000001</funderIdentifier>
<awardNumber>12345</awardNumber>
<title>An award title</title>
<awardUrl>https://www.nsf.gov/awardsearch/showAward?AWD_ID=12345</awardUrl>
</award>
</project>
</dataset>
</eml:eml>
13 changes: 13 additions & 0 deletions d1lod/tests/data/sysmeta/eml-award-sysmeta.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<v2:systemMetadata xmlns:v2="http://ns.dataone.org/service/types/v2.0">
<identifier>eml-award</identifier>
<formatId>https://eml.ecoinformatics.org/eml-2.2.0</formatId>
<size>0</size>
<checksum algorithm="SHA1">SHA1</checksum>
<submitter>test_submitter</submitter>
<rightsHolder>test_rights_holder</rightsHolder>
<dateUploaded>2021-06-01T00:00:00</dateUploaded>
<dateSysMetadataModified>2021-06-01T00:00:00</dateSysMetadataModified>
<originMemberNode>test_originating_mn</originMemberNode>
<authoritativeMemberNode>test_authoritative_mn</authoritativeMemberNode>
<fileName>string</fileName>
</v2:systemMetadata>
54 changes: 54 additions & 0 deletions d1lod/tests/test_eml220_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,60 @@ def test_processor_extracts_additional_metadata_annotations(client, model):
assert statement in processor.model


# TODO: Award tests
# Test blank node and not-blank node for
# - award
# - funder


def test_award_uses_blank_nodes_when_appropriate(client, model):
    """An award with no awardUrl and no funderIdentifier uses blank nodes.

    Checks both links: dataset -> award and award -> funder.
    """
    metadata = load_metadata("eml/eml-award-blanknodes.xml")
    sysmeta = load_sysmeta("eml-award-sysmeta.xml")

    processor = EML220Processor(client, model, sysmeta, metadata, [])
    processor.process()

    # Test award is a blank node
    award_statement = RDF.Statement(
        RDF.Node(RDF.Uri("https://dataone.org/datasets/eml-award")),
        RDF.Node(RDF.Uri("https://schema.org/award")),
        RDF.Node(blank="award"),
    )

    # Previously this statement was built and then overwritten without ever
    # being checked.
    assert award_statement in processor.model

    # Test funder is a blank node
    funder_statement = RDF.Statement(
        RDF.Node(blank="award"),
        RDF.Node(RDF.Uri("https://schema.org/funder")),
        RDF.Node(blank="funder"),
    )

    assert funder_statement in processor.model


def test_award_uses_named_nodes_when_appropriate(client, model):
    """An award with an awardUrl and a funderIdentifier uses URI nodes.

    Checks both links: dataset -> award and award -> funder.
    """
    metadata = load_metadata("eml/eml-award-noblanknodes.xml")
    sysmeta = load_sysmeta("eml-award-sysmeta.xml")

    processor = EML220Processor(client, model, sysmeta, metadata, [])
    processor.process()

    award_uri = "https://www.nsf.gov/awardsearch/showAward?AWD_ID=12345"

    # Test award is a URI
    award_statement = RDF.Statement(
        RDF.Node(RDF.Uri("https://dataone.org/datasets/eml-award")),
        RDF.Node(RDF.Uri("https://schema.org/award")),
        RDF.Node(RDF.Uri(award_uri)),
    )

    # Previously this statement was built and then overwritten without ever
    # being checked.
    assert award_statement in processor.model

    # Test funder is a URI
    funder_statement = RDF.Statement(
        RDF.Node(RDF.Uri(award_uri)),
        RDF.Node(RDF.Uri("https://schema.org/funder")),
        RDF.Node(RDF.Uri("https://doi.org/10.13039/00000001")),
    )

    assert funder_statement in processor.model


@pytest.mark.integration
def test_production_eml(client, model):
"""
Expand Down

0 comments on commit 7aa9d94

Please sign in to comment.