Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

serializers: added dcat serializer #1082

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions invenio.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -1118,6 +1118,15 @@ APP_RDM_RECORD_EXPORTERS = {
"params": {},
"content-type": "application/vnd.datacite.datacite+xml",
"filename": "{id}.xml",
},
"dcat-ap": {
"name": _("DCAT"),
"serializer": (
"zenodo_rdm.serializers:ZenodoDCATSerializer"
),
"params": {},
"content-type": "application/dcat+xml",
"filename": "{id}.xml",
},
"cff": {
"name": _("Citation File Format"),
Expand Down
2 changes: 2 additions & 0 deletions site/zenodo_rdm/serializers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@
from .cff import ZenodoCFFSerializer
from .codemeta import ZenodoCodemetaSerializer
from .datacite import ZenodoDataciteJSONSerializer, ZenodoDataciteXMLSerializer
from .dcat import ZenodoDCATSerializer

# Serializer classes exported by ``from zenodo_rdm.serializers import *``.
__all__ = (
"ZenodoBibtexSerializer",
"ZenodoCodemetaSerializer",
"ZenodoDataciteJSONSerializer",
"ZenodoDataciteXMLSerializer",
"ZenodoCFFSerializer",
"ZenodoDCATSerializer",
)
119 changes: 119 additions & 0 deletions site/zenodo_rdm/serializers/dcat.py
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor: looking at both of the new methods now, I think we can actually add them directly in rdm-records and thus avoid entirely overriding the serializer here. I thought there would be too many specific "EDMO"-isms or similar for the other subject types, but it looks like all the namespaces and elements pretty much already exist in DCAT...

You did too good a job of it, so now you get a bit more work 🙃

Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2024 CERN.
#
# ZenodoRDM is free software; you can redistribute it and/or modify
# it under the terms of the MIT License; see LICENSE file for more details.
"""Zenodo dcat serializer."""


import idutils
from datacite import schema43
from invenio_rdm_records.resources.serializers.dcat import DCATSerializer
from lxml import etree


class ZenodoDCATSerializer(DCATSerializer):
    """Zenodo DCAT Serializer.

    Post-processes the RDF tree produced by the XSLT transformation to add
    ``rdf:about`` links (and ``skos:definition`` elements) to subjects, and
    ``rdf:about`` links to creators that have an identifier but no link.
    """

    # Clark-notation qualified names used when writing attributes/elements.
    _RDF_ABOUT = "{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about"
    _SKOS_DEFINITION = "{http://www.w3.org/2004/02/skos/core#}definition"

    def __init__(self, **options):
        """Constructor.

        :param options: keyword options forwarded unchanged to the parent
            serializer.
        """
        super().__init__(**options)

    @staticmethod
    def _xpath_namespaces(rdf_tree):
        """Return the tree's prefix map without the default namespace entry.

        lxml's ``xpath()`` rejects an empty/``None`` prefix, so a default
        namespace declared on the root must be left out of the mapping.
        """
        return {prefix: uri for prefix, uri in rdf_tree.nsmap.items() if prefix}

    def add_subjects_uri(self, rdf_tree, subjects):
        """Add valueURI of subjects to the matching dct:subject elements.

        For every subject that has a ``valueURI``, a ``subject`` label and a
        ``subjectScheme``, the first ``dct:subject`` element with the same
        label and scheme title gets the URI as ``rdf:about``. If the subject
        carries a ``definition`` in ``subjectProps``, it is appended to the
        nested ``skos:Concept`` as ``skos:definition``.

        :param rdf_tree: root element of the DCAT RDF tree (modified in place).
        :param subjects: iterable of subject dicts from the DataCite record.
        :returns: the same ``rdf_tree``.
        """
        namespaces = self._xpath_namespaces(rdf_tree)
        for subject in subjects:
            value_uri = subject.get("valueURI")
            subject_label = subject.get("subject")
            subject_scheme = subject.get("subjectScheme")
            if not (value_uri and subject_label and subject_scheme):
                continue

            # The label and scheme are passed as XPath variables rather than
            # interpolated into the expression, so quotes in the values
            # (e.g. "Alzheimer's disease") cannot break or alter the query.
            matches = rdf_tree.xpath(
                """
                //dct:subject[
                    skos:Concept[
                        skos:prefLabel[text()=$label]
                        and skos:inScheme/skos:ConceptScheme/dct:title[text()=$scheme]
                    ]
                ]
                """,
                namespaces=namespaces,
                label=subject_label,
                scheme=subject_scheme,
            )
            if not matches:
                # The subject has no counterpart in the transformed tree.
                continue

            subject_element = matches[0]
            # Add the valueURI to the dct:subject element as rdf:about
            subject_element.set(self._RDF_ABOUT, value_uri)

            # ``subjectProps`` may be missing or explicitly null.
            definition = (subject.get("subjectProps") or {}).get("definition")
            if not definition:
                continue

            concept_elem = subject_element.find(
                ".//skos:Concept", namespaces=namespaces
            )
            if concept_elem is not None:
                skos_definition = etree.SubElement(
                    concept_elem, self._SKOS_DEFINITION
                )
                skos_definition.text = definition

        return rdf_tree

    def add_missing_creator_link(self, rdf_tree):
        """Add `rdf:about` to <rdf:Description> within <dct:creator> if missing.

        The URL is derived from the description's ``dct:identifier`` text:
        the first detected identifier scheme that yields a non-empty URL is
        used. Descriptions without a usable identifier are left untouched.

        :param rdf_tree: root element of the DCAT RDF tree (modified in place).
        :returns: the same ``rdf_tree``.
        """
        namespaces = self._xpath_namespaces(rdf_tree)
        descriptions = rdf_tree.xpath(
            "//dct:creator/rdf:Description[not(@rdf:about)]", namespaces=namespaces
        )

        for description in descriptions:
            identifier_elem = description.find("dct:identifier", namespaces)
            if identifier_elem is None:
                continue

            # ``text`` is None for an empty element.
            identifier = (identifier_elem.text or "").strip()
            if not identifier:
                continue

            for scheme in idutils.detect_identifier_schemes(identifier):
                rdf_about_url = idutils.to_url(identifier, scheme=scheme)
                if rdf_about_url:
                    description.set(self._RDF_ABOUT, rdf_about_url)
                    break

        return rdf_tree

    def transform_with_xslt(self, dc_record, **kwargs):
        """Transform record with XSLT and add rdf:about.

        :param dc_record: DataCite record dict; its ``subjects`` and
            ``_files`` keys are read here in addition to the XSLT input.
        :param kwargs: accepted for interface compatibility; not used here.
        :returns: root element of the resulting DCAT tree.
        """
        # Dump the DataCite record to an element tree, put its root tag in
        # the schema's default namespace and run the XSLT transformation.
        dc_etree = schema43.dump_etree(dc_record)
        dc_namespace = schema43.ns[None]
        dc_etree.tag = "{{{0}}}resource".format(dc_namespace)
        dcat_etree = self.xslt_transform_func(dc_etree).getroot()

        # Add valueURI to subjects
        subjects = dc_record.get("subjects", [])
        if subjects:
            dcat_etree = self.add_subjects_uri(dcat_etree, subjects)

        # Add the identifier links for creators if missing
        dcat_etree = self.add_missing_creator_link(dcat_etree)

        # Inject files in results (since the XSLT can't do that by default)
        files_data = dc_record.get("_files", [])
        if files_data:
            self._add_files(
                root=dcat_etree,
                files=files_data,
            )

        return dcat_etree