Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python 3 compatibility #86

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,12 @@ on: [push, pull_request]
jobs:
lint:
runs-on: ubuntu-latest
container:
image: python:2.7.18-buster
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: '3.10.13'
- name: Install requirements
run: pip install flake8 pycodestyle
- name: Check syntax
Expand All @@ -16,7 +18,7 @@ jobs:
needs: lint
strategy:
matrix:
ckan-version: [2.8]
ckan-version: [2.8, 2.9, "2.9-py2", "2.10"]
fail-fast: false

name: CKAN ${{ matrix.ckan-version }}
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ the CKAN config file, comma separated:

ckanext.dcat_ch_rdf_harvester.test_env_urls = https://test.example.com,https://staging.example.com

The Swiss DCAT Harvester inherits all configuration options from the [DCAT RDF harvester](https://github.com/ckan/ckanext-dcat#rdf-dcat-harvester).
It has the following additional configuration options:

Exclude datasets from import: this will prevent the import of datasets with certain identifiers.
Expand Down
28 changes: 14 additions & 14 deletions ckanext/dcatapchharvest/dcat_helpers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import iribaker
import json
import os
from urlparse import urlparse
from urllib.parse import urlparse
from ckantoolkit import config
from rdflib import URIRef, Graph
from rdflib.namespace import Namespace, RDF, SKOS
Expand Down Expand Up @@ -85,7 +85,7 @@ def dataset_uri(dataset_dict, dataset_ref=None):
to the production site. In that case, the dataset uris will contain the
url of the test environment, so we have to replace it with the prod one.
"""
uri = (unicode(dataset_ref)
uri = (str(dataset_ref)
if isinstance(dataset_ref, URIRef)
else '')
if not uri:
Expand All @@ -110,7 +110,7 @@ def dataset_uri(dataset_dict, dataset_ref=None):

def get_permalink(identifier):
    """Build the permanent URL for a dataset from its identifier.

    Uses the configured ``ckan.site_url`` as the base, producing
    ``<site_url>/perma/<identifier>``.
    """
    site_url = config.get('ckan.site_url')
    # Python 3: all str literals are unicode, the u'' prefix is redundant.
    return '{0}/perma/{1}'.format(site_url, identifier)


def resource_uri(resource_dict, distribution=None):
Expand All @@ -127,7 +127,7 @@ def resource_uri(resource_dict, distribution=None):
resource haven't been saved. This is all right as it will be generated
when the dataset is output in RDF format.
"""
uri = (unicode(distribution)
uri = (str(distribution)
if isinstance(distribution, URIRef)
else '')
if not uri:
Expand All @@ -154,7 +154,7 @@ def resource_uri(resource_dict, distribution=None):
def get_frequency_values():
g = Graph()
frequency_mapping = {}
for prefix, namespace in frequency_namespaces.items():
for prefix, namespace in list(frequency_namespaces.items()):
g.bind(prefix, namespace)
file = os.path.join(__location__, 'frequency.ttl')
g.parse(file, format='turtle')
Expand All @@ -169,24 +169,24 @@ def get_frequency_values():

def get_license_uri_by_name(vocabulary_name):
    """Return the license URI whose display name matches *vocabulary_name*.

    Looks the name up in the license vocabulary loaded by
    ``get_license_values()``; returns ``None`` when no license matches.
    """
    license_vocabulary = get_license_values()
    # Iterate the items view directly: wrapping it in list() is an
    # unnecessary copy in Python 3.
    for key, value in license_vocabulary.items():
        if str(vocabulary_name) == str(value):
            return key
    return None


def get_license_name_by_uri(vocabulary_uri):
    """Return the display name of the license identified by *vocabulary_uri*.

    Looks the URI up in the license vocabulary loaded by
    ``get_license_values()``; returns ``None`` when the URI is unknown.
    """
    license_vocabulary = get_license_values()
    # Iterate the items view directly: wrapping it in list() is an
    # unnecessary copy in Python 3.
    for key, value in license_vocabulary.items():
        if str(vocabulary_uri) == str(key):
            return str(value)
    return None


def get_license_values():
g = Graph()
license_mapping = {}
for prefix, namespace in license_namespaces.items():
for prefix, namespace in list(license_namespaces.items()):
g.bind(prefix, namespace)
file = os.path.join(__location__, 'license.ttl')
g.parse(file, format='turtle')
Expand All @@ -204,7 +204,7 @@ def get_license_values():
def get_theme_mapping():
g = Graph()
theme_mapping = {}
for prefix, namespace in theme_namespaces.items():
for prefix, namespace in list(theme_namespaces.items()):
g.bind(prefix, namespace)
file = os.path.join(__location__, 'themes.ttl')
g.parse(file, format='turtle')
Expand Down Expand Up @@ -232,13 +232,13 @@ def get_pagination(catalog_graph):
]
for key, ref in items:
for obj in catalog_graph.objects(pagination_node, ref):
pagination[key] = unicode(obj)
pagination[key] = str(obj)
return pagination


def get_format_values():
g = Graph()
for prefix, namespace in format_namespaces.items():
for prefix, namespace in list(format_namespaces.items()):
g.bind(prefix, namespace)
file = os.path.join(__location__, 'formats.xml')
g.parse(file, format='xml')
Expand Down
2 changes: 1 addition & 1 deletion ckanext/dcatapchharvest/harvest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def map_resources_to_ids(pkg_dict, package_id):
{r['id']: _get_resource_id_string(r) for r in existing_resources}
for resource in pkg_dict.get('resources'):
resource_id_dict = _get_resource_id_string(resource)
id_to_reuse = [k for k, v in existing_resources_mapping.items()
id_to_reuse = [k for k, v in list(existing_resources_mapping.items())
if v == resource_id_dict]
if id_to_reuse:
id_to_reuse = id_to_reuse[0]
Expand Down
4 changes: 2 additions & 2 deletions ckanext/dcatapchharvest/harvesters.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def validate_config(self, source_config):
if not isinstance(excluded_dataset_identifiers, list):
raise ValueError('excluded_dataset_identifiers must be '
'a list of strings')
if not all(isinstance(item, basestring)
if not all(isinstance(item, str)
for item in excluded_dataset_identifiers):
raise ValueError('excluded_dataset_identifiers must be '
'a list of strings')
Expand All @@ -52,7 +52,7 @@ def validate_config(self, source_config):
if not isinstance(excluded_rights, list):
raise ValueError('excluded_rights must be '
'a list of strings')
if not all(isinstance(item, basestring)
if not all(isinstance(item, str)
for item in excluded_rights):
raise ValueError('excluded_rights must be '
'a list of strings')
Expand Down
43 changes: 25 additions & 18 deletions ckanext/dcatapchharvest/profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def _add_multilang_value(self, subject, predicate, dataset_key=None,
multilang_values = dataset_dict.get(dataset_key)
if multilang_values:
try:
for key, values in multilang_values.iteritems():
for key, values in multilang_values.items():
if values:
# the values can be either a multilang-dict or they are
# nested in another iterable (e.g. keywords)
Expand Down Expand Up @@ -140,11 +140,11 @@ def _object_value(self, subject, predicate, multilang=False):
lang_dict = {}
for o in self.g.objects(subject, predicate):
if multilang and o.language:
lang_dict[o.language] = unicode(o)
lang_dict[o.language] = str(o)
elif multilang:
lang_dict[default_lang] = unicode(o)
lang_dict[default_lang] = str(o)
else:
return unicode(o)
return str(o)
if multilang:
# when translation does not exist, create an empty one
for lang in dh.get_langs():
Expand All @@ -160,8 +160,8 @@ def _object_value_and_datatype(self, subject, predicate):
"""
for o in self.g.objects(subject, predicate):
if isinstance(o, Literal):
return unicode(o), o.datatype
return unicode(o), None
return str(o), o.datatype
return str(o), None
return None, None

def _get_publisher_url_from_identifier(self, identifier):
Expand Down Expand Up @@ -272,7 +272,7 @@ def _license_rights_name(self, subject, predicate):
# DCAT-AP CH v1: the license as a literal (should be
# the code for one of the DCAT-AP CH licenses)
if isinstance(node, Literal):
return unicode(node)
return str(node)
if isinstance(node, URIRef):
return dh.get_license_name_by_uri(node)
return None
Expand All @@ -294,7 +294,7 @@ def _keywords(self, subject):

for keyword_node in self.g.objects(subject, DCAT.keyword):
lang = keyword_node.language
keyword = munge_tag(unicode(keyword_node))
keyword = munge_tag(str(keyword_node))
keywords.setdefault(lang, []).append(keyword)

return keywords
Expand Down Expand Up @@ -352,14 +352,14 @@ def _temporals(self, subject):
return temporals

def _clean_datetime(self, datetime_value, data_type):
"""Convert a literal in one of the accepted data types into an isoformat
datetime string.
"""Convert a literal in one of the accepted data types into an
isoformat datetime string.

Accepted types are: xsd:date, xsd:dateTime, xsd:gYear, or
xsd:gYearMonth; or schema:Date or schema:DateTime, for temporals
specified as schema:startDate and schema:endDate.

We only consider the parts of the date that are expected from the given
We only consider parts of the date that are expected from the given
data_type, e.g. the year of an xsd:gYear, even if the month and day
have been included in the datetime_value. If a datetime_value with
data_type of xsd:dateTime or schema:DateTime does not contain time
Expand Down Expand Up @@ -438,7 +438,7 @@ def _clean_end_datetime(self, datetime_value, data_type):
def _get_eu_accrual_periodicity(self, subject):
    """Map a dataset's dct:accrualPeriodicity onto the EU frequency key.

    Reads the harvested frequency URI from the graph and, if it appears
    as a value in ``valid_frequencies``, returns the corresponding EU
    key; otherwise the original URIRef is returned unchanged.
    """
    ogdch_value = self._object_value(subject, DCT.accrualPeriodicity)
    ogdch_value = URIRef(ogdch_value)
    for key, value in valid_frequencies.items():
        if ogdch_value == value:
            # Return on the first match; continuing the scan would
            # compare the replacement key against the remaining values.
            return key
    return ogdch_value
Expand All @@ -461,12 +461,18 @@ def _get_groups(self, subject):
for dcat_theme_url in dcat_theme_urls:
eu_theme_url = None

# Python 2 / 3 compatibility
import sys
if sys.version_info[0] >= 3:
unicode = str

# Case 1: We get a deprecated opendata.swiss theme. Replace
# the base url with the dcat-ap.ch base url, so we can
# look it up in the theme mapping.
if dcat_theme_url.startswith(OGD_THEMES_URI):
new_theme_url = dcat_theme_url.replace(
OGD_THEMES_URI, CHTHEMES_URI)

eu_theme_url = unicode(
eu_theme_mapping[URIRef(new_theme_url)][0])

Expand Down Expand Up @@ -545,7 +551,7 @@ def parse_dataset(self, dataset_dict, dataset_ref): # noqa
# Tags
keywords = self._object_value_list(dataset_ref, DCAT.keyword) or []
for keyword in keywords:
dataset_dict['tags'].append({'name': munge_tag(unicode(keyword))})
dataset_dict['tags'].append({'name': munge_tag(str(keyword))})

# Keywords
dataset_dict['keywords'] = self._keywords(dataset_ref)
Expand Down Expand Up @@ -726,7 +732,7 @@ def graph_from_dataset(self, dataset_dict, dataset_ref): # noqa

g = self.g

for prefix, namespace in namespaces.iteritems():
for prefix, namespace in namespaces.items():
g.bind(prefix, namespace)

g.add((dataset_ref, RDF.type, DCAT.Dataset))
Expand Down Expand Up @@ -1095,10 +1101,11 @@ def graph_from_catalog(self, catalog_dict, catalog_ref):

def _accrual_periodicity_to_graph(self, dataset_ref, accrual_periodicity):
g = self.g
old_valid_frequencies = filter(
lambda i: i != URIRef(
"http://purl.org/cld/freq/completelyIrregular"),
list(valid_frequencies.values()))
old_valid_frequencies = [
i for i in list(valid_frequencies.values())
if i != URIRef(
"http://purl.org/cld/freq/completelyIrregular")
]
if URIRef(accrual_periodicity) in \
old_valid_frequencies + list(valid_frequencies.keys()):
g.add((
Expand Down
Loading
Loading