From 0019785c45244c58b98fa890eb527de87aafab33 Mon Sep 17 00:00:00 2001 From: CarlosCoelhoSL <110818364+CarlosCoelhoSL@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:28:05 +0000 Subject: [PATCH] Add data step 1 (#285) * first stage added to submodule * refactors log analysis in add-data * adds happy path test and start of acceptance test * adds/fixes tests * adds pipeline and collection validation * adds acceptance tests * tidies logs and comments * adds consecutive run handling (temporary) * parametrizes tests * raises exception for duplicate endpoint added * fixes log endpoint message * modifies collection.load() to handle logs without entry in source.csv * adds log message to collection.load() * adds extra url validity check * adds tests for collection and is_url_valid * removes unnecessary raise * removes unnecessary pipeline_dir fixtures * uses specification and organisation classes, moves validation to utils * removes log_path from Collector.fetch return * raises HTTPError when failing to collect from URL * adds logging to try except in collection.py * changes default logging in collection.py to True * renames logging boolean * renames error logging boolean * removes collection.py edits, now deletes log after exiting --------- Co-authored-by: averheecke-tpx --- digital_land/cli.py | 31 ++ digital_land/commands.py | 188 ++++++++++ digital_land/specification.py | 7 + digital_land/utils/add_data_utils.py | 50 +++ tests/acceptance/test_add_data.py | 262 +++++++++++++ tests/data/specification/licence.csv | 3 + tests/integration/test_add_data.py | 444 +++++++++++++++++++++++ tests/integration/test_add_data_utils.py | 19 + tests/unit/test_add_data_utils.py | 70 ++++ 9 files changed, 1074 insertions(+) create mode 100644 digital_land/utils/add_data_utils.py create mode 100644 tests/acceptance/test_add_data.py create mode 100644 tests/data/specification/licence.csv create mode 100644 tests/integration/test_add_data.py create mode 100644 tests/integration/test_add_data_utils.py create mode 100644 tests/unit/test_add_data_utils.py diff --git a/digital_land/cli.py b/digital_land/cli.py index aae62b52..a7b85988 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -30,6 +30,7 @@ organisation_check, save_state, compare_state, + add_data, ) from digital_land.command_arguments import ( @@ -346,6 +347,36 @@ def retire_endpoints_cmd(config_collections_dir, csv_path): return collection_retire_endpoints_and_sources(config_collections_dir, csv_path) +@cli.command("add-data") +@click.argument("csv-path", nargs=1, type=click.Path()) +@click.argument("collection-name", nargs=1, type=click.STRING) +@click.option("--collection-dir", "-c", nargs=1, type=click.Path(exists=True)) +@click.option( + "--specification-dir", "-s", type=click.Path(exists=True), default="specification/" +) +@click.option( + "--organisation-path", + "-o", + type=click.Path(exists=True), + default="var/cache/organisation.csv", +) +def add_data_cmd( + csv_path, collection_name, collection_dir, specification_dir, organisation_path +): + csv_file_path = Path(csv_path) + if not csv_file_path.is_file(): + logging.error(f"CSV file not found at path: {csv_path}") + sys.exit(2) + + return add_data( + csv_file_path, + collection_name, + collection_dir, + specification_dir, + organisation_path, + ) + + # edit to add collection_name in @cli.command("add-endpoints-and-lookups") @click.argument("csv-path", nargs=1, type=click.Path()) diff --git a/digital_land/commands.py b/digital_land/commands.py index e3b3947f..c9c3e441 100644 ---
a/digital_land/commands.py +++ b/digital_land/commands.py @@ -8,8 +8,10 @@ from packaging.version import Version import pandas as pd from pathlib import Path +from datetime import datetime import geojson +from requests import HTTPError import shapely from digital_land.package.organisation import OrganisationPackage @@ -57,6 +59,7 @@ from digital_land.configuration.main import Config from digital_land.api import API from digital_land.state import State +from digital_land.utils.add_data_utils import clear_log, is_date_valid, is_url_valid from .register import hash_value from .utils.gdal_utils import get_gdal_version @@ -531,6 +534,191 @@ def collection_add_source(entry, collection, endpoint_url, collection_dir): add_source_endpoint(entry, directory=collection_dir) +def validate_and_add_data_input( + csv_file_path, collection_name, collection_dir, specification_dir, organisation_path +): + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + specification = Specification(specification_dir) + organisation = Organisation(organisation_path=organisation_path) + + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + # ===== FIRST VALIDATION BASED ON IMPORT.CSV INFO + # - Check licence, url, date, organisation + + # read and process each record of the new endpoints csv at csv_file_path, i.e. import.csv + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns in the input .csv + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + for row in reader: + # validate licence + if row["licence"] == "": + raise ValueError("Licence is blank") + elif not specification.licence.get(row["licence"], None): + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + # check that the urls are not blank and are valid + is_endpoint_valid, endpoint_valid_error = is_url_valid( + row["endpoint-url"], "endpoint_url" + ) + is_documentation_valid, documentation_valid_error = is_url_valid( + row["documentation-url"], "documentation_url" + ) + if not is_endpoint_valid or not is_documentation_valid: + raise ValueError( + f"{endpoint_valid_error} \n {documentation_valid_error}" + ) + + # if there is no start-date, do we want to populate it with today's date?
+ if row["start-date"]: + valid_date, error = is_date_valid(row["start-date"], "start-date") + if not valid_date: + raise ValueError(error) + + # validate organisation + if row["organisation"] == "": + raise ValueError("The organisation must not be blank") + elif not organisation.lookup(row["organisation"]): + raise ValueError( + f"The given organisation '{row['organisation']}' is not in our valid organisations" + ) + + # validate pipeline(s) - do they exist and are they in the collection + pipelines = row["pipelines"].split(";") + for pipeline in pipelines: + if not specification.dataset.get(pipeline, None): + raise ValueError( + f"'{pipeline}' is not a valid dataset in the specification" + ) + collection_in_specification = specification.dataset.get( + pipeline, None + ).get("collection") + if collection_name != collection_in_specification: + raise ValueError( + f"'{pipeline}' does not belong to provided collection {collection_name}" + ) + + # VALIDATION DONE, NOW ADD TO COLLECTION + print("======================================================================") + print("Endpoint and source details") + print("======================================================================") + print("Endpoint URL: ", row["endpoint-url"]) + print("Endpoint Hash:", hash_value(row["endpoint-url"])) + print("Documentation URL: ", row["documentation-url"]) + print() + + endpoints = [] + # if endpoint already exists, it will indicate it and quit function here + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + else: + # We rely on the add_source_endpoint function to log why it couldn't be added + raise Exception( + "Endpoint and source could not be added - is this a duplicate endpoint?" 
+ ) + + # if successfully added, we can now attempt to fetch from the endpoint + collector = Collector(collection_dir=collection_dir) + endpoint_resource_info = {} + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + try: + log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) + with open(log_path, "r") as f: + log = json.load(f) + except Exception as e: + print( + f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" + ) + break + + status = log.get("status", None) + # Raise exception if status is not 200 + if not status or status != "200": + exception = log.get("exception", None) + raise HTTPError( + f"Failed to collect from URL with status: {status if status else exception}" + ) + + # The resource and its path are only printed if the download succeeded, which should only be the case when the status is 200 + resource = log.get("resource", None) + if resource: + print( + "Resource collected: ", + resource, + ) + print( + "Resource Path is: ", + Path(collection_dir) / "resource" / resource, + ) + + print(f"Log Status for {endpoint['endpoint']}: The status is {status}") + endpoint_resource_info.update( + { + "endpoint": endpoint["endpoint"], + "resource": log.get("resource"), + "pipelines": row["pipelines"].split(";"), + } + ) + + return collection, endpoint_resource_info + + +def add_data( + csv_file_path, collection_name, collection_dir, specification_dir, organisation_path +): + # Potentially track a list of files to clean up at the end of the session? e.g. log file + + # First validate the input .csv and collect from the endpoint + collection, endpoint_resource_info = validate_and_add_data_input( + csv_file_path, + collection_name, + collection_dir, + specification_dir, + organisation_path, + ) + # At this point the endpoint will have been added to the collection + + user_response = ( + input("Do you want to continue processing this resource?
(yes/no): ") + .strip() + .lower() + ) + + if user_response != "yes": + print("Operation cancelled by user.") + clear_log(collection_dir, endpoint_resource_info["endpoint"]) + return + + def add_endpoints_and_lookups( csv_file_path, collection_name, diff --git a/digital_land/specification.py b/digital_land/specification.py index 6facce5a..67a5034f 100644 --- a/digital_land/specification.py +++ b/digital_land/specification.py @@ -39,6 +39,7 @@ def __init__(self, path="specification"): self.schema_field = {} self.typology = {} self.pipeline = {} + self.licence = {} self.load_dataset(path) self.load_schema(path) self.load_dataset_schema(path) @@ -48,6 +49,7 @@ def __init__(self, path="specification"): self.load_typology(path) self.load_pipeline(path) self.load_dataset_field(path) + self.load_licence(path) self.index_field() self.index_schema() @@ -111,6 +113,11 @@ def load_pipeline(self, path): for row in reader: self.pipeline[row["pipeline"]] = row + def load_licence(self, path): + reader = csv.DictReader(open(os.path.join(path, "licence.csv"))) + for row in reader: + self.licence[row["licence"]] = row + def index_schema(self): self.schema_dataset = {} for dataset, d in self.dataset_schema.items(): diff --git a/digital_land/utils/add_data_utils.py b/digital_land/utils/add_data_utils.py new file mode 100644 index 00000000..4f95f6d7 --- /dev/null +++ b/digital_land/utils/add_data_utils.py @@ -0,0 +1,50 @@ +import os +from datetime import datetime +from urllib.parse import urlparse + +from digital_land.collect import Collector + + +def is_url_valid(url, url_type): + if not url or url.strip() == "": + return False, f"The {url_type} must be populated" + + parsed_url = urlparse(url) + # is url scheme valid i.e start with http:// or https:// + if parsed_url.scheme not in ["http", "https"] or not parsed_url.scheme: + return False, f"The {url_type} must start with 'http://' or 'https://'" + + # does url have domain + if not parsed_url.netloc: + return False, f"The {url_type} must have a domain" + + # ensure domain has correct format + if "." 
not in parsed_url.netloc: + return ( + False, + f"The {url_type} must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')", + ) + + return True, "" + + +def is_date_valid(date, date_type): + if len(date) == 0: + return False, "Date is blank" + try: + date = datetime.strptime(date, "%Y-%m-%d").date() + # need to catch ValueError here otherwise datetime will raise its own error, not the clear message we want + except ValueError: + return False, f"{date_type} {date} must be format YYYY-MM-DD" + + if date > datetime.today().date(): + return False, f"The {date_type} {date} cannot be in the future" + + return True, "" + + +def clear_log(collection_dir, endpoint): + collector = Collector(collection_dir=collection_dir) + log_path = collector.log_path(datetime.utcnow(), endpoint) + if os.path.isfile(log_path): + os.remove(log_path) diff --git a/tests/acceptance/test_add_data.py b/tests/acceptance/test_add_data.py new file mode 100644 index 00000000..2cdf96cd --- /dev/null +++ b/tests/acceptance/test_add_data.py @@ -0,0 +1,262 @@ +import csv +import os +import tempfile +from unittest.mock import Mock +from click.testing import CliRunner +import pytest + +from digital_land.cli import cli +from tests.acceptance.conftest import copy_latest_specification_files_to + + +@pytest.fixture(scope="module") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + copy_latest_specification_files_to(specification_dir) + return specification_dir + + +@pytest.fixture(scope="function") +def collection_dir(tmp_path_factory): + collection_dir = tmp_path_factory.mktemp("collection") + + # create source csv + source_fieldnames = [ + "attribution", + "collection", + "documentation-url", + "endpoint", + "licence", + "organisation", + "pipelines", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "source.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=source_fieldnames) + dictwriter.writeheader() + + # create endpoint csv + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "endpoint.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + dictwriter.writeheader() + return collection_dir + + +@pytest.fixture(scope="module") +def organisation_csv(): + organisation_path = tempfile.NamedTemporaryFile().name + organisation_fieldnames = [ + "dataset", + "end-date", + "entity", + "entry-date", + "name", + "organisation", + "prefix", + "reference", + "start-date", + ] + organisation_row = { + "dataset": "local-authority", + "end-date": "", + "entity": 314, + "entry-date": "2023-11-19", + "name": "South Staffordshire Council", + "organisation": "local-authority:SST", + "prefix": "local-authority", + "reference": "SST", + "start-date": "", + } + + with open(organisation_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=organisation_fieldnames) + writer.writeheader() + writer.writerow(organisation_row) + + return organisation_path + + +@pytest.fixture +def mock_request_get(mocker): + data = {"reference": "1", "value": "test"} + csv_content = str(data).encode("utf-8") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = csv_content + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + + +def
create_input_csv( + data, + fieldnames=[ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + "licence", + ], +): + tmp_input_path = tempfile.NamedTemporaryFile().name + + with open(tmp_input_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(data) + + return tmp_input_path + + +def test_cli_add_data( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, + monkeypatch, +): + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(no_error_input_data) + + # Mock in user input + monkeypatch.setattr("builtins.input", lambda _: "yes") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + + assert result.exit_code == 0 + + +def test_cli_add_data_incorrect_input_data( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, +): + incorrect_input_data = { + "organisation": "", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(incorrect_input_data) + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + assert result.exit_code == 1 + assert "organisation must not be blank" in str(result.exception) + + +# This test exists as there is potential for the collection.load() to fail when +# there are leftover log files from a previous run +def test_cli_add_data_consecutive_runs( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, + monkeypatch, +): + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(no_error_input_data) + + # Mock in user input + monkeypatch.setattr("builtins.input", lambda _: "no") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + assert result.exit_code == 0 + + monkeypatch.setattr("builtins.input", lambda _: "yes") + # Now run a second time + result = runner.invoke( 
+ cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + assert result.exit_code == 0 diff --git a/tests/data/specification/licence.csv b/tests/data/specification/licence.csv new file mode 100644 index 00000000..29f32936 --- /dev/null +++ b/tests/data/specification/licence.csv @@ -0,0 +1,3 @@ +end-date,entity,entry-date,licence,start-date,text +,,,licence1,, +,,,licence2,, \ No newline at end of file diff --git a/tests/integration/test_add_data.py b/tests/integration/test_add_data.py new file mode 100644 index 00000000..25f37f55 --- /dev/null +++ b/tests/integration/test_add_data.py @@ -0,0 +1,444 @@ +import csv +import logging +import os +import tempfile +from unittest.mock import Mock +import pytest +from requests import HTTPError + +from digital_land.commands import validate_and_add_data_input +from tests.acceptance.conftest import copy_latest_specification_files_to + + +@pytest.fixture(scope="module") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + copy_latest_specification_files_to(specification_dir) + return specification_dir + + +@pytest.fixture(scope="function") +def collection_dir(tmp_path_factory): + collection_dir = tmp_path_factory.mktemp("collection") + + # create source csv + source_fieldnames = [ + "attribution", + "collection", + "documentation-url", + "endpoint", + "licence", + "organisation", + "pipelines", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "source.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=source_fieldnames) + dictwriter.writeheader() + + # create endpoint csv + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "endpoint.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + dictwriter.writeheader() + return collection_dir + + +@pytest.fixture(scope="module") +def organisation_csv(): + organisation_path = tempfile.NamedTemporaryFile().name + organisation_fieldnames = [ + "dataset", + "end-date", + "entity", + "entry-date", + "name", + "organisation", + "prefix", + "reference", + "start-date", + ] + organisation_row = { + "dataset": "local-authority", + "end-date": "", + "entity": 314, + "entry-date": "2023-11-19", + "name": "South Staffordshire Council", + "organisation": "local-authority:SST", + "prefix": "local-authority", + "reference": "SST", + "start-date": "", + } + + with open(organisation_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=organisation_fieldnames) + writer.writeheader() + writer.writerow(organisation_row) + + return organisation_path + + +@pytest.fixture +def mock_request_get(mocker): + data = {"reference": "1", "value": "test"} + csv_content = str(data).encode("utf-8") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = csv_content + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + + +def create_input_csv( + data, + fieldnames=[ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + "licence", + ], +): + tmp_input_path = tempfile.NamedTemporaryFile().name + + with open(tmp_input_path, "w") as 
f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(data) + + return tmp_input_path + + +def test_validate_and_add_data_input_no_error( + collection_dir, + specification_dir, + organisation_csv, + caplog, + mock_request_get, +): + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with caplog.at_level(logging.ERROR): + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert len(caplog.text) == 0 + + +def test_validate_and_add_data_input_missing_columns( + collection_dir, specification_dir, organisation_csv, mock_request_get +): + collection_name = "conservation-area" + missing_column_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + } + missing_column_fieldnames = [ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + ] + tmp_input_path = create_input_csv( + missing_column_input_data, fieldnames=missing_column_fieldnames + ) + + with pytest.raises(Exception) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert "required column (licence) not found in csv" in str(error) + + +@pytest.mark.parametrize( + "input_data, error_message", + [ + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "", + }, + "Licence is blank", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "incorrect", + }, + "'incorrect' is not a valid licence according to the specification", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "endpoint_url must be populated", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": 
"https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "endpoint_url must start with 'http://' or 'https://'", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://sstaffs", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "The endpoint_url must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "01/01/2000", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "start-date 01/01/2000 must be format YYYY-MM-DD'", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "9999-01-01", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "start-date 9999-01-01 cannot be in the future", + ), + ( + { + "organisation": "", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "organisation must not be blank", + ), + ( + { + "organisation": "???", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "'???' 
is not in our valid organisations", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area;invalid-pipeline", + "plugin": "", + "licence": "ogl3", + }, + "'invalid-pipeline' is not a valid dataset in the specification", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area;brownfield-land", + "plugin": "", + "licence": "ogl3", + }, + "'brownfield-land' does not belong to provided collection conservation-area", + ), + ], +) +def test_validate_and_add_data( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, + input_data, + error_message, +): + collection_name = "conservation-area" + tmp_input_path = create_input_csv(input_data) + with pytest.raises(ValueError) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert error_message in str(error) + + +def test_validate_and_add_data_input_non_200( + collection_dir, specification_dir, organisation_csv, mocker +): + + mock_response = Mock() + mock_response.status_code = 404 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = "" + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-are", + "endpoint-url": "https://www.sstaffs.gov.uk/random_url", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with pytest.raises(HTTPError) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + + assert "Failed to collect from URL with status: 404" in str(error) + + +def test_validate_and_add_data_input_duplicate_endpoint( + collection_dir, + specification_dir, + organisation_csv, + capsys, + mock_request_get, +): + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + endpoint_duplicate_row = { + "endpoint": "ebeea8689113e04aa6e709520e0b437aa0f39e80952498dd00663d9cfabb2030", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "parameters": "", + "plugin": "", + "entry-date": "", + "start-date": "", + "end-date": "", + } + with open(os.path.join(collection_dir, "endpoint.csv"), "a") as f: + writer = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + writer.writeheader() + writer.writerow(endpoint_duplicate_row) + + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": 
"https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with pytest.raises(Exception): + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert "endpoint already exists" in capsys.readouterr().out diff --git a/tests/integration/test_add_data_utils.py b/tests/integration/test_add_data_utils.py new file mode 100644 index 00000000..64f87ee5 --- /dev/null +++ b/tests/integration/test_add_data_utils.py @@ -0,0 +1,19 @@ +from datetime import datetime +import os + +from digital_land.utils.add_data_utils import clear_log + + +def test_clear_logs(tmp_path_factory): + today = datetime.utcnow().isoformat()[:10] + endpoint = "endpoint" + collection_dir = tmp_path_factory.mktemp("random_collection") + + file_path = os.path.join(collection_dir, "log", today, f"{endpoint}.json") + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w") as f: + f.write("hello") + + clear_log(collection_dir, endpoint) + + assert not os.path.isfile(file_path) diff --git a/tests/unit/test_add_data_utils.py b/tests/unit/test_add_data_utils.py new file mode 100644 index 00000000..a0752e79 --- /dev/null +++ b/tests/unit/test_add_data_utils.py @@ -0,0 +1,70 @@ +import pytest + +from digital_land.commands import is_url_valid +from digital_land.utils.add_data_utils import is_date_valid + + +def test_is_url_valid(): + isValid, error = is_url_valid("https://www.google.com", "URL") + + assert isValid + assert error == "" + + +@pytest.mark.parametrize( + "url, error_message", + [ + ( + "", + "The URL must be populated", + ), + ( + "www.google.com", + "The URL must start with 'http://' or 'https://'", + ), + ( + "https:///query=?a=1&b=2", + "The URL must have a domain", + ), + ( + "https://google", + "The URL must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')", + ), + ], +) +def test_is_url_valid_error(url, error_message): + isValid, error = is_url_valid(url, "URL") + + assert not isValid + assert error == error_message + + +def test_is_date_valid(): + isValid, error = is_date_valid("2000-12-25", "date") + + assert isValid + assert error == "" + + +@pytest.mark.parametrize( + "date, error_message", + [ + ( + "", + "Date is blank", + ), + ( + "25-12-2000", + "date 25-12-2000 must be format YYYY-MM-DD", + ), + ( + "9999-12-25", + "The date 9999-12-25 cannot be in the future", + ), + ], +) +def test_is_date_valid_error(date, error_message): + isValid, error = is_date_valid(date, "date") + + assert not isValid + assert error == error_message