From 0019785c45244c58b98fa890eb527de87aafab33 Mon Sep 17 00:00:00 2001 From: CarlosCoelhoSL <110818364+CarlosCoelhoSL@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:28:05 +0000 Subject: [PATCH] Add data step 1 (#285) * first stage added to submodule * refactors log analysis in add-data * adds happy path test and start of acceptance test * adds/fixes tests * adds pipeline and collection validation * adds acceptance tests * tidies logs and comments * adds consecutive run handling (temporary) * parametrizes tests * raises exception for duplicate endpoint added * fixes log endpoint message * modifies collection.load() to handle logs without entry in source.csv * adds log message to collection.load() * adds extra url validity check * adds tests for collection and is_url_valid * removes unnecessary raise * removes unnecessary pipeline_dir fixtures * uses specification and organisation classes, moves validation to utils * removes log_path from Collector.fetch return * raises HTTPError when failing to collect from URL * adds logging to try except in collection.py * changes default logging in collection.py to True * renames logging boolean * renames error logging boolean * removes collection.py edits, now deletes log after exiting --------- Co-authored-by: averheecke-tpx --- digital_land/cli.py | 31 ++ digital_land/commands.py | 188 ++++++++++ digital_land/specification.py | 7 + digital_land/utils/add_data_utils.py | 50 +++ tests/acceptance/test_add_data.py | 262 +++++++++++++ tests/data/specification/licence.csv | 3 + tests/integration/test_add_data.py | 444 +++++++++++++++++++++++ tests/integration/test_add_data_utils.py | 19 + tests/unit/test_add_data_utils.py | 70 ++++ 9 files changed, 1074 insertions(+) create mode 100644 digital_land/utils/add_data_utils.py create mode 100644 tests/acceptance/test_add_data.py create mode 100644 tests/data/specification/licence.csv create mode 100644 tests/integration/test_add_data.py create mode 100644 tests/integration/test_add_data_utils.py create mode 100644 tests/unit/test_add_data_utils.py diff --git a/digital_land/cli.py b/digital_land/cli.py index aae62b52..a7b85988 100644 --- a/digital_land/cli.py +++ b/digital_land/cli.py @@ -30,6 +30,7 @@ organisation_check, save_state, compare_state, + add_data, ) from digital_land.command_arguments import ( @@ -346,6 +347,36 @@ def retire_endpoints_cmd(config_collections_dir, csv_path): return collection_retire_endpoints_and_sources(config_collections_dir, csv_path) +@cli.command("add-data") +@click.argument("csv-path", nargs=1, type=click.Path()) +@click.argument("collection-name", nargs=1, type=click.STRING) +@click.option("--collection-dir", "-c", nargs=1, type=click.Path(exists=True)) +@click.option( + "--specification-dir", "-s", type=click.Path(exists=True), default="specification/" +) +@click.option( + "--organisation-path", + "-o", + type=click.Path(exists=True), + default="var/cache/organisation.csv", +) +def add_data_cmd( + csv_path, collection_name, collection_dir, specification_dir, organisation_path +): + csv_file_path = Path(csv_path) + if not csv_file_path.is_file(): + logging.error(f"CSV file not found at path: {csv_path}") + sys.exit(2) + + return add_data( + csv_file_path, + collection_name, + collection_dir, + specification_dir, + organisation_path, + ) + + # edit to add collection_name in @cli.command("add-endpoints-and-lookups") @click.argument("csv-path", nargs=1, type=click.Path()) diff --git a/digital_land/commands.py b/digital_land/commands.py index e3b3947f..c9c3e441 100644 ---
a/digital_land/commands.py +++ b/digital_land/commands.py @@ -8,8 +8,10 @@ from packaging.version import Version import pandas as pd from pathlib import Path +from datetime import datetime import geojson +from requests import HTTPError import shapely from digital_land.package.organisation import OrganisationPackage @@ -57,6 +59,7 @@ from digital_land.configuration.main import Config from digital_land.api import API from digital_land.state import State +from digital_land.utils.add_data_utils import clear_log, is_date_valid, is_url_valid from .register import hash_value from .utils.gdal_utils import get_gdal_version @@ -531,6 +534,191 @@ def collection_add_source(entry, collection, endpoint_url, collection_dir): add_source_endpoint(entry, directory=collection_dir) +def validate_and_add_data_input( + csv_file_path, collection_name, collection_dir, specification_dir, organisation_path +): + expected_cols = [ + "pipelines", + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "licence", + ] + + specification = Specification(specification_dir) + organisation = Organisation(organisation_path=organisation_path) + + collection = Collection(name=collection_name, directory=collection_dir) + collection.load() + # ===== FIRST VALIDATION BASED ON IMPORT.CSV INFO + # - Check licence, url, date, organisation + + # read and process each record of the new endpoints csv at csv_file_path, i.e. import.csv + + with open(csv_file_path) as new_endpoints_file: + reader = csv.DictReader(new_endpoints_file) + csv_columns = reader.fieldnames + + # validate the columns in the input .csv + for expected_col in expected_cols: + if expected_col not in csv_columns: + raise Exception(f"required column ({expected_col}) not found in csv") + + for row in reader: + # validate licence + if row["licence"] == "": + raise ValueError("Licence is blank") + elif not specification.licence.get(row["licence"], None): + raise ValueError( + f"Licence '{row['licence']}' is not a valid licence according to the specification." + ) + # check that the urls are not blank and are valid + is_endpoint_valid, endpoint_valid_error = is_url_valid( + row["endpoint-url"], "endpoint_url" + ) + is_documentation_valid, documentation_valid_error = is_url_valid( + row["documentation-url"], "documentation_url" + ) + if not is_endpoint_valid or not is_documentation_valid: + raise ValueError( + f"{endpoint_valid_error} \n {documentation_valid_error}" + ) + + # if there is no start-date, do we want to populate it with today's date?
+ if row["start-date"]: + valid_date, error = is_date_valid(row["start-date"], "start-date") + if not valid_date: + raise ValueError(error) + + # validate organisation + if row["organisation"] == "": + raise ValueError("The organisation must not be blank") + elif not organisation.lookup(row["organisation"]): + raise ValueError( + f"The given organisation '{row['organisation']}' is not in our valid organisations" + ) + + # validate pipeline(s) - do they exist and are they in the collection + pipelines = row["pipelines"].split(";") + for pipeline in pipelines: + if not specification.dataset.get(pipeline, None): + raise ValueError( + f"'{pipeline}' is not a valid dataset in the specification" + ) + collection_in_specification = specification.dataset.get( + pipeline, None + ).get("collection") + if collection_name != collection_in_specification: + raise ValueError( + f"'{pipeline}' does not belong to provided collection {collection_name}" + ) + + # VALIDATION DONE, NOW ADD TO COLLECTION + print("======================================================================") + print("Endpoint and source details") + print("======================================================================") + print("Endpoint URL: ", row["endpoint-url"]) + print("Endpoint Hash:", hash_value(row["endpoint-url"])) + print("Documentation URL: ", row["documentation-url"]) + print() + + endpoints = [] + # if endpoint already exists, it will indicate it and quit function here + if collection.add_source_endpoint(row): + endpoint = { + "endpoint-url": row["endpoint-url"], + "endpoint": hash_value(row["endpoint-url"]), + "end-date": row.get("end-date", ""), + "plugin": row.get("plugin"), + "licence": row["licence"], + } + endpoints.append(endpoint) + else: + # We rely on the add_source_endpoint function to log why it couldn't be added + raise Exception( + "Endpoint and source could not be added - is this a duplicate endpoint?" 
+ ) + + # if successfully added, we can now attempt to fetch from the endpoint + collector = Collector(collection_dir=collection_dir) + endpoint_resource_info = {} + for endpoint in endpoints: + collector.fetch( + url=endpoint["endpoint-url"], + endpoint=endpoint["endpoint"], + end_date=endpoint["end-date"], + plugin=endpoint["plugin"], + ) + try: + log_path = collector.log_path(datetime.utcnow(), endpoint["endpoint"]) + with open(log_path, "r") as f: + log = json.load(f) + except Exception as e: + print( + f"Error: The log file for {endpoint} could not be read from path {log_path}.\n{e}" + ) + break + + status = log.get("status", None) + # Raise exception if status is not 200 + if not status or status != "200": + exception = log.get("exception", None) + raise HTTPError( + f"Failed to collect from URL with status: {status if status else exception}" + ) + + # The resource and its path are only printed if the download succeeded, which should only be the case when the status is 200 + resource = log.get("resource", None) + if resource: + print( + "Resource collected: ", + resource, + ) + print( + "Resource Path is: ", + Path(collection_dir) / "resource" / resource, + ) + + print(f"Log Status for {endpoint['endpoint']}: The status is {status}") + endpoint_resource_info.update( + { + "endpoint": endpoint["endpoint"], + "resource": log.get("resource"), + "pipelines": row["pipelines"].split(";"), + } + ) + + return collection, endpoint_resource_info + + +def add_data( + csv_file_path, collection_name, collection_dir, specification_dir, organisation_path +): + # Potentially track a list of files to clean up at the end of the session? e.g. log file + + # First validate the input .csv and collect from the endpoint + collection, endpoint_resource_info = validate_and_add_data_input( + csv_file_path, + collection_name, + collection_dir, + specification_dir, + organisation_path, + ) + # At this point the endpoint will have been added to the collection + + user_response = ( + input("Do you want to continue processing this resource?
(yes/no): ") + .strip() + .lower() + ) + + if user_response != "yes": + print("Operation cancelled by user.") + clear_log(collection_dir, endpoint_resource_info["endpoint"]) + return + + def add_endpoints_and_lookups( csv_file_path, collection_name, diff --git a/digital_land/specification.py b/digital_land/specification.py index 6facce5a..67a5034f 100644 --- a/digital_land/specification.py +++ b/digital_land/specification.py @@ -39,6 +39,7 @@ def __init__(self, path="specification"): self.schema_field = {} self.typology = {} self.pipeline = {} + self.licence = {} self.load_dataset(path) self.load_schema(path) self.load_dataset_schema(path) @@ -48,6 +49,7 @@ def __init__(self, path="specification"): self.load_typology(path) self.load_pipeline(path) self.load_dataset_field(path) + self.load_licence(path) self.index_field() self.index_schema() @@ -111,6 +113,11 @@ def load_pipeline(self, path): for row in reader: self.pipeline[row["pipeline"]] = row + def load_licence(self, path): + reader = csv.DictReader(open(os.path.join(path, "licence.csv"))) + for row in reader: + self.licence[row["licence"]] = row + def index_schema(self): self.schema_dataset = {} for dataset, d in self.dataset_schema.items(): diff --git a/digital_land/utils/add_data_utils.py b/digital_land/utils/add_data_utils.py new file mode 100644 index 00000000..4f95f6d7 --- /dev/null +++ b/digital_land/utils/add_data_utils.py @@ -0,0 +1,50 @@ +import os +from datetime import datetime +from urllib.parse import urlparse + +from digital_land.collect import Collector + + +def is_url_valid(url, url_type): + if not url or url.strip() == "": + return False, f"The {url_type} must be populated" + + parsed_url = urlparse(url) + # is url scheme valid i.e start with http:// or https:// + if parsed_url.scheme not in ["http", "https"] or not parsed_url.scheme: + return False, f"The {url_type} must start with 'http://' or 'https://'" + + # does url have domain + if not parsed_url.netloc: + return False, f"The {url_type} must have a domain" + + # ensure domain has correct format + if "." 
not in parsed_url.netloc: + return ( + False, + f"The {url_type} must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')", + ) + + return True, "" + + +def is_date_valid(date, date_type): + if len(date) == 0: + return False, "Date is blank" + try: + date = datetime.strptime(date, "%Y-%m-%d").date() + # need to catch ValueError here otherwise datetime will raise its own error, not the clear message we want + except ValueError: + return False, f"{date_type} {date} must be format YYYY-MM-DD" + + if date > datetime.today().date(): + return False, f"The {date_type} {date} cannot be in the future" + + return True, "" + + +def clear_log(collection_dir, endpoint): + collector = Collector(collection_dir=collection_dir) + log_path = collector.log_path(datetime.utcnow(), endpoint) + if os.path.isfile(log_path): + os.remove(log_path) diff --git a/tests/acceptance/test_add_data.py b/tests/acceptance/test_add_data.py new file mode 100644 index 00000000..2cdf96cd --- /dev/null +++ b/tests/acceptance/test_add_data.py @@ -0,0 +1,262 @@ +import csv +import os +import tempfile +from unittest.mock import Mock +from click.testing import CliRunner +import pytest + +from digital_land.cli import cli +from tests.acceptance.conftest import copy_latest_specification_files_to + + +@pytest.fixture(scope="module") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + copy_latest_specification_files_to(specification_dir) + return specification_dir + + +@pytest.fixture(scope="function") +def collection_dir(tmp_path_factory): + collection_dir = tmp_path_factory.mktemp("collection") + + # create source csv + source_fieldnames = [ + "attribution", + "collection", + "documentation-url", + "endpoint", + "licence", + "organisation", + "pipelines", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "source.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=source_fieldnames) + dictwriter.writeheader() + + # create endpoint csv + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "endpoint.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + dictwriter.writeheader() + return collection_dir + + +@pytest.fixture(scope="module") +def organisation_csv(): + organisation_path = tempfile.NamedTemporaryFile().name + organisation_fieldnames = [ + "dataset", + "end-date", + "entity", + "entry-date", + "name", + "organisation", + "prefix", + "reference", + "start-date", + ] + organisation_row = { + "dataset": "local-authority", + "end-date": "", + "entity": 314, + "entry-date": "2023-11-19", + "name": "South Staffordshire Council", + "organisation": "local-authority:SST", + "prefix": "local-authority", + "reference": "SST", + "start-date": "", + } + + with open(organisation_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=organisation_fieldnames) + writer.writeheader() + writer.writerow(organisation_row) + + return organisation_path + + +@pytest.fixture +def mock_request_get(mocker): + data = {"reference": "1", "value": "test"} + csv_content = str(data).encode("utf-8") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = csv_content + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + + +def
create_input_csv( + data, + fieldnames=[ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + "licence", + ], +): + tmp_input_path = tempfile.NamedTemporaryFile().name + + with open(tmp_input_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(data) + + return tmp_input_path + + +def test_cli_add_data( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, + monkeypatch, +): + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(no_error_input_data) + + # Mock in user input + monkeypatch.setattr("builtins.input", lambda _: "yes") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + + assert result.exit_code == 0 + + +def test_cli_add_data_incorrect_input_data( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, +): + incorrect_input_data = { + "organisation": "", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(incorrect_input_data) + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + assert result.exit_code == 1 + assert "organisation must not be blank" in str(result.exception) + + +# This test exists as there is potential for the collection.load() to fail when +# there are leftover log files from a previous run +def test_cli_add_data_consecutive_runs( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, + monkeypatch, +): + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + csv_path = create_input_csv(no_error_input_data) + + # Mock in user input + monkeypatch.setattr("builtins.input", lambda _: "no") + + runner = CliRunner() + result = runner.invoke( + cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + assert result.exit_code == 0 + + monkeypatch.setattr("builtins.input", lambda _: "yes") + # Now run a second time + result = runner.invoke( 
+ cli, + [ + "add-data", + csv_path, + "conservation-area", + "--collection-dir", + str(collection_dir), + "--specification-dir", + str(specification_dir), + "--organisation-path", + str(organisation_csv), + ], + ) + assert result.exit_code == 0 diff --git a/tests/data/specification/licence.csv b/tests/data/specification/licence.csv new file mode 100644 index 00000000..29f32936 --- /dev/null +++ b/tests/data/specification/licence.csv @@ -0,0 +1,3 @@ +end-date,entity,entry-date,licence,start-date,text +,,,licence1,, +,,,licence2,, \ No newline at end of file diff --git a/tests/integration/test_add_data.py b/tests/integration/test_add_data.py new file mode 100644 index 00000000..25f37f55 --- /dev/null +++ b/tests/integration/test_add_data.py @@ -0,0 +1,444 @@ +import csv +import logging +import os +import tempfile +from unittest.mock import Mock +import pytest +from requests import HTTPError + +from digital_land.commands import validate_and_add_data_input +from tests.acceptance.conftest import copy_latest_specification_files_to + + +@pytest.fixture(scope="module") +def specification_dir(tmp_path_factory): + specification_dir = tmp_path_factory.mktemp("specification") + copy_latest_specification_files_to(specification_dir) + return specification_dir + + +@pytest.fixture(scope="function") +def collection_dir(tmp_path_factory): + collection_dir = tmp_path_factory.mktemp("collection") + + # create source csv + source_fieldnames = [ + "attribution", + "collection", + "documentation-url", + "endpoint", + "licence", + "organisation", + "pipelines", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "source.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=source_fieldnames) + dictwriter.writeheader() + + # create endpoint csv + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + + with open(os.path.join(collection_dir, "endpoint.csv"), "w") as f: + dictwriter = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + dictwriter.writeheader() + return collection_dir + + +@pytest.fixture(scope="module") +def organisation_csv(): + organisation_path = tempfile.NamedTemporaryFile().name + organisation_fieldnames = [ + "dataset", + "end-date", + "entity", + "entry-date", + "name", + "organisation", + "prefix", + "reference", + "start-date", + ] + organisation_row = { + "dataset": "local-authority", + "end-date": "", + "entity": 314, + "entry-date": "2023-11-19", + "name": "South Staffordshire Council", + "organisation": "local-authority:SST", + "prefix": "local-authority", + "reference": "SST", + "start-date": "", + } + + with open(organisation_path, "w") as f: + writer = csv.DictWriter(f, fieldnames=organisation_fieldnames) + writer.writeheader() + writer.writerow(organisation_row) + + return organisation_path + + +@pytest.fixture +def mock_request_get(mocker): + data = {"reference": "1", "value": "test"} + csv_content = str(data).encode("utf-8") + + mock_response = Mock() + mock_response.status_code = 200 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = csv_content + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + + +def create_input_csv( + data, + fieldnames=[ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + "licence", + ], +): + tmp_input_path = tempfile.NamedTemporaryFile().name + + with open(tmp_input_path, "w") as 
f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerow(data) + + return tmp_input_path + + +def test_validate_and_add_data_input_no_error( + collection_dir, + specification_dir, + organisation_csv, + caplog, + mock_request_get, +): + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with caplog.at_level(logging.ERROR): + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert len(caplog.text) == 0 + + +def test_validate_and_add_data_input_missing_columns( + collection_dir, specification_dir, organisation_csv, mock_request_get +): + collection_name = "conservation-area" + missing_column_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + } + missing_column_fieldnames = [ + "organisation", + "documentation-url", + "endpoint-url", + "start-date", + "pipelines", + "plugin", + ] + tmp_input_path = create_input_csv( + missing_column_input_data, fieldnames=missing_column_fieldnames + ) + + with pytest.raises(Exception) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert "required column (licence) not found in csv" in str(error) + + +@pytest.mark.parametrize( + "input_data, error_message", + [ + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "", + }, + "Licence is blank", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "incorrect", + }, + "'incorrect' is not a valid licence according to the specification", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "endpoint_url must be populated", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": 
"https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "endpoint_url must start with 'http://' or 'https://'", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://sstaffs", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "The endpoint_url must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "01/01/2000", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "start-date 01/01/2000 must be format YYYY-MM-DD'", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "9999-01-01", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "start-date 9999-01-01 cannot be in the future", + ), + ( + { + "organisation": "", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "organisation must not be blank", + ), + ( + { + "organisation": "???", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + }, + "'???' 
is not in our valid organisations", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area;invalid-pipeline", + "plugin": "", + "licence": "ogl3", + }, + "'invalid-pipeline' is not a valid dataset in the specification", + ), + ( + { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area;brownfield-land", + "plugin": "", + "licence": "ogl3", + }, + "'brownfield-land' does not belong to provided collection conservation-area", + ), + ], +) +def test_validate_and_add_data( + collection_dir, + specification_dir, + organisation_csv, + mock_request_get, + input_data, + error_message, +): + collection_name = "conservation-area" + tmp_input_path = create_input_csv(input_data) + with pytest.raises(ValueError) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert error_message in str(error) + + +def test_validate_and_add_data_input_non_200( + collection_dir, specification_dir, organisation_csv, mocker +): + + mock_response = Mock() + mock_response.status_code = 404 + mock_response.request.headers = {"test": "test"} + mock_response.headers = {"test": "test"} + mock_response.content = "" + mocker.patch( + "requests.Session.get", + return_value=mock_response, + ) + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": "https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-are", + "endpoint-url": "https://www.sstaffs.gov.uk/random_url", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with pytest.raises(HTTPError) as error: + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + + assert "Failed to collect from URL with status: 404" in str(error) + + +def test_validate_and_add_data_input_duplicate_endpoint( + collection_dir, + specification_dir, + organisation_csv, + capsys, + mock_request_get, +): + endpoint_fieldnames = [ + "endpoint", + "endpoint-url", + "parameters", + "plugin", + "entry-date", + "start-date", + "end-date", + ] + endpoint_duplicate_row = { + "endpoint": "ebeea8689113e04aa6e709520e0b437aa0f39e80952498dd00663d9cfabb2030", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "parameters": "", + "plugin": "", + "entry-date": "", + "start-date": "", + "end-date": "", + } + with open(os.path.join(collection_dir, "endpoint.csv"), "a") as f: + writer = csv.DictWriter(f, fieldnames=endpoint_fieldnames) + writer.writeheader() + writer.writerow(endpoint_duplicate_row) + + collection_name = "conservation-area" + no_error_input_data = { + "organisation": "local-authority:SST", + "documentation-url": 
"https://www.sstaffs.gov.uk/planning/conservation-and-heritage/south-staffordshires-conservation-areas", + "endpoint-url": "https://www.sstaffs.gov.uk/sites/default/files/2024-11/South Staffs Conservation Area document dataset_1.csv", + "start-date": "", + "pipelines": "conservation-area", + "plugin": "", + "licence": "ogl3", + } + + tmp_input_path = create_input_csv(no_error_input_data) + + with pytest.raises(Exception): + validate_and_add_data_input( + tmp_input_path, + collection_name, + collection_dir, + specification_dir, + organisation_csv, + ) + assert "endpoint already exists" in capsys.readouterr().out diff --git a/tests/integration/test_add_data_utils.py b/tests/integration/test_add_data_utils.py new file mode 100644 index 00000000..64f87ee5 --- /dev/null +++ b/tests/integration/test_add_data_utils.py @@ -0,0 +1,19 @@ +from datetime import datetime +import os + +from digital_land.utils.add_data_utils import clear_log + + +def test_clear_logs(tmp_path_factory): + today = datetime.utcnow().isoformat()[:10] + endpoint = "endpoint" + collection_dir = tmp_path_factory.mktemp("random_collection") + + file_path = os.path.join(collection_dir, "log", today, f"{endpoint}.json") + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w") as f: + f.write("hello") + + clear_log(collection_dir, endpoint) + + assert not os.path.isfile(file_path) diff --git a/tests/unit/test_add_data_utils.py b/tests/unit/test_add_data_utils.py new file mode 100644 index 00000000..a0752e79 --- /dev/null +++ b/tests/unit/test_add_data_utils.py @@ -0,0 +1,70 @@ +import pytest + +from digital_land.commands import is_url_valid +from digital_land.utils.add_data_utils import is_date_valid + + +def test_is_url_valid(): + isValid, error = is_url_valid("https://www.google.com", "URL") + + assert isValid + assert error == "" + + +@pytest.mark.parametrize( + "url, error_message", + [ + ( + "", + "The URL must be populated", + ), + ( + "www.google.com", + "The URL must start with 'http://' or 'https://'", + ), + ( + "https:///query=?a=1&b=2", + "The URL must have a domain", + ), + ( + "https://google", + "The URL must have a valid domain with a top-level domain (e.g., '.gov.uk', '.com')", + ), + ], +) +def test_is_url_valid_error(url, error_message): + isValid, error = is_url_valid(url, "URL") + + assert not isValid + assert error == error_message + + +def test_is_date_valid(): + isValid, error = is_date_valid("2000-12-25", "date") + + assert isValid + assert error == "" + + +@pytest.mark.parametrize( + "date, error_message", + [ + ( + "", + "Date is blank", + ), + ( + "25-12-2000", + "date 25-12-2000 must be format YYYY-MM-DD", + ), + ( + "9999-12-25", + "The date 9999-12-25 cannot be in the future", + ), + ], +) +def test_is_date_valid_error(date, error_message): + isValid, error = is_date_valid(date, "date") + + assert not isValid + assert error == error_message