From c96644bc58ff7fad3de38fede4c007f8d3eff4d4 Mon Sep 17 00:00:00 2001
From: Chris Johns <95475146+cjohns-scottlogic@users.noreply.github.com>
Date: Tue, 18 Jun 2024 14:43:34 +0100
Subject: [PATCH] Split collection and package building

Use a separate job to build the package (organisation.csv file) after the collection.
---
 .github/workflows/build_package.yml | 109 +++++++++++++++++++++++
 .github/workflows/run.yml           |   5 +-
 Makefile                            |  23 +----
 bin/check_organisation_csv.py       | 129 ----------------------------
 bin/create_organisation_csv.py      |  49 -----------
 5 files changed, 117 insertions(+), 198 deletions(-)
 create mode 100644 .github/workflows/build_package.yml
 delete mode 100755 bin/check_organisation_csv.py
 delete mode 100644 bin/create_organisation_csv.py

diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
new file mode 100644
index 00000000..909ec804
--- /dev/null
+++ b/.github/workflows/build_package.yml
@@ -0,0 +1,109 @@
+name: Build package
+on:
+  workflow_call:
+    secrets:
+      DLB_BOT_EMAIL:
+        required: true
+      DLB_BOT_TOKEN:
+        required: true
+      DLB_BOT_USERNAME:
+        required: true
+      AWS_S3_ACCESS_KEY_ID:
+        required: true
+      AWS_S3_SECRET_ACCESS_KEY:
+        required: true
+env:
+  DLB_BOT_EMAIL: ${{ secrets.DLB_BOT_EMAIL }}
+  DLB_BOT_TOKEN: ${{ secrets.DLB_BOT_TOKEN }}
+  DLB_BOT_USERNAME: ${{ secrets.DLB_BOT_USERNAME }}
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+
+      - name: Free up disk space
+        run: |
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          echo
+          df -h
+
+      - uses: actions/checkout@v3
+        with:
+          lfs: true
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+
+      - name: Configure git
+        run: |
+          git config user.email "${DLB_BOT_EMAIL}"
+          git config user.name "${DLB_BOT_USERNAME}"
+          git remote set-url origin https://${DLB_BOT_USERNAME}:${DLB_BOT_TOKEN}@github.com/${GITHUB_REPOSITORY}.git
+          git checkout ${GITHUB_REF_NAME}
+
+      - name: Update makerules
+        run: make makerules
+
+      - name: Install dependencies
+        run: make init
+
+      - name: Create the package
+        run: make package
+
+      - name: Configure Development AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
+          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
+          aws-region: eu-west-2
+
+      - name: Save datasets to Development S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-dataset
+
+      - name: Save expectations to Development S3
+        if: always()
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-expectations
+
+      - name: Configure Staging AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
+          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
+          aws-region: eu-west-2
+
+      - name: Save datasets to Staging S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+        run: make save-dataset
+
+      - name: Save expectations to Staging S3
+        if: always()
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+        run: make save-expectations
+
+      - name: Configure Production AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
+          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
+          aws-region: eu-west-2
+
+      - name: Save datasets to Prod S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+        run: make save-dataset
diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
index 9a548297..7b63f701 100644
--- a/.github/workflows/run.yml
+++ b/.github/workflows/run.yml
@@ -7,4 +7,7 @@ jobs:
   call-workflow:
     uses: digital-land/collection-template/.github/workflows/callable_run.yml@main
     secrets: inherit
-
+  build-package:
+    needs: call-workflow
+    uses: ./.github/workflows/build_package.yml
+    secrets: inherit
diff --git a/Makefile b/Makefile
index 751f73c9..56438877 100644
--- a/Makefile
+++ b/Makefile
@@ -3,29 +3,14 @@ include makerules/development.mk
 include makerules/collection.mk
 include makerules/pipeline.mk
 
-#
-# Combine the individual organisation datasets into a single organisation.csv
-# TBD: make from a specification datapackage definition
-#
-ORGANISATION_DATASETS=\
-	$(DEVELOPMENT_CORPORATION_DATASET)\
-	$(GOVERNMENT_ORGANISATION_DATASET)\
-	$(LOCAL_AUTHORITY_DATASET)\
-	$(NATIONAL_PARK_AUTHORITY_DATASET)\
-	$(NONPROFIT_DATASET)\
-	$(PASSENGER_TRANSPORT_EXECUTIVE_DATASET)\
-	$(PUBLIC_AUTHORITY_DATASET)\
-	$(REGIONAL_PARK_AUTHORITY_DATASET)\
-	$(WASTE_AUTHORITY_DATASET)
+package:: dataset/organisation.csv dataset/organisation-check.csv
 
-dataset:: dataset/organisation.csv dataset/organisation-check.csv
-
-dataset/organisation.csv: $(ORGANISATION_DATASETS)
-	python3 bin/create_organisation_csv.py --output-path $@
+dataset/organisation.csv:
+	digital-land organisation-create --download-url="https://files.planning.data.gov.uk/organisation-collection/dataset" --output-path $@
 
 # check organisation datapackage
 dataset/organisation-check.csv: dataset/organisation.csv var/cache/local-planning-authority.csv
-	python3 bin/check_organisation_csv.py --output-path $@
+	digital-land organisation-check --output-path $@
 
 var/cache/local-planning-authority.csv:
 	@mkdir -p $(CACHE_DIR)
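For readers comparing the new Makefile rule with the deleted bin/create_organisation_csv.py further down, here is a minimal sketch of what the packaging step now amounts to: assemble organisation.csv from the published collection output instead of locally flattened files. The dataset list and the per-dataset file naming below are illustrative guesses, not the actual behaviour of `digital-land organisation-create`.

# Illustrative only: roughly what the new packaging step does, assuming the
# download URL serves one CSV per organisation dataset (e.g. local-authority.csv).
# This is NOT the digital-land CLI implementation.
import csv
import io
import urllib.request

DOWNLOAD_URL = "https://files.planning.data.gov.uk/organisation-collection/dataset"
DATASETS = ["local-authority", "government-organisation"]  # hypothetical subset

def build_organisation_csv(output_path="dataset/organisation.csv"):
    rows, fieldnames = [], []
    for dataset in DATASETS:
        with urllib.request.urlopen(f"{DOWNLOAD_URL}/{dataset}.csv") as response:
            reader = csv.DictReader(io.TextIOWrapper(response, encoding="utf-8"))
            for row in reader:
                # keep the union of fields so every dataset's columns survive
                for field in reader.fieldnames or []:
                    if field not in fieldnames:
                        fieldnames.append(field)
                rows.append(row)
    with open(output_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)

if __name__ == "__main__":
    build_organisation_csv()

Building from the published collection output is what allows the package to be produced in a separate job that runs after the collection, as described in the commit message.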
diff --git a/bin/check_organisation_csv.py b/bin/check_organisation_csv.py
deleted file mode 100755
index 0d2a049e..00000000
--- a/bin/check_organisation_csv.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-
-# check the integrity of the organisation.csv file
-
-import sys
-import click
-import re
-import csv
-from digital_land.specification import Specification
-
-
-organisations = {}
-specification = Specification()
-lpas = {}
-entities = {}
-wikidatas = {}
-bas = {}
-odcs = {}
-curies = {}
-issues = []
-
-
-def load_lpas(path):
-    for row in csv.DictReader(open(path)):
-        lpas[row["reference"]] = row
-
-
-def load(path):
-    for row in csv.DictReader(open(path)):
-        curie = row.get("organisation", "")
-        if not curie:
-            curie = f'{row["prefix"]}:{row["reference"]}'
-        organisations.setdefault(curie, {})
-        for field, value in row.items():
-            if value:
-                organisations[curie][field] = value
-
-
-def log_issue(severity, row, issue, field="", value=""):
-    line = {
-        "datapackage": "organisation",
-        "entity": row["entity"],
-        "prefix": row["prefix"],
-        "reference": row["reference"],
-        "severity": severity,
-        "issue": issue,
-        "field": field,
-        "value": value,
-    }
-    if severity in ["critical", "error"]:
-        print(f'{line["severity"]} {line["prefix"]}:{line["reference"]} {issue} {field} {value}', file=sys.stderr)
-
-    issues.append(line)
-
-
-def save_issues(path):
-    fieldnames = ["datapackage", "severity", "entity", "prefix", "reference", "issue", "field", "value"]
-    w = csv.DictWriter(open(path, "w"), fieldnames=fieldnames, extrasaction="ignore")
-    w.writeheader()
-    for row in issues:
-        w.writerow(row)
-
-
-def check():
-    for organisation, row in organisations.items():
-
-        # look for duplicate entities
-        if row["entity"] in entities:
-            log_issue("error", row, "duplicate entity")
-        else:
-            entities[row["entity"]] = organisation
-
-        # check wikidata
-        wikidata = row.get("wikidata", "")
-        if wikidata and wikidata in wikidatas:
-            severity = "warning" if row["entity"] in ["600001"] else "error"
-            log_issue(severity, row, "duplicate value", field="wikidata", value=row["wikidata"])
-        else:
-            wikidatas[row["wikidata"]] = organisation
-
-        # check LPA value against dataset
-        lpa = row.get("local-planning-authority", "")
-        if not lpa:
-            if (row["dataset"] in ["local-authority", "national-park-authority"]) and (
-                row.get("local-authority-type", "") not in ["CTY", "COMB", "SRA"]
-            ):
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="local-planning-authority")
-        elif lpa not in lpas:
-            log_issue("error", row, "unknown", field="local-planning-authority", value=lpa)
-        else:
-            lpas[lpa]["organisation"] = organisation
-
-        # check billing-authority
-        ba = row.get("billing-authority", "")
-        if not ba:
-            if row["dataset"] not in ["government-organisation"]:
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="billing-authority")
-        elif ba in bas:
-            log_issue("error", row, "duplicate value", field="billing-authority", value=row["billing-authority"])
-        else:
-            bas[row["billing-authority"]] = organisation
-
-        # check opendatacommunities-uri
-        odc = row.get("opendatacommunities-uri", "")
-        if not odc:
-            if row["dataset"] not in ["government-organisation"]:
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="opendatacommunities-uri")
-        elif odc in odcs:
-            log_issue("error", row, "duplicate value", field="opendatacommunities-uri", value=row["opendatacommunities-uri"])
-        else:
-            odcs[row["opendatacommunities-uri"]] = organisation
-
-
-
-@click.command()
-@click.option(
-    "--output-path", type=click.Path(), default="dataset/organisation-check.csv"
-)
-def cli(output_path):
-    load_lpas("var/cache/local-planning-authority.csv")
-    load("dataset/organisation.csv")
-    check()
-    save_issues(output_path)
-
-
-if __name__ == "__main__":
-    cli()
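The checks in the deleted script above are now expected to run via `digital-land organisation-check`, which the Makefile writes to dataset/organisation-check.csv. Assuming the CLI keeps the same report columns the old script wrote (datapackage, severity, entity, prefix, reference, issue, field, value — an assumption, not confirmed by this patch), a small sketch of how a build could be failed when that report contains errors:

# Illustrative only: fail a build if the organisation-check report contains
# critical or error rows; column names assumed from the deleted script.
import csv
import sys

def fail_on_errors(path="dataset/organisation-check.csv"):
    errors = []
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row.get("severity") in ("critical", "error"):
                errors.append(row)
    for row in errors:
        print(f'{row["severity"]} {row["prefix"]}:{row["reference"]} {row["issue"]} {row["field"]} {row["value"]}', file=sys.stderr)
    return 1 if errors else 0

if __name__ == "__main__":
    sys.exit(fail_on_errors())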
diff --git a/bin/create_organisation_csv.py b/bin/create_organisation_csv.py
deleted file mode 100644
index 6ab32c3f..00000000
--- a/bin/create_organisation_csv.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import click
-from os import listdir
-from pathlib import Path
-import csv
-
-from digital_land.specification import Specification
-
-@click.command()
-@click.option("--flattened-dir", type=click.Path(exists=True), default="flattened/")
-@click.option(
-    "--specification-dir", type=click.Path(exists=True), default="specification/"
-)
-@click.option("--output-path", type=click.Path(), default="dataset/organisation.csv")
-def create_org_csv_cli(flattened_dir, specification_dir, output_path):
-    specification = Specification(path=specification_dir)
-
-    # get field names
-    org_field_names = specification.schema_field['organisation']
-
-    # get file list
-    filenames = listdir(flattened_dir)
-    filenames = [filename for filename in filenames if filename.endswith('.csv')]
-
-    orgs = []
-    for file in filenames:
-        filepath = Path(flattened_dir) / file
-        with open(filepath, newline="") as f:
-            for row in csv.DictReader(f):
-                # hack to replace "_" with "-" in fieldnames
-                if row['typology'] == 'organisation':
-                    row = {k.replace("_", "-"): v for k, v in row.items()}
-                    if not row.get('organisation', None):
-                        row['organisation'] = row['dataset'] + ':' + row['reference']
-                    org = {k: v for k, v in row.items() if k in org_field_names}
-                    orgs.append(org)
-
-    # write list of dicts
-    output_path = Path(output_path)
-    with open(output_path, "w", newline="") as f:
-        w = csv.DictWriter(f, fieldnames=org_field_names, extrasaction='ignore')
-        w.writeheader()
-        w.writerows(orgs)
-
-    return
-
-
-if __name__ == "__main__":
-    create_org_csv_cli()
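After `make package`, one possible local sanity check is to compare the header of the packaged file against the organisation schema, reusing the same Specification lookup the deleted bin/create_organisation_csv.py relied on. The specification/ path and the idea of checking the header are illustrative, not part of this change.

# Illustrative only: confirm dataset/organisation.csv still carries the fields
# the specification expects; Specification usage mirrors the deleted script.
import csv
from digital_land.specification import Specification

def missing_fields(csv_path="dataset/organisation.csv", specification_dir="specification/"):
    specification = Specification(path=specification_dir)
    expected = specification.schema_field["organisation"]
    with open(csv_path, newline="") as f:
        header = next(csv.reader(f))
    return [field for field in expected if field not in header]

if __name__ == "__main__":
    print(missing_fields() or "organisation.csv has all expected fields")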