From c96644bc58ff7fad3de38fede4c007f8d3eff4d4 Mon Sep 17 00:00:00 2001
From: Chris Johns <95475146+cjohns-scottlogic@users.noreply.github.com>
Date: Tue, 18 Jun 2024 14:43:34 +0100
Subject: [PATCH] Split collection and package building

Use a separate job to build the package (organisation.csv file) after the collection.
---
 .github/workflows/build_package.yml | 109 +++++++++++++++++++++++
 .github/workflows/run.yml           |   5 +-
 Makefile                            |  23 +----
 bin/check_organisation_csv.py       | 129 ----------------------------
 bin/create_organisation_csv.py      |  49 -----------
 5 files changed, 117 insertions(+), 198 deletions(-)
 create mode 100644 .github/workflows/build_package.yml
 delete mode 100755 bin/check_organisation_csv.py
 delete mode 100644 bin/create_organisation_csv.py

diff --git a/.github/workflows/build_package.yml b/.github/workflows/build_package.yml
new file mode 100644
index 00000000..909ec804
--- /dev/null
+++ b/.github/workflows/build_package.yml
@@ -0,0 +1,109 @@
+name: Build package
+on:
+  workflow_call:
+    secrets:
+      DLB_BOT_EMAIL:
+        required: true
+      DLB_BOT_TOKEN:
+        required: true
+      DLB_BOT_USERNAME:
+        required: true
+      AWS_S3_ACCESS_KEY_ID:
+        required: true
+      AWS_S3_SECRET_ACCESS_KEY:
+        required: true
+env:
+  DLB_BOT_EMAIL: ${{ secrets.DLB_BOT_EMAIL }}
+  DLB_BOT_TOKEN: ${{ secrets.DLB_BOT_TOKEN }}
+  DLB_BOT_USERNAME: ${{ secrets.DLB_BOT_USERNAME }}
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+
+      - name: Free up disk space
+        run: |
+          df -h
+          sudo rm -rf /usr/share/dotnet
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          echo
+          df -h
+
+      - uses: actions/checkout@v3
+        with:
+          lfs: true
+
+      - uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+
+      - name: Configure git
+        run: |
+          git config user.email "${DLB_BOT_EMAIL}"
+          git config user.name "${DLB_BOT_USERNAME}"
+          git remote set-url origin https://${DLB_BOT_USERNAME}:${DLB_BOT_TOKEN}@github.com/${GITHUB_REPOSITORY}.git
+          git checkout ${GITHUB_REF_NAME}
+
+      - name: Update makerules
+        run: make makerules
+
+      - name: Install dependencies
+        run: make init
+
+      - name: Create the package
+        run: make package
+
+      - name: Configure Development AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          aws-access-key-id: ${{secrets.DEVELOPMENT_AWS_ACCESS_KEY_ID}}
+          aws-secret-access-key: ${{secrets.DEVELOPMENT_AWS_ACCESS_SECRET}}
+          aws-region: eu-west-2
+
+      - name: Save datasets to Development S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-dataset
+
+      - name: Save expectations to Development S3
+        if: always()
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.DEVELOPMENT_DATA_S3_BUCKET}}
+        run: make save-expectations
+
+      - name: Configure Staging AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          aws-access-key-id: ${{secrets.STAGING_AWS_ACCESS_KEY_ID}}
+          aws-secret-access-key: ${{secrets.STAGING_AWS_ACCESS_SECRET}}
+          aws-region: eu-west-2
+
+      - name: Save datasets to Staging S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+        run: make save-dataset
+
+      - name: Save expectations to Staging S3
+        if: always()
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.STAGING_DATA_S3_BUCKET}}
+        run: make save-expectations
+
+      - name: Configure Production AWS Credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@v1-node16
+        with:
+          aws-access-key-id: ${{secrets.PROD_AWS_ACCESS_KEY_ID}}
+          aws-secret-access-key: ${{secrets.PROD_AWS_ACCESS_SECRET}}
+          aws-region: eu-west-2
+
+      - name: Save datasets to Prod S3
+        env:
+          COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+          HOISTED_COLLECTION_DATASET_BUCKET_NAME: ${{secrets.PRODUCTION_DATA_S3_BUCKET}}
+        run: make save-dataset
diff --git a/.github/workflows/run.yml b/.github/workflows/run.yml
index 9a548297..7b63f701 100644
--- a/.github/workflows/run.yml
+++ b/.github/workflows/run.yml
@@ -7,4 +7,7 @@ jobs:
   call-workflow:
     uses: digital-land/collection-template/.github/workflows/callable_run.yml@main
     secrets: inherit
-
+  build-package:
+    needs: call-workflow
+    uses: ./.github/workflows/build_package.yml
+    secrets: inherit
diff --git a/Makefile b/Makefile
index 751f73c9..56438877 100644
--- a/Makefile
+++ b/Makefile
@@ -3,29 +3,14 @@ include makerules/development.mk
 include makerules/collection.mk
 include makerules/pipeline.mk
 
-#
-# Combine the individual organisation datasets into a single organisation.csv
-# TBD: make from a specification datapackage definition
-#
-ORGANISATION_DATASETS=\
-	$(DEVELOPMENT_CORPORATION_DATASET)\
-	$(GOVERNMENT_ORGANISATION_DATASET)\
-	$(LOCAL_AUTHORITY_DATASET)\
-	$(NATIONAL_PARK_AUTHORITY_DATASET)\
-	$(NONPROFIT_DATASET)\
-	$(PASSENGER_TRANSPORT_EXECUTIVE_DATASET)\
-	$(PUBLIC_AUTHORITY_DATASET)\
-	$(REGIONAL_PARK_AUTHORITY_DATASET)\
-	$(WASTE_AUTHORITY_DATASET)
+package:: dataset/organisation.csv dataset/organisation-check.csv
 
-dataset:: dataset/organisation.csv dataset/organisation-check.csv
-
-dataset/organisation.csv: $(ORGANISATION_DATASETS)
-	python3 bin/create_organisation_csv.py --output-path $@
+dataset/organisation.csv:
+	digital-land organisation-create --download-url="https://files.planning.data.gov.uk/organisation-collection/dataset" --output-path $@
 
 # check organisation datapackage
 dataset/organisation-check.csv: dataset/organisation.csv var/cache/local-planning-authority.csv
-	python3 bin/check_organisation_csv.py --output-path $@
+	digital-land organisation-check --output-path $@
 
 var/cache/local-planning-authority.csv:
 	@mkdir -p $(CACHE_DIR)
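For readers comparing the new Makefile rule with the deleted bin/create_organisation_csv.py further down, here is a minimal sketch of what the packaging step now amounts to: assemble organisation.csv from the published collection output instead of locally flattened files. The dataset list and the per-dataset file naming below are illustrative guesses, not the actual behaviour of `digital-land organisation-create`.

# Illustrative only: roughly what the new packaging step does, assuming the
# download URL serves one CSV per organisation dataset (e.g. local-authority.csv).
# This is NOT the digital-land CLI implementation.
import csv
import io
import urllib.request

DOWNLOAD_URL = "https://files.planning.data.gov.uk/organisation-collection/dataset"
DATASETS = ["local-authority", "government-organisation"]  # hypothetical subset

def build_organisation_csv(output_path="dataset/organisation.csv"):
    rows, fieldnames = [], []
    for dataset in DATASETS:
        with urllib.request.urlopen(f"{DOWNLOAD_URL}/{dataset}.csv") as response:
            reader = csv.DictReader(io.TextIOWrapper(response, encoding="utf-8"))
            for row in reader:
                # keep the union of fields so every dataset's columns survive
                for field in reader.fieldnames or []:
                    if field not in fieldnames:
                        fieldnames.append(field)
                rows.append(row)
    with open(output_path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)

if __name__ == "__main__":
    build_organisation_csv()

Building from the published collection output is what allows the package to be produced in a separate job that runs after the collection, as described in the commit message.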
diff --git a/bin/check_organisation_csv.py b/bin/check_organisation_csv.py
deleted file mode 100755
index 0d2a049e..00000000
--- a/bin/check_organisation_csv.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-
-# check the integrity of the organisation.csv file
-
-import sys
-import click
-import re
-import csv
-from digital_land.specification import Specification
-
-
-organisations = {}
-specification = Specification()
-lpas = {}
-entities = {}
-wikidatas = {}
-bas = {}
-odcs = {}
-curies = {}
-issues = []
-
-
-def load_lpas(path):
-    for row in csv.DictReader(open(path)):
-        lpas[row["reference"]] = row
-
-
-def load(path):
-    for row in csv.DictReader(open(path)):
-        curie = row.get("organisation", "")
-        if not curie:
-            curie = f'{row["prefix"]}:{row["reference"]}'
-        organisations.setdefault(curie, {})
-        for field, value in row.items():
-            if value:
-                organisations[curie][field] = value
-
-
-def log_issue(severity, row, issue, field="", value=""):
-    line = {
-        "datapackage": "organisation",
-        "entity": row["entity"],
-        "prefix": row["prefix"],
-        "reference": row["reference"],
-        "severity": severity,
-        "issue": issue,
-        "field": field,
-        "value": value,
-    }
-    if severity in ["critical", "error"]:
-        print(f'{line["severity"]} {line["prefix"]}:{line["reference"]} {issue} {field} {value}', file=sys.stderr)
-
-    issues.append(line)
-
-
-def save_issues(path):
-    fieldnames = ["datapackage", "severity", "entity", "prefix", "reference", "issue", "field", "value"]
-    w = csv.DictWriter(open(path, "w"), fieldnames=fieldnames, extrasaction="ignore")
-    w.writeheader()
-    for row in issues:
-        w.writerow(row)
-
-
-def check():
-    for organisation, row in organisations.items():
-
-        # look for duplicate entities
-        if row["entity"] in entities:
-            log_issue("error", row, "duplicate entity")
-        else:
-            entities[row["entity"]] = organisation
-
-        # check wikidata
-        wikidata = row.get("wikidata", "")
-        if wikidata and wikidata in wikidatas:
-            severity = "warning" if row["entity"] in ["600001"] else "error"
-            log_issue(severity, row, "duplicate value", field="wikidata", value=row["wikidata"])
-        else:
-            wikidatas[row["wikidata"]] = organisation
-
-        # check LPA value against dataset
-        lpa = row.get("local-planning-authority", "")
-        if not lpa:
-            if (row["dataset"] in ["local-authority", "national-park-authority"]) and (
-                row.get("local-authority-type", "") not in ["CTY", "COMB", "SRA"]
-            ):
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="local-planning-authority")
-        elif lpa not in lpas:
-            log_issue("error", row, "unknown", field="local-planning-authority", value=lpa)
-        else:
-            lpas[lpa]["organisation"] = organisation
-
-        # check billing-authority
-        ba = row.get("billing-authority", "")
-        if not ba:
-            if row["dataset"] not in ["government-organisation"]:
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="billing-authority")
-        elif ba in bas:
-            log_issue("error", row, "duplicate value", field="billing-authority", value=row["billing-authority"])
-        else:
-            bas[row["billing-authority"]] = organisation
-
-        # check opendatacommunities-uri
-        odc = row.get("opendatacommunities-uri", "")
-        if not odc:
-            if row["dataset"] not in ["government-organisation"]:
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="opendatacommunities-uri")
-        elif odc in odcs:
-            log_issue("error", row, "duplicate value", field="opendatacommunities-uri", value=row["opendatacommunities-uri"])
-        else:
-            odcs[row["opendatacommunities-uri"]] = organisation
-
-
-
-@click.command()
-@click.option(
-    "--output-path", type=click.Path(), default="dataset/organisation-check.csv"
-)
-def cli(output_path):
-    load_lpas("var/cache/local-planning-authority.csv")
-    load("dataset/organisation.csv")
-    check()
-    save_issues(output_path)
-
-
-if __name__ == "__main__":
-    cli()
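The checks in the deleted script above are now expected to run via `digital-land organisation-check`, which the Makefile writes to dataset/organisation-check.csv. Assuming the CLI keeps the same report columns the old script wrote (datapackage, severity, entity, prefix, reference, issue, field, value — an assumption, not confirmed by this patch), a small sketch of how a build could be failed when that report contains errors:

# Illustrative only: fail a build if the organisation-check report contains
# critical or error rows; column names assumed from the deleted script.
import csv
import sys

def fail_on_errors(path="dataset/organisation-check.csv"):
    errors = []
    with open(path, newline="") as f:
        for row in csv.DictReader(f):
            if row.get("severity") in ("critical", "error"):
                errors.append(row)
    for row in errors:
        print(f'{row["severity"]} {row["prefix"]}:{row["reference"]} {row["issue"]} {row["field"]} {row["value"]}', file=sys.stderr)
    return 1 if errors else 0

if __name__ == "__main__":
    sys.exit(fail_on_errors())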
diff --git a/bin/create_organisation_csv.py b/bin/create_organisation_csv.py
deleted file mode 100644
index 6ab32c3f..00000000
--- a/bin/create_organisation_csv.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import click
-from os import listdir
-from pathlib import Path
-import csv
-
-from digital_land.specification import Specification
-
-@click.command()
-@click.option("--flattened-dir", type=click.Path(exists=True), default="flattened/")
-@click.option(
-    "--specification-dir", type=click.Path(exists=True), default="specification/"
-)
-@click.option("--output-path", type=click.Path(), default="dataset/organisation.csv")
-def create_org_csv_cli(flattened_dir, specification_dir, output_path):
-    specification = Specification(path=specification_dir)
-
-    # get field names
-    org_field_names = specification.schema_field['organisation']
-
-    # get file list
-    filenames = listdir(flattened_dir)
-    filenames = [filename for filename in filenames if filename.endswith('.csv')]
-
-    orgs = []
-    for file in filenames:
-        filepath = Path(flattened_dir) / file
-        with open(filepath, newline="") as f:
-            for row in csv.DictReader(f):
-                # hack to replace "_" with "-" in fieldnames
-                if row['typology'] == 'organisation':
-                    row = {k.replace("_", "-"): v for k, v in row.items()}
-                    if not row.get('organisation', None):
-                        row['organisation'] = row['dataset'] + ':' + row['reference']
-                    org = {k: v for k, v in row.items() if k in org_field_names}
-                    orgs.append(org)
-
-    # write list of dicts
-    output_path = Path(output_path)
-    with open(output_path, "w", newline="") as f:
-        w = csv.DictWriter(f, fieldnames=org_field_names, extrasaction='ignore')
-        w.writeheader()
-        w.writerows(orgs)
-
-    return
-
-
-if __name__ == "__main__":
-    create_org_csv_cli()
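After `make package`, one possible local sanity check is to compare the header of the packaged file against the organisation schema, reusing the same Specification lookup the deleted bin/create_organisation_csv.py relied on. The specification/ path and the idea of checking the header are illustrative, not part of this change.

# Illustrative only: confirm dataset/organisation.csv still carries the fields
# the specification expects; Specification usage mirrors the deleted script.
import csv
from digital_land.specification import Specification

def missing_fields(csv_path="dataset/organisation.csv", specification_dir="specification/"):
    specification = Specification(path=specification_dir)
    expected = specification.schema_field["organisation"]
    with open(csv_path, newline="") as f:
        header = next(csv.reader(f))
    return [field for field in expected if field not in header]

if __name__ == "__main__":
    print(missing_fields() or "organisation.csv has all expected fields")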