From 4e6cd8170cf7532e0f0cf64fc4d793e89df845b8 Mon Sep 17 00:00:00 2001
From: Christopher Johns
Date: Tue, 18 Jun 2024 11:37:01 +0100
Subject: [PATCH] Removed old scripts.

---
 bin/check_organisation_csv.py  | 129 ---------------------------------
 bin/create_organisation_csv.py |  49 -------------
 2 files changed, 178 deletions(-)
 delete mode 100755 bin/check_organisation_csv.py
 delete mode 100644 bin/create_organisation_csv.py

diff --git a/bin/check_organisation_csv.py b/bin/check_organisation_csv.py
deleted file mode 100755
index 0d2a049e..00000000
--- a/bin/check_organisation_csv.py
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env python3
-
-# check the integrity of the organisation.csv file
-
-import sys
-import click
-import re
-import csv
-from digital_land.specification import Specification
-
-
-organisations = {}
-specification = Specification()
-lpas = {}
-entities = {}
-wikidatas = {}
-bas = {}
-odcs = {}
-curies = {}
-issues = []
-
-
-def load_lpas(path):
-    for row in csv.DictReader(open(path)):
-        lpas[row["reference"]] = row
-
-
-def load(path):
-    for row in csv.DictReader(open(path)):
-        curie = row.get("organisation", "")
-        if not curie:
-            curie = f'{row["prefix"]}:{row["reference"]}'
-        organisations.setdefault(curie, {})
-        for field, value in row.items():
-            if value:
-                organisations[curie][field] = value
-
-
-def log_issue(severity, row, issue, field="", value=""):
-    line = {
-        "datapackage": "organisation",
-        "entity": row["entity"],
-        "prefix": row["prefix"],
-        "reference": row["reference"],
-        "severity": severity,
-        "issue": issue,
-        "field": field,
-        "value": value,
-    }
-    if severity in ["critical", "error"]:
-        print(f'{line["severity"]} {line["prefix"]}:{line["reference"]} {issue} {field} {value}', file=sys.stderr)
-    issues.append(line)
-
-
-def save_issues(path):
-    fieldnames = ["datapackage", "severity", "entity", "prefix", "reference", "issue", "field", "value"]
-    w = csv.DictWriter(open(path, "w"), fieldnames=fieldnames, extrasaction="ignore")
-    w.writeheader()
-    for row in issues:
-        w.writerow(row)
-
-
-def check():
-    for organisation, row in organisations.items():
-
-        # look for duplicate entities
-        if row["entity"] in entities:
-            log_issue("error", row, "duplicate entity")
-        else:
-            entities[row["entity"]] = organisation
-
-        # check wikidata
-        wikidata = row.get("wikidata", "")
-        if wikidata and wikidata in wikidatas:
-            severity = "warning" if row["entity"] in ["600001"] else "error"
-            log_issue(severity, row, "duplicate value", field="wikidata", value=row["wikidata"])
-        else:
-            wikidatas[row["wikidata"]] = organisation
-
-        # check LPA value against dataset
-        lpa = row.get("local-planning-authority", "")
-        if not lpa:
-            if (row["dataset"] in ["local-authority", "national-park-authority"]) and (
-                row.get("local-authority-type", "") not in ["CTY", "COMB", "SRA"]
-            ):
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="local-planning-authority")
-        elif lpa not in lpas:
-            log_issue("error", row, "unknown", field="local-planning-authority", value=lpa)
-        else:
-            lpas[lpa]["organisation"] = organisation
-
-        # check billing-authority
-        ba = row.get("billing-authority", "")
-        if not ba:
-            if row["dataset"] not in ["government-organisation"]:
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="billing-authority")
-        elif ba in bas:
-            log_issue("error", row, "duplicate value", field="billing-authority", value=row["billing-authority"])
-        else:
-            bas[row["billing-authority"]] = organisation
-
-        # check opendatacommunities-uri
-        odc = row.get("opendatacommunities-uri", "")
-        if not odc:
-            if row["dataset"] not in ["government-organisation"]:
-                severity = "warning" if row.get("end-date", "") else "error"
-                log_issue(severity, row, "missing", field="opendatacommunities-uri")
-        elif odc in odcs:
-            log_issue("error", row, "duplicate value", field="opendatacommunities-uri", value=row["opendatacommunities-uri"])
-        else:
-            odcs[row["opendatacommunities-uri"]] = organisation
-
-
-
-@click.command()
-@click.option(
-    "--output-path", type=click.Path(), default="dataset/organisation-check.csv"
-)
-def cli(output_path):
-    load_lpas("var/cache/local-planning-authority.csv")
-    load("dataset/organisation.csv")
-    check()
-    save_issues(output_path)
-
-
-if __name__ == "__main__":
-    cli()
diff --git a/bin/create_organisation_csv.py b/bin/create_organisation_csv.py
deleted file mode 100644
index 6ab32c3f..00000000
--- a/bin/create_organisation_csv.py
+++ /dev/null
@@ -1,49 +0,0 @@
-import click
-from os import listdir
-from pathlib import Path
-import csv
-
-from digital_land.specification import Specification
-
-@click.command()
-@click.option("--flattened-dir", type=click.Path(exists=True), default="flattened/")
-@click.option(
-    "--specification-dir", type=click.Path(exists=True), default="specification/"
-)
-@click.option("--output-path", type=click.Path(), default="dataset/organisation.csv")
-def create_org_csv_cli(flattened_dir, specification_dir,output_path):
-    specification = Specification(path=specification_dir)
-
-    # get field names
-    org_field_names = specification.schema_field['organisation']
-
-
-    # get file list
-    filenames = listdir(flattened_dir)
-    filenames = [ filename for filename in filenames if filename.endswith('.csv') ]
-
-    orgs = []
-    for file in filenames:
-        filepath = Path(flattened_dir) / file
-        with open(filepath, newline="") as f:
-            for row in csv.DictReader(f):
-                # hack to replace "_" with "-" in fieldnames
-                if row['typology'] == 'organisation':
-                    row = { k.replace("_", "-"): v for k, v in row.items() }
-                    if not row.get('organisation',None):
-                        row['organisation'] = row['dataset'] + ':' + row['reference']
-                    org = {k:v for k,v in row.items() if k in org_field_names}
-                    orgs.append(org)
-
-    # write list of dicts
-    output_path = Path(output_path)
-    with open(output_path, "w", newline="") as f:
-        w = csv.DictWriter(f, fieldnames=org_field_names, extrasaction='ignore')
-        w.writeheader()
-        w.writerows(orgs)
-
-    return
-
-
-if __name__ == "__main__":
-    create_org_csv_cli()