From 3b18ebbc81808e382d5bbab5f3c769001a9d63ca Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Mon, 15 May 2023 14:05:05 +0100 Subject: [PATCH 01/12] data pools tester --- tests/data_pools_checks/.gitignore | 3 + tests/data_pools_checks/caching.py | 28 + tests/data_pools_checks/examples_data.py | 175 +++++ .../run_data_pools_checks.py | 716 ++++++++++++++++++ 4 files changed, 922 insertions(+) create mode 100644 tests/data_pools_checks/.gitignore create mode 100644 tests/data_pools_checks/caching.py create mode 100644 tests/data_pools_checks/examples_data.py create mode 100644 tests/data_pools_checks/run_data_pools_checks.py diff --git a/tests/data_pools_checks/.gitignore b/tests/data_pools_checks/.gitignore new file mode 100644 index 0000000..e6d2a88 --- /dev/null +++ b/tests/data_pools_checks/.gitignore @@ -0,0 +1,3 @@ +*~ +typescript* +*.nc diff --git a/tests/data_pools_checks/caching.py b/tests/data_pools_checks/caching.py new file mode 100644 index 0000000..a826a4c --- /dev/null +++ b/tests/data_pools_checks/caching.py @@ -0,0 +1,28 @@ +import os +import hashlib + + +def cached_output_fn(collection, params, odir=None): + if odir == None: + odir = '/tmp/roocs' ## FIXME + if not os.path.isdir(odir): + os.mkdir(odir) + vals = [collection] + if 'area' in params: + vals.append('area') + vals.extend(params['area']) + if 'time' in params: + vals.append('time') + vals.extend(list(params['time'].value)) + if 'level' in params: + vals.append('level') + vals.extend(list(float(lev) for lev in params['level'].value)) + vals = tuple(vals) + h = hashlib.sha1(str(vals).encode()).hexdigest() + #print(f'CACHE FOR: {vals} => {h}') + return f'{odir}/cache-{h}.nc' + + +class CachedResult: + def __init__(self, path): + self.file_uris = [path] diff --git a/tests/data_pools_checks/examples_data.py b/tests/data_pools_checks/examples_data.py new file mode 100644 index 0000000..ce0ac4b --- /dev/null +++ b/tests/data_pools_checks/examples_data.py @@ -0,0 +1,175 @@ +# an example with 3 files, including this for the first file: +# +# time = UNLIMITED ; // (588 currently) +# bnds = 2 ; +# plev = 19 ; +# lat = 144 ; +# lon = 192 ; +# +# float ta(time, plev, lat, lon) +# +example_4d = 'CMIP6.CDRMIP.MOHC.UKESM1-0-LL.esm-ssp585ext.r1i1p1f2.Amon.ta.gn.v20211018' + +# an example with 5 files, including this for the first file: +# +# time = UNLIMITED ; // (120 currently) +# bnds = 2 ; +# lat = 144 ; +# lon = 192 ; +# +# float ci(time, lat, lon) ; +# +example_3d = 'CMIP6.C4MIP.MOHC.UKESM1-0-LL.ssp534-over-bgc.r4i1p1f2.Amon.ci.gn.v20220708' + +examples_rot = ['CMIP6.CMIP.CMCC.CMCC-ESM2.historical.r1i1p1f1.Oyr.o2.gn.v20210114', + 'CMIP6.AerChemMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.hist-piAer.r1i1p1f1.Ofx.volcello.gn.v20190627'] + + +# One dataset from every model that exists at CEDA and has 2d lon / lat arrays +more_examples_rot = ''' +CMIP6.AerChemMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.hist-piAer.r1i1p1f1.Ofx.volcello.gn.v20190627 +CMIP6.AerChemMIP.NCC.NorESM2-LM.hist-piAer.r1i1p1f1.Omon.volcello.gn.v20191108 +CMIP6.CMIP.CCCma.CanESM5-CanOE.1pctCO2.r1i1p2f1.Ofx.sftof.gn.v20190429 +CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3-LR.piControl.r1i1p1f1.3hr.tos.gn.v20200919 +CMIP6.DAMIP.CAS.FGOALS-g3.hist-aer.r1i1p1f1.Omon.tos.gn.v20200427 +CMIP6.HighResMIP.CAS.FGOALS-f3-H.control-1950.r1i1p1f1.Oday.tos.gn.v20210120 +CMIP6.OMIP.CSIRO-COSIMA.ACCESS-OM2-025.omip2.r1i1p1f1.Oday.tos.gn.v20210617 +CMIP6.OMIP.CSIRO-COSIMA.ACCESS-OM2.omip2-spunup.r1i1p1f1.Oday.tos.gn.v20210616 +CMIP6.OMIP.NTU.TaiESM1-TIMCOM.omip1.r1i1p1f1.Ofx.deptho.gn.v20201028 
+CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3P-VHR.historical.r1i1p2f1.Amon.clt.gr.v20201007 +CMIP6.FAFMIP.NOAA-GFDL.GFDL-ESM2M.faf-all.r1i1p1f1.Omon.so.gn.v20180701 +CMIP6.OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f1.Omon.so.gn.v20180701 +'''.strip().split() + +# One dataset from every model that exists at CEDA and has 1d but non-coordinate lon / lat arrays +examples_unstructured = ''' +CMIP6.CMIP.MPI-M.ICON-ESM-LR.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20210215 +CMIP6.CMIP.UA.MCM-UA-1-0.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20190731 +CMIP6.HighResMIP.AWI.AWI-CM-1-1-HR.hist-1950.r1i1p1f2.3hr.tos.gn.v20170825 +CMIP6.HighResMIP.AWI.AWI-CM-1-1-LR.hist-1950.r1i1p1f2.3hr.tos.gn.v20170825 +CMIP6.HighResMIP.NCAR.CESM1-CAM5-SE-HR.control-1950.r1i1p1f1.Amon.ts.gn.v20200724 +CMIP6.HighResMIP.NCAR.CESM1-CAM5-SE-LR.control-1950.r1i1p1f1.Amon.ts.gn.v20200708 +'''.strip().split() + + +# One dataset from every model that exists at CEDA +all_examples = ''' +CMIP.CSIRO-ARCCSS.ACCESS-CM2.1pctCO2.r1i1p1f1.Amon.clt.gn.v20191109 +C4MIP.CSIRO.ACCESS-ESM1-5.esm-ssp585.r2i1p1f1.Amon.tas.gn.v20191203 +OMIP.CSIRO-COSIMA.ACCESS-OM2.omip2-spunup.r1i1p1f1.Oday.tos.gn.v20210616 +OMIP.CSIRO-COSIMA.ACCESS-OM2-025.omip2.r1i1p1f1.Oday.tos.gn.v20210617 +HighResMIP.AWI.AWI-CM-1-1-HR.hist-1950.r1i1p1f2.3hr.tos.gn.v20170825 +HighResMIP.AWI.AWI-CM-1-1-LR.hist-1950.r1i1p1f2.3hr.tos.gn.v20170825 +CMIP.AWI.AWI-CM-1-1-MR.1pctCO2.r1i1p1f1.3hr.tas.gn.v20181218 +CMIP.AWI.AWI-ESM-1-1-LR.1pctCO2.r1i1p1f1.3hr.tas.gn.v20200212 +HighResMIP.BCC.BCC-CSM2-HR.control-1950.r1i1p1f1.Amon.ts.gn.v20200922 +C4MIP.BCC.BCC-CSM2-MR.1pctCO2-bgc.r1i1p1f1.Amon.cli.gn.v20190321 +AerChemMIP.BCC.BCC-ESM1.hist-piNTCF.r1i1p1f1.Amon.ch4.gn.v20190621 +CMIP.CAMS.CAMS-CSM1-0.1pctCO2.r1i1p1f1.Amon.cli.gn.v20190708 +CMIP.CAS.CAS-ESM2-0.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20201228 +HighResMIP.NCAR.CESM1-CAM5-SE-HR.control-1950.r1i1p1f1.Amon.ts.gn.v20200724 +HighResMIP.NCAR.CESM1-CAM5-SE-LR.control-1950.r1i1p1f1.Amon.ts.gn.v20200708 +C4MIP.NCAR.CESM2.1pctCO2-bgc.r1i1p1f1.Amon.cli.gn.v20190724 +CMIP.NCAR.CESM2-FV2.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20200310 +AerChemMIP.NCAR.CESM2-WACCM.hist-1950HC.r1i1p1f1.Amon.ch4.gn.v20190606 +CMIP.NCAR.CESM2-WACCM-FV2.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20200226 +CMIP.THU.CIESM.1pctCO2.r1i1p1f1.Amon.rlut.gr.v20200417 +CMIP.CMCC.CMCC-CM2-HR4.1pctCO2.r1i1p1f1.6hrPlev.tas.gn.v20210304 +CMIP.CMCC.CMCC-CM2-SR5.1pctCO2.r1i1p1f1.3hr.tas.gn.v20200616 +HighResMIP.CMCC.CMCC-CM2-VHR4.control-1950.r1i1p1f1.6hrPlev.psl.gn.v20200917 +CMIP.CMCC.CMCC-ESM2.1pctCO2.r1i1p1f1.3hr.tas.gn.v20210114 +CFMIP.CNRM-CERFACS.CNRM-CM6-1.abrupt-0p5xCO2.r1i1p1f2.Amon.evspsbl.gr.v20190711 +CMIP.CNRM-CERFACS.CNRM-CM6-1-HR.1pctCO2.r1i1p1f2.Emon.orog.gr.v20191021 +AerChemMIP.CNRM-CERFACS.CNRM-ESM2-1.hist-1950HC.r1i1p1f2.Lmon.baresoilFrac.gr.v20190621 +C4MIP.CCCma.CanESM5.1pctCO2-bgc.r1i1p1f1.AERmon.ps.gn.v20190429 +CMIP.CCCma.CanESM5-CanOE.1pctCO2.r1i1p2f1.Ofx.sftof.gn.v20190429 +CFMIP.LLNL.E3SM-1-0.amip-p4K.r2i1p1f1.Amon.clivi.gr.v20210302 +CMIP.E3SM-Project.E3SM-1-1.historical.r1i1p1f1.AERmon.abs550aer.gr.v20191211 +CMIP.E3SM-Project.E3SM-1-1-ECA.historical.r1i1p1f1.AERmon.abs550aer.gr.v20200623 +CMIP.E3SM-Project.E3SM-2-0.abrupt-4xCO2.r1i1p1f1.Amon.hfls.gr.v20220826 +CMIP.EC-Earth-Consortium.EC-Earth3.1pctCO2.r3i1p1f1.3hr.tas.gr.v20210522 +CMIP.EC-Earth-Consortium.EC-Earth3-AerChem.1pctCO2.r1i1p1f1.3hr.tas.gr.v20200729 +CMIP.EC-Earth-Consortium.EC-Earth3-CC.1pctCO2.r1i1p1f1.Amon.rlut.gr.v20210525 +DCPP.EC-Earth-Consortium.EC-Earth3-HR.dcppA-hindcast.s1990-r10i2p1f1.Amon.rsds.gr.v20201205 
+CMIP.EC-Earth-Consortium.EC-Earth3-LR.piControl.r1i1p1f1.3hr.tos.gn.v20200919 +CMIP.EC-Earth-Consortium.EC-Earth3-Veg.1pctCO2.r1i1p1f1.3hr.tas.gr.v20200325 +CMIP.EC-Earth-Consortium.EC-Earth3-Veg-LR.abrupt-4xCO2.r1i1p1f1.Amon.hfls.gr.v20220428 +HighResMIP.EC-Earth-Consortium.EC-Earth3P.control-1950.r1i1p2f1.3hr.clt.gr.v20190906 +HighResMIP.EC-Earth-Consortium.EC-Earth3P-HR.control-1950.r1i1p2f1.3hr.clt.gr.v20181119 +CMIP.EC-Earth-Consortium.EC-Earth3P-VHR.historical.r1i1p2f1.Amon.clt.gr.v20201007 +HighResMIP.ECMWF.ECMWF-IFS-HR.control-1950.r1i1p1f1.6hrPlevPt.psl.gr.v20170915 +HighResMIP.ECMWF.ECMWF-IFS-LR.control-1950.r1i1p1f1.6hrPlevPt.psl.gr.v20180221 +HighResMIP.ECMWF.ECMWF-IFS-MR.control-1950.r1i1p1f1.6hrPlevPt.psl.gr.v20181121 +HighResMIP.CAS.FGOALS-f3-H.control-1950.r1i1p1f1.Oday.tos.gn.v20210120 +CMIP.CAS.FGOALS-f3-L.1pctCO2.r1i1p1f1.Amon.rlut.gr.v20200620 +CMIP.CAS.FGOALS-g3.1pctCO2.r1i1p1f1.3hr.tas.gn.v20191219 +CMIP.FIO-QLNM.FIO-ESM-2-0.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20200302 +CMIP.NOAA-GFDL.GFDL-AM4.amip.r1i1p1f1.AERmon.lwp.gr1.v20180807 +CFMIP.NOAA-GFDL.GFDL-CM4.amip-4xCO2.r1i1p1f1.Amon.evspsbl.gr1.v20180701 +HighResMIP.NOAA-GFDL.GFDL-CM4C192.control-1950.r1i1p1f1.Amon.ts.gr3.v20180701 +FAFMIP.NOAA-GFDL.GFDL-ESM2M.faf-all.r1i1p1f1.Omon.so.gn.v20180701 +AerChemMIP.NOAA-GFDL.GFDL-ESM4.piClim-aer.r1i1p1f1.AERmon.cdnc.gr1.v20180701 +OMIP.NOAA-GFDL.GFDL-OM4p5B.omip1.r1i1p1f1.Omon.so.gn.v20180701 +CFMIP.NASA-GISS.GISS-E2-1-G.abrupt-0p5xCO2.r1i1p1f1.Amon.cli.gn.v20190524 +CMIP.NASA-GISS.GISS-E2-1-G-CC.esm-hist.r1i1p1f1.Amon.clt.gn.v20190815 +CFMIP.NASA-GISS.GISS-E2-1-H.abrupt-2xCO2.r1i1p1f1.Amon.cli.gn.v20190403 +CFMIP.NASA-GISS.GISS-E2-2-G.abrupt-2xCO2.r1i1p1f1.Amon.evspsbl.gn.v20191120 +CMIP.NASA-GISS.GISS-E2-2-H.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20191120 +HighResMIP.MOHC.HadGEM3-GC31-HH.control-1950.r1i1p1f1.3hr.clt.gn.v20180927 +HighResMIP.MOHC.HadGEM3-GC31-HM.control-1950.r1i1p1f1.3hr.clt.gn.v20180713 +CFMIP.MOHC.HadGEM3-GC31-LL.a4SST.r1i1p1f3.AERmon.abs550aer.gn.v20200403 +HighResMIP.MOHC.HadGEM3-GC31-LM.highresSST-future.r1i14p1f1.3hr.pr.gn.v20190710 +HighResMIP.MOHC.HadGEM3-GC31-MH.spinup-1950.r1i1p1f1.3hr.clt.gn.v20171227 +CMIP.MOHC.HadGEM3-GC31-MM.1pctCO2.r1i1p1f3.3hr.clt.gn.v20201113 +HighResMIP.AS-RCEC.HiRAM-SIT-HR.highres-future.r1i1p1f1.Amon.ts.gn.v20210707 +HighResMIP.AS-RCEC.HiRAM-SIT-LR.highres-future.r1i1p1f1.Amon.ts.gn.v20210707 +CMIP.MPI-M.ICON-ESM-LR.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20210215 +CMIP.CCCR-IITM.IITM-ESM.1pctCO2.r1i1p1f1.3hr.tas.gn.v20191204 +CMIP.INM.INM-CM4-8.1pctCO2.r1i1p1f1.Amon.rlut.gr1.v20190530 +CMIP.INM.INM-CM5-0.1pctCO2.r1i1p1f1.Amon.rlut.gr1.v20200226 +HighResMIP.INM.INM-CM5-H.control-1950.r1i1p1f1.Amon.ts.gr1.v20190812 +CMIP.IPSL.IPSL-CM5A2-INCA.1pctCO2.r1i1p1f1.Amon.rlut.gr.v20201218 +HighResMIP.IPSL.IPSL-CM6A-ATM-HR.highresSST-present.r1i1p1f1.3hr.mrsos.gr.v20190122 +C4MIP.IPSL.IPSL-CM6A-LR.1pctCO2-bgc.r1i1p1f1.Amon.huss.gr.v20180914 +CMIP.IPSL.IPSL-CM6A-LR-INCA.abrupt-4xCO2.r1i1p1f1.3hr.pr.gr.v20210113 +CMIP.NIMS-KMA.KACE-1-0-G.1pctCO2.r1i1p1f1.3hr.tas.gr.v20190918 +CMIP.KIOST.KIOST-ESM.1pctCO2.r1i1p1f1.3hr.tas.gr1.v20210601 +CMIP.UA.MCM-UA-1-0.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20190731 +CMIP.MIROC.MIROC-ES2H.historical.r1i1p1f2.Amon.clt.gn.v20210125 +CMIP.MIROC.MIROC-ES2L.1pctCO2.r1i1p1f2.Amon.rlut.gn.v20190823 +AerChemMIP.MIROC.MIROC6.hist-piAer.r1i1p1f1.Amon.cli.gn.v20190807 +AerChemMIP.HAMMOZ-Consortium.MPI-ESM-1-2-HAM.hist-piAer.r1i1p1f1.Ofx.volcello.gn.v20190627 +CMIP.MPI-M.MPI-ESM1-2-HR.1pctCO2.r1i1p1f1.3hr.tas.gn.v20190710 
+C4MIP.MPI-M.MPI-ESM1-2-LR.1pctCO2-bgc.r2i1p1f1.Amon.tas.gn.v20190710 +HighResMIP.MPI-M.MPI-ESM1-2-XR.control-1950.r1i1p1f1.6hrPlev.wap.gn.v20180606 +HighResMIP.MRI.MRI-AGCM3-2-H.highresSST-future.r1i1p1f1.Amon.ts.gn.v20200619 +HighResMIP.MRI.MRI-AGCM3-2-S.highresSST-future.r1i1p1f1.Amon.ts.gn.v20200619 +CFMIP.MRI.MRI-ESM2-0.abrupt-0p5xCO2.r1i1p1f1.3hr.huss.gn.v20210308 +CMIP.NUIST.NESM3.1pctCO2.r1i1p1f1.3hr.tas.gn.v20190707 +HighResMIP.MIROC.NICAM16-7S.highresSST-present.r1i1p1f1.3hr.tos.gr.v20190325 +HighResMIP.MIROC.NICAM16-8S.highresSST-present.r1i1p1f1.3hr.tos.gr.v20190830 +HighResMIP.MIROC.NICAM16-9S.highresSST-present.r1i1p1f1.3hr.tos.gr.v20190830 +CMIP.NCC.NorCPM1.1pctCO2.r1i1p1f1.Amon.rlut.gn.v20190914 +CMIP.NCC.NorESM1-F.piControl.r1i1p1f1.AERmon.ua.gn.v20190920 +AerChemMIP.NCC.NorESM2-LM.hist-piAer.r1i1p1f1.Omon.volcello.gn.v20191108 +CMIP.NCC.NorESM2-MM.1pctCO2.r1i1p1f1.6hrPlev.tas.gn.v20210319 +CMIP.SNU.SAM0-UNICON.1pctCO2.r1i1p1f1.3hr.tas.gn.v20190323 +CFMIP.AS-RCEC.TaiESM1.abrupt-0p5xCO2.r1i1p1f1.AERmon.ps.gn.v20210913 +OMIP.NTU.TaiESM1-TIMCOM.omip1.r1i1p1f1.Ofx.deptho.gn.v20201028 +AerChemMIP.MOHC.UKESM1-0-LL.hist-piAer.r1i1p1f2.AERday.cod.gn.v20190813 +CMIP.MOHC.UKESM1-1-LL.1pctCO2.r1i1p1f2.AERmon.abs550aer.gn.v20220513 +ISMIP6.NERC.UKESM1-ice-LL.1pctCO2to4x-withism.r1i1p1f2.Amon.clivi.gn.v20220316 +'''.strip().split() + +#data_pool_tests_db = [example_3d, example_4d] + +#data_pool_tests_db = examples_rot + [example_3d, example_4d] + +data_pool_tests_db = [example_3d, example_4d] + examples_rot + + + + + + + diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py new file mode 100644 index 0000000..6d024e4 --- /dev/null +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -0,0 +1,716 @@ +from pdb import set_trace as ST + +import os +os.environ['USE_PYGEOS'] = '0' +import random +import xarray as xr +import tempfile +import datetime +import numpy as np +import shutil + +from daops.ops.subset import subset +from roocs_utils.xarray_utils.xarray_utils import open_xr_dataset +from roocs_utils.parameter.param_utils import time_interval, level_interval + +from examples_data import data_pool_tests_db +from caching import cached_output_fn, CachedResult + + +def main(): + for collection in data_pool_tests_db: + test_subset_in_data_pools(collection) + + +def test_subset_in_data_pools(collection): + """ + Do a range of tests on a given collection + """ + + ds = open_dataset(collection) + + # subset horizontally and in time + for _ in range(3): + do_test(collection, ds, do_area=True, do_time=True) + + # subset in all dimensions (though levels will be ignored if there isn't + # a level dimension) + for _ in range(3): + do_test(collection, ds, do_area=True, do_time=True, do_levels=True) + + # and a few special cases to try + special_cases = [{'force_lon_wrap': True}, + {'force_pole': 'north'}, + {'force_pole': 'south'}, + {'force_lon_wrap': True, 'force_pole': 'south'}] + + for area_args in special_cases: + print(f'Doing special case: {area_args}') + do_test(collection, ds, do_area=True, do_time=True, do_levels=True, + area_args=area_args) + + # to delete? 
- extra test that got left in + # (not removing for now due to existing cache) + do_test(collection, ds, do_area=True, do_time=True, do_levels=True) + + +def open_dataset(collection): + + print(f'opening {collection}') + + if collection[0] == '/': + return xr.open_dataset(collection) + + #==================================== + ## THIS OUGHT TO WORK, but I need to work out how to configure the paths + ## for the tests. strace shows that it is using /tmp/roocs.ini but this + ## is overwritten every time (tests/conftest.py gets imported and + ## write_roocs_cfg() is run) + ## + ## strace also says that it is also looking in all of these: + ## /clisops/clisops/etc/roocs.ini + ## /daops/daops/etc/roocs.ini + ## /roocs-utils/roocs_utils/etc/roocs.ini + ## + ## For now, symlinking ~/.mini-esgf-data/master/test_data/badc + ## to point to the real /badc will do for a workaround -- now it finds + ## the dataset + ds = open_xr_dataset(collection) + + ## OR HERE IS ANOTHER POSSIBLE TEMPORARY WORKAROUND + # import glob # for open_xr_dataset workaround + # assert isinstance(collection, str) + #paths = glob.glob(f'/badc/cmip6/data/{collection.replace(".","/")}/*.nc') + #ds = open_xr_dataset(paths) + #==================================== + + return ds + + +def dump_dims(ds, label=''): + ignore = ('bnds', 'vertices') + print(f'{label} dataset dimensions: ' + f'{",".join(f"{k}:{v}" for k, v in ds.dims.items() if k not in ignore)}') + + +def get_data(val): + return val.data if isinstance(val, (xr.DataArray, xr.IndexVariable)) else val + + +def do_test(collection, ds, use_cache=True, **kwargs): + """ + Do an individual test on a collection + """ + + print(f'Doing test on {collection} using {kwargs}') + dump_dims(ds, label='input') + subset_params, expect = prepare_test(ds, **kwargs) + + temp_dir = tempfile.TemporaryDirectory() + tmpdir = temp_dir.name + + print('===========================================') + print('Doing test with:') + print(f'\n Collection: {collection}') + print('\n Subset params:') + for k, v in subset_params.items(): + if k == "time": + v = f"time_interval({v.value})" + elif k == "level": + v = f"level_interval({tuple([float(lev) for lev in v.value])})" + print(f' {k} : {v}') + print('\n Expect to get:') + for k, v in expect.items(): + if v is not None: + print(f' {k} : {get_data(v)}') + if all(k in expect and expect[k] is None + for k in ('lons_in_range', 'lats_in_range')): + print(' (curvilinear grid; will test lon/lat ranges, not exact vals)') + + print('\n===========================================') + + try: + if use_cache: + cached_fn = cached_output_fn(collection, subset_params) + if os.path.exists(cached_fn): + print(f'using cache: {cached_fn}') + result = CachedResult(cached_fn) + else: + result = subset(collection, + output_dir=tmpdir, + **subset_params) + fn, = result.file_uris + print(f'caching: {cached_fn}') + shutil.copy(fn, cached_fn) + else: + result = subset(collection, + output_dir=tmpdir, + **subset_params) + + check_result(result, expect, subset_params) + + except KeyboardInterrupt: + raise + except Exception as exc: + print("*******************\n" + "*** TEST FAILED ***\n" + "*******************\n\n\n") + raise # FIXME: comment out + else: + print("Test succeeded\n\n\n") + finally: + temp_dir.cleanup() + + +def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=None): + ''' + returns the params to the subset function that will be needed for the test + and a dictionary of things to expect to come back from the test + + The boolean inputs 
do_area, do_time, do_levels control whether to subset + in each of these ways. The input area_args can contain a dictionary of arguments + to get_rand_area (ignored if do_area==False) + ''' + + if area_args == None: + area_args = {} + + params = {} + expect = {} + + if do_area: + area = get_rand_area(ds, **area_args) + req_area = [float(v) for v in area['req_area']] + + ### # it seems that subset treats a longitude range e.g. [350, 10] as [10, 350] + ### # change it to [350, 370] etc + ### if area_p[2] < area_p[0]: + ### #area_p[2] += 360 + ### area_p[0] -= 360 + ### + params['area'] = req_area + + expect['lons_in_range'] = area['lons_in_range'] + ## expect['wrap_lon'] = area['wrap_lon'] + expect['lats_in_range'] = area['lats_in_range'] + else: + expect['lons_in_range'] = get_lons(ds) + expect['lats_in_range'] = get_lats(ds) + + if do_levels: + lev_int = get_rand_lev_int(ds) + if lev_int is not None: + params['level'] = lev_int['req_interval'] + expect['levs_in_range'] = lev_int['levs_in_range'] + else: + print("WARNING: not requesting level range as no level dimension found") + expect['levs_in_range'] = None + else: + expect['levs_in_range'] = get_levs(ds) + + if do_time: + time_int = get_rand_time_int(ds) + if time_int is not None: + params['time'] = time_int['req_interval'] + expect['times_in_range'] = time_int['times_in_range'] + else: + print("WARNING: not requesting time range as no time dimension found") + expect['times_in_range'] = None + else: + expect['times_in_range'] = get_times(ds) + + return params, expect + + +def get_rand_time_int(ds): + """ + Returns a dictionary containing a randomly chosen time interval + (in the form needed by the subset function) and the time values + that would be expected when subsetting using that interval + """ + times = get_times(ds) + if times is None: + return None + t_start, t_end, vals_in_range = get_rand_range(times) + ts_start = get_time_string(t_start) + ts_end = get_time_string(t_end) + return {'req_interval': time_interval(ts_start, ts_end), + 'times_in_range': vals_in_range} + + +def get_rand_lev_int(ds): + """ + As get_rand_time_int, but for levels + """ + levs = get_levs(ds) + if levs is None: + return None + z_start, z_end, vals_in_range = get_rand_range(levs) + return {'req_interval': level_interval(z_start, z_end), + 'levs_in_range': vals_in_range} + + +def get_time_string(when): + """ + Turns a datetime, or time value seen in xarray, + into a string understood by time_interval + """ + if isinstance(when, datetime.datetime): + t = when + else: + t = when.values.tolist() + + return (f'{t.year:04d}-{t.month:02d}-{t.day:02d}' + f'T{t.hour:02d}:{t.minute:02d}:{t.second:02d}') + + +def get_rand_area(ds, force_lon_wrap=False, force_pole=None): + """Returns a dictionary containing a randomly chosen area + (tuple of lon_w, lat_s, lon_e, lat_n) and the lon and lat + values that would be expected when subsetting using that area. + + In the curvilinear case (lon and lat variables in the file are 2d), the + expected values will be None rather than arrays of expected values. The + reason for this is that it is not possible to validate a *specific* set of + lon, lat values in any case (as although the subsetting will set any + points that are out of range to NaN, but NaN values could also be because + of missing data points within the range). 
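    In that case check_result falls back to verifying that the lon/lat
    positions of any unmasked data in the output fall within the requested
    area, instead of comparing exact coordinate values.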
+ """ + + ## (lon0, lon1, lons_in_range), wrap_lon = get_rand_lon_range(ds) + (lon0, lon1, lons_in_range) = get_rand_lon_range(ds, force_wrap=force_lon_wrap) + (lat0, lat1, lats_in_range) = get_rand_lat_range(ds, force_pole=force_pole) + + return {'req_area': (lon0, lat0, lon1, lat1), + 'lons_in_range': lons_in_range, + ## 'wrap_lon': wrap_lon, + 'lats_in_range': lats_in_range} + + +def get_wrap_lon(lons): + """ + Get the longitude at which the wrapping occurs. + Note - the exact value is not used in the current version of the calling code, + the main purpose here is to distinguish global from limited area, so that the + requested area for subsetting does not wrap for a limited area grid. + """ + minlon = lons.min() + maxlon = lons.max() + if maxlon - minlon < 270: + # assume this is a limited area + return None + elif maxlon - minlon >= 360: + raise Exception(f'too wide lon range {minlon} to {maxlon}') + elif 0 <= minlon and maxlon < 360: + return 360. + elif -180 <= minlon and maxlon < 180: + return 180. + else: + raise Exception(f'unsupported lon range {minlon} to {maxlon}') + + +def get_rand_lon_range(ds, force_wrap=False): + """ + Get a randomly chosen longitude range. This might include wrapping around + (unless the longitudes don't seem to span global range), but if force_wrap is + set to True then it ensures that this occurs. + """ + lons = get_lons(ds) + wrap_lon = get_wrap_lon(lons) + can_wrap=(wrap_lon is not None) + + if force_wrap: + print('WARNING: forcing longitude wrapping for what appears to be limited area model') + params = {'force': 'wrap'} + else: + params = {'can_wrap': can_wrap} + return get_rand_range(lons, **params) + + ## wrap_lon = get_wrap_lon(lons) + ## print(f'wrap long = {wrap_lon}') + ## return get_rand_range(lons, can_wrap=(wrap_lon is not None)), wrap_lon + + +def get_rand_lat_range(ds, force_pole=None): + """ + Get a randomly chosen latitude range. If force_pole is set to 'north' or 'south', + then the range will extend to the relevant pole. + """ + + lats = get_lats(ds) + + # using 'force' will ensure that the range returned + # by get_rand_range goes to the end of the latitude values, + # but for the test, actually use -90 or 90. Which value is + # to be overwritten will depend on the ordering. + # + if force_pole == 'north': + ret = get_rand_range(lats, force='upper') + return (ret[0], 90., ret[2]) if ret[1] >= ret[0] else (90., ret[1], ret[2]) + params['force'] = 'upper' + elif force_pole == 'south': + ret = get_rand_range(lats, force='lower') + return (-90., ret[1], ret[2]) if ret[1] >= ret[0] else (ret[0], -90., ret[2]) + else: + return get_rand_range(lats) + +### def get_rand_lev_range(ds): +### return get_rand_range(get_levs(ds)) +### +### def get_rand_time_range(ds): +### return get_rand_range(get_times(ds)) +### + +def get_rand_range(var, max_fraction=.1, can_wrap=False, force=None): + ''' + Get a random range from specified variable (which can be any number + of dimensions). Returns tuple of (lower, upper, values in range) + + min and max are chosen based on histogram of values, defaulting to returning + a range that includes up to to about 10% of the points (though can be less) + + can_wrap=True means could be used with longitude - e.g. for a wrapping longitude of + 360 it could return lower=-10 upper=10, and the values in range are in the range -10 to 10 + where those values from -10 to 0 are based on the values from 350 to 360 in the input + + force can be used for special cases: 'lower' forces the range to include + the lower end (e.g. 
south pole for latitude), 'upper' forces it to include + the upper end (e.g. north pole), 'wrap' forces it to wrap around (the meridian + for longitude) + ''' + + length = random.uniform(0, max_fraction) + + while True: + + did_wrap = False + if force == 'lower': + lower_q = 0. + upper_q = length + elif force == 'upper': + lower_q = 1 - length + upper_q = 1. + elif force == 'wrap': + lower_q = random.uniform(1 - length, 1) + upper_q = lower_q + length - 1 + did_wrap = True + elif force is not None: + raise ValueError(f'unrecognised "force" value {force}') + elif can_wrap: + lower_q = random.uniform(0, 1) + upper_q = lower_q + length + did_wrap = upper_q > 1 + if did_wrap: + upper_q -= 1 + else: + lower_q = random.uniform(0, 1 - length) + upper_q = lower_q + length + + lower = var.quantile(lower_q) + upper = var.quantile(upper_q) + + if did_wrap: + in_range = (lower <= var) | (var <= upper) + if var.ndim == 1: + modulus = 360 + lower_vals = get_data(var[lower <= var]) + upper_vals = get_data(var[var <= upper]) + + if var.min() >= 0: + # data uses 0-360 + # change e.g. 350..10 to -10..10 + # (as subset function doesn't seem to like 350..370) + lower -= modulus + lower_vals -= modulus + else: + # data uses -180 to 180 + # change e.g. 170..-170 to 170..190 + upper += modulus + upper_vals += modulus + + vals_in_range = np.concatenate((lower_vals, upper_vals)) + else: + vals_in_range = None + else: + in_range = (lower <= var) & (var <= upper) + vals_in_range = var[in_range] if var.ndim == 1 else None + + if in_range.sum() > 0: + break + length = min(length * 2, 1) + + if var.ndim == 1 and len(var) > 1 and var[1] < var[0]: + # if the variable is 1d and is decreasing, then swap the ordering + # of the bounds (which were chosen based on histogram so will be increasing at this point) + assert not did_wrap # longitude wrap not verified for decreasing lons + assert lower <= upper + lower, upper = upper, lower + + return (lower, upper, vals_in_range) + + +def get_lons(ds): + "Get the longitude variable for a dataset. Not necessarily a coordinate variable." + return get_var_by_stdname(ds, 'longitude') +def get_lats(ds): + "Get the latitude variable for a dataset. Not necessarily a coordinate variable." + return get_var_by_stdname(ds, 'latitude') + +def get_times(ds): + "Get the time coordinate variable for a dataset" + return get_axis_by_direction(ds, 'T') +def get_levs(ds): + "Get the height coordinate variable for a dataset" + return get_axis_by_direction(ds, 'Z') + +## ------------------------------------------------------ +## Now commented out anything to do with getting ranges, +## except in the case of curvilinear grid. Code now tests +## the set of actual coord values rather than just the range, +## where 1d coord variables. 
+## ------------------------------------------------------ +## +### def get_lon_range(ds, wrap_lon=None): +### lons = get_lons(ds) +### if wrap_lon is not None: +### ST() +### return get_range(get_lons(ds)) +### +### def get_lat_range(ds): +### return get_range(get_lats(ds)) +### +### def get_lev_range(ds): +### return get_range(get_levs(ds)) +### +### def get_time_range(ds): +### return get_range(get_times(ds)) +### +### def get_range(var): +### return (var.min(), var.max()) +## +## def get_lon_lat_ranges(ds, wrap_lon=None): +## +## lons = get_lons(ds) +## lats = get_lats(ds) +## +## if is_curvi(lons, lats): +## return get_lonlat_ranges_for_curvi(ds, lons, lats) +## else: +## if wrap_lon is not None: +## ST() ## FIXME +## return (get_range(lons), get_range(lats)) + + +def get_var_by_stdname(ds, stdname): + """ + Get variable with a given standard name. + Will raise an exception if there is not exactly one. + """ + vars = [v for v in ds.variables.values() + if v.attrs.get('standard_name') == stdname] + var, = vars + return var + + +def is_curvi(lons, lats): + """ + Test whether given lon and lat variables correspond to a curvilinear grid + (as opposed to 1d coordinate variables). + """ + if len(lons.dims) == 1 and len(lats.dims) == 1: + return False + elif len(lons.dims) == 2 and lons.dims == lats.dims: + return True + else: + raise Exception(f"unexpected dimensionality of lon, lat arrays: {lon.dims} and {lat.dims}") + +def get_lonlat_ranges_for_curvi(ds): + ''' + get ranges of lon, lat values where there is actual data (not masked) + for any variable on the lon, lat grid + ''' + + lons = get_lons(ds) + lats = get_lats(ds) + + grid_dims = lons.dims + grid_shape = lons.shape + + # get a list of the variables that are on the lon-lat grid, + # other than lon, lat themselves, and then get an array of + # positions where there are non-NaN values in *any* of these + # variables, for any other level / height + # + # (start with 2d array of False, then use logical OR with + # each variable, although probably there is only one such var) + # + ##vars_on_grid = [v for v in ds.variables.values() + ## if v.dims[-2:] == grid_dims and + ## v.attrs.get('standard_name') not in ('longitude', 'latitude')] + vars_on_grid = [v for v in ds.data_vars.values() + if v.dims[-2:] == grid_dims] + if not vars_on_grid: + raise Exception("could not find any data variables") + has_data = np.zeros(grid_shape, dtype=bool) + for var in vars_on_grid: + var_has_data = np.logical_not(np.isnan(var.data)) + # reduce to 2d using 'any' in loop - there might be a cleverer way + while var_has_data.ndim > 2: + var_has_data = np.any(var_has_data, axis=(var_has_data.ndim - 3)) + assert var_has_data.shape == grid_shape + has_data |= var_has_data + + if not has_data.any(): + print ("WARNING: data variable(s) contain only NaN values") + return (None, None) + + lons_where_data = lons.data[has_data] + lats_where_data = lats.data[has_data] + + lon_range_where_data = (lons_where_data.min(), lons_where_data.max()) + lat_range_where_data = (lats_where_data.min(), lats_where_data.max()) + + print('For this curvilinear dataset:') + print(f' Lon range where data {lon_range_where_data} ' + f'(overall {lons.data.min(), lons.data.max()})') + print(f' Lat range where data {lat_range_where_data} ' + f'(overall {lats.data.min(), lats.data.max()})') + + return (lon_range_where_data, lat_range_where_data) + + +## def is_in_range(actual_range, requested_range, wrap=None, modulus=360): +## req0, req1 = requested_range +## if wrap is not None and req0 > 
req1: +## return all((req0 <= val < wrap +## or wrap - modulus <= val <= req1 +## for val in actual_range)) +## else: +## if req0 > req1: +## req0, req1 = req1, req0 +## return all((req0 <= val <= req1 +## for val in actual_range)) +## + +def check_in_range(actual_range, requested_range, label='', **kwargs): + """ + check whether the range of values lies WITHIN the requested range of values; + allow for the fact that the requested range might be passed in decreasing order + """ + if not is_in_range(actual_range, requested_range, **kwargs): + raise Exception(f'range check for {label} failed') + print(f'{label}: Verified range {actual_range} within {requested_range}') + + +def is_in_range(actual_range, requested_range): + """ + helper for check_in_range. Returns boolean. + """ + req0, req1 = requested_range + if req0 > req1: + req0, req1 = req1, req0 + return all((req0 <= val <= req1 + for val in actual_range)) + + + +def check_equal(vals, exp_vals, label=''): + """ + Check whether the values match the expected values + """ + + vals = get_data(vals) + exp_vals = get_data(exp_vals) + + #print(f'\n\n============ {label} =========\n\n') + #print(f'Actual vals: {vals}') + #print(f'Expected vals: {exp_vals}') + + if not np.array_equal(vals, exp_vals): + ST() + raise Exception(f'equal values assertion failed for {label}:' + f'actual {vals}, expected: {exp_vals}') + + print(f'{label}: checked {len(vals)} values match expected values') + + +def check_result(result, expect, subset_params): + + """ + Do various checks on the result of subsetting. result should be an + object that has a file_uris property. expect should be dictionary + returned by prepare_test. + + subset_params is the dictionary of keyword args that was passed to + subset (needed for curvilinear - see below). + + Currently, the tests that are done are: + - check that the exact set of coordinate values matches what was expected, + - except that in the case of a curvilinear grid, check to see what lon and + lat ranges are spanned by the lon and lat arrays in the file but only + including those locations where there is data with non-NaN values; then + check that these are within the range that was requested + """ + + fn, = result.file_uris + #os.system(f'ls -l {fn}') ## FIXME: remove? 
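    # The steps below: open the single output file produced by subset, then
    # compare its coordinates against the expectations from prepare_test,
    # i.e. exact lon/lat values for 1-d coordinates, a range check on unmasked
    # data for curvilinear grids, and exact level/time values where applicable.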
+ dsout = xr.open_dataset(fn) + dump_dims(dsout, label='output') + + lons = get_lons(dsout) + lats = get_lats(dsout) + if is_curvi(lons, lats) and 'area' in subset_params: + area = subset_params['area'] + req_lon_range = (area[0], area[2]) + req_lat_range = (area[1], area[3]) + + lon_range, lat_range = get_lonlat_ranges_for_curvi(dsout) + if lon_range is not None: + #check_in_range(lon_range, req_lon_range, label='longitudes', wrap=expect['wrap_lon']) + print('Checking that lon-lat values with (unmasked) data in requested range') + check_in_range(lon_range, req_lon_range, label='longitudes') + check_in_range(lat_range, req_lat_range, label='latitudes') + else: + print('Skipping lon/lat range check: did not find any data in requested range') + else: + expected_lons = expect['lons_in_range'] + expected_lats = expect['lats_in_range'] + + check_equal(lons, expected_lons, label='longitudes') + check_equal(lats, expected_lats, label='latitudes') + + expected_levs = expect['levs_in_range'] + if expected_levs is not None: + levs = get_levs(dsout) + check_equal(levs, expected_levs, label='levels') + + expected_times = expect['times_in_range'] + if expected_times is not None: + times = get_times(dsout) + check_equal(times, expected_times, label='times') + + +def get_axis_by_direction(ds, direction): + """ + Get the axis with the specified direction attribute ('X', 'Y', 'Z' or 'T') + or if there is none, returns None. + (If more than one, raises an exception.) + """ + axes = [] + for name in ds.dims: + axis = ds[name] + if name == 'bnds': + continue + if hasattr(axis, 'axis') and axis.axis == direction: + axes.append(axis) + if len(axes) > 1: + raise Exception(f"more than one dimension with axis {direction}") + elif len(axes) == 1: + return axes[0] + else: + return None + + +if __name__ == '__main__': + random.seed(0) ## FIXME - remove + main() From ed253d9203061b5475a0f0c0d61b22ae4a4574fc Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Mon, 15 May 2023 14:10:15 +0100 Subject: [PATCH 02/12] quotes --- .../run_data_pools_checks.py | 238 +++++++++--------- 1 file changed, 119 insertions(+), 119 deletions(-) diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index 6d024e4..1714267 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -1,7 +1,7 @@ from pdb import set_trace as ST import os -os.environ['USE_PYGEOS'] = '0' +os.environ["USE_PYGEOS"] = "0" import random import xarray as xr import tempfile @@ -39,13 +39,13 @@ def test_subset_in_data_pools(collection): do_test(collection, ds, do_area=True, do_time=True, do_levels=True) # and a few special cases to try - special_cases = [{'force_lon_wrap': True}, - {'force_pole': 'north'}, - {'force_pole': 'south'}, - {'force_lon_wrap': True, 'force_pole': 'south'}] + special_cases = [{"force_lon_wrap": True}, + {"force_pole": "north"}, + {"force_pole": "south"}, + {"force_lon_wrap": True, "force_pole": "south"}] for area_args in special_cases: - print(f'Doing special case: {area_args}') + print(f"Doing special case: {area_args}") do_test(collection, ds, do_area=True, do_time=True, do_levels=True, area_args=area_args) @@ -56,9 +56,9 @@ def test_subset_in_data_pools(collection): def open_dataset(collection): - print(f'opening {collection}') + print(f"opening {collection}") - if collection[0] == '/': + if collection[0] == "/": return xr.open_dataset(collection) #==================================== @@ -87,9 +87,9 @@ def 
open_dataset(collection): return ds -def dump_dims(ds, label=''): - ignore = ('bnds', 'vertices') - print(f'{label} dataset dimensions: ' +def dump_dims(ds, label=""): + ignore = ("bnds", "vertices") + print(f"{label} dataset dimensions: " f'{",".join(f"{k}:{v}" for k, v in ds.dims.items() if k not in ignore)}') @@ -102,45 +102,45 @@ def do_test(collection, ds, use_cache=True, **kwargs): Do an individual test on a collection """ - print(f'Doing test on {collection} using {kwargs}') - dump_dims(ds, label='input') + print(f"Doing test on {collection} using {kwargs}") + dump_dims(ds, label="input") subset_params, expect = prepare_test(ds, **kwargs) temp_dir = tempfile.TemporaryDirectory() tmpdir = temp_dir.name - print('===========================================') - print('Doing test with:') - print(f'\n Collection: {collection}') - print('\n Subset params:') + print("===========================================") + print("Doing test with:") + print(f"\n Collection: {collection}") + print("\n Subset params:") for k, v in subset_params.items(): if k == "time": v = f"time_interval({v.value})" elif k == "level": v = f"level_interval({tuple([float(lev) for lev in v.value])})" - print(f' {k} : {v}') - print('\n Expect to get:') + print(f" {k} : {v}") + print("\n Expect to get:") for k, v in expect.items(): if v is not None: - print(f' {k} : {get_data(v)}') + print(f" {k} : {get_data(v)}") if all(k in expect and expect[k] is None - for k in ('lons_in_range', 'lats_in_range')): - print(' (curvilinear grid; will test lon/lat ranges, not exact vals)') + for k in ("lons_in_range", "lats_in_range")): + print(" (curvilinear grid; will test lon/lat ranges, not exact vals)") - print('\n===========================================') + print("\n===========================================") try: if use_cache: cached_fn = cached_output_fn(collection, subset_params) if os.path.exists(cached_fn): - print(f'using cache: {cached_fn}') + print(f"using cache: {cached_fn}") result = CachedResult(cached_fn) else: result = subset(collection, output_dir=tmpdir, **subset_params) fn, = result.file_uris - print(f'caching: {cached_fn}') + print(f"caching: {cached_fn}") shutil.copy(fn, cached_fn) else: result = subset(collection, @@ -163,14 +163,14 @@ def do_test(collection, ds, use_cache=True, **kwargs): def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=None): - ''' + """ returns the params to the subset function that will be needed for the test and a dictionary of things to expect to come back from the test The boolean inputs do_area, do_time, do_levels control whether to subset in each of these ways. The input area_args can contain a dictionary of arguments to get_rand_area (ignored if do_area==False) - ''' + """ if area_args == None: area_args = {} @@ -180,7 +180,7 @@ def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=No if do_area: area = get_rand_area(ds, **area_args) - req_area = [float(v) for v in area['req_area']] + req_area = [float(v) for v in area["req_area"]] ### # it seems that subset treats a longitude range e.g. 
[350, 10] as [10, 350] ### # change it to [350, 370] etc @@ -188,36 +188,36 @@ def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=No ### #area_p[2] += 360 ### area_p[0] -= 360 ### - params['area'] = req_area + params["area"] = req_area - expect['lons_in_range'] = area['lons_in_range'] - ## expect['wrap_lon'] = area['wrap_lon'] - expect['lats_in_range'] = area['lats_in_range'] + expect["lons_in_range"] = area["lons_in_range"] + ## expect["wrap_lon"] = area["wrap_lon"] + expect["lats_in_range"] = area["lats_in_range"] else: - expect['lons_in_range'] = get_lons(ds) - expect['lats_in_range'] = get_lats(ds) + expect["lons_in_range"] = get_lons(ds) + expect["lats_in_range"] = get_lats(ds) if do_levels: lev_int = get_rand_lev_int(ds) if lev_int is not None: - params['level'] = lev_int['req_interval'] - expect['levs_in_range'] = lev_int['levs_in_range'] + params["level"] = lev_int["req_interval"] + expect["levs_in_range"] = lev_int["levs_in_range"] else: print("WARNING: not requesting level range as no level dimension found") - expect['levs_in_range'] = None + expect["levs_in_range"] = None else: - expect['levs_in_range'] = get_levs(ds) + expect["levs_in_range"] = get_levs(ds) if do_time: time_int = get_rand_time_int(ds) if time_int is not None: - params['time'] = time_int['req_interval'] - expect['times_in_range'] = time_int['times_in_range'] + params["time"] = time_int["req_interval"] + expect["times_in_range"] = time_int["times_in_range"] else: print("WARNING: not requesting time range as no time dimension found") - expect['times_in_range'] = None + expect["times_in_range"] = None else: - expect['times_in_range'] = get_times(ds) + expect["times_in_range"] = get_times(ds) return params, expect @@ -234,8 +234,8 @@ def get_rand_time_int(ds): t_start, t_end, vals_in_range = get_rand_range(times) ts_start = get_time_string(t_start) ts_end = get_time_string(t_end) - return {'req_interval': time_interval(ts_start, ts_end), - 'times_in_range': vals_in_range} + return {"req_interval": time_interval(ts_start, ts_end), + "times_in_range": vals_in_range} def get_rand_lev_int(ds): @@ -246,8 +246,8 @@ def get_rand_lev_int(ds): if levs is None: return None z_start, z_end, vals_in_range = get_rand_range(levs) - return {'req_interval': level_interval(z_start, z_end), - 'levs_in_range': vals_in_range} + return {"req_interval": level_interval(z_start, z_end), + "levs_in_range": vals_in_range} def get_time_string(when): @@ -260,8 +260,8 @@ def get_time_string(when): else: t = when.values.tolist() - return (f'{t.year:04d}-{t.month:02d}-{t.day:02d}' - f'T{t.hour:02d}:{t.minute:02d}:{t.second:02d}') + return (f"{t.year:04d}-{t.month:02d}-{t.day:02d}" + f"T{t.hour:02d}:{t.minute:02d}:{t.second:02d}") def get_rand_area(ds, force_lon_wrap=False, force_pole=None): @@ -281,10 +281,10 @@ def get_rand_area(ds, force_lon_wrap=False, force_pole=None): (lon0, lon1, lons_in_range) = get_rand_lon_range(ds, force_wrap=force_lon_wrap) (lat0, lat1, lats_in_range) = get_rand_lat_range(ds, force_pole=force_pole) - return {'req_area': (lon0, lat0, lon1, lat1), - 'lons_in_range': lons_in_range, - ## 'wrap_lon': wrap_lon, - 'lats_in_range': lats_in_range} + return {"req_area": (lon0, lat0, lon1, lat1), + "lons_in_range": lons_in_range, + ## "wrap_lon": wrap_lon, + "lats_in_range": lats_in_range} def get_wrap_lon(lons): @@ -300,13 +300,13 @@ def get_wrap_lon(lons): # assume this is a limited area return None elif maxlon - minlon >= 360: - raise Exception(f'too wide lon range {minlon} to {maxlon}') + raise 
Exception(f"too wide lon range {minlon} to {maxlon}") elif 0 <= minlon and maxlon < 360: return 360. elif -180 <= minlon and maxlon < 180: return 180. else: - raise Exception(f'unsupported lon range {minlon} to {maxlon}') + raise Exception(f"unsupported lon range {minlon} to {maxlon}") def get_rand_lon_range(ds, force_wrap=False): @@ -320,36 +320,36 @@ def get_rand_lon_range(ds, force_wrap=False): can_wrap=(wrap_lon is not None) if force_wrap: - print('WARNING: forcing longitude wrapping for what appears to be limited area model') - params = {'force': 'wrap'} + print("WARNING: forcing longitude wrapping for what appears to be limited area model") + params = {"force": "wrap"} else: - params = {'can_wrap': can_wrap} + params = {"can_wrap": can_wrap} return get_rand_range(lons, **params) ## wrap_lon = get_wrap_lon(lons) - ## print(f'wrap long = {wrap_lon}') + ## print(f"wrap long = {wrap_lon}") ## return get_rand_range(lons, can_wrap=(wrap_lon is not None)), wrap_lon def get_rand_lat_range(ds, force_pole=None): """ - Get a randomly chosen latitude range. If force_pole is set to 'north' or 'south', + Get a randomly chosen latitude range. If force_pole is set to "north" or "south", then the range will extend to the relevant pole. """ lats = get_lats(ds) - # using 'force' will ensure that the range returned + # using "force" will ensure that the range returned # by get_rand_range goes to the end of the latitude values, # but for the test, actually use -90 or 90. Which value is # to be overwritten will depend on the ordering. # - if force_pole == 'north': - ret = get_rand_range(lats, force='upper') + if force_pole == "north": + ret = get_rand_range(lats, force="upper") return (ret[0], 90., ret[2]) if ret[1] >= ret[0] else (90., ret[1], ret[2]) - params['force'] = 'upper' - elif force_pole == 'south': - ret = get_rand_range(lats, force='lower') + params["force"] = "upper" + elif force_pole == "south": + ret = get_rand_range(lats, force="lower") return (-90., ret[1], ret[2]) if ret[1] >= ret[0] else (ret[0], -90., ret[2]) else: return get_rand_range(lats) @@ -362,7 +362,7 @@ def get_rand_lat_range(ds, force_pole=None): ### def get_rand_range(var, max_fraction=.1, can_wrap=False, force=None): - ''' + """ Get a random range from specified variable (which can be any number of dimensions). Returns tuple of (lower, upper, values in range) @@ -373,29 +373,29 @@ def get_rand_range(var, max_fraction=.1, can_wrap=False, force=None): 360 it could return lower=-10 upper=10, and the values in range are in the range -10 to 10 where those values from -10 to 0 are based on the values from 350 to 360 in the input - force can be used for special cases: 'lower' forces the range to include - the lower end (e.g. south pole for latitude), 'upper' forces it to include - the upper end (e.g. north pole), 'wrap' forces it to wrap around (the meridian + force can be used for special cases: "lower" forces the range to include + the lower end (e.g. south pole for latitude), "upper" forces it to include + the upper end (e.g. north pole), "wrap" forces it to wrap around (the meridian for longitude) - ''' + """ length = random.uniform(0, max_fraction) while True: did_wrap = False - if force == 'lower': + if force == "lower": lower_q = 0. upper_q = length - elif force == 'upper': + elif force == "upper": lower_q = 1 - length upper_q = 1. 
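        # For example, with length = 0.1: force="upper" yields
        # lower_q = 0.9 and upper_q = 1.0 (roughly the top 10% of values),
        # while force="wrap" might yield lower_q = 0.97 and upper_q = 0.07,
        # i.e. a range that spans the longitude discontinuity.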
- elif force == 'wrap': + elif force == "wrap": lower_q = random.uniform(1 - length, 1) upper_q = lower_q + length - 1 did_wrap = True elif force is not None: - raise ValueError(f'unrecognised "force" value {force}') + raise ValueError(f"unrecognised force value {force}") elif can_wrap: lower_q = random.uniform(0, 1) upper_q = lower_q + length @@ -451,17 +451,17 @@ def get_rand_range(var, max_fraction=.1, can_wrap=False, force=None): def get_lons(ds): "Get the longitude variable for a dataset. Not necessarily a coordinate variable." - return get_var_by_stdname(ds, 'longitude') + return get_var_by_stdname(ds, "longitude") def get_lats(ds): "Get the latitude variable for a dataset. Not necessarily a coordinate variable." - return get_var_by_stdname(ds, 'latitude') + return get_var_by_stdname(ds, "latitude") def get_times(ds): "Get the time coordinate variable for a dataset" - return get_axis_by_direction(ds, 'T') + return get_axis_by_direction(ds, "T") def get_levs(ds): "Get the height coordinate variable for a dataset" - return get_axis_by_direction(ds, 'Z') + return get_axis_by_direction(ds, "Z") ## ------------------------------------------------------ ## Now commented out anything to do with getting ranges, @@ -507,7 +507,7 @@ def get_var_by_stdname(ds, stdname): Will raise an exception if there is not exactly one. """ vars = [v for v in ds.variables.values() - if v.attrs.get('standard_name') == stdname] + if v.attrs.get("standard_name") == stdname] var, = vars return var @@ -525,10 +525,10 @@ def is_curvi(lons, lats): raise Exception(f"unexpected dimensionality of lon, lat arrays: {lon.dims} and {lat.dims}") def get_lonlat_ranges_for_curvi(ds): - ''' + """ get ranges of lon, lat values where there is actual data (not masked) for any variable on the lon, lat grid - ''' + """ lons = get_lons(ds) lats = get_lats(ds) @@ -546,7 +546,7 @@ def get_lonlat_ranges_for_curvi(ds): # ##vars_on_grid = [v for v in ds.variables.values() ## if v.dims[-2:] == grid_dims and - ## v.attrs.get('standard_name') not in ('longitude', 'latitude')] + ## v.attrs.get("standard_name") not in ("longitude", "latitude")] vars_on_grid = [v for v in ds.data_vars.values() if v.dims[-2:] == grid_dims] if not vars_on_grid: @@ -554,7 +554,7 @@ def get_lonlat_ranges_for_curvi(ds): has_data = np.zeros(grid_shape, dtype=bool) for var in vars_on_grid: var_has_data = np.logical_not(np.isnan(var.data)) - # reduce to 2d using 'any' in loop - there might be a cleverer way + # reduce to 2d using "any" in loop - there might be a cleverer way while var_has_data.ndim > 2: var_has_data = np.any(var_has_data, axis=(var_has_data.ndim - 3)) assert var_has_data.shape == grid_shape @@ -570,11 +570,11 @@ def get_lonlat_ranges_for_curvi(ds): lon_range_where_data = (lons_where_data.min(), lons_where_data.max()) lat_range_where_data = (lats_where_data.min(), lats_where_data.max()) - print('For this curvilinear dataset:') - print(f' Lon range where data {lon_range_where_data} ' - f'(overall {lons.data.min(), lons.data.max()})') - print(f' Lat range where data {lat_range_where_data} ' - f'(overall {lats.data.min(), lats.data.max()})') + print("For this curvilinear dataset:") + print(f" Lon range where data {lon_range_where_data} " + f"(overall {lons.data.min(), lons.data.max()})") + print(f" Lat range where data {lat_range_where_data} " + f"(overall {lats.data.min(), lats.data.max()})") return (lon_range_where_data, lat_range_where_data) @@ -592,14 +592,14 @@ def get_lonlat_ranges_for_curvi(ds): ## for val in actual_range)) ## -def 
check_in_range(actual_range, requested_range, label='', **kwargs): +def check_in_range(actual_range, requested_range, label="", **kwargs): """ check whether the range of values lies WITHIN the requested range of values; allow for the fact that the requested range might be passed in decreasing order """ if not is_in_range(actual_range, requested_range, **kwargs): - raise Exception(f'range check for {label} failed') - print(f'{label}: Verified range {actual_range} within {requested_range}') + raise Exception(f"range check for {label} failed") + print(f"{label}: Verified range {actual_range} within {requested_range}") def is_in_range(actual_range, requested_range): @@ -614,7 +614,7 @@ def is_in_range(actual_range, requested_range): -def check_equal(vals, exp_vals, label=''): +def check_equal(vals, exp_vals, label=""): """ Check whether the values match the expected values """ @@ -622,16 +622,16 @@ def check_equal(vals, exp_vals, label=''): vals = get_data(vals) exp_vals = get_data(exp_vals) - #print(f'\n\n============ {label} =========\n\n') - #print(f'Actual vals: {vals}') - #print(f'Expected vals: {exp_vals}') + #print(f"\n\n============ {label} =========\n\n") + #print(f"Actual vals: {vals}") + #print(f"Expected vals: {exp_vals}") if not np.array_equal(vals, exp_vals): ST() - raise Exception(f'equal values assertion failed for {label}:' - f'actual {vals}, expected: {exp_vals}') + raise Exception(f"equal values assertion failed for {label}:" + f"actual {vals}, expected: {exp_vals}") - print(f'{label}: checked {len(vals)} values match expected values') + print(f"{label}: checked {len(vals)} values match expected values") def check_result(result, expect, subset_params): @@ -653,55 +653,55 @@ def check_result(result, expect, subset_params): """ fn, = result.file_uris - #os.system(f'ls -l {fn}') ## FIXME: remove? + #os.system(f"ls -l {fn}") ## FIXME: remove? 
dsout = xr.open_dataset(fn) - dump_dims(dsout, label='output') + dump_dims(dsout, label="output") lons = get_lons(dsout) lats = get_lats(dsout) - if is_curvi(lons, lats) and 'area' in subset_params: - area = subset_params['area'] + if is_curvi(lons, lats) and "area" in subset_params: + area = subset_params["area"] req_lon_range = (area[0], area[2]) req_lat_range = (area[1], area[3]) lon_range, lat_range = get_lonlat_ranges_for_curvi(dsout) if lon_range is not None: - #check_in_range(lon_range, req_lon_range, label='longitudes', wrap=expect['wrap_lon']) - print('Checking that lon-lat values with (unmasked) data in requested range') - check_in_range(lon_range, req_lon_range, label='longitudes') - check_in_range(lat_range, req_lat_range, label='latitudes') + #check_in_range(lon_range, req_lon_range, label="longitudes", wrap=expect["wrap_lon"]) + print("Checking that lon-lat values with (unmasked) data in requested range") + check_in_range(lon_range, req_lon_range, label="longitudes") + check_in_range(lat_range, req_lat_range, label="latitudes") else: - print('Skipping lon/lat range check: did not find any data in requested range') + print("Skipping lon/lat range check: did not find any data in requested range") else: - expected_lons = expect['lons_in_range'] - expected_lats = expect['lats_in_range'] + expected_lons = expect["lons_in_range"] + expected_lats = expect["lats_in_range"] - check_equal(lons, expected_lons, label='longitudes') - check_equal(lats, expected_lats, label='latitudes') + check_equal(lons, expected_lons, label="longitudes") + check_equal(lats, expected_lats, label="latitudes") - expected_levs = expect['levs_in_range'] + expected_levs = expect["levs_in_range"] if expected_levs is not None: levs = get_levs(dsout) - check_equal(levs, expected_levs, label='levels') + check_equal(levs, expected_levs, label="levels") - expected_times = expect['times_in_range'] + expected_times = expect["times_in_range"] if expected_times is not None: times = get_times(dsout) - check_equal(times, expected_times, label='times') + check_equal(times, expected_times, label="times") def get_axis_by_direction(ds, direction): """ - Get the axis with the specified direction attribute ('X', 'Y', 'Z' or 'T') + Get the axis with the specified direction attribute ("X", "Y", "Z" or "T") or if there is none, returns None. (If more than one, raises an exception.) 
""" axes = [] for name in ds.dims: axis = ds[name] - if name == 'bnds': + if name == "bnds": continue - if hasattr(axis, 'axis') and axis.axis == direction: + if hasattr(axis, "axis") and axis.axis == direction: axes.append(axis) if len(axes) > 1: raise Exception(f"more than one dimension with axis {direction}") @@ -711,6 +711,6 @@ def get_axis_by_direction(ds, direction): return None -if __name__ == '__main__': +if __name__ == "__main__": random.seed(0) ## FIXME - remove main() From beaabb946cb5bfc37fabbbc8dad3e1a8d6874486 Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Mon, 15 May 2023 14:13:33 +0100 Subject: [PATCH 03/12] tidy up a little (remove commented-out code) --- .../run_data_pools_checks.py | 77 +------------------ 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index 1714267..5a7876a 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -1,5 +1,3 @@ -from pdb import set_trace as ST - import os os.environ["USE_PYGEOS"] = "0" import random @@ -182,16 +180,9 @@ def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=No area = get_rand_area(ds, **area_args) req_area = [float(v) for v in area["req_area"]] - ### # it seems that subset treats a longitude range e.g. [350, 10] as [10, 350] - ### # change it to [350, 370] etc - ### if area_p[2] < area_p[0]: - ### #area_p[2] += 360 - ### area_p[0] -= 360 - ### params["area"] = req_area expect["lons_in_range"] = area["lons_in_range"] - ## expect["wrap_lon"] = area["wrap_lon"] expect["lats_in_range"] = area["lats_in_range"] else: expect["lons_in_range"] = get_lons(ds) @@ -325,11 +316,7 @@ def get_rand_lon_range(ds, force_wrap=False): else: params = {"can_wrap": can_wrap} return get_rand_range(lons, **params) - - ## wrap_lon = get_wrap_lon(lons) - ## print(f"wrap long = {wrap_lon}") - ## return get_rand_range(lons, can_wrap=(wrap_lon is not None)), wrap_lon - + def get_rand_lat_range(ds, force_pole=None): """ @@ -354,12 +341,6 @@ def get_rand_lat_range(ds, force_pole=None): else: return get_rand_range(lats) -### def get_rand_lev_range(ds): -### return get_rand_range(get_levs(ds)) -### -### def get_rand_time_range(ds): -### return get_rand_range(get_times(ds)) -### def get_rand_range(var, max_fraction=.1, can_wrap=False, force=None): """ @@ -463,43 +444,6 @@ def get_levs(ds): "Get the height coordinate variable for a dataset" return get_axis_by_direction(ds, "Z") -## ------------------------------------------------------ -## Now commented out anything to do with getting ranges, -## except in the case of curvilinear grid. Code now tests -## the set of actual coord values rather than just the range, -## where 1d coord variables. 
-## ------------------------------------------------------ -## -### def get_lon_range(ds, wrap_lon=None): -### lons = get_lons(ds) -### if wrap_lon is not None: -### ST() -### return get_range(get_lons(ds)) -### -### def get_lat_range(ds): -### return get_range(get_lats(ds)) -### -### def get_lev_range(ds): -### return get_range(get_levs(ds)) -### -### def get_time_range(ds): -### return get_range(get_times(ds)) -### -### def get_range(var): -### return (var.min(), var.max()) -## -## def get_lon_lat_ranges(ds, wrap_lon=None): -## -## lons = get_lons(ds) -## lats = get_lats(ds) -## -## if is_curvi(lons, lats): -## return get_lonlat_ranges_for_curvi(ds, lons, lats) -## else: -## if wrap_lon is not None: -## ST() ## FIXME -## return (get_range(lons), get_range(lats)) - def get_var_by_stdname(ds, stdname): """ @@ -544,9 +488,6 @@ def get_lonlat_ranges_for_curvi(ds): # (start with 2d array of False, then use logical OR with # each variable, although probably there is only one such var) # - ##vars_on_grid = [v for v in ds.variables.values() - ## if v.dims[-2:] == grid_dims and - ## v.attrs.get("standard_name") not in ("longitude", "latitude")] vars_on_grid = [v for v in ds.data_vars.values() if v.dims[-2:] == grid_dims] if not vars_on_grid: @@ -579,19 +520,6 @@ def get_lonlat_ranges_for_curvi(ds): return (lon_range_where_data, lat_range_where_data) -## def is_in_range(actual_range, requested_range, wrap=None, modulus=360): -## req0, req1 = requested_range -## if wrap is not None and req0 > req1: -## return all((req0 <= val < wrap -## or wrap - modulus <= val <= req1 -## for val in actual_range)) -## else: -## if req0 > req1: -## req0, req1 = req1, req0 -## return all((req0 <= val <= req1 -## for val in actual_range)) -## - def check_in_range(actual_range, requested_range, label="", **kwargs): """ check whether the range of values lies WITHIN the requested range of values; @@ -627,7 +555,6 @@ def check_equal(vals, exp_vals, label=""): #print(f"Expected vals: {exp_vals}") if not np.array_equal(vals, exp_vals): - ST() raise Exception(f"equal values assertion failed for {label}:" f"actual {vals}, expected: {exp_vals}") @@ -653,7 +580,6 @@ def check_result(result, expect, subset_params): """ fn, = result.file_uris - #os.system(f"ls -l {fn}") ## FIXME: remove? 
dsout = xr.open_dataset(fn) dump_dims(dsout, label="output") @@ -666,7 +592,6 @@ def check_result(result, expect, subset_params): lon_range, lat_range = get_lonlat_ranges_for_curvi(dsout) if lon_range is not None: - #check_in_range(lon_range, req_lon_range, label="longitudes", wrap=expect["wrap_lon"]) print("Checking that lon-lat values with (unmasked) data in requested range") check_in_range(lon_range, req_lon_range, label="longitudes") check_in_range(lat_range, req_lat_range, label="latitudes") From 1c495d12d49c7dd4dc6220da90dc05d6dbf8b25b Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Tue, 29 Aug 2023 23:08:41 +0100 Subject: [PATCH 04/12] add the means to write results to database --- tests/data_pools_checks/results_db.py | 127 ++++++++++++++++++ .../run_data_pools_checks.py | 98 +++++++++++--- 2 files changed, 204 insertions(+), 21 deletions(-) create mode 100644 tests/data_pools_checks/results_db.py diff --git a/tests/data_pools_checks/results_db.py b/tests/data_pools_checks/results_db.py new file mode 100644 index 0000000..ad096b9 --- /dev/null +++ b/tests/data_pools_checks/results_db.py @@ -0,0 +1,127 @@ +import os +from itertools import chain +import csv +import gzip +import sqlite3 + + +class ResultsDB: + + def __init__(self, + columns, + csvgz_file='tests.csv.gz', sqlite_file='tests_tmp.db', + sql_table_name = 'test_results', + sql_primary_key='id', + merge_every=100): + self.columns = columns[:] + self._csvgz_file = csvgz_file + self._sqlite_file = sqlite_file + self._conn = None + self._cur = None + self._sql_table_name = sql_table_name + self._sql_primary_key = sql_primary_key + self._merge_every = merge_every + self._num_tmp_rows = 0 + + def read_csvgz(self, src=None): + "read .csv.gz, yield a sequence of rows (each is a list)" + if src is None: + src = self._csvgz_file + if not os.path.exists(src): + return [] + with gzip.open(src, 'rt') as f: + reader = csv.reader(f) + headers = next(reader) + if headers != self.columns: + raise Exception('CSV file does not have the expected columns') + for row in reader: + yield row + + def write_csvgz(self, rows, dest=None): + "write .csv.gz, input is a sequence of rows" + if dest is None: + dest = self._csvgz_file + tmpname = dest.replace('.gz', '.tmp.gz') + try: + with gzip.open(tmpname, 'wt') as fout: + writer = csv.writer(fout) + writer.writerow(self.columns) + for row in rows: + writer.writerow(row) + os.rename(tmpname, dest) + finally: + if os.path.exists(tmpname): + os.remove(tmpname) + + def add_row(self, **row_dict): + "add a single row - goes into the sqlite file" + cur, conn = self._get_cur_conn() + sql_keys = ','.join(row_dict.keys()) + sql_vals = ','.join((f"'{str(val)}'" for val in row_dict.values())) + sql = f'insert into {self._sql_table_name} ({sql_keys}) values ({sql_vals})' + cur.execute(sql) + conn.commit() + self._num_tmp_rows += 1 + if self._num_tmp_rows == self._merge_every: + self.merge_and_tidy() + + def _init_sqlite(self): + sql_columns = ','.join([f'{self._sql_primary_key} integer PRIMARY KEY'] + + [f'{col} text' for col in self.columns]) + sql = f'CREATE TABLE IF NOT EXISTS {self._sql_table_name}({sql_columns})' + conn = sqlite3.connect(self._sqlite_file) + cur = conn.cursor() + cur.execute(sql) + self._conn = conn + self._cur = cur + + def _get_cur_conn(self): + if self._cur is None or self._conn is None: + self._init_sqlite() + return self._cur, self._conn + + def _get_sqlite_rows(self, include_primary_key=False): + cur, conn = self._get_cur_conn() + cur.execute(f'SELECT * from {self._sql_table_name}') + for 
row in cur: + if include_primary_key: + yield row + else: + yield row[1:] + + def _destroy_sqlite(self): + if os.path.exists(self._sqlite_file): + os.remove(self._sqlite_file) + self._conn = self._cur = None + self._num_tmp_rows = 0 + + def merge_and_tidy(self): + csv_rows = self.read_csvgz() + new_rows = self._get_sqlite_rows() + self.write_csvgz(chain(csv_rows, new_rows)) + self._destroy_sqlite() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + self.merge_and_tidy() + + +if __name__ == "__main__": + import random + + def dump_db(): + print("=====================") + os.system('zcat tests.csv.gz') + print() + os.system('echo .dump | sqlite3 -header -readonly tests_tmp.db') + print("=====================") + + columns = ['test_location', 'test_time', 'collection', 'area', + 'level', 'time', 'success', 'message'] + + with ResultsDB(columns, merge_every=7) as rdb: + for _ in range(13): + rdb.add_row(collection='foo', success='False') + rdb.add_row(collection=str(random.random()), time='blah', success='True') diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index 5a7876a..fbd8435 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -6,35 +6,87 @@ import datetime import numpy as np import shutil +import time from daops.ops.subset import subset from roocs_utils.xarray_utils.xarray_utils import open_xr_dataset from roocs_utils.parameter.param_utils import time_interval, level_interval +from results_db import ResultsDB from examples_data import data_pool_tests_db from caching import cached_output_fn, CachedResult -def main(): - for collection in data_pool_tests_db: - test_subset_in_data_pools(collection) - + +def main(test_location=''): + + results_db_columns = ['test_location', 'test_time', 'collection', 'area', + 'level', 'time', 'success', 'message'] -def test_subset_in_data_pools(collection): + with ResultsDB(results_db_columns) as rdb: + + row_writer = lambda **kwargs: \ + rdb.add_row(test_location=test_location, + test_time=time.strftime("%Y-%m-%d %H:%M:%S"), + **kwargs) + + for collection in data_pool_tests_db: + test_subset_in_data_pools(collection, row_writer) + + +def test_subset_in_data_pools(collection, row_writer=None): """ Do a range of tests on a given collection """ - + ds = open_dataset(collection) + for test_kwargs, description in get_tests(): + + print(description) + subset_params, success, message = do_test(collection, ds, **test_kwargs) + + if row_writer is not None: + store_result(row_writer, + collection, subset_params, success, message) + + +def store_result(row_writer, collection, subset_params, success, message): + tm = subset_params.get('time') + if tm is not None: + tm = '/'.join(str(v) for v in tm.value) + level = subset_params.get('level') + if level is not None: + level = '/'.join(str(float(v)) for v in level.value) + area = subset_params.get('area') + if area is not None: + area = '/'.join(str(v) for v in area) + success = str(success) + + row_writer(collection=collection, + area=area, + level=level, + time=tm, + success=success, + message=message) + + +def get_tests(): + """ + Gets a sequence of tests to do. 
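The results recording above works in two stages: ResultsDB.add_row writes each row to a small sqlite staging file, and merge_and_tidy folds the staged rows into the gzipped CSV every merge_every rows and again when the with-block exits. A minimal usage sketch, mirroring the __main__ demo above (file names and the collection id are illustrative):

from results_db import ResultsDB

columns = ['test_location', 'test_time', 'collection', 'area',
           'level', 'time', 'success', 'message']

with ResultsDB(columns, csvgz_file='results.csv.gz',
               sqlite_file='results_tmp.db', merge_every=10) as rdb:
    # rows may be sparse; columns that are not supplied are left empty
    rdb.add_row(collection='CMIP6.some.dataset.id', success='True')
    rdb.add_row(collection='CMIP6.some.dataset.id', success='False',
                message='range check for longitudes failed')
# on exit, merge_and_tidy() has flushed all staged rows into results.csv.gz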
+ """ + # subset horizontally and in time for _ in range(3): - do_test(collection, ds, do_area=True, do_time=True) + yield {'do_area': True, + 'do_time': True}, 'subset by area and time' # subset in all dimensions (though levels will be ignored if there isn't # a level dimension) for _ in range(3): - do_test(collection, ds, do_area=True, do_time=True, do_levels=True) + yield {'do_area': True, + 'do_time': True, + 'do_levels': True}, 'subset by area, time and levels' # and a few special cases to try special_cases = [{"force_lon_wrap": True}, @@ -43,14 +95,11 @@ def test_subset_in_data_pools(collection): {"force_lon_wrap": True, "force_pole": "south"}] for area_args in special_cases: - print(f"Doing special case: {area_args}") - do_test(collection, ds, do_area=True, do_time=True, do_levels=True, - area_args=area_args) - - # to delete? - extra test that got left in - # (not removing for now due to existing cache) - do_test(collection, ds, do_area=True, do_time=True, do_levels=True) - + yield {'do_area': True, + 'do_time': True, + 'do_levels': True, + 'area_args': area_args}, f"Doing special case: {area_args}" + def open_dataset(collection): @@ -94,7 +143,7 @@ def dump_dims(ds, label=""): def get_data(val): return val.data if isinstance(val, (xr.DataArray, xr.IndexVariable)) else val - + def do_test(collection, ds, use_cache=True, **kwargs): """ Do an individual test on a collection @@ -150,16 +199,22 @@ def do_test(collection, ds, use_cache=True, **kwargs): except KeyboardInterrupt: raise except Exception as exc: + success = False + message = str(exc) print("*******************\n" "*** TEST FAILED ***\n" "*******************\n\n\n") - raise # FIXME: comment out + #raise # FIXME: comment out else: + success = True + message = "" print("Test succeeded\n\n\n") finally: temp_dir.cleanup() - + return subset_params, success, message + + def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=None): """ returns the params to the subset function that will be needed for the test @@ -635,7 +690,8 @@ def get_axis_by_direction(ds, direction): else: return None - + if __name__ == "__main__": random.seed(0) ## FIXME - remove - main() + main(test_location='CEDA') + From dfb401e6af662d009a9bbdf8b7671fcb15842d39 Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Mon, 4 Sep 2023 17:42:03 +0100 Subject: [PATCH 05/12] tweaks for results_db and add unit tests --- tests/data_pools_checks/results_db.py | 20 +++- tests/data_pools_checks/test_results_db.py | 106 +++++++++++++++++++++ 2 files changed, 124 insertions(+), 2 deletions(-) create mode 100644 tests/data_pools_checks/test_results_db.py diff --git a/tests/data_pools_checks/results_db.py b/tests/data_pools_checks/results_db.py index ad096b9..c800416 100644 --- a/tests/data_pools_checks/results_db.py +++ b/tests/data_pools_checks/results_db.py @@ -37,6 +37,22 @@ def read_csvgz(self, src=None): for row in reader: yield row + + def _read_as_dicts(self, reader, remove_blank=True, **kwargs): + if remove_blank: + cond = lambda t: t[1] + else: + cond = lambda t: True + + return (dict(t for t in zip(self.columns, row) if cond(t)) + for row in reader(**kwargs)) + + def read_csvgz_as_dicts(self, **kwargs): + return self._read_as_dicts(self.read_csvgz, **kwargs) + + def read_sqlite_as_dicts(self, **kwargs): + return self._read_as_dicts(self.get_sqlite_rows, **kwargs) + def write_csvgz(self, rows, dest=None): "write .csv.gz, input is a sequence of rows" if dest is None: @@ -80,7 +96,7 @@ def _get_cur_conn(self): self._init_sqlite() return self._cur, 
self._conn - def _get_sqlite_rows(self, include_primary_key=False): + def get_sqlite_rows(self, include_primary_key=False): cur, conn = self._get_cur_conn() cur.execute(f'SELECT * from {self._sql_table_name}') for row in cur: @@ -97,7 +113,7 @@ def _destroy_sqlite(self): def merge_and_tidy(self): csv_rows = self.read_csvgz() - new_rows = self._get_sqlite_rows() + new_rows = self.get_sqlite_rows() self.write_csvgz(chain(csv_rows, new_rows)) self._destroy_sqlite() diff --git a/tests/data_pools_checks/test_results_db.py b/tests/data_pools_checks/test_results_db.py new file mode 100644 index 0000000..67e2d5f --- /dev/null +++ b/tests/data_pools_checks/test_results_db.py @@ -0,0 +1,106 @@ +import os +import random +import subprocess as sp + +from results_db import ResultsDB + + +columns = ['test_location', 'test_time', 'collection', 'area', + 'level', 'time', 'success', 'message'] + +tst_data = [{'collection': str(random.random()), + 'time': 'blah', + 'success': 'True'} + for _ in range(30)] + + +tst_data2 = [{'collection': 'foo', + 'time': 'bar'}, + {'collection': 'baz', + 'message': 'qux'}, + {'collection': 'quux', + 'message': 'corge'}] + +tst_csv_data = (','.join(columns) + '\n' + + ',,foo,,,bar,,\n,,baz,,,,,qux') + +tst_sql_data = f'''PRAGMA foreign_keys=OFF; +BEGIN TRANSACTION; +CREATE TABLE test_results(id integer PRIMARY KEY,{','.join(f'{col} text' for col in columns)}); +INSERT INTO "test_results" VALUES(1,NULL,NULL,'quux',NULL,NULL,NULL,NULL,'corge'); +COMMIT;''' + +tmp_csvgz = '/tmp/test.csv.gz' +tmp_sqlite = '/tmp/test_tmp.sql' + +def _purge(): + for path in (tmp_csvgz, tmp_sqlite): + if os.path.exists(path): + os.remove(path) + +def _open_test_db(merge_every=10): + return ResultsDB(columns, + csvgz_file=tmp_csvgz, + sqlite_file=tmp_sqlite, + merge_every=merge_every) + + +def test_write_and_read(): + _purge() + with _open_test_db() as rdb: + for row in tst_data: + rdb.add_row(**row) + + with _open_test_db() as rdb: + data_read = list(rdb.read_csvgz_as_dicts()) + + assert tst_data == data_read + + +def test_write_and_read_while_open(): + + _purge() + merge_every = 7 + + with _open_test_db(merge_every=merge_every) as rdb: + for i, row in enumerate(tst_data, start=1): + rdb.add_row(**row) + + rows_in_csv = list(rdb.read_csvgz_as_dicts()) + rows_in_sqlite = list(rdb.read_sqlite_as_dicts()) + + print(f'CSV: {len(rows_in_csv)} rows, ' + f'sqlite: {len(rows_in_sqlite)} rows, ') + + assert rows_in_csv + rows_in_sqlite == tst_data[:i] + assert len(rows_in_csv) % merge_every == 0 + assert len(rows_in_sqlite) < merge_every + + +def test_manual_merge(): + + _purge() + merge_every = 7 + + with _open_test_db(merge_every=merge_every) as rdb: + for i, row in enumerate(tst_data, start=1): + rdb.add_row(**row) + rdb.merge_and_tidy() + + rows_in_csv = list(rdb.read_csvgz_as_dicts()) + rows_in_sqlite = list(rdb.read_sqlite_as_dicts()) + assert rows_in_csv == tst_data[:i] + assert rows_in_sqlite == [] + + +def test_dump_data(): + + _purge() + with _open_test_db(merge_every=2) as rdb: + for row in tst_data2: + rdb.add_row(**row) + csvdata = sp.getoutput(f'zcat {tmp_csvgz}') + sqldata = sp.getoutput('echo .dump ' + f'| sqlite3 -header {tmp_sqlite}') + assert csvdata == tst_csv_data + assert sqldata == tst_sql_data From 9de8463956442915a71cc1ff1bcb4b886b44cbdc Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Wed, 13 Sep 2023 16:32:46 +0100 Subject: [PATCH 06/12] field all NaNs check --- .../run_data_pools_checks.py | 104 ++++++++++++++++-- 1 file changed, 92 insertions(+), 12 deletions(-) diff --git 
a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index fbd8435..5b8cc6d 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -194,17 +194,15 @@ def do_test(collection, ds, use_cache=True, **kwargs): output_dir=tmpdir, **subset_params) - check_result(result, expect, subset_params) + check_result(result, expect, collection, subset_params, tmpdir) except KeyboardInterrupt: raise except Exception as exc: success = False message = str(exc) - print("*******************\n" - "*** TEST FAILED ***\n" - "*******************\n\n\n") - #raise # FIXME: comment out + print_error(f"TEST FAILED: {message}") + raise # FIXME: comment out else: success = True message = "" @@ -213,7 +211,14 @@ def do_test(collection, ds, use_cache=True, **kwargs): temp_dir.cleanup() return subset_params, success, message - + + +def print_error(message): + stars = "*" * len(message) + print(f"****{stars}****\n" + f"*** {message} ***\n" + f"****{stars}****\n\n\n") + def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=None): """ @@ -522,7 +527,26 @@ def is_curvi(lons, lats): return True else: raise Exception(f"unexpected dimensionality of lon, lat arrays: {lon.dims} and {lat.dims}") - + + +def get_gridded_vars(ds, lons=None, lats=None, curvi=None): + if lons is None: + lons = get_lons(ds) + if lats is None: + lats = get_lats(ds) + if curvi is None: + curvi = is_curvi(lons, lats) + + if curvi: + grid_dims = lons.dims + else: + latdim, = lats.dims + londim, = lons.dims + grid_dims = (latdim, londim) + return [v for v in ds.data_vars.values() + if v.dims[-2:] == grid_dims] + + def get_lonlat_ranges_for_curvi(ds): """ get ranges of lon, lat values where there is actual data (not masked) @@ -532,7 +556,6 @@ def get_lonlat_ranges_for_curvi(ds): lons = get_lons(ds) lats = get_lats(ds) - grid_dims = lons.dims grid_shape = lons.shape # get a list of the variables that are on the lon-lat grid, @@ -543,8 +566,9 @@ def get_lonlat_ranges_for_curvi(ds): # (start with 2d array of False, then use logical OR with # each variable, although probably there is only one such var) # - vars_on_grid = [v for v in ds.data_vars.values() - if v.dims[-2:] == grid_dims] + vars_on_grid = get_gridded_vars(ds, + lons=lons, lats=lats, + curvi=True) if not vars_on_grid: raise Exception("could not find any data variables") has_data = np.zeros(grid_shape, dtype=bool) @@ -616,7 +640,7 @@ def check_equal(vals, exp_vals, label=""): print(f"{label}: checked {len(vals)} values match expected values") -def check_result(result, expect, subset_params): +def check_result(result, expect, collection, subset_params, tmpdir): """ Do various checks on the result of subsetting. 
result should be an @@ -640,7 +664,10 @@ def check_result(result, expect, subset_params): lons = get_lons(dsout) lats = get_lats(dsout) - if is_curvi(lons, lats) and "area" in subset_params: + + curvi = is_curvi(lons, lats) + + if curvi and "area" in subset_params: area = subset_params["area"] req_lon_range = (area[0], area[2]) req_lat_range = (area[1], area[3]) @@ -668,7 +695,16 @@ def check_result(result, expect, subset_params): if expected_times is not None: times = get_times(dsout) check_equal(times, expected_times, label="times") + timeval = times[0] + else: + timeval = None + + vars_on_grid = get_gridded_vars(dsout, + lons=lons, lats=lats, + curvi=curvi) + do_nans_check(vars_on_grid, collection, timeval, tmpdir) + def get_axis_by_direction(ds, direction): """ @@ -690,6 +726,50 @@ def get_axis_by_direction(ds, direction): else: return None + +def do_nans_check(vars_on_grid, collection, timeval, tmpdir): + """ + Check whether any of the variables consist of all NaN values. + If so, then extract (for one timestep) the variable on all grid points. + If that variable consists of some but not all NaN values, then assume + that the subsetter is working and we are just requesting a region where + there are NaN values, so downgrade to a warning -- otherwise raise + an exception. + """ + + vars_with_nans = set() + for var in vars_on_grid: + if np.all(np.isnan(var.data)): + vars_with_nans.add(var.name) + + if vars_with_nans: + dsout2 = get_fullfield(collection, tmpdir, + timeval=timeval) + + for varname in vars_with_nans: + isnan_ref = np.isnan(dsout2[varname].data) + if np.any(isnan_ref) and not np.all(isnan_ref): + print(f"Warning: {varname} contains all NaNs (but some non-NaNs exist in full field).") + else: + raise Exception(f"Variable {varname} contains all NaNs.") + + + + +def get_fullfield(collection, tmpdir, timeval=None): + """ + Get the result of subsetting, optionally by time to a particular + time value, but not in any other dimensions. 
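The single-timestep extraction described here backs up the all-NaNs check: if every value of a gridded variable in the subset is NaN, the same variable is re-read over the whole grid for one timestep (by pinning both ends of the interval to the same timestamp with time_interval(t_iso, t_iso)). A mix of NaN and real values in that full field means the requested region simply fell on masked points, so the failure is downgraded to a warning; otherwise an exception is raised. A condensed restatement of that decision (the function name is illustrative; the real logic lives in do_nans_check):

import numpy as np

def all_nan_subset_is_acceptable(fullfield_values):
    # fullfield_values: the same variable over the full grid at one timestep
    isnan_ref = np.isnan(fullfield_values)
    # mixed NaN / non-NaN in the full field => the requested region is just masked
    return bool(isnan_ref.any() and not isnan_ref.all())

# e.g. an ocean-only variable subset entirely over land would return True here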
+ """ + params = {} + if timeval is not None: + t_iso = timeval.data.tolist().isoformat() + params['time'] = time_interval(t_iso, t_iso) + + result = subset(collection, output_dir=tmpdir, **params) + fn, = result.file_uris + return xr.open_dataset(fn) + if __name__ == "__main__": random.seed(0) ## FIXME - remove From 7003d26ec9560d7d7827ec7ae922043e094d8e12 Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Wed, 13 Sep 2023 17:39:50 +0100 Subject: [PATCH 07/12] refactor as class and add caching for get_fullfield --- .../run_data_pools_checks.py | 1466 +++++++++-------- 1 file changed, 742 insertions(+), 724 deletions(-) diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index 5b8cc6d..2db0c1b 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -18,760 +18,778 @@ -def main(test_location=''): - results_db_columns = ['test_location', 'test_time', 'collection', 'area', - 'level', 'time', 'success', 'message'] - - with ResultsDB(results_db_columns) as rdb: - - row_writer = lambda **kwargs: \ - rdb.add_row(test_location=test_location, - test_time=time.strftime("%Y-%m-%d %H:%M:%S"), - **kwargs) - - for collection in data_pool_tests_db: - test_subset_in_data_pools(collection, row_writer) - - -def test_subset_in_data_pools(collection, row_writer=None): - """ - Do a range of tests on a given collection - """ - - ds = open_dataset(collection) - - for test_kwargs, description in get_tests(): - - print(description) - subset_params, success, message = do_test(collection, ds, **test_kwargs) - - if row_writer is not None: - store_result(row_writer, - collection, subset_params, success, message) - - -def store_result(row_writer, collection, subset_params, success, message): - tm = subset_params.get('time') - if tm is not None: - tm = '/'.join(str(v) for v in tm.value) - level = subset_params.get('level') - if level is not None: - level = '/'.join(str(float(v)) for v in level.value) - area = subset_params.get('area') - if area is not None: - area = '/'.join(str(v) for v in area) - success = str(success) - - row_writer(collection=collection, - area=area, - level=level, - time=tm, - success=success, - message=message) - - -def get_tests(): - """ - Gets a sequence of tests to do. - """ - - # subset horizontally and in time - for _ in range(3): - yield {'do_area': True, - 'do_time': True}, 'subset by area and time' - - # subset in all dimensions (though levels will be ignored if there isn't - # a level dimension) - for _ in range(3): - yield {'do_area': True, - 'do_time': True, - 'do_levels': True}, 'subset by area, time and levels' - - # and a few special cases to try - special_cases = [{"force_lon_wrap": True}, - {"force_pole": "north"}, - {"force_pole": "south"}, - {"force_lon_wrap": True, "force_pole": "south"}] - - for area_args in special_cases: - yield {'do_area': True, - 'do_time': True, - 'do_levels': True, - 'area_args': area_args}, f"Doing special case: {area_args}" - - -def open_dataset(collection): - - print(f"opening {collection}") +class SubsetTester: - if collection[0] == "/": - return xr.open_dataset(collection) - - #==================================== - ## THIS OUGHT TO WORK, but I need to work out how to configure the paths - ## for the tests. 
strace shows that it is using /tmp/roocs.ini but this - ## is overwritten every time (tests/conftest.py gets imported and - ## write_roocs_cfg() is run) - ## - ## strace also says that it is also looking in all of these: - ## /clisops/clisops/etc/roocs.ini - ## /daops/daops/etc/roocs.ini - ## /roocs-utils/roocs_utils/etc/roocs.ini - ## - ## For now, symlinking ~/.mini-esgf-data/master/test_data/badc - ## to point to the real /badc will do for a workaround -- now it finds - ## the dataset - ds = open_xr_dataset(collection) - - ## OR HERE IS ANOTHER POSSIBLE TEMPORARY WORKAROUND - # import glob # for open_xr_dataset workaround - # assert isinstance(collection, str) - #paths = glob.glob(f'/badc/cmip6/data/{collection.replace(".","/")}/*.nc') - #ds = open_xr_dataset(paths) - #==================================== - - return ds + def __init__(self, + test_location='', + use_cache=True): -def dump_dims(ds, label=""): - ignore = ("bnds", "vertices") - print(f"{label} dataset dimensions: " - f'{",".join(f"{k}:{v}" for k, v in ds.dims.items() if k not in ignore)}') + self._test_location = test_location + self._use_cache = use_cache + self._tmpdir = None -def get_data(val): - return val.data if isinstance(val, (xr.DataArray, xr.IndexVariable)) else val - + def main(self): -def do_test(collection, ds, use_cache=True, **kwargs): - """ - Do an individual test on a collection - """ - - print(f"Doing test on {collection} using {kwargs}") - dump_dims(ds, label="input") - subset_params, expect = prepare_test(ds, **kwargs) - - temp_dir = tempfile.TemporaryDirectory() - tmpdir = temp_dir.name - - print("===========================================") - print("Doing test with:") - print(f"\n Collection: {collection}") - print("\n Subset params:") - for k, v in subset_params.items(): - if k == "time": - v = f"time_interval({v.value})" - elif k == "level": - v = f"level_interval({tuple([float(lev) for lev in v.value])})" - print(f" {k} : {v}") - print("\n Expect to get:") - for k, v in expect.items(): - if v is not None: - print(f" {k} : {get_data(v)}") - if all(k in expect and expect[k] is None - for k in ("lons_in_range", "lats_in_range")): - print(" (curvilinear grid; will test lon/lat ranges, not exact vals)") - - print("\n===========================================") - - try: - if use_cache: - cached_fn = cached_output_fn(collection, subset_params) - if os.path.exists(cached_fn): - print(f"using cache: {cached_fn}") - result = CachedResult(cached_fn) - else: - result = subset(collection, - output_dir=tmpdir, - **subset_params) - fn, = result.file_uris - print(f"caching: {cached_fn}") - shutil.copy(fn, cached_fn) - else: - result = subset(collection, - output_dir=tmpdir, - **subset_params) + results_db_columns = ['test_location', 'test_time', 'collection', 'area', + 'level', 'time', 'success', 'message'] - check_result(result, expect, collection, subset_params, tmpdir) - - except KeyboardInterrupt: - raise - except Exception as exc: - success = False - message = str(exc) - print_error(f"TEST FAILED: {message}") - raise # FIXME: comment out - else: - success = True - message = "" - print("Test succeeded\n\n\n") - finally: - temp_dir.cleanup() - - return subset_params, success, message - - -def print_error(message): - stars = "*" * len(message) - print(f"****{stars}****\n" - f"*** {message} ***\n" - f"****{stars}****\n\n\n") - - -def prepare_test(ds, do_area=False, do_time=False, do_levels=False, area_args=None): - """ - returns the params to the subset function that will be needed for the test - and a 
dictionary of things to expect to come back from the test - - The boolean inputs do_area, do_time, do_levels control whether to subset - in each of these ways. The input area_args can contain a dictionary of arguments - to get_rand_area (ignored if do_area==False) - """ - - if area_args == None: - area_args = {} - - params = {} - expect = {} - - if do_area: - area = get_rand_area(ds, **area_args) - req_area = [float(v) for v in area["req_area"]] - - params["area"] = req_area - - expect["lons_in_range"] = area["lons_in_range"] - expect["lats_in_range"] = area["lats_in_range"] - else: - expect["lons_in_range"] = get_lons(ds) - expect["lats_in_range"] = get_lats(ds) - - if do_levels: - lev_int = get_rand_lev_int(ds) - if lev_int is not None: - params["level"] = lev_int["req_interval"] - expect["levs_in_range"] = lev_int["levs_in_range"] - else: - print("WARNING: not requesting level range as no level dimension found") - expect["levs_in_range"] = None - else: - expect["levs_in_range"] = get_levs(ds) - - if do_time: - time_int = get_rand_time_int(ds) - if time_int is not None: - params["time"] = time_int["req_interval"] - expect["times_in_range"] = time_int["times_in_range"] - else: - print("WARNING: not requesting time range as no time dimension found") - expect["times_in_range"] = None - else: - expect["times_in_range"] = get_times(ds) - - return params, expect - - -def get_rand_time_int(ds): - """ - Returns a dictionary containing a randomly chosen time interval - (in the form needed by the subset function) and the time values - that would be expected when subsetting using that interval - """ - times = get_times(ds) - if times is None: - return None - t_start, t_end, vals_in_range = get_rand_range(times) - ts_start = get_time_string(t_start) - ts_end = get_time_string(t_end) - return {"req_interval": time_interval(ts_start, ts_end), - "times_in_range": vals_in_range} - - -def get_rand_lev_int(ds): - """ - As get_rand_time_int, but for levels - """ - levs = get_levs(ds) - if levs is None: - return None - z_start, z_end, vals_in_range = get_rand_range(levs) - return {"req_interval": level_interval(z_start, z_end), - "levs_in_range": vals_in_range} - - -def get_time_string(when): - """ - Turns a datetime, or time value seen in xarray, - into a string understood by time_interval - """ - if isinstance(when, datetime.datetime): - t = when - else: - t = when.values.tolist() - - return (f"{t.year:04d}-{t.month:02d}-{t.day:02d}" - f"T{t.hour:02d}:{t.minute:02d}:{t.second:02d}") - - -def get_rand_area(ds, force_lon_wrap=False, force_pole=None): - """Returns a dictionary containing a randomly chosen area - (tuple of lon_w, lat_s, lon_e, lat_n) and the lon and lat - values that would be expected when subsetting using that area. - - In the curvilinear case (lon and lat variables in the file are 2d), the - expected values will be None rather than arrays of expected values. The - reason for this is that it is not possible to validate a *specific* set of - lon, lat values in any case (as although the subsetting will set any - points that are out of range to NaN, but NaN values could also be because - of missing data points within the range). 
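The randomly chosen area, time interval and level interval end up in the params/expect pair that _prepare_test returns to the test driver. For orientation, that pair has roughly the following shape for a test that subsets in all dimensions (the values are made up; expect holds the coordinate values the output is later checked against):

from roocs_utils.parameter.param_utils import time_interval, level_interval

subset_params = {
    "area": [12.5, -35.0, 30.0, -10.0],   # lon_w, lat_s, lon_e, lat_n
    "time": time_interval("1905-03-16T12:00:00", "1911-08-16T12:00:00"),
    "level": level_interval(50000.0, 85000.0),
}
expect = {
    "lons_in_range": ...,   # exact coordinate values predicted for 1d axes,
    "lats_in_range": ...,   # or None when the grid is curvilinear
    "levs_in_range": ...,   # expected level values (or None)
    "times_in_range": ...,  # expected time values (or None)
}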
- """ - - ## (lon0, lon1, lons_in_range), wrap_lon = get_rand_lon_range(ds) - (lon0, lon1, lons_in_range) = get_rand_lon_range(ds, force_wrap=force_lon_wrap) - (lat0, lat1, lats_in_range) = get_rand_lat_range(ds, force_pole=force_pole) - - return {"req_area": (lon0, lat0, lon1, lat1), - "lons_in_range": lons_in_range, - ## "wrap_lon": wrap_lon, - "lats_in_range": lats_in_range} - - -def get_wrap_lon(lons): - """ - Get the longitude at which the wrapping occurs. - Note - the exact value is not used in the current version of the calling code, - the main purpose here is to distinguish global from limited area, so that the - requested area for subsetting does not wrap for a limited area grid. - """ - minlon = lons.min() - maxlon = lons.max() - if maxlon - minlon < 270: - # assume this is a limited area - return None - elif maxlon - minlon >= 360: - raise Exception(f"too wide lon range {minlon} to {maxlon}") - elif 0 <= minlon and maxlon < 360: - return 360. - elif -180 <= minlon and maxlon < 180: - return 180. - else: - raise Exception(f"unsupported lon range {minlon} to {maxlon}") - + with ResultsDB(results_db_columns) as rdb: -def get_rand_lon_range(ds, force_wrap=False): - """ - Get a randomly chosen longitude range. This might include wrapping around - (unless the longitudes don't seem to span global range), but if force_wrap is - set to True then it ensures that this occurs. - """ - lons = get_lons(ds) - wrap_lon = get_wrap_lon(lons) - can_wrap=(wrap_lon is not None) - - if force_wrap: - print("WARNING: forcing longitude wrapping for what appears to be limited area model") - params = {"force": "wrap"} - else: - params = {"can_wrap": can_wrap} - return get_rand_range(lons, **params) - - -def get_rand_lat_range(ds, force_pole=None): - """ - Get a randomly chosen latitude range. If force_pole is set to "north" or "south", - then the range will extend to the relevant pole. - """ - - lats = get_lats(ds) - - # using "force" will ensure that the range returned - # by get_rand_range goes to the end of the latitude values, - # but for the test, actually use -90 or 90. Which value is - # to be overwritten will depend on the ordering. - # - if force_pole == "north": - ret = get_rand_range(lats, force="upper") - return (ret[0], 90., ret[2]) if ret[1] >= ret[0] else (90., ret[1], ret[2]) - params["force"] = "upper" - elif force_pole == "south": - ret = get_rand_range(lats, force="lower") - return (-90., ret[1], ret[2]) if ret[1] >= ret[0] else (ret[0], -90., ret[2]) - else: - return get_rand_range(lats) - - -def get_rand_range(var, max_fraction=.1, can_wrap=False, force=None): - """ - Get a random range from specified variable (which can be any number - of dimensions). Returns tuple of (lower, upper, values in range) - - min and max are chosen based on histogram of values, defaulting to returning - a range that includes up to to about 10% of the points (though can be less) - - can_wrap=True means could be used with longitude - e.g. for a wrapping longitude of - 360 it could return lower=-10 upper=10, and the values in range are in the range -10 to 10 - where those values from -10 to 0 are based on the values from 350 to 360 in the input - - force can be used for special cases: "lower" forces the range to include - the lower end (e.g. south pole for latitude), "upper" forces it to include - the upper end (e.g. 
north pole), "wrap" forces it to wrap around (the meridian - for longitude) - """ - - length = random.uniform(0, max_fraction) - - while True: - - did_wrap = False - if force == "lower": - lower_q = 0. - upper_q = length - elif force == "upper": - lower_q = 1 - length - upper_q = 1. - elif force == "wrap": - lower_q = random.uniform(1 - length, 1) - upper_q = lower_q + length - 1 - did_wrap = True - elif force is not None: - raise ValueError(f"unrecognised force value {force}") - elif can_wrap: - lower_q = random.uniform(0, 1) - upper_q = lower_q + length - did_wrap = upper_q > 1 - if did_wrap: - upper_q -= 1 - else: - lower_q = random.uniform(0, 1 - length) - upper_q = lower_q + length - - lower = var.quantile(lower_q) - upper = var.quantile(upper_q) - - if did_wrap: - in_range = (lower <= var) | (var <= upper) - if var.ndim == 1: - modulus = 360 - lower_vals = get_data(var[lower <= var]) - upper_vals = get_data(var[var <= upper]) - - if var.min() >= 0: - # data uses 0-360 - # change e.g. 350..10 to -10..10 - # (as subset function doesn't seem to like 350..370) - lower -= modulus - lower_vals -= modulus - else: - # data uses -180 to 180 - # change e.g. 170..-170 to 170..190 - upper += modulus - upper_vals += modulus - - vals_in_range = np.concatenate((lower_vals, upper_vals)) - else: - vals_in_range = None + row_writer = lambda **kwargs: \ + rdb.add_row(test_location=self._test_location, + test_time=time.strftime("%Y-%m-%d %H:%M:%S"), + **kwargs) + + for collection in data_pool_tests_db: + self.test_subset_in_data_pools(collection, row_writer) + + + def test_subset_in_data_pools(self, collection, row_writer=None): + """ + Do a range of tests on a given collection + """ + + ds = self._open_dataset(collection) + + for test_kwargs, description in self._get_tests(): + + print(description) + subset_params, success, message = self._do_test(collection, ds, **test_kwargs) + + if row_writer is not None: + self._store_result(row_writer, + collection, subset_params, success, message) + + + def _store_result(self, row_writer, collection, subset_params, success, message): + tm = subset_params.get('time') + if tm is not None: + tm = '/'.join(str(v) for v in tm.value) + level = subset_params.get('level') + if level is not None: + level = '/'.join(str(float(v)) for v in level.value) + area = subset_params.get('area') + if area is not None: + area = '/'.join(str(v) for v in area) + success = str(success) + + row_writer(collection=collection, + area=area, + level=level, + time=tm, + success=success, + message=message) + + + def _get_tests(self): + """ + Gets a sequence of tests to do. 
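A detail worth spelling out from the random-range logic above: when a longitude request wraps across the edge of the grid, it is re-expressed as a monotonic interval before being passed to subset, following the conventions noted in the comments (on a 0..360 grid the lower bound is shifted down by 360, on a -180..180 grid the upper bound is shifted up by 360). This is also what the forced-wrap special cases in the test sequence below exercise. A toy version of that adjustment, not the code used here:

def normalise_wrapped_lon_request(lon_w, lon_e, data_uses_0_360):
    # only called when the request wraps, i.e. lon_e < lon_w numerically
    if data_uses_0_360:
        return lon_w - 360.0, lon_e      # 350..10   ->  -10..10
    else:
        return lon_w, lon_e + 360.0      # 170..-170 ->  170..190

assert normalise_wrapped_lon_request(350.0, 10.0, True) == (-10.0, 10.0)
assert normalise_wrapped_lon_request(170.0, -170.0, False) == (170.0, 190.0)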
+ """ + + # subset horizontally and in time + for _ in range(3): + yield {'do_area': True, + 'do_time': True}, 'subset by area and time' + + # subset in all dimensions (though levels will be ignored if there isn't + # a level dimension) + for _ in range(3): + yield {'do_area': True, + 'do_time': True, + 'do_levels': True}, 'subset by area, time and levels' + + # and a few special cases to try + special_cases = [{"force_lon_wrap": True}, + {"force_pole": "north"}, + {"force_pole": "south"}, + {"force_lon_wrap": True, "force_pole": "south"}] + + for area_args in special_cases: + yield {'do_area': True, + 'do_time': True, + 'do_levels': True, + 'area_args': area_args}, f"Doing special case: {area_args}" + + + def _open_dataset(self, collection): + + print(f"opening {collection}") + + if collection[0] == "/": + return xr.open_dataset(collection) + + #==================================== + ## THIS OUGHT TO WORK, but I need to work out how to configure the paths + ## for the tests. strace shows that it is using /tmp/roocs.ini but this + ## is overwritten every time (tests/conftest.py gets imported and + ## write_roocs_cfg() is run) + ## + ## strace also says that it is also looking in all of these: + ## /clisops/clisops/etc/roocs.ini + ## /daops/daops/etc/roocs.ini + ## /roocs-utils/roocs_utils/etc/roocs.ini + ## + ## For now, symlinking ~/.mini-esgf-data/master/test_data/badc + ## to point to the real /badc will do for a workaround -- now it finds + ## the dataset + ds = open_xr_dataset(collection) + + ## OR HERE IS ANOTHER POSSIBLE TEMPORARY WORKAROUND + # import glob # for open_xr_dataset workaround + # assert isinstance(collection, str) + #paths = glob.glob(f'/badc/cmip6/data/{collection.replace(".","/")}/*.nc') + #ds = open_xr_dataset(paths) + #==================================== + + return ds + + + def _dump_dims(self, ds, label=""): + ignore = ("bnds", "vertices") + print(f"{label} dataset dimensions: " + f'{",".join(f"{k}:{v}" for k, v in ds.dims.items() if k not in ignore)}') + + + def _get_data(self, val): + return val.data if isinstance(val, (xr.DataArray, xr.IndexVariable)) else val + + + def _do_test(self, collection, ds, **kwargs): + """ + Do an individual test on a collection + """ + + print(f"Doing test on {collection} using {kwargs}") + self._dump_dims(ds, label="input") + subset_params, expect = self._prepare_test(ds, **kwargs) + + temp_dir = tempfile.TemporaryDirectory() + self._tmpdir = temp_dir.name + + print("===========================================") + print("Doing test with:") + print(f"\n Collection: {collection}") + print("\n Subset params:") + for k, v in subset_params.items(): + if k == "time": + v = f"time_interval({v.value})" + elif k == "level": + v = f"level_interval({tuple([float(lev) for lev in v.value])})" + print(f" {k} : {v}") + print("\n Expect to get:") + for k, v in expect.items(): + if v is not None: + print(f" {k} : {self._get_data(v)}") + if all(k in expect and expect[k] is None + for k in ("lons_in_range", "lats_in_range")): + print(" (curvilinear grid; will test lon/lat ranges, not exact vals)") + + print("\n===========================================") + + try: + result = self._subset(collection, **subset_params) + + self._check_result(result, expect, collection, subset_params) + + except KeyboardInterrupt: + raise + except Exception as exc: + success = False + message = str(exc) + self._print_error(f"TEST FAILED: {message}") + raise # FIXME: comment out else: - in_range = (lower <= var) & (var <= upper) - vals_in_range = var[in_range] if 
var.ndim == 1 else None - - if in_range.sum() > 0: - break - length = min(length * 2, 1) - - if var.ndim == 1 and len(var) > 1 and var[1] < var[0]: - # if the variable is 1d and is decreasing, then swap the ordering - # of the bounds (which were chosen based on histogram so will be increasing at this point) - assert not did_wrap # longitude wrap not verified for decreasing lons - assert lower <= upper - lower, upper = upper, lower - - return (lower, upper, vals_in_range) - - -def get_lons(ds): - "Get the longitude variable for a dataset. Not necessarily a coordinate variable." - return get_var_by_stdname(ds, "longitude") -def get_lats(ds): - "Get the latitude variable for a dataset. Not necessarily a coordinate variable." - return get_var_by_stdname(ds, "latitude") - -def get_times(ds): - "Get the time coordinate variable for a dataset" - return get_axis_by_direction(ds, "T") -def get_levs(ds): - "Get the height coordinate variable for a dataset" - return get_axis_by_direction(ds, "Z") - - -def get_var_by_stdname(ds, stdname): - """ - Get variable with a given standard name. - Will raise an exception if there is not exactly one. - """ - vars = [v for v in ds.variables.values() - if v.attrs.get("standard_name") == stdname] - var, = vars - return var - - -def is_curvi(lons, lats): - """ - Test whether given lon and lat variables correspond to a curvilinear grid - (as opposed to 1d coordinate variables). - """ - if len(lons.dims) == 1 and len(lats.dims) == 1: - return False - elif len(lons.dims) == 2 and lons.dims == lats.dims: - return True - else: - raise Exception(f"unexpected dimensionality of lon, lat arrays: {lon.dims} and {lat.dims}") - - -def get_gridded_vars(ds, lons=None, lats=None, curvi=None): - if lons is None: - lons = get_lons(ds) - if lats is None: - lats = get_lats(ds) - if curvi is None: - curvi = is_curvi(lons, lats) - - if curvi: - grid_dims = lons.dims - else: - latdim, = lats.dims - londim, = lons.dims - grid_dims = (latdim, londim) - return [v for v in ds.data_vars.values() - if v.dims[-2:] == grid_dims] - - -def get_lonlat_ranges_for_curvi(ds): - """ - get ranges of lon, lat values where there is actual data (not masked) - for any variable on the lon, lat grid - """ - - lons = get_lons(ds) - lats = get_lats(ds) - - grid_shape = lons.shape - - # get a list of the variables that are on the lon-lat grid, - # other than lon, lat themselves, and then get an array of - # positions where there are non-NaN values in *any* of these - # variables, for any other level / height - # - # (start with 2d array of False, then use logical OR with - # each variable, although probably there is only one such var) - # - vars_on_grid = get_gridded_vars(ds, - lons=lons, lats=lats, - curvi=True) - if not vars_on_grid: - raise Exception("could not find any data variables") - has_data = np.zeros(grid_shape, dtype=bool) - for var in vars_on_grid: - var_has_data = np.logical_not(np.isnan(var.data)) - # reduce to 2d using "any" in loop - there might be a cleverer way - while var_has_data.ndim > 2: - var_has_data = np.any(var_has_data, axis=(var_has_data.ndim - 3)) - assert var_has_data.shape == grid_shape - has_data |= var_has_data - - if not has_data.any(): - print ("WARNING: data variable(s) contain only NaN values") - return (None, None) - - lons_where_data = lons.data[has_data] - lats_where_data = lats.data[has_data] - - lon_range_where_data = (lons_where_data.min(), lons_where_data.max()) - lat_range_where_data = (lats_where_data.min(), lats_where_data.max()) - - print("For this curvilinear 
dataset:") - print(f" Lon range where data {lon_range_where_data} " - f"(overall {lons.data.min(), lons.data.max()})") - print(f" Lat range where data {lat_range_where_data} " - f"(overall {lats.data.min(), lats.data.max()})") - - return (lon_range_where_data, lat_range_where_data) + success = True + message = "" + print("Test succeeded\n\n\n") + finally: + temp_dir.cleanup() + return subset_params, success, message -def check_in_range(actual_range, requested_range, label="", **kwargs): - """ - check whether the range of values lies WITHIN the requested range of values; - allow for the fact that the requested range might be passed in decreasing order - """ - if not is_in_range(actual_range, requested_range, **kwargs): - raise Exception(f"range check for {label} failed") - print(f"{label}: Verified range {actual_range} within {requested_range}") - -def is_in_range(actual_range, requested_range): - """ - helper for check_in_range. Returns boolean. - """ - req0, req1 = requested_range - if req0 > req1: - req0, req1 = req1, req0 - return all((req0 <= val <= req1 - for val in actual_range)) + def _print_error(self, message): + stars = "*" * len(message) + print(f"****{stars}****\n" + f"*** {message} ***\n" + f"****{stars}****\n\n\n") - -def check_equal(vals, exp_vals, label=""): - """ - Check whether the values match the expected values - """ - - vals = get_data(vals) - exp_vals = get_data(exp_vals) - - #print(f"\n\n============ {label} =========\n\n") - #print(f"Actual vals: {vals}") - #print(f"Expected vals: {exp_vals}") - - if not np.array_equal(vals, exp_vals): - raise Exception(f"equal values assertion failed for {label}:" - f"actual {vals}, expected: {exp_vals}") - - print(f"{label}: checked {len(vals)} values match expected values") + def _prepare_test(self, ds, do_area=False, do_time=False, do_levels=False, area_args=None): + """ + returns the params to the subset function that will be needed for the test + and a dictionary of things to expect to come back from the test - -def check_result(result, expect, collection, subset_params, tmpdir): + The boolean inputs do_area, do_time, do_levels control whether to subset + in each of these ways. The input area_args can contain a dictionary of arguments + to get_rand_area (ignored if do_area==False) + """ - """ - Do various checks on the result of subsetting. result should be an - object that has a file_uris property. expect should be dictionary - returned by prepare_test. + if area_args == None: + area_args = {} - subset_params is the dictionary of keyword args that was passed to - subset (needed for curvilinear - see below). 
+ params = {} + expect = {} - Currently, the tests that are done are: - - check that the exact set of coordinate values matches what was expected, - - except that in the case of a curvilinear grid, check to see what lon and - lat ranges are spanned by the lon and lat arrays in the file but only - including those locations where there is data with non-NaN values; then - check that these are within the range that was requested - """ + if do_area: + area = self._get_rand_area(ds, **area_args) + req_area = [float(v) for v in area["req_area"]] - fn, = result.file_uris - dsout = xr.open_dataset(fn) - dump_dims(dsout, label="output") + params["area"] = req_area - lons = get_lons(dsout) - lats = get_lats(dsout) + expect["lons_in_range"] = area["lons_in_range"] + expect["lats_in_range"] = area["lats_in_range"] + else: + expect["lons_in_range"] = self._get_lons(ds) + expect["lats_in_range"] = self._get_lats(ds) + + if do_levels: + lev_int = self._get_rand_lev_int(ds) + if lev_int is not None: + params["level"] = lev_int["req_interval"] + expect["levs_in_range"] = lev_int["levs_in_range"] + else: + print("WARNING: not requesting level range as no level dimension found") + expect["levs_in_range"] = None + else: + expect["levs_in_range"] = self._get_levs(ds) - curvi = is_curvi(lons, lats) - - if curvi and "area" in subset_params: - area = subset_params["area"] - req_lon_range = (area[0], area[2]) - req_lat_range = (area[1], area[3]) - - lon_range, lat_range = get_lonlat_ranges_for_curvi(dsout) - if lon_range is not None: - print("Checking that lon-lat values with (unmasked) data in requested range") - check_in_range(lon_range, req_lon_range, label="longitudes") - check_in_range(lat_range, req_lat_range, label="latitudes") + if do_time: + time_int = self._get_rand_time_int(ds) + if time_int is not None: + params["time"] = time_int["req_interval"] + expect["times_in_range"] = time_int["times_in_range"] + else: + print("WARNING: not requesting time range as no time dimension found") + expect["times_in_range"] = None else: - print("Skipping lon/lat range check: did not find any data in requested range") - else: - expected_lons = expect["lons_in_range"] - expected_lats = expect["lats_in_range"] - - check_equal(lons, expected_lons, label="longitudes") - check_equal(lats, expected_lats, label="latitudes") - - expected_levs = expect["levs_in_range"] - if expected_levs is not None: - levs = get_levs(dsout) - check_equal(levs, expected_levs, label="levels") - - expected_times = expect["times_in_range"] - if expected_times is not None: - times = get_times(dsout) - check_equal(times, expected_times, label="times") - timeval = times[0] - else: - timeval = None - - vars_on_grid = get_gridded_vars(dsout, - lons=lons, lats=lats, - curvi=curvi) - - do_nans_check(vars_on_grid, collection, timeval, tmpdir) - - -def get_axis_by_direction(ds, direction): - """ - Get the axis with the specified direction attribute ("X", "Y", "Z" or "T") - or if there is none, returns None. - (If more than one, raises an exception.) - """ - axes = [] - for name in ds.dims: - axis = ds[name] - if name == "bnds": - continue - if hasattr(axis, "axis") and axis.axis == direction: - axes.append(axis) - if len(axes) > 1: - raise Exception(f"more than one dimension with axis {direction}") - elif len(axes) == 1: - return axes[0] - else: - return None - - -def do_nans_check(vars_on_grid, collection, timeval, tmpdir): - """ - Check whether any of the variables consist of all NaN values. 
- If so, then extract (for one timestep) the variable on all grid points. - If that variable consists of some but not all NaN values, then assume - that the subsetter is working and we are just requesting a region where - there are NaN values, so downgrade to a warning -- otherwise raise - an exception. - """ - - vars_with_nans = set() - for var in vars_on_grid: - if np.all(np.isnan(var.data)): - vars_with_nans.add(var.name) - - if vars_with_nans: - dsout2 = get_fullfield(collection, tmpdir, - timeval=timeval) - - for varname in vars_with_nans: - isnan_ref = np.isnan(dsout2[varname].data) - if np.any(isnan_ref) and not np.all(isnan_ref): - print(f"Warning: {varname} contains all NaNs (but some non-NaNs exist in full field).") + expect["times_in_range"] = self._get_times(ds) + + return params, expect + + + def _get_rand_time_int(self, ds): + """ + Returns a dictionary containing a randomly chosen time interval + (in the form needed by the subset function) and the time values + that would be expected when subsetting using that interval + """ + times = self._get_times(ds) + if times is None: + return None + t_start, t_end, vals_in_range = self._get_rand_range(times) + ts_start = self._get_time_string(t_start) + ts_end = self._get_time_string(t_end) + return {"req_interval": time_interval(ts_start, ts_end), + "times_in_range": vals_in_range} + + + def _get_rand_lev_int(self, ds): + """ + As get_rand_time_int, but for levels + """ + levs = self._get_levs(ds) + if levs is None: + return None + z_start, z_end, vals_in_range = self._get_rand_range(levs) + return {"req_interval": level_interval(z_start, z_end), + "levs_in_range": vals_in_range} + + + def _get_time_string(self, when): + """ + Turns a datetime, or time value seen in xarray, + into a string understood by time_interval + """ + if isinstance(when, datetime.datetime): + t = when + else: + t = when.values.tolist() + + return (f"{t.year:04d}-{t.month:02d}-{t.day:02d}" + f"T{t.hour:02d}:{t.minute:02d}:{t.second:02d}") + + + def _get_rand_area(self, ds, force_lon_wrap=False, force_pole=None): + """Returns a dictionary containing a randomly chosen area + (tuple of lon_w, lat_s, lon_e, lat_n) and the lon and lat + values that would be expected when subsetting using that area. + + In the curvilinear case (lon and lat variables in the file are 2d), the + expected values will be None rather than arrays of expected values. The + reason for this is that it is not possible to validate a *specific* set of + lon, lat values in any case (as although the subsetting will set any + points that are out of range to NaN, but NaN values could also be because + of missing data points within the range). + """ + + ## (lon0, lon1, lons_in_range), wrap_lon = self._get_rand_lon_range(ds) + (lon0, lon1, lons_in_range) = self._get_rand_lon_range(ds, force_wrap=force_lon_wrap) + (lat0, lat1, lats_in_range) = self._get_rand_lat_range(ds, force_pole=force_pole) + + return {"req_area": (lon0, lat0, lon1, lat1), + "lons_in_range": lons_in_range, + ## "wrap_lon": wrap_lon, + "lats_in_range": lats_in_range} + + + def _get_wrap_lon(self, lons): + """ + Get the longitude at which the wrapping occurs. + Note - the exact value is not used in the current version of the calling code, + the main purpose here is to distinguish global from limited area, so that the + requested area for subsetting does not wrap for a limited area grid. 
+ """ + minlon = lons.min() + maxlon = lons.max() + if maxlon - minlon < 270: + # assume this is a limited area + return None + elif maxlon - minlon >= 360: + raise Exception(f"too wide lon range {minlon} to {maxlon}") + elif 0 <= minlon and maxlon < 360: + return 360. + elif -180 <= minlon and maxlon < 180: + return 180. + else: + raise Exception(f"unsupported lon range {minlon} to {maxlon}") + + + def _get_rand_lon_range(self, ds, force_wrap=False): + """ + Get a randomly chosen longitude range. This might include wrapping around + (unless the longitudes don't seem to span global range), but if force_wrap is + set to True then it ensures that this occurs. + """ + lons = self._get_lons(ds) + wrap_lon = self._get_wrap_lon(lons) + can_wrap=(wrap_lon is not None) + + if force_wrap: + print("WARNING: forcing longitude wrapping for what appears to be limited area model") + params = {"force": "wrap"} + else: + params = {"can_wrap": can_wrap} + return self._get_rand_range(lons, **params) + + + def _get_rand_lat_range(self, ds, force_pole=None): + """ + Get a randomly chosen latitude range. If force_pole is set to "north" or "south", + then the range will extend to the relevant pole. + """ + + lats = self._get_lats(ds) + + # using "force" will ensure that the range returned + # by get_rand_range goes to the end of the latitude values, + # but for the test, actually use -90 or 90. Which value is + # to be overwritten will depend on the ordering. + # + if force_pole == "north": + ret = self._get_rand_range(lats, force="upper") + return (ret[0], 90., ret[2]) if ret[1] >= ret[0] else (90., ret[1], ret[2]) + params["force"] = "upper" + elif force_pole == "south": + ret = self._get_rand_range(lats, force="lower") + return (-90., ret[1], ret[2]) if ret[1] >= ret[0] else (ret[0], -90., ret[2]) + else: + return self._get_rand_range(lats) + + + def _get_rand_range(self, var, max_fraction=.1, can_wrap=False, force=None): + """ + Get a random range from specified variable (which can be any number + of dimensions). Returns tuple of (lower, upper, values in range) + + min and max are chosen based on histogram of values, defaulting to returning + a range that includes up to to about 10% of the points (though can be less) + + can_wrap=True means could be used with longitude - e.g. for a wrapping longitude of + 360 it could return lower=-10 upper=10, and the values in range are in the range -10 to 10 + where those values from -10 to 0 are based on the values from 350 to 360 in the input + + force can be used for special cases: "lower" forces the range to include + the lower end (e.g. south pole for latitude), "upper" forces it to include + the upper end (e.g. north pole), "wrap" forces it to wrap around (the meridian + for longitude) + """ + + length = random.uniform(0, max_fraction) + + while True: + + did_wrap = False + if force == "lower": + lower_q = 0. + upper_q = length + elif force == "upper": + lower_q = 1 - length + upper_q = 1. 
+ elif force == "wrap": + lower_q = random.uniform(1 - length, 1) + upper_q = lower_q + length - 1 + did_wrap = True + elif force is not None: + raise ValueError(f"unrecognised force value {force}") + elif can_wrap: + lower_q = random.uniform(0, 1) + upper_q = lower_q + length + did_wrap = upper_q > 1 + if did_wrap: + upper_q -= 1 else: - raise Exception(f"Variable {varname} contains all NaNs.") + lower_q = random.uniform(0, 1 - length) + upper_q = lower_q + length - + lower = var.quantile(lower_q) + upper = var.quantile(upper_q) + + if did_wrap: + in_range = (lower <= var) | (var <= upper) + if var.ndim == 1: + modulus = 360 + lower_vals = self._get_data(var[lower <= var]) + upper_vals = self._get_data(var[var <= upper]) + + if var.min() >= 0: + # data uses 0-360 + # change e.g. 350..10 to -10..10 + # (as subset function doesn't seem to like 350..370) + lower -= modulus + lower_vals -= modulus + else: + # data uses -180 to 180 + # change e.g. 170..-170 to 170..190 + upper += modulus + upper_vals += modulus + + vals_in_range = np.concatenate((lower_vals, upper_vals)) + else: + vals_in_range = None + else: + in_range = (lower <= var) & (var <= upper) + vals_in_range = var[in_range] if var.ndim == 1 else None + + if in_range.sum() > 0: + break + length = min(length * 2, 1) + + if var.ndim == 1 and len(var) > 1 and var[1] < var[0]: + # if the variable is 1d and is decreasing, then swap the ordering + # of the bounds (which were chosen based on histogram so will be increasing at this point) + assert not did_wrap # longitude wrap not verified for decreasing lons + assert lower <= upper + lower, upper = upper, lower + + return (lower, upper, vals_in_range) + + + def _get_lons(self, ds): + "Get the longitude variable for a dataset. Not necessarily a coordinate variable." + return self._get_var_by_stdname(ds, "longitude") + def _get_lats(self, ds): + "Get the latitude variable for a dataset. Not necessarily a coordinate variable." + return self._get_var_by_stdname(ds, "latitude") + + def _get_times(self, ds): + "Get the time coordinate variable for a dataset" + return self._get_axis_by_direction(ds, "T") + def _get_levs(self, ds): + "Get the height coordinate variable for a dataset" + return self._get_axis_by_direction(ds, "Z") + + + def _get_var_by_stdname(self, ds, stdname): + """ + Get variable with a given standard name. + Will raise an exception if there is not exactly one. + """ + vars = [v for v in ds.variables.values() + if v.attrs.get("standard_name") == stdname] + var, = vars + return var + + + def _is_curvi(self, lons, lats): + """ + Test whether given lon and lat variables correspond to a curvilinear grid + (as opposed to 1d coordinate variables). 
+        """
+        if len(lons.dims) == 1 and len(lats.dims) == 1:
+            return False
+        elif len(lons.dims) == 2 and lons.dims == lats.dims:
+            return True
+        else:
+            raise Exception(f"unexpected dimensionality of lon, lat arrays: {lons.dims} and {lats.dims}")
+
+
+    def _get_gridded_vars(self, ds, lons=None, lats=None, curvi=None):
+        if lons is None:
+            lons = self._get_lons(ds)
+        if lats is None:
+            lats = self._get_lats(ds)
+        if curvi is None:
+            curvi = self._is_curvi(lons, lats)
+
+        if curvi:
+            grid_dims = lons.dims
+        else:
+            latdim, = lats.dims
+            londim, = lons.dims
+            grid_dims = (latdim, londim)
+        return [v for v in ds.data_vars.values()
+                if v.dims[-2:] == grid_dims]
+
+
+    def _get_lonlat_ranges_for_curvi(self, ds):
+        """
+        Get ranges of lon, lat values where there is actual data (not masked)
+        for any variable on the lon, lat grid.
+        """
+
+        lons = self._get_lons(ds)
+        lats = self._get_lats(ds)
+
+        grid_shape = lons.shape
+
+        # get a list of the variables that are on the lon-lat grid,
+        # other than lon, lat themselves, and then get an array of
+        # positions where there are non-NaN values in *any* of these
+        # variables, for any other level / height
+        #
+        # (start with 2d array of False, then use logical OR with
+        # each variable, although probably there is only one such var)
+        #
+        vars_on_grid = self._get_gridded_vars(ds,
+                                              lons=lons, lats=lats,
+                                              curvi=True)
+        if not vars_on_grid:
+            raise Exception("could not find any data variables")
+        has_data = np.zeros(grid_shape, dtype=bool)
+        for var in vars_on_grid:
+            var_has_data = np.logical_not(np.isnan(var.data))
+            # reduce to 2d using "any" in loop - there might be a cleverer way
+            while var_has_data.ndim > 2:
+                var_has_data = np.any(var_has_data, axis=(var_has_data.ndim - 3))
+            assert var_has_data.shape == grid_shape
+            has_data |= var_has_data
+
+        if not has_data.any():
+            print("WARNING: data variable(s) contain only NaN values")
+            return (None, None)
+
+        lons_where_data = lons.data[has_data]
+        lats_where_data = lats.data[has_data]
+
+        lon_range_where_data = (lons_where_data.min(), lons_where_data.max())
+        lat_range_where_data = (lats_where_data.min(), lats_where_data.max())
+
+        print("For this curvilinear dataset:")
+        print(f"  Lon range where data {lon_range_where_data} "
+              f"(overall {lons.data.min(), lons.data.max()})")
+        print(f"  Lat range where data {lat_range_where_data} "
+              f"(overall {lats.data.min(), lats.data.max()})")
+
+        return (lon_range_where_data, lat_range_where_data)
+
+
+    def _check_in_range(self, actual_range, requested_range, label="", **kwargs):
+        """
+        Check whether the range of values lies WITHIN the requested range of values;
+        allow for the fact that the requested range might be passed in decreasing order.
+        """
+        if not self._is_in_range(actual_range, requested_range, **kwargs):
+            raise Exception(f"range check for {label} failed")
+        print(f"{label}: Verified range {actual_range} within {requested_range}")
+
+
+    def _is_in_range(self, actual_range, requested_range):
+        """
+        Helper for _check_in_range. Returns boolean.
+ """ + req0, req1 = requested_range + if req0 > req1: + req0, req1 = req1, req0 + return all((req0 <= val <= req1 + for val in actual_range)) + + + def _check_equal(self, vals, exp_vals, label=""): + """ + Check whether the values match the expected values + """ + + vals = self._get_data(vals) + exp_vals = self._get_data(exp_vals) + + #print(f"\n\n============ {label} =========\n\n") + #print(f"Actual vals: {vals}") + #print(f"Expected vals: {exp_vals}") + + if not np.array_equal(vals, exp_vals): + raise Exception(f"equal values assertion failed for {label}:" + f"actual {vals}, expected: {exp_vals}") + + print(f"{label}: checked {len(vals)} values match expected values") + + + def _check_result(self, result, expect, collection, subset_params): + + """ + Do various checks on the result of subsetting. result should be an + object that has a file_uris property. expect should be dictionary + returned by prepare_test. + + subset_params is the dictionary of keyword args that was passed to + subset (needed for curvilinear - see below). + + Currently, the tests that are done are: + - check that the exact set of coordinate values matches what was expected, + - except that in the case of a curvilinear grid, check to see what lon and + lat ranges are spanned by the lon and lat arrays in the file but only + including those locations where there is data with non-NaN values; then + check that these are within the range that was requested + """ + + fn, = result.file_uris + dsout = xr.open_dataset(fn) + self._dump_dims(dsout, label="output") + + lons = self._get_lons(dsout) + lats = self._get_lats(dsout) + + curvi = self._is_curvi(lons, lats) + + if curvi and "area" in subset_params: + area = subset_params["area"] + req_lon_range = (area[0], area[2]) + req_lat_range = (area[1], area[3]) + + lon_range, lat_range = self._get_lonlat_ranges_for_curvi(dsout) + if lon_range is not None: + print("Checking that lon-lat values with (unmasked) data in requested range") + self._check_in_range(lon_range, req_lon_range, label="longitudes") + self._check_in_range(lat_range, req_lat_range, label="latitudes") + else: + print("Skipping lon/lat range check: did not find any data in requested range") + else: + expected_lons = expect["lons_in_range"] + expected_lats = expect["lats_in_range"] + + self._check_equal(lons, expected_lons, label="longitudes") + self._check_equal(lats, expected_lats, label="latitudes") + + expected_levs = expect["levs_in_range"] + if expected_levs is not None: + levs = self._get_levs(dsout) + self._check_equal(levs, expected_levs, label="levels") + + expected_times = expect["times_in_range"] + if expected_times is not None: + times = self._get_times(dsout) + self._check_equal(times, expected_times, label="times") + timeval = times[0] + else: + timeval = None + + vars_on_grid = self._get_gridded_vars(dsout, + lons=lons, lats=lats, + curvi=curvi) + + self._do_nans_check(vars_on_grid, collection, timeval) + + + def _get_axis_by_direction(self, ds, direction): + """ + Get the axis with the specified direction attribute ("X", "Y", "Z" or "T") + or if there is none, returns None. + (If more than one, raises an exception.) 
+ """ + axes = [] + for name in ds.dims: + axis = ds[name] + if name == "bnds": + continue + if hasattr(axis, "axis") and axis.axis == direction: + axes.append(axis) + if len(axes) > 1: + raise Exception(f"more than one dimension with axis {direction}") + elif len(axes) == 1: + return axes[0] + else: + return None + + + def _do_nans_check(self, vars_on_grid, collection, timeval): + """ + Check whether any of the variables consist of all NaN values. + If so, then extract (for one timestep) the variable on all grid points. + If that variable consists of some but not all NaN values, then assume + that the subsetter is working and we are just requesting a region where + there are NaN values, so downgrade to a warning -- otherwise raise + an exception. + """ + + vars_with_nans = set() + for var in vars_on_grid: + if np.all(np.isnan(var.data)): + vars_with_nans.add(var.name) + + if vars_with_nans: + dsout2 = self._get_fullfield(collection, timeval=timeval) + + for varname in vars_with_nans: + isnan_ref = np.isnan(dsout2[varname].data) + if np.any(isnan_ref) and not np.all(isnan_ref): + print(f"Warning: {varname} contains all NaNs (but some non-NaNs exist in full field).") + else: + raise Exception(f"Variable {varname} contains all NaNs.") + + + def _subset(self, collection, **subset_params): + + if self._use_cache: + cached_fn = cached_output_fn(collection, subset_params) + if os.path.exists(cached_fn): + print(f"using cache: {cached_fn}") + result = CachedResult(cached_fn) + else: + result = subset(collection, + output_dir=self._tmpdir, + **subset_params) + fn, = result.file_uris + print(f"caching: {cached_fn}") + shutil.copy(fn, cached_fn) + else: + result = subset(collection, + output_dir=self._tmpdir, + **subset_params) + return result + + + def _get_fullfield(self, collection, timeval=None): + """ + Get the result of subsetting, optionally by time to a particular + time value, but not in any other dimensions. + """ + params = {} + if timeval is not None: + t_iso = timeval.data.tolist().isoformat() + params['time'] = time_interval(t_iso, t_iso) + result = self._subset(collection, **params) + fn, = result.file_uris + return xr.open_dataset(fn) -def get_fullfield(collection, tmpdir, timeval=None): - """ - Get the result of subsetting, optionally by time to a particular - time value, but not in any other dimensions. 
- """ - params = {} - if timeval is not None: - t_iso = timeval.data.tolist().isoformat() - params['time'] = time_interval(t_iso, t_iso) - result = subset(collection, output_dir=tmpdir, **params) - fn, = result.file_uris - return xr.open_dataset(fn) if __name__ == "__main__": random.seed(0) ## FIXME - remove - main(test_location='CEDA') + tester = SubsetTester(test_location='CEDA') + tester.main() From 907fcd8e5f55eb172f3a03994dc6145b15a76d79 Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Fri, 22 Sep 2023 14:04:02 +0100 Subject: [PATCH 08/12] add cli entry point for data-pools-checks --- setup.py | 1 + .../run_data_pools_checks.py | 35 +++++++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index 29a6299..66084b0 100644 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ entry_points={ "console_scripts": [ "daops=daops.cli:main", + "data-pools-checks=tests.data_pools_checks.run_data_pools_checks:cli" ], }, long_description=_long_description, diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index 2db0c1b..326e6fa 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -7,21 +7,19 @@ import numpy as np import shutil import time +import argparse from daops.ops.subset import subset from roocs_utils.xarray_utils.xarray_utils import open_xr_dataset from roocs_utils.parameter.param_utils import time_interval, level_interval -from results_db import ResultsDB -from examples_data import data_pool_tests_db -from caching import cached_output_fn, CachedResult - - +from .results_db import ResultsDB +from .examples_data import data_pool_tests_db +from .caching import cached_output_fn, CachedResult class SubsetTester: - def __init__(self, test_location='', use_cache=True): @@ -786,10 +784,25 @@ def _get_fullfield(self, collection, timeval=None): return xr.open_dataset(fn) - - -if __name__ == "__main__": - random.seed(0) ## FIXME - remove - tester = SubsetTester(test_location='CEDA') +def _cli_arg_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('test_location', + help='test location, e.g. 
CEDA') + parser.add_argument('--seed', '-s', type=int, + help='value for random seed') + parser.add_argument('--cache', '-c', action='store_true', + help='use cache of subsetter output') + return parser.parse_args() + + +def cli(): + args = _cli_arg_parser() + if args.seed is not None: + random.seed(args.seed) + else: + if args.cache: + print('Warning: using --cache but not --seed; cache hits are unlikely.') + tester = SubsetTester(test_location=args.test_location, + use_cache=args.cache) tester.main() From 30a1d15e9f9344afad3f71434827d7c713789c8b Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Fri, 22 Sep 2023 14:15:37 +0100 Subject: [PATCH 09/12] improve functionality of ResultsDB class to include read/only and opening without pointing to sqlite file, and expand tests accordingly --- tests/data_pools_checks/results_db.py | 89 ++++++++++++++----- .../run_data_pools_checks.py | 5 +- tests/data_pools_checks/test_results_db.py | 44 +++++++-- 3 files changed, 111 insertions(+), 27 deletions(-) diff --git a/tests/data_pools_checks/results_db.py b/tests/data_pools_checks/results_db.py index c800416..0d8633c 100644 --- a/tests/data_pools_checks/results_db.py +++ b/tests/data_pools_checks/results_db.py @@ -12,17 +12,33 @@ def __init__(self, csvgz_file='tests.csv.gz', sqlite_file='tests_tmp.db', sql_table_name = 'test_results', sql_primary_key='id', - merge_every=100): - self.columns = columns[:] + merge_every=100, + read_only=False): self._csvgz_file = csvgz_file + if sqlite_file is None: + self._cursor = False + else: + self._cursor = None self._sqlite_file = sqlite_file - self._conn = None - self._cur = None + if columns is None: + # special case: CSV file must already exist, columns are read in from it + self.columns = self._read_csvgz_columns() + else: + self.columns = columns[:] self._sql_table_name = sql_table_name self._sql_primary_key = sql_primary_key self._merge_every = merge_every self._num_tmp_rows = 0 + self._read_only = read_only + if self._sqlite_file is None: + self._merge_every = 0 + + def _read_csvgz_columns(self): + with gzip.open(self._csvgz_file, 'rt') as f: + reader = csv.reader(f) + return next(reader) + def read_csvgz(self, src=None): "read .csv.gz, yield a sequence of rows (each is a list)" if src is None: @@ -37,7 +53,6 @@ def read_csvgz(self, src=None): for row in reader: yield row - def _read_as_dicts(self, reader, remove_blank=True, **kwargs): if remove_blank: cond = lambda t: t[1] @@ -55,6 +70,7 @@ def read_sqlite_as_dicts(self, **kwargs): def write_csvgz(self, rows, dest=None): "write .csv.gz, input is a sequence of rows" + self._check_not_read_only() if dest is None: dest = self._csvgz_file tmpname = dest.replace('.gz', '.tmp.gz') @@ -71,62 +87,93 @@ def write_csvgz(self, rows, dest=None): def add_row(self, **row_dict): "add a single row - goes into the sqlite file" - cur, conn = self._get_cur_conn() + self._check_not_read_only() + cur = self._get_cursor() + if cur is False: + raise Exception('Cannot add row without an sqlite db') sql_keys = ','.join(row_dict.keys()) sql_vals = ','.join((f"'{str(val)}'" for val in row_dict.values())) sql = f'insert into {self._sql_table_name} ({sql_keys}) values ({sql_vals})' cur.execute(sql) - conn.commit() + cur.connection.commit() self._num_tmp_rows += 1 if self._num_tmp_rows == self._merge_every: self.merge_and_tidy() def _init_sqlite(self): + if self._cursor is False: + return sql_columns = ','.join([f'{self._sql_primary_key} integer PRIMARY KEY'] + [f'{col} text' for col in self.columns]) sql = f'CREATE TABLE IF NOT EXISTS 
{self._sql_table_name}({sql_columns})' - conn = sqlite3.connect(self._sqlite_file) - cur = conn.cursor() - cur.execute(sql) - self._conn = conn - self._cur = cur + path = os.path.realpath(self._sqlite_file) + try: + if self._read_only: + conn = sqlite3.connect(f'file:{path}?mode=ro', uri=True) + else: + conn = sqlite3.connect(path) + cursor = conn.cursor() + cursor.execute(sql) + self._cursor = cursor + except sqlite3.OperationalError: + self._cursor = False + - def _get_cur_conn(self): - if self._cur is None or self._conn is None: + def _get_cursor(self): + if self._cursor is None: self._init_sqlite() - return self._cur, self._conn + return self._cursor def get_sqlite_rows(self, include_primary_key=False): - cur, conn = self._get_cur_conn() - cur.execute(f'SELECT * from {self._sql_table_name}') - for row in cur: + cursor = self._get_cursor() + if cursor is False: + return + cursor.execute(f'SELECT * from {self._sql_table_name}') + for row in cursor: if include_primary_key: yield row else: yield row[1:] def _destroy_sqlite(self): + self._check_not_read_only() if os.path.exists(self._sqlite_file): os.remove(self._sqlite_file) - self._conn = self._cur = None + self._cursor = None self._num_tmp_rows = 0 def merge_and_tidy(self): + self._check_not_read_only() + if self._cursor is False: + return csv_rows = self.read_csvgz() new_rows = self.get_sqlite_rows() self.write_csvgz(chain(csv_rows, new_rows)) self._destroy_sqlite() + def _check_not_read_only(self): + if self._read_only: + raise Exception('write operation requested for read-only db') + def __enter__(self): return self def __exit__(self, exc_type, exc_value, exc_traceback): - self.merge_and_tidy() + if not self._read_only: + self.merge_and_tidy() if __name__ == "__main__": import random + print(""" +Running simple tester for results DB. +For more extensive tests, use unit tester. 
From top of repo: + + pytest -c null -s tests/data_pools_checks/test_results_db.py + +""") + def dump_db(): print("=====================") os.system('zcat tests.csv.gz') @@ -141,3 +188,5 @@ def dump_db(): for _ in range(13): rdb.add_row(collection='foo', success='False') rdb.add_row(collection=str(random.random()), time='blah', success='True') + + dump_db() diff --git a/tests/data_pools_checks/run_data_pools_checks.py b/tests/data_pools_checks/run_data_pools_checks.py index 326e6fa..dbf2df8 100644 --- a/tests/data_pools_checks/run_data_pools_checks.py +++ b/tests/data_pools_checks/run_data_pools_checks.py @@ -34,7 +34,10 @@ def main(self): results_db_columns = ['test_location', 'test_time', 'collection', 'area', 'level', 'time', 'success', 'message'] - with ResultsDB(results_db_columns) as rdb: + file_stem = f'tests_{self._test_location}' + with ResultsDB(results_db_columns, + csvgz_file=f'{file_stem}.csv.gz', + sqlite_file=f'{file_stem}_tmp.db') as rdb: row_writer = lambda **kwargs: \ rdb.add_row(test_location=self._test_location, diff --git a/tests/data_pools_checks/test_results_db.py b/tests/data_pools_checks/test_results_db.py index 67e2d5f..5af5a8c 100644 --- a/tests/data_pools_checks/test_results_db.py +++ b/tests/data_pools_checks/test_results_db.py @@ -1,6 +1,7 @@ import os import random import subprocess as sp +import pytest from results_db import ResultsDB @@ -38,12 +39,12 @@ def _purge(): if os.path.exists(path): os.remove(path) -def _open_test_db(merge_every=10): - return ResultsDB(columns, - csvgz_file=tmp_csvgz, - sqlite_file=tmp_sqlite, - merge_every=merge_every) - +def _open_test_db(**kwargs): + params = {'csvgz_file': tmp_csvgz, + 'sqlite_file': tmp_sqlite, + 'merge_every': 10} + params.update(kwargs) + return ResultsDB(columns, **params) def test_write_and_read(): _purge() @@ -93,6 +94,37 @@ def test_manual_merge(): assert rows_in_sqlite == [] +def test_ro_db(): + _purge() + test_write_and_read() + with _open_test_db(read_only=True) as rdb: + data_read = list(rdb.read_csvgz_as_dicts()) + assert tst_data == data_read + + +def test_write_to_rodb(): + _purge() + with _open_test_db(read_only=True) as rdb: + with pytest.raises(Exception): + rdb.add_row(**tst_data[0]) + + +def test_db_no_sqlite(): + _purge() + test_write_and_read() + with _open_test_db(sqlite_file=None) as rdb: + data_read = list(rdb.read_csvgz_as_dicts()) + assert tst_data == data_read + + +def test_db_write_no_sqlite(): + _purge() + test_write_and_read() + with _open_test_db(sqlite_file=None) as rdb: + with pytest.raises(Exception): + rdb.add_row(**tst_data[0]) + + def test_dump_data(): _purge() From 52e32b1697f607eb93828dc509cde6ddd7ab6bad Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Fri, 22 Sep 2023 14:34:07 +0100 Subject: [PATCH 10/12] add script to merge csv.gz files containing subset tester logs --- setup.py | 3 +- tests/data_pools_checks/merge_csv.py | 75 ++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 1 deletion(-) create mode 100644 tests/data_pools_checks/merge_csv.py diff --git a/setup.py b/setup.py index 66084b0..435a68d 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,8 @@ entry_points={ "console_scripts": [ "daops=daops.cli:main", - "data-pools-checks=tests.data_pools_checks.run_data_pools_checks:cli" + "data-pools-checks=tests.data_pools_checks.run_data_pools_checks:cli", + "merge-test-logs=tests.data_pools_checks.merge_csv:cli" ], }, long_description=_long_description, diff --git a/tests/data_pools_checks/merge_csv.py b/tests/data_pools_checks/merge_csv.py new file mode 100644 
index 0000000..057f8eb --- /dev/null +++ b/tests/data_pools_checks/merge_csv.py @@ -0,0 +1,75 @@ +#/usr/bin/env python + +import os +import argparse +from itertools import chain + +from .results_db import ResultsDB + + +def remove_dups_from_sorted(lst): + first = True + for item in lst: + if first or item != prev: + first = False + prev = item + yield item + + +def merge_csv(infiles, outfile): + + all_files = infiles + if os.path.exists(outfile): + all_files.append(outfile) + + with ResultsDB(None, csvgz_file=all_files[0], read_only=True) as dbin: + columns = dbin.columns + + rows = [] + for fn in all_files: + with ResultsDB(columns, csvgz_file=fn, sqlite_file=None, read_only=True) as dbin: + rows.extend(list(dbin.read_csvgz())) + + primary_index = columns.index('test_time') + + key_getter = lambda arr: \ + (arr[primary_index],) + tuple([arr[i] for i in range(len(arr)) if i != primary_index]) + + rows.sort(key=key_getter) + + with ResultsDB(columns, csvgz_file=outfile, sqlite_file=None) as dbout: + dbout.write_csvgz(remove_dups_from_sorted(rows)) + + +def _cli_arg_parser(): + parser = argparse.ArgumentParser() + parser.add_argument('--output', '-o', + help='output filename (may be pre-existing)', + required=True, + type=_csvgz_file, + nargs=1) + parser.add_argument('input_files', + type=_existing_csvgz_file, + help='input filename(s)', + nargs='+') + return parser.parse_args() + + +def _csvgz_file(path): + if not path.endswith('.csv.gz'): + raise argparse.ArgumentTypeError(f'filename {path} does not end .csv.gz') + return path + + +def _existing_csvgz_file(path): + if not os.path.exists(path): + raise argparse.ArgumentTypeError(f'file {path} does not exist') + return _csvgz_file(path) + + +def cli(): + args = _cli_arg_parser() + out_file, = args.output + print(f'Merging contents of files {args.input_files} into {out_file}') + merge_csv(args.input_files, out_file) + print('done') From bd67eeff88a2c85f68b3b266221f3aa52e023389 Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Wed, 11 Oct 2023 12:20:46 +0100 Subject: [PATCH 11/12] fix test_results_db.py: change sql test data to use select instead of dump; also change relative import --- tests/data_pools_checks/__init__.py | 0 tests/data_pools_checks/test_results_db.py | 11 ++++------- 2 files changed, 4 insertions(+), 7 deletions(-) create mode 100644 tests/data_pools_checks/__init__.py diff --git a/tests/data_pools_checks/__init__.py b/tests/data_pools_checks/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/data_pools_checks/test_results_db.py b/tests/data_pools_checks/test_results_db.py index 5af5a8c..215fb16 100644 --- a/tests/data_pools_checks/test_results_db.py +++ b/tests/data_pools_checks/test_results_db.py @@ -3,7 +3,7 @@ import subprocess as sp import pytest -from results_db import ResultsDB +from .results_db import ResultsDB columns = ['test_location', 'test_time', 'collection', 'area', @@ -25,11 +25,7 @@ tst_csv_data = (','.join(columns) + '\n' + ',,foo,,,bar,,\n,,baz,,,,,qux') -tst_sql_data = f'''PRAGMA foreign_keys=OFF; -BEGIN TRANSACTION; -CREATE TABLE test_results(id integer PRIMARY KEY,{','.join(f'{col} text' for col in columns)}); -INSERT INTO "test_results" VALUES(1,NULL,NULL,'quux',NULL,NULL,NULL,NULL,'corge'); -COMMIT;''' +tst_sql_data =f'id|{"|".join(columns)}\n1|||quux|||||corge' tmp_csvgz = '/tmp/test.csv.gz' tmp_sqlite = '/tmp/test_tmp.sql' @@ -132,7 +128,8 @@ def test_dump_data(): for row in tst_data2: rdb.add_row(**row) csvdata = sp.getoutput(f'zcat {tmp_csvgz}') - sqldata = sp.getoutput('echo 
.dump ' + sqldata = sp.getoutput('echo "select * from test_results" ' f'| sqlite3 -header {tmp_sqlite}') + assert csvdata == tst_csv_data assert sqldata == tst_sql_data From 6a45ca23c9e4cebdfd101f34b8985879d8be4c19 Mon Sep 17 00:00:00 2001 From: Alan Iwi Date: Wed, 25 Oct 2023 14:38:32 +0100 Subject: [PATCH 12/12] Create README.md --- tests/data_pools_checks/README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/data_pools_checks/README.md diff --git a/tests/data_pools_checks/README.md b/tests/data_pools_checks/README.md new file mode 100644 index 0000000..7eb8c00 --- /dev/null +++ b/tests/data_pools_checks/README.md @@ -0,0 +1,26 @@ +This directory provides: + +- A command called `data-pools-checks` (implemented in + `run_data_pools_checks.py`) that runs the subsetter on a number of test + cases in order to try out a variety of different types of datasets + (including for example some on curvilinear grids). This will randomly select + the bounds for the subsets, although there is a command line option to set a + random seed (for example `--seed 0`) to give repeatable results, and + optionally this can be combined with a `--cache` option to cache the + subsetter output (under `/tmp`) rather than rerunning the subsetter every + time the script is run. Results from the checks (containing the name of the + collection, the subsetting ranges used, and the test success value) are + written initially into an sqlite database and then (periodically and also on + exit) these are moved into a compressed CSV file. + +- A command `merge-test-logs` (implemented in `merge_csv.py`) will merge the + output logs from the above tester (as obtained from different sites) into a + single compressed CSV file. The `data-pools-checks` command takes an + argument which is the site (e.g. `DKRZ`) and this is written both into the + contents of the output `.csv.gz` file (a column called "test location") and + also its filename, so the merge command will take a number of these files, + and merge them into the specified output file, removing any duplicates. + +- Also a file is included with some unit tests (`test_results_db.py`) to + accompany the `ResultsDB` class (in `results_db.py`) that is used to + implement how test results are stored.
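
As a quick illustration of how the logs described in this README can be inspected, the sketch below uses the `ResultsDB` class in read-only mode. It is not part of the patches above: the file name `tests_CEDA.csv.gz`, and running it from the top of the repository so that the `tests.data_pools_checks` package is importable, are assumptions, and the `success` values are compared as the strings they become after the CSV round trip.

```python
# Minimal sketch (assumed usage, not part of the patch series): list failed
# checks from a results log written by data-pools-checks or merge-test-logs.
from tests.data_pools_checks.results_db import ResultsDB

with ResultsDB(None,                            # columns=None: read them from the CSV header
               csvgz_file='tests_CEDA.csv.gz',  # assumed log file name
               sqlite_file=None,                # no temporary sqlite db needed for reading
               read_only=True) as rdb:
    for row in rdb.read_csvgz_as_dicts():
        if row.get('success') == 'False':
            print(row.get('test_location'), row.get('collection'), row.get('message'))
```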