Commit a12d1f9

Merge branch 'main' into pre-commit-ci-update-config

Zeitsperre authored Dec 9, 2024
2 parents 7ade465 + 2ede288
Showing 13 changed files with 148 additions and 157 deletions.
1 change: 1 addition & 0 deletions .github/workflows/main.yml
@@ -348,6 +348,7 @@ jobs:
           api.electricitymap.org:443
           api.github.com:443
           api.green-coding.io:443
+          conda.anaconda.org:443
           coveralls.io:443
           files.pythonhosted.org:443
           github.com:443
2 changes: 2 additions & 0 deletions CHANGELOG.rst
@@ -20,6 +20,8 @@ Bug fixes
 Internal changes
 ^^^^^^^^^^^^^^^^
 * Changed French translations with the word "pluvieux" to "avec précipitations". (:issue:`1960`, :pull:`1994`).
+* Using different times for `ref` and `hist` is now explicitly forbidden in many bias adjustment methods (e.g. `EmpiricalQuantileMapping`). Methods that combine `ref`, `hist` and `sim` in the same `map_groups` call also require the time arrays to be equal in size (see the sketch after this diff). (:issue:`1903`, :pull:`1995`)
+* NaNs in `OTC` and `dOTC` are now dropped and put back in place at the lowest level, so that the size of the time array never changes at the xarray level. (:pull:`1995`)
 * `streamflow` entry replaced with `q` in ``variables.yml``. (:issue:`1912`, :pull:`1996`)
 * In order to address 403 (forbidden) request errors when retrieving data from GitHub via ReadTheDocs, the ``nimbus`` class has been modified to use an overloaded `fetch` method that appends a User-Agent header to the request. (:pull:`2001`).
 * Addressed a very rare race condition that can happen if `pytest` is tearing down the test environment when running across multiple workers. (:pull:`1863`).
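As a rough illustration of the new time-axis checks from :pull:`1995` (a minimal sketch based on the tests added further down in this commit — the helper and synthetic data are ours; the error message is quoted from the test suite):

import numpy as np
import pandas as pd
import xarray as xr
from xclim.sdba.adjustment import EmpiricalQuantileMapping

def tas(start, n=10):
    # Hypothetical stand-in for the test suite's `series` fixture.
    time = pd.date_range(start, periods=n, freq="D")
    return xr.DataArray(
        np.random.default_rng(0).random(n),
        dims=("time",),
        coords={"time": time},
        attrs={"units": "K"},
        name="tas",
    )

ref, hist = tas("2000-01-01"), tas("2010-01-01")

# Same length, but distinct time arrays: training now raises a ValueError like
# "`ref` and `hist` have distinct time arrays, this is not supported for
# EmpiricalQuantileMapping adjustment."
EmpiricalQuantileMapping.train(ref=ref, hist=hist)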
2 changes: 1 addition & 1 deletion docs/notebooks/customize.ipynb
@@ -36,7 +36,7 @@
    "outputs": [],
    "source": [
     "tasmax = (\n",
-    "    xr.tutorial.open_dataset(\"air_temperature\")\n",
+    "    xr.tutorial.load_dataset(\"air_temperature\")\n",
     "    .air.resample(time=\"D\")\n",
     "    .max(keep_attrs=True)\n",
     ")\n",
2 changes: 1 addition & 1 deletion docs/notebooks/sdba-advanced.ipynb
@@ -69,7 +69,7 @@
    "outputs": [],
    "source": [
     "# Daily temperature data from xarray's tutorials\n",
-    "ds = xr.tutorial.open_dataset(\"air_temperature\").resample(time=\"D\").mean()\n",
+    "ds = xr.tutorial.load_dataset(\"air_temperature\").resample(time=\"D\").mean()\n",
     "tas = ds.isel(lat=0, lon=0).air\n",
     "\n",
     "# Compute the smoothed series\n",
4 changes: 2 additions & 2 deletions docs/notebooks/units.ipynb
@@ -48,7 +48,7 @@
    "outputs": [],
    "source": [
     "# See the Usage page for details on opening datasets, subsetting and resampling.\n",
-    "ds = xr.tutorial.open_dataset(\"air_temperature\")\n",
+    "ds = xr.tutorial.load_dataset(\"air_temperature\")\n",
     "tas = (\n",
     "    ds.air.sel(lat=40, lon=270, method=\"nearest\")\n",
     "    .resample(time=\"D\")\n",
@@ -193,7 +193,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "ds = xr.tutorial.open_dataset(\"air_temperature\")\n",
+    "ds = xr.tutorial.load_dataset(\"air_temperature\")\n",
     "tas_6h = ds.air.sel(\n",
     "    lat=40, lon=270, method=\"nearest\"\n",
     ") # no resampling, original data is 6-hourly\n",
2 changes: 1 addition & 1 deletion docs/notebooks/usage.ipynb
@@ -139,7 +139,7 @@
    "source": [
     "# Show that data is not at a daily time frequency\n",
     "\n",
-    "ds6h = xr.tutorial.open_dataset(\"air_temperature\")\n",
+    "ds6h = xr.tutorial.load_dataset(\"air_temperature\")\n",
     "xr.infer_freq(ds6h.time)"
    ]
   },
1 change: 0 additions & 1 deletion environment.yml
@@ -1,7 +1,6 @@
 name: xclim
 channels:
   - conda-forge
-  - defaults
 dependencies:
   - python >=3.10,<3.14
   - boltons >=20.1
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -134,7 +134,7 @@ target-version = [
 ]

 [tool.bumpversion]
-current_version = "0.53.3-dev.6"
+current_version = "0.53.3-dev.7"
 commit = true
 commit_args = "--no-verify"
 tag = false
2 changes: 1 addition & 1 deletion tests/test_sdba/conftest.py
@@ -114,7 +114,7 @@ def ref_hist_sim_tuto(socket_enabled):  # noqa: F841
     """

     def _ref_hist_sim_tuto(sim_offset=3, delta=0.1, smth_win=3, trend=True):
-        ds = xr.tutorial.open_dataset("air_temperature")
+        ds = xr.tutorial.load_dataset("air_temperature")
         ref = ds.air.resample(time="D").mean(keep_attrs=True)
         hist = ref.rolling(time=smth_win, min_periods=1).mean(keep_attrs=True) + delta
         hist.attrs["units"] = ref.attrs["units"]
157 changes: 48 additions & 109 deletions tests/test_sdba/test_adjustment.py
@@ -66,6 +66,28 @@ def test_harmonize_units_multivariate(self, series, random, use_dask):
         ds, ds2 = unstack_variables(da), unstack_variables(da2)
         assert (ds.tas.units == ds2.tas.units) & (ds.pr.units == ds2.pr.units)

+    def test_matching_times(self, series, random):
+        n = 10
+        u = random.random(n)
+        da = series(u, "tas", start="2000-01-01")
+        da2 = series(u, "tas", start="2010-01-01")
+        with pytest.raises(
+            ValueError,
+            match="`ref` and `hist` have distinct time arrays, this is not supported for BaseAdjustment adjustment.",
+        ):
+            BaseAdjustment._check_matching_times(ref=da, hist=da2)
+
+    def test_matching_time_sizes(self, series, random):
+        n = 10
+        u = random.random(n)
+        da = series(u, "tas", start="2000-01-01")
+        da2 = da.isel(time=slice(0, 5)).copy()
+        with pytest.raises(
+            ValueError,
+            match="Inputs have different size for the time array, this is not supported for BaseAdjustment adjustment.",
+        ):
+            BaseAdjustment._check_matching_time_sizes(da, da2)
+

 class TestLoci:
     @pytest.mark.parametrize("group,dec", (["time", 2], ["time.month", 1]))
@@ -871,53 +893,6 @@ def test_compare_sbck(self, random, series):
         scen_sbck = scen_sbck.to_numpy()
         assert np.allclose(scen, scen_sbck)

-    def test_shape(self, random, series):
-        pytest.importorskip("ot")
-        pytest.importorskip("SBCK", minversion="0.4.0")
-        ref_ns = 300
-        hist_ns = 200
-        ref_u = random.random(ref_ns)
-        hist_u = random.random(hist_ns)
-
-        ref_xd = uniform(loc=1000, scale=100)
-        ref_yd = norm(loc=0, scale=100)
-        ref_zd = norm(loc=500, scale=100)
-        hist_xd = norm(loc=-500, scale=100)
-        hist_yd = uniform(loc=-1000, scale=100)
-        hist_zd = uniform(loc=-10, scale=100)
-
-        ref_x = ref_xd.ppf(ref_u)
-        ref_y = ref_yd.ppf(ref_u)
-        ref_z = ref_zd.ppf(ref_u)
-        hist_x = hist_xd.ppf(hist_u)
-        hist_y = hist_yd.ppf(hist_u)
-        hist_z = hist_zd.ppf(hist_u)
-
-        ref_na = 10
-        hist_na = 15
-        ref_idx = random.choice(range(ref_ns), size=ref_na, replace=False)
-        ref_x[ref_idx] = None
-        hist_idx = random.choice(range(hist_ns), size=hist_na, replace=False)
-        hist_x[hist_idx] = None
-
-        ref_x = series(ref_x, "tas").rename("x")
-        ref_y = series(ref_y, "tas").rename("y")
-        ref_z = series(ref_z, "tas").rename("z")
-        ref = xr.merge([ref_x, ref_y, ref_z])
-        ref = stack_variables(ref)
-
-        hist_x = series(hist_x, "tas").rename("x")
-        hist_y = series(hist_y, "tas").rename("y")
-        hist_z = series(hist_z, "tas").rename("z")
-        hist = xr.merge([hist_x, hist_y, hist_z])
-        hist = stack_variables(hist)
-
-        scen = OTC.adjust(ref, hist)
-
-        assert scen.shape == (3, hist_ns - hist_na)
-        hist = unstack_variables(hist)
-        assert not np.isin(hist.x[hist.x.isnull()].time.values, scen.time.values).any()
-

 # TODO: Add tests for normalization methods
 class TestdOTC:
@@ -1004,69 +979,33 @@ def test_compare_sbck(self, random, series, use_dask, cov_factor):
         scen_sbck = scen_sbck.to_numpy()
         assert np.allclose(scen, scen_sbck)

-    def test_shape(self, random, series):
+    def test_different_times(self, tasmax_series, tasmin_series):
+        # just check it runs
         pytest.importorskip("ot")
         pytest.importorskip("SBCK", minversion="0.4.0")
-        ref_ns = 300
-        hist_ns = 200
-        sim_ns = 400
-        ref_u = random.random(ref_ns)
-        hist_u = random.random(hist_ns)
-        sim_u = random.random(sim_ns)
-
-        ref_xd = uniform(loc=1000, scale=100)
-        ref_yd = norm(loc=0, scale=100)
-        ref_zd = norm(loc=500, scale=100)
-        hist_xd = norm(loc=-500, scale=100)
-        hist_yd = uniform(loc=-1000, scale=100)
-        hist_zd = uniform(loc=-10, scale=100)
-        sim_xd = norm(loc=0, scale=100)
-        sim_yd = uniform(loc=0, scale=100)
-        sim_zd = uniform(loc=10, scale=100)
-
-        ref_x = ref_xd.ppf(ref_u)
-        ref_y = ref_yd.ppf(ref_u)
-        ref_z = ref_zd.ppf(ref_u)
-        hist_x = hist_xd.ppf(hist_u)
-        hist_y = hist_yd.ppf(hist_u)
-        hist_z = hist_zd.ppf(hist_u)
-        sim_x = sim_xd.ppf(sim_u)
-        sim_y = sim_yd.ppf(sim_u)
-        sim_z = sim_zd.ppf(sim_u)
-
-        ref_na = 10
-        hist_na = 15
-        sim_na = 20
-        ref_idx = random.choice(range(ref_ns), size=ref_na, replace=False)
-        ref_x[ref_idx] = None
-        hist_idx = random.choice(range(hist_ns), size=hist_na, replace=False)
-        hist_x[hist_idx] = None
-        sim_idx = random.choice(range(sim_ns), size=sim_na, replace=False)
-        sim_x[sim_idx] = None
-
-        ref_x = series(ref_x, "tas").rename("x")
-        ref_y = series(ref_y, "tas").rename("y")
-        ref_z = series(ref_z, "tas").rename("z")
-        ref = xr.merge([ref_x, ref_y, ref_z])
-        ref = stack_variables(ref)
-
-        hist_x = series(hist_x, "tas").rename("x")
-        hist_y = series(hist_y, "tas").rename("y")
-        hist_z = series(hist_z, "tas").rename("z")
-        hist = xr.merge([hist_x, hist_y, hist_z])
-        hist = stack_variables(hist)
-
-        sim_x = series(sim_x, "tas").rename("x")
-        sim_y = series(sim_y, "tas").rename("y")
-        sim_z = series(sim_z, "tas").rename("z")
-        sim = xr.merge([sim_x, sim_y, sim_z])
-        sim = stack_variables(sim)
-
-        scen = dOTC.adjust(ref, hist, sim)
-
-        assert scen.shape == (3, sim_ns - sim_na)
-        sim = unstack_variables(sim)
-        assert not np.isin(sim.x[sim.x.isnull()].time.values, scen.time.values).any()
+        # `sim` has a different time than `ref,hist` (but same size)
+        ref = xr.merge(
+            [
+                tasmax_series(np.arange(730).astype(float), start="2000-01-01").chunk(
+                    {"time": -1}
+                ),
+                tasmin_series(np.arange(730).astype(float), start="2000-01-01").chunk(
+                    {"time": -1}
+                ),
+            ]
+        )
+        hist = ref.copy()
+        sim = xr.merge(
+            [
+                tasmax_series(np.arange(730).astype(float), start="2020-01-01").chunk(
+                    {"time": -1}
+                ),
+                tasmin_series(np.arange(730).astype(float), start="2020-01-01").chunk(
+                    {"time": -1}
+                ),
+            ]
+        )
+        ref, hist, sim = (stack_variables(arr) for arr in [ref, hist, sim])
+        dOTC.adjust(ref, hist, sim)


def test_raise_on_multiple_chunks(tas_series):
2 changes: 1 addition & 1 deletion xclim/__init__.py
@@ -13,7 +13,7 @@

 __author__ = """Travis Logan"""
 __email__ = "[email protected]"
-__version__ = "0.53.3-dev.6"
+__version__ = "0.53.3-dev.7"


 with _resources.as_file(_resources.files("xclim.data")) as _module_data:
47 changes: 27 additions & 20 deletions xclim/sdba/_adjustment.py
@@ -988,6 +988,12 @@ def _otc_adjust(
     ----------
     :cite:cts:`sdba-robin_2021`
     """
+    # nans are removed and put back in place at the end
+    X_og = X.copy()
+    mask = (~np.isnan(X)).all(axis=1)
+    X = X[mask]
+    Y = Y[(~np.isnan(Y)).all(axis=1)]
+
     # Initialize parameters
     if bin_width is None:
         bin_width = u.bin_width_estimator([Y, X])
@@ -1042,7 +1048,11 @@
     if jitter_inside_bins:
         out += np.random.uniform(low=-bin_width / 2, high=bin_width / 2, size=out.shape)

-    return out
+    # reintroduce nans
+    Z = X_og
+    Z[mask] = out
+    Z[~mask] = np.nan
+    return Z


 @map_groups(scen=[Grouper.DIM])
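An aside on the hunk above: it uses the classic mask-and-restore idiom, dropping NaN rows before the adjustment and writing results back into a copy of the original array. A standalone illustration in plain NumPy (our own, with made-up values):

import numpy as np

X = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, 5.0]])
X_og = X.copy()
mask = (~np.isnan(X)).all(axis=1)  # rows with no NaN in any column
out = X[mask] * 10                 # stand-in for the actual OTC adjustment
Z = X_og
Z[mask] = out
Z[~mask] = np.nan                  # NaN rows land back in their original slots
# Z is now [[10., 20.], [nan, nan], [40., 50.]] — the input shape is preserved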
@@ -1102,9 +1112,9 @@ def otc_adjust(
     )

     ref_map = {d: f"ref_{d}" for d in dim}
-    ref = ref.rename(ref_map).stack(dim_ref=ref_map.values()).dropna(dim="dim_ref")
+    ref = ref.rename(ref_map).stack(dim_ref=ref_map.values())

-    hist = hist.stack(dim_hist=dim).dropna(dim="dim_hist")
+    hist = hist.stack(dim_hist=dim)

     if isinstance(bin_width, dict):
         bin_width = {
@@ -1134,12 +1144,7 @@
         vectorize=True,
     )

-    # Pad dim differences with NA to please map_blocks
-    ref = ref.unstack().rename({v: k for k, v in ref_map.items()})
     scen = scen.unstack().rename("scen")
-    for d in dim:
-        full_d = xr.concat([ref[d], scen[d]], dim=d).drop_duplicates(d)
-        scen = scen.reindex({d: full_d})

     return scen.to_dataset()

@@ -1193,6 +1198,12 @@ def _dotc_adjust(
     ----------
     :cite:cts:`sdba-robin_2021`
     """
+    # nans are removed and put back in place at the end
+    X1_og = X1.copy()
+    mask = ~np.isnan(X1).any(axis=1)
+    X1 = X1[mask]
+    X0 = X0[~np.isnan(X0).any(axis=1)]
+    Y0 = Y0[~np.isnan(Y0).any(axis=1)]
     # Initialize parameters
     if isinstance(bin_width, dict):
         _bin_width = u.bin_width_estimator([Y0, X0, X1])
@@ -1259,7 +1270,7 @@
         Y1[:, j] = Y0[:, j] + motion[:, j]

     # Map sim to the evolution of ref
-    Z1 = _otc_adjust(
+    out = _otc_adjust(
         X1,
         Y1,
         bin_width=bin_width,
@@ -1268,6 +1279,10 @@
         jitter_inside_bins=jitter_inside_bins,
         normalization=normalization,
     )
+    # reintroduce nans
+    Z1 = X1_og
+    Z1[mask] = out
+    Z1[~mask] = np.nan

     return Z1

@@ -1339,14 +1354,12 @@ def dotc_adjust(

     # Drop data added by map_blocks and prepare for apply_ufunc
     hist_map = {d: f"hist_{d}" for d in dim}
-    hist = (
-        hist.rename(hist_map).stack(dim_hist=hist_map.values()).dropna(dim="dim_hist")
-    )
+    hist = hist.rename(hist_map).stack(dim_hist=hist_map.values())

     ref_map = {d: f"ref_{d}" for d in dim}
-    ref = ref.rename(ref_map).stack(dim_ref=ref_map.values()).dropna(dim="dim_ref")
+    ref = ref.rename(ref_map).stack(dim_ref=ref_map.values())

-    sim = sim.stack(dim_sim=dim).dropna(dim="dim_sim")
+    sim = sim.stack(dim_sim=dim)

     if kind is not None:
         kind = {
@@ -1387,12 +1400,6 @@
         vectorize=True,
     )

-    # Pad dim differences with NA to please map_blocks
-    hist = hist.unstack().rename({v: k for k, v in hist_map.items()})
-    ref = ref.unstack().rename({v: k for k, v in ref_map.items()})
     scen = scen.unstack().rename("scen")
-    for d in dim:
-        full_d = xr.concat([hist[d], ref[d], scen[d]], dim=d).drop_duplicates(d)
-        scen = scen.reindex({d: full_d})

     return scen.to_dataset()
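Putting the pieces together, a minimal end-to-end sketch of the new NaN behaviour (our own, modelled on the tests above; it assumes the optional `POT` (`ot`) and `SBCK` dependencies are installed):

import numpy as np
import pandas as pd
import xarray as xr
from xclim.sdba.adjustment import OTC
from xclim.sdba.processing import stack_variables, unstack_variables

n = 300
time = pd.date_range("2000-01-01", periods=n, freq="D")
rng = np.random.default_rng(42)

def da(values, units, name):
    # Hypothetical helper mirroring the test suite's `series` fixture.
    return xr.DataArray(
        values, dims=("time",), coords={"time": time}, attrs={"units": units}, name=name
    )

ref = xr.merge([da(rng.normal(280, 5, n), "K", "tasmax"), da(rng.normal(270, 5, n), "K", "tasmin")])
hist = xr.merge([da(rng.normal(278, 5, n), "K", "tasmax"), da(rng.normal(268, 5, n), "K", "tasmin")])

# Insert a NaN: it is dropped internally and put back in place at the lowest
# level, so the time axis of `scen` keeps its full length.
hist["tasmax"][10] = np.nan

ref, hist = stack_variables(ref), stack_variables(hist)
scen = OTC.adjust(ref, hist)

assert scen.sizes["time"] == n  # the time array size never changes
assert bool(unstack_variables(scen).tasmax.isnull()[10])  # NaN restored in place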