deshaw · peytondmurray · Mar 25, 2024 · Mar 12, 2024
diff --git a/pyproject.toml b/pyproject.toml
@@ -91,6 +91,7 @@ ignore = [
 
 [tool.ruff.per-file-ignores]
 "__init__.py" = ["F401"]
+"test_*.py" = ["ANN001"]
 
 [project.optional-dependencies]
 dev = ["pre-commit>=3.6.0"]

diff --git a/versioned_hdf5/backend.py b/versioned_hdf5/backend.py
@@ -398,10 +398,7 @@ def create_virtual_dataset(
     slices = {c: s.reduce() for c, s in slices.items()}
 
     if len(raw_data) == 0:
-        shape = ()
-        layout = VirtualLayout((1,), dtype=raw_data.dtype)
-        vs = VirtualSource(".", name=raw_data.name, shape=(1,), dtype=raw_data.dtype)
-        layout[0] = vs[()]
+        layout = VirtualLayout(shape=(0,), dtype=raw_data.dtype)
     else:
         # Chunks in the raw dataset are expanded along the first dimension only.
         # Since the chunks are pointed to by virtual datasets, it doesn't make

diff --git a/versioned_hdf5/tests/test_api.py b/versioned_hdf5/tests/test_api.py
@@ -2646,3 +2646,37 @@ def test_versions_property(vfile):
     # Delete some versions and check for the correct versions again
     delete_versions(vfile, versions_to_delete)
     assert set(all_versions(vfile.f)) == set(vfile.versions)
+
+
+def test_make_empty_dataset(tmp_path):
+    """Check that a dataset can be created and then resized to empty across versions.
+
+    The file is closed and reopened between every operation because the test would
+    otherwise pass regardless. Why is not established; it is presumably related to
+    flushing of reads/writes.
+
+    See https://github.com/deshaw/versioned-hdf5/issues/314 for context.
+    """
+    path = tmp_path / "tmp.h5"
+    with h5py.File(path, "w") as f:
+        vf = VersionedHDF5File(f)
+        with vf.stage_version("r0") as sv:
+            sv.create_dataset("values", data=np.array([1, 2, 3]))
+
+    with h5py.File(path, "r+") as f:
+        vf = VersionedHDF5File(f)
+        with vf.stage_version("r1") as sv:
+            sv["values"].resize((0,))
+
+    with h5py.File(path, "r+") as f:
+        delete_versions(f, ["r0"])
+
+    with h5py.File(path, "r+") as f:
+        vf = VersionedHDF5File(f)
+        with vf.stage_version("r2") as sv:
+            sv["values"].resize((0,))
+
+    with h5py.File(path, "r+") as f:
+        vf = VersionedHDF5File(f)
+        cv = vf[vf.current_version]
+        assert_equal(cv["values"][:], np.array([]))
diff --git a/versioned_hdf5/tests/test_backend.py b/versioned_hdf5/tests/test_backend.py
@@ -772,3 +772,31 @@ def test_write_dataset_compression(h5file):
     assert ds.dtype == np.float64
     assert ds.compression == "gzip"
     assert ds.compression_opts == 3
+
+
+def test_create_empty_virtual_dataset(setup_vfile):
+    """Check that creating an empty virtual dataset writes no raw data.
+
+    Also check that the empty virtual dataset is formed correctly.
+    See https://github.com/deshaw/versioned-hdf5/issues/314 for context.
+    """
+    name = "empty_dataset"
+
+    with setup_vfile(version_name="r0") as f:
+        write_dataset(f, "empty_dataset", np.array([]))
+        create_virtual_dataset(
+            f,
+            "r0",
+            name,
+            (0,),
+            {},
+        )
+
+        # Check that the raw data has only fill_value in it
+        assert_equal(f["_version_data"][name]["raw_data"][:], 0.0)
+
+        # Check that the virtual data is empty
+        ds = f["_version_data"]["versions"]["r0"][name][:]
+        assert_equal(ds, np.array([]))
+        assert ds.shape == (0,)
+        assert ds.size == 0