Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make create_virtual_dataset handle zero-length raw_data correctly #315

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ ignore = [

[tool.ruff.per-file-ignores]
"__init__.py" = ["F401"]
"test_*.py" = ["ANN001"]

[project.optional-dependencies]
dev = ["pre-commit>=3.6.0"]
Expand Down
5 changes: 1 addition & 4 deletions versioned_hdf5/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,10 +398,7 @@ def create_virtual_dataset(
slices = {c: s.reduce() for c, s in slices.items()}

if len(raw_data) == 0:
shape = ()
layout = VirtualLayout((1,), dtype=raw_data.dtype)
vs = VirtualSource(".", name=raw_data.name, shape=(1,), dtype=raw_data.dtype)
layout[0] = vs[()]
layout = VirtualLayout(shape=(0,), dtype=raw_data.dtype)
else:
# Chunks in the raw dataset are expanded along the first dimension only.
# Since the chunks are pointed to by virtual datasets, it doesn't make
Expand Down
34 changes: 34 additions & 0 deletions versioned_hdf5/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -2646,3 +2646,37 @@ def test_versions_property(vfile):
# Delete some versions and check for the correct versions again
delete_versions(vfile, versions_to_delete)
assert set(all_versions(vfile.f)) == set(vfile.versions)


def test_make_empty_dataset(tmp_path):
    """Shrink a populated dataset to length zero across several versions.

    Regression test for https://github.com/deshaw/versioned-hdf5/issues/314.

    The file is deliberately closed and reopened around every operation:
    when everything runs against a single open handle the test passes
    regardless. The cause is not confirmed; it is presumably related to
    flushing of reads/writes.
    """
    h5_path = tmp_path / "tmp.h5"
    initial_values = np.array([1, 2, 3])
    empty_shape = (0,)

    # r0: create the dataset with three elements.
    with h5py.File(h5_path, "w") as h5:
        versioned = VersionedHDF5File(h5)
        with versioned.stage_version("r0") as staged:
            staged.create_dataset("values", data=initial_values)

    # r1: resize the dataset down to zero elements.
    with h5py.File(h5_path, "r+") as h5:
        versioned = VersionedHDF5File(h5)
        with versioned.stage_version("r1") as staged:
            staged["values"].resize(empty_shape)

    # Drop the only version in which the dataset held any data.
    with h5py.File(h5_path, "r+") as h5:
        delete_versions(h5, ["r0"])

    # r2: resize the (already empty) dataset to zero elements again.
    with h5py.File(h5_path, "r+") as h5:
        versioned = VersionedHDF5File(h5)
        with versioned.stage_version("r2") as staged:
            staged["values"].resize(empty_shape)

    # The current version must read back as an empty array.
    with h5py.File(h5_path, "r+") as h5:
        versioned = VersionedHDF5File(h5)
        current = versioned[versioned.current_version]
        assert_equal(current["values"][:], np.array([]))
28 changes: 28 additions & 0 deletions versioned_hdf5/tests/test_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,3 +772,31 @@ def test_write_dataset_compression(h5file):
assert ds.dtype == np.float64
assert ds.compression == "gzip"
assert ds.compression_opts == 3


def test_create_empty_virtual_dataset(setup_vfile):
    """Check that an empty virtual dataset is created without writing raw data.

    Also verifies that the resulting virtual dataset reads back as a
    well-formed empty array (shape ``(0,)``, size 0).
    See https://github.com/deshaw/versioned-hdf5/issues/314 for context.
    """
    name = "empty_dataset"
    version = "r0"

    with setup_vfile(version_name="r0") as f:
        write_dataset(f, "empty_dataset", np.array([]))
        # Zero-length shape and no slices: nothing to map into the raw data.
        create_virtual_dataset(f, version, name, (0,), {})

        version_data = f["_version_data"]

        # The raw dataset should contain nothing but the fill value.
        raw_contents = version_data[name]["raw_data"][:]
        assert_equal(raw_contents, 0.0)

        # The virtual dataset itself should read back as empty.
        virtual_contents = version_data["versions"]["r0"][name][:]
        assert_equal(virtual_contents, np.array([]))
        assert virtual_contents.shape == (0,)
        assert virtual_contents.size == 0
Loading