Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Review test matrix; test vs. h5py>=3.8.0 #407

Merged
merged 1 commit into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 34 additions & 5 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,24 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.9', '3.10', '3.11']
h5py-version: ['dev']
numpy-version: ['latest', '1.24.4']
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
numpy-version: ['latest']
ndindex-version: ['latest']
h5py-version: ['latest']
include:
- python-version: '3.9'
numpy-version: '1.24.4'
ndindex-version: '1.5.1'
h5py-version: '3.8.0'
- python-version: '3.11' # numpy 1.24.4 has wheels up to Python 3.11
numpy-version: '1.24.4'
ndindex-version: 'latest'
h5py-version: 'latest'
- python-version: '3.13'
numpy-version: 'latest'
ndindex-version: 'latest'
h5py-version: 'dev'

fail-fast: false
steps:
- uses: actions/checkout@v4
Expand All @@ -22,9 +37,23 @@ jobs:

- name: Install target numpy version
if: matrix.numpy-version != 'latest'
run: pip install numpy~=${{ matrix.numpy-version }}

- name: Install target ndindex version
if: matrix.ndindex-version != 'latest'
run: pip install ndindex~=${{ matrix.ndindex-version }}

- name: Install latest h5py version
if: matrix.h5py-version == 'latest'
run: |
# Build against system-wide libhdf5
pip install h5py --no-binary :all:

- name: Install target h5py version
if: matrix.h5py-version != 'latest' && matrix.h5py-version != 'dev'
run: |
pip install numpy~=${{ matrix.numpy-version }}
pip list
# Build against system-wide libhdf5
pip install h5py~=${{ matrix.h5py-version }} --no-binary :all:

- name: Install development h5py version
if: matrix.h5py-version == 'dev'
Expand Down
3 changes: 2 additions & 1 deletion docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ Dependencies

Currently, Versioned HDF5 has the following runtime dependencies:

- ``python>=3.6``
- ``numpy``
- ``h5py``
- ``ndindex``

Refer to ``pyproject.toml`` for minimum supported versions.
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ authors = [
]
description = "Versioned HDF5 provides a versioned abstraction on top of h5py"
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.9"
peytondmurray marked this conversation as resolved.
Show resolved Hide resolved
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
]
dependencies = [
"numpy",
"h5py",
"numpy>=1.24.4",
peytondmurray marked this conversation as resolved.
Show resolved Hide resolved
"h5py>=3.8.0",
peytondmurray marked this conversation as resolved.
Show resolved Hide resolved
"ndindex>=1.5.1",
]
urls = { Homepage = "https://github.com/deshaw/versioned-hdf5" }
Expand Down
80 changes: 28 additions & 52 deletions versioned_hdf5/backend.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from __future__ import annotations

import datetime
import logging
import os
import textwrap
from collections.abc import Iterator

import numpy as np
from h5py import Dataset, VirtualLayout, VirtualSource, h5s, h5z
from h5py import Dataset, VirtualLayout, h5s, h5z
from h5py._hl.filters import guess_chunk
from h5py._hl.selections import select
from h5py._selector import Selector
from ndindex import ChunkSize, Slice, Tuple, ndindex
from numpy.testing import assert_array_equal

Expand All @@ -32,8 +35,6 @@ def get_chunks(shape, dtype, chunk_size):


def initialize(f):
import datetime

from .versions import TIMESTAMP_FMT

version_data = f.create_group("_version_data")
Expand Down Expand Up @@ -438,70 +439,45 @@ def write_dataset_chunks(f, name, data_dict):
def create_virtual_dataset(
f, version_name, name, shape, slices, attrs=None, fillvalue=None
):
from h5py._hl.selections import select
from h5py._hl.vds import VDSmap
"""Create a new virtual dataset by stitching the chunks of the
raw dataset together, as indicated by the slices dict.

See Also
--------
_recreate_virtual_dataset
"""
raw_data = f["_version_data"][name]["raw_data"]
raw_data_shape = raw_data.shape
slices = {c: s.reduce() for c, s in slices.items()}
raw_data_name = raw_data.name.encode("utf-8")

if len(raw_data) == 0:
layout = VirtualLayout(shape=(0,), dtype=raw_data.dtype)
else:
layout = VirtualLayout(shape, dtype=raw_data.dtype)
layout._src_filenames.add(b".")
space = h5s.create_simple(shape)
selector = Selector(space)
Comment on lines +456 to +459
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


# Chunks in the raw dataset are expanded along the first dimension only.
# Since the chunks are pointed to by virtual datasets, it doesn't make
# sense to expand the chunks in the raw dataset along multiple dimensions
# (the true layout of the chunks in the raw dataset is irrelevant).
for c, s in slices.items():
if len(c.args[0]) != len(s):
raise ValueError(f"Inconsistent slices dictionary ({c.args[0]}, {s})")

# h5py 3.3 changed the VirtualLayout code so that it no longer uses
# sources. See https://github.com/h5py/h5py/pull/1905.
layout = VirtualLayout(shape, dtype=raw_data.dtype)
layout_has_sources = hasattr(layout, "sources")
if not layout_has_sources:
from h5py import _selector

layout._src_filenames.add(b".")
space = h5s.create_simple(shape)
selector = _selector.Selector(space)

for c, s in slices.items():
for c, s0 in slices.items():
if len(c.args[0]) != len(s0):
raise ValueError(f"Inconsistent slices dictionary ({c.args[0]}, {s0})")
if c.isempty():
continue
# idx = Tuple(s, *Tuple(*[slice(0, i) for i in shape[1:]]).as_subindex(Tuple(*c.args[1:])).args)
S = [Slice(0, len(c.args[i])) for i in range(1, len(shape))]
idx = Tuple(s, *S)
# assert c.newshape(shape) == vs[idx.raw].shape, (c, shape, s)

# This is equivalent to
#
# layout[c.raw] = vs[idx.raw]
#
# but faster because vs[idx.raw] does a deepcopy(vs), which is
# slow. We need different versions for h5py 2 and 3 because the
# virtual sources code was rewritten.
if not layout_has_sources:
key = idx.raw
vs_sel = select(raw_data.shape, key, dataset=None)

sel = selector.make_selection(c.raw)
layout.dcpl.set_virtual(
sel.id, b".", raw_data.name.encode("utf-8"), vs_sel.id
)

else:
vs_sel = select(raw_data_shape, idx.raw, None)
layout_sel = select(shape, c.raw, None)
layout.sources.append(
VDSmap(layout_sel.id, ".", raw_data.name, vs_sel.id)
)
s = (s0.reduce().raw, *(slice(0, len(ci), 1) for ci in c.args[1:]))

dtype = raw_data.dtype
if dtype.metadata and (
"vlen" in dtype.metadata or "h5py_encoding" in dtype.metadata
):
# This is equivalent to `layout[c] = vs[s]`,
# but faster because vs[s] deep-copies vs, which is slow.
vs_sel = select(raw_data_shape, s, dataset=None)
sel = selector.make_selection(c.raw)
layout.dcpl.set_virtual(sel.id, b".", raw_data_name, vs_sel.id)

dtype_meta = raw_data.dtype.metadata
if dtype_meta and ("vlen" in dtype_meta or "h5py_encoding" in dtype_meta):
# Variable length string dtype
# (https://h5py.readthedocs.io/en/2.10.0/strings.html). Setting the
# fillvalue in this case doesn't work
Expand Down
58 changes: 21 additions & 37 deletions versioned_hdf5/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from h5py import __version__ as h5py_version
from h5py import h5s
from h5py._hl.selections import select
from h5py._hl.vds import VDSmap
from h5py._selector import Selector
from h5py.h5i import get_name
from ndindex import ChunkSize, Slice, Tuple
from ndindex.ndindex import NDIndex
Expand Down Expand Up @@ -338,6 +338,9 @@ def _recreate_virtual_dataset(f, name, versions, raw_data_chunks_map, tmp=False)
are placed alongside the existing ones. Otherwise the existing virtual
datasets are replaced.

See Also
--------
create_virtual_dataset
"""
raw_data = f["_version_data"][name]["raw_data"]

Expand All @@ -347,47 +350,32 @@ def _recreate_virtual_dataset(f, name, versions, raw_data_chunks_map, tmp=False)

group = f["_version_data/versions"][version_name]
dataset = group[name]

# See the comments in create_virtual_dataset
layout = VirtualLayout(dataset.shape, dtype=dataset.dtype)
layout_has_sources = hasattr(layout, "sources")

if not layout_has_sources:
from h5py import _selector

# If a dataset has no data except for the fillvalue, it will not be virtual
if dataset.is_virtual:
layout._src_filenames.add(b".")
space = h5s.create_simple(dataset.shape)
selector = _selector.Selector(space)
selector = Selector(space)
raw_data_shape = raw_data.shape
raw_data_name = raw_data.name.encode("utf-8")

# If a dataset has no data except for the fillvalue, it will not be virtual
if dataset.is_virtual:
virtual_sources = dataset.virtual_sources()
for vmap in virtual_sources:
vspace, fname, dset_name, src_space = vmap
fname = fname.encode("utf-8")
assert fname == b".", fname
vspace, fname, _, src_space = vmap
assert fname == "."

vslice = spaceid_to_slice(vspace)
src_slice = spaceid_to_slice(src_space)
if src_slice not in raw_data_chunks_map:
raise ValueError(
f"Could not find the chunk for {vslice} ({src_slice} in the old raw dataset) for {name!r} in {version_name!r}"
)
new_src_slice = raw_data_chunks_map[src_slice]

if not layout_has_sources:
key = new_src_slice.raw
vs_sel = select(raw_data.shape, key, dataset=None)

sel = selector.make_selection(vslice.raw)
layout.dcpl.set_virtual(
sel.id, b".", raw_data.name.encode("utf-8"), vs_sel.id
)
else:
vs_sel = select(raw_data.shape, new_src_slice.raw, None)
layout_sel = select(dataset.shape, vslice.raw, None)
new_vmap = VDSmap(layout_sel.id, fname, dset_name, vs_sel.id)
layout.sources.append(new_vmap)
new_src_slice = raw_data_chunks_map[src_slice]
vs_sel = select(raw_data_shape, new_src_slice.raw, dataset=None)
sel = selector.make_selection(vslice.raw)
layout.dcpl.set_virtual(sel.id, b".", raw_data_name, vs_sel.id)

head, tail = posixpath.split(name)
tmp_name = "_tmp_" + tail
Expand Down Expand Up @@ -743,16 +731,12 @@ def _new_vds_layout(d, name1, name2):
vspace, fname, dset_name, src_space = vmap
assert dset_name.startswith(name1)
dset_name = _replace_prefix(dset_name, name1, name2)
fname = fname.encode("utf-8")
new_vmap = VDSmap(vspace, fname, dset_name, src_space)
# h5py 3.3 changed the VirtualLayout code. See
# https://github.com/h5py/h5py/pull/1905.
if hasattr(layout, "sources"):
layout.sources.append(new_vmap)
else:
layout.dcpl.set_virtual(
vspace, fname, dset_name.encode("utf-8"), src_space
)
layout.dcpl.set_virtual(
vspace,
fname.encode("utf-8"),
dset_name.encode("utf-8"),
src_space,
)
return layout

old_layout = _new_vds_layout(oldd, old.name, new.name)
Expand Down
Loading
Loading