Skip to content

Commit

Permalink
Review test matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
crusaderky committed Dec 16, 2024
1 parent 7587b7e commit 028cc40
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 159 deletions.
39 changes: 34 additions & 5 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,24 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.9', '3.10', '3.11']
h5py-version: ['dev']
numpy-version: ['latest', '1.24.4']
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
numpy-version: ['latest']
ndindex-version: ['latest']
h5py-version: ['latest']
include:
- python-version: '3.9'
numpy-version: '1.24.4'
ndindex-version: '1.5.1'
h5py-version: '3.8.0'
- python-version: '3.11' # numpy 1.24.4 has wheels up to Python 3.11
numpy-version: '1.24.4'
ndindex-version: 'latest'
h5py-version: 'latest'
- python-version: '3.13'
numpy-version: 'latest'
ndindex-version: 'latest'
h5py-version: 'dev'

fail-fast: false
steps:
- uses: actions/checkout@v4
Expand All @@ -22,9 +37,23 @@ jobs:
- name: Install target numpy version
if: matrix.numpy-version != 'latest'
run: pip install numpy~=${{ matrix.numpy-version }}

- name: Install target ndindex version
if: matrix.ndindex-version != 'latest'
run: pip install ndindex~=${{ matrix.ndindex-version }}

- name: Install latest h5py version
if: matrix.h5py-version == 'latest'
run: |
# Build against system-wide libhdf5
pip install h5py --no-binary :all:
- name: Install target h5py version
if: matrix.h5py-version != 'latest' && matrix.h5py-version != 'dev'
run: |
pip install numpy~=${{ matrix.numpy-version }}
pip list
# Build against system-wide libhdf5
pip install h5py~=${{ matrix.h5py-version }} --no-binary :all:
- name: Install development h5py version
if: matrix.h5py-version == 'dev'
Expand Down
3 changes: 2 additions & 1 deletion docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ Dependencies

Currently, Versioned HDF5 has the following runtime dependencies:

- ``python>=3.6``
- ``numpy``
- ``h5py``
- ``ndindex``

Refer to ``pyproject.toml`` for minimum supported versions.
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ authors = [
]
description = "Versioned HDF5 provides a versioned abstraction on top of h5py"
readme = "README.md"
requires-python = ">=3.8"
requires-python = ">=3.9"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: BSD License",
"Operating System :: OS Independent",
]
dependencies = [
"numpy",
"h5py",
"numpy>=1.24.4",
"h5py>=3.8.0",
"ndindex>=1.5.1",
]
urls = { Homepage = "https://github.com/deshaw/versioned-hdf5" }
Expand Down
80 changes: 28 additions & 52 deletions versioned_hdf5/backend.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from __future__ import annotations

import datetime
import logging
import os
import textwrap
from collections.abc import Iterator

import numpy as np
from h5py import Dataset, VirtualLayout, VirtualSource, h5s, h5z
from h5py import Dataset, VirtualLayout, h5s, h5z
from h5py._hl.filters import guess_chunk
from h5py._hl.selections import select
from h5py._selector import Selector
from ndindex import ChunkSize, Slice, Tuple, ndindex
from numpy.testing import assert_array_equal

Expand All @@ -32,8 +35,6 @@ def get_chunks(shape, dtype, chunk_size):


def initialize(f):
import datetime

from .versions import TIMESTAMP_FMT

version_data = f.create_group("_version_data")
Expand Down Expand Up @@ -438,70 +439,45 @@ def write_dataset_chunks(f, name, data_dict):
def create_virtual_dataset(
f, version_name, name, shape, slices, attrs=None, fillvalue=None
):
from h5py._hl.selections import select
from h5py._hl.vds import VDSmap
"""Create a new virtual dataset by stitching the chunks of the
raw dataset together, as indicated by the slices dict.

See Also
--------
_recreate_virtual_dataset
"""
raw_data = f["_version_data"][name]["raw_data"]
raw_data_shape = raw_data.shape
slices = {c: s.reduce() for c, s in slices.items()}
raw_data_name = raw_data.name.encode("utf-8")

if len(raw_data) == 0:
layout = VirtualLayout(shape=(0,), dtype=raw_data.dtype)
else:
layout = VirtualLayout(shape, dtype=raw_data.dtype)
layout._src_filenames.add(b".")
space = h5s.create_simple(shape)
selector = Selector(space)

# Chunks in the raw dataset are expanded along the first dimension only.
# Since the chunks are pointed to by virtual datasets, it doesn't make
# sense to expand the chunks in the raw dataset along multiple dimensions
# (the true layout of the chunks in the raw dataset is irrelevant).
for c, s in slices.items():
if len(c.args[0]) != len(s):
raise ValueError(f"Inconsistent slices dictionary ({c.args[0]}, {s})")

# h5py 3.3 changed the VirtualLayout code so that it no longer uses
# sources. See https://github.com/h5py/h5py/pull/1905.
layout = VirtualLayout(shape, dtype=raw_data.dtype)
layout_has_sources = hasattr(layout, "sources")
if not layout_has_sources:
from h5py import _selector

layout._src_filenames.add(b".")
space = h5s.create_simple(shape)
selector = _selector.Selector(space)

for c, s in slices.items():
for c, s0 in slices.items():
if len(c.args[0]) != len(s0):
raise ValueError(f"Inconsistent slices dictionary ({c.args[0]}, {s0})")
if c.isempty():
continue
# idx = Tuple(s, *Tuple(*[slice(0, i) for i in shape[1:]]).as_subindex(Tuple(*c.args[1:])).args)
S = [Slice(0, len(c.args[i])) for i in range(1, len(shape))]
idx = Tuple(s, *S)
# assert c.newshape(shape) == vs[idx.raw].shape, (c, shape, s)

# This is equivalent to
#
# layout[c.raw] = vs[idx.raw]
#
# but faster because vs[idx.raw] does a deepcopy(vs), which is
# slow. We need different versions for h5py 2 and 3 because the
# virtual sources code was rewritten.
if not layout_has_sources:
key = idx.raw
vs_sel = select(raw_data.shape, key, dataset=None)

sel = selector.make_selection(c.raw)
layout.dcpl.set_virtual(
sel.id, b".", raw_data.name.encode("utf-8"), vs_sel.id
)

else:
vs_sel = select(raw_data_shape, idx.raw, None)
layout_sel = select(shape, c.raw, None)
layout.sources.append(
VDSmap(layout_sel.id, ".", raw_data.name, vs_sel.id)
)
s = (s0.reduce().raw, *(slice(0, len(ci), 1) for ci in c.args[1:]))

dtype = raw_data.dtype
if dtype.metadata and (
"vlen" in dtype.metadata or "h5py_encoding" in dtype.metadata
):
# This is equivalent to `layout[c] = vs[s]`,
# but faster because vs[s] deep-copies vs, which is slow.
vs_sel = select(raw_data_shape, s, dataset=None)
sel = selector.make_selection(c.raw)
layout.dcpl.set_virtual(sel.id, b".", raw_data_name, vs_sel.id)

dtype_meta = raw_data.dtype.metadata
if dtype_meta and ("vlen" in dtype_meta or "h5py_encoding" in dtype_meta):
# Variable length string dtype
# (https://h5py.readthedocs.io/en/2.10.0/strings.html). Setting the
# fillvalue in this case doesn't work
Expand Down
58 changes: 21 additions & 37 deletions versioned_hdf5/replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from h5py import __version__ as h5py_version
from h5py import h5s
from h5py._hl.selections import select
from h5py._hl.vds import VDSmap
from h5py._selector import Selector
from h5py.h5i import get_name
from ndindex import ChunkSize, Slice, Tuple
from ndindex.ndindex import NDIndex
Expand Down Expand Up @@ -338,6 +338,9 @@ def _recreate_virtual_dataset(f, name, versions, raw_data_chunks_map, tmp=False)
are placed alongside the existing ones. Otherwise the existing virtual
datasets are replaced.

See Also
--------
create_virtual_dataset
"""
raw_data = f["_version_data"][name]["raw_data"]

Expand All @@ -347,47 +350,32 @@ def _recreate_virtual_dataset(f, name, versions, raw_data_chunks_map, tmp=False)

group = f["_version_data/versions"][version_name]
dataset = group[name]

# See the comments in create_virtual_dataset
layout = VirtualLayout(dataset.shape, dtype=dataset.dtype)
layout_has_sources = hasattr(layout, "sources")

if not layout_has_sources:
from h5py import _selector

# If a dataset has no data except for the fillvalue, it will not be virtual
if dataset.is_virtual:
layout._src_filenames.add(b".")
space = h5s.create_simple(dataset.shape)
selector = _selector.Selector(space)
selector = Selector(space)
raw_data_shape = raw_data.shape
raw_data_name = raw_data.name.encode("utf-8")

# If a dataset has no data except for the fillvalue, it will not be virtual
if dataset.is_virtual:
virtual_sources = dataset.virtual_sources()
for vmap in virtual_sources:
vspace, fname, dset_name, src_space = vmap
fname = fname.encode("utf-8")
assert fname == b".", fname
vspace, fname, _, src_space = vmap
assert fname == "."

vslice = spaceid_to_slice(vspace)
src_slice = spaceid_to_slice(src_space)
if src_slice not in raw_data_chunks_map:
raise ValueError(
f"Could not find the chunk for {vslice} ({src_slice} in the old raw dataset) for {name!r} in {version_name!r}"
)
new_src_slice = raw_data_chunks_map[src_slice]

if not layout_has_sources:
key = new_src_slice.raw
vs_sel = select(raw_data.shape, key, dataset=None)

sel = selector.make_selection(vslice.raw)
layout.dcpl.set_virtual(
sel.id, b".", raw_data.name.encode("utf-8"), vs_sel.id
)
else:
vs_sel = select(raw_data.shape, new_src_slice.raw, None)
layout_sel = select(dataset.shape, vslice.raw, None)
new_vmap = VDSmap(layout_sel.id, fname, dset_name, vs_sel.id)
layout.sources.append(new_vmap)
new_src_slice = raw_data_chunks_map[src_slice]
vs_sel = select(raw_data_shape, new_src_slice.raw, dataset=None)
sel = selector.make_selection(vslice.raw)
layout.dcpl.set_virtual(sel.id, b".", raw_data_name, vs_sel.id)

head, tail = posixpath.split(name)
tmp_name = "_tmp_" + tail
Expand Down Expand Up @@ -743,16 +731,12 @@ def _new_vds_layout(d, name1, name2):
vspace, fname, dset_name, src_space = vmap
assert dset_name.startswith(name1)
dset_name = _replace_prefix(dset_name, name1, name2)
fname = fname.encode("utf-8")
new_vmap = VDSmap(vspace, fname, dset_name, src_space)
# h5py 3.3 changed the VirtualLayout code. See
# https://github.com/h5py/h5py/pull/1905.
if hasattr(layout, "sources"):
layout.sources.append(new_vmap)
else:
layout.dcpl.set_virtual(
vspace, fname, dset_name.encode("utf-8"), src_space
)
layout.dcpl.set_virtual(
vspace,
fname.encode("utf-8"),
dset_name.encode("utf-8"),
src_space,
)
return layout

old_layout = _new_vds_layout(oldd, old.name, new.name)
Expand Down
Loading

0 comments on commit 028cc40

Please sign in to comment.