diff --git a/versioned_hdf5/meson.build b/versioned_hdf5/meson.build index 74da38ff..8fdcedc1 100644 --- a/versioned_hdf5/meson.build +++ b/versioned_hdf5/meson.build @@ -34,7 +34,6 @@ py.extension_module( subdir: 'versioned_hdf5', dependencies: compiled_deps, include_directories: [npy_include_path], - override_options : ['cython_language=cpp'], ) cython_args = [ diff --git a/versioned_hdf5/slicetools.pyx b/versioned_hdf5/slicetools.pyx index aaccd9df..6c6de32f 100644 --- a/versioned_hdf5/slicetools.pyx +++ b/versioned_hdf5/slicetools.pyx @@ -1,4 +1,3 @@ -# distutils: language=c++ import sys from functools import lru_cache @@ -18,7 +17,6 @@ from numpy.typing import ArrayLike from libc.stddef cimport ptrdiff_t, size_t from libc.stdio cimport FILE, fclose -from libcpp.vector cimport vector from versioned_hdf5.cytools import np_hsize_t from versioned_hdf5.cytools cimport ceil_a_over_b, count2stop, hsize_t, stop2count @@ -145,100 +143,6 @@ def hyperslab_to_slice(start, stride, count, block): return Slice(start, end, stride) -@cython.infer_types(True) -cdef _spaceid_to_slice(space_id: hid_t): - """ - Helper function to read the data for `space_id` selection and - convert it to a Tuple of slices. - """ - sel_type = H5Sget_select_type(space_id) - - if sel_type == H5S_sel_type.H5S_SEL_ALL: - return Tuple() - elif sel_type == H5S_sel_type.H5S_SEL_HYPERSLABS: - slices = [] - - rank = H5Sget_simple_extent_ndims(space_id) - if rank < 0: - raise HDF5Error() - start_array = vector[hsize_t](rank) - stride_array = vector[hsize_t](rank) - count_array = vector[hsize_t](rank) - block_array = vector[hsize_t](rank) - - if H5Sget_regular_hyperslab( - space_id, - start_array.data(), - stride_array.data(), - count_array.data(), - block_array.data(), - ) < 0: - raise HDF5Error() - - for i in range(rank): - start = start_array[i] - stride = stride_array[i] - count = count_array[i] - block = block_array[i] - if not (block == 1 or count == 1): - raise NotImplementedError("Nontrivial blocks are not yet supported") - end = start + (stride * (count - 1) + 1) * block - stride = stride if block == 1 else 1 - slices.append(Slice(start, end, stride)) - - return Tuple(*slices) - elif sel_type == H5S_sel_type.H5S_SEL_NONE: - return Tuple(Slice(0, 0)) - else: - raise NotImplementedError("Point selections are not yet supported") - - -@cython.infer_types(True) -cpdef build_data_dict(dcpl, raw_data_name: str): - """ - Function to build the "data_dict" of a versioned virtual dataset. - - All virtual datasets created by versioned-hdf5 should have chunks in - exactly one raw dataset `raw_data_name` in the same file. - This function blindly assumes this is the case. - - :param dcpl: the dataset creation property list of the versioned dataset - :param raw_data_name: the name of the corresponding raw dataset - :return: a dictionary mapping the `Tuple` of the virtual dataset chunk - to a `Slice` in the raw dataset. 
- """ - data_dict = {} - - with phil: - dcpl_id: hid_t = dcpl.id - virtual_count: size_t = dcpl.get_virtual_count() - - for j in range(virtual_count): - vspace_id = H5Pget_virtual_vspace(dcpl_id, j) - if vspace_id == H5I_INVALID_HID: - raise HDF5Error() - try: - vspace_slice_tuple = _spaceid_to_slice(vspace_id) - finally: - if H5Sclose(vspace_id) < 0: - raise HDF5Error() - - srcspace_id = H5Pget_virtual_srcspace(dcpl_id, j) - if srcspace_id == H5I_INVALID_HID: - raise HDF5Error() - try: - srcspace_slice_tuple = _spaceid_to_slice(srcspace_id) - finally: - if H5Sclose(srcspace_id) < 0: - raise HDF5Error() - - # the slice into the raw_data (srcspace_slice_tuple) is only - # on the first axis - data_dict[vspace_slice_tuple] = srcspace_slice_tuple.args[0] - - return data_dict - - @cython.boundscheck(False) @cython.wraparound(False) @cython.infer_types(True) diff --git a/versioned_hdf5/subchunk_map.pxd b/versioned_hdf5/subchunk_map.pxd index 4195bcf3..6a190a70 100644 --- a/versioned_hdf5/subchunk_map.pxd +++ b/versioned_hdf5/subchunk_map.pxd @@ -10,7 +10,6 @@ cdef class IndexChunkMapper: cdef readonly hsize_t n_chunks cdef readonly hsize_t last_chunk_size - cpdef tuple[object, object, object] chunk_submap(self, hsize_t chunk_idx) cpdef tuple[object, object | None] read_many_slices_params(self) cpdef object chunks_indexer(self) diff --git a/versioned_hdf5/subchunk_map.py b/versioned_hdf5/subchunk_map.py index 8b9f6bd8..8753bde3 100755 --- a/versioned_hdf5/subchunk_map.py +++ b/versioned_hdf5/subchunk_map.py @@ -5,23 +5,15 @@ import abc import enum -import itertools -from collections.abc import Iterator from typing import TYPE_CHECKING, Any import cython import numpy as np from cython import bint, ssize_t -from ndindex import ChunkSize, Slice, Tuple, ndindex +from ndindex import ChunkSize, Tuple, ndindex from numpy.typing import NDArray -from .cytools import ( - ceil_a_over_b, - count2stop, - np_hsize_t, - smallest_step_after, - stop2count, -) +from .cytools import ceil_a_over_b, np_hsize_t, smallest_step_after, stop2count from .tools import asarray if TYPE_CHECKING: # pragma: nocover @@ -48,15 +40,6 @@ ) -class DropAxis(enum.Enum): - _drop_axis = 0 - - -# Returned instead of an AnySlicer. Signals that the axis should be removed when -# aggregated into an AnySlicerND. -DROP_AXIS = DropAxis._drop_axis - - # This is an abstract class, which should inherit from abc.ABC # or have metaclass=abc.ABCMeta. Neither are supported by Cython though. @cython.cclass @@ -104,29 +87,6 @@ def _chunk_start_stop(self, chunk_idx: hsize_t) -> tuple[hsize_t, hsize_t]: stop = min(start + self.chunk_size, self.dset_size) return start, stop - @cython.ccall - @abc.abstractmethod - def chunk_submap( - self, chunk_idx: hsize_t - ) -> tuple[Slice, AnySlicer | DropAxis, AnySlicer]: - """Given a chunk index, return a tuple of - - data_dict key - key of the data_dict (see build_data_dict()) - value_subidx - the slicer selecting the points within the sliced array - (the return value for __getitem__, the value parameter for __setitem__) - chunk_subidx - the slicer selecting the points within the input chunks. 
- - In other words, in the simplified one-dimensional case: - - _, value_subidx, chunk_subidx = mapper.chunk_submap(i) - chunk_view = base_arr[chunk_idx * chunk_size:(chunk_idx + 1) * chunk_size] - return_value[value_subidx] = chunk_view[chunk_subidx] # __getitem__ - chunk_view[chunk_subidx] = value_param[value_subidx] # __setitem__ - """ - @cython.ccall @abc.abstractmethod def read_many_slices_params( @@ -362,31 +322,6 @@ def __init__( super().__init__(chunk_indices, dset_size, chunk_size) - @cython.ccall - def chunk_submap(self, chunk_idx: hsize_t) -> tuple[Slice, slice, slice]: - chunk_start, chunk_stop = self._chunk_start_stop(chunk_idx) - sel_start = self.start - sel_stop = self.stop - sel_step = self.step - - abs_start = max(chunk_start, sel_start) - # Get the smallest lcm multiple of common that is >= start - abs_start = smallest_step_after(abs_start, sel_start % sel_step, sel_step) - - # shift start so that it is relative to index - value_sub_start = (abs_start - sel_start) // sel_step - value_sub_stop = ceil_a_over_b(min(chunk_stop, sel_stop) - sel_start, sel_step) - - chunk_sub_start = abs_start - chunk_start - count = value_sub_stop - value_sub_start - chunk_sub_stop = count2stop(chunk_sub_start, count, sel_step) - - return ( - Slice(chunk_start, chunk_stop, 1), - slice(value_sub_start, value_sub_stop, 1), - slice(chunk_sub_start, chunk_sub_stop, sel_step), - ) - @cython.cfunc @cython.nogil @cython.exceptval(check=False) @@ -465,12 +400,6 @@ def __init__(self, idx: hsize_t, dset_size: hsize_t, chunk_size: hsize_t): chunk_indices = np.array([idx // chunk_size], dtype=np_hsize_t) super().__init__(chunk_indices, dset_size, chunk_size) - @cython.ccall - def chunk_submap(self, chunk_idx: hsize_t) -> tuple[Slice, DropAxis, int]: - chunk_start, chunk_stop = self._chunk_start_stop(chunk_idx) - chunk_sub_idx = self.idx - chunk_start - return Slice(chunk_start, chunk_stop, 1), DROP_AXIS, chunk_sub_idx - @cython.cfunc @cython.nogil @cython.exceptval(check=False) @@ -527,14 +456,6 @@ def __init__(self, other: IndexChunkMapper): self._chunks_indexer = other.chunks_indexer() super().__init__(other.chunk_indices, other.dset_size, other.chunk_size) - @cython.ccall - def chunk_submap( - self, chunk_idx: hsize_t - ) -> tuple[Slice, AnySlicer | DropAxis, AnySlicer]: - raise NotImplementedError( # pragma: nocover - "not used in legacy as_subchunk_map" - ) - @cython.cfunc @cython.nogil @cython.exceptval(check=False) @@ -589,28 +510,6 @@ def __init__( chunk_indices = np.unique(idx // chunk_size) super().__init__(chunk_indices, dset_size, chunk_size) - @cython.ccall - def chunk_submap( - self, chunk_idx: hsize_t - ) -> tuple[Slice, NDArray[np_hsize_t] | slice, NDArray[np_hsize_t] | slice]: - chunk_start, chunk_stop = self._chunk_start_stop(chunk_idx) - - if self.is_ascending: - # O(n*logn) - start_idx, stop_idx = np.searchsorted(self.idx, [chunk_start, chunk_stop]) - mask = slice(int(start_idx), int(stop_idx), 1) - # TODO optimize monotonic descending - else: - # O(n^2) - mask = (chunk_start <= self.idx) & (self.idx < chunk_stop) - mask = _maybe_array_idx_to_slice(mask) - - return ( - Slice(chunk_start, chunk_stop, 1), - mask, - _maybe_array_idx_to_slice(self.idx[mask] - chunk_start), - ) - @cython.ccall def read_many_slices_params( self, @@ -1206,72 +1105,3 @@ def read_many_slices_params_nd( assert outs == nslices return ndslices - - -def as_subchunk_map( - idx: Any, - shape: tuple[int, ...], - chunk_size: tuple[int, ...] 
| ChunkSize, -) -> Iterator[ - tuple[ - Tuple, - AnySlicerND, - AnySlicerND, - ] -]: - """Computes the chunk selection assignment. In particular, given a `chunk_size` - it returns triple (chunk_idx, value_sub_idx, chunk_sub_idx) such that for a - chunked Dataset `ds` we can translate selections like - - >> value = ds[idx] - - into selecting from the individual chunks of `ds` as - - >> value = np.empty(output_shape) - >> for chunk_idx, value_sub_idx, chunk_sub_idx in as_subchunk_map( - .. idx, ds.shape, ds.chunk_size - .. ): - .. value[value_sub_idx] = ds.data_dict[chunk_idx][chunk_sub_idx] - - Similarly, assignments like - - >> ds[idx] = value - - can be translated into - - >> for chunk_idx, value_sub_idx, chunk_sub_idx in as_subchunk_map( - .. idx, ds.shape, ds.chunk_size - .. ): - .. ds.data_dict[chunk_idx][chunk_sub_idx] = value[value_sub_idx] - - :param idx: the "index" to read from / write to the Dataset - :param shape: the shape of the Dataset - :param chunk_size: the `ChunkSize` of the Dataset - :return: a generator of `(chunk_idx, value_sub_idx, chunk_sub_idx)` tuples - """ - idx, mappers = index_chunk_mappers(idx, shape, chunk_size) - if not mappers: - return - idx_len = len(idx.args) - - mapper: IndexChunkMapper # noqa: F842 - chunk_subindexes = [ - [mapper.chunk_submap(chunk_idx) for chunk_idx in mapper.chunk_indices] - for mapper in mappers - ] - - # Now combine the chunk_slices and subindexes for each dimension into tuples - # across all dimensions. - for p in itertools.product(*chunk_subindexes): - chunk_slices, value_subidxs, chunk_subidxs = zip(*p) - - # skip dimensions which were sliced away - value_subidxs = tuple( - value_subidx - for value_subidx in value_subidxs - if value_subidx is not DROP_AXIS - ) - # skip suffix dimensions - chunk_subidxs = chunk_subidxs[:idx_len] - - yield Tuple(*chunk_slices), value_subidxs, chunk_subidxs diff --git a/versioned_hdf5/tests/test_subchunk_map.py b/versioned_hdf5/tests/test_subchunk_map.py index 0070b167..29f35778 100644 --- a/versioned_hdf5/tests/test_subchunk_map.py +++ b/versioned_hdf5/tests/test_subchunk_map.py @@ -13,11 +13,9 @@ from ..cytools import np_hsize_t from ..slicetools import read_many_slices from ..subchunk_map import ( - DROP_AXIS, EntireChunksMapper, SliceMapper, TransferType, - as_subchunk_map, index_chunk_mappers, read_many_slices_params_nd, ) @@ -155,34 +153,6 @@ def idx_shape_chunks_st( return idx, shape, chunks -@pytest.mark.slow -@given(idx_shape_chunks_st()) -@hypothesis.settings(max_examples=max_examples, deadline=None) -def test_as_subchunk_map(args): - idx, shape, chunks = args - - source = np.arange(1, np.prod(shape) + 1, dtype=np.int32).reshape(shape) - expect = source[idx] - actual = np.zeros_like(expect) - - for chunk_idx, value_sub_idx, chunk_sub_idx in as_subchunk_map(idx, shape, chunks): - chunk_idx = chunk_idx.raw - - # Test that chunk_idx selects whole chunks - assert isinstance(chunk_idx, tuple) - assert len(chunk_idx) == len(chunks) - for i, c, d in zip(chunk_idx, chunks, shape): - assert isinstance(i, slice) - assert i.start % c == 0 - assert i.stop == min(i.start + c, d) - assert i.step == 1 - - assert not actual[value_sub_idx].any(), "overlapping value_sub_idx" - actual[value_sub_idx] = source[chunk_idx][chunk_sub_idx] - - assert_array_equal(actual, expect) - - def test_mapper_attributes(): _, (mapper,) = index_chunk_mappers(slice(5), (6,), (3,)) assert mapper.dset_size == 6 @@ -208,14 +178,11 @@ def test_chunks_indexer(args): return # Early exit for empty index assert len(shape) == 
len(chunks) == len(mappers) == 1 dset_size = shape[0] + chunk_size = chunks[0] mapper = mappers[0] assert mapper.dset_size == shape[0] assert mapper.chunk_size == chunks[0] - source = np.arange(1, dset_size + 1) - expect = source[idx] - actual = np.zeros_like(expect) - all_chunks = np.arange(mapper.n_chunks) sel_chunks = all_chunks[mapper.chunks_indexer()] whole_chunks = sel_chunks[mapper.whole_chunks_idxidx()] @@ -227,23 +194,37 @@ def test_chunks_indexer(args): # Test that whole_chunks is a subset of sel_chunks assert np.setdiff1d(whole_chunks, sel_chunks, assume_unique=True).size == 0 - for i in sel_chunks: - source_idx, value_sub_idx, chunk_sub_idx = mapper.chunk_submap(i) - chunk = source[source_idx.raw] + # Test correctness of chunks_indexer() and whole_chunks_idxidx() + src = np.arange(1, dset_size + 1) + dst = np.zeros_like(src) + slices, chunk_to_slices = mapper.read_many_slices_params() - if value_sub_idx is DROP_AXIS: - value_sub_idx = () - actual[value_sub_idx] = chunk[chunk_sub_idx] + slice_offsets = np.arange(0, dset_size, chunk_size, dtype=np_hsize_t) + slice_offsets = slice_offsets[mapper.chunk_indices] + if chunk_to_slices is not None: + slice_offsets = np.repeat(slice_offsets, np.diff(chunk_to_slices).astype(int)) + slices[:, 0] += slice_offsets - coverage = np.zeros_like(chunk) - coverage[chunk_sub_idx] = 1 - assert coverage.any(), "chunk selected by chunk_indexer() is not covered" - if i in whole_chunks: - assert coverage.all(), "whole chunk is partially covered" - else: - assert not coverage.all(), "partial chunk is wholly covered" + read_many_slices( + src=src, + dst=dst, + src_start=slices[:, 0:1], # chunk_sub_start + dst_start=slices[:, 0:1], # chunk_sub_start + count=slices[:, 2:3], # count + src_stride=slices[:, 3:4], # chunk_sub_stride + dst_stride=slices[:, 3:4], # chunk_sub_stride + ) - assert_array_equal(actual, expect) + actual_sel_chunks = [] + actual_whole_chunks = [] + for chunk_idx in all_chunks: + dst_chunk = dst[(start := chunk_idx * chunk_size) : start + chunk_size] + if dst_chunk.any(): + actual_sel_chunks.append(chunk_idx) + if dst_chunk.all(): + actual_whole_chunks.append(chunk_idx) + assert_array_equal(actual_sel_chunks, sel_chunks) + assert_array_equal(actual_whole_chunks, whole_chunks) @pytest.mark.slow @@ -523,28 +504,6 @@ def test_simplify_indices(): assert mapper.step == 2 -def test_chunk_submap_simplifies_indices(): - """Test that, when a fancy index can't be globally simplified to a slice, - as_subchunk_map still attemps to simplify the individual chunk subindices. - """ - _, (mapper,) = index_chunk_mappers( - [True, False, True, False] # chunk 0 - + [True, True, False, False] # chunk 1 - + [True, False, True, True], # chunk 2 - (12,), - (4,), - ) - _, value_sub_idx, chunk_sub_idx = mapper.chunk_submap(0) - assert value_sub_idx == slice(0, 2, 1) - assert chunk_sub_idx == slice(0, 3, 2) - _, value_sub_idx, chunk_sub_idx = mapper.chunk_submap(1) - assert value_sub_idx == slice(2, 4, 1) - assert chunk_sub_idx == slice(0, 2, 1) - _, value_sub_idx, chunk_sub_idx = mapper.chunk_submap(2) - assert value_sub_idx == slice(4, 7, 1) - assert_array_equal(chunk_sub_idx, [0, 2, 3]) # Can't be simplified - - def test_chunks_indexer_simplifies_indices(): """Test that chunks_indexer() and whole_chunks_indexer() return a slice if possible""" _, (mapper,) = index_chunk_mappers(slice(None, None, 3), (10,), (2,))
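
Reviewer note: this patch deletes the per-chunk Python loop (`chunk_submap` / `as_subchunk_map`) and leaves the vectorized `read_many_slices_params()` / `read_many_slices()` pair as the only transfer path. For context, here is a minimal 1-D sketch of that surviving path, mirroring the updated `test_chunks_indexer` above. The column semantics of `slices` (col 0 = per-chunk start, col 2 = count, col 3 = stride; col 1 presumably the start within the selection result, unused here) are an assumption inferred from the test's inline comments, not a documented API.

    import numpy as np

    from versioned_hdf5.cytools import np_hsize_t
    from versioned_hdf5.slicetools import read_many_slices
    from versioned_hdf5.subchunk_map import index_chunk_mappers

    # Hypothetical example: 10 points, chunks of 4, selecting [::3]
    dset_size, chunk_size = 10, 4
    _, (mapper,) = index_chunk_mappers(
        slice(None, None, 3), (dset_size,), (chunk_size,)
    )

    src = np.arange(1, dset_size + 1)
    dst = np.zeros_like(src)

    # One row per (chunk, slice) transfer; chunk_to_slices maps each selected
    # chunk to its rows when a chunk contributes more than one slice.
    slices, chunk_to_slices = mapper.read_many_slices_params()

    # Shift the per-chunk starts to global dataset coordinates,
    # as done in test_chunks_indexer.
    offsets = np.arange(0, dset_size, chunk_size, dtype=np_hsize_t)
    offsets = offsets[mapper.chunk_indices]
    if chunk_to_slices is not None:
        offsets = np.repeat(offsets, np.diff(chunk_to_slices).astype(int))
    slices[:, 0] += offsets

    # Copy every selected element onto itself in dst: afterwards the nonzero
    # positions of dst are exactly the points selected by the index.
    read_many_slices(
        src=src,
        dst=dst,
        src_start=slices[:, 0:1],
        dst_start=slices[:, 0:1],
        count=slices[:, 2:3],
        src_stride=slices[:, 3:4],
        dst_stride=slices[:, 3:4],
    )

The design gain over the removed API is that all transfers for an index are described by one table and executed by a single `read_many_slices` call, rather than one `chunk_submap` call plus one NumPy assignment per selected chunk.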