From c0140241abfaac2bc05afaf28f927bc2e60ed592 Mon Sep 17 00:00:00 2001
From: pdmurray
Date: Sun, 14 Apr 2024 10:52:38 -0700
Subject: [PATCH] Partition accepts ChunkSize; clean up unused variable; type annotations

---
 versioned_hdf5/backend.py  | 29 +++++++++++++----------------
 versioned_hdf5/versions.py |  1 -
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/versioned_hdf5/backend.py b/versioned_hdf5/backend.py
index 3ba19bd6..53e70de3 100644
--- a/versioned_hdf5/backend.py
+++ b/versioned_hdf5/backend.py
@@ -818,7 +818,6 @@ def apply(
         Mapping between {slices in virtual dataset: slices in raw dataset}
         which were written by this function; and shape of the current dataset
         """
-        chunk_size = self.chunks[0]
         new_shape_index = Tuple(*[Slice(0, i) for i in self.shape])
         raw_data: Dataset = f["_version_data"][name]["raw_data"]  # type: ignore

@@ -826,13 +825,8 @@
         # the chunks modified by the reshape operation
         current_slices = slices.copy()

-        # # Clear out the existing slices dict as a side effect; every slice
-        # # gets touched by a ResizeOperation, and some of them will be invalidated (if they fall
-        # # on the edge of the dataset)
-        # slices.clear()
-
         new_slices = {}
-        for vchunk in partition(new_shape_index, chunk_size):
+        for vchunk in partition(new_shape_index, self.chunks):
             # If the new virtual chunk is in the old set of slices, just use the same
             # raw data the virtual chunk is already mapped to. Pop it out of the slices
             # dict so that we don't need to iterate over it when computing parts of the
@@ -1079,9 +1073,11 @@ def write_dataset_operations(

 def write_operations(
     f: File, version_name: str, name: str, operations: List[WriteOperation]
-) -> tuple[Dict[Tuple, Tuple], tuple[int]]:
+) -> tuple[Dict[Tuple, Tuple], tuple[int, ...]]:
     """Carry out a sequence of write operations on the file.

+    If no operations are pending, return the previous version's slices and shape unchanged.
+
     Parameters
     ----------
     f : File
@@ -1095,15 +1091,12 @@

     Returns
     -------
-    tuple[Dict[Tuple, Tuple], tuple[int]]
+    tuple[Dict[Tuple, Tuple], tuple[int, ...]]
         (Slices map, shape of virtual dataset post-write)

         The slices map is a mapping from {virtual dataset slice: raw dataset slice}.
         The virtual dataset is created elsewhere using the slices return here.
     """
-    if not operations:
-        return {}, ()  # type: ignore
-
     if name not in f["_version_data"]:
         raise NotImplementedError(
             "Use write_dataset() if the dataset does not yet exist"
@@ -1111,6 +1104,7 @@

     slices = get_previous_version_slices(f, version_name, name)
     shape = get_previous_version_shape(f, version_name, name)
+
     for operation in operations:
         slices, shape = operation.apply(f, name, version_name, slices, shape)

@@ -1533,7 +1527,7 @@ def split_across_unused(

 def partition(
     obj: Union[np.ndarray, Tuple],
-    chunks: Union[int, tuple[int, ...]],
+    chunks: Union[int, tuple[int, ...], ChunkSize],
 ) -> Iterator[Tuple]:
     """Break an array or a Tuple of slices into chunks of the given chunk size.

@@ -1541,10 +1535,11 @@
     ----------
     obj : Union[np.ndarray, Tuple]
         Array or Tuple index to partition
-    chunks : Union[int, tuple[int, ...]]
+    chunks : Union[int, tuple[int, ...], ChunkSize]
         If this is an int, this is the size of each partitioned chunk.
-        Multidimensional chunks should supply a tuple giving the chunk
-        size in each dimension.
+        If it is a tuple of ints or a ChunkSize, it gives the shape of each
+        chunk: multidimensional chunks should supply one chunk size per
+        dimension of the object being partitioned.

     Returns
     -------
@@ -1560,6 +1555,8 @@

     if isinstance(chunks, (int, np.integer)):
         chunks = (chunks,)
+    elif isinstance(chunks, ChunkSize):
+        chunks = tuple(chunks)

     yield from ChunkSize(chunks).as_subchunks(index, shape)

diff --git a/versioned_hdf5/versions.py b/versioned_hdf5/versions.py
index 7005f2d9..202832f6 100644
--- a/versioned_hdf5/versions.py
+++ b/versioned_hdf5/versions.py
@@ -135,7 +135,6 @@ def commit_version(

     shape = None
     if isinstance(data, InMemoryDataset):
-        shape = data.shape
         if not data._operations:
             # The virtual dataset was not changed from the previous
             # version. Just copy it to the new version directly.
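
A quick sketch of how partition() is expected to behave after this patch
(shapes are illustrative only; assumes partition() is importable from
versioned_hdf5.backend, and that ChunkSize, Slice, and Tuple come from
ndindex, as in the hunks above):

    from ndindex import ChunkSize, Slice, Tuple

    from versioned_hdf5.backend import partition

    # A 1-D index with an int chunk size, as before this patch: expected to
    # yield ndindex Tuples covering [0:4], [4:8], [8:10].
    for vchunk in partition(Tuple(Slice(0, 10)), 4):
        print(vchunk)

    # A 2-D index: a tuple of ints and an equivalent ChunkSize should now
    # partition identically, since a ChunkSize is unwrapped to a plain tuple
    # before being re-wrapped for as_subchunks().
    index = Tuple(Slice(0, 6), Slice(0, 6))
    assert list(partition(index, (4, 4))) == list(
        partition(index, ChunkSize((4, 4)))
    )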