From e53f5b6429300bbe2f7c27ee8207f63e0130d1b3 Mon Sep 17 00:00:00 2001 From: Arvid Bessen Date: Tue, 7 Nov 2023 17:22:35 -0500 Subject: [PATCH] Include string length when hashing When hashing `dtype=object` arrays we need to include the length of the individual strings, otherwise arrays with identical contents after concatenating all elements hash to the same value: ``` >>> import h5py ... import versioned_hdf5 ... import numpy as np ... ... path = "temp.h5" ... ... with h5py.File(path, mode="w") as f: ... file = versioned_hdf5.VersionedHDF5File(f) ... with file.stage_version("r0") as group: ... group.create_dataset( ... "values", shape=(2,), dtype=h5py.string_dtype(encoding='ascii'), data=np.array([b"a", b"bc"], dtype=object), ... maxshape=(None,), chunks=(10000,), compression="gzip", shuffle=False, ... ) ... with file.stage_version("r1") as group: ... group.create_dataset( ... "values", shape=(2,), dtype=h5py.string_dtype(encoding='ascii'), data=np.array([b"ab", b"c"], dtype=object), ... maxshape=(None,), chunks=(10000,), compression="gzip", shuffle=False, ... ) ... ... with h5py.File(path, mode="r") as f: ... file = versioned_hdf5.VersionedHDF5File(f) ... for i in range(2): ... print(file[f"r{i}"]["values"][:]) ... [b'a' b'bc'] [b'a' b'bc'] ``` By including the length of each string as part of the hash the hashes will now be unique. This also includes the changes in https://github.com/deshaw/versioned-hdf5/pull/287 and supersedes that PR. Fixes https://github.com/deshaw/versioned-hdf5/issues/288 --- versioned_hdf5/api.py | 15 ++++--- versioned_hdf5/backend.py | 4 +- versioned_hdf5/hashtable.py | 3 ++ versioned_hdf5/tests/test_api.py | 60 +++++++++++++------------- versioned_hdf5/tests/test_hashtable.py | 26 +++++++++++ 5 files changed, 69 insertions(+), 39 deletions(-) diff --git a/versioned_hdf5/api.py b/versioned_hdf5/api.py index 18a3cf77..dcce9e12 100644 --- a/versioned_hdf5/api.py +++ b/versioned_hdf5/api.py @@ -12,7 +12,7 @@ from contextlib import contextmanager import datetime -from .backend import initialize, DATA_VERSION, CORRUPT_DATA_VERSION +from .backend import initialize, DATA_VERSION, CORRUPT_DATA_VERSIONS from .versions import (create_version_group, commit_version, get_version_by_timestamp, get_nth_previous_version, set_current_version, all_versions, delete_version, ) @@ -71,21 +71,22 @@ def __init__(self, f): else: # This is not a new file; check data version identifier for compatibility if self.data_version_identifier < DATA_VERSION: - if self.data_version_identifier == CORRUPT_DATA_VERSION: + if self.data_version_identifier in CORRUPT_DATA_VERSIONS: raise ValueError( - f'Versioned Hdf5 file {f.filename} has data_version {CORRUPT_DATA_VERSION}, ' + f'Versioned Hdf5 file {f.filename} has data_version {self.data_version_identifier}, ' 'which has corrupted hash_tables. ' - 'See https://github.com/deshaw/versioned-hdf5/issues/256 for details. ' + 'See https://github.com/deshaw/versioned-hdf5/issues/256 and ' + 'https://github.com/deshaw/versioned-hdf5/issues/288 for details. ' 'You should recreate the file from scratch. 
' 'In an emergency you could also rebuild the hash tables by calling ' - f'VersionedHDF5File({f.filename!r}).rebuild_hashtables() and use ' + f'with h5py.File({f.filename!r}) as f: VersionedHDF5File(f).rebuild_hashtables() and use ' f'delete_versions to delete all versions after the upgrade to ' - f'data_version {CORRUPT_DATA_VERSION} if you can identify them.') + f'data_version == {self.data_version_identifier} if you can identify them.') if any(self._find_object_dtype_data_groups()): logger.warning('Detected dtype="O" arrays which are not reused when creating new versions. ' 'See https://github.com/deshaw/versioned-hdf5/issues/256 for details. ' 'Rebuilding hash tables for %s is recommended by calling ' - 'VersionedHDF5File(%r).rebuild_object_dtype_hashtables().', + 'with h5py.File(%r) as f: VersionedHDF5File(f).rebuild_object_dtype_hashtables().', f.filename, f.filename) else: if f.mode == 'r+': diff --git a/versioned_hdf5/backend.py b/versioned_hdf5/backend.py index 508ff798..615ca879 100644 --- a/versioned_hdf5/backend.py +++ b/versioned_hdf5/backend.py @@ -6,8 +6,8 @@ from .hashtable import Hashtable DEFAULT_CHUNK_SIZE = 2**12 -DATA_VERSION = 3 -CORRUPT_DATA_VERSION = 2 # data_version 2 has broken hashtables, always need to rebuild +DATA_VERSION = 4 +CORRUPT_DATA_VERSIONS = frozenset([2, 3]) # data_version 2 has broken hashtables, always need to rebuild def normalize_dtype(dtype): return np.array([], dtype=dtype).dtype diff --git a/versioned_hdf5/hashtable.py b/versioned_hdf5/hashtable.py index 025c9a5a..a65d2748 100644 --- a/versioned_hdf5/hashtable.py +++ b/versioned_hdf5/hashtable.py @@ -1,4 +1,5 @@ import hashlib +import struct from collections.abc import MutableMapping from functools import lru_cache @@ -123,6 +124,8 @@ def hash(self, data: np.ndarray): # default to utf-8 encoding since it's a superset of ascii (the only other # valid encoding supported in h5py) value = value.encode('utf-8') + # hash the length of value ('n' is ssize_t, which matches the internal type for lengths) + hash_value.update(struct.pack('n', len(value))) hash_value.update(value) hash_value.update(bytes(str(data.shape), 'utf-8')) return hash_value.digest() diff --git a/versioned_hdf5/tests/test_api.py b/versioned_hdf5/tests/test_api.py index 2dd6f3e1..17ad706d 100644 --- a/versioned_hdf5/tests/test_api.py +++ b/versioned_hdf5/tests/test_api.py @@ -1975,38 +1975,38 @@ def test_rebuild_hashtable(tmp_path, caplog): # Ensure new hash is stable expected = np.array( [ - 51, - 0, - 100, - 40, - 27, - 131, - 129, - 10, - 220, - 28, - 67, - 88, - 241, - 101, - 82, + 252, + 36, + 175, + 92, + 195, + 20, + 142, + 4, + 239, + 58, + 202, 209, - 99, - 40, - 93, - 96, + 147, + 120, + 100, + 137, + 156, 220, - 140, - 22, - 20, - 128, - 26, - 215, - 42, - 44, - 196, - 5, - 204 + 173, + 52, + 45, + 1, + 230, + 255, + 252, + 205, + 149, + 145, + 65, + 175, + 239, + 159 ], dtype=np.uint8 ) diff --git a/versioned_hdf5/tests/test_hashtable.py b/versioned_hdf5/tests/test_hashtable.py index 7ac48bbd..0a4079ee 100644 --- a/versioned_hdf5/tests/test_hashtable.py +++ b/versioned_hdf5/tests/test_hashtable.py @@ -86,3 +86,29 @@ def test_object_dtype_hashes_values(tmp_path): file = VersionedHDF5File(f) for i in range(N): assert file[f"r{i}"]["values"][()] == b"a"*(i+1) + + +def test_object_dtype_hashes_concatenated_values(tmp_path): + """Test that object dtype arrays hash values which concatenate + to the same string to different hashes. + See https://github.com/deshaw/versioned-hdf5/issues/288. 
+ """ + filename = tmp_path / "test.h5" + with h5py.File(filename, mode="w") as f: + file = VersionedHDF5File(f) + with file.stage_version("r0") as group: + group.create_dataset( + "values", dtype=h5py.string_dtype(encoding='ascii'), + data=np.array([b"a", b"b", b"cd"], dtype=object), + maxshape=(None,), chunks=(100,) + ) + with file.stage_version("r1") as group: + group["values"] = np.array([b"ab", b"", b"cd"], dtype=object) + with file.stage_version("r2") as group: + group["values"] = np.array([b"ab", b"c", b"d"], dtype=object) + + with h5py.File(filename, mode="r") as f: + file = VersionedHDF5File(f) + np.testing.assert_equal(file["r0"]["values"][:], np.array([b"a", b"b", b"cd"], dtype=object)) + np.testing.assert_equal(file["r1"]["values"][:], np.array([b"ab", b"", b"cd"], dtype=object)) + np.testing.assert_equal(file["r2"]["values"][:], np.array([b"ab", b"c", b"d"], dtype=object))
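Editorial note appended after the patch, not part of it: the sketch below isolates the effect of the `struct.pack('n', len(value))` line added to `hashtable.py`, using the two arrays from the commit message. It is a minimal stand-alone illustration for bytes elements only; `hashlib.sha256` stands in for the hashtable's hash function, and the two helper names are illustrative rather than versioned-hdf5 API.

```
import hashlib
import struct

import numpy as np


def hash_concat_only(data: np.ndarray) -> bytes:
    # Old behaviour: only the raw element bytes are hashed, so arrays whose
    # elements concatenate to the same string produce the same digest.
    h = hashlib.sha256()
    for value in data.flat:
        h.update(value)
    h.update(bytes(str(data.shape), "utf-8"))
    return h.digest()


def hash_with_lengths(data: np.ndarray) -> bytes:
    # New behaviour: each element is prefixed with its length packed as a
    # native ssize_t ('n'), mirroring the line added in hashtable.py.
    h = hashlib.sha256()
    for value in data.flat:
        h.update(struct.pack("n", len(value)))
        h.update(value)
    h.update(bytes(str(data.shape), "utf-8"))
    return h.digest()


a = np.array([b"a", b"bc"], dtype=object)
b = np.array([b"ab", b"c"], dtype=object)

assert hash_concat_only(a) == hash_concat_only(b)    # collision: same concatenation
assert hash_with_lengths(a) != hash_with_lengths(b)  # fixed: lengths disambiguate
```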
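Separately, the reworded error and warning messages in `api.py` point users at a rebuild workflow for files written with a corrupt `data_version`. The following is a rough usage sketch based only on the wording of those messages; it assumes the `rebuild_hashtables` / `rebuild_object_dtype_hashtables` methods they reference are available in the installed versioned-hdf5 version, and uses a hypothetical filename.

```
import h5py
from versioned_hdf5 import VersionedHDF5File

# "data.h5" is a placeholder for a file written with a corrupt data_version.
# The file must be opened writable so the hash tables can be rewritten.
with h5py.File("data.h5", mode="r+") as f:
    VersionedHDF5File(f).rebuild_hashtables()
    # For files that only suffer from the dtype="O" reuse issue (#256), the
    # messages suggest the narrower rebuild_object_dtype_hashtables() instead.
```

As the error message notes, versions created while the file was at the corrupt `data_version` should still be removed with `delete_versions` if they can be identified.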