Include string length when hashing
When hashing `dtype=object` arrays we need to include the length of
each individual string; otherwise, arrays whose elements concatenate
to the same byte string hash to the same value:
```
>>> import h5py
... import versioned_hdf5
... import numpy as np
...
... path = "temp.h5"
...
... with h5py.File(path, mode="w") as f:
...     file = versioned_hdf5.VersionedHDF5File(f)
...     with file.stage_version("r0") as group:
...         group.create_dataset(
...             "values", shape=(2,), dtype=h5py.string_dtype(encoding='ascii'), data=np.array([b"a", b"bc"], dtype=object),
...             maxshape=(None,), chunks=(10000,), compression="gzip", shuffle=False,
...         )
...     with file.stage_version("r1") as group:
...         group.create_dataset(
...             "values", shape=(2,), dtype=h5py.string_dtype(encoding='ascii'), data=np.array([b"ab", b"c"], dtype=object),
...             maxshape=(None,), chunks=(10000,), compression="gzip", shuffle=False,
...         )
...
... with h5py.File(path, mode="r") as f:
...     file = versioned_hdf5.VersionedHDF5File(f)
...     for i in range(2):
...         print(file[f"r{i}"]["values"][:])
...
[b'a' b'bc']
[b'a' b'bc']
```

By including the length of each string as part of the hash, such
arrays now hash to distinct values.
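A minimal, self-contained sketch of the failure mode and of the length-prefix fix, using only `hashlib` and `struct`. This is an illustration of the idea, not the library's actual `Hashtable.hash` code (which also mixes the array shape into the digest):
```
import hashlib
import struct

def hash_without_lengths(values):
    # Only the raw bytes of each element go into the digest,
    # so element boundaries are lost.
    h = hashlib.sha256()
    for value in values:
        h.update(value)
    return h.hexdigest()

def hash_with_lengths(values):
    # Prefix each element with its length (native ssize_t), making
    # the element boundaries part of the digest.
    h = hashlib.sha256()
    for value in values:
        h.update(struct.pack('n', len(value)))
        h.update(value)
    return h.hexdigest()

a = [b"a", b"bc"]
b = [b"ab", b"c"]

assert hash_without_lengths(a) == hash_without_lengths(b)  # collision
assert hash_with_lengths(a) != hash_with_lengths(b)        # fixed
```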

This also includes the changes in #287
and supersedes that PR.

Fixes #288
ArvidJB committed Nov 7, 2023
1 parent 66e5a73 commit e53f5b6
Showing 5 changed files with 69 additions and 39 deletions.
15 changes: 8 additions & 7 deletions versioned_hdf5/api.py
@@ -12,7 +12,7 @@
 from contextlib import contextmanager
 import datetime
 
-from .backend import initialize, DATA_VERSION, CORRUPT_DATA_VERSION
+from .backend import initialize, DATA_VERSION, CORRUPT_DATA_VERSIONS
 from .versions import (create_version_group, commit_version,
                        get_version_by_timestamp, get_nth_previous_version,
                        set_current_version, all_versions, delete_version, )
@@ -71,21 +71,22 @@ def __init__(self, f):
         else:
             # This is not a new file; check data version identifier for compatibility
             if self.data_version_identifier < DATA_VERSION:
-                if self.data_version_identifier == CORRUPT_DATA_VERSION:
+                if self.data_version_identifier in CORRUPT_DATA_VERSIONS:
                     raise ValueError(
-                        f'Versioned Hdf5 file {f.filename} has data_version {CORRUPT_DATA_VERSION}, '
+                        f'Versioned Hdf5 file {f.filename} has data_version {self.data_version_identifier}, '
                         'which has corrupted hash_tables. '
-                        'See https://github.com/deshaw/versioned-hdf5/issues/256 for details. '
+                        'See https://github.com/deshaw/versioned-hdf5/issues/256 and '
+                        'https://github.com/deshaw/versioned-hdf5/issues/288 for details. '
                         'You should recreate the file from scratch. '
                         'In an emergency you could also rebuild the hash tables by calling '
-                        f'VersionedHDF5File({f.filename!r}).rebuild_hashtables() and use '
+                        f'with h5py.File({f.filename!r}) as f: VersionedHDF5File(f).rebuild_hashtables() and use '
                         f'delete_versions to delete all versions after the upgrade to '
-                        f'data_version {CORRUPT_DATA_VERSION} if you can identify them.')
+                        f'data_version == {self.data_version_identifier} if you can identify them.')
                 if any(self._find_object_dtype_data_groups()):
                     logger.warning('Detected dtype="O" arrays which are not reused when creating new versions. '
                                    'See https://github.com/deshaw/versioned-hdf5/issues/256 for details. '
                                    'Rebuilding hash tables for %s is recommended by calling '
-                                   'VersionedHDF5File(%r).rebuild_object_dtype_hashtables().',
+                                   'with h5py.File(%r) as f: VersionedHDF5File(f).rebuild_object_dtype_hashtables().',
                                    f.filename, f.filename)
                 else:
                     if f.mode == 'r+':
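For reference, the emergency recovery path named in these messages would be invoked roughly as follows. This is a sketch based only on the message text above; the file name is a placeholder, and opening with `mode="r+"` is an assumption, since rebuilding the hash tables presumably requires write access:
```
import h5py
import versioned_hdf5

# Rebuild all hash tables in place, per the ValueError message above; the
# warning path suggests rebuild_object_dtype_hashtables() instead.
with h5py.File("data.h5", mode="r+") as f:
    versioned_hdf5.VersionedHDF5File(f).rebuild_hashtables()
```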
4 changes: 2 additions & 2 deletions versioned_hdf5/backend.py
@@ -6,8 +6,8 @@
 from .hashtable import Hashtable
 
 DEFAULT_CHUNK_SIZE = 2**12
-DATA_VERSION = 3
-CORRUPT_DATA_VERSION = 2  # data_version 2 has broken hashtables, always need to rebuild
+DATA_VERSION = 4
+CORRUPT_DATA_VERSIONS = frozenset([2, 3])  # data_version 2 has broken hashtables, always need to rebuild
 
 def normalize_dtype(dtype):
     return np.array([], dtype=dtype).dtype
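As an aside, the `normalize_dtype` helper visible in the context above canonicalizes dtype spellings by round-tripping them through an empty array. A quick illustration (plain NumPy, with hypothetical example dtypes):
```
import numpy as np

def normalize_dtype(dtype):
    # Round-trip through an empty array to obtain the canonical dtype object.
    return np.array([], dtype=dtype).dtype

# Different spellings of the same dtype normalize to one canonical form.
assert normalize_dtype('i4') == np.dtype('int32')
assert normalize_dtype('<f8') == np.dtype('float64')
```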
3 changes: 3 additions & 0 deletions versioned_hdf5/hashtable.py
@@ -1,4 +1,5 @@
 import hashlib
+import struct
 from collections.abc import MutableMapping
 from functools import lru_cache

@@ -123,6 +124,8 @@ def hash(self, data: np.ndarray):
                     # default to utf-8 encoding since it's a superset of ascii (the only other
                     # valid encoding supported in h5py)
                     value = value.encode('utf-8')
+                # hash the length of value ('n' is ssize_t, which matches the internal type for lengths)
+                hash_value.update(struct.pack('n', len(value)))
                 hash_value.update(value)
             hash_value.update(bytes(str(data.shape), 'utf-8'))
             return hash_value.digest()
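The `'n'` format packs a native `ssize_t`, so on a typical 64-bit build each element now contributes an 8-byte length prefix followed by its payload. A small standalone check of that framing (the exact prefix width and byte order depend on the platform):
```
import struct

def framed(values):
    # Length-prefix each element with a native ssize_t, mirroring the new hashing scheme.
    return b"".join(struct.pack('n', len(v)) + v for v in values)

# b"a" + b"bc" == b"ab" + b"c", but the framed byte streams differ.
assert framed([b"a", b"bc"]) != framed([b"ab", b"c"])
print(len(struct.pack('n', 1)))  # 8 on a typical 64-bit platform
```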
60 changes: 30 additions & 30 deletions versioned_hdf5/tests/test_api.py
@@ -1975,38 +1975,38 @@ def test_rebuild_hashtable(tmp_path, caplog):
# Ensure new hash is stable
expected = np.array(
[
51,
0,
100,
40,
27,
131,
129,
10,
220,
28,
67,
88,
241,
101,
82,
252,
36,
175,
92,
195,
20,
142,
4,
239,
58,
202,
209,
99,
40,
93,
96,
147,
120,
100,
137,
156,
220,
140,
22,
20,
128,
26,
215,
42,
44,
196,
5,
204
173,
52,
45,
1,
230,
255,
252,
205,
149,
145,
65,
175,
239,
159
],
dtype=np.uint8
)
26 changes: 26 additions & 0 deletions versioned_hdf5/tests/test_hashtable.py
@@ -86,3 +86,29 @@ def test_object_dtype_hashes_values(tmp_path):
         file = VersionedHDF5File(f)
         for i in range(N):
             assert file[f"r{i}"]["values"][()] == b"a"*(i+1)
+
+
+def test_object_dtype_hashes_concatenated_values(tmp_path):
+    """Test that object dtype arrays hash values which concatenate
+    to the same string to different hashes.
+    See https://github.com/deshaw/versioned-hdf5/issues/288.
+    """
+    filename = tmp_path / "test.h5"
+    with h5py.File(filename, mode="w") as f:
+        file = VersionedHDF5File(f)
+        with file.stage_version("r0") as group:
+            group.create_dataset(
+                "values", dtype=h5py.string_dtype(encoding='ascii'),
+                data=np.array([b"a", b"b", b"cd"], dtype=object),
+                maxshape=(None,), chunks=(100,)
+            )
+        with file.stage_version("r1") as group:
+            group["values"] = np.array([b"ab", b"", b"cd"], dtype=object)
+        with file.stage_version("r2") as group:
+            group["values"] = np.array([b"ab", b"c", b"d"], dtype=object)
+
+    with h5py.File(filename, mode="r") as f:
+        file = VersionedHDF5File(f)
+        np.testing.assert_equal(file["r0"]["values"][:], np.array([b"a", b"b", b"cd"], dtype=object))
+        np.testing.assert_equal(file["r1"]["values"][:], np.array([b"ab", b"", b"cd"], dtype=object))
+        np.testing.assert_equal(file["r2"]["values"][:], np.array([b"ab", b"c", b"d"], dtype=object))
