Include string length when hashing
When hashing `dtype=object` arrays we need to include the length of
each individual string; otherwise, arrays whose elements concatenate
to the same byte string hash to the same value:
```
>>> import h5py
... import versioned_hdf5
... import numpy as np
...
... path = "temp.h5"
...
... with h5py.File(path, mode="w") as f:
...     file = versioned_hdf5.VersionedHDF5File(f)
...     with file.stage_version("r0") as group:
...         group.create_dataset(
...             "values", shape=(2,), dtype=h5py.string_dtype(encoding='ascii'), data=np.array([b"a", b"bc"], dtype=object),
...             maxshape=(None,), chunks=(10000,), compression="gzip", shuffle=False,
...         )
...     with file.stage_version("r1") as group:
...         group.create_dataset(
...             "values", shape=(2,), dtype=h5py.string_dtype(encoding='ascii'), data=np.array([b"ab", b"c"], dtype=object),
...             maxshape=(None,), chunks=(10000,), compression="gzip", shuffle=False,
...         )
...
... with h5py.File(path, mode="r") as f:
...     file = versioned_hdf5.VersionedHDF5File(f)
...     for i in range(2):
...         print(file[f"r{i}"]["values"][:])
...
[b'a' b'bc']
[b'a' b'bc']
```

By including the length of each string as part of the hash, such
arrays now hash to distinct values.
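A minimal, self-contained sketch of the failure mode and of the length-prefix fix, using only `hashlib` and `struct`. This is an illustration of the idea, not the library's actual `Hashtable.hash` code (which also mixes the array shape into the digest):
```
import hashlib
import struct

def hash_without_lengths(values):
    # Only the raw bytes of each element go into the digest,
    # so element boundaries are lost.
    h = hashlib.sha256()
    for value in values:
        h.update(value)
    return h.hexdigest()

def hash_with_lengths(values):
    # Prefix each element with its length (native ssize_t), making
    # the element boundaries part of the digest.
    h = hashlib.sha256()
    for value in values:
        h.update(struct.pack('n', len(value)))
        h.update(value)
    return h.hexdigest()

a = [b"a", b"bc"]
b = [b"ab", b"c"]

assert hash_without_lengths(a) == hash_without_lengths(b)  # collision
assert hash_with_lengths(a) != hash_with_lengths(b)        # fixed
```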

This also includes the changes in #287
and supersedes that PR.

Fixes #288
ArvidJB committed Nov 7, 2023
1 parent 66e5a73 commit e53f5b6
Showing 5 changed files with 69 additions and 39 deletions.
15 changes: 8 additions & 7 deletions versioned_hdf5/api.py
@@ -12,7 +12,7 @@
 from contextlib import contextmanager
 import datetime
 
-from .backend import initialize, DATA_VERSION, CORRUPT_DATA_VERSION
+from .backend import initialize, DATA_VERSION, CORRUPT_DATA_VERSIONS
 from .versions import (create_version_group, commit_version,
                        get_version_by_timestamp, get_nth_previous_version,
                        set_current_version, all_versions, delete_version, )
@@ -71,21 +71,22 @@ def __init__(self, f):
         else:
             # This is not a new file; check data version identifier for compatibility
             if self.data_version_identifier < DATA_VERSION:
-                if self.data_version_identifier == CORRUPT_DATA_VERSION:
+                if self.data_version_identifier in CORRUPT_DATA_VERSIONS:
                     raise ValueError(
-                        f'Versioned Hdf5 file {f.filename} has data_version {CORRUPT_DATA_VERSION}, '
+                        f'Versioned Hdf5 file {f.filename} has data_version {self.data_version_identifier}, '
                         'which has corrupted hash_tables. '
-                        'See https://github.com/deshaw/versioned-hdf5/issues/256 for details. '
+                        'See https://github.com/deshaw/versioned-hdf5/issues/256 and '
+                        'https://github.com/deshaw/versioned-hdf5/issues/288 for details. '
                         'You should recreate the file from scratch. '
                         'In an emergency you could also rebuild the hash tables by calling '
-                        f'VersionedHDF5File({f.filename!r}).rebuild_hashtables() and use '
+                        f'with h5py.File({f.filename!r}) as f: VersionedHDF5File(f).rebuild_hashtables() and use '
                         f'delete_versions to delete all versions after the upgrade to '
-                        f'data_version {CORRUPT_DATA_VERSION} if you can identify them.')
+                        f'data_version == {self.data_version_identifier} if you can identify them.')
                 if any(self._find_object_dtype_data_groups()):
                     logger.warning('Detected dtype="O" arrays which are not reused when creating new versions. '
                                    'See https://github.com/deshaw/versioned-hdf5/issues/256 for details. '
                                    'Rebuilding hash tables for %s is recommended by calling '
-                                   'VersionedHDF5File(%r).rebuild_object_dtype_hashtables().',
+                                   'with h5py.File(%r) as f: VersionedHDF5File(f).rebuild_object_dtype_hashtables().',
                                    f.filename, f.filename)
                 else:
                     if f.mode == 'r+':
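For reference, the emergency recovery path named in these messages would be invoked roughly as follows. This is a sketch based only on the message text above; the file name is a placeholder, and opening with `mode="r+"` is an assumption, since rebuilding the hash tables presumably requires write access:
```
import h5py
import versioned_hdf5

# Rebuild all hash tables in place, per the ValueError message above; the
# warning path suggests rebuild_object_dtype_hashtables() instead.
with h5py.File("data.h5", mode="r+") as f:
    versioned_hdf5.VersionedHDF5File(f).rebuild_hashtables()
```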
4 changes: 2 additions & 2 deletions versioned_hdf5/backend.py
@@ -6,8 +6,8 @@
 from .hashtable import Hashtable
 
 DEFAULT_CHUNK_SIZE = 2**12
-DATA_VERSION = 3
-CORRUPT_DATA_VERSION = 2  # data_version 2 has broken hashtables, always need to rebuild
+DATA_VERSION = 4
+CORRUPT_DATA_VERSIONS = frozenset([2, 3])  # data_version 2 has broken hashtables, always need to rebuild
 
 def normalize_dtype(dtype):
     return np.array([], dtype=dtype).dtype
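As an aside, the `normalize_dtype` helper visible in the context above canonicalizes dtype spellings by round-tripping them through an empty array. A quick illustration (plain NumPy, with hypothetical example dtypes):
```
import numpy as np

def normalize_dtype(dtype):
    # Round-trip through an empty array to obtain the canonical dtype object.
    return np.array([], dtype=dtype).dtype

# Different spellings of the same dtype normalize to one canonical form.
assert normalize_dtype('i4') == np.dtype('int32')
assert normalize_dtype('<f8') == np.dtype('float64')
```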
3 changes: 3 additions & 0 deletions versioned_hdf5/hashtable.py
@@ -1,4 +1,5 @@
 import hashlib
+import struct
 from collections.abc import MutableMapping
 from functools import lru_cache

@@ -123,6 +124,8 @@ def hash(self, data: np.ndarray):
                     # default to utf-8 encoding since it's a superset of ascii (the only other
                     # valid encoding supported in h5py)
                     value = value.encode('utf-8')
+                # hash the length of value ('n' is ssize_t, which matches the internal type for lengths)
+                hash_value.update(struct.pack('n', len(value)))
                 hash_value.update(value)
             hash_value.update(bytes(str(data.shape), 'utf-8'))
             return hash_value.digest()
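The `'n'` format packs a native `ssize_t`, so on a typical 64-bit build each element now contributes an 8-byte length prefix followed by its payload. A small standalone check of that framing (the exact prefix width and byte order depend on the platform):
```
import struct

def framed(values):
    # Length-prefix each element with a native ssize_t, mirroring the new hashing scheme.
    return b"".join(struct.pack('n', len(v)) + v for v in values)

# b"a" + b"bc" == b"ab" + b"c", but the framed byte streams differ.
assert framed([b"a", b"bc"]) != framed([b"ab", b"c"])
print(len(struct.pack('n', 1)))  # 8 on a typical 64-bit platform
```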
60 changes: 30 additions & 30 deletions versioned_hdf5/tests/test_api.py
@@ -1975,38 +1975,38 @@ def test_rebuild_hashtable(tmp_path, caplog):
# Ensure new hash is stable
expected = np.array(
[
51,
0,
100,
40,
27,
131,
129,
10,
220,
28,
67,
88,
241,
101,
82,
252,
36,
175,
92,
195,
20,
142,
4,
239,
58,
202,
209,
99,
40,
93,
96,
147,
120,
100,
137,
156,
220,
140,
22,
20,
128,
26,
215,
42,
44,
196,
5,
204
173,
52,
45,
1,
230,
255,
252,
205,
149,
145,
65,
175,
239,
159
],
dtype=np.uint8
)
26 changes: 26 additions & 0 deletions versioned_hdf5/tests/test_hashtable.py
@@ -86,3 +86,29 @@ def test_object_dtype_hashes_values(tmp_path):
         file = VersionedHDF5File(f)
         for i in range(N):
             assert file[f"r{i}"]["values"][()] == b"a"*(i+1)
+
+
+def test_object_dtype_hashes_concatenated_values(tmp_path):
+    """Test that object dtype arrays hash values which concatenate
+    to the same string to different hashes.
+    See https://github.com/deshaw/versioned-hdf5/issues/288.
+    """
+    filename = tmp_path / "test.h5"
+    with h5py.File(filename, mode="w") as f:
+        file = VersionedHDF5File(f)
+        with file.stage_version("r0") as group:
+            group.create_dataset(
+                "values", dtype=h5py.string_dtype(encoding='ascii'),
+                data=np.array([b"a", b"b", b"cd"], dtype=object),
+                maxshape=(None,), chunks=(100,)
+            )
+        with file.stage_version("r1") as group:
+            group["values"] = np.array([b"ab", b"", b"cd"], dtype=object)
+        with file.stage_version("r2") as group:
+            group["values"] = np.array([b"ab", b"c", b"d"], dtype=object)
+
+    with h5py.File(filename, mode="r") as f:
+        file = VersionedHDF5File(f)
+        np.testing.assert_equal(file["r0"]["values"][:], np.array([b"a", b"b", b"cd"], dtype=object))
+        np.testing.assert_equal(file["r1"]["values"][:], np.array([b"ab", b"", b"cd"], dtype=object))
+        np.testing.assert_equal(file["r2"]["values"][:], np.array([b"ab", b"c", b"d"], dtype=object))
