fix(decode_bytes): error handling led to data loss in subsequent chunks

The new test added here documents how error handling in `decode_bytes()`
caused data in a subsequent chunk to be skipped. The cause was that the
pointer variable was not reset when a chunk (or rather the joined data
of any previous chunks) was fully decoded. Decoding of the next chunk
would therefore start from the position last recorded during error
handling in a previous chunk.
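
The structure of the bug is easiest to see in a condensed form of the
decoding loop. The following is a minimal sketch, not the actual
datalad_next implementation: only the names `joined_data` and `position`
are taken from the diff below; chunk joining and the handling of
multi-byte sequences truncated at a chunk boundary are left out.

    def decode_chunks(chunks, encoding='utf-8'):
        # sketch only: mimics the pointer logic of decode_bytes()
        joined_data = b''
        position = 0
        for chunk in chunks:
            joined_data += chunk
            while position < len(joined_data):
                try:
                    yield joined_data[position:].decode(encoding)
                    joined_data = b''
                    # the fix: reset the pointer once all buffered data
                    # is decoded; without this, decoding of the next
                    # chunk starts at a stale offset and drops data
                    position = 0
                except UnicodeDecodeError as e:
                    # decode everything up to the offending byte(s) ...
                    yield joined_data[position:position + e.start].decode(encoding)
                    # ... backslash-escape them ...
                    yield joined_data[position + e.start:position + e.end].decode(
                        encoding, errors='backslashreplace')
                    # ... and resume decoding right after them
                    position += e.end

With `[b'08 War \xaf No', b'1234567890']` the error handling in the first
chunk leaves `position` at 8; without the reset, the second chunk would be
decoded as `joined_data[8:]`, i.e. `b'90'`, silently dropping `b'12345678'`.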
mih committed Jul 13, 2024
1 parent 977016a commit dfa1d20
Showing 2 changed files with 11 additions and 0 deletions.
4 changes: 4 additions & 0 deletions datalad_next/itertools/decode_bytes.py
@@ -119,6 +119,10 @@ def handle_decoding_error(position: int,
     try:
         yield joined_data[position:].decode(encoding)
         joined_data = b''
+        # must reset the pointer for successfully decoded
+        # parts too, otherwise we start too far into a new chunk's
+        # content
+        position = 0
     except UnicodeDecodeError as e:
         # If an encoding error occurs, we first check whether it was
         # in the middle of `joined_data` or whether it extends until the
7 changes: 7 additions & 0 deletions datalad_next/itertools/tests/test_decode_bytes.py
@@ -40,3 +40,10 @@ def test_no_empty_strings():
 def test_multiple_errors():
     r = ''.join(decode_bytes([b'08 War \xaf No \xaf More \xaf Trouble.shn.mp3']))
     assert r == '08 War \\xaf No \\xaf More \\xaf Trouble.shn.mp3'
+
+
+def test_error_chunks():
+    # this verifies that error handling in a previous chunk does not
+    # cause data loss in a subsequent chunk
+    r = ''.join(decode_bytes([b'08 War \xaf No', b'1234567890']))
+    assert r == '08 War \\xaf No1234567890'
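
As a quick check outside the test suite (assuming `decode_bytes` is
importable from `datalad_next.itertools`, as the tests above suggest),
the fixed behavior looks like this:

    from datalad_next.itertools import decode_bytes

    # the 0xaf byte is not valid UTF-8 and gets backslash-escaped;
    # with the fix, the second chunk is decoded in full
    chunks = [b'08 War \xaf No', b'1234567890']
    print(''.join(decode_bytes(chunks)))
    # -> 08 War \xaf No1234567890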
