Reset the decode offset after a successfully decoded chunk, and add a
regression test (test_error_chunks) covering data loss in the chunk that
follows one containing a decoding error.

NOTE(review): line breaks were restored from a whitespace-collapsed copy of
this patch. The absolute indentation of the decode_bytes.py hunk is inferred
from the code structure; confirm against the target file, or apply with
`git apply --ignore-whitespace`.

diff --git a/datalad_next/itertools/decode_bytes.py b/datalad_next/itertools/decode_bytes.py
index bb2cca63..14e365fd 100644
--- a/datalad_next/itertools/decode_bytes.py
+++ b/datalad_next/itertools/decode_bytes.py
@@ -119,6 +119,10 @@ def handle_decoding_error(position: int,
             try:
                 yield joined_data[position:].decode(encoding)
                 joined_data = b''
+                # must reset the pointer for successful decoded
+                # parts too, otherwise we start too far into a new chunk's
+                # content
+                position = 0
             except UnicodeDecodeError as e:
                 # If an encoding error occurs, we first check whether it was
                 # in the middle of `joined_data` or whether it extends until the
diff --git a/datalad_next/itertools/tests/test_decode_bytes.py b/datalad_next/itertools/tests/test_decode_bytes.py
index 6139f7ca..e848ef1f 100644
--- a/datalad_next/itertools/tests/test_decode_bytes.py
+++ b/datalad_next/itertools/tests/test_decode_bytes.py
@@ -40,3 +40,10 @@ def test_no_empty_strings():
 def test_multiple_errors():
     r = ''.join(decode_bytes([b'08 War \xaf No \xaf More \xaf Trouble.shn.mp3']))
     assert r == '08 War \\xaf No \\xaf More \\xaf Trouble.shn.mp3'
+
+
+def test_error_chunks():
+    # this verifies that error handling in a previous chunk does not
+    # cause data loss in a subsequent chunk
+    r = ''.join(decode_bytes([b'08 War \xaf No', b'1234567890']))
+    assert r == '08 War \\xaf No1234567890'