diff --git a/datalad_next/runners/tests/test_data_processors.py b/datalad_next/runners/tests/test_data_processors.py index 89b81b44e..9b03c16f0 100644 --- a/datalad_next/runners/tests/test_data_processors.py +++ b/datalad_next/runners/tests/test_data_processors.py @@ -3,6 +3,8 @@ import json from itertools import chain +import pytest + from ..data_processors import ( ProcessorPipeline, SplitLinesProcessor, @@ -184,3 +186,29 @@ def test_processor_removal(): source = chain([chunk], stream) if chunk else stream assert b''.join(source) == b'content' + + +def test_split_decoding(): + encoded = 'ö'.encode('utf-8') + part_1, part_2 = encoded[:1], encoded[1:] + + # check that incomplete encodings are caught + decoded, remaining = decode_utf8_processor([part_1]) + assert decoded == [] + assert remaining == [part_1] + + # verify that the complete encoding decodes correctly + decoded, remaining = decode_utf8_processor([part_1, part_2]) + assert decoded == ['ö'] + assert remaining == [] + + +def test_pipeline_finishing(): + encoded = 'ö'.encode('utf-8') + part_1, part_2 = encoded[:1], encoded[1:] + + pipeline = ProcessorPipeline([decode_utf8_processor]) + res = pipeline.process(part_1) + assert res == [] + with pytest.raises(UnicodeDecodeError): + pipeline.finalize()