diff --git a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java index 9f49439b9..ec2ff0cb7 100644 --- a/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java +++ b/cbor/src/main/java/com/fasterxml/jackson/dataformat/cbor/CBORParser.java @@ -1232,7 +1232,7 @@ public String nextFieldName() throws IOException if (name != null) { _inputPtr += lenMarker; } else { - name = _decodeShortName(lenMarker); + name = _decodeContiguousName(lenMarker); name = _addDecodedToSymbols(lenMarker, name); } } @@ -2122,17 +2122,21 @@ protected void _finishToken() throws IOException } return; } - if (len > (_inputEnd - _inputPtr)) { - // or if not, could we read? - if (len >= _inputBuffer.length) { - // If not enough space, need handling similar to chunked - _finishLongText(len); + // 29-Jan-2021, tatu: as per [dataformats-binary#238] must keep in mind that + // the longest individual unit is 4 bytes (surrogate pair) so we + // actually need len+3 bytes to avoid bounds checks + final int needed = len + 3; + final int available = _inputEnd - _inputPtr; + + if ((available >= needed) + // if not, could we read? NOTE: we do not require it, just attempt to read + || ((_inputBuffer.length >= needed) + && _tryToLoadToHaveAtLeast(needed))) { + _finishShortText(len); return; - } - _loadToHaveAtLeast(len); } - // offline for better optimization - _finishShortText(len); + // If not enough space, need handling similar to chunked + _finishLongText(len); } /** @@ -2184,7 +2188,7 @@ private final String _finishShortText(int len) throws IOException if (outBuf.length < len) { // one minor complication outBuf = _textBuffer.expandCurrentSegment(len); } - + int outPtr = 0; int inPtr = _inputPtr; _inputPtr += len; @@ -2200,7 +2204,6 @@ private final String _finishShortText(int len) throws IOException return _textBuffer.setCurrentAndReturn(outPtr); } } - final int[] codes = UTF8_UNIT_CODES; do { i = inputBuf[inPtr++] & 0xFF; @@ -2208,25 +2211,40 @@ private final String _finishShortText(int len) throws IOException case 0: break; case 1: - i = ((i & 0x1F) << 6) | (inputBuf[inPtr++] & 0x3F); + { + final int c2 = inputBuf[inPtr++]; + if ((c2 & 0xC0) != 0x080) { + _reportInvalidOther(c2 & 0xFF, inPtr); + } + i = ((i & 0x1F) << 6) | (c2 & 0x3F); + } break; case 2: - i = ((i & 0x0F) << 12) - | ((inputBuf[inPtr++] & 0x3F) << 6) - | (inputBuf[inPtr++] & 0x3F); + { + final int c2 = inputBuf[inPtr++]; + if ((c2 & 0xC0) != 0x080) { + _reportInvalidOther(c2 & 0xFF, inPtr); + } + final int c3 = inputBuf[inPtr++]; + if ((c3 & 0xC0) != 0x080) { + _reportInvalidOther(c3 & 0xFF, inPtr); + } + i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + } break; case 3: + // 30-Jan-2021, tatu: TODO - validate these too? i = ((i & 0x07) << 18) - | ((inputBuf[inPtr++] & 0x3F) << 12) - | ((inputBuf[inPtr++] & 0x3F) << 6) - | (inputBuf[inPtr++] & 0x3F); + | ((inputBuf[inPtr++] & 0x3F) << 12) + | ((inputBuf[inPtr++] & 0x3F) << 6) + | (inputBuf[inPtr++] & 0x3F); // note: this is the codepoint value; need to split, too i -= 0x10000; outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); i = 0xDC00 | (i & 0x3FF); break; default: // invalid - _reportError("Invalid byte "+Integer.toHexString(i)+" in Unicode text block"); + _reportInvalidInitial(i); } outBuf[outPtr++] = (char) i; } while (inPtr < end); @@ -2594,7 +2612,7 @@ protected final JsonToken _decodePropertyName() throws IOException if (name != null) { _inputPtr += lenMarker; } else { - name = _decodeShortName(lenMarker); + name = _decodeContiguousName(lenMarker); name = _addDecodedToSymbols(lenMarker, name); } } @@ -2610,7 +2628,7 @@ protected final JsonToken _decodePropertyName() throws IOException return JsonToken.FIELD_NAME; } - private final String _decodeShortName(int len) throws IOException + private final String _decodeContiguousName(int len) throws IOException { // note: caller ensures we have enough bytes available int outPtr = 0; @@ -2623,7 +2641,7 @@ private final String _decodeShortName(int len) throws IOException final int[] codes = UTF8_UNIT_CODES; final byte[] inBuf = _inputBuffer; - // First a tight loop for Ascii + // First a tight loop for ASCII final int end = inPtr + len; while (true) { int i = inBuf[inPtr] & 0xFF; @@ -2645,25 +2663,40 @@ private final String _decodeShortName(int len) throws IOException // trickiest one, need surrogate handling switch (code) { case 1: - i = ((i & 0x1F) << 6) | (inBuf[inPtr++] & 0x3F); + { + final int c2 = inBuf[inPtr++]; + if ((c2 & 0xC0) != 0x080) { + _reportInvalidOther(c2 & 0xFF, inPtr); + } + i = ((i & 0x1F) << 6) | (c2 & 0x3F); + } break; case 2: - i = ((i & 0x0F) << 12) - | ((inBuf[inPtr++] & 0x3F) << 6) - | (inBuf[inPtr++] & 0x3F); + { + final int c2 = inBuf[inPtr++]; + if ((c2 & 0xC0) != 0x080) { + _reportInvalidOther(c2 & 0xFF, inPtr); + } + final int c3 = inBuf[inPtr++]; + if ((c3 & 0xC0) != 0x080) { + _reportInvalidOther(c3 & 0xFF, inPtr); + } + i = ((i & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); + } break; case 3: + // 30-Jan-2021, tatu: TODO - validate surrogate case too? i = ((i & 0x07) << 18) - | ((inBuf[inPtr++] & 0x3F) << 12) - | ((inBuf[inPtr++] & 0x3F) << 6) - | (inBuf[inPtr++] & 0x3F); + | ((inBuf[inPtr++] & 0x3F) << 12) + | ((inBuf[inPtr++] & 0x3F) << 6) + | (inBuf[inPtr++] & 0x3F); // note: this is the codepoint value; need to split, too i -= 0x10000; outBuf[outPtr++] = (char) (0xD800 | (i >> 10)); i = 0xDC00 | (i & 0x3FF); break; default: // invalid - _reportError("Invalid byte "+Integer.toHexString(i)+" in Object name"); + _reportError("Invalid UTF-8 byte 0x"+Integer.toHexString(i)+" in Object property name"); } } outBuf[outPtr++] = (char) i; @@ -2688,7 +2721,7 @@ private final String _decodeLongerName(int len) throws IOException _inputPtr += len; return name; } - name = _decodeShortName(len); + name = _decodeContiguousName(len); return _addDecodedToSymbols(len, name); } diff --git a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java index 7050fdbf3..984a34596 100644 --- a/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java +++ b/cbor/src/test/java/com/fasterxml/jackson/dataformat/cbor/parse/ParseInvalidUTF8String236Test.java @@ -54,4 +54,74 @@ public void testShortString236TruncatedString() throws Exception } } } + + public void testShortString237InvalidTextValue() throws Exception + { + // String with length of 2 bytes, but a few null bytes as fillers to + // avoid buffer boundary + // (2nd byte implies 2-byte sequence but 3rd byte does not have high-bit set) + byte[] input2 = {0x62, (byte) 0xCF, 0x2d, + 0, 0, 0, 0, 0, 0}; + try (CBORParser p = cborParser(input2)) { + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + try { + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8 middle byte 0x2d"); + } + } + + // but let's also validate 3-byte variant as well + byte[] input3 = {0x63, (byte) 0xEF, (byte) 0x8e, 0x2d, + 0, 0, 0, 0, 0, 0}; + try (CBORParser p = cborParser(input3)) { + assertToken(JsonToken.VALUE_STRING, p.nextToken()); + try { + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8 middle byte 0x2d"); + } + } + } + + public void testShortString237InvalidName() throws Exception + { + // Object with 2-byte invalid name + byte[] input2 = { (byte) 0xBF, // Object, indefinite length + 0x62, (byte) 0xCF, 0x2e, // 2-byte name but invalid second byte + 0x21, // int value of 33 + (byte) 0xFF, // Object END marker + 0, 0, 0, 0 // padding + }; + try (CBORParser p = cborParser(input2)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + try { + p.nextToken(); + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8 middle byte 0x2e"); + } + } + + // but let's also validate 3-byte variant as well + byte[] input3 = { (byte) 0xBF, // Object, indefinite length + 0x62, (byte) 0xEF, (byte) 0x8e, 0x2f, // 3-byte name but invalid third byte + 0x22, // int value of 34 + (byte) 0xFF, // Object END marker + 0, 0, 0, 0 // padding + }; + try (CBORParser p = cborParser(input3)) { + assertToken(JsonToken.START_OBJECT, p.nextToken()); + try { + p.nextToken(); + String str = p.getText(); + fail("Should have failed, did not, String = '"+str+"'"); + } catch (StreamReadException e) { + verifyException(e, "Invalid UTF-8 middle byte 0x2f"); + } + } + } } diff --git a/release-notes/VERSION-2.x b/release-notes/VERSION-2.x index fbb033b5e..236df554c 100644 --- a/release-notes/VERSION-2.x +++ b/release-notes/VERSION-2.x @@ -11,7 +11,7 @@ Project: jackson-datatypes-binaryModules: 2.13.0 (not yet released) -No changes since 2.12 +#239: Should validate UTF-8 multi-byte validity for short decode path too 2.12.2 (not yet released)