From b9b36ae78515715242aa45c0129a163fb3b5d08f Mon Sep 17 00:00:00 2001 From: eritque0arcus Date: Thu, 25 Apr 2024 00:59:47 -0400 Subject: [PATCH] fix: fix data been overwritten by incorrect type --- src/pycJSON_decode.c | 2 +- src/str.c | 69 ++++++++++++++++++++++++++++++++++++++++---- src/str.h | 4 ++- 3 files changed, 68 insertions(+), 7 deletions(-) diff --git a/src/pycJSON_decode.c b/src/pycJSON_decode.c index 69f06dd..82e5e0e 100644 --- a/src/pycJSON_decode.c +++ b/src/pycJSON_decode.c @@ -173,7 +173,7 @@ static bool parse_string(PyObject **item, parse_buffer *const input_buffer) { } else { uint32_t unicode_value; // what is actually len of this utf8 sequence - int skip = get_unicode_value(input_end, &unicode_value); + int skip = get_unicode_value_usc4(input_end, &unicode_value); if (unicode_value == 0) { PyErr_SetString(PyExc_ValueError, "Invalid utf8 string."); return false; diff --git a/src/str.c b/src/str.c index e516050..be064f8 100644 --- a/src/str.c +++ b/src/str.c @@ -59,7 +59,8 @@ int get_utf8_type(uint32_t unciode_value) { } } -int get_unicode_value(const char *str, Py_UCS4 *re) { +int get_unicode_value_usc4(const char *str, Py_UCS4 *re) { + assert(re != NULL); if (str[0] & 0b10000000 && !str[0] & 0b01000000) { // Continue bytes should be skipped. return 0; @@ -83,9 +84,58 @@ int get_unicode_value(const char *str, Py_UCS4 *re) { } } +int get_unicode_value_usc2(const char *str, Py_UCS2 *re) { + assert(re != NULL); + if (str[0] & 0b10000000 && !str[0] & 0b01000000) { + // Continue bytes should be skipped. + *re = -1; + return 0; + } + if (str[0] & 0b10000000) { + if (str[0] & 0b00100000) { + if (str[0] & 0b00010000) { + // NOPE + *re = -1; + return 0; + } else { + *re = ((str[0] & 0b1111) << 12) + ((str[1] & 0b111111) << 6) + (str[2] & 0b111111); + return 3; // 3 bytes can be represented under 0xffff + } + } else { + *re = ((str[0] & 0b11111) << 6) + (str[1] & 0b111111); + return 2; + } + } else { + *re = str[0] & 0b1111111; + return 1; + } +} + +int get_unicode_value_usc1(const char *str, Py_UCS1 *re) { + assert(re != NULL); + if (str[0] & 0b10000000 && !str[0] & 0b01000000) { + // Continue bytes should be skipped. + *re = -1; + return 0; + } + if (str[0] & 0b10000000) { + if (str[0] & 0b00100000) { + // NOPE + *re = -1; + return 0; + } else { + *re = ((str[0] & 0b11111) << 6) + (str[1] & 0b111111); + return 2; // some of 2 bytes can be represented under 0xff + } + } else { + *re = str[0] & 0b1111111; + return 1; + } +} + bool str2unicode_1byte(PyObject **re, const char *str, const long alloc, const long num) { typedef Py_UCS1 t; - *re = PyUnicode_New(alloc, 0xFF); // TODO must be at least alloc + 1 idk why + *re = PyUnicode_New(alloc, 0xFF); if (*re == NULL) { PyErr_Format(PyExc_MemoryError, "Failed to parse string: allocation failure"); return false; @@ -93,6 +143,8 @@ bool str2unicode_1byte(PyObject **re, const char *str, const long alloc, const l t *data = (t *) ((PyCompactUnicodeObject *) *re + 1); long real_len = 0; for (int i = 0; i < num - 1; i++) { + // no overflow + assert(real_len < alloc); if (str[i] == '\\') { switch (str[++i]) { PARSE_STRING_CHAR_MATCHER(str[i], data, char) @@ -106,7 +158,7 @@ bool str2unicode_1byte(PyObject **re, const char *str, const long alloc, const l return false; } } else { - const int skip = get_unicode_value(str + i, data++); + const int skip = get_unicode_value_usc1(str + i, data++); if (skip == 0) { PyErr_SetString(PyExc_ValueError, "Invalid utf8 string."); return false; @@ -115,6 +167,7 @@ bool str2unicode_1byte(PyObject **re, const char *str, const long alloc, const l } real_len++; } + *data = 0; // PyUnicode_Resize(re, real_len); assert(real_len == alloc); // TODO remove real_len after testing return true; @@ -130,6 +183,8 @@ bool str2unicode_2byte(PyObject **re, const char *str, const long alloc, const l t *data = (t *) ((PyCompactUnicodeObject *) *re + 1); long real_len = 0; for (int i = 0; i < num - 1; i++) { + // no overflow + assert(real_len < alloc); if (str[i] == '\\') { switch (str[++i]) { PARSE_STRING_CHAR_MATCHER(str[i], data, t) @@ -143,7 +198,7 @@ bool str2unicode_2byte(PyObject **re, const char *str, const long alloc, const l return false; } } else { - const int skip = get_unicode_value(str + i, data++); + const int skip = get_unicode_value_usc2(str + i, data++); if (skip == 0) { PyErr_SetString(PyExc_ValueError, "Invalid utf8 string."); return false; @@ -152,6 +207,7 @@ bool str2unicode_2byte(PyObject **re, const char *str, const long alloc, const l } real_len++; } + *data = 0; // PyUnicode_Resize(re, real_len); assert(real_len == alloc); // TODO remove real_len after testing return true; @@ -167,6 +223,8 @@ bool str2unicode_4byte(PyObject **re, const char *str, const long alloc, const l t *data = (t *) ((PyCompactUnicodeObject *) *re + 1); long real_len = 0; for (int i = 0; i < num - 1; i++) { + // no overflow + assert(real_len < alloc); if (str[i] == '\\') { switch (str[++i]) { PARSE_STRING_CHAR_MATCHER(str[i], data, t) @@ -180,7 +238,7 @@ bool str2unicode_4byte(PyObject **re, const char *str, const long alloc, const l return false; } } else { - const int skip = get_unicode_value(str + i, data++); + const int skip = get_unicode_value_usc4(str + i, data++); if (skip == 0) { PyErr_SetString(PyExc_ValueError, "Invalid utf8 string."); return false; @@ -189,6 +247,7 @@ bool str2unicode_4byte(PyObject **re, const char *str, const long alloc, const l } real_len++; } + *data = 0; // PyUnicode_Resize(re, real_len); assert(real_len == alloc); // TODO remove real_len after testing return true; diff --git a/src/str.h b/src/str.h index ce52217..a3b4be4 100644 --- a/src/str.h +++ b/src/str.h @@ -4,7 +4,9 @@ #include int get_utf8_type(uint32_t unciode_value); -int get_unicode_value(const char *str, Py_UCS4 *re); +int get_unicode_value_usc4(const char *str, Py_UCS4 *re); +int get_unicode_value_usc2(const char *str, Py_UCS2 *re); +int get_unicode_value_usc1(const char *str, Py_UCS1 *re); bool str2unicode_1byte(PyObject **re, const char *str, long alloc, long num); bool str2unicode_2byte(PyObject **re, const char *str, long alloc, long num); bool str2unicode_4byte(PyObject **re, const char *str, long alloc, long num);